Using bigrams; clear text

s426135 2020-04-27 12:15:20 +02:00
parent 4fff3f328d
commit 0c97fbd451
4 changed files with 10411 additions and 10379 deletions

Makefile (new file, 31 lines)

@@ -0,0 +1,31 @@
seed=123456789
CONFIG=--passes 18 -b 16 --random_seed $(seed) --link logistic --loss_function logistic -k --cache_file vw-meta-cashe

all: dev-0/out.tsv test-A/out.tsv
	../geval --test-name dev-0/
	echo "asd"

dev-0/data.vw: prepare_data.py
	python3 prepare_data.py dev-0/in.tsv dev-0/expected.tsv > $@
	echo "dev-0/data.vw created"

train/data.vw: prepare_data.py
	python3 prepare_data.py train/in.tsv train/expected.tsv > $@
	echo "train/data.vw created"

test-A/data.vw: prepare_data.py
	python3 prepare_data.py test-A/in.tsv test-A/expected.tsv > $@
	echo "test-A/data.vw created"

model.vw: train/data.vw dev-0/data.vw test-A/data.vw
	echo $<
	vw $< -f $@ $(CONFIG)
	echo "model created"

dev-0/out.tsv: model.vw dev-0/data.vw
	vw -t dev-0/data.vw -i $< --link logistic -p /dev/stdout > $@

test-A/out.tsv: model.vw test-A/data.vw
	vw -t test-A/data.vw -i $< --link logistic -p /dev/stdout > $@

clean:
	rm dev-0/out.tsv model.vw train/data.vw test-A/data.vw dev-0/data.vw
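For context on what the data.vw targets must produce: vw with --loss_function logistic expects one example per line, a {-1, 1} label followed by namespaced features, and treats '|' and ':' as reserved characters. A minimal sketch of emitting such a line in Python; the namespace name "post" and the helper name to_vw_line are illustrative assumptions, not taken from the actual prepare_data.py:

# Sketch only: build one vw example line for logistic loss (label must be -1 or 1).
def to_vw_line(label, tokens):
    # '|' and ':' are reserved in vw input, which is why the script strips them
    feats = " ".join(t.replace("|", "").replace(":", "") for t in tokens)
    return f"{1 if label == '1' else -1} |post {feats}"

print(to_vw_line("1", ["some", "tokenized", "post"]))
# -> 1 |post some tokenized post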

File diff suppressed because it is too large.

prepare_data.py

@@ -23,10 +23,11 @@ def create_dict(in_file, expected_file):
     for line, exp in zip(in_f, exp_f):
         line = line.rstrip('\n').split("\t")[0]
         exp = exp.rstrip("\n")
-        line = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\?\+\-\%]+(\)|)', ' internetlink ', line)
-        line = line.replace("|", "")
-        line = line.replace(":", "")
-        post = line
+        #line = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\?\+\-\%]+(\)|)', ' internetlink ', line)
+        #line = line.replace("|", "")
+        #line = line.replace(":", "")
+        #post = line
+        post = clear_post(line)
         #post = clear_post(line)
         tokenized_line = [token for token in tt.tokenize(post) if token not in stoplist]
         bigrams = nltk.bigrams(tokenized_line)
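The new path cleans the post with clear_post, tokenizes it, and feeds the tokens to nltk.bigrams. A minimal sketch of how those bigrams could be turned into single feature tokens; here tt is assumed to be an nltk TweetTokenizer, and clear_post and stoplist are stand-ins, since their real definitions are outside this hunk:

import re
import nltk
from nltk.tokenize import TweetTokenizer

tt = TweetTokenizer()
stoplist = {"the", "a", "and"}  # placeholder; the real stoplist is defined elsewhere

def clear_post(line):
    # stand-in for the script's clear_post: lowercase and drop vw-reserved characters
    return re.sub(r"[|:]", "", line.lower())

post = clear_post("Check this out: a GREAT post")
tokenized_line = [token for token in tt.tokenize(post) if token not in stoplist]
# nltk.bigrams yields tuples; joining with '_' makes each bigram a single feature token
bigram_feats = ["_".join(b) for b in nltk.bigrams(tokenized_line)]
print(bigram_feats)  # ['check_this', 'this_out', 'out_great', 'great_post']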

File diff suppressed because it is too large.