Using bigrams; clear text
This commit is contained in:
parent
4fff3f328d
commit
0c97fbd451
31
Makefile
Normal file
31
Makefile
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
# Random seed forwarded to vw through --random_seed; override with `make seed=N`.
seed = 123456789

# Options handed to vw when training model.vw (see the model.vw rule).
# Built up piecewise; the expanded value is a single space-separated flag list.
CONFIG  = --passes 18 -b 16
CONFIG += --random_seed $(seed)
CONFIG += --link logistic --loss_function logistic
CONFIG += -k --cache_file vw-meta-cashe
|
||||||
|
|
||||||
|
# Default goal: build the prediction files for dev-0 and test-A, then run
# ../geval against the dev-0/ directory.
# `all` names a command rather than a file, so it is declared phony; otherwise
# a stray file called `all` would make this target look up to date.
.PHONY: all
all: dev-0/out.tsv test-A/out.tsv
	../geval --test-name dev-0/
|
||||||
|
|
||||||
|
# Convert the dev-0 inputs and expected labels into the file consumed by vw.
# The two .tsv files are read by the recipe, so they are listed as
# prerequisites: editing them now triggers a rebuild.
# Output goes to a temporary file first and is renamed only on success, so a
# failing prepare_data.py never leaves a truncated target that looks fresh.
dev-0/data.vw: prepare_data.py dev-0/in.tsv dev-0/expected.tsv
	python3 prepare_data.py dev-0/in.tsv dev-0/expected.tsv > $@.tmp && mv -f $@.tmp $@
	echo "$@ created"
|
||||||
|
|
||||||
|
# Convert the training inputs and expected labels into the file consumed by vw.
# The two .tsv files are read by the recipe, so they are listed as
# prerequisites: editing them now triggers a rebuild.
# Output goes to a temporary file first and is renamed only on success, so a
# failing prepare_data.py never leaves a truncated target that looks fresh.
train/data.vw: prepare_data.py train/in.tsv train/expected.tsv
	python3 prepare_data.py train/in.tsv train/expected.tsv > $@.tmp && mv -f $@.tmp $@
	echo "$@ created"
|
||||||
|
|
||||||
|
# Convert the test-A inputs into the file consumed by vw.
# test-A/in.tsv is read by the recipe, so it is a prerequisite. The recipe also
# passes test-A/expected.tsv to the script; it is tracked via $(wildcard ...)
# so it counts as a prerequisite only when present.
# NOTE(review): confirm whether test-A/expected.tsv is always available.
# Output goes to a temporary file first and is renamed only on success, so a
# failing prepare_data.py never leaves a truncated target that looks fresh.
test-A/data.vw: prepare_data.py test-A/in.tsv $(wildcard test-A/expected.tsv)
	python3 prepare_data.py test-A/in.tsv test-A/expected.tsv > $@.tmp && mv -f $@.tmp $@
	echo "$@ created"
|
||||||
|
|
||||||
|
# Train the vw model. The recipe only reads $< (train/data.vw), so that is the
# sole prerequisite: changes to the dev-0/test-A data must not force a retrain.
# Those files are still built on demand by the */out.tsv rules that use them.
model.vw: train/data.vw
	vw $< -f $@ $(CONFIG)
	echo "model created"
|
||||||
|
|
||||||
|
# Run vw with the trained model over each prepared data set and capture what it
# writes to /dev/stdout as that set's out.tsv.
# Static pattern rule: $< is model.vw and $* is the set directory
# (dev-0 or test-A), so one recipe serves both targets.
dev-0/out.tsv test-A/out.tsv: %/out.tsv: model.vw %/data.vw
	vw -t $*/data.vw -i $< --link logistic -p /dev/stdout > $@
|
||||||
|
|
||||||
|
# Remove the files this Makefile generates.
# Declared phony so a file named `clean` cannot mask the target. $(RM) expands
# to `rm -f`, so files that are already absent do not make the recipe fail.
# test-A/out.tsv is included because `all` builds it alongside dev-0/out.tsv.
.PHONY: clean
clean:
	$(RM) dev-0/out.tsv test-A/out.tsv model.vw train/data.vw test-A/data.vw dev-0/data.vw
|
10498
dev-0/out.tsv
10498
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
@ -23,10 +23,11 @@ def create_dict(in_file, expected_file):
|
|||||||
for line, exp in zip(in_f, exp_f):
|
for line, exp in zip(in_f, exp_f):
|
||||||
line = line.rstrip('\n').split("\t")[0]
|
line = line.rstrip('\n').split("\t")[0]
|
||||||
exp = exp.rstrip("\n")
|
exp = exp.rstrip("\n")
|
||||||
line = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\?\+\-\%]+(\)|)', ' internetlink ', line)
|
#line = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\?\+\-\%]+(\)|)', ' internetlink ', line)
|
||||||
line = line.replace("|", "")
|
#line = line.replace("|", "")
|
||||||
line = line.replace(":", "")
|
#line = line.replace(":", "")
|
||||||
post = line
|
#post = line
|
||||||
|
post = clear_post(line)
|
||||||
#post = clear_post(line)
|
#post = clear_post(line)
|
||||||
tokenized_line = [token for token in tt.tokenize(post) if token not in stoplist]
|
tokenized_line = [token for token in tt.tokenize(post) if token not in stoplist]
|
||||||
bigrams = nltk.bigrams(tokenized_line)
|
bigrams = nltk.bigrams(tokenized_line)
|
||||||
|
10252
test-A/out.tsv
10252
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user