Bigrams, lemmas, words
This commit is contained in:
parent
ccc9b8fe8c
commit
e72f319c75
10508
dev-0/out.tsv
10508
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
@ -3,6 +3,8 @@ import re, sys, nltk
|
||||
from nltk.tokenize import TweetTokenizer
|
||||
from nltk.corpus import stopwords
|
||||
from string import punctuation
|
||||
from nltk.corpus import wordnet as wn
|
||||
|
||||
|
||||
def clear_post(post):
|
||||
post = post.replace('\\n', ' ')
|
||||
@ -31,6 +33,17 @@ def create_dict(in_file, expected_file):
|
||||
#post = clear_post(line)
|
||||
tokenized_line = [token for token in tt.tokenize(post) if token not in stoplist]
|
||||
bigrams = nltk.bigrams(tokenized_line)
|
||||
lemas = {}
|
||||
for word in set(tokenized_line):
|
||||
res = wn.synsets(word)
|
||||
if not res:
|
||||
continue
|
||||
word_lem = []
|
||||
for w in res:
|
||||
le_na = w.lemma_names()
|
||||
word_lem.append(" ".join(le_na))
|
||||
lemas[word] = " ".join(word_lem)
|
||||
|
||||
if exp == "d":
|
||||
ex_val = ""
|
||||
else:
|
||||
@ -39,7 +52,7 @@ def create_dict(in_file, expected_file):
|
||||
ex_val = -1
|
||||
counter+=1
|
||||
big_merged = [i[0] +"_" + i[1] for i in list(bigrams)]
|
||||
posts[counter] = [ex_val, " ".join(tokenized_line), " ".join(big_merged)]
|
||||
posts[counter] = [ex_val, " ".join(tokenized_line), " ".join(big_merged), lemas]
|
||||
# TODO
|
||||
# The bigrams could already be built here
|
||||
return posts
|
||||
@ -49,8 +62,10 @@ def create_file(posts):
|
||||
exp = v[0]
|
||||
post = v[1]
|
||||
big = v[2]
|
||||
lem = list(v[3].values())
|
||||
lem = " ".join(lem)
|
||||
#print(exp, "|", post)
|
||||
print(exp, "|words", post, "|bigrams", big)
|
||||
print(exp, "|words", post, "|bigrams", big, "|lemmas" , lem)
|
||||
|
||||
def main():
|
||||
if len(sys.argv) != 3:
|
||||
|
10258
test-A/out.tsv
10258
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user