Bigrams, lemmas, words
This commit is contained in:
parent
ccc9b8fe8c
commit
e72f319c75
10508
dev-0/out.tsv
10508
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
@ -3,6 +3,8 @@ import re, sys, nltk
|
|||||||
from nltk.tokenize import TweetTokenizer
|
from nltk.tokenize import TweetTokenizer
|
||||||
from nltk.corpus import stopwords
|
from nltk.corpus import stopwords
|
||||||
from string import punctuation
|
from string import punctuation
|
||||||
|
from nltk.corpus import wordnet as wn
|
||||||
|
|
||||||
|
|
||||||
def clear_post(post):
|
def clear_post(post):
|
||||||
post = post.replace('\\n', ' ')
|
post = post.replace('\\n', ' ')
|
||||||
@ -31,6 +33,17 @@ def create_dict(in_file, expected_file):
|
|||||||
#post = clear_post(line)
|
#post = clear_post(line)
|
||||||
tokenized_line = [token for token in tt.tokenize(post) if token not in stoplist]
|
tokenized_line = [token for token in tt.tokenize(post) if token not in stoplist]
|
||||||
bigrams = nltk.bigrams(tokenized_line)
|
bigrams = nltk.bigrams(tokenized_line)
|
||||||
|
lemas = {}
|
||||||
|
for word in set(tokenized_line):
|
||||||
|
res = wn.synsets(word)
|
||||||
|
if not res:
|
||||||
|
continue
|
||||||
|
word_lem = []
|
||||||
|
for w in res:
|
||||||
|
le_na = w.lemma_names()
|
||||||
|
word_lem.append(" ".join(le_na))
|
||||||
|
lemas[word] = " ".join(word_lem)
|
||||||
|
|
||||||
if exp == "d":
|
if exp == "d":
|
||||||
ex_val = ""
|
ex_val = ""
|
||||||
else:
|
else:
|
||||||
@ -39,7 +52,7 @@ def create_dict(in_file, expected_file):
|
|||||||
ex_val = -1
|
ex_val = -1
|
||||||
counter+=1
|
counter+=1
|
||||||
big_merged = [i[0] +"_" + i[1] for i in list(bigrams)]
|
big_merged = [i[0] +"_" + i[1] for i in list(bigrams)]
|
||||||
posts[counter] = [ex_val, " ".join(tokenized_line), " ".join(big_merged)]
|
posts[counter] = [ex_val, " ".join(tokenized_line), " ".join(big_merged), lemas]
|
||||||
# TODO
|
# TODO
|
||||||
# Można stworzyc tutaj juz bigramy
|
# Można stworzyc tutaj juz bigramy
|
||||||
return posts
|
return posts
|
||||||
@ -49,8 +62,10 @@ def create_file(posts):
|
|||||||
exp = v[0]
|
exp = v[0]
|
||||||
post = v[1]
|
post = v[1]
|
||||||
big = v[2]
|
big = v[2]
|
||||||
|
lem = list(v[3].values())
|
||||||
|
lem = " ".join(lem)
|
||||||
#print(exp, "|", post)
|
#print(exp, "|", post)
|
||||||
print(exp, "|words", post, "|bigrams", big)
|
print(exp, "|words", post, "|bigrams", big, "|lemmas" , lem)
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
if len(sys.argv) != 3:
|
if len(sys.argv) != 3:
|
||||||
|
10258
test-A/out.tsv
10258
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user