Bigrams, lemmas, words

s426135 2020-04-27 14:24:12 +02:00
parent ccc9b8fe8c
commit e72f319c75
3 changed files with 10400 additions and 10385 deletions

(File diff suppressed because it is too large)


@@ -3,6 +3,8 @@ import re, sys, nltk
 from nltk.tokenize import TweetTokenizer
 from nltk.corpus import stopwords
 from string import punctuation
+from nltk.corpus import wordnet as wn
+
 def clear_post(post):
     post = post.replace('\\n', ' ')
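
One setup caveat for the new import: WordNet is distributed as a separate NLTK corpus, so wn.synsets() raises a LookupError until the corpus has been fetched once (standard NLTK call, not part of this commit):

import nltk
nltk.download('wordnet')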
@@ -31,6 +33,17 @@ def create_dict(in_file, expected_file):
         #post = clear_post(line)
         tokenized_line = [token for token in tt.tokenize(post) if token not in stoplist]
         bigrams = nltk.bigrams(tokenized_line)
+        lemas = {}
+        for word in set(tokenized_line):
+            res = wn.synsets(word)
+            if not res:
+                continue
+            word_lem = []
+            for w in res:
+                le_na = w.lemma_names()
+                word_lem.append(" ".join(le_na))
+            lemas[word] = " ".join(word_lem)
+
         if exp == "d":
             ex_val = ""
         else:
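
For context on the new block: wn.synsets(word) returns every WordNet synset the surface form belongs to, and each synset's lemma_names() is a list of lemma strings, so the loop flattens all synonyms of a token into one space-joined string. A standalone sketch of the same idea ("dog" is an illustrative input, not from the data):

from nltk.corpus import wordnet as wn

word_lem = []
for w in wn.synsets("dog"):
    # each synset contributes its lemma names, e.g.
    # ['dog', 'domestic_dog', 'Canis_familiaris']
    word_lem.append(" ".join(w.lemma_names()))
print(" ".join(word_lem))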
@@ -39,7 +52,7 @@ def create_dict(in_file, expected_file):
             ex_val = -1
         counter += 1
         big_merged = [i[0] + "_" + i[1] for i in list(bigrams)]
-        posts[counter] = [ex_val, " ".join(tokenized_line), " ".join(big_merged)]
+        posts[counter] = [ex_val, " ".join(tokenized_line), " ".join(big_merged), lemas]
     # TODO
     # Bigrams could already be created here
     return posts
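
The merged-bigram line turns each token pair from nltk.bigrams() into a single underscore-joined feature; a minimal sketch with illustrative tokens:

import nltk

tokens = ["good", "morning", "everyone"]
big_merged = [a + "_" + b for a, b in nltk.bigrams(tokens)]
print(big_merged)  # ['good_morning', 'morning_everyone']

Note that nltk.bigrams() returns a lazy generator, which is why the diff materializes it with list(bigrams) and why it can only be iterated once.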
@@ -49,8 +62,10 @@ def create_file(posts):
         exp = v[0]
         post = v[1]
         big = v[2]
+        lem = list(v[3].values())
+        lem = " ".join(lem)
         #print(exp, "|", post)
-        print(exp, "|words", post, "|bigrams", big)
+        print(exp, "|words", post, "|bigrams", big, "|lemmas", lem)

 def main():
     if len(sys.argv) != 3:
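
The extended print writes one line per post in a Vowpal Wabbit-style namespace layout (label, then |words, |bigrams, and now |lemmas feature blocks). With illustrative values, not taken from the data set, the new line would produce:

exp = 1
post = "good morning everyone"
big = "good_morning morning_everyone"
lem = "good goodness morning morn everyone"  # flattened lemma strings, illustrative
print(exp, "|words", post, "|bigrams", big, "|lemmas", lem)
# 1 |words good morning everyone |bigrams good_morning morning_everyone |lemmas good goodness morning morn everyone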

(File diff suppressed because it is too large)