Bigrams, lemmas, words

This commit is contained in:
s426135 2020-04-27 14:24:12 +02:00
parent ccc9b8fe8c
commit e72f319c75
3 changed files with 10400 additions and 10385 deletions

File diff suppressed because it is too large Load Diff

View File

@ -3,6 +3,8 @@ import re, sys, nltk
from nltk.tokenize import TweetTokenizer from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords from nltk.corpus import stopwords
from string import punctuation from string import punctuation
from nltk.corpus import wordnet as wn
def clear_post(post): def clear_post(post):
post = post.replace('\\n', ' ') post = post.replace('\\n', ' ')
@ -31,6 +33,17 @@ def create_dict(in_file, expected_file):
#post = clear_post(line) #post = clear_post(line)
tokenized_line = [token for token in tt.tokenize(post) if token not in stoplist] tokenized_line = [token for token in tt.tokenize(post) if token not in stoplist]
bigrams = nltk.bigrams(tokenized_line) bigrams = nltk.bigrams(tokenized_line)
lemas = {}
for word in set(tokenized_line):
res = wn.synsets(word)
if not res:
continue
word_lem = []
for w in res:
le_na = w.lemma_names()
word_lem.append(" ".join(le_na))
lemas[word] = " ".join(word_lem)
if exp == "d": if exp == "d":
ex_val = "" ex_val = ""
else: else:
@ -39,7 +52,7 @@ def create_dict(in_file, expected_file):
ex_val = -1 ex_val = -1
counter+=1 counter+=1
big_merged = [i[0] +"_" + i[1] for i in list(bigrams)] big_merged = [i[0] +"_" + i[1] for i in list(bigrams)]
posts[counter] = [ex_val, " ".join(tokenized_line), " ".join(big_merged)] posts[counter] = [ex_val, " ".join(tokenized_line), " ".join(big_merged), lemas]
# TODO # TODO
# Bigrams could already be created here # Bigrams could already be created here
return posts return posts
@ -49,8 +62,10 @@ def create_file(posts):
exp = v[0] exp = v[0]
post = v[1] post = v[1]
big = v[2] big = v[2]
lem = list(v[3].values())
lem = " ".join(lem)
#print(exp, "|", post) #print(exp, "|", post)
print(exp, "|words", post, "|bigrams", big) print(exp, "|words", post, "|bigrams", big, "|lemmas" , lem)
def main(): def main():
if len(sys.argv) != 3: if len(sys.argv) != 3:

File diff suppressed because it is too large Load Diff