Bigrams, lemmas, words

s426135 2020-04-27 14:24:12 +02:00
parent ccc9b8fe8c
commit e72f319c75
3 changed files with 10400 additions and 10385 deletions

(File diff suppressed because it is too large)


@@ -3,6 +3,8 @@ import re, sys, nltk
 from nltk.tokenize import TweetTokenizer
 from nltk.corpus import stopwords
 from string import punctuation
+from nltk.corpus import wordnet as wn
+
 def clear_post(post):
     post = post.replace('\\n', ' ')
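
One setup caveat for the new import: WordNet is distributed as a separate NLTK corpus, so wn.synsets() raises a LookupError until the corpus has been fetched once (standard NLTK call, not part of this commit):

import nltk
nltk.download('wordnet')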
@@ -31,6 +33,17 @@ def create_dict(in_file, expected_file):
         #post = clear_post(line)
         tokenized_line = [token for token in tt.tokenize(post) if token not in stoplist]
         bigrams = nltk.bigrams(tokenized_line)
+        lemas = {}
+        for word in set(tokenized_line):
+            res = wn.synsets(word)
+            if not res:
+                continue
+            word_lem = []
+            for w in res:
+                le_na = w.lemma_names()
+                word_lem.append(" ".join(le_na))
+            lemas[word] = " ".join(word_lem)
+
         if exp == "d":
             ex_val = ""
         else:
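
For context on the new block: wn.synsets(word) returns every WordNet synset the surface form belongs to, and each synset's lemma_names() is a list of lemma strings, so the loop flattens all synonyms of a token into one space-joined string. A standalone sketch of the same idea ("dog" is an illustrative input, not from the data):

from nltk.corpus import wordnet as wn

word_lem = []
for w in wn.synsets("dog"):
    # each synset contributes its lemma names, e.g.
    # ['dog', 'domestic_dog', 'Canis_familiaris']
    word_lem.append(" ".join(w.lemma_names()))
print(" ".join(word_lem))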
@@ -39,7 +52,7 @@ def create_dict(in_file, expected_file):
             ex_val = -1
         counter += 1
         big_merged = [i[0] + "_" + i[1] for i in list(bigrams)]
-        posts[counter] = [ex_val, " ".join(tokenized_line), " ".join(big_merged)]
+        posts[counter] = [ex_val, " ".join(tokenized_line), " ".join(big_merged), lemas]
     # TODO
     # Bigrams could already be created here
     return posts
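
The merged-bigram line turns each token pair from nltk.bigrams() into a single underscore-joined feature; a minimal sketch with illustrative tokens:

import nltk

tokens = ["good", "morning", "everyone"]
big_merged = [a + "_" + b for a, b in nltk.bigrams(tokens)]
print(big_merged)  # ['good_morning', 'morning_everyone']

Note that nltk.bigrams() returns a lazy generator, which is why the diff materializes it with list(bigrams) and why it can only be iterated once.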
@@ -49,8 +62,10 @@ def create_file(posts):
         exp = v[0]
         post = v[1]
         big = v[2]
+        lem = list(v[3].values())
+        lem = " ".join(lem)
         #print(exp, "|", post)
-        print(exp, "|words", post, "|bigrams", big)
+        print(exp, "|words", post, "|bigrams", big, "|lemmas", lem)

 def main():
     if len(sys.argv) != 3:
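
The extended print writes one line per post in a Vowpal Wabbit-style namespace layout (label, then |words, |bigrams, and now |lemmas feature blocks). With illustrative values, not taken from the data set, the new line would produce:

exp = 1
post = "good morning everyone"
big = "good_morning morning_everyone"
lem = "good goodness morning morn everyone"  # flattened lemma strings, illustrative
print(exp, "|words", post, "|bigrams", big, "|lemmas", lem)
# 1 |words good morning everyone |bigrams good_morning morning_everyone |lemmas good goodness morning morn everyone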

(File diff suppressed because it is too large)