"""Train a Word2Vec + MLP text classifier and write predictions.

Pipeline:
  1. Read ``train/train.tsv`` (label, text), tokenize and lowercase the texts.
  2. Train a Word2Vec model on the tokenized texts and save it.
  3. Represent every text as the sum of the vectors of its tokens.
  4. Fit an ``MLPClassifier`` on those text vectors.
  5. Predict labels for ``test-A/in.tsv`` and ``dev-0/in.tsv`` and write them
     to the matching ``out.tsv`` files, one prediction per line.
"""
import nltk
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec

nltk.download('punkt')


def tokenize(text):
    """Return the lowercased word tokens of ``text``."""
    return [token.lower() for token in word_tokenize(text)]


def text_to_vector(tokens, keyed_vectors):
    """Return the sum of the embeddings of ``tokens``.

    Tokens missing from ``keyed_vectors`` are skipped. If no token is known
    (or ``tokens`` is empty) the result is a zero vector, so every text gets
    a feature vector of the same length.
    """
    total = np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    for token in tokens:
        try:
            embedding = keyed_vectors[token]
        except KeyError:
            # Out-of-vocabulary token: it contributes nothing to the sum.
            continue
        total += embedding
    return total


def vectorize(token_lists, keyed_vectors):
    """Return one summed embedding vector per token list."""
    return [text_to_vector(tokens, keyed_vectors) for tokens in token_lists]


def write_predictions(path, predictions):
    """Write ``predictions`` to ``path``, one value per line (UTF-8)."""
    with open(path, "w", encoding="utf-8") as out_file:
        out_file.writelines(f"{prediction}\n" for prediction in predictions)


# --- training data ---------------------------------------------------------
# NOTE: before running, the tab characters inside the text at 25706, 58881
# and 73761 of train/train.tsv must be replaced with 4 spaces (the original
# note calls these "columns"; presumably they are line numbers - confirm).
train = pd.read_csv('train/train.tsv', sep='\t', names=['y', 'x'], header=None)
print(train["y"][0], train["x"][0])

# https://www.geeksforgeeks.org/python-word-embedding-using-word2vec/
train_tokens = [tokenize(text) for text in train["x"]]
print(train_tokens[0])

# https://radimrehurek.com/gensim/models/word2vec.html
model = Word2Vec(sentences=train_tokens, vector_size=100, window=5, min_count=1, workers=4)
model.save("word2vec.model")

# Sanity check of the trained embeddings on a sample word.
sample_vector = model.wv['przyjmujący']
print(sample_vector)
similar_words = model.wv.most_similar('przyjmujący', topn=5)
print(similar_words)

# Each text is represented by the sum of its token vectors.
train_vectors = vectorize(train_tokens, model.wv)
print(train_vectors[0])

X = train_vectors
y = train["y"]
clf = MLPClassifier()  # alternative noted by the author: activation="tanh"
clf.fit(X, y)

# --- test-A ----------------------------------------------------------------
# NOTE: in test-A/in.tsv the tab characters inside the text at 1983 and 5199
# must be replaced with 4 spaces (same caveat as for the training file).
test = pd.read_csv('test-A/in.tsv', sep='\t', names=['x'], header=None)
print(test["x"][0])

test_tokens = [tokenize(text) for text in test["x"]]
print(test_tokens[0])

test_vectors = vectorize(test_tokens, model.wv)
print(test_vectors[0])

test_predictions = clf.predict(test_vectors)
print(test_predictions)
write_predictions("test-A/out.tsv", test_predictions)

# --- dev-0 -----------------------------------------------------------------
# NOTE: the original comment here repeats the test-A remark about positions
# 1983 and 5199 in in.tsv; verify whether it really applies to dev-0/in.tsv.
dev_in = pd.read_csv('dev-0/in.tsv', sep='\t', names=['x'], header=None)
print(dev_in["x"][0])
dev_expected = pd.read_csv('dev-0/expected.tsv', sep='\t', names=['y'], header=None)
print(dev_expected["y"][0])

dev_tokens = [tokenize(text) for text in dev_in["x"]]
print(dev_tokens[0])

# Features must come from the dev set itself (not from the test-A texts),
# otherwise the predictions cannot be compared with dev_expected.
dev_vectors = vectorize(dev_tokens, model.wv)
print(dev_vectors[0])

dev_predictions = clf.predict(dev_vectors)
print(dev_predictions)
write_predictions("dev-0/out.tsv", dev_predictions)

# Show each prediction next to its expected label.
for predicted, expected in zip(dev_predictions, dev_expected["y"]):
    print(predicted, expected)