"""Train a Word2Vec + MLP text classifier and write predictions for test-A.

Pipeline:
  1. Read ``train/train.tsv`` (label, text) and tokenize every text.
  2. Fit a Word2Vec model on the training tokens and save it.
  3. Represent each text as the sum of its word vectors.
  4. Fit an ``MLPClassifier`` on those sums.
  5. Build the same representation for ``test-A/in.tsv`` and write one
     prediction per line to ``test-A/out.tsv``.
"""
import csv

import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.neural_network import MLPClassifier

nltk.download('punkt')


def tokenize_lower(text):
    """Return the lower-cased NLTK word tokens of *text*."""
    return [token.lower() for token in word_tokenize(text)]


def sum_of_word_vectors(tokens, keyed_vectors):
    """Return the element-wise sum of the vectors of all known *tokens*.

    Tokens missing from *keyed_vectors* are skipped. When no token is known
    (empty text, or only out-of-vocabulary words) a zero vector of the
    model's dimensionality is returned so every text yields a valid row.
    """
    known = [keyed_vectors[token] for token in tokens if token in keyed_vectors]
    if not known:
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return np.sum(known, axis=0)


# NOTE: in train.tsv, at positions 25706, 58881 and 73761 (the original note
# says "columns"; presumably these are line numbers), the tab inside the text
# has to be replaced with 4 spaces before the file can be parsed.
# header=None: without it pandas consumes the first data row as the header,
# which the column assignment below would then silently discard.
# (Assumes the file has no header row -- the columns are named by hand here.)
train = pd.read_csv('train/train.tsv', sep='\t', header=None)
train.columns = ["y", "x"]
print(train["y"][0], train["x"][0])

# https://www.geeksforgeeks.org/python-word-embedding-using-word2vec/
train_tokens = [tokenize_lower(text) for text in train["x"]]
print(train_tokens[0])

# https://radimrehurek.com/gensim/models/word2vec.html
model = Word2Vec(sentences=train_tokens, vector_size=100, window=5, min_count=1, workers=4)
model.save("word2vec.model")

# Sanity check of the trained embeddings on a sample word.
sample_vector = model.wv['przyjmujący']
print(sample_vector)
similar = model.wv.most_similar('przyjmujący', topn=5)
print(similar)

# One feature vector per training text: the sum of all its word vectors
# (previously only the vector of the last token was stored).
train_features = [sum_of_word_vectors(tokens, model.wv) for tokens in train_tokens]
print(train_features[0])

X = train_features
y = train["y"]
clf = MLPClassifier()  # activation="tanh"
clf.fit(X, y)

# NOTE: in in.tsv, at positions 1983 and 5199 (presumably line numbers), the
# tab inside the text has to be replaced with 4 spaces.
# header=None keeps the first line as data so out.tsv gets one prediction per
# input line.
test = pd.read_csv('test-A/in.tsv', sep='\t', header=None)
test.columns = ["x"]
print(test["x"][0])

test_tokens = [tokenize_lower(text) for text in test["x"]]
print(test_tokens[0])

# Words unseen during Word2Vec training are skipped inside the helper.
test_features = [sum_of_word_vectors(tokens, model.wv) for tokens in test_tokens]
print(test_features[0])

predictions = clf.predict(test_features)
print(predictions)

with open("test-A/out.tsv", "w", encoding="utf-8") as out_file:
    out_file.writelines(str(prediction) + "\n" for prediction in predictions)