91 lines
2.2 KiB
Python
91 lines
2.2 KiB
Python
|
import csv

import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.neural_network import MLPClassifier
|
||
|
# Fetch the "punkt" tokenizer resource from NLTK (used by word_tokenize).
nltk.download('punkt')

# Manual preprocessing required before running: in train.tsv, entries
# 25706, 58881 and 73761 (the original note calls them "columns") contain a
# tab character inside the text, which has to be replaced with 4 spaces.
# NOTE(review): read_csv is called without header=None, so the first line of
# the file is consumed as a header and only renamed below -- confirm the file
# really starts with a header row, otherwise the first sample is dropped.
train = pd.read_csv('train/train.tsv', sep='\t')
train.columns = ["y", "x"]

# Sanity check: show the first label together with its text.
print(train.at[0, "y"], train.at[0, "x"])
|
||
|
|
||
|
# Tokenize every training text into a list of lower-cased tokens; the result
# is a list of token lists, one per text. Approach adapted from:
# https://www.geeksforgeeks.org/python-word-embedding-using-word2vec/
slowa = [
    [token.lower() for token in word_tokenize(tekst)]
    for tekst in train["x"]
]
print(slowa[0])
|
||
|
|
||
|
# Train word embeddings on the tokenized training texts and save them to disk.
# Parameter reference: https://radimrehurek.com/gensim/models/word2vec.html
model = Word2Vec(
    sentences=slowa,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)
model.save("word2vec.model")

# Spot-check the embeddings on one sample word: print its vector ...
wektor = model.wv['przyjmujący']
print(wektor)

# ... and its five most similar words.
podobne = model.wv.most_similar('przyjmujący', topn=5)
print(podobne)
|
||
|
|
||
|
# Represent every training text as the sum of the Word2Vec vectors of its
# tokens. The lower-cased token lists already stored in `slowa` (one list per
# row of train["x"], in the same order) are reused, so the corpus is not
# tokenized a second time.
teksty = []
for tokeny in slowa:
    # Start from a zero vector so that a text without any tokens still
    # produces a feature vector of the right length.
    suma = np.zeros(model.wv.vector_size, dtype=np.float32)
    for token in tokeny:
        # The model was trained on these same token lists with min_count=1,
        # so every token has an embedding.
        suma += model.wv[token]
    # Append the accumulated sum. The previous version appended only the
    # vector of the last token, discarding the sum it had just computed.
    teksty.append(suma)
print(teksty[0])
|
||
|
|
||
|
# Features are the per-text vectors, targets are the labels from the "y" column.
X = teksty
y = train["y"]

# Multi-layer perceptron with default hyperparameters; fit() returns the
# estimator itself. Alternative noted by the author: activation="tanh".
clf = MLPClassifier().fit(X, y)
|
||
|
|
||
|
# Manual preprocessing required before running: in in.tsv, entries 1983 and
# 5199 (the original note calls them "columns") contain a tab character
# inside the text, which has to be replaced with 4 spaces.
# NOTE(review): read_csv is called without header=None, so the first line of
# the file is consumed as a header -- if in.tsv has no header row, out.tsv
# will end up one line shorter than in.tsv. Confirm against the data.
test = pd.read_csv('test-A/in.tsv', sep='\t')
test.columns = ["x"]

# Sanity check: show the first test text.
print(test.at[0, "x"])
|
||
|
|
||
|
# Tokenize every test text into a list of lower-cased tokens, replacing the
# training token lists previously held in `slowa`. Approach adapted from:
# https://www.geeksforgeeks.org/python-word-embedding-using-word2vec/
slowa = [
    [token.lower() for token in word_tokenize(tekst)]
    for tekst in test["x"]
]
print(slowa[0])
|
||
|
|
||
|
# Represent every test text as the sum of the Word2Vec vectors of its known
# tokens, mirroring how the training features are meant to be built. The
# lower-cased token lists already stored in `slowa` (one list per row of
# test["x"], in the same order) are reused instead of tokenizing again.
teksty = []
for tokeny in slowa:
    # Start from a zero vector so that a text with no tokens, or with only
    # out-of-vocabulary tokens, still produces a valid feature vector.
    suma = np.zeros(model.wv.vector_size, dtype=np.float32)
    for token in tokeny:
        try:
            wektor = model.wv[token]
        except KeyError:
            # Token was not seen during Word2Vec training: no embedding.
            continue
        suma += wektor
    # Append the accumulated sum. The previous version appended the vector of
    # the last token only, which was None whenever that token was unknown and
    # made clf.predict fail.
    teksty.append(suma)
print(teksty[0])
|
||
|
|
||
|
# Classify the test vectors and show the predicted labels.
przewidywania = clf.predict(teksty)
print(przewidywania)

# Write one predicted label per line to the output file.
with open("test-A/out.tsv", "w", encoding="utf-8") as out_file:
    out_file.writelines(str(p) + "\n" for p in przewidywania)
|