???
This commit is contained in:
parent
8202edc3ba
commit
c8d7e1452c
|
@ -0,0 +1,3 @@
|
|||
# Default ignored files
|
||||
/shelf/
|
||||
/workspace.xml
|
|
@ -0,0 +1,24 @@
|
|||
<component name="InspectionProjectProfileManager">
|
||||
<profile version="1.0">
|
||||
<option name="myName" value="Project Default" />
|
||||
<inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
|
||||
<option name="ignoredErrors">
|
||||
<list>
|
||||
<option value="N802" />
|
||||
</list>
|
||||
</option>
|
||||
</inspection_tool>
|
||||
<inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
|
||||
<option name="ignoredIdentifiers">
|
||||
<list>
|
||||
<option value="main.PRAWO" />
|
||||
</list>
|
||||
</option>
|
||||
</inspection_tool>
|
||||
<inspection_tool class="SpellCheckingInspection" enabled="false" level="TYPO" enabled_by_default="false">
|
||||
<option name="processCode" value="true" />
|
||||
<option name="processLiterals" value="true" />
|
||||
<option name="processComments" value="true" />
|
||||
</inspection_tool>
|
||||
</profile>
|
||||
</component>
|
|
@ -0,0 +1,6 @@
|
|||
<component name="InspectionProjectProfileManager">
|
||||
<settings>
|
||||
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||
<version value="1.0" />
|
||||
</settings>
|
||||
</component>
|
|
@ -0,0 +1,4 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (sport-text-classification-ball-isi-public)" project-jdk-type="Python SDK" />
|
||||
</project>
|
|
@ -0,0 +1,8 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/sport-text-classification-ball-isi-public.iml" filepath="$PROJECT_DIR$/.idea/sport-text-classification-ball-isi-public.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
|
@ -0,0 +1,10 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$">
|
||||
<excludeFolder url="file://$MODULE_DIR$/venv" />
|
||||
</content>
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
|
@ -0,0 +1,90 @@
|
|||
import csv

import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.neural_network import MLPClassifier
|
||||
# Fetch the NLTK 'punkt' resource at start-up; presumably required by
# word_tokenize, which is used below — confirm against the installed NLTK version.
nltk.download('punkt')
|
||||
|
||||
# Manual preprocessing note (translated from the original comment): in
# train.tsv, at positions 25706, 58881 and 73761 (the original says "columns",
# presumably row numbers), a tab inside the text must be replaced with
# 4 spaces, otherwise the tab-separated parse breaks.
#
# header=None keeps the first line as data. Without it pandas consumes the
# first sample as a header row, and the names are overwritten below anyway.
# NOTE(review): assumes train.tsv has no header row — confirm.
train = pd.read_csv('train/train.tsv', sep='\t', header=None)
train.columns = ["y", "x"]

# Show the first (label, text) pair as a quick sanity check.
print(train["y"][0], train["x"][0])
|
||||
|
||||
# Tokenise every training text into a list of lowercased tokens; the result
# (one token list per text) is the corpus Word2Vec is trained on.
# Approach based on:
# https://www.geeksforgeeks.org/python-word-embedding-using-word2vec/
slowa = [
    [token.lower() for token in word_tokenize(dokument)]
    for dokument in train["x"]
]
print(slowa[0])
|
||||
|
||||
# Train Word2Vec embeddings on the tokenised training corpus and persist the
# model to disk. API reference:
# https://radimrehurek.com/gensim/models/word2vec.html
model = Word2Vec(
    sentences=slowa,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)
model.save("word2vec.model")

# Probe the trained embeddings with one word: print its vector and its
# five nearest neighbours.
osadzenia = model.wv

wektor = osadzenia['przyjmujący']
print(wektor)

podobne = osadzenia.most_similar('przyjmujący', topn=5)
print(podobne)
|
||||
|
||||
# Build one feature vector per training text: the element-wise sum of the
# Word2Vec vectors of its lowercased tokens.
teksty = []
for tekst in train["x"]:
    # Start from zeros so a text that yields no tokens still produces a
    # valid, fixed-length row instead of reusing a stale vector.
    pom = np.zeros(model.wv.vector_size, dtype=np.float32)
    for slowo in word_tokenize(tekst):
        pom += model.wv[slowo.lower()]
    # Append the accumulated sum; the previous code appended only the vector
    # of the last token, discarding the rest of the text.
    teksty.append(pom)
print(teksty[0])
|
||||
|
||||
# Feature rows and their labels for the classifier.
X, y = teksty, train["y"]

# Multi-layer perceptron with default hyper-parameters
# (activation="tanh" was noted as an alternative in the original).
clf = MLPClassifier().fit(X, y)
|
||||
|
||||
# Manual preprocessing note (translated from the original comment): in
# in.tsv, at positions 1983 and 5199 (the original says "columns", presumably
# row numbers), a tab inside the text must be replaced with 4 spaces,
# otherwise the tab-separated parse breaks.
#
# header=None keeps the first line as data. Without it pandas consumes the
# first test sample as a header row, so the predictions file ends up one
# line shorter than the input.
# NOTE(review): assumes in.tsv has no header row — confirm.
test = pd.read_csv('test-A/in.tsv', sep='\t', header=None)
test.columns = ["x"]

# Show the first test text as a quick sanity check.
print(test["x"][0])
|
||||
|
||||
# Tokenise every test text into a list of lowercased tokens, mirroring the
# preprocessing applied to the training texts.
# Approach based on:
# https://www.geeksforgeeks.org/python-word-embedding-using-word2vec/
slowa = [
    [token.lower() for token in word_tokenize(dokument)]
    for dokument in test["x"]
]
print(slowa[0])
|
||||
|
||||
# Build one feature vector per test text: the element-wise sum of the
# Word2Vec vectors of its lowercased tokens that the model knows.
teksty = []
for tekst in test["x"]:
    # Start from zeros so a text with no known tokens still produces a
    # valid, fixed-length row for clf.predict.
    pom = np.zeros(model.wv.vector_size, dtype=np.float32)
    for slowo in word_tokenize(tekst):
        klucz = slowo.lower()
        # Tokens unseen during Word2Vec training have no vector; skip them.
        if klucz in model.wv.key_to_index:
            pom += model.wv[klucz]
    # Append the accumulated sum; the previous code appended the last token's
    # vector, which was None whenever that token was out of vocabulary.
    teksty.append(pom)
print(teksty[0])
|
||||
|
||||
# Classify the test feature vectors and show the predicted labels.
przewidywania = clf.predict(teksty)
print(przewidywania)

# Write one predicted label per line to the output file.
with open("test-A/out.tsv", "w", encoding="utf-8") as plik_wyjsciowy:
    plik_wyjsciowy.writelines(str(etykieta) + "\n" for etykieta in przewidywania)
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue