This commit is contained in:
Dominik Jagosz 2024-05-17 21:36:24 +02:00
parent 8202edc3ba
commit c8d7e1452c
15 changed files with 109171 additions and 2 deletions

3
.idea/.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
# Default ignored files
/shelf/
/workspace.xml

View File

@ -0,0 +1,24 @@
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
<option name="ignoredErrors">
<list>
<option value="N802" />
</list>
</option>
</inspection_tool>
<inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredIdentifiers">
<list>
<option value="main.PRAWO" />
</list>
</option>
</inspection_tool>
<inspection_tool class="SpellCheckingInspection" enabled="false" level="TYPO" enabled_by_default="false">
<option name="processCode" value="true" />
<option name="processLiterals" value="true" />
<option name="processComments" value="true" />
</inspection_tool>
</profile>
</component>

View File

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

4
.idea/misc.xml Normal file
View File

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (sport-text-classification-ball-isi-public)" project-jdk-type="Python SDK" />
</project>

8
.idea/modules.xml Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/sport-text-classification-ball-isi-public.iml" filepath="$PROJECT_DIR$/.idea/sport-text-classification-ball-isi-public.iml" />
</modules>
</component>
</project>

View File

@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/venv" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

90
mian.py Normal file
View File

@ -0,0 +1,90 @@
import csv

import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.neural_network import MLPClassifier
# Dimensionality of the Word2Vec embeddings (and thus of every text vector).
W2V_ROZMIAR = 100


def tokenizuj(tekst):
    """Return the lower-cased NLTK word tokens of *tekst*."""
    return [slowo.lower() for slowo in word_tokenize(tekst)]


def wektor_tekstu(tokeny, wv, rozmiar):
    """Return the sum of the embeddings of every token found in *wv*.

    Args:
        tokeny: iterable of (already lower-cased) tokens of one text.
        wv: mapping token -> vector that raises ``KeyError`` for unknown
            tokens (e.g. ``model.wv`` of a trained Word2Vec model).
        rozmiar: length of the embedding vectors.

    Returns:
        A new float32 array of length *rozmiar*. Tokens missing from *wv*
        are skipped; a text with no known token yields the zero vector, so
        the caller always gets a usable, fixed-length feature vector.
    """
    # Accumulate into a fresh array so the model's own vectors are never
    # modified in place.
    suma = np.zeros(rozmiar, dtype=np.float32)
    for token in tokeny:
        try:
            wektor = wv[token]
        except KeyError:
            # Out-of-vocabulary token (only possible for unseen test data).
            continue
        suma += wektor
    return suma


def main():
    """Train Word2Vec + MLP on train/train.tsv and write test-A/out.tsv."""
    nltk.download('punkt')

    # NOTE: in train.tsv, at 25706, 58881, 73761 (the author wrote "columns";
    # presumably line numbers) the tab inside the text has to be replaced
    # with 4 spaces before loading.
    # header=None: the file has no header row, so the first line is data and
    # must not be consumed as column names.
    train = pd.read_csv('train/train.tsv', sep='\t', header=None, names=["y", "x"])
    print(train["y"][0], train["x"][0])

    # https://www.geeksforgeeks.org/python-word-embedding-using-word2vec/
    slowa = [tokenizuj(tekst) for tekst in train["x"]]
    print(slowa[0])

    # https://radimrehurek.com/gensim/models/word2vec.html
    model = Word2Vec(sentences=slowa, vector_size=W2V_ROZMIAR, window=5, min_count=1, workers=4)
    model.save("word2vec.model")
    wektor = model.wv['przyjmujący']
    print(wektor)
    podobne = model.wv.most_similar('przyjmujący', topn=5)
    print(podobne)

    # One feature vector per training text: the sum of its word embeddings
    # (previously only the last word's embedding was stored).
    teksty = [wektor_tekstu(tokeny, model.wv, W2V_ROZMIAR) for tokeny in slowa]
    print(teksty[0])
    X = teksty
    y = train["y"]
    clf = MLPClassifier()  # activation="tanh"
    clf.fit(X, y)

    # NOTE: in in.tsv, at 1983, 5199 (the author wrote "columns"; presumably
    # line numbers) the tab inside the text has to be replaced with 4 spaces.
    test = pd.read_csv('test-A/in.tsv', sep='\t', header=None, names=["x"])
    print(test["x"][0])

    slowa = [tokenizuj(tekst) for tekst in test["x"]]
    print(slowa[0])

    # Test texts may contain words unseen in training; those are skipped
    # inside wektor_tekstu instead of leaking None into the feature matrix.
    teksty = [wektor_tekstu(tokeny, model.wv, W2V_ROZMIAR) for tokeny in slowa]
    print(teksty[0])
    przewidywania = clf.predict(teksty)
    print(przewidywania)

    with open("test-A/out.tsv", "w", encoding="utf-8") as uwu:
        uwu.writelines(str(p) + "\n" for p in przewidywania)


if __name__ == "__main__":
    main()

5446
test-A/out.csv Normal file

File diff suppressed because it is too large Load Diff

5446
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff

98132
train/train.tsv Normal file

File diff suppressed because it is too large Load Diff

Binary file not shown.

BIN
word2vec.model Normal file

Binary file not shown.

BIN
word2vec.model.syn1neg.npy Normal file

Binary file not shown.

Binary file not shown.