???
This commit is contained in:
parent
8202edc3ba
commit
c8d7e1452c
3
.idea/.gitignore
vendored
Normal file
3
.idea/.gitignore
vendored
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
# Default ignored files
|
||||||
|
/shelf/
|
||||||
|
/workspace.xml
|
24
.idea/inspectionProfiles/Project_Default.xml
Normal file
24
.idea/inspectionProfiles/Project_Default.xml
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
<component name="InspectionProjectProfileManager">
|
||||||
|
<profile version="1.0">
|
||||||
|
<option name="myName" value="Project Default" />
|
||||||
|
<inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
|
||||||
|
<option name="ignoredErrors">
|
||||||
|
<list>
|
||||||
|
<option value="N802" />
|
||||||
|
</list>
|
||||||
|
</option>
|
||||||
|
</inspection_tool>
|
||||||
|
<inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
|
||||||
|
<option name="ignoredIdentifiers">
|
||||||
|
<list>
|
||||||
|
<option value="main.PRAWO" />
|
||||||
|
</list>
|
||||||
|
</option>
|
||||||
|
</inspection_tool>
|
||||||
|
<inspection_tool class="SpellCheckingInspection" enabled="false" level="TYPO" enabled_by_default="false">
|
||||||
|
<option name="processCode" value="true" />
|
||||||
|
<option name="processLiterals" value="true" />
|
||||||
|
<option name="processComments" value="true" />
|
||||||
|
</inspection_tool>
|
||||||
|
</profile>
|
||||||
|
</component>
|
6
.idea/inspectionProfiles/profiles_settings.xml
Normal file
6
.idea/inspectionProfiles/profiles_settings.xml
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
<component name="InspectionProjectProfileManager">
|
||||||
|
<settings>
|
||||||
|
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||||
|
<version value="1.0" />
|
||||||
|
</settings>
|
||||||
|
</component>
|
4
.idea/misc.xml
Normal file
4
.idea/misc.xml
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (sport-text-classification-ball-isi-public)" project-jdk-type="Python SDK" />
|
||||||
|
</project>
|
8
.idea/modules.xml
Normal file
8
.idea/modules.xml
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="ProjectModuleManager">
|
||||||
|
<modules>
|
||||||
|
<module fileurl="file://$PROJECT_DIR$/.idea/sport-text-classification-ball-isi-public.iml" filepath="$PROJECT_DIR$/.idea/sport-text-classification-ball-isi-public.iml" />
|
||||||
|
</modules>
|
||||||
|
</component>
|
||||||
|
</project>
|
10
.idea/sport-text-classification-ball-isi-public.iml
Normal file
10
.idea/sport-text-classification-ball-isi-public.iml
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<module type="PYTHON_MODULE" version="4">
|
||||||
|
<component name="NewModuleRootManager">
|
||||||
|
<content url="file://$MODULE_DIR$">
|
||||||
|
<excludeFolder url="file://$MODULE_DIR$/venv" />
|
||||||
|
</content>
|
||||||
|
<orderEntry type="inheritedJdk" />
|
||||||
|
<orderEntry type="sourceFolder" forTests="false" />
|
||||||
|
</component>
|
||||||
|
</module>
|
90
mian.py
Normal file
90
mian.py
Normal file
@ -0,0 +1,90 @@
|
|||||||
|
import csv

import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.neural_network import MLPClassifier
|
||||||
|
# Input/output locations and the word used for the embedding sanity check.
TRAIN_PATH = 'train/train.tsv'
TEST_PATH = 'test-A/in.tsv'
OUTPUT_PATH = "test-A/out.tsv"
MODEL_PATH = "word2vec.model"
PROBE_WORD = 'przyjmujący'


def read_tsv(path, columns):
    """Load a tab-separated file that has no header row and name its columns.

    Args:
        path: File path (or file-like object) accepted by ``pandas.read_csv``.
        columns: Column names to assign, one per column in the file.

    Returns:
        A ``pandas.DataFrame`` holding every line of the file as a data row.
    """
    # header=None keeps the first line as data. With the default
    # header='infer' the first sample was silently swallowed as a header
    # and then overwritten by the column assignment below.
    # NOTE(review): assumes the TSV files carry no header row - confirm.
    frame = pd.read_csv(path, sep='\t', header=None)
    frame.columns = columns
    return frame


def tokenize_lower(text):
    """Split *text* into NLTK word tokens and lower-case each of them."""
    return [token.lower() for token in word_tokenize(text)]


def sum_word_vectors(tokens, keyed_vectors):
    """Return the element-wise sum of the embeddings of *tokens*.

    Args:
        tokens: Iterable of already normalised (lower-cased) tokens.
        keyed_vectors: Mapping-like embedding store (e.g. ``model.wv``) that
            raises ``KeyError`` for unknown tokens and exposes ``vector_size``.

    Returns:
        A 1-D array of length ``keyed_vectors.vector_size``. Tokens missing
        from the vocabulary are skipped; if no token is known (or *tokens* is
        empty) the result is the zero vector, so every text always yields a
        valid feature row.
    """
    total = np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    for token in tokens:
        try:
            vector = keyed_vectors[token]
        except KeyError:
            # Out-of-vocabulary token (possible for unseen test data).
            continue
        # Build a new array instead of adding in place so the vectors
        # stored inside the embedding model are never mutated.
        total = total + vector
    return total


def main():
    """Train Word2Vec + MLP on the training set and write test predictions."""
    nltk.download('punkt')

    # NOTE: in train.tsv the entries at 25706, 58881 and 73761 contain a
    # tab inside the text; it has to be replaced with 4 spaces beforehand.
    train = read_tsv(TRAIN_PATH, ["y", "x"])
    print(train["y"][0], train["x"][0])

    # https://www.geeksforgeeks.org/python-word-embedding-using-word2vec/
    # Tokenize once and reuse the result for both Word2Vec training and
    # feature extraction.
    train_tokens = [tokenize_lower(text) for text in train["x"]]
    print(train_tokens[0])

    # https://radimrehurek.com/gensim/models/word2vec.html
    model = Word2Vec(sentences=train_tokens, vector_size=100, window=5, min_count=1, workers=4)
    model.save(MODEL_PATH)

    # Quick sanity check of the learned embedding.
    print(model.wv[PROBE_WORD])
    print(model.wv.most_similar(PROBE_WORD, topn=5))

    # Each text is represented by the SUM of its word vectors (previously
    # only the last token's vector was appended by mistake).
    X = [sum_word_vectors(tokens, model.wv) for tokens in train_tokens]
    print(X[0])
    y = train["y"]

    clf = MLPClassifier()  # activation="tanh"
    clf.fit(X, y)

    # NOTE: in in.tsv the entries at 1983 and 5199 contain a tab inside the
    # text; it has to be replaced with 4 spaces beforehand.
    test = read_tsv(TEST_PATH, ["x"])
    print(test["x"][0])

    test_tokens = [tokenize_lower(text) for text in test["x"]]
    print(test_tokens[0])

    test_vectors = [sum_word_vectors(tokens, model.wv) for tokens in test_tokens]
    print(test_vectors[0])

    predictions = clf.predict(test_vectors)
    print(predictions)

    # One prediction per line, in the same order as the input rows.
    with open(OUTPUT_PATH, "w", encoding="utf-8") as out_file:
        out_file.writelines(f"{prediction}\n" for prediction in predictions)


if __name__ == "__main__":
    main()
|
5446
test-A/out.csv
Normal file
5446
test-A/out.csv
Normal file
File diff suppressed because it is too large
Load Diff
5446
test-A/out.tsv
Normal file
5446
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
98132
train/train.tsv
Normal file
98132
train/train.tsv
Normal file
File diff suppressed because it is too large
Load Diff
Binary file not shown.
BIN
word2vec.model
Normal file
BIN
word2vec.model
Normal file
Binary file not shown.
BIN
word2vec.model.syn1neg.npy
Normal file
BIN
word2vec.model.syn1neg.npy
Normal file
Binary file not shown.
BIN
word2vec.model.wv.vectors.npy
Normal file
BIN
word2vec.model.wv.vectors.npy
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user