???

2024-05-17 21:36:24 +02:00 · 2024-05-17 21:36:24 +02:00 · c8d7e1452c
commit c8d7e1452c
parent 8202edc3ba
15 changed files with 109171 additions and 2 deletions
--- a/.idea/.gitignore
+++ b/.idea/.gitignore
@ -0,0 +1,3 @@
 # Default ignored files
 /shelf/
 /workspace.xml
--- a/.idea/inspectionProfiles/Project_Default.xml
+++ b/.idea/inspectionProfiles/Project_Default.xml
@ -0,0 +1,24 @@
 <component name="InspectionProjectProfileManager">
  <profile version="1.0">
    <option name="myName" value="Project Default" />
    <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
      <option name="ignoredErrors">
        <list>
          <option value="N802" />
        </list>
      </option>
    </inspection_tool>
    <inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true">
      <option name="ignoredIdentifiers">
        <list>
          <option value="main.PRAWO" />
        </list>
      </option>
    </inspection_tool>
    <inspection_tool class="SpellCheckingInspection" enabled="false" level="TYPO" enabled_by_default="false">
      <option name="processCode" value="true" />
      <option name="processLiterals" value="true" />
      <option name="processComments" value="true" />
    </inspection_tool>
  </profile>
 </component>
--- a/.idea/inspectionProfiles/profiles_settings.xml
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@ -0,0 +1,6 @@
 <component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
 </component>
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -0,0 +1,4 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (sport-text-classification-ball-isi-public)" project-jdk-type="Python SDK" />
 </project>
--- a/.idea/modules.xml
+++ b/.idea/modules.xml
@ -0,0 +1,8 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/sport-text-classification-ball-isi-public.iml" filepath="$PROJECT_DIR$/.idea/sport-text-classification-ball-isi-public.iml" />
    </modules>
  </component>
 </project>
--- a/.idea/sport-text-classification-ball-isi-public.iml
+++ b/.idea/sport-text-classification-ball-isi-public.iml
@ -0,0 +1,10 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$">
      <excludeFolder url="file://$MODULE_DIR$/venv" />
    </content>
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
 </module>
--- a/mian.py
+++ b/mian.py
@ -0,0 +1,90 @@
 import csv
 import nltk
 import pandas as pd
 from sklearn.neural_network import MLPClassifier
 from nltk.tokenize import word_tokenize
 from gensim.models import Word2Vec
 import numpy as np  # zero-vector fallback for texts without any known token


 def tokenize_lower(text):
     """Split *text* into NLTK word tokens and lower-case each of them."""
     return [token.lower() for token in word_tokenize(text)]


 def sum_known_vectors(tokens, vectors):
     """Return the element-wise sum of the embeddings of *tokens*.

     Tokens that are missing from *vectors* are skipped.  When no token is
     known (this includes an empty token list) a zero vector of length
     ``vectors.vector_size`` is returned, so every text produces a feature
     row of the same shape.
     """
     total = np.zeros(vectors.vector_size, dtype=np.float32)
     for token in tokens:
         if token in vectors:
             total += vectors[token]
     return total


 nltk.download('punkt')

 # In train.tsv, at 25706, 58881 and 73761 (the original note calls these
 # "columns"; presumably line numbers - verify), a tab inside the text has to
 # be replaced with 4 spaces.
 # NOTE(review): read_csv treats the first line as a header by default; if
 # train.tsv has no header row the first sample is dropped - confirm.
 train = pd.read_csv('train/train.tsv', sep='\t')
 train.columns = ["y", "x"]
 print(train["y"][0], train["x"][0])

 # https://www.geeksforgeeks.org/python-word-embedding-using-word2vec/
 slowa = [tokenize_lower(tekst) for tekst in train["x"]]
 print(slowa[0])

 # https://radimrehurek.com/gensim/models/word2vec.html
 model = Word2Vec(sentences=slowa, vector_size=100, window=5, min_count=1, workers=4)
 model.save("word2vec.model")

 # Sanity check of the trained embeddings; the lookup raises KeyError when
 # the probe word does not occur in the training texts.
 wektor = model.wv['przyjmujący']
 print(wektor)
 podobne = model.wv.most_similar('przyjmujący', topn=5)
 print(podobne)

 # One feature row per training text: the sum of its word vectors.  The
 # accumulated sum is stored, not the vector of the last token only.
 teksty = [sum_known_vectors(tokeny, model.wv) for tokeny in slowa]
 print(teksty[0])
 X = teksty
 y = train["y"]
 clf = MLPClassifier()   # activation="tanh"
 clf.fit(X, y)

 # In in.tsv, at 1983 and 5199 (again called "columns" in the original note;
 # presumably line numbers - verify), a tab inside the text has to be
 # replaced with 4 spaces.
 # NOTE(review): the same header caveat as for train.tsv applies here.
 test = pd.read_csv('test-A/in.tsv', sep='\t')
 test.columns = ["x"]
 print(test["x"][0])

 # https://www.geeksforgeeks.org/python-word-embedding-using-word2vec/
 slowa = [tokenize_lower(tekst) for tekst in test["x"]]
 print(slowa[0])

 # Test texts may contain words unseen during training; those are skipped
 # and a text without any known word becomes a zero vector.
 teksty = [sum_known_vectors(tokeny, model.wv) for tokeny in slowa]
 print(teksty[0])

 przewidywania = clf.predict(teksty)
 print(przewidywania)

 # One predicted label per line, in the order of the input rows.
 with open("test-A/out.tsv", "w", encoding="utf-8") as uwu:
     uwu.writelines(str(p) + "\n" for p in przewidywania)
--- a/test-A/out.csv
+++ b/test-A/out.csv
--- a/test-A/out.tsv
+++ b/test-A/out.tsv
--- a/train/train.tsv
+++ b/train/train.tsv
--- a/train/train.tsv.gz
+++ b/train/train.tsv.gz
--- a/word2vec.model
+++ b/word2vec.model
--- a/word2vec.model.syn1neg.npy
+++ b/word2vec.model.syn1neg.npy
--- a/word2vec.model.wv.vectors.npy
+++ b/word2vec.model.wv.vectors.npy