add zadanie2

2021-05-12 22:06:57 +02:00 · 2021-05-12 22:06:57 +02:00 · 7e34b7784f
commit 7e34b7784f
parent 9cb2fb2612
14 changed files with 5551 additions and 0 deletions
--- a/.idea/.gitignore
+++ b/.idea/.gitignore
@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
+# Editor-based HTTP Client requests
+/httpRequests/
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" languageLevel="JDK_16" project-jdk-name="gpu" project-jdk-type="Python SDK">
+    <output url="file://$PROJECT_DIR$/out" />
+  </component>
+</project>
--- a/.idea/modules.xml
+++ b/.idea/modules.xml
@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/sport-text-classification-ball-ISI-public.iml" filepath="$PROJECT_DIR$/.idea/sport-text-classification-ball-ISI-public.iml" />
+    </modules>
+  </component>
+</project>
--- a/.idea/sonarlint/issuestore/2/7/2764c02f7e906d45efc284511afb241ea2809cfa
+++ b/.idea/sonarlint/issuestore/2/7/2764c02f7e906d45efc284511afb241ea2809cfa
--- a/.idea/sonarlint/issuestore/8/e/8e340683124fb2c918c0f15c14e8e793c700cb99
+++ b/.idea/sonarlint/issuestore/8/e/8e340683124fb2c918c0f15c14e8e793c700cb99
--- a/.idea/sonarlint/issuestore/8/e/8ec9a00bfd09b3190ac6b22251dbb1aa95a0579d
+++ b/.idea/sonarlint/issuestore/8/e/8ec9a00bfd09b3190ac6b22251dbb1aa95a0579d
--- a/.idea/sonarlint/issuestore/b/b/bb88d7506cfdcbc88cc950c4af72a3e28c024a77
+++ b/.idea/sonarlint/issuestore/b/b/bb88d7506cfdcbc88cc950c4af72a3e28c024a77
--- a/.idea/sonarlint/issuestore/d/c/dca2ad27be5a52717dfbc75ce4b44f220c89908b
+++ b/.idea/sonarlint/issuestore/d/c/dca2ad27be5a52717dfbc75ce4b44f220c89908b
--- a/.idea/sonarlint/issuestore/index.pb
+++ b/.idea/sonarlint/issuestore/index.pb
@ -0,0 +1,11 @@
+
+=
+
test-A/in.tsv,8/e/8e340683124fb2c918c0f15c14e8e793c700cb99
+9
+	README.md,8/e/8ec9a00bfd09b3190ac6b22251dbb1aa95a0579d
+<
+dev-0/in.tsv,2/7/2764c02f7e906d45efc284511afb241ea2809cfa
+=
+
dev-0/out.tsv,d/c/dca2ad27be5a52717dfbc75ce4b44f220c89908b
+4
+a.py,b/b/bb88d7506cfdcbc88cc950c4af72a3e28c024a77
--- a/.idea/sport-text-classification-ball-ISI-public.iml
+++ b/.idea/sport-text-classification-ball-ISI-public.iml
@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="JAVA_MODULE" version="4">
+  <component name="NewModuleRootManager" inherit-compiler-output="true">
+    <exclude-output />
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="" vcs="Git" />
+  </component>
+</project>
--- a/a.py
+++ b/a.py
@ -0,0 +1,51 @@
+import csv
+import pandas as pd
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.naive_bayes import MultinomialNB
+from stop_words import get_stop_words
+
+
+def to_n(word, n):
+    """Return at most the first ``n`` characters of ``word``.
+
+    A word no longer than ``n`` characters is returned unchanged.
+    """
+    # Slicing already yields the whole word when it is shorter than ``n``.
+    return word[:n]
+
+
+def stem(sentence):
+    """Crudely stem ``sentence`` by cutting every word down to 7 characters.
+
+    The sentence is split on whitespace, each word is truncated with
+    ``to_n`` and the results are re-joined with single spaces.
+    """
+    truncated = (to_n(token, 7) for token in sentence.split())
+    return ' '.join(truncated)
+
+
+# Punctuation, symbols and all ten digits that are stripped from the text.
+_SPECIALS_TABLE = str.maketrans('', '', '.,<>)(*&^%$#@~;:!?-_=+/\\\'"|{}[]0123456789')
+
+
+def remove_specials(text):
+    """Return ``text`` with punctuation, symbols and digits removed.
+
+    Every occurrence of a character from the special-character set is
+    deleted (not replaced by a space); all other characters, including
+    whitespace and non-ASCII letters, are kept unchanged.
+    """
+    # One translate pass instead of a chain of per-character replace calls.
+    return text.translate(_SPECIALS_TABLE)
+
+
+# Training data: gzipped TSV without a header row, label column first, text second.
+df = pd.read_csv('train/train.tsv.gz', sep='\t', compression='gzip', names=['label', 'text'])
+
+# Normalise every document: lowercase, strip special characters, truncate words.
+df['text'] = [stem(remove_specials(x.lower())) for x in df['text']]
+
+vectorizer = TfidfVectorizer(stop_words=get_stop_words('polish'))
+
+# Fit the TF-IDF vocabulary on the training texts only.
+x = vectorizer.fit_transform(df['text'])
+
+labels = df.pop('label')
+
+bayes = MultinomialNB()
+bayes.fit(x, labels)
+# ----------------------------------------------------------------------------------------------------------------------
+# Evaluation on the dev-0 split: inputs and expected labels live in separate files.
+t_df = pd.read_csv('dev-0/in.tsv', sep='\t', names=['text'])
+tlabs = pd.read_csv('dev-0/expected.tsv', sep='\t', names=['label'])
+
+# Apply exactly the same normalisation as was used for training.
+t_df['text'] = [stem(remove_specials(x.lower())) for x in t_df['text']]
+
+# Reuse the fitted vectorizer so dev features share the training vocabulary.
+vecs = vectorizer.transform(t_df['text'])
+
+predict = bayes.predict(vecs)
+# Write one prediction per line. newline='' together with lineterminator='\n'
+# keeps the csv module from emitting '\r\n', which previously left a stray
+# '\r' after the last prediction.
+with open('out.tsv', 'w', newline='') as f:
+    tsvf = csv.writer(f, lineterminator='\n')
+    tsvf.writerows([label] for label in predict)
+# Score against the 1-D label column rather than the whole DataFrame.
+score = bayes.score(vecs, tlabs['label'])
+print(score)
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/BIN
+++ b/BIN