add zadanie2
This commit is contained in:
parent
9cb2fb2612
commit
7e34b7784f
8
.idea/.gitignore
vendored
Normal file
8
.idea/.gitignore
vendored
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
# Default ignored files
|
||||||
|
/shelf/
|
||||||
|
/workspace.xml
|
||||||
|
# Datasource local storage ignored files
|
||||||
|
/dataSources/
|
||||||
|
/dataSources.local.xml
|
||||||
|
# Editor-based HTTP Client requests
|
||||||
|
/httpRequests/
|
6
.idea/misc.xml
Normal file
6
.idea/misc.xml
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="ProjectRootManager" version="2" languageLevel="JDK_16" project-jdk-name="gpu" project-jdk-type="Python SDK">
|
||||||
|
<output url="file://$PROJECT_DIR$/out" />
|
||||||
|
</component>
|
||||||
|
</project>
|
8
.idea/modules.xml
Normal file
8
.idea/modules.xml
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="ProjectModuleManager">
|
||||||
|
<modules>
|
||||||
|
<module fileurl="file://$PROJECT_DIR$/.idea/sport-text-classification-ball-ISI-public.iml" filepath="$PROJECT_DIR$/.idea/sport-text-classification-ball-ISI-public.iml" />
|
||||||
|
</modules>
|
||||||
|
</component>
|
||||||
|
</project>
|
11
.idea/sonarlint/issuestore/index.pb
Normal file
11
.idea/sonarlint/issuestore/index.pb
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
|
||||||
|
=
|
||||||
|
test-A/in.tsv,8/e/8e340683124fb2c918c0f15c14e8e793c700cb99
|
||||||
|
9
|
||||||
|
README.md,8/e/8ec9a00bfd09b3190ac6b22251dbb1aa95a0579d
|
||||||
|
<
|
||||||
|
dev-0/in.tsv,2/7/2764c02f7e906d45efc284511afb241ea2809cfa
|
||||||
|
=
|
||||||
|
dev-0/out.tsv,d/c/dca2ad27be5a52717dfbc75ce4b44f220c89908b
|
||||||
|
4
|
||||||
|
a.py,b/b/bb88d7506cfdcbc88cc950c4af72a3e28c024a77
|
9
.idea/sport-text-classification-ball-ISI-public.iml
Normal file
9
.idea/sport-text-classification-ball-ISI-public.iml
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<module type="JAVA_MODULE" version="4">
|
||||||
|
<component name="NewModuleRootManager" inherit-compiler-output="true">
|
||||||
|
<exclude-output />
|
||||||
|
<content url="file://$MODULE_DIR$" />
|
||||||
|
<orderEntry type="inheritedJdk" />
|
||||||
|
<orderEntry type="sourceFolder" forTests="false" />
|
||||||
|
</component>
|
||||||
|
</module>
|
6
.idea/vcs.xml
Normal file
6
.idea/vcs.xml
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project version="4">
|
||||||
|
<component name="VcsDirectoryMappings">
|
||||||
|
<mapping directory="" vcs="Git" />
|
||||||
|
</component>
|
||||||
|
</project>
|
51
a.py
Normal file
51
a.py
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
import csv
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
|
from sklearn.naive_bayes import MultinomialNB
|
||||||
|
from stop_words import get_stop_words
|
||||||
|
|
||||||
|
|
||||||
|
def to_n(word: str, n: int) -> str:
    """Return at most the first ``n`` characters of ``word``.

    Used as a crude stemmer: words are cut down to a fixed-length prefix.
    Words with ``n`` characters or fewer are returned unchanged.

    Args:
        word: The token to truncate.
        n: Maximum number of leading characters to keep.

    Returns:
        ``word`` itself when it is short enough, otherwise its ``n``-character
        prefix.
    """
    # Slicing already clamps to the string length, so no explicit length
    # check is needed for short words.
    return word[:n]
|
||||||
|
|
||||||
|
|
||||||
|
def stem(sentence, n=7):
    """Truncate every whitespace-separated word of ``sentence`` to ``n`` chars.

    The sentence is split on arbitrary whitespace, each word is shortened
    with ``to_n`` and the results are re-joined with single spaces, so runs
    of whitespace (and leading/trailing whitespace) are collapsed.

    Args:
        sentence: Text to process.
        n: Maximum prefix length kept for each word (defaults to 7, the value
            the script has always used).

    Returns:
        The sentence with every word cut to at most ``n`` characters.
    """
    return ' '.join(to_n(word, n) for word in sentence.split())
|
||||||
|
|
||||||
|
|
||||||
|
# Characters stripped from the text before vectorisation: ASCII punctuation
# plus every decimal digit (the original list accidentally omitted '8').
_SPECIAL_CHARS = '.,<>)(*&^%$#@~;:!?-_=+/\\\'\"|{}[]0123456789'

# Translation table built once; maps each special character to "delete".
_SPECIALS_TABLE = str.maketrans('', '', _SPECIAL_CHARS)


def remove_specials(text):
    """Return ``text`` with punctuation and digits removed.

    Every character listed in ``_SPECIAL_CHARS`` is deleted; all other
    characters (letters, whitespace, non-ASCII symbols) are kept as-is.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without any of the special characters.
    """
    # One C-level pass instead of one ``str.replace`` call per character.
    return text.translate(_SPECIALS_TABLE)
|
||||||
|
|
||||||
|
|
||||||
|
# --- Training ----------------------------------------------------------------
# Load the labelled corpus (tab-separated columns: label, text), normalise the
# text and fit a TF-IDF + multinomial naive Bayes classifier on it.
df = pd.read_csv('train/train.tsv.gz', sep='\t', compression='gzip', names=['label', 'text'])

# Same normalisation is applied to train and dev data: lower-case, strip
# punctuation/digits, then truncate every word to a fixed-length prefix.
df['text'] = [stem(remove_specials(x.lower())) for x in df['text']]

# NOTE(review): the stop-word list is passed through untruncated, while the
# text has been cut to word prefixes by ``stem`` — longer stop words will
# presumably never match. Confirm whether the list should be stemmed too.
vectorizer = TfidfVectorizer(stop_words=get_stop_words('polish'))

x = vectorizer.fit_transform(df['text'])

labels = df.pop('label')

bayes = MultinomialNB()
bayes.fit(x, labels)

# --- Evaluation on dev-0 -----------------------------------------------------
t_df = pd.read_csv('dev-0/in.tsv', sep='\t', names=['text'])
# The expected file holds one label per line, so name the column accordingly.
tlabs = pd.read_csv('dev-0/expected.tsv', sep='\t', names=['label'])

t_df['text'] = [stem(remove_specials(x.lower())) for x in t_df['text']]

# Reuse the vocabulary and IDF weights learned from the training data.
vecs = vectorizer.transform(t_df['text'])

predict = bayes.predict(vecs)

# Write one prediction per line. ``newline='\n'`` keeps the line endings
# identical on every platform (no stray '\r' characters in the output).
with open('out.tsv', 'w', newline='\n') as f:
    f.writelines(f'{label}\n' for label in predict)

# Mean accuracy on dev-0; pass the labels as a 1-D Series rather than a
# one-column DataFrame.
score = bayes.score(vecs, tlabs['label'])
print(score)
|
5452
dev-0/out.tsv
Normal file
5452
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user