forked from kubapok/retroc2
first scores
This commit is contained in:
parent
647c099815
commit
35b17f5a44
8
.idea/.gitignore
vendored
Normal file
8
.idea/.gitignore
vendored
Normal file
@ -0,0 +1,8 @@
|
||||
# Default ignored files
|
||||
/shelf/
|
||||
/workspace.xml
|
||||
# Datasource local storage ignored files
|
||||
/dataSources/
|
||||
/dataSources.local.xml
|
||||
# Editor-based HTTP Client requests
|
||||
/httpRequests/
|
6
.idea/misc.xml
Normal file
6
.idea/misc.xml
Normal file
@ -0,0 +1,6 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectRootManager" version="2" languageLevel="JDK_16" project-jdk-name="gpu" project-jdk-type="Python SDK">
|
||||
<output url="file://$PROJECT_DIR$/out" />
|
||||
</component>
|
||||
</project>
|
8
.idea/modules.xml
Normal file
8
.idea/modules.xml
Normal file
@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/retroc2.iml" filepath="$PROJECT_DIR$/.idea/retroc2.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
9
.idea/retroc2.iml
Normal file
9
.idea/retroc2.iml
Normal file
@ -0,0 +1,9 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="JAVA_MODULE" version="4">
|
||||
<component name="NewModuleRootManager" inherit-compiler-output="true">
|
||||
<exclude-output />
|
||||
<content url="file://$MODULE_DIR$" />
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
0
.idea/sonarlint/issuestore/index.pb
Normal file
0
.idea/sonarlint/issuestore/index.pb
Normal file
6
.idea/vcs.xml
Normal file
6
.idea/vcs.xml
Normal file
@ -0,0 +1,6 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="" vcs="Git" />
|
||||
</component>
|
||||
</project>
|
20000
dev-0/out.tsv
Normal file
20000
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
11563
dev-1/out.tsv
Normal file
11563
dev-1/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
69
spp.py
Normal file
69
spp.py
Normal file
@ -0,0 +1,69 @@
|
||||
import csv
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from scipy.sparse import vstack
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.linear_model import LinearRegression
|
||||
from stop_words import get_stop_words
|
||||
|
||||
|
||||
def to_n(word, n):
    """Return *word* cut down to at most its first *n* characters.

    Words that are already *n* characters or shorter come back unchanged,
    which is exactly what slicing does, so no explicit length check is needed.
    """
    return word[:n]
|
||||
|
||||
|
||||
def stem(sentence):
    """Crudely stem *sentence* by truncating each word to 7 characters.

    The sentence is split on whitespace, every token is shortened with
    ``to_n`` and the tokens are re-joined with single spaces.
    """
    tokens = sentence.split()
    return ' '.join(to_n(token, 7) for token in tokens)
|
||||
|
||||
|
||||
# Punctuation, brackets, quotes, the backslash and every decimal digit.
# The digit run must contain all ten digits; leaving one out lets that digit
# leak into the vocabulary.
_SPECIAL_CHARS = '.,<>)(*&^%$#@~;:!?-_=+/\\\'\"|{}[]0123456789'

# Translation table mapping each special character to "delete"; built once so
# every call is a single pass over the text.
_SPECIALS_TABLE = str.maketrans('', '', _SPECIAL_CHARS)


def remove_specials(text):
    """Return *text* with punctuation, special symbols and digits removed.

    Characters are deleted, not replaced by a space, so ``"a-b"`` becomes
    ``"ab"``. Whitespace and letters (including non-ASCII ones) are kept.
    """
    return text.translate(_SPECIALS_TABLE)
|
||||
|
||||
|
||||
# Training data: a headerless, xz-compressed TSV whose columns are named here
# as two date bounds followed by title, source and the document text.
train_columns = ['date_from', 'date_to', 'title', 'source', 'text']
df = pd.read_csv(
    'train/train.tsv.xz',
    sep='\t',
    compression='xz',
    names=train_columns,
)

# Same normalisation that is applied at prediction time:
# lower-case -> strip specials/digits -> truncate words.
df['text'] = [stem(remove_specials(doc.lower())) for doc in df['text']]

vectorizer = TfidfVectorizer(stop_words=get_stop_words('polish'))

# Every document is stacked twice so that it can be paired once with its
# `date_from` label and once with its `date_to` label.
features = vectorizer.fit_transform(df['text'])
x = vstack([features, features])

labels1 = df.pop('date_from')
labels2 = df.pop('date_to')
# Label order must mirror the stacking order above: all date_from, then all date_to.
labels = np.concatenate((labels1, labels2), axis=0)  # todo

lin_reg = LinearRegression()
lin_reg.fit(x, labels)
|
||||
# ----------------------------------------------------------------------------------------------------------------------
def predict_to_file(in_path, out_path):
    """Predict a value for every document in *in_path* and save them to *out_path*.

    The input is read as a headerless TSV whose column is named ``text``.
    Each document goes through the same normalisation as the training data
    (lower-casing, ``remove_specials``, ``stem``) before being vectorised with
    the already-fitted ``vectorizer`` and scored by ``lin_reg``.

    The output file holds exactly one prediction per line, each terminated by
    a plain ``\n``.

    Returns the array of predictions produced by ``lin_reg.predict``.
    """
    docs = pd.read_csv(in_path, sep='\t', names=['text'])
    docs['text'] = [stem(remove_specials(doc.lower())) for doc in docs['text']]

    predictions = lin_reg.predict(vectorizer.transform(docs['text']))

    # Write the lines directly instead of going through csv.writer with a
    # newline "delimiter": that trick ends the single row with the csv default
    # '\r\n', leaving a stray '\r' on the last line. newline='\n' keeps the
    # output identical on every platform.
    with open(out_path, 'w', newline='\n') as out:
        out.writelines(str(value) + '\n' for value in predictions)

    return predictions


# ----------------------------------------------------------------------------------------------------------------------
predict_to_file('dev-0/in.tsv', 'dev-0/out.tsv')
predict_to_file('test-A/in.tsv', 'test-A/out.tsv')
|
14220
test-A/out.tsv
Normal file
14220
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user