s444421
This commit is contained in:
parent
33b70ce7b1
commit
df7c995a83
20000
dev-0/out.tsv
Normal file
20000
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
11563
dev-1/out.tsv
Normal file
11563
dev-1/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
96
run.py
Normal file
96
run.py
Normal file
@ -0,0 +1,96 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# coding: utf-8
|
||||||
|
|
||||||
|
# In[59]:
|
||||||
|
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.linear_model import LinearRegression
|
||||||
|
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
|
||||||
|
from sklearn.metrics import mean_squared_error
|
||||||
|
from sklearn.pipeline import make_pipeline
|
||||||
|
|
||||||
|
|
||||||
|
# In[60]:
|
||||||
|
|
||||||
|
|
||||||
|
# Load the training set: of the TSV's columns only column 0 (named 'start',
# used below as the regression target) and column 4 (named 'text') are read.
colnames = ['start', 'text']
data = pd.read_csv('train/train.tsv', sep='\t', names=colnames, usecols=[0, 4])
# NOTE(review): read_csv uses its default quoting rules here; if the text
# column can contain bare double quotes, rows may get merged -- confirm
# against the data and consider quoting=csv.QUOTE_NONE.


# In[66]:


x_train = data['text']
y_train = data['start']


# In[67]:


# Learn the vocabulary / IDF weights and vectorize the training texts in one
# pass. The original called fit_transform(), threw the result away, and then
# ran transform() over the same corpus again; fit_transform() already returns
# that matrix, so the second pass was pure overhead.
tfidf_vectorizer = TfidfVectorizer(use_idf=True, max_df=0.95)
x_train_prepared = tfidf_vectorizer.fit_transform(x_train.values)


# In[68]:


# Ordinary least-squares regression from TF-IDF features to 'start'.
lr = LinearRegression()
model = lr.fit(x_train_prepared, y_train)
|
||||||
|
|
||||||
|
|
||||||
|
# In[69]:
|
||||||
|
|
||||||
|
|
||||||
|
# Expected values for dev-0. Loaded for reference; the statements below do
# not use them.
y_dev0_exp = pd.read_csv('dev-0/expected.tsv', sep='\t', names=['text'])

# Every raw line of the input file (trailing newline included) is treated as
# one document. The context manager closes the handle, which the original
# left open.
with open("dev-0/in.tsv", "r", encoding='utf-8') as f:
    lines_dev_0 = f.readlines()
x_dev0 = pd.DataFrame({'text': lines_dev_0})

# Vectorize with the vocabulary fitted on the training data, then predict.
x_dev0_prepared = tfidf_vectorizer.transform(x_dev0['text'].values)
y_dev0_pred = model.predict(x_dev0_prepared)

# Write one prediction per line; the file is closed even if a write fails.
with open('dev-0/out.tsv', 'w', encoding='utf-8') as file:
    file.writelines(f'{y}\n' for y in y_dev0_pred)
|
||||||
|
|
||||||
|
|
||||||
|
# In[74]:
|
||||||
|
|
||||||
|
|
||||||
|
# Expected values for dev-1. Loaded for reference; the statements below do
# not use them.
y_dev1_exp = pd.read_csv('dev-1/expected.tsv', sep='\t', names=['text'])

# Every raw line of the input file (trailing newline included) is treated as
# one document. The context manager closes the handle, which the original
# left open.
with open("dev-1/in.tsv", "r", encoding='utf-8') as f:
    lines_dev_1 = f.readlines()
x_dev1 = pd.DataFrame({'text': lines_dev_1})

# Vectorize with the vocabulary fitted on the training data, then predict.
x_dev1_prepared = tfidf_vectorizer.transform(x_dev1['text'].values)
y_dev1_pred = model.predict(x_dev1_prepared)

# Write one prediction per line; the file is closed even if a write fails.
with open('dev-1/out.tsv', 'w', encoding='utf-8') as file:
    file.writelines(f'{y}\n' for y in y_dev1_pred)
|
||||||
|
|
||||||
|
|
||||||
|
# In[76]:
|
||||||
|
|
||||||
|
|
||||||
|
# Every raw line of the test input (trailing newline included) is treated as
# one document. The context manager closes the handle, which the original
# left open.
with open("test-A/in.tsv", "r", encoding='utf-8') as f:
    lines_test = f.readlines()
x_test = pd.DataFrame({'text': lines_test})

# Vectorize with the vocabulary fitted on the training data, then predict.
x_test_prepared = tfidf_vectorizer.transform(x_test['text'].values)
y_test_pred = model.predict(x_test_prepared)

# Write one prediction per line; the file is closed even if a write fails.
with open('test-A/out.tsv', 'w', encoding='utf-8') as file:
    file.writelines(f'{y}\n' for y in y_test_pred)
|
||||||
|
|
14220
test-A/out.tsv
Normal file
14220
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user