retroc2/run.py
Adrian Charkiewicz f69ed316f2 s444354 larger_set
2022-05-14 03:41:15 +02:00

62 lines
1.2 KiB
Python

#!/usr/bin/env python
# coding: utf-8
# In[1]:
import lzma
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
import pandas as pd
# In[13]:
# Load the xz-compressed training TSV into a DataFrame. Column names are
# supplied here, so the first line of the file is read as data, not a header.
with lzma.open('train/train.tsv.xz', mode='rt', encoding="utf-8") as train_file:
    df = pd.read_csv(
        train_file,
        sep='\t',
        header=None,
        names=['Begin', 'End', 'Title', 'Publisher', 'Text'],
    )
# In[3]:
def readFile(filename):
    """Return the first tab-separated field of every line in *filename*.

    The file is read as UTF-8 text. For each line, everything from the first
    tab onwards is discarded and the remaining field is stripped of
    surrounding whitespace (including the trailing newline).

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of strings, one per line of the file and in file order. Blank
        lines and lines starting with a tab yield an empty string, so the
        result always has as many entries as the file has lines.
    """
    with open(filename, 'r', encoding="utf-8") as f:
        # maxsplit=1: only the first field is needed, so do not split the rest.
        return [line.split("\t", 1)[0].strip() for line in f]
# In[11]:
def predict(filename, predictions):
    """Write *predictions* to *filename*, one value per line.

    Despite its name, this function performs no inference: it only
    serialises values that have already been computed. Each item is converted
    with ``str()`` and followed by a newline. An existing file is overwritten.

    Args:
        filename: Path of the output file to create or overwrite.
        predictions: Iterable of values to write, in order.
    """
    # Explicit UTF-8 keeps the output independent of the platform's locale
    # default and matches the encoding used when reading the input files.
    with open(filename, "w", encoding="utf-8") as f:
        f.writelines(str(p) + "\n" for p in predictions)
# In[15]:
# Train a TF-IDF + linear-regression pipeline that maps the raw `Text` column
# to the numeric value in the `Begin` column.
model = make_pipeline(TfidfVectorizer(), LinearRegression())
df = df[['Text', 'Begin']]  # keep only the feature and target columns
X_train, y_train = df['Text'], df['Begin']
model.fit(X_train, y_train)
# In[ ]:
# For every evaluation split: read the input texts, run the fitted pipeline
# on them, and write one prediction per line to the paired output file.
filenames = [
    ('dev-0/in.tsv', "dev-0/out.tsv"),
    ('dev-1/in.tsv', "dev-1/out.tsv"),
    ('test-A/in.tsv', 'test-A/out.tsv'),
]
for in_path, out_path in filenames:
    texts = readFile(in_path)
    y_predict = model.predict(texts)
    predict(out_path, y_predict)