#!/usr/bin/env python
# coding: utf-8
|
|
|
|
# In[1]:
|
|
|
|
|
|
import lzma
|
|
from sklearn.linear_model import LinearRegression
|
|
from sklearn.pipeline import make_pipeline
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
from sklearn.metrics import mean_squared_error
|
|
import pandas as pd
|
|
|
|
|
|
# In[13]:
|
|
|
|
|
|
# Load the xz-compressed training set as UTF-8 text. Column names are passed
# explicitly via `names`, so pandas treats the first row of the file as data
# rather than as a header.
with lzma.open('train/train.tsv.xz', 'rt', encoding="utf-8") as f:
    df = pd.read_csv(
        f,
        sep='\t',
        names=['Begin', 'End', 'Title', 'Publisher', 'Text'],
    )
|
|
|
|
|
|
# In[3]:
|
|
|
|
|
|
def readFile(filename):
    """Return the first tab-separated field of every line in *filename*.

    The file is read as UTF-8 text. Each line is cut at its first tab
    character and the leading part is stripped of surrounding whitespace
    (including the trailing newline).

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with exactly one string per line of the file, in file order.
        A blank line, or a line starting with a tab, yields an empty string,
        so the result stays aligned with the input lines.
    """
    with open(filename, 'r', encoding="utf-8") as f:
        # maxsplit=1: only the first field is needed, so the rest of the
        # line is not split further.
        return [line.split("\t", 1)[0].strip() for line in f]
|
|
|
|
|
|
# In[11]:
|
|
|
|
|
|
def predict(filename, predictions):
    """Write *predictions* to *filename*, one value per line.

    Each item is converted with ``str`` and followed by a newline. The file
    is opened in write mode, so existing content is replaced.

    Args:
        filename: Path of the output file.
        predictions: Iterable of values to write, in order.
    """
    # Explicit UTF-8 keeps the output encoding consistent with readFile
    # instead of depending on the platform default.
    with open(filename, "w", encoding="utf-8") as f:
        f.writelines(str(p) + "\n" for p in predictions)
|
|
|
|
|
|
# In[15]:
|
|
|
|
|
|
# Keep only the two columns used for training: the raw text (feature) and
# the 'Begin' value (regression target).
df = df[['Text', 'Begin']]
X_train = df['Text']
y_train = df['Begin']

# The pipeline turns raw text into TF-IDF vectors and fits a linear
# regression on them, so the fitted model accepts plain strings at
# prediction time.
model = make_pipeline(TfidfVectorizer(), LinearRegression())
model.fit(X_train, y_train)
|
|
|
|
|
|
# In[ ]:
|
|
|
|
|
|
# (input path, output path) pairs: each input file is read, scored with the
# fitted model, and the predictions are written to the matching output file.
filenames = [
    ('dev-0/in.tsv', "dev-0/out.tsv"),
    ('dev-1/in.tsv', "dev-1/out.tsv"),
    ('test-A/in.tsv', 'test-A/out.tsv'),
]
for in_path, out_path in filenames:
    texts = readFile(in_path)
    y_predict = model.predict(texts)
    predict(out_path, y_predict)
|
|
|