s444386 run file
This commit is contained in:
parent
d3275f4da1
commit
12afc815f2
50
run.py
Normal file
50
run.py
Normal file
@ -0,0 +1,50 @@
|
||||
import lzma
|
||||
import math
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.linear_model import LinearRegression
|
||||
from sklearn.pipeline import make_pipeline
|
||||
from sklearn.metrics import mean_squared_error
|
||||
import pandas as pd
|
||||
|
||||
# Number of leading training rows used to fit the model.
TRAIN_ROWS = 50000

# Parse only the rows that are actually used: `nrows` makes pandas stop after
# TRAIN_ROWS records instead of loading the whole archive and slicing it.
# The file has no header row, so column names are supplied explicitly.
# NOTE(review): read_csv applies default quote handling here; if the text
# column can contain bare '"' characters, confirm the rows parse as intended.
with lzma.open('train/train.tsv.xz', 'rt', encoding="utf-8") as f:
    data = pd.read_csv(
        f,
        sep='\t',
        names=['Begin', 'End', 'Title', 'Publisher', 'Text'],
        nrows=TRAIN_ROWS,
    )

# Keep just the input text and the regression target.
data = data[['Text', 'Begin']]

X = data['Text']
y = data['Begin']

# TF-IDF features of the text feed a linear regression on the 'Begin' column.
model = make_pipeline(TfidfVectorizer(), LinearRegression())
model.fit(X, y)
|
||||
|
||||
def readFile(filename):
    """Return the first tab-separated field of every line in *filename*.

    The file is opened as UTF-8 text. For each line, everything before the
    first tab (or the whole line when it has no tab) is taken and stripped
    of surrounding whitespace. One entry is produced per line, in file order.
    """
    with open(filename, 'r', encoding="utf-8") as source:
        return [row.partition("\t")[0].strip() for row in source]
|
||||
|
||||
def writePred(filename, predictions):
    """Write every item of *predictions* to *filename*, one per line.

    Each item is converted with ``str`` and followed by a newline. The file
    is opened in write mode, so any existing content is replaced.
    """
    rendered = (str(value) + "\n" for value in predictions)
    with open(filename, "w") as sink:
        sink.writelines(rendered)
|
||||
|
||||
# For each evaluation split: read the input texts, run the model on them,
# and write one prediction per line to the split's output file.
for in_path, out_path in (
    ('dev-0/in.tsv', 'dev-0/out.tsv'),
    ('dev-1/in.tsv', 'dev-1/out.tsv'),
    ('test-A/in.tsv', 'test-A/out.tsv'),
):
    x = readFile(in_path)
    pred = model.predict(x)
    writePred(out_path, pred)
|
Loading…
Reference in New Issue
Block a user