"""Train a TF-IDF + linear-regression model and write predictions.

The training file is read from ``train/train.tsv.xz``; the regression target
is the midpoint of the ``Beginning`` and ``End`` columns and the feature is
the text in column ``X``.  Predictions for every evaluation directory are
written to ``<directory>/out.tsv``.
"""
import lzma
import csv
import pandas as pd


TRAIN_PATH = 'train/train.tsv.xz'
TRAIN_COLUMNS = ['Beginning', 'End', 'Title', 'Source', 'X']
EVAL_DIRS = ('dev-0', 'dev-1', 'test-A')


def readInput(dir):
    """Read a dataset file into a list with one entry per line.

    Args:
        dir: Path of the file to read.  A path ending in ``.xz`` is treated
            as an xz-compressed, tab-separated file.

    Returns:
        For ``.xz`` files, a list of lists of tab-separated fields.  For any
        other file, a list of raw lines.  The line terminator is removed in
        both cases.

    Raises:
        UnicodeDecodeError: If a ``.xz`` file is not valid UTF-8 (plain files
            are read with ``errors='ignore'`` instead).
    """
    # Decide by suffix: a bare substring test would also match paths that
    # merely contain "xz" somewhere (e.g. a directory name).
    if str(dir).endswith('.xz'):
        # newline='\n' keeps '\n' as the only line separator, exactly as when
        # iterating the underlying binary stream.
        with lzma.open(dir, 'rt', encoding='utf-8', newline='\n') as f:
            return [line.rstrip('\r\n').split('\t') for line in f]

    with open(dir, encoding='utf8', errors='ignore') as f:
        return [line.rstrip('\n') for line in f]


def _as_row(item):
    """Return *item* as a CSV row, wrapping scalars and strings in a list."""
    if isinstance(item, (str, bytes)):
        # A bare string would otherwise be split into one field per character.
        return [item]
    try:
        iter(item)
    except TypeError:
        # Scalars (e.g. the floats produced by ``model.predict``) are not
        # iterable and would make ``csv.writer`` raise.
        return [item]
    return item


def writeOutput(output, dir):
    """Write *output* to the file *dir*, one item per line.

    Args:
        output: Iterable of values to write.  Scalar or string items are
            written as single-field rows; iterable items are written as
            comma-separated rows.
        dir: Path of the file to create or overwrite.
    """
    with open(dir, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(_as_row(item) for item in output)


def _build_model():
    """Create the unfitted TF-IDF -> linear regression pipeline."""
    # Imported here so the I/O helpers above remain importable (and testable)
    # on machines without scikit-learn.
    from sklearn.linear_model import LinearRegression
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.pipeline import Pipeline

    # Pipeline steps must be (name, estimator) pairs, not bare estimators.
    return Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('regression', LinearRegression()),
    ])


def main():
    """Fit the model on the training set and predict every evaluation set."""
    train = pd.DataFrame(readInput(TRAIN_PATH), columns=TRAIN_COLUMNS)
    # Target is the midpoint of the two bounds, computed column-wise.
    target = (train['Beginning'].astype(float)
              + train['End'].astype(float)) / 2

    model = _build_model()
    model.fit(train['X'], target)

    for name in EVAL_DIRS:
        documents = readInput(name + '/in.tsv')
        writeOutput(model.predict(documents), name + '/out.tsv')


if __name__ == '__main__':
    main()