forked from kubapok/retroc2
final
This commit is contained in:
parent 647c099815
commit eeb51fec1d
1951
.ipynb_checkpoints/model-checkpoint.ipynb
Normal file
File diff suppressed because it is too large
49
.ipynb_checkpoints/run-checkpoint.py
Normal file
@@ -0,0 +1,49 @@
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


with open('train/train.tsv', 'r', encoding='utf8') as file:
    train_data = pd.read_csv(file, sep='\t', names=['Begin', 'End', 'Title', 'Publisher', 'Text'])


def readFile(filename):
    result = []
    with open(filename, 'r', encoding="utf-8") as file:
        for line in file:
            text = line.split("\t")[0].strip()
            result.append(text)
    return result


def write_pred(filename, predictions):
    with open(filename, "w") as file:
        for pred in predictions:
            file.write(str(pred) + "\n")


train_data = train_data[:10000]

X = train_data['Text']
Y = train_data['Begin']


model = make_pipeline(TfidfVectorizer(), LinearRegression())
model.fit(X, Y)


dev_0 = readFile('dev-0/in.tsv')
predict_dev_0 = model.predict(dev_0)
write_pred('dev-0/out.tsv', predict_dev_0)

dev_1 = readFile('dev-1/in.tsv')
predict_dev_1 = model.predict(dev_1)
write_pred('dev-1/out.tsv', predict_dev_1)

test_A = readFile('test-A/in.tsv')
predict_test_A = model.predict(test_A)
write_pred('test-A/out.tsv', predict_test_A)
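The checkpoint script imports mean_squared_error but never calls it, so the dev sets are never scored. A minimal evaluation sketch (not part of this commit; it assumes a dev-0/expected.tsv with one gold 'Begin' value per line, and continues from the variables defined above):

import math
from sklearn.metrics import mean_squared_error

# Read the gold targets for dev-0 (assumed file layout: one value per line).
with open('dev-0/expected.tsv', 'r', encoding='utf-8') as f:
    expected_dev_0 = [float(line.strip()) for line in f]

# Root-mean-squared error between the gold values and the predictions written above.
rmse = math.sqrt(mean_squared_error(expected_dev_0, predict_dev_0))
print(f'dev-0 RMSE: {rmse:.4f}')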
20000
dev-0/.ipynb_checkpoints/expected-checkpoint.tsv
Normal file
File diff suppressed because it is too large
20000
dev-0/.ipynb_checkpoints/in-checkpoint.tsv
Normal file
File diff suppressed because one or more lines are too long
20000
dev-0/meta.tsv
Normal file
File diff suppressed because it is too large
20000
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
11563
dev-1/.ipynb_checkpoints/expected-checkpoint.tsv
Normal file
File diff suppressed because it is too large
11563
dev-1/.ipynb_checkpoints/in-checkpoint.tsv
Normal file
File diff suppressed because one or more lines are too long
11563
dev-1/meta.tsv
Normal file
File diff suppressed because it is too large
11563
dev-1/out.tsv
Normal file
File diff suppressed because it is too large
118
model.ipynb
Normal file
@@ -0,0 +1,118 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 99,
   "id": "8f5480f9-fa82-4150-acff-9309fdc43690",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "107463\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),\n",
       "                ('linearregression', LinearRegression())])"
      ]
     },
     "execution_count": 99,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.metrics import mean_squared_error\n",
    "\n",
    "with open('train/train.tsv', 'r', encoding='utf8') as file:\n",
    "    train_data = pd.read_csv(file, sep='\\t', names=['Begin', 'End', 'Title', 'Publisher', 'Text'])\n",
    "\n",
    "print(len(train_data)) \n",
    "train_data = train_data[:10000]\n",
    " \n",
    "X = train_data['Text']\n",
    "Y = train_data['Begin']\n",
    "\n",
    "\n",
    "model = make_pipeline(TfidfVectorizer(), LinearRegression())\n",
    "model.fit(X, Y)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "id": "02e89f1c-a2d0-4d41-94a2-aa86b257069d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def readFile(filename):\n",
    "    result = []\n",
    "    with open(filename, 'r', encoding=\"utf-8\") as file:\n",
    "        for line in file:\n",
    "            text = line.split(\"\\t\")[0].strip()\n",
    "            result.append(text)\n",
    "    return result\n",
    "\n",
    "def write_pred(filename, predictions):\n",
    "    with open(filename, \"w\") as file:\n",
    "        for pred in predictions:\n",
    "            file.write(str(pred) + \"\\n\")\n",
    " \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "id": "b85f5e22-eafb-41ee-aa2c-20c338d42701",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "dev_0 = readFile('dev-0/in.tsv')\n",
    "predict_dev_0 = model.predict(dev_0)\n",
    "write_pred('dev-0/out.tsv', predict_dev_0)\n",
    "\n",
    "dev_1 = readFile('dev-1/in.tsv')\n",
    "predict_dev_1 = model.predict(dev_1)\n",
    "write_pred('dev-1/out.tsv', predict_dev_1)\n",
    "\n",
    "test_A = readFile('test-A/in.tsv')\n",
    "predict_test_A = model.predict(test_A)\n",
    "write_pred('test-A/out.tsv', predict_test_A)\n",
    "\n",
    "\n",
    "\n",
    "\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
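The notebook prints the full training size (107463 rows) and then keeps only the first 10,000 rows before fitting, presumably to keep the TF-IDF plus LinearRegression fit quick. A hedged alternative sketch (an assumption, not what the notebook does): train on the whole file but cap the TF-IDF vocabulary so the feature matrix stays manageable.

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

# Same TSV layout as in the notebook; no row truncation this time.
train_data = pd.read_csv('train/train.tsv', sep='\t',
                         names=['Begin', 'End', 'Title', 'Publisher', 'Text'])

# max_features is an illustrative cap, not a tuned value.
model = make_pipeline(TfidfVectorizer(max_features=50_000), LinearRegression())
model.fit(train_data['Text'], train_data['Begin'])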
49
run.py
Normal file
@@ -0,0 +1,49 @@
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Training data: tab-separated, no header row; 'Begin' is used below as the regression target.
with open('train/train.tsv', 'r', encoding='utf8') as file:
    train_data = pd.read_csv(file, sep='\t', names=['Begin', 'End', 'Title', 'Publisher', 'Text'])


def readFile(filename):
    # Read the first (text) column of a TSV file into a list of strings.
    result = []
    with open(filename, 'r', encoding="utf-8") as file:
        for line in file:
            text = line.split("\t")[0].strip()
            result.append(text)
    return result


def write_pred(filename, predictions):
    # Write one prediction per line.
    with open(filename, "w") as file:
        for pred in predictions:
            file.write(str(pred) + "\n")


# Keep only the first 10,000 rows to speed up training.
train_data = train_data[:10000]

X = train_data['Text']
Y = train_data['Begin']

# TF-IDF features fed into a linear regression on the 'Begin' column.
model = make_pipeline(TfidfVectorizer(), LinearRegression())
model.fit(X, Y)


dev_0 = readFile('dev-0/in.tsv')
predict_dev_0 = model.predict(dev_0)
write_pred('dev-0/out.tsv', predict_dev_0)

dev_1 = readFile('dev-1/in.tsv')
predict_dev_1 = model.predict(dev_1)
write_pred('dev-1/out.tsv', predict_dev_1)

test_A = readFile('test-A/in.tsv')
predict_test_A = model.predict(test_A)
write_pred('test-A/out.tsv', predict_test_A)
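One possible refinement, offered purely as a sketch rather than anything present in run.py: a linear model can predict values outside the range seen in the training targets, so the predictions could be clipped to that range before being written out. Continuing from the variables defined in run.py above:

import numpy as np

# Clip test-A predictions to the [min, max] of the training targets (illustrative only).
lo, hi = Y.min(), Y.max()
predict_test_A = np.clip(predict_test_A, lo, hi)
write_pred('test-A/out.tsv', predict_test_A)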
14220
test-A/.ipynb_checkpoints/in-checkpoint.tsv
Normal file
File diff suppressed because one or more lines are too long
14220
test-A/out.tsv
Normal file
File diff suppressed because it is too large
107471
train/.ipynb_checkpoints/meta-checkpoint.tsv
Normal file
File diff suppressed because it is too large
107471
train/.ipynb_checkpoints/train-checkpoint.tsv
Normal file
File diff suppressed because one or more lines are too long
107471
train/meta.tsv
Normal file
File diff suppressed because it is too large
107471
train/train.tsv
Normal file
File diff suppressed because one or more lines are too long