ten thousand

This commit is contained in:
korne 2022-05-17 22:07:26 +02:00
parent 647c099815
commit 5389bd1d5b
8 changed files with 176830 additions and 0 deletions

View File

@ -0,0 +1,285 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 15,
"id": "greenhouse-technician",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sklearn\n",
"import pandas as pd\n",
"from gzip import open as open_gz\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.metrics import mean_squared_error"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "acoustic-dividend",
"metadata": {},
"outputs": [],
"source": [
"def predict_year(x, path_out, model):\n",
"    \"\"\"Predict a value for every item of `x` and write one prediction per line.\n",
"\n",
"    Args:\n",
"        x: inputs passed unchanged to `model.predict`.\n",
"        path_out: path of the text file to create or overwrite.\n",
"        model: any object exposing `predict(x)` that returns an iterable.\n",
"    \"\"\"\n",
"    results = model.predict(x)\n",
"\n",
"    # Explicit encoding keeps the output independent of the platform locale.\n",
"    with open(path_out, 'wt', encoding='utf8') as file:\n",
"        file.writelines(str(r) + '\\n' for r in results)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "senior-harassment",
"metadata": {},
"outputs": [],
"source": [
"# Load the headerless, tab-separated training file with explicit column names.\n",
"with open('train.tsv', 'r', encoding='utf8') as file:\n",
"    train = pd.read_csv(\n",
"        file,\n",
"        sep='\\t',\n",
"        names=['Begin', 'End', 'Title', 'Publisher', 'Text'],\n",
"    )\n",
"\n",
"# Only the first 2000 rows are used: 'Text' is the feature, 'Begin' the target.\n",
"train = train.iloc[:2000]\n",
"train_x = train['Text']\n",
"train_y = train['Begin']"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "beneficial-traveler",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 nowią część kultury. U nas już nikt ich nie ch...\n",
"1 hlstorja znana w okresie piramid, jak wlaśclcl...\n",
"2 działek. Idąc dalej w swych hipotetycznych roz...\n",
"3 w Warszawie o stosunkach domowych dziatwy szko...\n",
"4 \\\\'iykład: \"Cywilizacyjna Koncepcja dziejów ¥e...\n",
" ... \n",
"1995 i' padną dobitne rozkazy. Od modlitwy nie poni...\n",
"1996 WOJEWÖDZKI roz. 28 N'r. 7 III. IW slüsUlnlku d...\n",
"1997 główną wagę kłaść należy na wspomniane w Bibli...\n",
"1998 dniu 1'3 tym marca rozchwytywali broil z miejs...\n",
"1999 ubezpieczenie społeczne, po rozpoznaniu na pos...\n",
"Name: Text, Length: 2000, dtype: object"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the training texts (the 'Text' column selected above).\n",
"train_x"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "polyphonic-coach",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Pipeline(memory=None,\n",
" steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
" dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',\n",
" lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
" ngram_range=(1, 1), norm='l2', preprocessor=None, smooth...ression', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,\n",
" normalize=False))])"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# TF-IDF text features feeding a linear regression on the target year.\n",
"model = make_pipeline(\n",
"    TfidfVectorizer(),\n",
"    LinearRegression(),\n",
")\n",
"model.fit(train_x, train_y)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "varying-wright",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 Gazet, a tam o osobie zamformuie się. Uwiadomi...\n",
"1 materiały, która wyniesie na rok w przybliżeni...\n",
"2 były nawet w posiadaniu miejscowego polskiego ...\n",
"3 Usuwanie nawarstwień... 105 powania nieudowodn...\n",
"4 nie słyszał odC .S' źnniefAle'ObJ—A.\" \"hOdZI Ś...\n",
" ... \n",
"19993 wypoczęci! wzmocnieni, pełni najiepszych chęci...\n",
"19994 ten krok draltyczny, ahby na tirazniejllzej je...\n",
"19995 47 ust. 1 pkt 2 tej ustawy, obowiązkiem nałożo...\n",
"19996 w, mmm ”w\" w „|. ..no-ń r. .I. Lennobrovi jako...\n",
"19997 lat, przy czym za kolejny rok wnoszona jest do...\n",
"Name: 0, Length: 19998, dtype: object"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read the headerless, tab-separated dev inputs and keep only column 0.\n",
"with open('devin.tsv', 'r', encoding='utf8') as file:\n",
"    x_dev = pd.read_csv(file, sep='\\t', header=None)[0]\n",
"# NOTE(review): the stored output reports 19998 rows - confirm this equals the\n",
"# number of lines in devin.tsv (read_csv's default quote handling can merge lines).\n",
"x_dev"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "frozen-ticket",
"metadata": {},
"outputs": [],
"source": [
"# Write one prediction per dev input to devout.tsv.\n",
"predict_year(x=x_dev, path_out='devout.tsv', model=model)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "traditional-amount",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 1862.531349\n",
"1 1962.791429\n",
"2 1950.953131\n",
"3 1965.496217\n",
"4 1920.848072\n",
" ... \n",
"19993 1914.228771\n",
"19994 1902.264257\n",
"19995 2009.252595\n",
"19996 1918.643586\n",
"19997 1963.890277\n",
"Name: 0, Length: 19998, dtype: float64"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One prediction per line. The separator must be a real tab ('\\t'), not the two\n",
"# characters '/t', which the python engine would treat as a regex to split on.\n",
"y_dev = pd.read_csv('devout.tsv', header=None, sep='\\t', engine='python')\n",
"y_dev = y_dev[0]\n",
"y_dev"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "cordless-maker",
"metadata": {},
"outputs": [],
"source": [
"# Expected values, one per line; the separator is a real tab ('\\t'), not '/t'.\n",
"y_dev_exp = pd.read_csv('expected.tsv', header=None, sep='\\t', engine='python')\n",
"# NOTE(review): truncating to the number of predictions hides a row-count mismatch\n",
"# between expected.tsv and the predictions - confirm the rows are still aligned.\n",
"y_dev_exp = y_dev_exp[0:len(y_dev)]\n",
"y_dev_exp = y_dev_exp[0]"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "authorized-basics",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3631.1358243407444"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# mean_squared_error returns the MSE; take the square root to obtain the RMSE.\n",
"RMSE_dev = mean_squared_error(y_dev_exp, y_dev) ** 0.5\n",
"RMSE_dev"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "congressional-requirement",
"metadata": {},
"outputs": [],
"source": [
"# Headerless, tab-separated inputs; only column 0 is used.\n",
"# The separator must be a real tab ('\\t'): the literal '/t' would split any text\n",
"# that happens to contain those two characters.\n",
"x_dev0 = pd.read_csv('dev-0/in.tsv', header=None, sep='\\t', engine='python')[0]\n",
"x_dev1 = pd.read_csv('dev-1/in.tsv', header=None, sep='\\t', engine='python')[0]\n",
"x_test = pd.read_csv('test-A/in.tsv', header=None, sep='\\t', engine='python')[0]\n",
"# Backward compatibility: x_dev stays bound to the dev-1 inputs, which is what a\n",
"# sequential double assignment to x_dev (dev-0, then dev-1) ends up with.\n",
"x_dev = x_dev1"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "close-clinton",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "official-sweet",
"metadata": {},
"outputs": [],
"source": [
"#evaluation(x_dev,'dev-0/out.tsv', model)\n",
"#evaluation(x_test,'test-A/out.tsv', model)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

20000
dev-0/out.tsv Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

11562
dev-1/out.tsv Normal file

File diff suppressed because it is too large Load Diff

167
retroc2.ipynb Normal file
View File

@ -0,0 +1,167 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "greenhouse-technician",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sklearn\n",
"import pandas as pd\n",
"from gzip import open as open_gz\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.metrics import mean_squared_error"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "acoustic-dividend",
"metadata": {},
"outputs": [],
"source": [
"def predict_year(x, path_out, model):\n",
"    \"\"\"Run `model.predict` on `x` and write each prediction on its own line of `path_out`.\"\"\"\n",
"    predictions = model.predict(x)\n",
"    with open(path_out, 'wt') as out_file:\n",
"        out_file.writelines(str(value) + '\\n' for value in predictions)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "senior-harassment",
"metadata": {},
"outputs": [],
"source": [
"# Load the headerless, tab-separated training file with explicit column names.\n",
"with open('train/train.tsv', 'r', encoding='utf8') as file:\n",
"    train = pd.read_csv(file, sep='\\t', names=['Date1', 'Date2', 'Title', 'Author', 'Text'])\n",
"\n",
"# Copy the slice so that adding the 'Date' column below writes to an independent\n",
"# frame instead of a slice of the full one (avoids SettingWithCopyWarning).\n",
"train = train[0:10000].copy()\n",
"train_x = train['Text']\n",
"# Regression target: midpoint of the Date1/Date2 columns.\n",
"train['Date'] = (train['Date1'].astype(float) + train['Date2'].astype(float)) / 2\n",
"train_y = train['Date']"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "polyphonic-coach",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),\n",
" ('linearregression', LinearRegression())])"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# TF-IDF text features feeding a linear regression on the target date.\n",
"model = make_pipeline(\n",
"    TfidfVectorizer(),\n",
"    LinearRegression(),\n",
")\n",
"model.fit(train_x, train_y)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "varying-wright",
"metadata": {},
"outputs": [],
"source": [
"# Read exactly one text per physical line so that the number of predictions always\n",
"# equals the number of input lines. pd.read_csv with default quoting can merge\n",
"# lines containing unbalanced double quotes and skips blank lines; padding the\n",
"# Series with dummy rows would only fix the row count, not the row alignment.\n",
"with open('dev-0/in.tsv', 'r', encoding='utf8', newline='\\n') as file:\n",
"    x_dev0 = pd.Series(\n",
"        [line.rstrip('\\r\\n').split('\\t')[0] for line in file],\n",
"        dtype=object,\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "frozen-ticket",
"metadata": {},
"outputs": [],
"source": [
"# Read exactly one text per physical line: pd.read_csv with default quoting and\n",
"# blank-line skipping can return fewer rows than the file has lines, which would\n",
"# misalign the predictions written for this split.\n",
"with open('dev-1/in.tsv', 'r', encoding='utf8', newline='\\n') as file:\n",
"    x_dev1 = pd.Series(\n",
"        [line.rstrip('\\r\\n').split('\\t')[0] for line in file],\n",
"        dtype=object,\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "8e3a18db-f966-45e4-b881-4b336f188055",
"metadata": {},
"outputs": [],
"source": [
"# Read exactly one text per physical line: pd.read_csv with default quoting and\n",
"# blank-line skipping can return fewer rows than the file has lines, which would\n",
"# misalign the predictions written for this split.\n",
"with open('test-A/in.tsv', 'r', encoding='utf8', newline='\\n') as file:\n",
"    x_test = pd.Series(\n",
"        [line.rstrip('\\r\\n').split('\\t')[0] for line in file],\n",
"        dtype=object,\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "traditional-amount",
"metadata": {},
"outputs": [],
"source": [
"#y_dev = pd.read_csv('dev-0/out.tsv',header = None, sep = '/t',engine = 'python')\n",
"#y_dev = y_dev[0]\n",
"#y_dev_exp = pd.read_csv('dev-0/expected.tsv',header = None, sep = '/t',engine = 'python')\n",
"#y_dev_exp = y_dev_exp[0]\n",
"#RMSE_dev = mean_squared_error(y_dev_exp, y_dev) "
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "close-clinton",
"metadata": {},
"outputs": [],
"source": [
"# Write the predictions for every split next to its input file.\n",
"for texts, out_path in (\n",
"    (x_dev0, 'dev-0/out.tsv'),\n",
"    (x_dev1, 'dev-1/out.tsv'),\n",
"    (x_test, 'test-A/out.tsv'),\n",
"):\n",
"    predict_year(texts, out_path, model)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "official-sweet",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

14219
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff

107471
train/train.tsv Normal file

File diff suppressed because one or more lines are too long