forked from kubapok/retroc2
1476 lines
40 KiB
Plaintext
1476 lines
40 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# retroc2"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import lzma\n",
|
||
"import csv\n",
|
||
"from stop_words import get_stop_words\n",
|
||
"import gensim\n",
|
||
"import itertools\n",
|
||
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||
"import pandas as pd\n",
|
||
"from sklearn.linear_model import LinearRegression"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 68,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"def read_data(filename):\n",
|
||
" all_data = lzma.open(filename).read().decode('UTF-8').split('\\n')\n",
|
||
" return [line.split('\\t') for line in all_data][:-1]\n",
|
||
"\n",
|
||
"train_data = read_data('train/train.tsv.xz')[::250]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 69,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"['1985.49315068493',\n",
|
||
" '1985.49589037925',\n",
|
||
" 'PRZEKRÓJ',\n",
|
||
" 'MBC',\n",
|
||
" 'nowią część kultury. U nas już nikt ich nie chce oglądać. Chciałam osiągnąć coś wprost przeciwnego: przywrócić kobietom zaufanie do samych siebie, do własnych mo!liwości. Katharłne Hepburn powłedziala. kłedyi, łe najtrudnłej$ze to ..aprzedawanłe debie jak bukietu Awłeźych kwiatów\". Czy pant nie myllt. tak aamo7 Jestem bardziej odprężona niż Katharine. Gwiazdy jej generacji były większymi gwiazdami i musiały być całkiem nadzwyczajne. Nasze pokolenie jest banalniejsze. Jako kobieta i jako aktorka najlepiej czuję się w tłumie. --. Jest szalona rolnica między tym co ludzie o panł myl\\'ą. a tllm. kim panł jeBt naprClwdę. Ja tego nie widzę. Był taki okres w naszym ż\\'yciu, że Tom i ja n e mieliśmy pieniędzy. NIe pracowałam. Zyliśmy z koie zności bardzo skrom- -. -... .. nie. Zresztą dotychC\" as zy- . popiół znad ruin miasta. Ogromny teren, obejmuJący około 58 km t został zamieniony w dymiące pogorzelisko. Ulice miasta pokryte były zwęglonymi zwłokami mieszkańc6w, kt6re w wielu miejscach tworzyły makabryczne stosy. Wśród ofiar znaleźli się wszyscy dostojnicy przybyli poprzedniego dnia z Fort de France. Przez pierwsze dwa dni trwała akcja ratunkowa, nie udało się jednak znale:fć ani jednej żywej istoty. Dopiero w niedzielę, 11 maja, usłyszano czyjeŚ jęki. Odrzucając głazy i gorący jeszcze popiół, odnaleziono mocno poparzonego i całkowicie wyczerpanego młodego człowieka. Okazało si że jest to więzień pochodzący z leo Precheur. Skazano go na tygodniowy pobyt w karnej celi (ciemnicy) za samowolne opuszczenie więzienia. Ta niesubordynacja okazała się dla Sylbarisa zbawienna. Grube mury celi, Rołożonej u pod!1 óża g?ry, uchroniły go od zrmażdźenla i od spalenia\\'. Uratowany tak opisał nieprawdopodobną tragedię miasta: To btllo okolo 8 rano... nagle usłyszałem ogromny huk, a potem pTzeraźliwe krzyki ludzi. W sZ]lScy . l .\\' , P walali: pa ę nę.... umIeram.... o kilku minutach. WSZ1łstkie ucichły. Wszystkie... :z 1D1/;qtkiem mo;ego... 
Ogień pochłonął miasto i jego mieszkańców. Spełniła się klątwa rzucona przez wodza karaibskiego przed nies łna ćwierć wiekiem. ANDRZEJ VORBRODT jemy o wiele skromniej, niż większość ludzi z Hollywood. Moje. dzieci chodzą do publicznej szkoły, nie chcę, by wyrastały na .snobów. Myślę, że każda aktorka chyba że gra wyłącznie kr6lowe i księżniczki musi pozostawać w kontakcie z normalnymi ludźmi i z normalnym życiem. Zresztą, gdybym nagle zdecydowała się żyć luksusowo, Tom niechybnie opuściłby mnie\\' w mgnieniu oka. Wydawalo mł się nłer4%, e ma pant paC2. UC\"ic winy z powodu awołch ]Jłeniędzy... Nic podobnego. Jestem dumna ze sposobu, w jaki wydaję moje pieniądze. Używam ich na cele? w które wierzę i o ktore walczę. - czy t,o prawda. te sfinanaowała pant calkouńcie kampanię elektoralną Toma przy pomocy płenłędZ1l zarobionych na aerobiku\\' Tak. czy zna pani włelko\\' swojej fortuny? ..:.. Mniej więcej. Przed Tomem byl Vad\\'m; Paryt. cyganeria artystyczna, latwe tycie... Była pant kim innym. Jak doszlo do takiej zmiany? Dwadzie cia lat temu nie wiedziałam kim jestem. Byłam całkiem apolityczna. Kiedy wybuchła wojna w Wietnamie, n!e wiedziałam nawet gdzie leży Wietnam. A kiedy zrozumiałam, co naprawdę się dzieje w Wietnamie nie umiałam się wyłączyć j przestać walczyć o to, co Ic-uważalam za swój 000- wiązek. To calkowicle zmieniło']"
|
||
]
|
||
},
|
||
"execution_count": 69,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"train_data[0]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 70,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"['ach', 'aj', 'albo', 'bardzo', 'bez', 'bo', 'być', 'ci', 'cię', 'ciebie', 'co', 'czy', 'daleko', 'dla', 'dlaczego', 'dlatego', 'do', 'dobrze', 'dokąd', 'dość', 'dużo', 'dwa', 'dwaj', 'dwie', 'dwoje', 'dziś', 'dzisiaj', 'gdyby', 'gdzie', 'go', 'ich', 'ile', 'im', 'inny', 'ja', 'ją', 'jak', 'jakby', 'jaki', 'je', 'jeden', 'jedna', 'jedno', 'jego', 'jej', 'jemu', 'jeśli', 'jest', 'jestem', 'jeżeli', 'już', 'każdy', 'kiedy', 'kierunku', 'kto', 'ku', 'lub', 'ma', 'mają', 'mam', 'mi', 'mną', 'mnie', 'moi', 'mój', 'moja', 'moje', 'może', 'mu', 'my', 'na', 'nam', 'nami', 'nas', 'nasi', 'nasz', 'nasza', 'nasze', 'natychmiast', 'nią', 'nic', 'nich', 'nie', 'niego', 'niej', 'niemu', 'nigdy', 'nim', 'nimi', 'niż', 'obok', 'od', 'około', 'on', 'ona', 'one', 'oni', 'ono', 'owszem', 'po', 'pod', 'ponieważ', 'przed', 'przedtem', 'są', 'sam', 'sama', 'się', 'skąd', 'tak', 'taki', 'tam', 'ten', 'to', 'tobą', 'tobie', 'tu', 'tutaj', 'twoi', 'twój', 'twoja', 'twoje', 'ty', 'wam', 'wami', 'was', 'wasi', 'wasz', 'wasza', 'wasze', 'we', 'więc', 'wszystko', 'wtedy', 'wy', 'żaden', 'zawsze', 'że', 'a', 'u', 'i', 'z', 'w', 'o']\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Polish stop-word list, extended with single-letter tokens that the\n",
"# base list does not include.\n",
"extra_tokens = ['a', 'u', 'i', 'z', 'w', 'o']\n",
"stop_words = get_stop_words('pl') + extra_tokens\n",
"print(stop_words)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 71,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"train_data_tokenized = [list(set(gensim.utils.tokenize(x[4], lowercase = True))) for x in train_data]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 72,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"['hepburn',\n",
|
||
" 'przestać',\n",
|
||
" 'skazano',\n",
|
||
" 'l',\n",
|
||
" 'chce',\n",
|
||
" 'ic',\n",
|
||
" 'miasto',\n",
|
||
" 'ta',\n",
|
||
" 'aerobiku',\n",
|
||
" 'czy',\n",
|
||
" 'wszyscy',\n",
|
||
" 'jestem',\n",
|
||
" 'już',\n",
|
||
" 'dymiące',\n",
|
||
" 'zarobionych',\n",
|
||
" 'katharine',\n",
|
||
" 'dwadzie',\n",
|
||
" 'zdecydowała',\n",
|
||
" 'normalnym',\n",
|
||
" 'potem',\n",
|
||
" 'jednej',\n",
|
||
" 'widzę',\n",
|
||
" 'tygodniowy',\n",
|
||
" 'toma',\n",
|
||
" 'byl',\n",
|
||
" 'huk',\n",
|
||
" 'liwości',\n",
|
||
" 'te',\n",
|
||
" 'kt',\n",
|
||
" 'mury',\n",
|
||
" 'fort',\n",
|
||
" 'więzienia',\n",
|
||
" 'okolo',\n",
|
||
" 'walczę',\n",
|
||
" 'o',\n",
|
||
" 'ani',\n",
|
||
" 'znaleźli',\n",
|
||
" 'dzieje',\n",
|
||
" 'okazała',\n",
|
||
" 'człowieka',\n",
|
||
" 'maja',\n",
|
||
" 'nawet',\n",
|
||
" 'wydawalo',\n",
|
||
" 'musiały',\n",
|
||
" 'minutach',\n",
|
||
" 'głazy',\n",
|
||
" 'qtkiem',\n",
|
||
" 'zbawienna',\n",
|
||
" 'wprost',\n",
|
||
" 'jednak',\n",
|
||
" 'wśród',\n",
|
||
" 'samowolne',\n",
|
||
" 'życiem',\n",
|
||
" 'kobieta',\n",
|
||
" 'g',\n",
|
||
" 'aprzedawanłe',\n",
|
||
" 'odrzucając',\n",
|
||
" 'dostojnicy',\n",
|
||
" 'uważalam',\n",
|
||
" 'paryt',\n",
|
||
" 'włelko',\n",
|
||
" 'luksusowo',\n",
|
||
" 'podobnego',\n",
|
||
" 'własnych',\n",
|
||
" 'katharłne',\n",
|
||
" 'karaibskiego',\n",
|
||
" 'samych',\n",
|
||
" 'wybuchła',\n",
|
||
" 'nic',\n",
|
||
" 'dla',\n",
|
||
" 'de',\n",
|
||
" 'księżniczki',\n",
|
||
" 'łna',\n",
|
||
" 'klątwa',\n",
|
||
" 'sfinanaowała',\n",
|
||
" 'powłedziala',\n",
|
||
" 'pani',\n",
|
||
" 'część',\n",
|
||
" 'wiązek',\n",
|
||
" 'wyłącznie',\n",
|
||
" 'rzucona',\n",
|
||
" 'akcja',\n",
|
||
" 'opuściłby',\n",
|
||
" 'karnej',\n",
|
||
" 'myl',\n",
|
||
" 'p',\n",
|
||
" 'leo',\n",
|
||
" 'zwłokami',\n",
|
||
" 'pokolenie',\n",
|
||
" 'miejscach',\n",
|
||
" 'spalenia',\n",
|
||
" 'kłedyi',\n",
|
||
" 'mieliśmy',\n",
|
||
" 'koie',\n",
|
||
" 'takiej',\n",
|
||
" 'walali',\n",
|
||
" 'rano',\n",
|
||
" 'naprawdę',\n",
|
||
" 'większymi',\n",
|
||
" 'na',\n",
|
||
" 'zna',\n",
|
||
" 'si',\n",
|
||
" 'normalnymi',\n",
|
||
" 'były',\n",
|
||
" 'apolityczna',\n",
|
||
" 'zaufanie',\n",
|
||
" 'całkiem',\n",
|
||
" 'zyliśmy',\n",
|
||
" 'ptzeraźliwe',\n",
|
||
" 'udało',\n",
|
||
" 'tym',\n",
|
||
" 're',\n",
|
||
" 'osiągnąć',\n",
|
||
" 'mgnieniu',\n",
|
||
" 'pochłonął',\n",
|
||
" 'naszym',\n",
|
||
" 'rołożonej',\n",
|
||
" 'grube',\n",
|
||
" 'oka',\n",
|
||
" 'tworzyły',\n",
|
||
" 'istoty',\n",
|
||
" 'debie',\n",
|
||
" 'każda',\n",
|
||
" 'zy',\n",
|
||
" 'mniej',\n",
|
||
" 'chciałam',\n",
|
||
" 'popiół',\n",
|
||
" 'leży',\n",
|
||
" 'najtrudnłej',\n",
|
||
" 'znad',\n",
|
||
" 'jego',\n",
|
||
" 'pochodzący',\n",
|
||
" 'zmieniło',\n",
|
||
" 'snobów',\n",
|
||
" 'dumna',\n",
|
||
" 'byłam',\n",
|
||
" 'yciu',\n",
|
||
" 'pieniądze',\n",
|
||
" 'pokryte',\n",
|
||
" 'bardziej',\n",
|
||
" 'm',\n",
|
||
" 'do',\n",
|
||
" 'bukietu',\n",
|
||
" 'żyć',\n",
|
||
" 'poparzonego',\n",
|
||
" 'w',\n",
|
||
" 'ę',\n",
|
||
" 'calkowicle',\n",
|
||
" 'vad',\n",
|
||
" 'tak',\n",
|
||
" 'gdzie',\n",
|
||
" 'kampanię',\n",
|
||
" 'celi',\n",
|
||
" 'pozostawać',\n",
|
||
" 'sylbarisa',\n",
|
||
" 'nieprawdopodobną',\n",
|
||
" 'nie',\n",
|
||
" 'około',\n",
|
||
" 'wojna',\n",
|
||
" 'calkouńcie',\n",
|
||
" 'odnaleziono',\n",
|
||
" 'uc',\n",
|
||
" 'które',\n",
|
||
" 'poprzedniego',\n",
|
||
" 'dzieci',\n",
|
||
" 'wietnam',\n",
|
||
" 'płenłędz',\n",
|
||
" 'publicznej',\n",
|
||
" 'odprężona',\n",
|
||
" 'spełniła',\n",
|
||
" 'ja',\n",
|
||
" 'nę',\n",
|
||
" 'stosy',\n",
|
||
" 'jęki',\n",
|
||
" 'wyłączyć',\n",
|
||
" 'chyba',\n",
|
||
" 'skrom',\n",
|
||
" 'jemy',\n",
|
||
" 'jak',\n",
|
||
" 'więcej',\n",
|
||
" 'była',\n",
|
||
" 'jłeniędzy',\n",
|
||
" 'przed',\n",
|
||
" 'nadzwyczajne',\n",
|
||
" 'musi',\n",
|
||
" 'młodego',\n",
|
||
" 'używam',\n",
|
||
" 'szalona',\n",
|
||
" 'przeciwnego',\n",
|
||
" 'naprclwdę',\n",
|
||
" 'to',\n",
|
||
" 'tom',\n",
|
||
" 'fć',\n",
|
||
" 'myślę',\n",
|
||
" 'wiedziałam',\n",
|
||
" 'za',\n",
|
||
" 'niesubordynacja',\n",
|
||
" 'nies',\n",
|
||
" 'by',\n",
|
||
" 'chcę',\n",
|
||
" 'ucichły',\n",
|
||
" 'lowe',\n",
|
||
" 'precheur',\n",
|
||
" 'zresztą',\n",
|
||
" 'dopiero',\n",
|
||
" 'winy',\n",
|
||
" 'j',\n",
|
||
" 'zności',\n",
|
||
" 'zamieniony',\n",
|
||
" 'mł',\n",
|
||
" 'ulice',\n",
|
||
" 'czyjeś',\n",
|
||
" 'taki',\n",
|
||
" 'ogień',\n",
|
||
" 'ze',\n",
|
||
" 'óża',\n",
|
||
" 'fortuny',\n",
|
||
" 'nas',\n",
|
||
" 'kwiatów',\n",
|
||
" 'usłyszano',\n",
|
||
" 'kim',\n",
|
||
" 'został',\n",
|
||
" 'ry',\n",
|
||
" 'as',\n",
|
||
" 'france',\n",
|
||
" 'moje',\n",
|
||
" 'ludzi',\n",
|
||
" 'n',\n",
|
||
" 'niż',\n",
|
||
" 'nłer',\n",
|
||
" 'jaki',\n",
|
||
" 'chodzą',\n",
|
||
" 'go',\n",
|
||
" 'makabryczne',\n",
|
||
" 'tomem',\n",
|
||
" 'siebie',\n",
|
||
" 'ogromny',\n",
|
||
" 'opuszczenie',\n",
|
||
" 'dotychc',\n",
|
||
" 'nikt',\n",
|
||
" 'panł',\n",
|
||
" 'tego',\n",
|
||
" 'pieniędzy',\n",
|
||
" 'wydaję',\n",
|
||
" 'jest',\n",
|
||
" 'pa',\n",
|
||
" 'skromniej',\n",
|
||
" 'bardzo',\n",
|
||
" 'powodu',\n",
|
||
" 'wiele',\n",
|
||
" 'aamo',\n",
|
||
" 'btllo',\n",
|
||
" 'przy',\n",
|
||
" 'latwe',\n",
|
||
" 'żywej',\n",
|
||
" 'sz',\n",
|
||
" 'gwiazdami',\n",
|
||
" 'ktore',\n",
|
||
" 'pobyt',\n",
|
||
" 'e',\n",
|
||
" 'elektoralną',\n",
|
||
" 'nagle',\n",
|
||
" 'tłumie',\n",
|
||
" 'pierwsze',\n",
|
||
" 'krzyki',\n",
|
||
" 'niedzielę',\n",
|
||
" 'wiekiem',\n",
|
||
" 'zwęglonymi',\n",
|
||
" 'pomocy',\n",
|
||
" 'ą',\n",
|
||
" 'tragedię',\n",
|
||
" 'teren',\n",
|
||
" 'ludźmi',\n",
|
||
" 'sposobu',\n",
|
||
" 'trwała',\n",
|
||
" 'łe',\n",
|
||
" 'artystyczna',\n",
|
||
" 'wielu',\n",
|
||
" 'i',\n",
|
||
" 'przybyli',\n",
|
||
" 'zrozumiałam',\n",
|
||
" 'mieszkańców',\n",
|
||
" 'okazało',\n",
|
||
" 'ma',\n",
|
||
" 'wyrastały',\n",
|
||
" 'lat',\n",
|
||
" 'wsz',\n",
|
||
" 'niechybnie',\n",
|
||
" 'mnie',\n",
|
||
" 'jeszcze',\n",
|
||
" 'wietnamie',\n",
|
||
" 'wodza',\n",
|
||
" 'cia',\n",
|
||
" 'temu',\n",
|
||
" 'myllt',\n",
|
||
" 'łstkie',\n",
|
||
" 'mo',\n",
|
||
" 'nowią',\n",
|
||
" 'kiedy',\n",
|
||
" 'pod',\n",
|
||
" 'vorbrodt',\n",
|
||
" 'od',\n",
|
||
" 'zmiany',\n",
|
||
" 'generacji',\n",
|
||
" 'tycie',\n",
|
||
" 'gra',\n",
|
||
" 'jebt',\n",
|
||
" 'pogorzelisko',\n",
|
||
" 't',\n",
|
||
" 'przez',\n",
|
||
" 'pant',\n",
|
||
" 'ż',\n",
|
||
" 'umieram',\n",
|
||
" 'okres',\n",
|
||
" 'hollywood',\n",
|
||
" 'ruin',\n",
|
||
" 'przywrócić',\n",
|
||
" 'opisał',\n",
|
||
" 'kultury',\n",
|
||
" 'czuję',\n",
|
||
" 'że',\n",
|
||
" 'mieszkańc',\n",
|
||
" 'pac',\n",
|
||
" 'cyganeria',\n",
|
||
" 'obejmujący',\n",
|
||
" 'pracowałam',\n",
|
||
" 'innym',\n",
|
||
" 'rolnica',\n",
|
||
" 'prawda',\n",
|
||
" 'swój',\n",
|
||
" 'nasze',\n",
|
||
" 'swojej',\n",
|
||
" 'większość',\n",
|
||
" 'uchroniły',\n",
|
||
" 'kobietom',\n",
|
||
" 'a',\n",
|
||
" 'oglądać',\n",
|
||
" 'znale',\n",
|
||
" 'wyczerpanego',\n",
|
||
" 'd',\n",
|
||
" 'ofiar',\n",
|
||
" 'co',\n",
|
||
" 'był',\n",
|
||
" 'aktorka',\n",
|
||
" 'z',\n",
|
||
" 'wszystkie',\n",
|
||
" 'szkoły',\n",
|
||
" 'uratowany',\n",
|
||
" 'między',\n",
|
||
" 'dwa',\n",
|
||
" 'km',\n",
|
||
" 'umiałam',\n",
|
||
" 'miasta',\n",
|
||
" 'kr',\n",
|
||
" 'gdybym',\n",
|
||
" 'awłeźych',\n",
|
||
" 'ich',\n",
|
||
" 'awołch',\n",
|
||
" 'doszlo',\n",
|
||
" 'więzień',\n",
|
||
" 'kontakcie',\n",
|
||
" 'jako',\n",
|
||
" 'zrmażdźenla',\n",
|
||
" 'usłyszałem',\n",
|
||
" 'ćwierć',\n",
|
||
" 'wierzę',\n",
|
||
" 'się',\n",
|
||
" 'mocno',\n",
|
||
" 'kilku',\n",
|
||
" 'coś',\n",
|
||
" 'ego',\n",
|
||
" 'być',\n",
|
||
" 'andrzej',\n",
|
||
" 'jej',\n",
|
||
" 'gwiazdy',\n",
|
||
" 'całkowicie',\n",
|
||
" 'tllm',\n",
|
||
" 'dni',\n",
|
||
" 'dnia',\n",
|
||
" 'walczyć',\n",
|
||
" 'ratunkowa',\n",
|
||
" 'lscy',\n",
|
||
" 'cele',\n",
|
||
" 'u',\n",
|
||
" 'banalniejsze',\n",
|
||
" 'ludzie',\n",
|
||
" 'gorący',\n",
|
||
" 'najlepiej',\n",
|
||
" 'ciemnicy']"
|
||
]
|
||
},
|
||
"execution_count": 72,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"train_data_tokenized[0]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 73,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"['zreszt',\n",
|
||
" 'l',\n",
|
||
" 'kontak',\n",
|
||
" 'chce',\n",
|
||
" 'ic',\n",
|
||
" 'miasto',\n",
|
||
" 'sposob',\n",
|
||
" 'ta',\n",
|
||
" 'uchron',\n",
|
||
" 'kultur',\n",
|
||
" 'wszysc',\n",
|
||
" 'pozost',\n",
|
||
" 'potem',\n",
|
||
" 'jednej',\n",
|
||
" 'widzę',\n",
|
||
" 'toma',\n",
|
||
" 'spełni',\n",
|
||
" 'byl',\n",
|
||
" 'vorbro',\n",
|
||
" 'huk',\n",
|
||
" 'okazał',\n",
|
||
" 'te',\n",
|
||
" 'kt',\n",
|
||
" 'mury',\n",
|
||
" 'fort',\n",
|
||
" 'okolo',\n",
|
||
" 'odpręż',\n",
|
||
" 'nadzwy',\n",
|
||
" 'walczę',\n",
|
||
" 'ani',\n",
|
||
" 'mieliś',\n",
|
||
" 'dzieje',\n",
|
||
" 'odrzuc',\n",
|
||
" 'podobn',\n",
|
||
" 'maja',\n",
|
||
" 'nawet',\n",
|
||
" 'głazy',\n",
|
||
" 'qtkiem',\n",
|
||
" 'wprost',\n",
|
||
" 'jednak',\n",
|
||
" 'wśród',\n",
|
||
" 'karaib',\n",
|
||
" 'genera',\n",
|
||
" 'osiągn',\n",
|
||
" 'życiem',\n",
|
||
" 'g',\n",
|
||
" 'rzucon',\n",
|
||
" 'paryt',\n",
|
||
" 'włelko',\n",
|
||
" 'dymiąc',\n",
|
||
" 'ogląda',\n",
|
||
" 'awłeźy',\n",
|
||
" 'niechy',\n",
|
||
" 'samych',\n",
|
||
" 'gwiazd',\n",
|
||
" 'de',\n",
|
||
" 'łna',\n",
|
||
" 'hollyw',\n",
|
||
" 'pochod',\n",
|
||
" 'klątwa',\n",
|
||
" 'samowo',\n",
|
||
" 'sylbar',\n",
|
||
" 'pani',\n",
|
||
" 'część',\n",
|
||
" 'wiązek',\n",
|
||
" 'akcja',\n",
|
||
" 'niedzi',\n",
|
||
" 'karnej',\n",
|
||
" 'myl',\n",
|
||
" 'szalon',\n",
|
||
" 'p',\n",
|
||
" 'leo',\n",
|
||
" 'rołożo',\n",
|
||
" 'całkie',\n",
|
||
" 'kłedyi',\n",
|
||
" 'koie',\n",
|
||
" 'takiej',\n",
|
||
" 'elekto',\n",
|
||
" 'walali',\n",
|
||
" 'rano',\n",
|
||
" 'zna',\n",
|
||
" 'ucichł',\n",
|
||
" 'si',\n",
|
||
" 'zamien',\n",
|
||
" 'były',\n",
|
||
" 'wyczer',\n",
|
||
" 'całkow',\n",
|
||
" 'udało',\n",
|
||
" 'tym',\n",
|
||
" 'naprcl',\n",
|
||
" 'znaleź',\n",
|
||
" 'mieszk',\n",
|
||
" 'calkow',\n",
|
||
" 're',\n",
|
||
" 'naszym',\n",
|
||
" 'grube',\n",
|
||
" 'oka',\n",
|
||
" 'liwośc',\n",
|
||
" 'umiała',\n",
|
||
" 'istoty',\n",
|
||
" 'debie',\n",
|
||
" 'każda',\n",
|
||
" 'zy',\n",
|
||
" 'mniej',\n",
|
||
" 'popiół',\n",
|
||
" 'miejsc',\n",
|
||
" 'leży',\n",
|
||
" 'znad',\n",
|
||
" 'andrze',\n",
|
||
" 'wyłącz',\n",
|
||
" 'snobów',\n",
|
||
" 'dumna',\n",
|
||
" 'byłam',\n",
|
||
" 'przeci',\n",
|
||
" 'dopier',\n",
|
||
" 'odnale',\n",
|
||
" 'yciu',\n",
|
||
" 'calkou',\n",
|
||
" 'najtru',\n",
|
||
" 'm',\n",
|
||
" 'zarobi',\n",
|
||
" 'chciał',\n",
|
||
" 'żyć',\n",
|
||
" 'ę',\n",
|
||
" 'vad',\n",
|
||
" 'hepbur',\n",
|
||
" 'celi',\n",
|
||
" 'przywr',\n",
|
||
" 'wojna',\n",
|
||
" 'opuszc',\n",
|
||
" 'dwadzi',\n",
|
||
" 'ptzera',\n",
|
||
" 'uc',\n",
|
||
" 'które',\n",
|
||
" 'dzieci',\n",
|
||
" 'zrozum',\n",
|
||
" 'musiał',\n",
|
||
" 'zbawie',\n",
|
||
" 'bardzi',\n",
|
||
" 'nę',\n",
|
||
" 'stosy',\n",
|
||
" 'jęki',\n",
|
||
" 'zwęglo',\n",
|
||
" 'młodeg',\n",
|
||
" 'poparz',\n",
|
||
" 'chyba',\n",
|
||
" 'aprzed',\n",
|
||
" 'skrom',\n",
|
||
" 'jemy',\n",
|
||
" 'skromn',\n",
|
||
" 'więcej',\n",
|
||
" 'była',\n",
|
||
" 'większ',\n",
|
||
" 'kwiató',\n",
|
||
" 'musi',\n",
|
||
" 'używam',\n",
|
||
" 'zwłoka',\n",
|
||
" 'wybuch',\n",
|
||
" 'tygodn',\n",
|
||
" 'niepra',\n",
|
||
" 'wietna',\n",
|
||
" 'cygane',\n",
|
||
" 'tom',\n",
|
||
" 'fć',\n",
|
||
" 'człowi',\n",
|
||
" 'myślę',\n",
|
||
" 'za',\n",
|
||
" 'nies',\n",
|
||
" 'by',\n",
|
||
" 'pokryt',\n",
|
||
" 'chcę',\n",
|
||
" 'lowe',\n",
|
||
" 'winy',\n",
|
||
" 'j',\n",
|
||
" 'zdecyd',\n",
|
||
" 'zności',\n",
|
||
" 'mł',\n",
|
||
" 'ulice',\n",
|
||
" 'czyjeś',\n",
|
||
" 'ogień',\n",
|
||
" 'ze',\n",
|
||
" 'makabr',\n",
|
||
" 'óża',\n",
|
||
" 'kim',\n",
|
||
" 'został',\n",
|
||
" 'ry',\n",
|
||
" 'as',\n",
|
||
" 'france',\n",
|
||
" 'ludzi',\n",
|
||
" 'n',\n",
|
||
" 'umiera',\n",
|
||
" 'nłer',\n",
|
||
" 'chodzą',\n",
|
||
" 'pienię',\n",
|
||
" 'tomem',\n",
|
||
" 'kobiet',\n",
|
||
" 'siebie',\n",
|
||
" 'wiekie',\n",
|
||
" 'sfinan',\n",
|
||
" 'nikt',\n",
|
||
" 'panł',\n",
|
||
" 'tego',\n",
|
||
" 'wydaję',\n",
|
||
" 'ogromn',\n",
|
||
" 'rolnic',\n",
|
||
" 'pa',\n",
|
||
" 'ratunk',\n",
|
||
" 'powodu',\n",
|
||
" 'artyst',\n",
|
||
" 'wiele',\n",
|
||
" 'zaufan',\n",
|
||
" 'public',\n",
|
||
" 'aamo',\n",
|
||
" 'btllo',\n",
|
||
" 'przy',\n",
|
||
" 'latwe',\n",
|
||
" 'żywej',\n",
|
||
" 'skazan',\n",
|
||
" 'sz',\n",
|
||
" 'ktore',\n",
|
||
" 'minuta',\n",
|
||
" 'pobyt',\n",
|
||
" 'e',\n",
|
||
" 'powłed',\n",
|
||
" 'pogorz',\n",
|
||
" 'jłenię',\n",
|
||
" 'apolit',\n",
|
||
" 'ciemni',\n",
|
||
" 'nagle',\n",
|
||
" 'najlep',\n",
|
||
" 'tłumie',\n",
|
||
" 'krzyki',\n",
|
||
" 'usłysz',\n",
|
||
" 'jeszcz',\n",
|
||
" 'pomocy',\n",
|
||
" 'przyby',\n",
|
||
" 'ą',\n",
|
||
" 'teren',\n",
|
||
" 'ludźmi',\n",
|
||
" 'trwała',\n",
|
||
" 'zrmażd',\n",
|
||
" 'łe',\n",
|
||
" 'walczy',\n",
|
||
" 'wielu',\n",
|
||
" 'dotych',\n",
|
||
" 'tworzy',\n",
|
||
" 'lat',\n",
|
||
" 'wsz',\n",
|
||
" 'banaln',\n",
|
||
" 'wyrast',\n",
|
||
" 'wszyst',\n",
|
||
" 'wodza',\n",
|
||
" 'cia',\n",
|
||
" 'temu',\n",
|
||
" 'myllt',\n",
|
||
" 'własny',\n",
|
||
" 'normal',\n",
|
||
" 'łstkie',\n",
|
||
" 'dostoj',\n",
|
||
" 'uważal',\n",
|
||
" 'mo',\n",
|
||
" 'nowią',\n",
|
||
" 'wiedzi',\n",
|
||
" 'aerobi',\n",
|
||
" 'pracow',\n",
|
||
" 'płenłę',\n",
|
||
" 'zmiany',\n",
|
||
" 'tycie',\n",
|
||
" 'gra',\n",
|
||
" 'opuści',\n",
|
||
" 'jebt',\n",
|
||
" 't',\n",
|
||
" 'przez',\n",
|
||
" 'pant',\n",
|
||
" 'ż',\n",
|
||
" 'okres',\n",
|
||
" 'spalen',\n",
|
||
" 'ruin',\n",
|
||
" 'opisał',\n",
|
||
" 'więzie',\n",
|
||
" 'czuję',\n",
|
||
" 'luksus',\n",
|
||
" 'pac',\n",
|
||
" 'mgnien',\n",
|
||
" 'innym',\n",
|
||
" 'kampan',\n",
|
||
" 'prawda',\n",
|
||
" 'aktork',\n",
|
||
" 'swój',\n",
|
||
" 'obejmu',\n",
|
||
" 'swojej',\n",
|
||
" 'znale',\n",
|
||
" 'zyliśm',\n",
|
||
" 'kathar',\n",
|
||
" 'd',\n",
|
||
" 'ofiar',\n",
|
||
" 'pierws',\n",
|
||
" 'napraw',\n",
|
||
" 'traged',\n",
|
||
" 'był',\n",
|
||
" 'zmieni',\n",
|
||
" 'szkoły',\n",
|
||
" 'między',\n",
|
||
" 'km',\n",
|
||
" 'miasta',\n",
|
||
" 'kr',\n",
|
||
" 'gdybym',\n",
|
||
" 'przest',\n",
|
||
" 'awołch',\n",
|
||
" 'doszlo',\n",
|
||
" 'pochło',\n",
|
||
" 'uratow',\n",
|
||
" 'jako',\n",
|
||
" 'wierzę',\n",
|
||
" 'ćwierć',\n",
|
||
" 'preche',\n",
|
||
" 'mocno',\n",
|
||
" 'kilku',\n",
|
||
" 'coś',\n",
|
||
" 'poprze',\n",
|
||
" 'ego',\n",
|
||
" 'pokole',\n",
|
||
" 'księżn',\n",
|
||
" 'bukiet',\n",
|
||
" 'tllm',\n",
|
||
" 'fortun',\n",
|
||
" 'dni',\n",
|
||
" 'dnia',\n",
|
||
" 'niesub',\n",
|
||
" 'wydawa',\n",
|
||
" 'lscy',\n",
|
||
" 'cele',\n",
|
||
" 'pienią',\n",
|
||
" 'ludzie',\n",
|
||
" 'gorący']"
|
||
]
|
||
},
|
||
"execution_count": 73,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"train_data_stemmatized = [list(set([w[:6] for w in set(i) - set(stop_words)])) for i in train_data_tokenized]\n",
|
||
"train_data_stemmatized[0]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 74,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"vectorizer = TfidfVectorizer()\n",
|
||
"vectors = vectorizer.fit_transform([' '.join(i) for i in train_data_stemmatized])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 75,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"feature_names = vectorizer.get_feature_names()\n",
|
||
"dense = vectors.todense()\n",
|
||
"denselist = dense.tolist()\n",
|
||
"df = pd.DataFrame(denselist, columns=feature_names)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 76,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"430"
|
||
]
|
||
},
|
||
"execution_count": 76,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"len(train_data)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 77,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>__</th>\n",
|
||
" <th>___</th>\n",
|
||
" <th>____</th>\n",
|
||
" <th>_____</th>\n",
|
||
" <th>______</th>\n",
|
||
" <th>____x</th>\n",
|
||
" <th>__ch</th>\n",
|
||
" <th>__n_</th>\n",
|
||
" <th>__naie</th>\n",
|
||
" <th>__o</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>франкф</th>\n",
|
||
" <th>фялофс</th>\n",
|
||
" <th>что</th>\n",
|
||
" <th>шшяшшш</th>\n",
|
||
" <th>щвашш</th>\n",
|
||
" <th>ьввдвн</th>\n",
|
||
" <th>ьлало</th>\n",
|
||
" <th>эавкде</th>\n",
|
||
" <th>юрвдич</th>\n",
|
||
" <th>ях</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>10 rows × 42788 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" __ ___ ____ _____ ______ ____x __ch __n_ __naie __o ... франкф \\\n",
|
||
"0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n",
|
||
"1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n",
|
||
"2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n",
|
||
"3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n",
|
||
"4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n",
|
||
"5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n",
|
||
"6 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n",
|
||
"7 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n",
|
||
"8 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n",
|
||
"9 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 \n",
|
||
"\n",
|
||
" фялофс что шшяшшш щвашш ьввдвн ьлало эавкде юрвдич ях \n",
|
||
"0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
||
"1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
||
"2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
||
"3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
||
"4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
||
"5 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
||
"6 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
||
"7 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
||
"8 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
||
"9 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
|
||
"\n",
|
||
"[10 rows x 42788 columns]"
|
||
]
|
||
},
|
||
"execution_count": 77,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df[:10]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 78,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"array([0.47377066, 0. , 0. , ..., 0. , 0. ,\n",
|
||
" 0. ])"
|
||
]
|
||
},
|
||
"execution_count": 78,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"vectorizer.transform(['__ ma kota']).toarray()[0]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 79,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"train_Y = [(float(x[0]) + float(x[1])) / 2 for x in train_data]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 80,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"LinearRegression()"
|
||
]
|
||
},
|
||
"execution_count": 80,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"model = LinearRegression()  # plain least-squares regressor\n",
"model.fit(df, train_Y)  # fit tf-idf features to the mid-point publication year"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 81,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"array([1985.49452053, 1925.6178082 , 1967.30958903, 1937.49999998,\n",
|
||
" 1919.55479387, 1932.77459015, 1842.49999998, 1932.08333332,\n",
|
||
" 1930.67808218, 2000.49999998])"
|
||
]
|
||
},
|
||
"execution_count": 81,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"model.predict(df[:10])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 82,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"with open('dev-0/in.tsv', \"r\", encoding=\"utf-8\") as f:\n",
"    dev_0_raw = [line.rstrip() for line in f]\n",
"\n",
"# tokenize each document, drop stop words, 'stem' by truncating to 6 chars, deduplicate\n",
"dev_0_data = [\n",
"    ' '.join({tok[:6] for tok in set(gensim.utils.tokenize(doc, lowercase=True)) - set(stop_words)})\n",
"    for doc in dev_0_raw\n",
"]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 83,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"y_predicted = model.predict(vectorizer.transform(dev_0_data).toarray())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 84,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"array([1889.28635713, 1950.9440436 , 1957.26235075, 1959.53052259,\n",
|
||
" 1914.96228803, 1948.17090442, 1951.19472106, 1917.66714928,\n",
|
||
" 1912.14525243, 1936.7929999 ])"
|
||
]
|
||
},
|
||
"execution_count": 84,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"y_predicted[:10]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 92,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 'w' (not 'a'): re-running the cell must overwrite out.tsv, not append duplicate rows\n",
"with open(\"dev-0/out.tsv\", \"w\") as f:\n",
"    for pred in y_predicted:\n",
"        f.write(str(round(pred, 11)) + '\\n')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 86,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"with open('dev-0/expected.tsv', \"r\", encoding=\"utf-8\") as f:\n",
|
||
" e = [line.rstrip() for line in f]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 94,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"38.80250628936373\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import math\n",
"\n",
"# RMSE between predictions and gold mid-point years; zip pairs the two lists\n",
"# safely instead of indexing e[i] (which would raise IndexError on a length mismatch)\n",
"squared_errors = [(float(pred) - float(gold)) ** 2 for pred, gold in zip(y_predicted, e)]\n",
"print(math.sqrt(sum(squared_errors) / len(squared_errors)))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 88,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"with open('test-A/in.tsv', \"r\", encoding=\"utf-8\") as f:\n",
"    test_A_raw = [line.rstrip() for line in f]\n",
"\n",
"# tokenize each document, drop stop words, 'stem' by truncating to 6 chars, deduplicate\n",
"test_A_data = [\n",
"    ' '.join({tok[:6] for tok in set(gensim.utils.tokenize(doc, lowercase=True)) - set(stop_words)})\n",
"    for doc in test_A_raw\n",
"]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 89,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"y_test_predicted = model.predict(vectorizer.transform(test_A_data).toarray())"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 90,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"array([1970.80482572, 1891.52353205, 1914.05051655, 1921.30242974,\n",
|
||
" 1908.01225049, 1912.69373127, 1911.11153893, 1948.74997295,\n",
|
||
" 1925.77888352, 1923.62798817])"
|
||
]
|
||
},
|
||
"execution_count": 90,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"y_test_predicted[:10]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 93,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 'w' (not 'a'): re-running the cell must overwrite out.tsv, not append duplicate rows\n",
"with open(\"test-A/out.tsv\", \"w\") as f:\n",
"    for pred in y_test_predicted:\n",
"        f.write(str(round(pred, 11)) + '\\n')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.8.3"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 4
|
||
}
|