sport-text-classification-b.../sport text classification.ipynb
2021-05-24 20:57:41 +02:00

14 KiB
Raw Blame History

import pandas as pd
import numpy as np
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
import os.path
import gzip
import shutil
# Unpack the gzipped training set next to the archive, then load it.
with gzip.open('train/train.tsv.gz', 'rb') as packed, \
        open('train/train.tsv', 'wb') as unpacked:
    shutil.copyfileobj(packed, unpacked)

# No header row is read from the file: the two tab-separated columns are
# named explicitly here.
data = pd.read_csv(
    'train/train.tsv',
    sep='\t',
    names=["Ball", "Text"],
)
data
Ball Text
0 1 Mindaugas Budzinauskas wierzy w odbudowę formy...
1 1 Przyjmujący reprezentacji Polski wrócił do PGE...
2 0 FEN 9: Zapowiedź walki Róża Gumienna vs Katarz...
3 1 Aleksander Filipiak: Czuję się dobrze w nowym ...
4 0 Victoria Carl i Aleksiej Czerwotkin mistrzami ...
... ... ...
98127 1 Kamil Syprzak zaczyna kolekcjonować trofea. FC...
98128 1 Holandia: dwa gole Piotra Parzyszka Piotr Parz...
98129 1 Sparingowo: Korona gorsza od Stali. Lettieri s...
98130 1 Vive - Wisła. Ośmiu debiutantów w tegorocznej ...
98131 1 WTA Miami: Timea Bacsinszky pokonana, Swietłan...

98132 rows × 2 columns

# Word2Vec expects an iterable of token lists. A raw string is iterated
# character by character, which produces a vocabulary of single characters
# and makes every whole-word lookup fail with KeyError.
sentences = [text.split() for text in data["Text"]]

# NOTE: a "word2vec.model" file saved by a run that trained on raw strings
# has a character-level vocabulary; delete it so the model is retrained.
if os.path.isfile("word2vec.model"):
    model = Word2Vec.load("word2vec.model")
else:
    model = Word2Vec(sentences=sentences)
    model.train(sentences, total_examples=len(sentences), epochs=10)
    # Save only after the extra epochs so the cached file holds the final
    # weights rather than the state right after construction.
    model.save("word2vec.model")


def _embed(text):
    """Return the word vectors of the in-vocabulary tokens of *text*.

    The result is a 2-D array with one row per known token. Tokens missing
    from the model vocabulary (e.g. dropped by the frequency cut-off) are
    skipped instead of raising KeyError. A text with no known token yields
    an empty array of shape ``(0, vector_size)``.
    """
    known_tokens = [token for token in text.split() if token in model.wv]
    if not known_tokens:
        # Stacking an empty list of vectors would raise, so build the empty
        # matrix explicitly with the right number of columns.
        return np.zeros((0, model.wv.vector_size), dtype=np.float32)
    return model.wv[known_tokens]


# NOTE(review): prepared_training_data is not defined in the visible cells —
# confirm it is created (presumably from `data`) before this line runs.
prepared_training_data['Text'] = prepared_training_data['Text'].apply(_embed)
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-6-dec2e93bf676> in <module>
----> 1 prepared_training_data['Text'] = prepared_training_data['Text'].apply(lambda x: model.wv[x.split()])

~\anaconda3\lib\site-packages\pandas\core\series.py in apply(self, func, convert_dtype, args, **kwds)
   4198             else:
   4199                 values = self.astype(object)._values
-> 4200                 mapped = lib.map_infer(values, f, convert=convert_dtype)
   4201 
   4202         if len(mapped) and isinstance(mapped[0], Series):

pandas\_libs\lib.pyx in pandas._libs.lib.map_infer()

<ipython-input-6-dec2e93bf676> in <lambda>(x)
----> 1 prepared_training_data['Text'] = prepared_training_data['Text'].apply(lambda x: model.wv[x.split()])

~\anaconda3\lib\site-packages\gensim\models\keyedvectors.py in __getitem__(self, entities)
    353             return self.get_vector(entities)
    354 
--> 355         return vstack([self.get_vector(entity) for entity in entities])
    356 
    357     def __contains__(self, entity):

~\anaconda3\lib\site-packages\gensim\models\keyedvectors.py in <listcomp>(.0)
    353             return self.get_vector(entities)
    354 
--> 355         return vstack([self.get_vector(entity) for entity in entities])
    356 
    357     def __contains__(self, entity):

~\anaconda3\lib\site-packages\gensim\models\keyedvectors.py in get_vector(self, word)
    469 
    470     def get_vector(self, word):
--> 471         return self.word_vec(word)
    472 
    473     def words_closer_than(self, w1, w2):

~\anaconda3\lib\site-packages\gensim\models\keyedvectors.py in word_vec(self, word, use_norm)
    466             return result
    467         else:
--> 468             raise KeyError("word '%s' not in vocabulary" % word)
    469 
    470     def get_vector(self, word):

KeyError: "word 'Mindaugas' not in vocabulary"