sport-text-classification-b.../.ipynb_checkpoints/sport text classification-checkpoint.ipynb
2021-05-24 20:57:41 +02:00

6.3 KiB
Raw Blame History

!pip install gensim
Requirement already satisfied: gensim in c:\users\annad\anaconda3\lib\site-packages (3.8.3)
Requirement already satisfied: smart-open>=1.8.1 in c:\users\annad\anaconda3\lib\site-packages (from gensim) (5.0.0)
Requirement already satisfied: six>=1.5.0 in c:\users\annad\anaconda3\lib\site-packages (from gensim) (1.15.0)
Requirement already satisfied: numpy>=1.11.3 in c:\users\annad\anaconda3\lib\site-packages (from gensim) (1.19.2)
Requirement already satisfied: scipy>=0.18.1 in c:\users\annad\anaconda3\lib\site-packages (from gensim) (1.5.2)
Requirement already satisfied: Cython==0.29.14 in c:\users\annad\anaconda3\lib\site-packages (from gensim) (0.29.14)
import pandas as pd
import numpy as np
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
import os.path
import gzip
import shutil
# Unpack the gzipped training archive into a plain TSV file on disk,
# streaming the bytes so the whole archive is never held in memory.
with gzip.open('train/train.tsv.gz', mode='rb') as packed, \
        open('train/train.tsv', mode='wb') as unpacked:
    shutil.copyfileobj(packed, unpacked)

# Column names are supplied explicitly, so the first line of the file is
# read as a data row rather than as a header.
column_names = ["Ball", "Text"]
data = pd.read_csv('train/train.tsv', sep='\t', names=column_names)
data
Ball Text
0 1 Mindaugas Budzinauskas wierzy w odbudowę formy...
1 1 Przyjmujący reprezentacji Polski wrócił do PGE...
2 0 FEN 9: Zapowiedź walki Róża Gumienna vs Katarz...
3 1 Aleksander Filipiak: Czuję się dobrze w nowym ...
4 0 Victoria Carl i Aleksiej Czerwotkin mistrzami ...
... ... ...
98127 1 Kamil Syprzak zaczyna kolekcjonować trofea. FC...
98128 1 Holandia: dwa gole Piotra Parzyszka Piotr Parz...
98129 1 Sparingowo: Korona gorsza od Stali. Lettieri s...
98130 1 Vive - Wisła. Ośmiu debiutantów w tegorocznej ...
98131 1 WTA Miami: Timea Bacsinszky pokonana, Swietłan...

98132 rows × 2 columns

model = None
if not os.path.isfile('word2vec.model'):    
    model = Word2Vec(sentences=data["Text"], window=5, min_count=1, workers=5)
    model.save("word2vec.model")
else: