Uczenie_glebokie/word2vector

# -*- coding: utf-8 -*-
"""word2vec.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1MnKjTwfarSfbGSH71rBr5kEytE9hew5o
"""
!pip install numpy pandas gensim tensorflow
!wget https://gonito.net/get/bin/geval
!chmod u+x geval
!git clone https://git.wmi.amu.edu.pl/kubapok/sport-text-classification-ball-ISI-public.git
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pl.300.vec.gz
!gzip -d cc.pl.300.vec.gz
import gzip
import shutil
with gzip.open('sport-text-classification-ball-ISI-public/train/train.tsv.gz', 'rb') as f_in:
    with open('sport-text-classification-ball-ISI-public/train/train.tsv', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import csv
# Data loading helpers
def load_data(filepath):
    texts = []
    with open(filepath, 'r', encoding='utf-8') as file:
        reader = csv.reader(file, delimiter='\t')
        for row in reader:
            if len(row) >= 1:
                texts.append(' '.join(row))
    return texts
def load_labeled_data(filepath):
    texts = []
    labels = []
    with open(filepath, 'r', encoding='utf-8') as file:
        reader = csv.reader(file, delimiter='\t')
        for row in reader:
            if len(row) == 2:
                label, text = row
                if label.isdigit():
                    labels.append(int(label))
                    texts.append(text)
                else:
                    print(f"Ignoring line due to invalid label: {row}")
            else:
                print(f"Ignoring line due to incorrect number of fields: {row}")
    return texts, labels
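# Minimal sanity check of the loaders (a sketch; the sample rows below are
# hypothetical and not taken from the challenge data). train.tsv is expected
# to hold "label<TAB>text" pairs with a binary 0/1 label.
_sample_path = '/tmp/sample_train.tsv'
with open(_sample_path, 'w', encoding='utf-8') as f:
    f.write('1\tLewandowski strzelił dwa gole w meczu ligowym\n')
    f.write('0\tNowa płyta zespołu ukaże się w maju\n')
_texts, _labels = load_labeled_data(_sample_path)
print(_texts, _labels)  # expected: two texts and labels [1, 0]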
print("Starting to load labeled data")
train_texts, train_labels = load_labeled_data('sport-text-classification-ball-ISI-public/train/train.tsv')
print("Loaded training data")
dev_texts = load_data('sport-text-classification-ball-ISI-public/dev-0/in.tsv')
print("Loaded dev data")
test_texts = load_data('sport-text-classification-ball-ISI-public/test-A/in.tsv')
print("Loaded test data")
train_df = pd.DataFrame({'text': train_texts, 'label': train_labels})
dev_df = pd.DataFrame({'text': dev_texts})
test_df = pd.DataFrame({'text': test_texts})
print("Starting to load Word2Vec model")
# Load the pretrained fastText vectors (word2vec text format)
word2vec = KeyedVectors.load_word2vec_format('cc.pl.300.vec', binary=False)
print("Loaded Word2Vec model")
# Prepare the tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)
print("Fitted tokenizer")
# Convert texts to integer sequences
train_sequences = tokenizer.texts_to_sequences(train_texts)
dev_sequences = tokenizer.texts_to_sequences(dev_df['text'])
test_sequences = tokenizer.texts_to_sequences(test_df['text'])
# Pad all sequences to the longest sequence across train/dev/test
max_length = max(
    max(len(seq) for seq in train_sequences),
    max(len(seq) for seq in dev_sequences),
    max(len(seq) for seq in test_sequences),
)
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post')
dev_padded = pad_sequences(dev_sequences, maxlen=max_length, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post')
print("Padded sequences")
# Build the embedding matrix from the pretrained vectors
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, word2vec.vector_size))
for word, i in tokenizer.word_index.items():
    if word in word2vec:
        embedding_matrix[i] = word2vec[word]
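# Optional coverage report (a sketch, not part of the original notebook): words
# absent from cc.pl.300.vec keep an all-zero row in embedding_matrix.
covered = sum(1 for word in tokenizer.word_index if word in word2vec)
print(f"Embedding coverage: {covered}/{len(tokenizer.word_index)} "
      f"words ({covered / len(tokenizer.word_index):.1%}) found in the pretrained vectors")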
print("Prepared embedding matrix")
# Model architecture: frozen pretrained embeddings + a small dense classifier
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=word2vec.vector_size, weights=[embedding_matrix], input_length=max_length, trainable=False))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print("Compiled model")
# Training
model.fit(train_padded, np.array(train_labels), epochs=10, batch_size=32, validation_split=0.2)
print("Trained model")
# Prediction
dev_predictions = model.predict(dev_padded)
test_predictions = model.predict(test_padded)
# Threshold the sigmoid outputs at 0.5 to get binary labels
dev_predictions = (dev_predictions > 0.5).astype(int)
test_predictions = (test_predictions > 0.5).astype(int)
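# Optional local check (a sketch, assuming dev-0/expected.tsv ships with the
# public repository; the geval call below computes the same metric):
import os
expected_path = 'sport-text-classification-ball-ISI-public/dev-0/expected.tsv'
if os.path.exists(expected_path):
    expected = np.loadtxt(expected_path, dtype=int)
    print("Dev accuracy:", (dev_predictions.ravel() == expected).mean())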
# Save predictions; geval expects them inside the challenge directory
# as dev-0/out.tsv and test-A/out.tsv
pd.DataFrame(dev_predictions).to_csv('dev-0_out.tsv', sep='\t', header=False, index=False)
pd.DataFrame(test_predictions).to_csv('test-A_out.tsv', sep='\t', header=False, index=False)
shutil.copyfile('dev-0_out.tsv', 'sport-text-classification-ball-ISI-public/dev-0/out.tsv')
shutil.copyfile('test-A_out.tsv', 'sport-text-classification-ball-ISI-public/test-A/out.tsv')
print("Saved predictions")
# Commented out IPython magic to ensure Python compatibility.
# %cd sport-text-classification-ball-ISI-public
!../geval -t dev-0 --metric Accuracy
!../geval -t test-A --metric Accuracy
import math
accuracy = 0.91471
points = math.ceil(accuracy * 7.0)
print(f"Points: {points}")