142 lines
4.9 KiB
Plaintext
142 lines
4.9 KiB
Plaintext
|
# -*- coding: utf-8 -*-
|
||
|
"""word2vec.ipynb
|
||
|
|
||
|
Automatically generated by Colab.
|
||
|
|
||
|
Original file is located at
|
||
|
https://colab.research.google.com/drive/1MnKjTwfarSfbGSH71rBr5kEytE9hew5o
|
||
|
"""
|
||
|
|
||
|
!pip install numpy pandas gensim tensorflow
|
||
|
!wget https://gonito.net/get/bin/geval
|
||
|
!chmod u+x geval
|
||
|
|
||
|
!git clone https://git.wmi.amu.edu.pl/kubapok/sport-text-classification-ball-ISI-public.git
|
||
|
|
||
|
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pl.300.vec.gz
|
||
|
!gzip -d cc.pl.300.vec.gz
|
||
|
|
||
|
import gzip
|
||
|
import shutil
|
||
|
|
||
|
with gzip.open('sport-text-classification-ball-ISI-public/train/train.tsv.gz', 'rb') as f_in:
|
||
|
with open('sport-text-classification-ball-ISI-public/train/train.tsv', 'wb') as f_out:
|
||
|
shutil.copyfileobj(f_in, f_out)
|
||
|
|
||
|
import pandas as pd
|
||
|
import numpy as np
|
||
|
from gensim.models import KeyedVectors
|
||
|
from tensorflow.keras.models import Sequential
|
||
|
from tensorflow.keras.layers import Dense, Flatten, Embedding
|
||
|
from tensorflow.keras.preprocessing.text import Tokenizer
|
||
|
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
||
|
import csv
|
||
|
|
||
|
# Load the data
|
||
|
def load_data(filepath):
    """Read a tab-separated file and return one text string per non-empty row.

    The fields of each row are joined with a single space, so a row with
    several tab-separated columns becomes one string. Blank lines are
    skipped.

    Quote characters are treated as ordinary text (``csv.QUOTE_NONE``).
    With the csv module's default dialect a stray ``"`` in free text opens
    a quoted field that can swallow tabs and newlines, silently merging
    several input rows into one.

    Args:
        filepath: Path to a UTF-8 encoded TSV file.

    Returns:
        A list with the joined text of each non-empty row, in file order.
    """
    # newline='' lets the csv module do its own line-ending handling, as
    # recommended by the csv documentation.
    with open(filepath, 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file, delimiter='\t', quoting=csv.QUOTE_NONE)
        return [' '.join(row) for row in reader if row]
|
||
|
|
||
|
def load_labeled_data(filepath):
    """Read a two-column ``label<TAB>text`` file into parallel lists.

    A row is kept only when it has exactly two fields and the first one
    consists solely of digits; the label is converted with ``int``. Any
    other row is reported on stdout and skipped.

    Quote characters are treated as ordinary text (``csv.QUOTE_NONE``).
    With the csv module's default dialect a stray ``"`` in the text column
    opens a quoted field that can swallow tabs and newlines, merging
    several rows into a single corrupted example.

    Args:
        filepath: Path to a UTF-8 encoded TSV file.

    Returns:
        A ``(texts, labels)`` tuple of equally long lists, in file order.
    """
    texts = []
    labels = []
    # newline='' lets the csv module do its own line-ending handling, as
    # recommended by the csv documentation.
    with open(filepath, 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            if len(row) != 2:
                print(f"Ignoring line due to incorrect number of fields: {row}")
                continue
            label, text = row
            if not label.isdigit():
                print(f"Ignoring line due to invalid label: {row}")
                continue
            labels.append(int(label))
            texts.append(text)
    return texts, labels
|
||
|
print("Starting to load labeled data")

# Training split: labelled rows, returned as parallel text / label lists.
train_texts, train_labels = load_labeled_data(
    'sport-text-classification-ball-ISI-public/train/train.tsv'
)
print("Loaded training data")

# Dev and test splits: input texts only.
dev_texts = load_data(
    'sport-text-classification-ball-ISI-public/dev-0/in.tsv'
)
print("Loaded dev data")

test_texts = load_data(
    'sport-text-classification-ball-ISI-public/test-A/in.tsv'
)
print("Loaded test data")

# Wrap each split in a DataFrame; the dev/test frames feed the tokenizer below.
train_df = pd.DataFrame(dict(text=train_texts, label=train_labels))
dev_df = pd.DataFrame(dict(text=dev_texts))
test_df = pd.DataFrame(dict(text=test_texts))
|
||
|
print("Starting to load Word2Vec model")

# Load the pretrained vectors from the text-format (binary=False) file.
vectors_path = 'cc.pl.300.vec'
word2vec = KeyedVectors.load_word2vec_format(vectors_path, binary=False)
print("Loaded Word2Vec model")
|
||
|
|
||
|
# Prepare the tokenizer: the vocabulary is fitted on the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)
print("Fitted tokenizer")

# Convert every text into a sequence of integer word indices.
train_sequences = tokenizer.texts_to_sequences(train_texts)
dev_sequences = tokenizer.texts_to_sequences(dev_df['text'])
test_sequences = tokenizer.texts_to_sequences(test_df['text'])

# Pad all splits (at the end, padding='post') to the longest sequence found
# in any of the three splits, so they share one input width.
longest_train = max(map(len, train_sequences))
longest_dev = max(map(len, dev_sequences))
longest_test = max(map(len, test_sequences))
max_length = max(longest_train, longest_dev, longest_test)

train_padded, dev_padded, test_padded = (
    pad_sequences(seqs, maxlen=max_length, padding='post')
    for seqs in (train_sequences, dev_sequences, test_sequences)
)
print("Padded sequences")
|
||
|
|
||
|
# Prepare the embedding matrix: row i holds the pretrained vector of the word
# whose tokenizer index is i. Words missing from the pretrained vectors keep
# an all-zero row. The "+ 1" adds one row beyond the number of known words
# (presumably for index 0 used by padding - confirm against the Tokenizer docs).
vocab_size = 1 + len(tokenizer.word_index)
embedding_matrix = np.zeros((vocab_size, word2vec.vector_size))

for token, row in tokenizer.word_index.items():
    if token not in word2vec:
        continue
    embedding_matrix[row] = word2vec[token]
print("Prepared embedding matrix")
|
||
|
|
||
|
# Model architecture: frozen pretrained embeddings, flattened and fed through
# a small dense layer into a single sigmoid output unit.
model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=word2vec.vector_size,
        weights=[embedding_matrix],
        input_length=max_length,
        trainable=False,  # keep the pretrained vectors fixed during training
    ),
    Flatten(),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
print("Compiled model")
|
||
|
|
||
|
# Training: validation_split=0.2 asks Keras to hold out a 0.2 fraction of the
# training data for validation.
train_targets = np.array(train_labels)
model.fit(
    train_padded,
    train_targets,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
)
print("Trained model")
|
||
|
|
||
|
# Predykcja
|
||
|
dev_predictions = model.predict(dev_padded)
|
||
|
test_predictions = model.predict(test_padded)
|
||
|
|
||
|
# Zamiana wyników na format binarny
|
||
|
dev_predictions = (dev_predictions > 0.5).astype(int)
|
||
|
test_predictions = (test_predictions > 0.5).astype(int)
|
||
|
|
||
|
# Zapisanie wyników
|
||
|
pd.DataFrame(dev_predictions).to_csv('dev-0_out.tsv', sep='\t', header=False, index=False)
|
||
|
pd.DataFrame(test_predictions).to_csv('test-A_out.tsv', sep='\t', header=False, index=False)
|
||
|
print("Saved predictions")
|
||
|
|
||
|
# Commented out IPython magic to ensure Python compatibility.
|
||
|
# %cd sport-text-classification-ball-ISI-public
|
||
|
!../geval -t dev-0 --metric Accuracy
|
||
|
!../geval -t test-A --metric Accuracy
|
||
|
|
||
|
import math

# Accuracy is hard-coded here (presumably copied by hand from the geval
# output - confirm). Points are the accuracy scaled by 7 and rounded up.
accuracy = 0.91471
points = math.ceil(7.0 * accuracy)
print("Points: {}".format(points))
|