# -*- coding: utf-8 -*-
"""word2vec.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1MnKjTwfarSfbGSH71rBr5kEytE9hew5o
"""

import csv
import gzip
import shutil
import subprocess
import sys

CHALLENGE_DIR = 'sport-text-classification-ball-ISI-public'

# Environment setup. In the notebook these were IPython "!" shell escapes,
# which are a SyntaxError in a plain .py file. subprocess runs the same
# commands under both plain Python and IPython. As with "!", a command that
# exits with a non-zero status does not abort the script (check=False).
SETUP_COMMANDS = [
    # "python -m pip" installs into the interpreter that runs this script.
    [sys.executable, '-m', 'pip', 'install',
     'numpy', 'pandas', 'gensim', 'tensorflow'],
    ['wget', 'https://gonito.net/get/bin/geval'],
    ['chmod', 'u+x', 'geval'],
    ['git', 'clone',
     'https://git.wmi.amu.edu.pl/kubapok/sport-text-classification-ball-ISI-public.git'],
    ['wget',
     'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pl.300.vec.gz'],
    ['gzip', '-d', 'cc.pl.300.vec.gz'],
]
for setup_command in SETUP_COMMANDS:
    subprocess.run(setup_command, check=False)

# Unpack the training set next to its compressed original.
with gzip.open(f'{CHALLENGE_DIR}/train/train.tsv.gz', 'rb') as f_in, \
        open(f'{CHALLENGE_DIR}/train/train.tsv', 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

# Third-party imports stay below the pip install step above so that a fresh
# environment has the packages available before they are imported.
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Data loading
def load_data(filepath):
    """Read an unlabeled TSV file and return one text per non-blank line.

    The tab-separated fields of a line are joined with single spaces.
    Blank lines produce no entry, so the result can be shorter than the file.

    Quote characters are treated as ordinary text (``csv.QUOTE_NONE``): the
    files are raw TSV, and with the csv default a field starting with ``"``
    would swallow the following lines until the next quote.
    """
    # newline='' lets the csv module do its own line-ending handling.
    with open(filepath, 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file, delimiter='\t', quoting=csv.QUOTE_NONE)
        return [' '.join(row) for row in reader if row]


def load_labeled_data(filepath):
    """Read a ``label<TAB>text`` TSV file and return ``(texts, labels)``.

    Lines that do not have exactly two fields, or whose label is not a
    non-negative decimal integer, are reported on stdout and skipped.
    Quote characters are treated as ordinary text (``csv.QUOTE_NONE``) so a
    stray ``"`` cannot merge several lines into one record.
    """
    texts = []
    labels = []
    with open(filepath, 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            if len(row) != 2:
                print(f"Ignoring line due to incorrect number of fields: {row}")
                continue
            label, text = row
            # isdecimal() (unlike isdigit()) only accepts characters that
            # int() can parse, so e.g. a superscript digit is rejected here
            # instead of raising ValueError below.
            if not label.isdecimal():
                print(f"Ignoring line due to invalid label: {row}")
                continue
            labels.append(int(label))
            texts.append(text)
    return texts, labels


print("Starting to load labeled data")
train_texts, train_labels = load_labeled_data(f'{CHALLENGE_DIR}/train/train.tsv')
print("Loaded training data")
dev_texts = load_data(f'{CHALLENGE_DIR}/dev-0/in.tsv')
print("Loaded dev data")
test_texts = load_data(f'{CHALLENGE_DIR}/test-A/in.tsv')
print("Loaded test data")

train_df = pd.DataFrame({'text': train_texts, 'label': train_labels})
dev_df = pd.DataFrame({'text': dev_texts})
test_df = pd.DataFrame({'text': test_texts})

print("Starting to load Word2Vec model")
# Load the pre-trained vectors (text word2vec format)
word2vec = KeyedVectors.load_word2vec_format('cc.pl.300.vec', binary=False)
print("Loaded Word2Vec model")

# Fit the tokenizer on the training texts only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)
print("Fitted tokenizer")

# Convert texts to sequences of word indices
train_sequences = tokenizer.texts_to_sequences(train_texts)
dev_sequences = tokenizer.texts_to_sequences(dev_df['text'])
test_sequences = tokenizer.texts_to_sequences(test_df['text'])

# Pad every split to the length of the longest sequence in any split.
# The maximum is taken over all splits at once, so a single empty split
# no longer raises ValueError.
max_length = max(
    len(seq)
    for split in (train_sequences, dev_sequences, test_sequences)
    for seq in split
)
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post')
dev_padded = pad_sequences(dev_sequences, maxlen=max_length, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post')
print("Padded sequences")

# Build the embedding matrix: row i holds the pre-trained vector of the word
# with tokenizer index i. Words missing from the pre-trained vocabulary keep
# an all-zero row. The +1 accounts for index 0, which no word is mapped to.
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, word2vec.vector_size))
for word, i in tokenizer.word_index.items():
    if word in word2vec:
        embedding_matrix[i] = word2vec[word]
print("Prepared embedding matrix")

# Model architecture: frozen pre-trained embeddings -> flatten -> small dense
# hidden layer -> single sigmoid output for binary classification.
model = Sequential()
model.add(Embedding(input_dim=vocab_size,
                    output_dim=word2vec.vector_size,
                    weights=[embedding_matrix],
                    input_length=max_length,
                    trainable=False))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print("Compiled model")

# Training (20% of the training data is held out for validation)
model.fit(train_padded, np.array(train_labels),
          epochs=10, batch_size=32, validation_split=0.2)
print("Trained model")

# Prediction
import math
import subprocess
from pathlib import Path

# Raw sigmoid outputs for the dev and test inputs.
dev_predictions = model.predict(dev_padded)
test_predictions = model.predict(test_padded)

# Convert the probabilities to binary labels (1 when above 0.5)
dev_predictions = (dev_predictions > 0.5).astype(int)
test_predictions = (test_predictions > 0.5).astype(int)

# Save the predictions, one label per line, without header or index
pd.DataFrame(dev_predictions).to_csv('dev-0_out.tsv', sep='\t', header=False, index=False)
pd.DataFrame(test_predictions).to_csv('test-A_out.tsv', sep='\t', header=False, index=False)
print("Saved predictions")
# NOTE(review): geval presumably reads "<split>/out.tsv" inside the challenge
# directory, while the files above are written to the working directory under
# different names - confirm they are copied into place before evaluation.

# Evaluation. The notebook did "%cd sport-text-classification-ball-ISI-public"
# and then "!../geval -t <split> --metric Accuracy". The export commented the
# %cd out and "!" is not valid Python, so the same commands are run here via
# subprocess with the challenge directory as working directory. The binary
# path is made absolute so it does not depend on that directory change.
# As with "!", a failing evaluation does not abort the script (check=False).
challenge_dir = Path('sport-text-classification-ball-ISI-public')
geval_binary = str(Path('geval').resolve())
for split_name in ('dev-0', 'test-A'):
    subprocess.run(
        [geval_binary, '-t', split_name, '--metric', 'Accuracy'],
        cwd=challenge_dir,
        check=False,
    )

# Score derived from a hard-coded accuracy value (not read from geval output).
accuracy = 0.91471
points = math.ceil(accuracy * 7.0)
print(f"Points: {points}")