Fix the path used to read the expected words (use `train/expected.tsv` instead of the training input file); this does not change the score much

This commit is contained in:
Aleksandra 2024-05-16 11:05:35 +02:00
parent 1eb60bd963
commit 6e08c6f0af
5 changed files with 7931 additions and 7920 deletions

View File

@ -12,11 +12,10 @@ Perplexity hashed by
<b>Zadania</b> <b>Zadania</b>
----------------- -----------------
1. Statystyczny model językowy (zadanie 5) 1. Statystyczny model językowy (zadanie 5)
- branch: master - Perplexity hashed on `dev-0`: 555.75 - branch: master - Perplexity hashed on `dev-0`: 549.12
- branch: 05_ngram - Perplexity hashed on `dev-0`: xxx
<br><br> <br><br>
2. Neuronowy model językowy (zadanie 7) 2. Neuronowy model językowy (zadanie 7)
- branch: 07_neural - Perplexity hashed on `dev-0`: xxx - branch: 07_neural - Perplexity hashed on `dev-0`: 465.53
<br><br> <br><br>
3. Model neuronowy rekurencyjny (zadanie 9) 3. Model neuronowy rekurencyjny (zadanie 9)
- branch: 09_neural - Perplexity hashed on `dev-0`: xxx - branch: 09_neural - Perplexity hashed on `dev-0`: xxx

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

9
run.py
View File

@ -1,15 +1,14 @@
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import csv import csv
import os
import re import re
import random
from collections import Counter, defaultdict from collections import Counter, defaultdict
import nltk import nltk
import math import math
from tqdm import tqdm from tqdm import tqdm
directory = "train/in.tsv.xz" directory = "train/in.tsv.xz"
directory_expected = "train/expected.tsv"
directory_dev_0 = "dev-0/in.tsv.xz" directory_dev_0 = "dev-0/in.tsv.xz"
directory_test_A = "test-A/in.tsv.xz" directory_test_A = "test-A/in.tsv.xz"
@ -70,13 +69,17 @@ class Model():
dataframeList = pd.read_csv(directory, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE, chunksize=10000) dataframeList = pd.read_csv(directory, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE, chunksize=10000)
expectedList = pd.read_csv(directory, sep='\t', header=None, names=['Word'], quoting=csv.QUOTE_NONE, chunksize=10000) expectedList = pd.read_csv(directory_expected, sep='\t', header=None, names=['Word'], quoting=csv.QUOTE_NONE, chunksize=10000)
DATASET = "" DATASET = ""
for number, (dataframe, expected) in enumerate(zip(dataframeList, expectedList)): for number, (dataframe, expected) in enumerate(zip(dataframeList, expectedList)):
dataframe = dataframe.reset_index()
dataframe = dataframe.replace(r'\\r|\\n|\n|\\t', ' ', regex=True) dataframe = dataframe.replace(r'\\r|\\n|\n|\\t', ' ', regex=True)
expected['Word'] = expected['Word'].apply(lambda x: str(x).strip())
word = expected['Word']
left_text = dataframe['LeftContext'].to_list() left_text = dataframe['LeftContext'].to_list()
right_text = dataframe['RightContext'].to_list() right_text = dataframe['RightContext'].to_list()
word = expected['Word'].to_list() word = expected['Word'].to_list()

File diff suppressed because it is too large Load Diff