Fix the path used to read the expected words (use the expected-file path instead of the input path); this does not change the score much

This commit is contained in:
Aleksandra 2024-05-16 11:05:35 +02:00
parent 1eb60bd963
commit 6e08c6f0af
5 changed files with 7931 additions and 7920 deletions

View File

@@ -12,11 +12,10 @@ Perplexity hashed by
<b>Zadania</b>
-----------------
1. Statystyczny model językowy (zadanie 5)
- branch: master - Perplexity hashed on `dev-0`: 555.75
- branch: 05_ngram - Perplexity hashed on `dev-0`: xxx
- branch: master - Perplexity hashed on `dev-0`: 549.12
<br><br>
2. Neuronowy model językowy (zadanie 7)
- branch: 07_neural - Perplexity hashed on `dev-0`: xxx
- branch: 07_neural - Perplexity hashed on `dev-0`: 465.53
<br><br>
3. Model neuronowy rekurencyjny (zadanie 9)
- branch: 09_neural - Perplexity hashed on `dev-0`: xxx

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

9
run.py
View File

@@ -1,15 +1,14 @@
import pandas as pd
import numpy as np
import csv
import os
import re
import random
from collections import Counter, defaultdict
import nltk
import math
from tqdm import tqdm
directory = "train/in.tsv.xz"
directory_expected = "train/expected.tsv"
directory_dev_0 = "dev-0/in.tsv.xz"
directory_test_A = "test-A/in.tsv.xz"
@@ -70,13 +69,17 @@ class Model():
dataframeList = pd.read_csv(directory, sep='\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE, chunksize=10000)
expectedList = pd.read_csv(directory, sep='\t', header=None, names=['Word'], quoting=csv.QUOTE_NONE, chunksize=10000)
expectedList = pd.read_csv(directory_expected, sep='\t', header=None, names=['Word'], quoting=csv.QUOTE_NONE, chunksize=10000)
DATASET = ""
for number, (dataframe, expected) in enumerate(zip(dataframeList, expectedList)):
dataframe = dataframe.reset_index()
dataframe = dataframe.replace(r'\\r|\\n|\n|\\t', ' ', regex=True)
expected['Word'] = expected['Word'].apply(lambda x: str(x).strip())
word = expected['Word']
left_text = dataframe['LeftContext'].to_list()
right_text = dataframe['RightContext'].to_list()
word = expected['Word'].to_list()

File diff suppressed because it is too large Load Diff