This commit is contained in:
Norbert Litkowski 2022-04-25 00:28:09 +02:00
parent 61e88a9c8c
commit b78257156a
5 changed files with 854 additions and 1 deletions

2
.gitignore vendored
View File

@ -1,4 +1,4 @@
.ipynb_checkpoints*
*~
*.swp
*.bak

0
model.arpa Normal file
View File

604
run.ipynb Normal file
View File

@ -0,0 +1,604 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "f834096a",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from utils import *"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "5bf0e02b",
"metadata": {},
"outputs": [],
"source": [
"data = get_csv(\"train/in.tsv.xz\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "98ebf07f",
"metadata": {},
"outputs": [],
"source": [
"train_labels = get_csv(\"train/expected.tsv\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "42cb7bb1",
"metadata": {},
"outputs": [],
"source": [
"train_data = data[[6,7]]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "e6e0480e",
"metadata": {},
"outputs": [],
"source": [
"train_data = pd.concat([train_data, train_labels], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "464dc043",
"metadata": {},
"outputs": [],
"source": [
"# Rebuild each training sample as <column 6> <column 0> <column 7>.\n",
"# Column 0 is the label loaded from train/expected.tsv; the explicit spaces\n",
"# keep it from fusing with the last/first token of the surrounding columns.\n",
"# NOTE(review): assumes columns 6 and 7 are the left/right context of the gap - confirm against the dataset description.\n",
"train_data[607] = train_data[6] + \" \" + train_data[0] + \" \" + train_data[7]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "f5115f59",
"metadata": {},
"outputs": [],
"source": [
"train_data[607] = train_data[607].apply(clean_text)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "25585b08",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 came fiom the last place to thisnplace and thi...\n",
"1 mb boot political obeednattempt to imagine a p...\n",
"2 thera were in only aeventyninenuberlbers lo ...\n",
"3 a gixnl man y niterertiiiv diiclosurs regard ...\n",
"4 tin ub tv thf bbabbt qabjenmr schiffs tutemen...\n",
" ... \n",
"432017 sam clendenin bad a fancy for uinscience of me...\n",
"432018 witahtt halting the party ware dilven to the s...\n",
"432019 it was the last thing that either ofnthem expe...\n",
"432020 settlement with the departmentnit is also show...\n",
"432021 flour quotationslow extras at r ® ncity mi...\n",
"Name: 607, Length: 432022, dtype: object"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_data[607]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "325a9592",
"metadata": {},
"outputs": [],
"source": [
"# Write one cleaned sample per line to the scratch file that lmplz reads.\n",
"# Plain write mode is enough (nothing is read back), and UTF-8 is pinned so\n",
"# non-ASCII characters do not depend on the locale's default encoding.\n",
"with open(\"tmp\", \"w\", encoding=\"utf-8\") as corpus_file:\n",
"    corpus_file.writelines(text + \"\\n\" for text in train_data[607])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "08888fa3",
"metadata": {},
"outputs": [],
"source": [
"KENLM_BUILD_PATH = \"../kenlm/build/\"\n",
"# lmplz aborts with FormatLoadException when the corpus contains special\n",
"# tokens such as <s>; --skip_symbols converts them to whitespace instead,\n",
"# so the 4-gram model is actually written to model.arpa.\n",
"!$KENLM_BUILD_PATH/bin/lmplz -o 4 --skip_symbols < tmp > model.arpa"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "42a8d737",
"metadata": {},
"outputs": [],
"source": [
"!rm tmp"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "311c90de",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Defaulting to user installation because normal site-packages is not writeable\n",
"Collecting kenlm\n",
" Downloading kenlm-0.tar.gz (1.4 MB)\n",
" |████████████████████████████████| 1.4 MB 610 kB/s \n",
"\u001b[?25hBuilding wheels for collected packages: kenlm\n",
" Building wheel for kenlm (setup.py) ... \u001b[?25lerror\n",
"\u001b[31m ERROR: Command errored out with exit status 1:\n",
" command: /usr/bin/python -u -c 'import sys, setuptools, tokenize; sys.argv[0] = '\"'\"'/tmp/pip-install-tpogj6ox/kenlm_6280e82d7a044d36906510f5646258a0/setup.py'\"'\"'; __file__='\"'\"'/tmp/pip-install-tpogj6ox/kenlm_6280e82d7a044d36906510f5646258a0/setup.py'\"'\"';f=getattr(tokenize, '\"'\"'open'\"'\"', open)(__file__);code=f.read().replace('\"'\"'\\r\\n'\"'\"', '\"'\"'\\n'\"'\"');f.close();exec(compile(code, __file__, '\"'\"'exec'\"'\"'))' bdist_wheel -d /tmp/pip-wheel-s72u5291\n",
" cwd: /tmp/pip-install-tpogj6ox/kenlm_6280e82d7a044d36906510f5646258a0/\n",
" Complete output (380 lines):\n",
" running bdist_wheel\n",
" running build\n",
" running build_ext\n",
" building 'kenlm' extension\n",
" creating build/temp.linux-x86_64-3.10\n",
" creating build/temp.linux-x86_64-3.10/lm\n",
" creating build/temp.linux-x86_64-3.10/python\n",
" creating build/temp.linux-x86_64-3.10/util\n",
" creating build/temp.linux-x86_64-3.10/util/double-conversion\n",
" gcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -ffat-lto-objects -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -fPIC -I. -I/usr/include/python3.10 -c lm/bhiksha.cc -o build/temp.linux-x86_64-3.10/lm/bhiksha.o -O3 -DNDEBUG -DKENLM_MAX_ORDER=6 -std=c++11 -DHAVE_ZLIB -DHAVE_BZLIB -DHAVE_XZLIB\n",
" gcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -ffat-lto-objects -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -fPIC -I. -I/usr/include/python3.10 -c lm/binary_format.cc -o build/temp.linux-x86_64-3.10/lm/binary_format.o -O3 -DNDEBUG -DKENLM_MAX_ORDER=6 -std=c++11 -DHAVE_ZLIB -DHAVE_BZLIB -DHAVE_XZLIB\n",
" lm/binary_format.cc: In member function void lm::ngram::BinaryFormat::FinishFile(const lm::ngram::Config&, lm::ngram::ModelType, unsigned int, const std::vector<long unsigned int>&):\n",
" lm/binary_format.cc:261:9: warning: void* memset(void*, int, size_t) clearing an object of type struct lm::ngram::Parameters with no trivial copy-assignment; use assignment or value-initialization instead [-Wclass-memaccess]\n",
" 261 | memset(&params, 0, sizeof(Parameters));\n",
" | ~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n",
" In file included from lm/binary_format.cc:1:\n",
" ./lm/binary_format.hh:42:8: note: struct lm::ngram::Parameters declared here\n",
" 42 | struct Parameters {\n",
" | ^~~~~~~~~~\n",
" gcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -ffat-lto-objects -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -fPIC -I. -I/usr/include/python3.10 -c lm/config.cc -o build/temp.linux-x86_64-3.10/lm/config.o -O3 -DNDEBUG -DKENLM_MAX_ORDER=6 -std=c++11 -DHAVE_ZLIB -DHAVE_BZLIB -DHAVE_XZLIB\n",
" gcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -ffat-lto-objects -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -fPIC -I. -I/usr/include/python3.10 -c lm/lm_exception.cc -o build/temp.linux-x86_64-3.10/lm/lm_exception.o -O3 -DNDEBUG -DKENLM_MAX_ORDER=6 -std=c++11 -DHAVE_ZLIB -DHAVE_BZLIB -DHAVE_XZLIB\n",
" gcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -ffat-lto-objects -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -fPIC -I. -I/usr/include/python3.10 -c lm/model.cc -o build/temp.linux-x86_64-3.10/lm/model.o -O3 -DNDEBUG -DKENLM_MAX_ORDER=6 -std=c++11 -DHAVE_ZLIB -DHAVE_BZLIB -DHAVE_XZLIB\n",
" In file included from ./lm/model.hh:13,\n",
" from lm/model.cc:1:\n",
" ./lm/vocab.hh:210:43: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
" 210 | void MissingUnknown(const Config &config) throw(SpecialWordMissingException);\n",
" | ^~~~~\n",
" ./lm/vocab.hh:211:67: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
" 211 | void MissingSentenceMarker(const Config &config, const char *str) throw(SpecialWordMissingException);\n",
" | ^~~~~\n",
" ./lm/vocab.hh:213:85: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
" 213 | template <class Vocab> void CheckSpecials(const Config &config, const Vocab &vocab) throw(SpecialWordMissingException) {\n",
" | ^~~~~\n",
" gcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -ffat-lto-objects -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -fPIC -I. -I/usr/include/python3.10 -c lm/quantize.cc -o build/temp.linux-x86_64-3.10/lm/quantize.o -O3 -DNDEBUG -DKENLM_MAX_ORDER=6 -std=c++11 -DHAVE_ZLIB -DHAVE_BZLIB -DHAVE_XZLIB\n",
" gcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -ffat-lto-objects -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -fPIC -I. -I/usr/include/python3.10 -c lm/read_arpa.cc -o build/temp.linux-x86_64-3.10/lm/read_arpa.o -O3 -DNDEBUG -DKENLM_MAX_ORDER=6 -std=c++11 -DHAVE_ZLIB -DHAVE_BZLIB -DHAVE_XZLIB\n",
" gcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -ffat-lto-objects -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -fPIC -I. -I/usr/include/python3.10 -c lm/search_hashed.cc -o build/temp.linux-x86_64-3.10/lm/search_hashed.o -O3 -DNDEBUG -DKENLM_MAX_ORDER=6 -std=c++11 -DHAVE_ZLIB -DHAVE_BZLIB -DHAVE_XZLIB\n",
" In file included from ./lm/model.hh:13,\n",
" from lm/search_hashed.cc:6:\n",
" ./lm/vocab.hh:210:43: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
" 210 | void MissingUnknown(const Config &config) throw(SpecialWordMissingException);\n",
" | ^~~~~\n",
" ./lm/vocab.hh:211:67: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
" 211 | void MissingSentenceMarker(const Config &config, const char *str) throw(SpecialWordMissingException);\n",
" | ^~~~~\n",
" ./lm/vocab.hh:213:85: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
" 213 | template <class Vocab> void CheckSpecials(const Config &config, const Vocab &vocab) throw(SpecialWordMissingException) {\n",
" | ^~~~~\n",
" gcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -ffat-lto-objects -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -fPIC -I. -I/usr/include/python3.10 -c lm/search_trie.cc -o build/temp.linux-x86_64-3.10/lm/search_trie.o -O3 -DNDEBUG -DKENLM_MAX_ORDER=6 -std=c++11 -DHAVE_ZLIB -DHAVE_BZLIB -DHAVE_XZLIB\n",
" In file included from lm/search_trie.cc:12:\n",
" ./lm/vocab.hh:210:43: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
" 210 | void MissingUnknown(const Config &config) throw(SpecialWordMissingException);\n",
" | ^~~~~\n",
" ./lm/vocab.hh:211:67: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
" 211 | void MissingSentenceMarker(const Config &config, const char *str) throw(SpecialWordMissingException);\n",
" | ^~~~~\n",
" ./lm/vocab.hh:213:85: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
" 213 | template <class Vocab> void CheckSpecials(const Config &config, const Vocab &vocab) throw(SpecialWordMissingException) {\n",
" | ^~~~~\n",
" gcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -ffat-lto-objects -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -fPIC -I. -I/usr/include/python3.10 -c lm/sizes.cc -o build/temp.linux-x86_64-3.10/lm/sizes.o -O3 -DNDEBUG -DKENLM_MAX_ORDER=6 -std=c++11 -DHAVE_ZLIB -DHAVE_BZLIB -DHAVE_XZLIB\n",
" In file included from ./lm/model.hh:13,\n",
" from lm/sizes.cc:2:\n",
" ./lm/vocab.hh:210:43: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
" 210 | void MissingUnknown(const Config &config) throw(SpecialWordMissingException);\n",
" | ^~~~~\n",
" ./lm/vocab.hh:211:67: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
" 211 | void MissingSentenceMarker(const Config &config, const char *str) throw(SpecialWordMissingException);\n",
" | ^~~~~\n",
" ./lm/vocab.hh:213:85: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
" 213 | template <class Vocab> void CheckSpecials(const Config &config, const Vocab &vocab) throw(SpecialWordMissingException) {\n",
" | ^~~~~\n",
" gcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -ffat-lto-objects -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -fPIC -I. -I/usr/include/python3.10 -c lm/trie.cc -o build/temp.linux-x86_64-3.10/lm/trie.o -O3 -DNDEBUG -DKENLM_MAX_ORDER=6 -std=c++11 -DHAVE_ZLIB -DHAVE_BZLIB -DHAVE_XZLIB\n",
" gcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -ffat-lto-objects -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -fPIC -I. -I/usr/include/python3.10 -c lm/trie_sort.cc -o build/temp.linux-x86_64-3.10/lm/trie_sort.o -O3 -DNDEBUG -DKENLM_MAX_ORDER=6 -std=c++11 -DHAVE_ZLIB -DHAVE_BZLIB -DHAVE_XZLIB\n",
" In file included from lm/trie_sort.cc:6:\n",
" ./lm/vocab.hh:210:43: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
" 210 | void MissingUnknown(const Config &config) throw(SpecialWordMissingException);\n",
" | ^~~~~\n",
" ./lm/vocab.hh:211:67: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
" 211 | void MissingSentenceMarker(const Config &config, const char *str) throw(SpecialWordMissingException);\n",
" | ^~~~~\n",
" ./lm/vocab.hh:213:85: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
" 213 | template <class Vocab> void CheckSpecials(const Config &config, const Vocab &vocab) throw(SpecialWordMissingException) {\n",
" | ^~~~~\n",
" gcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -ffat-lto-objects -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -fPIC -I. -I/usr/include/python3.10 -c lm/value_build.cc -o build/temp.linux-x86_64-3.10/lm/value_build.o -O3 -DNDEBUG -DKENLM_MAX_ORDER=6 -std=c++11 -DHAVE_ZLIB -DHAVE_BZLIB -DHAVE_XZLIB\n",
" In file included from ./lm/model.hh:13,\n",
" from lm/value_build.cc:3:\n",
" ./lm/vocab.hh:210:43: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
" 210 | void MissingUnknown(const Config &config) throw(SpecialWordMissingException);\n",
" | ^~~~~\n",
" ./lm/vocab.hh:211:67: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
" 211 | void MissingSentenceMarker(const Config &config, const char *str) throw(SpecialWordMissingException);\n",
" | ^~~~~\n",
" ./lm/vocab.hh:213:85: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
" 213 | template <class Vocab> void CheckSpecials(const Config &config, const Vocab &vocab) throw(SpecialWordMissingException) {\n",
" | ^~~~~\n",
" gcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -ffat-lto-objects -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -fPIC -I. -I/usr/include/python3.10 -c lm/virtual_interface.cc -o build/temp.linux-x86_64-3.10/lm/virtual_interface.o -O3 -DNDEBUG -DKENLM_MAX_ORDER=6 -std=c++11 -DHAVE_ZLIB -DHAVE_BZLIB -DHAVE_XZLIB\n",
" gcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -ffat-lto-objects -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -fPIC -I. -I/usr/include/python3.10 -c lm/vocab.cc -o build/temp.linux-x86_64-3.10/lm/vocab.o -O3 -DNDEBUG -DKENLM_MAX_ORDER=6 -std=c++11 -DHAVE_ZLIB -DHAVE_BZLIB -DHAVE_XZLIB\n",
" In file included from lm/vocab.cc:1:\n",
" ./lm/vocab.hh:210:43: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
" 210 | void MissingUnknown(const Config &config) throw(SpecialWordMissingException);\n",
" | ^~~~~\n",
" ./lm/vocab.hh:211:67: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
" 211 | void MissingSentenceMarker(const Config &config, const char *str) throw(SpecialWordMissingException);\n",
" | ^~~~~\n",
" ./lm/vocab.hh:213:85: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
" 213 | template <class Vocab> void CheckSpecials(const Config &config, const Vocab &vocab) throw(SpecialWordMissingException) {\n",
" | ^~~~~\n",
" lm/vocab.cc:285:43: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
" 285 | void MissingUnknown(const Config &config) throw(SpecialWordMissingException) {\n",
" | ^~~~~\n",
" lm/vocab.cc:297:67: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
" 297 | void MissingSentenceMarker(const Config &config, const char *str) throw(SpecialWordMissingException) {\n",
" | ^~~~~\n",
" gcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -ffat-lto-objects -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -fPIC -I. -I/usr/include/python3.10 -c python/kenlm.cpp -o build/temp.linux-x86_64-3.10/python/kenlm.o -O3 -DNDEBUG -DKENLM_MAX_ORDER=6 -std=c++11 -DHAVE_ZLIB -DHAVE_BZLIB -DHAVE_XZLIB\n",
" In file included from ./lm/model.hh:13,\n",
" from python/kenlm.cpp:253:\n",
" ./lm/vocab.hh:210:43: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
" 210 | void MissingUnknown(const Config &config) throw(SpecialWordMissingException);\n",
" | ^~~~~\n",
" ./lm/vocab.hh:211:67: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
" 211 | void MissingSentenceMarker(const Config &config, const char *str) throw(SpecialWordMissingException);\n",
" | ^~~~~\n",
" ./lm/vocab.hh:213:85: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
" 213 | template <class Vocab> void CheckSpecials(const Config &config, const Vocab &vocab) throw(SpecialWordMissingException) {\n",
" | ^~~~~\n",
" In file included from /usr/include/python3.10/Python.h:74,\n",
" from python/kenlm.cpp:16:\n",
" python/kenlm.cpp: In function void __pyx_tp_dealloc_5kenlm_Model(PyObject*):\n",
" /usr/include/python3.10/object.h:133:33: error: lvalue required as increment operand\n",
" 133 | #define Py_REFCNT(ob) _Py_REFCNT(_PyObject_CAST_CONST(ob))\n",
" | ~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~\n",
" python/kenlm.cpp:4398:7: note: in expansion of macro Py_REFCNT\n",
" 4398 | ++Py_REFCNT(o);\n",
" | ^~~~~~~~~\n",
" /usr/include/python3.10/object.h:133:33: error: lvalue required as decrement operand\n",
" 133 | #define Py_REFCNT(ob) _Py_REFCNT(_PyObject_CAST_CONST(ob))\n",
" | ~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~\n",
" python/kenlm.cpp:4400:7: note: in expansion of macro Py_REFCNT\n",
" 4400 | --Py_REFCNT(o);\n",
" | ^~~~~~~~~\n",
" python/kenlm.cpp: In function PyObject* PyInit_kenlm():\n",
" python/kenlm.cpp:4876:37: error: PyTypeObject {aka struct _typeobject} has no member named tp_print\n",
" 4876 | __pyx_type_5kenlm_FullScoreReturn.tp_print = 0;\n",
" | ^~~~~~~~\n",
" python/kenlm.cpp:4880:27: error: PyTypeObject {aka struct _typeobject} has no member named tp_print\n",
" 4880 | __pyx_type_5kenlm_State.tp_print = 0;\n",
" | ^~~~~~~~\n",
" python/kenlm.cpp:4884:28: error: PyTypeObject {aka struct _typeobject} has no member named tp_print\n",
" 4884 | __pyx_type_5kenlm_Config.tp_print = 0;\n",
" | ^~~~~~~~\n",
" python/kenlm.cpp:4888:27: error: PyTypeObject {aka struct _typeobject} has no member named tp_print\n",
" 4888 | __pyx_type_5kenlm_Model.tp_print = 0;\n",
" | ^~~~~~~~\n",
" python/kenlm.cpp:4902:53: error: PyTypeObject {aka struct _typeobject} has no member named tp_print\n",
" 4902 | __pyx_type_5kenlm___pyx_scope_struct__full_scores.tp_print = 0;\n",
" | ^~~~~~~~\n",
" In file included from /usr/include/python3.10/unicodeobject.h:1046,\n",
" from /usr/include/python3.10/Python.h:83,\n",
" from python/kenlm.cpp:16:\n",
" python/kenlm.cpp: In function int __Pyx_ParseOptionalKeywords(PyObject*, PyObject***, PyObject*, PyObject**, Py_ssize_t, const char*):\n",
" /usr/include/python3.10/cpython/unicodeobject.h:451:61: warning: Py_ssize_t _PyUnicode_get_wstr_length(PyObject*) is deprecated [-Wdeprecated-declarations]\n",
" 451 | #define PyUnicode_WSTR_LENGTH(op) _PyUnicode_get_wstr_length((PyObject*)op)\n",
" | ~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~\n",
" /usr/include/python3.10/cpython/unicodeobject.h:261:7: note: in expansion of macro PyUnicode_WSTR_LENGTH\n",
" 261 | PyUnicode_WSTR_LENGTH(op) : \\\n",
" | ^~~~~~~~~~~~~~~~~~~~~\n",
" python/kenlm.cpp:5396:22: note: in expansion of macro PyUnicode_GET_SIZE\n",
" 5396 | (PyUnicode_GET_SIZE(**name) != PyUnicode_GET_SIZE(key)) ? 1 :\n",
" | ^~~~~~~~~~~~~~~~~~\n",
" /usr/include/python3.10/cpython/unicodeobject.h:446:26: note: declared here\n",
" 446 | static inline Py_ssize_t _PyUnicode_get_wstr_length(PyObject *op) {\n",
" | ^~~~~~~~~~~~~~~~~~~~~~~~~~\n",
" /usr/include/python3.10/cpython/unicodeobject.h:262:33: warning: Py_UNICODE* PyUnicode_AsUnicode(PyObject*) is deprecated [-Wdeprecated-declarations]\n",
" 262 | ((void)PyUnicode_AsUnicode(_PyObject_CAST(op)),\\\n",
" | ~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~\n",
" python/kenlm.cpp:5396:22: note: in expansion of macro PyUnicode_GET_SIZE\n",
" 5396 | (PyUnicode_GET_SIZE(**name) != PyUnicode_GET_SIZE(key)) ? 1 :\n",
" | ^~~~~~~~~~~~~~~~~~\n",
" /usr/include/python3.10/cpython/unicodeobject.h:580:45: note: declared here\n",
" 580 | Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(\n",
" | ^~~~~~~~~~~~~~~~~~~\n",
" /usr/include/python3.10/cpython/unicodeobject.h:451:61: warning: Py_ssize_t _PyUnicode_get_wstr_length(PyObject*) is deprecated [-Wdeprecated-declarations]\n",
" 451 | #define PyUnicode_WSTR_LENGTH(op) _PyUnicode_get_wstr_length((PyObject*)op)\n",
" | ~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~\n",
" /usr/include/python3.10/cpython/unicodeobject.h:264:8: note: in expansion of macro PyUnicode_WSTR_LENGTH\n",
" 264 | PyUnicode_WSTR_LENGTH(op)))\n",
" | ^~~~~~~~~~~~~~~~~~~~~\n",
" python/kenlm.cpp:5396:22: note: in expansion of macro PyUnicode_GET_SIZE\n",
" 5396 | (PyUnicode_GET_SIZE(**name) != PyUnicode_GET_SIZE(key)) ? 1 :\n",
" | ^~~~~~~~~~~~~~~~~~\n",
" /usr/include/python3.10/cpython/unicodeobject.h:446:26: note: declared here\n",
" 446 | static inline Py_ssize_t _PyUnicode_get_wstr_length(PyObject *op) {\n",
" | ^~~~~~~~~~~~~~~~~~~~~~~~~~\n",
" /usr/include/python3.10/cpython/unicodeobject.h:451:61: warning: Py_ssize_t _PyUnicode_get_wstr_length(PyObject*) is deprecated [-Wdeprecated-declarations]\n",
" 451 | #define PyUnicode_WSTR_LENGTH(op) _PyUnicode_get_wstr_length((PyObject*)op)\n",
" | ~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~\n",
" /usr/include/python3.10/cpython/unicodeobject.h:261:7: note: in expansion of macro PyUnicode_WSTR_LENGTH\n",
" 261 | PyUnicode_WSTR_LENGTH(op) : \\\n",
" | ^~~~~~~~~~~~~~~~~~~~~\n",
" python/kenlm.cpp:5396:52: note: in expansion of macro PyUnicode_GET_SIZE\n",
" 5396 | (PyUnicode_GET_SIZE(**name) != PyUnicode_GET_SIZE(key)) ? 1 :\n",
" | ^~~~~~~~~~~~~~~~~~\n",
" /usr/include/python3.10/cpython/unicodeobject.h:446:26: note: declared here\n",
" 446 | static inline Py_ssize_t _PyUnicode_get_wstr_length(PyObject *op) {\n",
" | ^~~~~~~~~~~~~~~~~~~~~~~~~~\n",
" /usr/include/python3.10/cpython/unicodeobject.h:262:33: warning: Py_UNICODE* PyUnicode_AsUnicode(PyObject*) is deprecated [-Wdeprecated-declarations]\n",
" 262 | ((void)PyUnicode_AsUnicode(_PyObject_CAST(op)),\\\n",
" | ~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~\n",
" python/kenlm.cpp:5396:52: note: in expansion of macro PyUnicode_GET_SIZE\n",
" 5396 | (PyUnicode_GET_SIZE(**name) != PyUnicode_GET_SIZE(key)) ? 1 :\n",
" | ^~~~~~~~~~~~~~~~~~\n",
" /usr/include/python3.10/cpython/unicodeobject.h:580:45: note: declared here\n",
" 580 | Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(\n",
" | ^~~~~~~~~~~~~~~~~~~\n",
" /usr/include/python3.10/cpython/unicodeobject.h:451:61: warning: Py_ssize_t _PyUnicode_get_wstr_length(PyObject*) is deprecated [-Wdeprecated-declarations]\n",
" 451 | #define PyUnicode_WSTR_LENGTH(op) _PyUnicode_get_wstr_length((PyObject*)op)\n",
" | ~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~\n",
" /usr/include/python3.10/cpython/unicodeobject.h:264:8: note: in expansion of macro PyUnicode_WSTR_LENGTH\n",
" 264 | PyUnicode_WSTR_LENGTH(op)))\n",
" | ^~~~~~~~~~~~~~~~~~~~~\n",
" python/kenlm.cpp:5396:52: note: in expansion of macro PyUnicode_GET_SIZE\n",
" 5396 | (PyUnicode_GET_SIZE(**name) != PyUnicode_GET_SIZE(key)) ? 1 :\n",
" | ^~~~~~~~~~~~~~~~~~\n",
" /usr/include/python3.10/cpython/unicodeobject.h:446:26: note: declared here\n",
" 446 | static inline Py_ssize_t _PyUnicode_get_wstr_length(PyObject *op) {\n",
" | ^~~~~~~~~~~~~~~~~~~~~~~~~~\n",
" /usr/include/python3.10/cpython/unicodeobject.h:451:61: warning: Py_ssize_t _PyUnicode_get_wstr_length(PyObject*) is deprecated [-Wdeprecated-declarations]\n",
" 451 | #define PyUnicode_WSTR_LENGTH(op) _PyUnicode_get_wstr_length((PyObject*)op)\n",
" | ~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~\n",
" /usr/include/python3.10/cpython/unicodeobject.h:261:7: note: in expansion of macro PyUnicode_WSTR_LENGTH\n",
" 261 | PyUnicode_WSTR_LENGTH(op) : \\\n",
" | ^~~~~~~~~~~~~~~~~~~~~\n",
" python/kenlm.cpp:5412:26: note: in expansion of macro PyUnicode_GET_SIZE\n",
" 5412 | (PyUnicode_GET_SIZE(**argname) != PyUnicode_GET_SIZE(key)) ? 1 :\n",
" | ^~~~~~~~~~~~~~~~~~\n",
" /usr/include/python3.10/cpython/unicodeobject.h:446:26: note: declared here\n",
" 446 | static inline Py_ssize_t _PyUnicode_get_wstr_length(PyObject *op) {\n",
" | ^~~~~~~~~~~~~~~~~~~~~~~~~~\n",
" /usr/include/python3.10/cpython/unicodeobject.h:262:33: warning: Py_UNICODE* PyUnicode_AsUnicode(PyObject*) is deprecated [-Wdeprecated-declarations]\n",
" 262 | ((void)PyUnicode_AsUnicode(_PyObject_CAST(op)),\\\n",
" | ~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~\n",
" python/kenlm.cpp:5412:26: note: in expansion of macro PyUnicode_GET_SIZE\n",
" 5412 | (PyUnicode_GET_SIZE(**argname) != PyUnicode_GET_SIZE(key)) ? 1 :\n",
" | ^~~~~~~~~~~~~~~~~~\n",
" /usr/include/python3.10/cpython/unicodeobject.h:580:45: note: declared here\n",
" 580 | Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(\n",
" | ^~~~~~~~~~~~~~~~~~~\n",
" /usr/include/python3.10/cpython/unicodeobject.h:451:61: warning: Py_ssize_t _PyUnicode_get_wstr_length(PyObject*) is deprecated [-Wdeprecated-declarations]\n",
" 451 | #define PyUnicode_WSTR_LENGTH(op) _PyUnicode_get_wstr_length((PyObject*)op)\n",
" | ~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~\n",
" /usr/include/python3.10/cpython/unicodeobject.h:264:8: note: in expansion of macro PyUnicode_WSTR_LENGTH\n",
" 264 | PyUnicode_WSTR_LENGTH(op)))\n",
" | ^~~~~~~~~~~~~~~~~~~~~\n",
" python/kenlm.cpp:5412:26: note: in expansion of macro PyUnicode_GET_SIZE\n",
" 5412 | (PyUnicode_GET_SIZE(**argname) != PyUnicode_GET_SIZE(key)) ? 1 :\n",
" | ^~~~~~~~~~~~~~~~~~\n",
" /usr/include/python3.10/cpython/unicodeobject.h:446:26: note: declared here\n",
" 446 | static inline Py_ssize_t _PyUnicode_get_wstr_length(PyObject *op) {\n",
" | ^~~~~~~~~~~~~~~~~~~~~~~~~~\n",
" /usr/include/python3.10/cpython/unicodeobject.h:451:61: warning: Py_ssize_t _PyUnicode_get_wstr_length(PyObject*) is deprecated [-Wdeprecated-declarations]\n",
" 451 | #define PyUnicode_WSTR_LENGTH(op) _PyUnicode_get_wstr_length((PyObject*)op)\n",
" | ~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~\n",
" /usr/include/python3.10/cpython/unicodeobject.h:261:7: note: in expansion of macro PyUnicode_WSTR_LENGTH\n",
" 261 | PyUnicode_WSTR_LENGTH(op) : \\\n",
" | ^~~~~~~~~~~~~~~~~~~~~\n",
" python/kenlm.cpp:5412:59: note: in expansion of macro PyUnicode_GET_SIZE\n",
" 5412 | (PyUnicode_GET_SIZE(**argname) != PyUnicode_GET_SIZE(key)) ? 1 :\n",
" | ^~~~~~~~~~~~~~~~~~\n",
" /usr/include/python3.10/cpython/unicodeobject.h:446:26: note: declared here\n",
" 446 | static inline Py_ssize_t _PyUnicode_get_wstr_length(PyObject *op) {\n",
" | ^~~~~~~~~~~~~~~~~~~~~~~~~~\n",
" /usr/include/python3.10/cpython/unicodeobject.h:262:33: warning: Py_UNICODE* PyUnicode_AsUnicode(PyObject*) is deprecated [-Wdeprecated-declarations]\n",
" 262 | ((void)PyUnicode_AsUnicode(_PyObject_CAST(op)),\\\n",
" | ~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~\n",
" python/kenlm.cpp:5412:59: note: in expansion of macro PyUnicode_GET_SIZE\n",
" 5412 | (PyUnicode_GET_SIZE(**argname) != PyUnicode_GET_SIZE(key)) ? 1 :\n",
" | ^~~~~~~~~~~~~~~~~~\n",
" /usr/include/python3.10/cpython/unicodeobject.h:580:45: note: declared here\n",
" 580 | Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(\n",
" | ^~~~~~~~~~~~~~~~~~~\n",
" /usr/include/python3.10/cpython/unicodeobject.h:451:61: warning: Py_ssize_t _PyUnicode_get_wstr_length(PyObject*) is deprecated [-Wdeprecated-declarations]\n",
" 451 | #define PyUnicode_WSTR_LENGTH(op) _PyUnicode_get_wstr_length((PyObject*)op)\n",
" | ~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~\n",
" /usr/include/python3.10/cpython/unicodeobject.h:264:8: note: in expansion of macro PyUnicode_WSTR_LENGTH\n",
" 264 | PyUnicode_WSTR_LENGTH(op)))\n",
" | ^~~~~~~~~~~~~~~~~~~~~\n",
" python/kenlm.cpp:5412:59: note: in expansion of macro PyUnicode_GET_SIZE\n",
" 5412 | (PyUnicode_GET_SIZE(**argname) != PyUnicode_GET_SIZE(key)) ? 1 :\n",
" | ^~~~~~~~~~~~~~~~~~\n",
" /usr/include/python3.10/cpython/unicodeobject.h:446:26: note: declared here\n",
" 446 | static inline Py_ssize_t _PyUnicode_get_wstr_length(PyObject *op) {\n",
" | ^~~~~~~~~~~~~~~~~~~~~~~~~~\n",
" python/kenlm.cpp: In function void __Pyx_ExceptionSave(PyObject**, PyObject**, PyObject**):\n",
" python/kenlm.cpp:5583:21: error: PyThreadState {aka struct _ts} has no member named exc_type; did you mean curexc_type?\n",
" 5583 | *type = tstate->exc_type;\n",
" | ^~~~~~~~\n",
" | curexc_type\n",
" python/kenlm.cpp:5584:22: error: PyThreadState {aka struct _ts} has no member named exc_value; did you mean curexc_value?\n",
" 5584 | *value = tstate->exc_value;\n",
" | ^~~~~~~~~\n",
" | curexc_value\n",
" python/kenlm.cpp:5585:19: error: PyThreadState {aka struct _ts} has no member named exc_traceback; did you mean curexc_traceback?\n",
" 5585 | *tb = tstate->exc_traceback;\n",
" | ^~~~~~~~~~~~~\n",
" | curexc_traceback\n",
" python/kenlm.cpp: In function void __Pyx_ExceptionReset(PyObject*, PyObject*, PyObject*):\n",
" python/kenlm.cpp:5597:24: error: PyThreadState {aka struct _ts} has no member named exc_type; did you mean curexc_type?\n",
" 5597 | tmp_type = tstate->exc_type;\n",
" | ^~~~~~~~\n",
" | curexc_type\n",
" python/kenlm.cpp:5598:25: error: PyThreadState {aka struct _ts} has no member named exc_value; did you mean curexc_value?\n",
" 5598 | tmp_value = tstate->exc_value;\n",
" | ^~~~~~~~~\n",
" | curexc_value\n",
" python/kenlm.cpp:5599:22: error: PyThreadState {aka struct _ts} has no member named exc_traceback; did you mean curexc_traceback?\n",
" 5599 | tmp_tb = tstate->exc_traceback;\n",
" | ^~~~~~~~~~~~~\n",
" | curexc_traceback\n",
" python/kenlm.cpp:5600:13: error: PyThreadState {aka struct _ts} has no member named exc_type; did you mean curexc_type?\n",
" 5600 | tstate->exc_type = type;\n",
" | ^~~~~~~~\n",
" | curexc_type\n",
" python/kenlm.cpp:5601:13: error: PyThreadState {aka struct _ts} has no member named exc_value; did you mean curexc_value?\n",
" 5601 | tstate->exc_value = value;\n",
" | ^~~~~~~~~\n",
" | curexc_value\n",
" python/kenlm.cpp:5602:13: error: PyThreadState {aka struct _ts} has no member named exc_traceback; did you mean curexc_traceback?\n",
" 5602 | tstate->exc_traceback = tb;\n",
" | ^~~~~~~~~~~~~\n",
" | curexc_traceback\n",
" python/kenlm.cpp: In function int __Pyx_GetException(PyObject**, PyObject**, PyObject**):\n",
" python/kenlm.cpp:5645:24: error: PyThreadState {aka struct _ts} has no member named exc_type; did you mean curexc_type?\n",
" 5645 | tmp_type = tstate->exc_type;\n",
" | ^~~~~~~~\n",
" | curexc_type\n",
" python/kenlm.cpp:5646:25: error: PyThreadState {aka struct _ts} has no member named exc_value; did you mean curexc_value?\n",
" 5646 | tmp_value = tstate->exc_value;\n",
" | ^~~~~~~~~\n",
" | curexc_value\n",
" python/kenlm.cpp:5647:22: error: PyThreadState {aka struct _ts} has no member named exc_traceback; did you mean curexc_traceback?\n",
" 5647 | tmp_tb = tstate->exc_traceback;\n",
" | ^~~~~~~~~~~~~\n",
" | curexc_traceback\n",
" python/kenlm.cpp:5648:13: error: PyThreadState {aka struct _ts} has no member named exc_type; did you mean curexc_type?\n",
" 5648 | tstate->exc_type = local_type;\n",
" | ^~~~~~~~\n",
" | curexc_type\n",
" python/kenlm.cpp:5649:13: error: PyThreadState {aka struct _ts} has no member named exc_value; did you mean curexc_value?\n",
" 5649 | tstate->exc_value = local_value;\n",
" | ^~~~~~~~~\n",
" | curexc_value\n",
" python/kenlm.cpp:5650:13: error: PyThreadState {aka struct _ts} has no member named exc_traceback; did you mean curexc_traceback?\n",
" 5650 | tstate->exc_traceback = local_tb;\n",
" | ^~~~~~~~~~~~~\n",
" | curexc_traceback\n",
" python/kenlm.cpp: In function void __Pyx_ExceptionSwap(PyObject**, PyObject**, PyObject**):\n",
" python/kenlm.cpp:6376:24: error: PyThreadState {aka struct _ts} has no member named exc_type; did you mean curexc_type?\n",
" 6376 | tmp_type = tstate->exc_type;\n",
" | ^~~~~~~~\n",
" | curexc_type\n",
" python/kenlm.cpp:6377:25: error: PyThreadState {aka struct _ts} has no member named exc_value; did you mean curexc_value?\n",
" 6377 | tmp_value = tstate->exc_value;\n",
" | ^~~~~~~~~\n",
" | curexc_value\n",
" python/kenlm.cpp:6378:22: error: PyThreadState {aka struct _ts} has no member named exc_traceback; did you mean curexc_traceback?\n",
" 6378 | tmp_tb = tstate->exc_traceback;\n",
" | ^~~~~~~~~~~~~\n",
" | curexc_traceback\n",
" python/kenlm.cpp:6379:13: error: PyThreadState {aka struct _ts} has no member named exc_type; did you mean curexc_type?\n",
" 6379 | tstate->exc_type = *type;\n",
" | ^~~~~~~~\n",
" | curexc_type\n",
" python/kenlm.cpp:6380:13: error: PyThreadState {aka struct _ts} has no member named exc_value; did you mean curexc_value?\n",
" 6380 | tstate->exc_value = *value;\n",
" | ^~~~~~~~~\n",
" | curexc_value\n",
" python/kenlm.cpp:6381:13: error: PyThreadState {aka struct _ts} has no member named exc_traceback; did you mean curexc_traceback?\n",
" 6381 | tstate->exc_traceback = *tb;\n",
" | ^~~~~~~~~~~~~\n",
" | curexc_traceback\n",
" error: command '/usr/bin/gcc' failed with exit code 1\n",
" ----------------------------------------\u001b[0m\n",
"\u001b[31m ERROR: Failed building wheel for kenlm\u001b[0m\n",
"\u001b[?25h Running setup.py clean for kenlm\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Failed to build kenlm\n",
"Installing collected packages: kenlm\n",
" Running setup.py install for kenlm ... \u001b[?25l-"
]
}
],
"source": [
"!pip install kenlm"
]
},
{
 "cell_type": "code",
 "execution_count": null,
 "id": "a849ad70",
 "metadata": {},
 "outputs": [],
 "source": [
  "import kenlm\n",
  "\n",
  "# kenlm.Model needs the path of an existing ARPA/binary model; an empty path cannot be loaded.\n",
  "# NOTE(review): model.arpa is the file added next to this notebook - confirm it is the intended\n",
  "# model and that it has been generated before this cell is run.\n",
  "MODEL_PATH = \"model.arpa\"\n",
  "model = kenlm.Model(MODEL_PATH)"
 ]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

211
testing.ipynb Normal file
View File

@ -0,0 +1,211 @@
{
"cells": [
{
 "cell_type": "code",
 "execution_count": 1,
 "id": "21c9b695",
 "metadata": {},
 "outputs": [],
 "source": [
  "import pandas as pd\n",
  "import csv\n",
  "import regex as re\n",
  "import nltk\n",
  "from collections import Counter, defaultdict\n",
  "import string\n",
  "import unicodedata\n",
  "\n",
  "# Prediction emitted when a row has too little context or no statistics exist.\n",
  "DEFAULT_PREDICTION = \"the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1\"\n",
  "\n",
  "\n",
  "def clean_text(text):\n",
  "    \"\"\"Lower-case `text`, resolve line-break markers and delete punctuation.\n",
  "\n",
  "    A hyphen followed by the two literal characters backslash + n is removed,\n",
  "    any remaining backslash + n becomes a space, and every Unicode\n",
  "    punctuation character is then deleted.\n",
  "    \"\"\"\n",
  "    return re.sub(r\"\\p{P}\", \"\", str(text).lower().replace(\"-\\\\n\", \"\").replace(\"\\\\n\", \" \"))\n",
  "\n",
  "\n",
  "def train_model(data, model):\n",
  "    \"\"\"Fill `model` with bigram statistics from the `final` column of `data`.\n",
  "\n",
  "    After the call, model[w2][w1] is the relative frequency with which token\n",
  "    w1 directly precedes token w2; the values stored under each w2 sum to 1\n",
  "    (up to floating-point rounding).\n",
  "\n",
  "    Args:\n",
  "        data: DataFrame with a text column named `final`.\n",
  "        model: Nested mapping (defaultdict of defaultdicts), updated in place.\n",
  "    \"\"\"\n",
  "    for text in data[\"final\"]:\n",
  "        words = nltk.word_tokenize(clean_text(text))\n",
  "        for w1, w2 in nltk.bigrams(words, pad_left=True, pad_right=True):\n",
  "            # Padding adds None at both ends of the sequence; skip those pairs.\n",
  "            if w1 and w2:\n",
  "                model[w2][w1] += 1\n",
  "    # Normalise each row by its own total. Only existing keys are written, so\n",
  "    # no dictionary changes size while it is being iterated.\n",
  "    for counts in model.values():\n",
  "        total_count = float(sum(counts.values()))\n",
  "        for w1 in counts:\n",
  "            counts[w1] /= total_count\n",
  "\n",
  "\n",
  "def predict(word, model):\n",
  "    \"\"\"Format the five most probable candidates stored under `word`.\n",
  "\n",
  "    Returns:\n",
  "        Space-separated `token:probability` pairs followed by a final\n",
  "        `:probability` entry for the left-over mass (never below 0.01), or\n",
  "        DEFAULT_PREDICTION when nothing is stored for `word`.\n",
  "    \"\"\"\n",
  "    # Use .get() rather than model[word]: indexing a defaultdict would insert\n",
  "    # an empty entry for every unseen word and grow the model while predicting.\n",
  "    candidates = model.get(word) or {}\n",
  "    most_common = Counter(candidates).most_common(5)\n",
  "\n",
  "    total_prob = 0.0\n",
  "    str_prediction = \"\"\n",
  "    for candidate, prob in most_common:\n",
  "        total_prob += prob\n",
  "        str_prediction += f\"{candidate}:{prob} \"\n",
  "\n",
  "    if not total_prob:\n",
  "        return DEFAULT_PREDICTION\n",
  "\n",
  "    # Always reserve at least 0.01 for the words that are not listed.\n",
  "    remaining = 1 - total_prob\n",
  "    if remaining >= 0.01:\n",
  "        str_prediction += f\":{remaining}\"\n",
  "    else:\n",
  "        str_prediction += \":0.01\"\n",
  "\n",
  "    return str_prediction\n",
  "\n",
  "\n",
  "def predict_data(read_path, save_path, model):\n",
  "    \"\"\"Write one prediction line to `save_path` for every row of `read_path`.\n",
  "\n",
  "    The input is read as headerless, unquoted TSV. The last token of column 7\n",
  "    is passed to `predict`; rows whose column 7 yields fewer than three\n",
  "    tokens receive DEFAULT_PREDICTION.\n",
  "    \"\"\"\n",
  "    data = pd.read_csv(\n",
  "        read_path,\n",
  "        sep=\"\\t\",\n",
  "        # `error_bad_lines` was removed in pandas 2.0; `on_bad_lines` replaces it\n",
  "        # and matches the other read_csv calls in this notebook.\n",
  "        on_bad_lines=\"skip\",\n",
  "        header=None,\n",
  "        quoting=csv.QUOTE_NONE,\n",
  "    )\n",
  "    # Explicit encoding so non-ASCII tokens are written identically on every platform.\n",
  "    with open(save_path, \"w\", encoding=\"utf-8\") as file:\n",
  "        for text in data[7]:\n",
  "            words = nltk.word_tokenize(clean_text(text))\n",
  "            if len(words) < 3:\n",
  "                prediction = DEFAULT_PREDICTION\n",
  "            else:\n",
  "                prediction = predict(words[-1], model)\n",
  "            file.write(prediction + \"\\n\")\n"
 ]
},
{
 "cell_type": "code",
 "execution_count": 2,
 "id": "e39473e2",
 "metadata": {},
 "outputs": [],
 "source": [
  "def _read_header(path):\n",
  "    \"\"\"Return the tab-separated fields of the header file at `path`.\"\"\"\n",
  "    with open(path) as header_file:\n",
  "        return header_file.read().strip().split(\"\\t\")\n",
  "\n",
  "\n",
  "in_cols = _read_header(\"in-header.tsv\")\n",
  "out_cols = _read_header(\"out-header.tsv\")"
 ]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "bde510c9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['FileId', 'Year', 'LeftContext', 'RightContext']"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"in_cols"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "0e8b31dd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Word']"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"out_cols"
]
},
{
 "cell_type": "code",
 "execution_count": null,
 "id": "7662d802",
 "metadata": {},
 "outputs": [],
 "source": [
  "# Both files are read as headerless, unquoted TSV, so columns are labelled 0, 1, 2, ...\n",
  "data = pd.read_csv(\n",
  "    \"train/in.tsv.xz\",\n",
  "    sep=\"\\t\",\n",
  "    on_bad_lines='skip',\n",
  "    header=None,\n",
  "    # names=in_cols,\n",
  "    quoting=csv.QUOTE_NONE,\n",
  ")\n",
  "\n",
  "train_labels = pd.read_csv(\n",
  "    \"train/expected.tsv\",\n",
  "    sep=\"\\t\",\n",
  "    on_bad_lines='skip',\n",
  "    header=None,\n",
  "    # names=out_cols,\n",
  "    quoting=csv.QUOTE_NONE,\n",
  ")\n",
  "\n",
  "train_data = data[[7, 6]]\n",
  "train_data = pd.concat([train_data, train_labels], axis=1)\n",
  "\n",
  "# Join the three text columns with explicit spaces. Plain `+` fuses the last\n",
  "# word of one column with the first word of the next whenever the fields carry\n",
  "# no surrounding whitespace, so the word from column 0 would never be a token\n",
  "# of its own. Extra whitespace is harmless because the text is tokenised later.\n",
  "train_data[\"final\"] = train_data[7] + \" \" + train_data[0] + \" \" + train_data[6]\n"
 ]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c3d2cfec",
"metadata": {},
"outputs": [],
"source": [
"train_data"
]
},
{
 "cell_type": "code",
 "execution_count": null,
 "id": "bd92ba07",
 "metadata": {},
 "outputs": [],
 "source": [
  "# Two-level table whose missing entries start at 0 on first access.\n",
  "model = defaultdict(lambda: defaultdict(int))\n",
  "train_model(train_data, model)\n",
  "\n",
  "# Write the predictions for the dev and test splits.\n",
  "for read_path, save_path in (\n",
  "    (\"dev-0/in.tsv.xz\", \"dev-0/out.tsv\"),\n",
  "    (\"test-A/in.tsv.xz\", \"test-A/out.tsv\"),\n",
  "):\n",
  "    predict_data(read_path, save_path, model)"
 ]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ad23240e",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

38
utils.py Normal file
View File

@ -0,0 +1,38 @@
import nltk
import pandas as pd
import regex as re
from csv import QUOTE_NONE
# Text encoding passed to pandas when the TSV files are read (see get_csv).
ENCODING = "utf-8"
# Runs of brackets, the listed symbols, tab / newline characters and digits.
# clean_text replaces every such run with a single space.
REP = re.compile(r"[{}\[\]\&%^$*#\(\)@\t\n0123456789]+")
# Fragments that clean_text deletes outright: "'s", a hyphen followed by the
# two literal characters backslash + "n" (this alternative is listed twice;
# both spellings match the same text), and any Unicode punctuation character.
# \p{P} is a Unicode property class provided by the `regex` module that is
# imported as `re` at the top of this file.
REM = re.compile(r"'s|[\-]\\n|\-\\n|\p{P}")
def clean_text(text):
    """Normalise a raw text fragment into lower-case, punctuation-free words.

    Steps, in order:
      1. Convert to ``str``, lower-case and strip surrounding whitespace.
      2. Map the typographic apostrophe (U+2019) to the ASCII apostrophe.
      3. Resolve literal backslash + "n" sequences: a hyphen followed by one
         is deleted (re-joining the split word), any other one becomes a space.
      4. Expand the contractions "won't", "'ll" and "'m".
      5. Delete everything matched by ``REM`` ("'s" and all punctuation) and
         replace every run matched by ``REP`` with a single space.

    Args:
        text: Value to clean; it is passed through ``str()`` first, so
            non-string values are accepted.

    Returns:
        The cleaned string.
    """
    res = str(text).lower().strip()
    # The search string must not be empty: str.replace("", "'") inserts an
    # apostrophe between every pair of characters.
    # NOTE(review): U+2019 is assumed to be the character originally meant
    # here - confirm. It is written as an escape so it survives re-encoding.
    res = res.replace("\u2019", "'")
    # Handle the literal backslash-n markers before punctuation is stripped;
    # otherwise REM removes only the backslash and leaves a stray "n" that
    # glues the neighbouring words together.
    res = res.replace("-\\n", "").replace("\\n", " ")
    # Contractions have to be expanded while the apostrophes still exist,
    # because REM deletes every punctuation character.
    res = res.replace("won't", "will not")
    res = res.replace("'ll", " will")
    res = res.replace("'m", " am")
    # "'s" is not expanded: REM removes it, which is what already happened
    # before (the old "'s" -> " is" replacement ran after REM and never matched).
    res = REM.sub("", res)
    return REP.sub(" ", res)
def get_csv(fname):
    """Load a headerless, tab-separated file into a DataFrame.

    Args:
        fname: Location of the file; handed unchanged to ``pandas.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``. Because ``header=None`` is used, the
        columns are labelled with integers starting at 0.
    """
    read_options = {
        "sep": "\t",
        # Rows that cannot be parsed are dropped instead of raising.
        "on_bad_lines": 'skip',
        "header": None,
        # Quote characters are kept as ordinary text.
        "quoting": QUOTE_NONE,
        "encoding": ENCODING,
    }
    return pd.read_csv(fname, **read_options)
def check_prerequisites():
    """Make sure the NLTK 'punkt' tokenizer data is present, downloading it if not."""
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        # NLTK signals a resource that is not installed with LookupError.
        tokenizer_present = False
    else:
        tokenizer_present = True

    if not tokenizer_present:
        nltk.download('punkt')