arpa
This commit is contained in:
parent
61e88a9c8c
commit
b78257156a
2
.gitignore
vendored
2
.gitignore
vendored
@ -1,4 +1,4 @@
|
|||||||
|
.ipynb_checkpoints*
|
||||||
*~
|
*~
|
||||||
*.swp
|
*.swp
|
||||||
*.bak
|
*.bak
|
||||||
|
0
model.arpa
Normal file
0
model.arpa
Normal file
604
run.ipynb
Normal file
604
run.ipynb
Normal file
@ -0,0 +1,604 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "f834096a",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"from utils import *"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "5bf0e02b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"data = get_csv(\"train/in.tsv.xz\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "98ebf07f",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"train_labels = get_csv(\"train/expected.tsv\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "42cb7bb1",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"train_data = data[[6,7]]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "e6e0480e",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"train_data = pd.concat([train_data, train_labels], axis=1)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "464dc043",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"train_data[607] = train_data[6] + train_data[0] + train_data[7]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"id": "f5115f59",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"train_data[607] = train_data[607].apply(clean_text)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"id": "25585b08",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"0 came fiom the last place to thisnplace and thi...\n",
|
||||||
|
"1 mb boot political obeednattempt to imagine a p...\n",
|
||||||
|
"2 thera were in only aeventyninenuberlbers lo ...\n",
|
||||||
|
"3 a gixnl man y niterertiiiv diiclosurs regard ...\n",
|
||||||
|
"4 tin ub tv thf bbabbt qabjenmr schiffs tutemen...\n",
|
||||||
|
" ... \n",
|
||||||
|
"432017 sam clendenin bad a fancy for uinscience of me...\n",
|
||||||
|
"432018 witahtt halting the party ware dilven to the s...\n",
|
||||||
|
"432019 it was the last thing that either ofnthem expe...\n",
|
||||||
|
"432020 settlement with the departmentnit is also show...\n",
|
||||||
|
"432021 flour quotationslow extras at r ® ncity mi...\n",
|
||||||
|
"Name: 607, Length: 432022, dtype: object"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"train_data[607]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"id": "325a9592",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"with open(\"tmp\", \"w+\") as f:\n",
|
||||||
|
" for t in train_data[607]:\n",
|
||||||
|
" f.write(t + \"\\n\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"id": "08888fa3",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"=== 1/5 Counting and sorting n-grams ===\n",
|
||||||
|
"Reading /home/me/challenging-america-word-gap-prediction-kenlm/tmp\n",
|
||||||
|
"----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
|
||||||
|
"************************/home/me/kenlm/lm/builder/corpus_count.cc:179 in void lm::builder::{anonymous}::ComplainDisallowed(StringPiece, lm::WarningAction&) threw FormatLoadException.\n",
|
||||||
|
"Special word <s> is not allowed in the corpus. I plan to support models containing <unk> in the future. Pass --skip_symbols to convert these symbols to whitespace.\n",
|
||||||
|
"/bin/bash: linia 1: 3982 Przerwane (zrzut pamięci) ../kenlm/build//bin/lmplz -o 4 < tmp > model.arpa\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"KENLM_BUILD_PATH = \"../kenlm/build/\"\n",
|
||||||
|
"!$KENLM_BUILD_PATH/bin/lmplz -o 4 < tmp > model.arpa"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"id": "42a8d737",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"!rm tmp"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "311c90de",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Defaulting to user installation because normal site-packages is not writeable\n",
|
||||||
|
"Collecting kenlm\n",
|
||||||
|
" Downloading kenlm-0.tar.gz (1.4 MB)\n",
|
||||||
|
" |████████████████████████████████| 1.4 MB 610 kB/s \n",
|
||||||
|
"\u001b[?25hBuilding wheels for collected packages: kenlm\n",
|
||||||
|
" Building wheel for kenlm (setup.py) ... \u001b[?25lerror\n",
|
||||||
|
"\u001b[31m ERROR: Command errored out with exit status 1:\n",
|
||||||
|
" command: /usr/bin/python -u -c 'import sys, setuptools, tokenize; sys.argv[0] = '\"'\"'/tmp/pip-install-tpogj6ox/kenlm_6280e82d7a044d36906510f5646258a0/setup.py'\"'\"'; __file__='\"'\"'/tmp/pip-install-tpogj6ox/kenlm_6280e82d7a044d36906510f5646258a0/setup.py'\"'\"';f=getattr(tokenize, '\"'\"'open'\"'\"', open)(__file__);code=f.read().replace('\"'\"'\\r\\n'\"'\"', '\"'\"'\\n'\"'\"');f.close();exec(compile(code, __file__, '\"'\"'exec'\"'\"'))' bdist_wheel -d /tmp/pip-wheel-s72u5291\n",
|
||||||
|
" cwd: /tmp/pip-install-tpogj6ox/kenlm_6280e82d7a044d36906510f5646258a0/\n",
|
||||||
|
" Complete output (380 lines):\n",
|
||||||
|
" running bdist_wheel\n",
|
||||||
|
" running build\n",
|
||||||
|
" running build_ext\n",
|
||||||
|
" building 'kenlm' extension\n",
|
||||||
|
" creating build/temp.linux-x86_64-3.10\n",
|
||||||
|
" creating build/temp.linux-x86_64-3.10/lm\n",
|
||||||
|
" creating build/temp.linux-x86_64-3.10/python\n",
|
||||||
|
" creating build/temp.linux-x86_64-3.10/util\n",
|
||||||
|
" creating build/temp.linux-x86_64-3.10/util/double-conversion\n",
|
||||||
|
" gcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -ffat-lto-objects -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -fPIC -I. -I/usr/include/python3.10 -c lm/bhiksha.cc -o build/temp.linux-x86_64-3.10/lm/bhiksha.o -O3 -DNDEBUG -DKENLM_MAX_ORDER=6 -std=c++11 -DHAVE_ZLIB -DHAVE_BZLIB -DHAVE_XZLIB\n",
|
||||||
|
" gcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -ffat-lto-objects -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -fPIC -I. -I/usr/include/python3.10 -c lm/binary_format.cc -o build/temp.linux-x86_64-3.10/lm/binary_format.o -O3 -DNDEBUG -DKENLM_MAX_ORDER=6 -std=c++11 -DHAVE_ZLIB -DHAVE_BZLIB -DHAVE_XZLIB\n",
|
||||||
|
" lm/binary_format.cc: In member function ‘void lm::ngram::BinaryFormat::FinishFile(const lm::ngram::Config&, lm::ngram::ModelType, unsigned int, const std::vector<long unsigned int>&)’:\n",
|
||||||
|
" lm/binary_format.cc:261:9: warning: ‘void* memset(void*, int, size_t)’ clearing an object of type ‘struct lm::ngram::Parameters’ with no trivial copy-assignment; use assignment or value-initialization instead [-Wclass-memaccess]\n",
|
||||||
|
" 261 | memset(¶ms, 0, sizeof(Parameters));\n",
|
||||||
|
" | ~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" In file included from lm/binary_format.cc:1:\n",
|
||||||
|
" ./lm/binary_format.hh:42:8: note: ‘struct lm::ngram::Parameters’ declared here\n",
|
||||||
|
" 42 | struct Parameters {\n",
|
||||||
|
" | ^~~~~~~~~~\n",
|
||||||
|
" gcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -ffat-lto-objects -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -fPIC -I. -I/usr/include/python3.10 -c lm/config.cc -o build/temp.linux-x86_64-3.10/lm/config.o -O3 -DNDEBUG -DKENLM_MAX_ORDER=6 -std=c++11 -DHAVE_ZLIB -DHAVE_BZLIB -DHAVE_XZLIB\n",
|
||||||
|
" gcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -ffat-lto-objects -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -fPIC -I. -I/usr/include/python3.10 -c lm/lm_exception.cc -o build/temp.linux-x86_64-3.10/lm/lm_exception.o -O3 -DNDEBUG -DKENLM_MAX_ORDER=6 -std=c++11 -DHAVE_ZLIB -DHAVE_BZLIB -DHAVE_XZLIB\n",
|
||||||
|
" gcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -ffat-lto-objects -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -fPIC -I. -I/usr/include/python3.10 -c lm/model.cc -o build/temp.linux-x86_64-3.10/lm/model.o -O3 -DNDEBUG -DKENLM_MAX_ORDER=6 -std=c++11 -DHAVE_ZLIB -DHAVE_BZLIB -DHAVE_XZLIB\n",
|
||||||
|
" In file included from ./lm/model.hh:13,\n",
|
||||||
|
" from lm/model.cc:1:\n",
|
||||||
|
" ./lm/vocab.hh:210:43: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
|
||||||
|
" 210 | void MissingUnknown(const Config &config) throw(SpecialWordMissingException);\n",
|
||||||
|
" | ^~~~~\n",
|
||||||
|
" ./lm/vocab.hh:211:67: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
|
||||||
|
" 211 | void MissingSentenceMarker(const Config &config, const char *str) throw(SpecialWordMissingException);\n",
|
||||||
|
" | ^~~~~\n",
|
||||||
|
" ./lm/vocab.hh:213:85: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
|
||||||
|
" 213 | template <class Vocab> void CheckSpecials(const Config &config, const Vocab &vocab) throw(SpecialWordMissingException) {\n",
|
||||||
|
" | ^~~~~\n",
|
||||||
|
" gcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -ffat-lto-objects -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -fPIC -I. -I/usr/include/python3.10 -c lm/quantize.cc -o build/temp.linux-x86_64-3.10/lm/quantize.o -O3 -DNDEBUG -DKENLM_MAX_ORDER=6 -std=c++11 -DHAVE_ZLIB -DHAVE_BZLIB -DHAVE_XZLIB\n",
|
||||||
|
" gcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -ffat-lto-objects -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -fPIC -I. -I/usr/include/python3.10 -c lm/read_arpa.cc -o build/temp.linux-x86_64-3.10/lm/read_arpa.o -O3 -DNDEBUG -DKENLM_MAX_ORDER=6 -std=c++11 -DHAVE_ZLIB -DHAVE_BZLIB -DHAVE_XZLIB\n",
|
||||||
|
" gcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -ffat-lto-objects -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -fPIC -I. -I/usr/include/python3.10 -c lm/search_hashed.cc -o build/temp.linux-x86_64-3.10/lm/search_hashed.o -O3 -DNDEBUG -DKENLM_MAX_ORDER=6 -std=c++11 -DHAVE_ZLIB -DHAVE_BZLIB -DHAVE_XZLIB\n",
|
||||||
|
" In file included from ./lm/model.hh:13,\n",
|
||||||
|
" from lm/search_hashed.cc:6:\n",
|
||||||
|
" ./lm/vocab.hh:210:43: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
|
||||||
|
" 210 | void MissingUnknown(const Config &config) throw(SpecialWordMissingException);\n",
|
||||||
|
" | ^~~~~\n",
|
||||||
|
" ./lm/vocab.hh:211:67: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
|
||||||
|
" 211 | void MissingSentenceMarker(const Config &config, const char *str) throw(SpecialWordMissingException);\n",
|
||||||
|
" | ^~~~~\n",
|
||||||
|
" ./lm/vocab.hh:213:85: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
|
||||||
|
" 213 | template <class Vocab> void CheckSpecials(const Config &config, const Vocab &vocab) throw(SpecialWordMissingException) {\n",
|
||||||
|
" | ^~~~~\n",
|
||||||
|
" gcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -ffat-lto-objects -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -fPIC -I. -I/usr/include/python3.10 -c lm/search_trie.cc -o build/temp.linux-x86_64-3.10/lm/search_trie.o -O3 -DNDEBUG -DKENLM_MAX_ORDER=6 -std=c++11 -DHAVE_ZLIB -DHAVE_BZLIB -DHAVE_XZLIB\n",
|
||||||
|
" In file included from lm/search_trie.cc:12:\n",
|
||||||
|
" ./lm/vocab.hh:210:43: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
|
||||||
|
" 210 | void MissingUnknown(const Config &config) throw(SpecialWordMissingException);\n",
|
||||||
|
" | ^~~~~\n",
|
||||||
|
" ./lm/vocab.hh:211:67: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
|
||||||
|
" 211 | void MissingSentenceMarker(const Config &config, const char *str) throw(SpecialWordMissingException);\n",
|
||||||
|
" | ^~~~~\n",
|
||||||
|
" ./lm/vocab.hh:213:85: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
|
||||||
|
" 213 | template <class Vocab> void CheckSpecials(const Config &config, const Vocab &vocab) throw(SpecialWordMissingException) {\n",
|
||||||
|
" | ^~~~~\n",
|
||||||
|
" gcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -ffat-lto-objects -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -fPIC -I. -I/usr/include/python3.10 -c lm/sizes.cc -o build/temp.linux-x86_64-3.10/lm/sizes.o -O3 -DNDEBUG -DKENLM_MAX_ORDER=6 -std=c++11 -DHAVE_ZLIB -DHAVE_BZLIB -DHAVE_XZLIB\n",
|
||||||
|
" In file included from ./lm/model.hh:13,\n",
|
||||||
|
" from lm/sizes.cc:2:\n",
|
||||||
|
" ./lm/vocab.hh:210:43: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
|
||||||
|
" 210 | void MissingUnknown(const Config &config) throw(SpecialWordMissingException);\n",
|
||||||
|
" | ^~~~~\n",
|
||||||
|
" ./lm/vocab.hh:211:67: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
|
||||||
|
" 211 | void MissingSentenceMarker(const Config &config, const char *str) throw(SpecialWordMissingException);\n",
|
||||||
|
" | ^~~~~\n",
|
||||||
|
" ./lm/vocab.hh:213:85: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
|
||||||
|
" 213 | template <class Vocab> void CheckSpecials(const Config &config, const Vocab &vocab) throw(SpecialWordMissingException) {\n",
|
||||||
|
" | ^~~~~\n",
|
||||||
|
" gcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -ffat-lto-objects -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -fPIC -I. -I/usr/include/python3.10 -c lm/trie.cc -o build/temp.linux-x86_64-3.10/lm/trie.o -O3 -DNDEBUG -DKENLM_MAX_ORDER=6 -std=c++11 -DHAVE_ZLIB -DHAVE_BZLIB -DHAVE_XZLIB\n",
|
||||||
|
" gcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -ffat-lto-objects -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -fPIC -I. -I/usr/include/python3.10 -c lm/trie_sort.cc -o build/temp.linux-x86_64-3.10/lm/trie_sort.o -O3 -DNDEBUG -DKENLM_MAX_ORDER=6 -std=c++11 -DHAVE_ZLIB -DHAVE_BZLIB -DHAVE_XZLIB\n",
|
||||||
|
" In file included from lm/trie_sort.cc:6:\n",
|
||||||
|
" ./lm/vocab.hh:210:43: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
|
||||||
|
" 210 | void MissingUnknown(const Config &config) throw(SpecialWordMissingException);\n",
|
||||||
|
" | ^~~~~\n",
|
||||||
|
" ./lm/vocab.hh:211:67: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
|
||||||
|
" 211 | void MissingSentenceMarker(const Config &config, const char *str) throw(SpecialWordMissingException);\n",
|
||||||
|
" | ^~~~~\n",
|
||||||
|
" ./lm/vocab.hh:213:85: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
|
||||||
|
" 213 | template <class Vocab> void CheckSpecials(const Config &config, const Vocab &vocab) throw(SpecialWordMissingException) {\n",
|
||||||
|
" | ^~~~~\n",
|
||||||
|
" gcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -ffat-lto-objects -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -fPIC -I. -I/usr/include/python3.10 -c lm/value_build.cc -o build/temp.linux-x86_64-3.10/lm/value_build.o -O3 -DNDEBUG -DKENLM_MAX_ORDER=6 -std=c++11 -DHAVE_ZLIB -DHAVE_BZLIB -DHAVE_XZLIB\n",
|
||||||
|
" In file included from ./lm/model.hh:13,\n",
|
||||||
|
" from lm/value_build.cc:3:\n",
|
||||||
|
" ./lm/vocab.hh:210:43: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
|
||||||
|
" 210 | void MissingUnknown(const Config &config) throw(SpecialWordMissingException);\n",
|
||||||
|
" | ^~~~~\n",
|
||||||
|
" ./lm/vocab.hh:211:67: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
|
||||||
|
" 211 | void MissingSentenceMarker(const Config &config, const char *str) throw(SpecialWordMissingException);\n",
|
||||||
|
" | ^~~~~\n",
|
||||||
|
" ./lm/vocab.hh:213:85: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
|
||||||
|
" 213 | template <class Vocab> void CheckSpecials(const Config &config, const Vocab &vocab) throw(SpecialWordMissingException) {\n",
|
||||||
|
" | ^~~~~\n",
|
||||||
|
" gcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -ffat-lto-objects -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -fPIC -I. -I/usr/include/python3.10 -c lm/virtual_interface.cc -o build/temp.linux-x86_64-3.10/lm/virtual_interface.o -O3 -DNDEBUG -DKENLM_MAX_ORDER=6 -std=c++11 -DHAVE_ZLIB -DHAVE_BZLIB -DHAVE_XZLIB\n",
|
||||||
|
" gcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -ffat-lto-objects -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -fPIC -I. -I/usr/include/python3.10 -c lm/vocab.cc -o build/temp.linux-x86_64-3.10/lm/vocab.o -O3 -DNDEBUG -DKENLM_MAX_ORDER=6 -std=c++11 -DHAVE_ZLIB -DHAVE_BZLIB -DHAVE_XZLIB\n",
|
||||||
|
" In file included from lm/vocab.cc:1:\n",
|
||||||
|
" ./lm/vocab.hh:210:43: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
|
||||||
|
" 210 | void MissingUnknown(const Config &config) throw(SpecialWordMissingException);\n",
|
||||||
|
" | ^~~~~\n",
|
||||||
|
" ./lm/vocab.hh:211:67: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
|
||||||
|
" 211 | void MissingSentenceMarker(const Config &config, const char *str) throw(SpecialWordMissingException);\n",
|
||||||
|
" | ^~~~~\n",
|
||||||
|
" ./lm/vocab.hh:213:85: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
|
||||||
|
" 213 | template <class Vocab> void CheckSpecials(const Config &config, const Vocab &vocab) throw(SpecialWordMissingException) {\n",
|
||||||
|
" | ^~~~~\n",
|
||||||
|
" lm/vocab.cc:285:43: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
|
||||||
|
" 285 | void MissingUnknown(const Config &config) throw(SpecialWordMissingException) {\n",
|
||||||
|
" | ^~~~~\n",
|
||||||
|
" lm/vocab.cc:297:67: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
|
||||||
|
" 297 | void MissingSentenceMarker(const Config &config, const char *str) throw(SpecialWordMissingException) {\n",
|
||||||
|
" | ^~~~~\n",
|
||||||
|
" gcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -ffat-lto-objects -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -march=x86-64 -mtune=generic -O3 -pipe -fno-plt -fexceptions -Wp,-D_FORTIFY_SOURCE=2 -Wformat -Werror=format-security -fstack-clash-protection -fcf-protection -flto=auto -fPIC -I. -I/usr/include/python3.10 -c python/kenlm.cpp -o build/temp.linux-x86_64-3.10/python/kenlm.o -O3 -DNDEBUG -DKENLM_MAX_ORDER=6 -std=c++11 -DHAVE_ZLIB -DHAVE_BZLIB -DHAVE_XZLIB\n",
|
||||||
|
" In file included from ./lm/model.hh:13,\n",
|
||||||
|
" from python/kenlm.cpp:253:\n",
|
||||||
|
" ./lm/vocab.hh:210:43: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
|
||||||
|
" 210 | void MissingUnknown(const Config &config) throw(SpecialWordMissingException);\n",
|
||||||
|
" | ^~~~~\n",
|
||||||
|
" ./lm/vocab.hh:211:67: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
|
||||||
|
" 211 | void MissingSentenceMarker(const Config &config, const char *str) throw(SpecialWordMissingException);\n",
|
||||||
|
" | ^~~~~\n",
|
||||||
|
" ./lm/vocab.hh:213:85: warning: dynamic exception specifications are deprecated in C++11 [-Wdeprecated]\n",
|
||||||
|
" 213 | template <class Vocab> void CheckSpecials(const Config &config, const Vocab &vocab) throw(SpecialWordMissingException) {\n",
|
||||||
|
" | ^~~~~\n",
|
||||||
|
" In file included from /usr/include/python3.10/Python.h:74,\n",
|
||||||
|
" from python/kenlm.cpp:16:\n",
|
||||||
|
" python/kenlm.cpp: In function ‘void __pyx_tp_dealloc_5kenlm_Model(PyObject*)’:\n",
|
||||||
|
" /usr/include/python3.10/object.h:133:33: error: lvalue required as increment operand\n",
|
||||||
|
" 133 | #define Py_REFCNT(ob) _Py_REFCNT(_PyObject_CAST_CONST(ob))\n",
|
||||||
|
" | ~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" python/kenlm.cpp:4398:7: note: in expansion of macro ‘Py_REFCNT’\n",
|
||||||
|
" 4398 | ++Py_REFCNT(o);\n",
|
||||||
|
" | ^~~~~~~~~\n",
|
||||||
|
" /usr/include/python3.10/object.h:133:33: error: lvalue required as decrement operand\n",
|
||||||
|
" 133 | #define Py_REFCNT(ob) _Py_REFCNT(_PyObject_CAST_CONST(ob))\n",
|
||||||
|
" | ~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" python/kenlm.cpp:4400:7: note: in expansion of macro ‘Py_REFCNT’\n",
|
||||||
|
" 4400 | --Py_REFCNT(o);\n",
|
||||||
|
" | ^~~~~~~~~\n",
|
||||||
|
" python/kenlm.cpp: In function ‘PyObject* PyInit_kenlm()’:\n",
|
||||||
|
" python/kenlm.cpp:4876:37: error: ‘PyTypeObject’ {aka ‘struct _typeobject’} has no member named ‘tp_print’\n",
|
||||||
|
" 4876 | __pyx_type_5kenlm_FullScoreReturn.tp_print = 0;\n",
|
||||||
|
" | ^~~~~~~~\n",
|
||||||
|
" python/kenlm.cpp:4880:27: error: ‘PyTypeObject’ {aka ‘struct _typeobject’} has no member named ‘tp_print’\n",
|
||||||
|
" 4880 | __pyx_type_5kenlm_State.tp_print = 0;\n",
|
||||||
|
" | ^~~~~~~~\n",
|
||||||
|
" python/kenlm.cpp:4884:28: error: ‘PyTypeObject’ {aka ‘struct _typeobject’} has no member named ‘tp_print’\n",
|
||||||
|
" 4884 | __pyx_type_5kenlm_Config.tp_print = 0;\n",
|
||||||
|
" | ^~~~~~~~\n",
|
||||||
|
" python/kenlm.cpp:4888:27: error: ‘PyTypeObject’ {aka ‘struct _typeobject’} has no member named ‘tp_print’\n",
|
||||||
|
" 4888 | __pyx_type_5kenlm_Model.tp_print = 0;\n",
|
||||||
|
" | ^~~~~~~~\n",
|
||||||
|
" python/kenlm.cpp:4902:53: error: ‘PyTypeObject’ {aka ‘struct _typeobject’} has no member named ‘tp_print’\n",
|
||||||
|
" 4902 | __pyx_type_5kenlm___pyx_scope_struct__full_scores.tp_print = 0;\n",
|
||||||
|
" | ^~~~~~~~\n",
|
||||||
|
" In file included from /usr/include/python3.10/unicodeobject.h:1046,\n",
|
||||||
|
" from /usr/include/python3.10/Python.h:83,\n",
|
||||||
|
" from python/kenlm.cpp:16:\n",
|
||||||
|
" python/kenlm.cpp: In function ‘int __Pyx_ParseOptionalKeywords(PyObject*, PyObject***, PyObject*, PyObject**, Py_ssize_t, const char*)’:\n",
|
||||||
|
" /usr/include/python3.10/cpython/unicodeobject.h:451:61: warning: ‘Py_ssize_t _PyUnicode_get_wstr_length(PyObject*)’ is deprecated [-Wdeprecated-declarations]\n",
|
||||||
|
" 451 | #define PyUnicode_WSTR_LENGTH(op) _PyUnicode_get_wstr_length((PyObject*)op)\n",
|
||||||
|
" | ~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~\n",
|
||||||
|
" /usr/include/python3.10/cpython/unicodeobject.h:261:7: note: in expansion of macro ‘PyUnicode_WSTR_LENGTH’\n",
|
||||||
|
" 261 | PyUnicode_WSTR_LENGTH(op) : \\\n",
|
||||||
|
" | ^~~~~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" python/kenlm.cpp:5396:22: note: in expansion of macro ‘PyUnicode_GET_SIZE’\n",
|
||||||
|
" 5396 | (PyUnicode_GET_SIZE(**name) != PyUnicode_GET_SIZE(key)) ? 1 :\n",
|
||||||
|
" | ^~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" /usr/include/python3.10/cpython/unicodeobject.h:446:26: note: declared here\n",
|
||||||
|
" 446 | static inline Py_ssize_t _PyUnicode_get_wstr_length(PyObject *op) {\n",
|
||||||
|
" | ^~~~~~~~~~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" /usr/include/python3.10/cpython/unicodeobject.h:262:33: warning: ‘Py_UNICODE* PyUnicode_AsUnicode(PyObject*)’ is deprecated [-Wdeprecated-declarations]\n",
|
||||||
|
" 262 | ((void)PyUnicode_AsUnicode(_PyObject_CAST(op)),\\\n",
|
||||||
|
" | ~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" python/kenlm.cpp:5396:22: note: in expansion of macro ‘PyUnicode_GET_SIZE’\n",
|
||||||
|
" 5396 | (PyUnicode_GET_SIZE(**name) != PyUnicode_GET_SIZE(key)) ? 1 :\n",
|
||||||
|
" | ^~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" /usr/include/python3.10/cpython/unicodeobject.h:580:45: note: declared here\n",
|
||||||
|
" 580 | Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(\n",
|
||||||
|
" | ^~~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" /usr/include/python3.10/cpython/unicodeobject.h:451:61: warning: ‘Py_ssize_t _PyUnicode_get_wstr_length(PyObject*)’ is deprecated [-Wdeprecated-declarations]\n",
|
||||||
|
" 451 | #define PyUnicode_WSTR_LENGTH(op) _PyUnicode_get_wstr_length((PyObject*)op)\n",
|
||||||
|
" | ~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~\n",
|
||||||
|
" /usr/include/python3.10/cpython/unicodeobject.h:264:8: note: in expansion of macro ‘PyUnicode_WSTR_LENGTH’\n",
|
||||||
|
" 264 | PyUnicode_WSTR_LENGTH(op)))\n",
|
||||||
|
" | ^~~~~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" python/kenlm.cpp:5396:22: note: in expansion of macro ‘PyUnicode_GET_SIZE’\n",
|
||||||
|
" 5396 | (PyUnicode_GET_SIZE(**name) != PyUnicode_GET_SIZE(key)) ? 1 :\n",
|
||||||
|
" | ^~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" /usr/include/python3.10/cpython/unicodeobject.h:446:26: note: declared here\n",
|
||||||
|
" 446 | static inline Py_ssize_t _PyUnicode_get_wstr_length(PyObject *op) {\n",
|
||||||
|
" | ^~~~~~~~~~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" /usr/include/python3.10/cpython/unicodeobject.h:451:61: warning: ‘Py_ssize_t _PyUnicode_get_wstr_length(PyObject*)’ is deprecated [-Wdeprecated-declarations]\n",
|
||||||
|
" 451 | #define PyUnicode_WSTR_LENGTH(op) _PyUnicode_get_wstr_length((PyObject*)op)\n",
|
||||||
|
" | ~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~\n",
|
||||||
|
" /usr/include/python3.10/cpython/unicodeobject.h:261:7: note: in expansion of macro ‘PyUnicode_WSTR_LENGTH’\n",
|
||||||
|
" 261 | PyUnicode_WSTR_LENGTH(op) : \\\n",
|
||||||
|
" | ^~~~~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" python/kenlm.cpp:5396:52: note: in expansion of macro ‘PyUnicode_GET_SIZE’\n",
|
||||||
|
" 5396 | (PyUnicode_GET_SIZE(**name) != PyUnicode_GET_SIZE(key)) ? 1 :\n",
|
||||||
|
" | ^~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" /usr/include/python3.10/cpython/unicodeobject.h:446:26: note: declared here\n",
|
||||||
|
" 446 | static inline Py_ssize_t _PyUnicode_get_wstr_length(PyObject *op) {\n",
|
||||||
|
" | ^~~~~~~~~~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" /usr/include/python3.10/cpython/unicodeobject.h:262:33: warning: ‘Py_UNICODE* PyUnicode_AsUnicode(PyObject*)’ is deprecated [-Wdeprecated-declarations]\n",
|
||||||
|
" 262 | ((void)PyUnicode_AsUnicode(_PyObject_CAST(op)),\\\n",
|
||||||
|
" | ~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" python/kenlm.cpp:5396:52: note: in expansion of macro ‘PyUnicode_GET_SIZE’\n",
|
||||||
|
" 5396 | (PyUnicode_GET_SIZE(**name) != PyUnicode_GET_SIZE(key)) ? 1 :\n",
|
||||||
|
" | ^~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" /usr/include/python3.10/cpython/unicodeobject.h:580:45: note: declared here\n",
|
||||||
|
" 580 | Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(\n",
|
||||||
|
" | ^~~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" /usr/include/python3.10/cpython/unicodeobject.h:451:61: warning: ‘Py_ssize_t _PyUnicode_get_wstr_length(PyObject*)’ is deprecated [-Wdeprecated-declarations]\n",
|
||||||
|
" 451 | #define PyUnicode_WSTR_LENGTH(op) _PyUnicode_get_wstr_length((PyObject*)op)\n",
|
||||||
|
" | ~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~\n",
|
||||||
|
" /usr/include/python3.10/cpython/unicodeobject.h:264:8: note: in expansion of macro ‘PyUnicode_WSTR_LENGTH’\n",
|
||||||
|
" 264 | PyUnicode_WSTR_LENGTH(op)))\n",
|
||||||
|
" | ^~~~~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" python/kenlm.cpp:5396:52: note: in expansion of macro ‘PyUnicode_GET_SIZE’\n",
|
||||||
|
" 5396 | (PyUnicode_GET_SIZE(**name) != PyUnicode_GET_SIZE(key)) ? 1 :\n",
|
||||||
|
" | ^~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" /usr/include/python3.10/cpython/unicodeobject.h:446:26: note: declared here\n",
|
||||||
|
" 446 | static inline Py_ssize_t _PyUnicode_get_wstr_length(PyObject *op) {\n",
|
||||||
|
" | ^~~~~~~~~~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" /usr/include/python3.10/cpython/unicodeobject.h:451:61: warning: ‘Py_ssize_t _PyUnicode_get_wstr_length(PyObject*)’ is deprecated [-Wdeprecated-declarations]\n",
|
||||||
|
" 451 | #define PyUnicode_WSTR_LENGTH(op) _PyUnicode_get_wstr_length((PyObject*)op)\n",
|
||||||
|
" | ~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~\n",
|
||||||
|
" /usr/include/python3.10/cpython/unicodeobject.h:261:7: note: in expansion of macro ‘PyUnicode_WSTR_LENGTH’\n",
|
||||||
|
" 261 | PyUnicode_WSTR_LENGTH(op) : \\\n",
|
||||||
|
" | ^~~~~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" python/kenlm.cpp:5412:26: note: in expansion of macro ‘PyUnicode_GET_SIZE’\n",
|
||||||
|
" 5412 | (PyUnicode_GET_SIZE(**argname) != PyUnicode_GET_SIZE(key)) ? 1 :\n",
|
||||||
|
" | ^~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" /usr/include/python3.10/cpython/unicodeobject.h:446:26: note: declared here\n",
|
||||||
|
" 446 | static inline Py_ssize_t _PyUnicode_get_wstr_length(PyObject *op) {\n",
|
||||||
|
" | ^~~~~~~~~~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" /usr/include/python3.10/cpython/unicodeobject.h:262:33: warning: ‘Py_UNICODE* PyUnicode_AsUnicode(PyObject*)’ is deprecated [-Wdeprecated-declarations]\n",
|
||||||
|
" 262 | ((void)PyUnicode_AsUnicode(_PyObject_CAST(op)),\\\n",
|
||||||
|
" | ~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" python/kenlm.cpp:5412:26: note: in expansion of macro ‘PyUnicode_GET_SIZE’\n",
|
||||||
|
" 5412 | (PyUnicode_GET_SIZE(**argname) != PyUnicode_GET_SIZE(key)) ? 1 :\n",
|
||||||
|
" | ^~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" /usr/include/python3.10/cpython/unicodeobject.h:580:45: note: declared here\n",
|
||||||
|
" 580 | Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(\n",
|
||||||
|
" | ^~~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" /usr/include/python3.10/cpython/unicodeobject.h:451:61: warning: ‘Py_ssize_t _PyUnicode_get_wstr_length(PyObject*)’ is deprecated [-Wdeprecated-declarations]\n",
|
||||||
|
" 451 | #define PyUnicode_WSTR_LENGTH(op) _PyUnicode_get_wstr_length((PyObject*)op)\n",
|
||||||
|
" | ~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~\n",
|
||||||
|
" /usr/include/python3.10/cpython/unicodeobject.h:264:8: note: in expansion of macro ‘PyUnicode_WSTR_LENGTH’\n",
|
||||||
|
" 264 | PyUnicode_WSTR_LENGTH(op)))\n",
|
||||||
|
" | ^~~~~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" python/kenlm.cpp:5412:26: note: in expansion of macro ‘PyUnicode_GET_SIZE’\n",
|
||||||
|
" 5412 | (PyUnicode_GET_SIZE(**argname) != PyUnicode_GET_SIZE(key)) ? 1 :\n",
|
||||||
|
" | ^~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" /usr/include/python3.10/cpython/unicodeobject.h:446:26: note: declared here\n",
|
||||||
|
" 446 | static inline Py_ssize_t _PyUnicode_get_wstr_length(PyObject *op) {\n",
|
||||||
|
" | ^~~~~~~~~~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" /usr/include/python3.10/cpython/unicodeobject.h:451:61: warning: ‘Py_ssize_t _PyUnicode_get_wstr_length(PyObject*)’ is deprecated [-Wdeprecated-declarations]\n",
|
||||||
|
" 451 | #define PyUnicode_WSTR_LENGTH(op) _PyUnicode_get_wstr_length((PyObject*)op)\n",
|
||||||
|
" | ~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~\n",
|
||||||
|
" /usr/include/python3.10/cpython/unicodeobject.h:261:7: note: in expansion of macro ‘PyUnicode_WSTR_LENGTH’\n",
|
||||||
|
" 261 | PyUnicode_WSTR_LENGTH(op) : \\\n",
|
||||||
|
" | ^~~~~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" python/kenlm.cpp:5412:59: note: in expansion of macro ‘PyUnicode_GET_SIZE’\n",
|
||||||
|
" 5412 | (PyUnicode_GET_SIZE(**argname) != PyUnicode_GET_SIZE(key)) ? 1 :\n",
|
||||||
|
" | ^~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" /usr/include/python3.10/cpython/unicodeobject.h:446:26: note: declared here\n",
|
||||||
|
" 446 | static inline Py_ssize_t _PyUnicode_get_wstr_length(PyObject *op) {\n",
|
||||||
|
" | ^~~~~~~~~~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" /usr/include/python3.10/cpython/unicodeobject.h:262:33: warning: ‘Py_UNICODE* PyUnicode_AsUnicode(PyObject*)’ is deprecated [-Wdeprecated-declarations]\n",
|
||||||
|
" 262 | ((void)PyUnicode_AsUnicode(_PyObject_CAST(op)),\\\n",
|
||||||
|
" | ~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" python/kenlm.cpp:5412:59: note: in expansion of macro ‘PyUnicode_GET_SIZE’\n",
|
||||||
|
" 5412 | (PyUnicode_GET_SIZE(**argname) != PyUnicode_GET_SIZE(key)) ? 1 :\n",
|
||||||
|
" | ^~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" /usr/include/python3.10/cpython/unicodeobject.h:580:45: note: declared here\n",
|
||||||
|
" 580 | Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(\n",
|
||||||
|
" | ^~~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" /usr/include/python3.10/cpython/unicodeobject.h:451:61: warning: ‘Py_ssize_t _PyUnicode_get_wstr_length(PyObject*)’ is deprecated [-Wdeprecated-declarations]\n",
|
||||||
|
" 451 | #define PyUnicode_WSTR_LENGTH(op) _PyUnicode_get_wstr_length((PyObject*)op)\n",
|
||||||
|
" | ~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~\n",
|
||||||
|
" /usr/include/python3.10/cpython/unicodeobject.h:264:8: note: in expansion of macro ‘PyUnicode_WSTR_LENGTH’\n",
|
||||||
|
" 264 | PyUnicode_WSTR_LENGTH(op)))\n",
|
||||||
|
" | ^~~~~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" python/kenlm.cpp:5412:59: note: in expansion of macro ‘PyUnicode_GET_SIZE’\n",
|
||||||
|
" 5412 | (PyUnicode_GET_SIZE(**argname) != PyUnicode_GET_SIZE(key)) ? 1 :\n",
|
||||||
|
" | ^~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" /usr/include/python3.10/cpython/unicodeobject.h:446:26: note: declared here\n",
|
||||||
|
" 446 | static inline Py_ssize_t _PyUnicode_get_wstr_length(PyObject *op) {\n",
|
||||||
|
" | ^~~~~~~~~~~~~~~~~~~~~~~~~~\n",
|
||||||
|
" python/kenlm.cpp: In function ‘void __Pyx_ExceptionSave(PyObject**, PyObject**, PyObject**)’:\n",
|
||||||
|
" python/kenlm.cpp:5583:21: error: ‘PyThreadState’ {aka ‘struct _ts’} has no member named ‘exc_type’; did you mean ‘curexc_type’?\n",
|
||||||
|
" 5583 | *type = tstate->exc_type;\n",
|
||||||
|
" | ^~~~~~~~\n",
|
||||||
|
" | curexc_type\n",
|
||||||
|
" python/kenlm.cpp:5584:22: error: ‘PyThreadState’ {aka ‘struct _ts’} has no member named ‘exc_value’; did you mean ‘curexc_value’?\n",
|
||||||
|
" 5584 | *value = tstate->exc_value;\n",
|
||||||
|
" | ^~~~~~~~~\n",
|
||||||
|
" | curexc_value\n",
|
||||||
|
" python/kenlm.cpp:5585:19: error: ‘PyThreadState’ {aka ‘struct _ts’} has no member named ‘exc_traceback’; did you mean ‘curexc_traceback’?\n",
|
||||||
|
" 5585 | *tb = tstate->exc_traceback;\n",
|
||||||
|
" | ^~~~~~~~~~~~~\n",
|
||||||
|
" | curexc_traceback\n",
|
||||||
|
" python/kenlm.cpp: In function ‘void __Pyx_ExceptionReset(PyObject*, PyObject*, PyObject*)’:\n",
|
||||||
|
" python/kenlm.cpp:5597:24: error: ‘PyThreadState’ {aka ‘struct _ts’} has no member named ‘exc_type’; did you mean ‘curexc_type’?\n",
|
||||||
|
" 5597 | tmp_type = tstate->exc_type;\n",
|
||||||
|
" | ^~~~~~~~\n",
|
||||||
|
" | curexc_type\n",
|
||||||
|
" python/kenlm.cpp:5598:25: error: ‘PyThreadState’ {aka ‘struct _ts’} has no member named ‘exc_value’; did you mean ‘curexc_value’?\n",
|
||||||
|
" 5598 | tmp_value = tstate->exc_value;\n",
|
||||||
|
" | ^~~~~~~~~\n",
|
||||||
|
" | curexc_value\n",
|
||||||
|
" python/kenlm.cpp:5599:22: error: ‘PyThreadState’ {aka ‘struct _ts’} has no member named ‘exc_traceback’; did you mean ‘curexc_traceback’?\n",
|
||||||
|
" 5599 | tmp_tb = tstate->exc_traceback;\n",
|
||||||
|
" | ^~~~~~~~~~~~~\n",
|
||||||
|
" | curexc_traceback\n",
|
||||||
|
" python/kenlm.cpp:5600:13: error: ‘PyThreadState’ {aka ‘struct _ts’} has no member named ‘exc_type’; did you mean ‘curexc_type’?\n",
|
||||||
|
" 5600 | tstate->exc_type = type;\n",
|
||||||
|
" | ^~~~~~~~\n",
|
||||||
|
" | curexc_type\n",
|
||||||
|
" python/kenlm.cpp:5601:13: error: ‘PyThreadState’ {aka ‘struct _ts’} has no member named ‘exc_value’; did you mean ‘curexc_value’?\n",
|
||||||
|
" 5601 | tstate->exc_value = value;\n",
|
||||||
|
" | ^~~~~~~~~\n",
|
||||||
|
" | curexc_value\n",
|
||||||
|
" python/kenlm.cpp:5602:13: error: ‘PyThreadState’ {aka ‘struct _ts’} has no member named ‘exc_traceback’; did you mean ‘curexc_traceback’?\n",
|
||||||
|
" 5602 | tstate->exc_traceback = tb;\n",
|
||||||
|
" | ^~~~~~~~~~~~~\n",
|
||||||
|
" | curexc_traceback\n",
|
||||||
|
" python/kenlm.cpp: In function ‘int __Pyx_GetException(PyObject**, PyObject**, PyObject**)’:\n",
|
||||||
|
" python/kenlm.cpp:5645:24: error: ‘PyThreadState’ {aka ‘struct _ts’} has no member named ‘exc_type’; did you mean ‘curexc_type’?\n",
|
||||||
|
" 5645 | tmp_type = tstate->exc_type;\n",
|
||||||
|
" | ^~~~~~~~\n",
|
||||||
|
" | curexc_type\n",
|
||||||
|
" python/kenlm.cpp:5646:25: error: ‘PyThreadState’ {aka ‘struct _ts’} has no member named ‘exc_value’; did you mean ‘curexc_value’?\n",
|
||||||
|
" 5646 | tmp_value = tstate->exc_value;\n",
|
||||||
|
" | ^~~~~~~~~\n",
|
||||||
|
" | curexc_value\n",
|
||||||
|
" python/kenlm.cpp:5647:22: error: ‘PyThreadState’ {aka ‘struct _ts’} has no member named ‘exc_traceback’; did you mean ‘curexc_traceback’?\n",
|
||||||
|
" 5647 | tmp_tb = tstate->exc_traceback;\n",
|
||||||
|
" | ^~~~~~~~~~~~~\n",
|
||||||
|
" | curexc_traceback\n",
|
||||||
|
" python/kenlm.cpp:5648:13: error: ‘PyThreadState’ {aka ‘struct _ts’} has no member named ‘exc_type’; did you mean ‘curexc_type’?\n",
|
||||||
|
" 5648 | tstate->exc_type = local_type;\n",
|
||||||
|
" | ^~~~~~~~\n",
|
||||||
|
" | curexc_type\n",
|
||||||
|
" python/kenlm.cpp:5649:13: error: ‘PyThreadState’ {aka ‘struct _ts’} has no member named ‘exc_value’; did you mean ‘curexc_value’?\n",
|
||||||
|
" 5649 | tstate->exc_value = local_value;\n",
|
||||||
|
" | ^~~~~~~~~\n",
|
||||||
|
" | curexc_value\n",
|
||||||
|
" python/kenlm.cpp:5650:13: error: ‘PyThreadState’ {aka ‘struct _ts’} has no member named ‘exc_traceback’; did you mean ‘curexc_traceback’?\n",
|
||||||
|
" 5650 | tstate->exc_traceback = local_tb;\n",
|
||||||
|
" | ^~~~~~~~~~~~~\n",
|
||||||
|
" | curexc_traceback\n",
|
||||||
|
" python/kenlm.cpp: In function ‘void __Pyx_ExceptionSwap(PyObject**, PyObject**, PyObject**)’:\n",
|
||||||
|
" python/kenlm.cpp:6376:24: error: ‘PyThreadState’ {aka ‘struct _ts’} has no member named ‘exc_type’; did you mean ‘curexc_type’?\n",
|
||||||
|
" 6376 | tmp_type = tstate->exc_type;\n",
|
||||||
|
" | ^~~~~~~~\n",
|
||||||
|
" | curexc_type\n",
|
||||||
|
" python/kenlm.cpp:6377:25: error: ‘PyThreadState’ {aka ‘struct _ts’} has no member named ‘exc_value’; did you mean ‘curexc_value’?\n",
|
||||||
|
" 6377 | tmp_value = tstate->exc_value;\n",
|
||||||
|
" | ^~~~~~~~~\n",
|
||||||
|
" | curexc_value\n",
|
||||||
|
" python/kenlm.cpp:6378:22: error: ‘PyThreadState’ {aka ‘struct _ts’} has no member named ‘exc_traceback’; did you mean ‘curexc_traceback’?\n",
|
||||||
|
" 6378 | tmp_tb = tstate->exc_traceback;\n",
|
||||||
|
" | ^~~~~~~~~~~~~\n",
|
||||||
|
" | curexc_traceback\n",
|
||||||
|
" python/kenlm.cpp:6379:13: error: ‘PyThreadState’ {aka ‘struct _ts’} has no member named ‘exc_type’; did you mean ‘curexc_type’?\n",
|
||||||
|
" 6379 | tstate->exc_type = *type;\n",
|
||||||
|
" | ^~~~~~~~\n",
|
||||||
|
" | curexc_type\n",
|
||||||
|
" python/kenlm.cpp:6380:13: error: ‘PyThreadState’ {aka ‘struct _ts’} has no member named ‘exc_value’; did you mean ‘curexc_value’?\n",
|
||||||
|
" 6380 | tstate->exc_value = *value;\n",
|
||||||
|
" | ^~~~~~~~~\n",
|
||||||
|
" | curexc_value\n",
|
||||||
|
" python/kenlm.cpp:6381:13: error: ‘PyThreadState’ {aka ‘struct _ts’} has no member named ‘exc_traceback’; did you mean ‘curexc_traceback’?\n",
|
||||||
|
" 6381 | tstate->exc_traceback = *tb;\n",
|
||||||
|
" | ^~~~~~~~~~~~~\n",
|
||||||
|
" | curexc_traceback\n",
|
||||||
|
" error: command '/usr/bin/gcc' failed with exit code 1\n",
|
||||||
|
" ----------------------------------------\u001b[0m\n",
|
||||||
|
"\u001b[31m ERROR: Failed building wheel for kenlm\u001b[0m\n",
|
||||||
|
"\u001b[?25h Running setup.py clean for kenlm\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Failed to build kenlm\n",
|
||||||
|
"Installing collected packages: kenlm\n",
|
||||||
|
" Running setup.py install for kenlm ... \u001b[?25l-"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"!pip install kenlm"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "a849ad70",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import kenlm\n",
|
||||||
|
"model = kenlm.Model(\"\")"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.10.4"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
211
testing.ipynb
Normal file
211
testing.ipynb
Normal file
@ -0,0 +1,211 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "21c9b695",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"import csv\n",
|
||||||
|
"import regex as re\n",
|
||||||
|
"import nltk\n",
|
||||||
|
"from collections import Counter, defaultdict\n",
|
||||||
|
"import string\n",
|
||||||
|
"import unicodedata\n",
|
||||||
|
"\n",
|
||||||
|
"def clean_text(text): \n",
|
||||||
|
" return re.sub(r\"\\p{P}\", \"\", str(text).lower().replace(\"-\\\\n\", \"\").replace(\"\\\\n\", \" \"))\n",
|
||||||
|
"\n",
|
||||||
|
"def train_model(data, model):\n",
|
||||||
|
" for _, row in data.iterrows():\n",
|
||||||
|
" words = nltk.word_tokenize(clean_text(row[\"final\"]))\n",
|
||||||
|
" for w1, w2 in nltk.bigrams(words, pad_left=True, pad_right=True):\n",
|
||||||
|
" if w1 and w2:\n",
|
||||||
|
" model[w2][w1] += 1\n",
|
||||||
|
" for w1 in model:\n",
|
||||||
|
" total_count = float(sum(model[w1].values()))\n",
|
||||||
|
" for w2 in model[w1]:\n",
|
||||||
|
" model[w2][w1] /= total_count\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"def predict(word, model):\n",
|
||||||
|
" predictions = dict(model[word])\n",
|
||||||
|
" most_common = dict(Counter(predictions).most_common(5))\n",
|
||||||
|
"\n",
|
||||||
|
" total_prob = 0.0\n",
|
||||||
|
" str_prediction = \"\"\n",
|
||||||
|
"\n",
|
||||||
|
" for word, prob in most_common.items():\n",
|
||||||
|
" total_prob += prob\n",
|
||||||
|
" str_prediction += f\"{word}:{prob} \"\n",
|
||||||
|
"\n",
|
||||||
|
" if not total_prob:\n",
|
||||||
|
" return \"the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1\"\n",
|
||||||
|
"\n",
|
||||||
|
" if 1 - total_prob >= 0.01:\n",
|
||||||
|
" str_prediction += f\":{1-total_prob}\"\n",
|
||||||
|
" else:\n",
|
||||||
|
" str_prediction += f\":0.01\"\n",
|
||||||
|
"\n",
|
||||||
|
" return str_prediction\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"def predict_data(read_path, save_path, model):\n",
|
||||||
|
" data = pd.read_csv(\n",
|
||||||
|
" read_path, sep=\"\\t\", error_bad_lines=False, header=None, quoting=csv.QUOTE_NONE\n",
|
||||||
|
" )\n",
|
||||||
|
" with open(save_path, \"w\") as file:\n",
|
||||||
|
" for _, row in data.iterrows():\n",
|
||||||
|
" words = nltk.word_tokenize(clean_text(row[7]))\n",
|
||||||
|
" if len(words) < 3:\n",
|
||||||
|
" prediction = \"the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1\"\n",
|
||||||
|
" else:\n",
|
||||||
|
" prediction = predict(words[-1], model)\n",
|
||||||
|
" file.write(prediction + \"\\n\")\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "e39473e2",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"with open(\"in-header.tsv\") as f:\n",
|
||||||
|
" in_cols = f.read().strip().split(\"\\t\")\n",
|
||||||
|
"\n",
|
||||||
|
"with open(\"out-header.tsv\") as f:\n",
|
||||||
|
" out_cols = f.read().strip().split(\"\\t\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "bde510c9",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"['FileId', 'Year', 'LeftContext', 'RightContext']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"in_cols"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "0e8b31dd",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"['Word']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"out_cols"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "7662d802",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"data = pd.read_csv(\n",
|
||||||
|
" \"train/in.tsv.xz\",\n",
|
||||||
|
" sep=\"\\t\",\n",
|
||||||
|
" on_bad_lines='skip',\n",
|
||||||
|
" header=None,\n",
|
||||||
|
" # names=in_cols,\n",
|
||||||
|
" quoting=csv.QUOTE_NONE,\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"train_labels = pd.read_csv(\n",
|
||||||
|
" \"train/expected.tsv\",\n",
|
||||||
|
" sep=\"\\t\",\n",
|
||||||
|
" on_bad_lines='skip',\n",
|
||||||
|
" header=None,\n",
|
||||||
|
" # names=out_cols,\n",
|
||||||
|
" quoting=csv.QUOTE_NONE,\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"train_data = data[[7, 6]]\n",
|
||||||
|
"train_data = pd.concat([train_data, train_labels], axis=1)\n",
|
||||||
|
"\n",
|
||||||
|
"train_data[\"final\"] = train_data[7] + train_data[0] + train_data[6]\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "c3d2cfec",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"train_data"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "bd92ba07",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"\n",
|
||||||
|
"model = defaultdict(lambda: defaultdict(lambda: 0))\n",
|
||||||
|
"\n",
|
||||||
|
"train_model(train_data, model)\n",
|
||||||
|
"predict_data(\"dev-0/in.tsv.xz\", \"dev-0/out.tsv\", model)\n",
|
||||||
|
"predict_data(\"test-A/in.tsv.xz\", \"test-A/out.tsv\", model)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "ad23240e",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.10.4"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
38
utils.py
Normal file
38
utils.py
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
import nltk
|
||||||
|
import pandas as pd
|
||||||
|
import regex as re
|
||||||
|
from csv import QUOTE_NONE
|
||||||
|
|
||||||
|
# Text encoding handed to pandas by get_csv().
ENCODING = "utf-8"

# Runs of brackets, the listed symbols, tab/newline characters and ASCII
# digits; clean_text() replaces each run with a single space.
REP = re.compile(r"[{}\[\]\&%^$*#\(\)@\t\n0123456789]+")
# Fragments clean_text() deletes outright: "'s", a hyphen followed by the
# literal two characters backslash + "n" (this alternative is listed twice),
# and any Unicode punctuation character (\p{P}, which needs the `regex`
# module imported above as `re`).
REM = re.compile(r"'s|[\-]\\n|\-\\n|\p{P}")
|
||||||
|
|
||||||
|
|
||||||
|
def clean_text(text):
    """Normalise a raw text field into lower-case, punctuation-free words.

    The value is converted with ``str()``, lower-cased and stripped, then:

    1. the typographic apostrophe is folded into the ASCII one;
    2. line breaks written as the literal characters backslash + "n" are
       resolved (a hyphenated break re-joins the word, a plain break
       becomes a space);
    3. the contractions "won't", "'ll" and "'m" are expanded;
    4. ``REM`` deletes "'s" and every remaining punctuation character;
    5. ``REP`` turns runs of digits/symbols/whitespace controls into a space.

    Steps 2 and 3 must run before ``REM``: its ``\\p{P}`` alternative removes
    every apostrophe and backslash, so contraction rules applied afterwards
    can never match and a plain backslash + "n" would leave a stray "n"
    fused into the neighbouring words.

    Args:
        text: Any value; it is stringified first, so non-string cells
            (e.g. NaN) are accepted.

    Returns:
        The cleaned string. Consecutive spaces are not collapsed.
    """
    res = str(text).lower().strip()
    # One apostrophe spelling so the contraction rules below are sufficient.
    res = res.replace("’", "'")

    # REM's hyphen alternative shows the input encodes line breaks as the
    # two characters backslash + "n"; resolve them while the backslash is
    # still present.
    res = res.replace("-\\n", "")  # word split across lines: re-join it
    res = res.replace("\\n", " ")  # ordinary line break: word separator

    # Expand contractions while the apostrophe still exists.  "'s" is left
    # to REM, which drops it (the behaviour the function always had).
    res = res.replace("won't", "will not")
    res = res.replace("'ll", " will")
    res = res.replace("'m", " am")

    res = REM.sub("", res)
    return REP.sub(" ", res)
|
||||||
|
|
||||||
|
|
||||||
|
def get_csv(fname):
    """Read a headerless tab-separated file into a pandas DataFrame.

    Args:
        fname: Path (or anything ``pandas.read_csv`` accepts) of the file.

    Returns:
        The DataFrame produced by ``pandas.read_csv``. With ``header=None``
        the first line is treated as data rather than column names.
    """
    read_options = {
        "sep": "\t",
        # Malformed rows are dropped instead of aborting the whole load.
        "on_bad_lines": "skip",
        "header": None,
        # Quote characters in the text are ordinary data, not field quoting.
        "quoting": QUOTE_NONE,
        "encoding": ENCODING,
    }
    return pd.read_csv(fname, **read_options)
|
||||||
|
|
||||||
|
|
||||||
|
def check_prerequisites():
    """Make sure the NLTK ``punkt`` tokenizer data is available.

    Looks the resource up locally and downloads it only when the lookup
    raises ``LookupError``.
    """
    try:
        nltk.data.find("tokenizers/punkt")
    except LookupError:
        # Not installed yet: fall through to the download below.
        pass
    else:
        return
    nltk.download("punkt")
|
Loading…
Reference in New Issue
Block a user