65 KiB
65 KiB
# Mount Google Drive at /content/drive so the files stored there are
# reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
cd drive/MyDrive
[Errno 2] No such file or directory: 'drive/MyDrive' /content/drive/MyDrive/challenging-america-word-gap-prediction
cd challenging-america-word-gap-prediction/
[Errno 2] No such file or directory: 'challenging-america-word-gap-prediction/' /content/drive/MyDrive/challenging-america-word-gap-prediction
import csv

import pandas as pd

# Both files are parsed with identical options so that row i of the contexts
# and row i of the expected words describe the same example.
#   quoting=csv.QUOTE_NONE  - the text is raw OCR output, so '"' is an ordinary
#                             character; with default quoting a field that
#                             starts with '"' swallows tabs and line breaks
#                             until the next quote, merging or dropping rows.
#   dtype=str,
#   keep_default_na=False   - keep every field as text; otherwise words such as
#                             "NA", "null" or "nan" are parsed as missing
#                             values and empty contexts become NaN.
TSV_OPTIONS = dict(
    sep="\t",
    header=None,
    encoding="utf-8",
    on_bad_lines="skip",
    quoting=csv.QUOTE_NONE,
    dtype=str,
    keep_default_na=False,
)

data = pd.read_csv("train/in.tsv.xz", **TSV_OPTIONS)
exp_words = pd.read_csv("train/expected.tsv", **TSV_OPTIONS)

# The frames are joined by position below, so a differing row count means
# every row after the first dropped/merged line would get the wrong word.
if len(data) != len(exp_words):
    raise ValueError(
        f"Row count mismatch: {len(data)} context rows vs "
        f"{len(exp_words)} expected words; the files cannot be aligned."
    )

# Columns 6 and 7 of the input are the text before and after the gap;
# column 0 of the expected file is the word that fills it.
train_data = pd.concat([data[[6, 7]], exp_words], axis=1)
train_data.rename(
    columns={6: 'First Part', 7: 'Second Part', 0: 'Expected word'},
    inplace=True,
)

# Join with spaces so the gap word is not fused with its neighbouring words.
train_data['Concatenated'] = (
    train_data['First Part']
    + ' ' + train_data['Expected word']
    + ' ' + train_data['Second Part']
)
import regex as re

# DataFrame.replace returns a new frame, so the result must be assigned back;
# the bare call left train_data unchanged.
# The pattern covers both a real newline character and the two-character
# sequence backslash + 'n' (the corpus appears to encode line breaks that way
# - TODO confirm). Line breaks become a space so that the words on either side
# are not glued together.
train_data = train_data.replace(r'\\n|\n', ' ', regex=True)
train_data
First Part | Second Part | Expected word | Concatenated | |
---|---|---|---|---|
0 | came fiom the last place to this\nplace, and t... | said\nit's all squash. The best I could get\ni... | lie | came fiom the last place to this\nplace, and t... |
1 | MB. BOOT'S POLITICAL OBEED\nAttempt to imagine... | \ninto a proper perspective with those\nminor ... | himself | MB. BOOT'S POLITICAL OBEED\nAttempt to imagine... |
2 | Thera were in 1771 only aeventy-nine\n*ub*erlb... | NaN | of | NaN |
3 | whenever any prize property shall!*' condemn- ... | the ceitihcate of'\noperate to prevent tfie ma... | ably | whenever any prize property shall!*' condemn- ... |
4 | SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\nJSIATF. ON T... | \nTerms of sale: One-tblrd, togethor with the ... | j | SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\nJSIATF. ON T... |
... | ... | ... | ... | ... |
428512 | Sam Clendenin bad a fancy for Ui«\nscience of ... | \nSam was arrested.\nThe case excited a great ... | NaN | NaN |
428513 | Wita.htt halting the party ware dilven to the ... | through the alnp the »Uitors laapeeeed tia.»\n... | NaN | NaN |
428514 | It was the last thing that either of\nthem exp... | Agua Negra across the line.\nIt was a grim pla... | NaN | NaN |
428515 | settlement with the department.\nIt is also sh... | \na note of Wood, Dialogue fc Co., for\nc27,im... | NaN | NaN |
428516 | Flour quotations—low extras at 1 R0®2 50;\ncit... | 3214c;do White at 3614c: Mixed Western at\n331... | NaN | NaN |
428517 rows × 4 columns
import regex as re

# DataFrame.replace returns a new frame, so the result must be assigned back;
# the bare call left train_data unchanged.
# The pattern covers both a real newline character and the two-character
# sequence backslash + 'n' (the corpus appears to encode line breaks that way
# - TODO confirm). Line breaks become a space so that the words on either side
# are not glued together. Re-running this cell is harmless.
train_data = train_data.replace(r'\\n|\n', ' ', regex=True)
train_data
First Part | Second Part | Expected word | Concatenated | |
---|---|---|---|---|
0 | came fiom the last place to this\nplace, and t... | said\nit's all squash. The best I could get\ni... | lie | came fiom the last place to this\nplace, and t... |
1 | MB. BOOT'S POLITICAL OBEED\nAttempt to imagine... | \ninto a proper perspective with those\nminor ... | himself | MB. BOOT'S POLITICAL OBEED\nAttempt to imagine... |
2 | Thera were in 1771 only aeventy-nine\n*ub*erlb... | NaN | of | NaN |
3 | whenever any prize property shall!*' condemn- ... | the ceitihcate of'\noperate to prevent tfie ma... | ably | whenever any prize property shall!*' condemn- ... |
4 | SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\nJSIATF. ON T... | \nTerms of sale: One-tblrd, togethor with the ... | j | SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\nJSIATF. ON T... |
... | ... | ... | ... | ... |
428512 | Sam Clendenin bad a fancy for Ui«\nscience of ... | \nSam was arrested.\nThe case excited a great ... | NaN | NaN |
428513 | Wita.htt halting the party ware dilven to the ... | through the alnp the »Uitors laapeeeed tia.»\n... | NaN | NaN |
428514 | It was the last thing that either of\nthem exp... | Agua Negra across the line.\nIt was a grim pla... | NaN | NaN |
428515 | settlement with the department.\nIt is also sh... | \na note of Wood, Dialogue fc Co., for\nc27,im... | NaN | NaN |
428516 | Flour quotations—low extras at 1 R0®2 50;\ncit... | 3214c;do White at 3614c: Mixed Western at\n331... | NaN | NaN |
428517 rows × 4 columns
from collections import Counter, defaultdict

import nltk

# Fetch the 'punkt' tokenizer data into the local nltk_data directory.
nltk.download('punkt')
[nltk_data] Downloading package punkt to /root/nltk_data... [nltk_data] Unzipping tokenizers/punkt.zip.
# Sanity check: the 'Concatenated' column is a pandas Series.
type(train_data['Concatenated'])
pandas.core.series.Series
# Install the kenlm Python package from PyPI.
# NOTE(review): a later cell installs kenlm from the GitHub archive, which
# uninstalls this build again - this cell looks redundant; confirm and drop.
! pip install kenlm
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/ Collecting kenlm Downloading kenlm-0.1.tar.gz (424 kB) [2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m425.0/425.0 kB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m [?25h Preparing metadata (setup.py) ... [?25l[?25hdone Building wheels for collected packages: kenlm Building wheel for kenlm (setup.py) ... [?25l[?25hdone Created wheel for kenlm: filename=kenlm-0.1-cp39-cp39-linux_x86_64.whl size=3001228 sha256=9936418a67cd8b93ca741cb6eebe33c29f80ca0eeb1befad98119b4ce5a95056 Stored in directory: /root/.cache/pip/wheels/34/4e/25/ef89c6aa677d672b9b6031e6f6b03d4a2340e358d479e86777 Successfully built kenlm Installing collected packages: kenlm Successfully installed kenlm-0.1
# Display the assembled training frame for inspection.
train_data
First Part | Second Part | Expected word | Concatenated | |
---|---|---|---|---|
0 | came fiom the last place to this\nplace, and t... | said\nit's all squash. The best I could get\ni... | lie | came fiom the last place to this\nplace, and t... |
1 | MB. BOOT'S POLITICAL OBEED\nAttempt to imagine... | \ninto a proper perspective with those\nminor ... | himself | MB. BOOT'S POLITICAL OBEED\nAttempt to imagine... |
2 | Thera were in 1771 only aeventy-nine\n*ub*erlb... | NaN | of | NaN |
3 | whenever any prize property shall!*' condemn- ... | the ceitihcate of'\noperate to prevent tfie ma... | ably | whenever any prize property shall!*' condemn- ... |
4 | SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\nJSIATF. ON T... | \nTerms of sale: One-tblrd, togethor with the ... | j | SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\nJSIATF. ON T... |
... | ... | ... | ... | ... |
428512 | Sam Clendenin bad a fancy for Ui«\nscience of ... | \nSam was arrested.\nThe case excited a great ... | NaN | NaN |
428513 | Wita.htt halting the party ware dilven to the ... | through the alnp the »Uitors laapeeeed tia.»\n... | NaN | NaN |
428514 | It was the last thing that either of\nthem exp... | Agua Negra across the line.\nIt was a grim pla... | NaN | NaN |
428515 | settlement with the department.\nIt is also sh... | \na note of Wood, Dialogue fc Co., for\nc27,im... | NaN | NaN |
428516 | Flour quotations—low extras at 1 R0®2 50;\ncit... | 3214c;do White at 3614c: Mixed Western at\n331... | NaN | NaN |
428517 rows × 4 columns
# Install the kenlm Python bindings from the upstream GitHub master archive;
# pip replaces any kenlm version that is already installed.
! pip install https://github.com/kpu/kenlm/archive/master.zip
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/ Collecting https://github.com/kpu/kenlm/archive/master.zip Downloading https://github.com/kpu/kenlm/archive/master.zip [2K [32m-[0m [32m553.5 kB[0m [31m8.4 MB/s[0m [33m0:00:00[0m [?25h Installing build dependencies ... [?25l[?25hdone Getting requirements to build wheel ... [?25l[?25hdone Preparing metadata (pyproject.toml) ... [?25l[?25hdone Building wheels for collected packages: kenlm Building wheel for kenlm (pyproject.toml) ... [?25l[?25hdone Created wheel for kenlm: filename=kenlm-0.0.0-cp39-cp39-linux_x86_64.whl size=3262507 sha256=d6011bc0a0c1321eb855313d02b28b6ebeb81f79bd161245da540f02b75259b9 Stored in directory: /tmp/pip-ephem-wheel-cache-m1yq1oil/wheels/b5/52/c9/af2949d9776846ea81a9cba52a4fe5a81b9ace3b9f2530c3f3 Successfully built kenlm Installing collected packages: kenlm Attempting uninstall: kenlm Found existing installation: kenlm 0.1 Uninstalling kenlm-0.1: Successfully uninstalled kenlm-0.1 Successfully installed kenlm-0.0.0
! git clone https://github.com/kpu/kenlm
Cloning into 'kenlm'... remote: Enumerating objects: 14147, done.[K remote: Counting objects: 100% (460/460), done.[K remote: Compressing objects: 100% (319/319), done.[K remote: Total 14147 (delta 152), reused 399 (delta 127), pack-reused 13687[K Receiving objects: 100% (14147/14147), 5.91 MiB | 8.32 MiB/s, done. Resolving deltas: 100% (8032/8032), done. Updating files: 100% (304/304), done.
cd kenlm
/content/drive/MyDrive/challenging-america-word-gap-prediction/kenlm
mkdir build
cd build
/content/drive/MyDrive/challenging-america-word-gap-prediction/kenlm/build
# Configure the KenLM build with CMake; '..' is the source tree, and the
# build files are generated in the current (build) directory.
! cmake ..
-- The C compiler identification is GNU 9.4.0 -- The CXX compiler identification is GNU 9.4.0 -- Detecting C compiler ABI info -- Detecting C compiler ABI info - done -- Check for working C compiler: /usr/bin/cc - skipped -- Detecting C compile features -- Detecting C compile features - done -- Detecting CXX compiler ABI info -- Detecting CXX compiler ABI info - done -- Check for working CXX compiler: /usr/bin/c++ - skipped -- Detecting CXX compile features -- Detecting CXX compile features - done -- Could NOT find Eigen3 (missing: Eigen3_DIR) -- Found Boost: /usr/lib/x86_64-linux-gnu/cmake/Boost-1.71.0/BoostConfig.cmake (found suitable version "1.71.0", minimum required is "1.41.0") found components: program_options system thread unit_test_framework -- Check if compiler accepts -pthread -- Check if compiler accepts -pthread - yes -- Found Threads: TRUE -- Found ZLIB: /usr/lib/x86_64-linux-gnu/libz.so (found version "1.2.11") -- Found BZip2: /usr/lib/x86_64-linux-gnu/libbz2.so (found version "1.0.8") -- Looking for BZ2_bzCompressInit -- Looking for BZ2_bzCompressInit - found -- Looking for lzma_auto_decoder in /usr/lib/x86_64-linux-gnu/liblzma.so -- Looking for lzma_auto_decoder in /usr/lib/x86_64-linux-gnu/liblzma.so - found -- Looking for lzma_easy_encoder in /usr/lib/x86_64-linux-gnu/liblzma.so -- Looking for lzma_easy_encoder in /usr/lib/x86_64-linux-gnu/liblzma.so - found -- Looking for lzma_lzma_preset in /usr/lib/x86_64-linux-gnu/liblzma.so -- Looking for lzma_lzma_preset in /usr/lib/x86_64-linux-gnu/liblzma.so - found -- Found LibLZMA: /usr/lib/x86_64-linux-gnu/liblzma.so (found version "5.2.4") -- Looking for clock_gettime in rt -- Looking for clock_gettime in rt - found -- Configuring done -- Generating done -- Build files have been written to: /content/drive/MyDrive/challenging-america-word-gap-prediction/kenlm/build
# Compile the KenLM libraries and tools (lmplz, build_binary, query, ...)
# using 4 parallel jobs.
! make -j 4
[ -1%] [32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/bignum.cc.o[0m [ 0%] [32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/fast-dtoa.cc.o[0m [ 2%] [32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/cached-powers.cc.o[0m [ 2%] [32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/bignum-dtoa.cc.o[0m [ 3%] [32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/fixed-dtoa.cc.o[0m [ 5%] [32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/strtod.cc.o[0m [ 6%] [32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/double-to-string.cc.o[0m [ 7%] [32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/double-conversion/string-to-double.cc.o[0m [ 8%] [32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/chain.cc.o[0m [ 10%] [32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/count_records.cc.o[0m [ 11%] [32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/io.cc.o[0m [ 12%] [32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/line_input.cc.o[0m [ 13%] [32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/multi_progress.cc.o[0m [ 15%] [32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/stream/rewindable_stream.cc.o[0m [ 16%] [32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/bit_packing.cc.o[0m [ 17%] [32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/ersatz_progress.cc.o[0m [ 18%] [32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/exception.cc.o[0m [ 20%] [32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/file.cc.o[0m [ 21%] [32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/file_piece.cc.o[0m [ 22%] [32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/float_to_string.cc.o[0m [ 23%] [32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/integer_to_string.cc.o[0m [ 25%] [32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/mmap.cc.o[0m [ 26%] [32mBuilding CXX 
object util/CMakeFiles/kenlm_util.dir/murmur_hash.cc.o[0m [ 27%] [32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/parallel_read.cc.o[0m [ 28%] [32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/pool.cc.o[0m [ 30%] [32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/read_compressed.cc.o[0m [ 31%] [32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/scoped.cc.o[0m [ 32%] [32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/spaces.cc.o[0m [ 33%] [32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/string_piece.cc.o[0m [ 35%] [32mBuilding CXX object util/CMakeFiles/kenlm_util.dir/usage.cc.o[0m [ 36%] [32m[1mLinking CXX static library ../lib/libkenlm_util.a[0m [ 36%] Built target kenlm_util [ 37%] [32mBuilding CXX object util/CMakeFiles/probing_hash_table_benchmark.dir/probing_hash_table_benchmark_main.cc.o[0m [ 38%] [32mBuilding CXX object lm/filter/CMakeFiles/kenlm_filter.dir/arpa_io.cc.o[0m [ 40%] [32mBuilding CXX object lm/filter/CMakeFiles/kenlm_filter.dir/phrase.cc.o[0m [ 41%] [32mBuilding CXX object lm/CMakeFiles/kenlm.dir/bhiksha.cc.o[0m [ 42%] [32mBuilding CXX object lm/CMakeFiles/kenlm.dir/binary_format.cc.o[0m [ 43%] [32mBuilding CXX object lm/filter/CMakeFiles/kenlm_filter.dir/vocab.cc.o[0m [ 45%] [32mBuilding CXX object lm/CMakeFiles/kenlm.dir/config.cc.o[0m [ 46%] [32mBuilding CXX object lm/CMakeFiles/kenlm.dir/lm_exception.cc.o[0m [ 47%] [32mBuilding CXX object lm/CMakeFiles/kenlm.dir/model.cc.o[0m [ 48%] [32mBuilding CXX object lm/CMakeFiles/kenlm.dir/quantize.cc.o[0m [ 50%] [32m[1mLinking CXX static library ../../lib/libkenlm_filter.a[0m [ 50%] Built target kenlm_filter [ 51%] [32mBuilding CXX object lm/CMakeFiles/kenlm.dir/read_arpa.cc.o[0m [ 52%] [32mBuilding CXX object lm/CMakeFiles/kenlm.dir/search_hashed.cc.o[0m [ 53%] [32mBuilding CXX object lm/CMakeFiles/kenlm.dir/search_trie.cc.o[0m [ 55%] [32mBuilding CXX object lm/CMakeFiles/kenlm.dir/sizes.cc.o[0m [ 56%] [32mBuilding CXX object lm/CMakeFiles/kenlm.dir/trie.cc.o[0m [ 57%] 
[32mBuilding CXX object lm/CMakeFiles/kenlm.dir/trie_sort.cc.o[0m [ 58%] [32mBuilding CXX object lm/CMakeFiles/kenlm.dir/value_build.cc.o[0m [ 60%] [32m[1mLinking CXX executable ../bin/probing_hash_table_benchmark[0m [ 60%] Built target probing_hash_table_benchmark [ 61%] [32mBuilding CXX object lm/CMakeFiles/kenlm.dir/virtual_interface.cc.o[0m [ 62%] [32mBuilding CXX object lm/CMakeFiles/kenlm.dir/vocab.cc.o[0m [ 63%] [32mBuilding CXX object lm/CMakeFiles/kenlm.dir/common/model_buffer.cc.o[0m [ 65%] [32mBuilding CXX object lm/CMakeFiles/kenlm.dir/common/print.cc.o[0m [ 66%] [32mBuilding CXX object lm/CMakeFiles/kenlm.dir/common/renumber.cc.o[0m [ 67%] [32mBuilding CXX object lm/CMakeFiles/kenlm.dir/common/size_option.cc.o[0m [ 68%] [32m[1mLinking CXX static library ../lib/libkenlm.a[0m [ 68%] Built target kenlm [ 70%] [32mBuilding CXX object lm/CMakeFiles/query.dir/query_main.cc.o[0m [ 72%] [32mBuilding CXX object lm/CMakeFiles/build_binary.dir/build_binary_main.cc.o[0m [ 73%] [32mBuilding CXX object lm/CMakeFiles/kenlm_benchmark.dir/kenlm_benchmark_main.cc.o[0m [ 73%] [32mBuilding CXX object lm/CMakeFiles/fragment.dir/fragment_main.cc.o[0m [ 75%] [32m[1mLinking CXX executable ../bin/fragment[0m [ 75%] Built target fragment [ 76%] [32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/adjust_counts.cc.o[0m [ 77%] [32m[1mLinking CXX executable ../bin/build_binary[0m [ 77%] Built target build_binary [ 78%] [32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/corpus_count.cc.o[0m [ 80%] [32m[1mLinking CXX executable ../bin/query[0m [ 80%] Built target query [ 81%] [32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/initial_probabilities.cc.o[0m [ 82%] [32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/interpolate.cc.o[0m [ 83%] [32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/output.cc.o[0m [ 85%] [32mBuilding CXX object lm/builder/CMakeFiles/kenlm_builder.dir/pipeline.cc.o[0m [ 86%] [32mBuilding CXX object 
lm/filter/CMakeFiles/filter.dir/filter_main.cc.o[0m [ 87%] [32mBuilding CXX object lm/filter/CMakeFiles/phrase_table_vocab.dir/phrase_table_vocab_main.cc.o[0m [ 88%] [32m[1mLinking CXX executable ../../bin/phrase_table_vocab[0m [ 88%] Built target phrase_table_vocab [ 90%] [32m[1mLinking CXX executable ../bin/kenlm_benchmark[0m [ 90%] Built target kenlm_benchmark [ 91%] [32m[1mLinking CXX static library ../../lib/libkenlm_builder.a[0m [ 91%] Built target kenlm_builder [ 92%] [32mBuilding CXX object lm/builder/CMakeFiles/lmplz.dir/lmplz_main.cc.o[0m [ 93%] [32mBuilding CXX object lm/builder/CMakeFiles/count_ngrams.dir/count_ngrams_main.cc.o[0m [ 95%] [32m[1mLinking CXX executable ../../bin/filter[0m [ 95%] Built target filter [ 96%] [32m[1mLinking CXX executable ../../bin/lmplz[0m [ 96%] Built target lmplz [ 97%] [32m[1mLinking CXX executable ../../bin/count_ngrams[0m [ 97%] Built target count_ngrams
# Install the built headers, libraries and binaries under /usr/local.
! make install
[ 36%] Built target kenlm_util [ 38%] Built target probing_hash_table_benchmark [ 63%] Built target kenlm [ 66%] Built target query [ 68%] Built target fragment [ 71%] Built target build_binary [ 73%] Built target kenlm_benchmark [ 82%] Built target kenlm_builder [ 85%] Built target lmplz [ 87%] Built target count_ngrams [ 92%] Built target kenlm_filter [ 95%] Built target filter [ 97%] Built target phrase_table_vocab [36mInstall the project...[0m -- Install configuration: "Release" -- Installing: /usr/local/share/kenlm/cmake/kenlmTargets.cmake -- Installing: /usr/local/share/kenlm/cmake/kenlmTargets-release.cmake -- Installing: /usr/local/include/kenlm/util/bit_packing.hh -- Installing: /usr/local/include/kenlm/util/ersatz_progress.hh -- Installing: /usr/local/include/kenlm/util/exception.hh -- Installing: /usr/local/include/kenlm/util/fake_ostream.hh -- Installing: /usr/local/include/kenlm/util/file.hh -- Installing: /usr/local/include/kenlm/util/file_piece.hh -- Installing: /usr/local/include/kenlm/util/file_stream.hh -- Installing: /usr/local/include/kenlm/util/fixed_array.hh -- Installing: /usr/local/include/kenlm/util/float_to_string.hh -- Installing: /usr/local/include/kenlm/util/getopt.hh -- Installing: /usr/local/include/kenlm/util/have.hh -- Installing: /usr/local/include/kenlm/util/integer_to_string.hh -- Installing: /usr/local/include/kenlm/util/joint_sort.hh -- Installing: /usr/local/include/kenlm/util/mmap.hh -- Installing: /usr/local/include/kenlm/util/multi_intersection.hh -- Installing: /usr/local/include/kenlm/util/murmur_hash.hh -- Installing: /usr/local/include/kenlm/util/parallel_read.hh -- Installing: /usr/local/include/kenlm/util/pcqueue.hh -- Installing: /usr/local/include/kenlm/util/pool.hh -- Installing: /usr/local/include/kenlm/util/probing_hash_table.hh -- Installing: /usr/local/include/kenlm/util/proxy_iterator.hh -- Installing: /usr/local/include/kenlm/util/read_compressed.hh -- Installing: /usr/local/include/kenlm/util/scoped.hh -- 
Installing: /usr/local/include/kenlm/util/sized_iterator.hh -- Installing: /usr/local/include/kenlm/util/sorted_uniform.hh -- Installing: /usr/local/include/kenlm/util/spaces.hh -- Installing: /usr/local/include/kenlm/util/string_piece.hh -- Installing: /usr/local/include/kenlm/util/string_piece_hash.hh -- Installing: /usr/local/include/kenlm/util/string_stream.hh -- Installing: /usr/local/include/kenlm/util/thread_pool.hh -- Installing: /usr/local/include/kenlm/util/tokenize_piece.hh -- Installing: /usr/local/include/kenlm/util/usage.hh -- Installing: /usr/local/include/kenlm/util/double-conversion/bignum-dtoa.h -- Installing: /usr/local/include/kenlm/util/double-conversion/bignum.h -- Installing: /usr/local/include/kenlm/util/double-conversion/cached-powers.h -- Installing: /usr/local/include/kenlm/util/double-conversion/diy-fp.h -- Installing: /usr/local/include/kenlm/util/double-conversion/double-conversion.h -- Installing: /usr/local/include/kenlm/util/double-conversion/double-to-string.h -- Installing: /usr/local/include/kenlm/util/double-conversion/fast-dtoa.h -- Installing: /usr/local/include/kenlm/util/double-conversion/fixed-dtoa.h -- Installing: /usr/local/include/kenlm/util/double-conversion/ieee.h -- Installing: /usr/local/include/kenlm/util/double-conversion/string-to-double.h -- Installing: /usr/local/include/kenlm/util/double-conversion/strtod.h -- Installing: /usr/local/include/kenlm/util/double-conversion/utils.h -- Installing: /usr/local/include/kenlm/util/stream/block.hh -- Installing: /usr/local/include/kenlm/util/stream/chain.hh -- Installing: /usr/local/include/kenlm/util/stream/config.hh -- Installing: /usr/local/include/kenlm/util/stream/count_records.hh -- Installing: /usr/local/include/kenlm/util/stream/io.hh -- Installing: /usr/local/include/kenlm/util/stream/line_input.hh -- Installing: /usr/local/include/kenlm/util/stream/multi_progress.hh -- Installing: /usr/local/include/kenlm/util/stream/multi_stream.hh -- Installing: 
/usr/local/include/kenlm/util/stream/rewindable_stream.hh -- Installing: /usr/local/include/kenlm/util/stream/sort.hh -- Installing: /usr/local/include/kenlm/util/stream/stream.hh -- Installing: /usr/local/include/kenlm/util/stream/typed_stream.hh -- Installing: /usr/local/include/kenlm/lm/bhiksha.hh -- Installing: /usr/local/include/kenlm/lm/binary_format.hh -- Installing: /usr/local/include/kenlm/lm/blank.hh -- Installing: /usr/local/include/kenlm/lm/config.hh -- Installing: /usr/local/include/kenlm/lm/enumerate_vocab.hh -- Installing: /usr/local/include/kenlm/lm/facade.hh -- Installing: /usr/local/include/kenlm/lm/left.hh -- Installing: /usr/local/include/kenlm/lm/lm_exception.hh -- Installing: /usr/local/include/kenlm/lm/max_order.hh -- Installing: /usr/local/include/kenlm/lm/model.hh -- Installing: /usr/local/include/kenlm/lm/model_type.hh -- Installing: /usr/local/include/kenlm/lm/ngram_query.hh -- Installing: /usr/local/include/kenlm/lm/partial.hh -- Installing: /usr/local/include/kenlm/lm/quantize.hh -- Installing: /usr/local/include/kenlm/lm/read_arpa.hh -- Installing: /usr/local/include/kenlm/lm/return.hh -- Installing: /usr/local/include/kenlm/lm/search_hashed.hh -- Installing: /usr/local/include/kenlm/lm/search_trie.hh -- Installing: /usr/local/include/kenlm/lm/sizes.hh -- Installing: /usr/local/include/kenlm/lm/state.hh -- Installing: /usr/local/include/kenlm/lm/trie.hh -- Installing: /usr/local/include/kenlm/lm/trie_sort.hh -- Installing: /usr/local/include/kenlm/lm/value.hh -- Installing: /usr/local/include/kenlm/lm/value_build.hh -- Installing: /usr/local/include/kenlm/lm/virtual_interface.hh -- Installing: /usr/local/include/kenlm/lm/vocab.hh -- Installing: /usr/local/include/kenlm/lm/weights.hh -- Installing: /usr/local/include/kenlm/lm/word_index.hh -- Installing: /usr/local/include/kenlm/lm/builder/adjust_counts.hh -- Installing: /usr/local/include/kenlm/lm/builder/combine_counts.hh -- Installing: 
/usr/local/include/kenlm/lm/builder/corpus_count.hh -- Installing: /usr/local/include/kenlm/lm/builder/debug_print.hh -- Installing: /usr/local/include/kenlm/lm/builder/discount.hh -- Installing: /usr/local/include/kenlm/lm/builder/hash_gamma.hh -- Installing: /usr/local/include/kenlm/lm/builder/header_info.hh -- Installing: /usr/local/include/kenlm/lm/builder/initial_probabilities.hh -- Installing: /usr/local/include/kenlm/lm/builder/interpolate.hh -- Installing: /usr/local/include/kenlm/lm/builder/output.hh -- Installing: /usr/local/include/kenlm/lm/builder/payload.hh -- Installing: /usr/local/include/kenlm/lm/builder/pipeline.hh -- Installing: /usr/local/include/kenlm/lm/common/compare.hh -- Installing: /usr/local/include/kenlm/lm/common/joint_order.hh -- Installing: /usr/local/include/kenlm/lm/common/model_buffer.hh -- Installing: /usr/local/include/kenlm/lm/common/ngram.hh -- Installing: /usr/local/include/kenlm/lm/common/ngram_stream.hh -- Installing: /usr/local/include/kenlm/lm/common/print.hh -- Installing: /usr/local/include/kenlm/lm/common/renumber.hh -- Installing: /usr/local/include/kenlm/lm/common/size_option.hh -- Installing: /usr/local/include/kenlm/lm/common/special.hh -- Installing: /usr/local/include/kenlm/lm/filter/arpa_io.hh -- Installing: /usr/local/include/kenlm/lm/filter/count_io.hh -- Installing: /usr/local/include/kenlm/lm/filter/format.hh -- Installing: /usr/local/include/kenlm/lm/filter/phrase.hh -- Installing: /usr/local/include/kenlm/lm/filter/thread.hh -- Installing: /usr/local/include/kenlm/lm/filter/vocab.hh -- Installing: /usr/local/include/kenlm/lm/filter/wrapper.hh -- Installing: /usr/local/include/kenlm/lm/interpolate/backoff_matrix.hh -- Installing: /usr/local/include/kenlm/lm/interpolate/backoff_reunification.hh -- Installing: /usr/local/include/kenlm/lm/interpolate/bounded_sequence_encoding.hh -- Installing: /usr/local/include/kenlm/lm/interpolate/interpolate_info.hh -- Installing: 
/usr/local/include/kenlm/lm/interpolate/merge_probabilities.hh -- Installing: /usr/local/include/kenlm/lm/interpolate/merge_vocab.hh -- Installing: /usr/local/include/kenlm/lm/interpolate/normalize.hh -- Installing: /usr/local/include/kenlm/lm/interpolate/pipeline.hh -- Installing: /usr/local/include/kenlm/lm/interpolate/split_worker.hh -- Installing: /usr/local/include/kenlm/lm/interpolate/tune_derivatives.hh -- Installing: /usr/local/include/kenlm/lm/interpolate/tune_instances.hh -- Installing: /usr/local/include/kenlm/lm/interpolate/tune_matrix.hh -- Installing: /usr/local/include/kenlm/lm/interpolate/tune_weights.hh -- Installing: /usr/local/include/kenlm/lm/interpolate/universal_vocab.hh -- Installing: /usr/local/share/kenlm/cmake/kenlmConfig.cmake -- Installing: /usr/local/lib/libkenlm_util.a -- Installing: /usr/local/bin/probing_hash_table_benchmark -- Installing: /usr/local/lib/libkenlm.a -- Installing: /usr/local/bin/query -- Installing: /usr/local/bin/fragment -- Installing: /usr/local/bin/build_binary -- Installing: /usr/local/bin/kenlm_benchmark -- Installing: /usr/local/bin/lmplz -- Installing: /usr/local/bin/count_ngrams -- Installing: /usr/local/lib/libkenlm_builder.a -- Installing: /usr/local/bin/filter -- Installing: /usr/local/bin/phrase_table_vocab -- Installing: /usr/local/lib/libkenlm_filter.a
# Directory of the KenLM CMake build; the lmplz binary is run from its bin/
# subdirectory further down.
KENLM_BUILD_PATH = "/content/drive/MyDrive/challenging-america-word-gap-prediction/kenlm/build"
import numpy as np

# Rows whose concatenation is missing become empty strings so that every
# value is a str when the corpus file is written. fillna is the purpose-built
# call for this; replace(np.nan, ..., regex=True) only worked incidentally,
# since a regex flag is meaningless for a non-string pattern.
train_data['Concatenated'] = train_data['Concatenated'].fillna('')
train_data['Concatenated']
0 came fiom the last place to this\nplace, and t... 1 MB. BOOT'S POLITICAL OBEED\nAttempt to imagine... 2 3 whenever any prize property shall!*' condemn- ... 4 SA LKOFVALUABLE UNIMPBOV&D RE\\\\L\nJSIATF. ON T... ... 428512 428513 428514 428515 428516 Name: Concatenated, Length: 428517, dtype: object
with open("new_train", "w+") as f:
for t in train_data['Concatenated']:
f.write(t + "\n")
# Estimate an order-4 (-o 4) n-gram model with lmplz: the corpus is read from
# new_train on stdin and the model is written in ARPA format to model.arpa,
# both relative to the current working directory.
!$KENLM_BUILD_PATH/bin/lmplz -o 4 < new_train > model.arpa
=== 1/5 Counting and sorting n-grams === Reading /content/drive/MyDrive/challenging-america-word-gap-prediction/kenlm/build/new_train ----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100 **************************************************************************************************** Unigram tokens 77591849 types 8606129 === 2/5 Calculating and sorting adjusted counts === Chain sizes: 1:103273548 2:1836559360 3:3443548928 4:5509678080 Statistics: 1 8606129 D1=0.858113 D2=1.02331 D3+=1.17414 2 33054619 D1=0.888834 D2=1.06683 D3+=1.1959 3 57923129 D1=0.933907 D2=1.16291 D3+=1.25462 4 69946963 D1=0.952529 D2=1.26688 D3+=1.32772 Memory estimate for binary LM: type MB probing 3496 assuming -p 1.5 probing 4049 assuming -r models -p 1.5 trie 1888 without quantization trie 1186 assuming -q 8 -b 8 quantization trie 1679 assuming -a 22 array pointer compression trie 977 assuming -a 22 -q 8 -b 8 array pointer compression and quantization === 3/5 Calculating and sorting initial probabilities === Chain sizes: 1:103273548 2:528873904 3:1158462580 4:1678727112 ----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100 #################################################################################################### === 4/5 Calculating and writing order-interpolated probabilities === Chain sizes: 1:103273548 2:528873904 3:1158462580 4:1678727112 ----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100 #################################################################################################### === 5/5 Writing ARPA model === ----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100 **************************************************************************************************** Name:lmplz VmPeak:10909296 kB VmRSS:3116 kB RSSMax:3969704 kB user:275.432 sys:59.5966 CPU:335.028 real:441.039
import kenlm