Compare commits
1 Commit
Author | SHA1 | Date
---|---|---
Th3NiKo | 35d0bbd849 |
@@ -0,0 +1,385 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "train = pd.read_csv(\"train/in.tsv.xz\", header=None, compression='xz', sep=\"\\t\", names=[\"text\", \"time\"])\n",
    "expected = pd.read_csv(\"train/expected.tsv\", header=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "train[\"expected\"] = expected"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count    185478.000000\n",
       "mean        303.405056\n",
       "std         494.328936\n",
       "min           3.000000\n",
       "25%          68.000000\n",
       "50%         151.000000\n",
       "75%         341.000000\n",
       "max       10251.000000\n",
       "Name: text, dtype: float64"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train[train[\"expected\"]==' S'][\"text\"].str.len().describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count    104063.000000\n",
       "mean        298.150995\n",
       "std         504.984133\n",
       "min           3.000000\n",
       "25%          65.000000\n",
       "50%         146.000000\n",
       "75%         330.000000\n",
       "max       10161.000000\n",
       "Name: text, dtype: float64"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train[train[\"expected\"]==' P'][\"text\"].str.len().describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package punkt to /home/th3niko/nltk_data...\n",
      "[nltk_data]   Unzipping tokenizers/punkt.zip.\n"
     ]
    }
   ],
   "source": [
    "import string\n",
    "from nltk import word_tokenize\n",
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "stopwords = set(stopwords.words('english'))\n",
    "nltk.download(\"punkt\")\n",
    "\n",
    "def clean_text(text):\n",
    "    text = word_tokenize(text)\n",
    "    text = [word.lower() for word in text if word.isalpha()]\n",
    "    punct = str.maketrans('', '', string.punctuation)\n",
    "    text = [word.translate(punct) for word in text]\n",
    "    text = [word for word in text if not word in stopwords]\n",
    "    return text\n",
    "\n",
    "train['text'] = train['text'].apply(clean_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0                               [medical, issues, recently]\n",
       "1         [supposedly, aluminum, barium, strontium, used...\n",
       "2                               [nobel, prizes, make, rich]\n",
       "3                           [came, article, stayed, doctor]\n",
       "4         [resorted, insults, got, owned, directly, afte...\n",
       "                                ...                        \n",
       "289536    [really, baby, shampoo, actually, highly, alka...\n",
       "289537    [gives, example, brendan, reilly, doctor, came...\n",
       "289538                                 [ca, fix, stupidity]\n",
       "289539    [excellent, points, also, looking, bit, progra...\n",
       "289540         [earlier, year, may, couple, days, ago, nov]\n",
       "Name: text, Length: 289541, dtype: object"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train['text']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import Counter\n",
    "def counter(text):\n",
    "    cnt = Counter()\n",
    "    for msgs in text:\n",
    "        for msg in msgs:\n",
    "            cnt[msg] += 1\n",
    "    return cnt\n",
    "\n",
    "text_cnt_s = counter(train[train['expected']==' S']['text'])\n",
    "text_cnt_p = counter(train[train['expected']==' P']['text'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "text_s = text_cnt_s.most_common(100)\n",
    "text_p = text_cnt_p.most_common(100)\n",
    "text_s = pd.DataFrame(text_s, columns=['words', 'counts'])\n",
    "text_p = pd.DataFrame(text_p, columns=['words', 'counts'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/th3niko/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
      "of pandas will change to not sort by default.\n",
      "\n",
      "To accept the future behavior, pass 'sort=False'.\n",
      "\n",
      "To retain the current behavior and silence the warning, pass 'sort=True'.\n",
      "\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>counts1</th>\n",
       "      <th>counts2</th>\n",
       "      <th>dataset</th>\n",
       "      <th>words1</th>\n",
       "      <th>words2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>39094.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>s</td>\n",
       "      <td>would</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>36978.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>s</td>\n",
       "      <td>like</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>36461.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>s</td>\n",
       "      <td>people</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>29143.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>s</td>\n",
       "      <td>one</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>26827.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>s</td>\n",
       "      <td>think</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>95</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3007.0</td>\n",
       "      <td>p</td>\n",
       "      <td>NaN</td>\n",
       "      <td>kind</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>96</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2990.0</td>\n",
       "      <td>p</td>\n",
       "      <td>NaN</td>\n",
       "      <td>show</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>97</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2970.0</td>\n",
       "      <td>p</td>\n",
       "      <td>NaN</td>\n",
       "      <td>far</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>98</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2964.0</td>\n",
       "      <td>p</td>\n",
       "      <td>NaN</td>\n",
       "      <td>feel</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>99</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2915.0</td>\n",
       "      <td>p</td>\n",
       "      <td>NaN</td>\n",
       "      <td>try</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>200 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    counts1  counts2 dataset  words1 words2\n",
       "0   39094.0      NaN       s   would    NaN\n",
       "1   36978.0      NaN       s    like    NaN\n",
       "2   36461.0      NaN       s  people    NaN\n",
       "3   29143.0      NaN       s     one    NaN\n",
       "4   26827.0      NaN       s   think    NaN\n",
       "..      ...      ...     ...     ...    ...\n",
       "95      NaN   3007.0       p     NaN   kind\n",
       "96      NaN   2990.0       p     NaN   show\n",
       "97      NaN   2970.0       p     NaN    far\n",
       "98      NaN   2964.0       p     NaN   feel\n",
       "99      NaN   2915.0       p     NaN    try\n",
       "\n",
       "[200 rows x 5 columns]"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "concatenated = pd.concat([text_s.assign(dataset='s'), text_p.assign(dataset='p')])\n",
    "concatenated\n",
    "sns.set(style=\"whitegrid\")\n",
"g = sns.catplot(x=\"words\", y=\"counts\", data=concatenated,\n",
|
||||||
|
" height=6, kind=\"bar\", palette=\"muted\",style=\"dataset\")"
|
||||||
|
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
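The top-100 counters built at the end of this notebook line up with the mostUsed*.txt word lists touched by this commit. Below is a minimal sketch of how such lists could be dumped from the notebook's counters; the file names and the "word, count" line format are assumptions inferred from the lists elsewhere in this diff, not something the notebook itself records:

# Hypothetical export of the notebook's Counter objects to word lists.
# Assumes text_cnt_s / text_cnt_p from the cells above are in scope.
def dump_most_common(cnt, path, n=1500):
    with open(path, "w") as f:
        for word, count in cnt.most_common(n):
            f.write(f"{word}, {count}\n")

dump_most_common(text_cnt_s, "mostUsedS.txt")
dump_most_common(text_cnt_p, "mostUsedP.txt")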
Binary file not shown (added image, 109 KiB).
2656 dev-0/out.tsv (file diff suppressed because it is too large)
1500 mostUsed.txt (file diff suppressed because it is too large)
1525 mostUsedP.txt (file diff suppressed because it is too large)
1531 mostUsedS.txt (file diff suppressed because it is too large)
@@ -0,0 +1,32 @@
#!/usr/bin/python3

import sys
import pickle
from math import log
from tokenizer import tokenize

model = pickle.load(open("model.pkl", "rb"))
pskeptic, vocabulary_size, skeptic_words_total, paranormal_words_total, skeptic_count, paranormal_count = model


for line in sys.stdin:
    document = line.rstrip()
    fields = document.split('\t')
    document = fields[0]
    terms = tokenize(document)

    log_prob_skeptic = log(pskeptic)
    log_prob_paranormal = log(1 - pskeptic)

    for term in terms:
        if term not in skeptic_count:
            skeptic_count[term] = 0
        if term not in paranormal_count:
            paranormal_count[term] = 0
        log_prob_skeptic += log((skeptic_count[term] + 1) / (skeptic_words_total + vocabulary_size))
        log_prob_paranormal += log((paranormal_count[term] + 1) / (paranormal_words_total + vocabulary_size))

    if log_prob_skeptic > log_prob_paranormal:
        print('S')
    else:
        print('P')
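For reference, the loop above implements add-one (Laplace) smoothed multinomial Naive Bayes. Writing p_S for pskeptic, c_S(t) and c_P(t) for the per-class term counts, N_S and N_P for skeptic_words_total and paranormal_words_total, and |V| for vocabulary_size, the two scores being compared are:

\log P(S \mid d) \propto \log p_S + \sum_{t \in d} \log \frac{c_S(t) + 1}{N_S + |V|}

\log P(P \mid d) \propto \log (1 - p_S) + \sum_{t \in d} \log \frac{c_P(t) + 1}{N_P + |V|}

and the script prints S exactly when the first score is larger.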
14 solve.py
@@ -1,14 +0,0 @@
#!/usr/bin/env python3
import pandas as pd
import re
import sys
# sort | uniq -c
#train = pd.read_csv("./train/in.tsv.xz", delimiter='\t')
#import sys
#for line in sys.stdin
#if re.search(r'UFO', line) print("P")
for line in sys.stdin:
    if re.search(r'(video|paranormal|happened|alien|camera|ghost|sleep|dream|moving|sky|contact|sightings|footage|photo|phenomena|phenomenon|spirit|shadow|board|window|creepy|wake|eye|film|circles|lol|extraterrestrial|floating|disclosure|civilization|record|glitch|driving|ufo|flash|sharing)', line.lower()):
        print("P")
    else:
        print("S")
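The deleted solve.py baseline simply fires on paranormal-flavoured keywords. A tiny, hypothetical demo of the behaviour it had (the sample strings and the abbreviated pattern are invented for illustration):

# Hypothetical demo of the deleted keyword baseline; the pattern is
# abbreviated from the full alternation in solve.py above.
import re

pattern = r'(video|paranormal|alien|ghost|ufo)'
for doc in ["I filmed a strange video last night", "A meta-analysis of homeopathy trials"]:
    print("P" if re.search(pattern, doc.lower()) else "S")
# -> P, then S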
2566 test-A/out.tsv (file diff suppressed because it is too large)
@@ -0,0 +1,25 @@
#!/usr/bin/python3

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import nltk
import re
import string


wordlist = set(nltk.corpus.words.words())
porter = PorterStemmer()
stop_words = set(stopwords.words('english'))
printable = set(string.printable)

def tokenize(d):
    d = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', 'thereisasimplelinkinside', d, flags=re.MULTILINE)
    d = re.sub(r'\\n', ' ', d)
    d = re.sub(r'\*|\'|\"|\/|~|_|=|-', ' ', d)
    d = ''.join(filter(lambda x: x in printable, d))
    tokenized = word_tokenize(d)
    #tokenized = re.split(r'\/|\\| ', d)
    lower = [w.lower() for w in tokenized]
    words = [w for w in lower if not w in stop_words]
    return words
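A quick smoke test for tokenize() from the file above; the sample sentence is invented and the exact token list is an assumption (it depends on the installed NLTK data), but it illustrates the URL substitution and stop-word filtering:

# Hypothetical usage; run from the same directory as the tokenizer module.
from tokenizer import tokenize

print(tokenize("I saw a UFO at https://example.com last night!"))
# Plausible output (an assumption, not a recorded result):
# ['saw', 'ufo', 'thereisasimplelinkinside', 'last', 'night', '!']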
@@ -0,0 +1,58 @@
#!/usr/bin/python3

import sys
import pickle
from tokenizer import tokenize


def train():
    documents_total = 0
    skeptic_documents_total = 0

    vocabulary = set()

    skeptic_words_total = 0
    paranormal_words_total = 0

    skeptic_count = {}
    paranormal_count = {}

    for line in sys.stdin:
        line = line.rstrip()
        fields = line.split('\t')
        label = fields[0].strip()
        document = fields[1]
        terms = tokenize(document)

        for t in terms:
            vocabulary.add(t)

        documents_total += 1
        if label == 'S':
            skeptic_documents_total += 1
            skeptic_words_total += len(terms)
            for term in terms:
                if term in skeptic_count:
                    skeptic_count[term] += 1
                else:
                    skeptic_count[term] = 1
        else:
            paranormal_words_total += len(terms)
            for term in terms:
                if term in paranormal_count:
                    paranormal_count[term] += 1
                else:
                    paranormal_count[term] = 1


    pskeptic = skeptic_documents_total / documents_total
    vocabulary_size = len(vocabulary)

    model = (pskeptic, vocabulary_size, skeptic_words_total, paranormal_words_total, skeptic_count, paranormal_count)
    pickle.dump(model, open("model.pkl", "wb"))

    print(paranormal_count)
    print(skeptic_words_total)


train()
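A minimal, hypothetical sanity check for the model.pkl this trainer writes; the unpacking order mirrors the tuple dumped by train(), and the variable names are local to this sketch:

# Hypothetical check of model.pkl produced by the trainer above.
import pickle

with open("model.pkl", "rb") as f:
    pskeptic, vocab_size, s_total, p_total, s_count, p_count = pickle.load(f)

assert 0.0 < pskeptic < 1.0                           # class prior is a probability
assert vocab_size >= max(len(s_count), len(p_count))  # vocabulary covers both classes
print(pskeptic, vocab_size, s_total, p_total)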
@@ -1,12 +0,0 @@
#!/bin/bash
input="../mostUsedP.txt"
while IFS= read -r line
do
    p=`xzcat in.tsv.xz | paste expected.tsv - | grep "P.* $line" | wc -l`
    s=`xzcat in.tsv.xz | paste expected.tsv - | grep "S.* $line" | wc -l`
    diff=$((p-s))
    if [ $p -ge $s ]
    then
        echo "$line, $diff"
    fi
done < "$input"
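A rough Python equivalent of the deleted shell loop above, under the assumption that ../mostUsedP.txt holds one word per line and that it runs inside a split directory next to in.tsv.xz and expected.tsv:

# Hypothetical re-implementation of the shell tally: for each word it
# counts labelled "label<TAB>text" lines matching "P.* word" vs "S.* word",
# mirroring the two grep pipelines.
import lzma
import re

with open("../mostUsedP.txt") as f:
    words = [w.strip() for w in f if w.strip()]

with lzma.open("in.tsv.xz", "rt") as texts, open("expected.tsv") as labels:
    rows = [lab.rstrip("\n") + "\t" + txt for lab, txt in zip(labels, texts)]

for word in words:
    p = sum(bool(re.search(rf"P.* {re.escape(word)}", row)) for row in rows)
    s = sum(bool(re.search(rf"S.* {re.escape(word)}", row)) for row in rows)
    if p >= s:
        print(f"{word}, {p - s}")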
@@ -1,202 +0,0 @@
video, 1790
UFO, 3604
saw, 958
light, 1910
paranormal, 1871
looks, 459
happened, 569
story, 324
night, 1327
alien, 1511
house, 1054
camera, 1611
aliens, 794
experience, 342
lights, 1214
looked, 193
object, 508
came, 1026
UFOs, 1097
room, 273
seeing, 99
ghost, 1301
videos, 645
nI, 0
sleep, 503
weird, 608
flying, 584
picture, 718
dream, 1191
stories, 385
moving, 494
space, 268
felt, 10
strange, 436
objects, 531
experiences, 519
technology, 189
watching, 8
sky, 769
fake, 698
military, 235
dont, 223
door, 401
contact, 333
planet, 45
sightings, 620
phone, 114
craft, 681
footage, 612
advanced, 176
cool, 83
dreams, 532
ghosts, 319
pictures, 455
experienced, 300
eyes, 97
photo, 1113
moved, 254
phenomena, 273
phenomenon, 220
air, 298
image, 174
happening, 116
spirit, 470
travel, 305
video, 1790
dark, 384
bed, 328
reports, 95
walking, 138
beings, 233
ET, 562
shadow, 449
nThe, 0
Looks, 36
board, 151
scared, 322
night, 1327
bright, 348
house, 1054
spirits, 369
photos, 511
Very, 42
sitting, 42
lived, 51
story, 324
thats, 127
video, 1790
speed, 101
window, 366
plane, 258
creepy, 444
shape, 397
cameras, 302
wake, 180
sighting, 1073
passed, 24
eye, 58
woke, 267
activity, 64
dad, 89
film, 479
Sounds, 5
feet, 43
fake, 698
standing, 33
happened, 569
UFO, 3604
fly, 648
ufo, 721
voice, 95
night, 1327
circles, 122
lol, 310
seconds, 135
extraterrestrial, 267
experience, 342
paralysis, 332
aircraft, 247
room, 273
brother, 29
haunted, 335
youtube, 30
story, 324
Ghost, 238
spot, 79
paranormal, 1871
house, 1054
scary, 136
distance, 176
nIf, 0
witness, 495
freaked, 236
witnesses, 224
music, 34
weather, 9
images, 125
cant, 78
NASA, 60
walked, 52
sky, 769
floating, 168
noise, 251
disclosure, 254
miles, 78
civilization, 125
Ouija, 175
record, 133
visit, 217
audio, 113
appeared, 103
incident, 91
slowly, 24
stars, 84
glitch, 602
corner, 141
orbs, 254
lens, 282
visiting, 83
town, 36
camera, 1611
location, 205
hoax, 380
visited, 97
aliens, 794
light, 1910
ship, 144
recording, 248
abduction, 239
experience, 342
UFOs, 1097
floor, 32
driving, 19
didnt, 119
UFO, 3604
project, 19
communicate, 29
radar, 77
visible, 54
ball, 480
planes, 75
street, 30
flash, 377
room, 273
sharing, 271
balloon, 539
presence, 26
entity, 140
filmed, 193
sleeping, 70
witnessed, 138
Aliens, 95
reflection, 260
lucid, 135
digital, 138
light, 1910
entities, 172
recorded, 74
fake, 698
memories, 51
aliens, 794
flight, 51