Compare commits

...

5 Commits

Author SHA1 Message Date
ksanu
12b6ee8fc6 Fixed output writing 2020-03-29 00:27:17 +01:00
ksanu
8d1e133c8e ISI-9 ready-made, MultinomialNB 2020-03-23 18:44:38 +01:00
ksanu
c05e1e4df7 ISI-9 ready-made, MultinomialNB 2020-03-23 18:43:43 +01:00
ksanu
f9b346e3fb Init branch ISI-9 2020-03-23 17:22:01 +01:00
ksanu
24dd877b29 ISI-2 naive bayes, self-made tokenizer 2020-03-23 13:24:57 +01:00
16 changed files with 10499 additions and 53848 deletions

4
.gitignore vendored
View File

@ -1,4 +1,5 @@
in.tsv
model.pkl
*~
*.swp
*.bak
@ -6,3 +7,4 @@
*.o
.DS_Store
.token
.idea

2
.idea/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
# Default ignored files
/workspace.xml

View File

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

4
.idea/misc.xml Normal file
View File

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7" project-jdk-type="Python SDK" />
</project>

8
.idea/modules.xml Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/paranormal-or-skeptic.iml" filepath="$PROJECT_DIR$/.idea/paranormal-or-skeptic.iml" />
</modules>
</component>
</project>

View File

@ -0,0 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.7" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TestRunnerService">
<option name="PROJECT_TEST_RUNNER" value="Unittests" />
</component>
</module>

6
.idea/vcs.xml Normal file
View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

File diff suppressed because it is too large Load Diff

1
info.txt Normal file
View File

@ -0,0 +1 @@
Use Naive Bayes implemented by some ready-made toolkit (e.g. scikit-learn)

View File

@ -1,15 +0,0 @@
0.6920
0.6857
0.6969
0.6931
0.6927
0.6952
0.6969
0.6969
0.6959
0.6959
0.6965
0.6965
0.6965
0.6954
0.6965

View File

@ -1,20 +1,42 @@
# NOTE(review): this span is a rendered diff hunk ("@ -1,20 +1,42 @") whose +/-
# markers and indentation were stripped, so lines from the old and the new
# version of the script appear interleaved. Comments below describe each
# statement as written; which lines exist in the current file must be
# confirmed against the repository.
import re
import sys
import pandas as pd
import numpy as np
import csv
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
# Single vectorizer instance, fitted on the training texts and reused for
# the dev-0 and test-A texts further down.
count_vect = CountVectorizer()
# NOTE(review): the stdin loop below prints "P" when a line matches any of the
# listed keywords and "S" otherwise. It looks like the earlier rule-based
# classifier and is presumably a removed part of the diff -- confirm.
for line in sys.stdin:
if re.search(r'UFO|paranormal|UFOs|video|night|house|saw|camera|lights|light|alien|aliens|ghost|object|dream|sky|room|ufo|craft|happened|sightings|footage|dreams|sleep', line):
print("P")
else:
print("S")
#load data:
# Training input is tab-separated with no header row; the two columns are
# named "text" and "date". csv.QUOTE_NONE makes the parser treat quote
# characters as ordinary text.
train = pd.read_csv("train/in.tsv", delimiter="\t", header=None, names=["text","date"], quoting=csv.QUOTE_NONE)
texts = train["text"]
# Expected labels, read without a header row.
# NOTE(review): y is a one-column DataFrame rather than a 1-D Series;
# presumably scikit-learn flattens it (possibly with a warning) -- confirm.
y = pd.read_csv("train/expected.tsv", header=None)
#print(y)
#train
# Learn the vocabulary from the training texts and build their count matrix,
# then fit a multinomial Naive Bayes classifier on it.
X_train_counts = count_vect.fit_transform(texts)
clf = MultinomialNB().fit(X_train_counts, y)
# Debug output: first training text and the number of texts and label rows.
print(texts[0])
print(len(texts))
print(len(y))
#predict
# Load only the "text" column of the dev-0 and test-A inputs, with the same
# parsing options as the training file.
dev0 = pd.read_csv("dev-0/in.tsv", delimiter="\t", header=None, names=["text","date"], quoting=csv.QUOTE_NONE)["text"]
testA = pd.read_csv("test-A/in.tsv", delimiter="\t", header=None, names=["text","date"], quoting=csv.QUOTE_NONE)["text"]
# transform (not fit_transform): reuse the vocabulary learned from training.
dev0_new_counts = count_vect.transform(dev0)
testA_new_counts = count_vect.transform(testA)
predicted_dev0 = clf.predict(dev0_new_counts)
predicted_testA = clf.predict(testA_new_counts)
# Debug output: number of dev-0 inputs and number of dev-0 predictions.
print(len(dev0))
print(len(predicted_dev0))
# NOTE(review): everything between the two triple-quote lines below is a bare
# string literal, so the dev-0 writer inside it does not run. The keyword list
# inside it looks like leftover text from the earlier regex version -- confirm.
"""
with open("dev-0/out.tsv", "w") as out1:
for line in predicted_dev0:
out1.write(line)
out1.write("\n")
happened|sightings|footage|dreams|sleep|videos|experiences|weird|objects|flying|strange|ET|photo|moving|fake|sighting|door|ghosts|looks|bed|spirits|paralysis|pictures|glitch|shadow|picture|space|photos|looked|phenomena|contact|spirit|stories|phenomenon|window|ufos|haunted|lol|creepy|lanterns|dark|scared|cameras|balloon|seen|beings|disclosure|story
"""
# Write the test-A predictions to test-A/out.tsv, one label per line.
with open("test-A/out.tsv", "w") as out2:
for line in predicted_testA:
out2.write(line)
out2.write("\n")

View File

@ -1,4 +0,0 @@
# NOTE(review): shown under "@ -1,4 +0,0 @", i.e. this script is deleted by the compared commits.
# Decompress the dev-0 input, pipe it through solution.py, and save stdout as the dev-0 output.
xzcat dev-0/in.tsv.xz | python3 solution.py > dev-0/out.tsv
# Same pipeline for the test-A input.
xzcat test-A/in.tsv.xz | python3 solution.py > test-A/out.tsv
# Append geval's output for dev-0 to scores.txt (">>" appends rather than overwrites).
geval -t dev-0 >>scores.txt

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@ -1,4 +0,0 @@
# NOTE(review): shown under "@ -1,4 +0,0 @", i.e. this script is deleted by the compared commits.
# Put each expected label in front of its decompressed input line, keep the lines
# matching '^ P.*', split them into runs of alphabetic characters (one per line),
# and store the sorted result in sortedP.
xzcat in.tsv.xz | paste expected.tsv - | egrep -o '^ P.*'| egrep -o '[[:alpha:]]+' | sort > sortedP
# Same extraction for the lines matching '^ S.*', stored in sortedS.
xzcat in.tsv.xz | paste expected.tsv - | egrep -o '^ S.*'| egrep -o '[[:alpha:]]+' | sort > sortedS
# comm -23 drops the "only in sortedS" and "in both" columns, leaving lines found only in sortedP.
comm -23 sortedP sortedS > PsetsubtractionS
# Count adjacent duplicate lines and list them by count, highest first.
cat PsetsubtractionS | uniq -c | sort -nr > PsetsubtractionS_counted.txt