Compare commits

..

No commits in common. "ISI-001" and "master" have entirely different histories.

22 changed files with 294861 additions and 606840 deletions

2
.idea/.gitignore vendored
View File

@ -1,2 +0,0 @@
# Default ignored files
/workspace.xml

View File

@ -1,8 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.8" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

View File

@ -1,6 +0,0 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

View File

@ -1,7 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="JavaScriptSettings">
<option name="languageLevel" value="ES6" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8" project-jdk-type="Python SDK" />
</project>

View File

@ -1,8 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/Pierwsze.iml" filepath="$PROJECT_DIR$/.idea/Pierwsze.iml" />
</modules>
</component>
</project>

View File

@ -1,6 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

Binary file not shown.

38
code.py
View File

@ -1,21 +1,15 @@
from collections import defaultdict from collections import defaultdict
import math import math
import pickle import pickle
import re
open_file=('test-A/out.tsv')
#---------------TRAIN START
#Prawdopodobienstwo wylosowania dokumentu
def calc_class_logprob(expected_path): def calc_class_logprob(expected_path):
paranormal_classcount=0 paranormal_classcount=0
skeptic_classcount=0 skeptic_classcount=0
with open(expected_path,encoding='utf-8') as f: with open(expected_path) as f:
for line in f: for line in f:
if '1' in line: if 'P' in line:
paranormal_classcount += 1 paranormal_classcount += 1
if '0' in line: if 'S' in line:
skeptic_classcount += 1 skeptic_classcount += 1
paranormal_prob = paranormal_classcount / (paranormal_classcount + skeptic_classcount) paranormal_prob = paranormal_classcount / (paranormal_classcount + skeptic_classcount)
@ -26,25 +20,15 @@ def calc_class_logprob(expected_path):
def calc_word_count(in_path, expected_path): def calc_word_count(in_path, expected_path):
word_counts = {'paranormal':defaultdict(int), 'skeptic': defaultdict(int)} word_counts = {'paranormal':defaultdict(int), 'skeptic': defaultdict(int)}
with open(in_path,encoding='utf-8') as in_file, open(expected_path,encoding='utf-8') as expected_file: with open(in_path) as in_file, open(expected_path) as expected_file:
for line, exp in zip(in_file, expected_file): for line, exp in zip(in_file, expected_file):
class_ = exp.rstrip('\n').replace(' ','') class_ = exp.rstrip('\n').replace(' ','')
text, timestamp = line.rstrip('\n').split('\t') text, timestamp = line.rstrip('\n').split('\t')
text = text.lower() tokens = text.lower().split(' ')
text = re.sub(r'\\n+', " ", text)
text = re.sub(r'http\S+', " ", text)
text = re.sub(r'\/[a-z]\/', " ", text)
text = re.sub(r'[^a-z]', " ", text)
text = re.sub(r'\s{2,}', " ", text)
text = re.sub(r'\W\w{1,3}\W|\A\w{1,3}\W', " ", text)
text = re.sub(r'\W\w{1,3}\W|\A\w{1,3}\W', " ", text)
text = re.sub(r'\W\w{1,3}\W|\A\w{1,3}\W', " ", text)
text = re.sub(r'^\s', "", text)
tokens = text.split(' ')
for token in tokens: for token in tokens:
if class_ == '1': if class_ == 'P':
word_counts['paranormal'][token] += 1 word_counts['paranormal'][token] += 1
elif class_ == '0': elif class_ == 'S':
word_counts['skeptic'][token] += 1 word_counts['skeptic'][token] += 1
return word_counts return word_counts
@ -62,14 +46,12 @@ def calc_word_logprobs(word_counts):
word_logprobs[class_][token] = math.log(word_prob) word_logprobs[class_][token] = math.log(word_prob)
return word_logprobs return word_logprobs
#--------------- TRAIN END
def main(): def main():
paranomal_class_logprob, skeptic_class_logprob = calc_class_logprob("train/expected.tsv") paranomal_class_logprob, skeptic_class_logprob = calc_class_logprob("F:/UAM/SEMESTR_I_MGR/SYSTEMY_INTELIGENTNE/ic4g/train/expected.tsv")
word_counts=calc_word_count("train/in.tsv","train/expected.tsv") word_counts=calc_word_count("F:/UAM/SEMESTR_I_MGR/SYSTEMY_INTELIGENTNE/ic4g/train/in.tsv","F:/UAM/SEMESTR_I_MGR/SYSTEMY_INTELIGENTNE/ic4g/train/expected.tsv")
word_counts['paranormal'][''] = 0
word_counts['skeptic'][''] = 0
word_logprobs = calc_word_logprobs(word_counts) word_logprobs = calc_word_logprobs(word_counts)
pickle.dump([paranomal_class_logprob, skeptic_class_logprob, word_logprobs], open('naive_base_model.pkl','wb')) pickle.dump([paranomal_class_logprob, skeptic_class_logprob, word_logprobs], open('naive_base_model.pkl','wb'))
main() main()

View File

@ -1,46 +0,0 @@
from collections import defaultdict
import math
import pickle
import re
# Load the trained model written by the training script: a 3-item list of
# [paranormal class log-prior, skeptic class log-prior, per-class token log-probs].
# The `with` block closes the file handle as soon as the model is loaded.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load a model file that this project produced itself.
with open('naive_base_model.pkl', 'rb') as open_file:
    pickle_loaded = pickle.load(open_file)
paranomal_class_logprob, skeptic_class_logprob, word_logprobs = pickle_loaded
def prediction(input, output):
    """Classify each line of *input* and write one label per line to *output*.

    Every input line must consist of exactly two tab-separated fields
    (text, timestamp); only the text is used.  The text is lower-cased,
    cleaned with the regex pipeline below, split on single spaces, and scored
    against the module-level model (``paranomal_class_logprob``,
    ``skeptic_class_logprob``, ``word_logprobs``).

    Writes ``'1'`` when the paranormal score is strictly greater than the
    skeptic score, otherwise ``'0'``.

    Raises:
        ValueError: if a line does not split into exactly two fields.
    """
    # Fallback log-probabilities for tokens the model has never seen.
    # NOTE(review): the origin of these constants is not visible here;
    # presumably hand-tuned smoothing values -- confirm against training.
    unseen_paranormal_logprob = -14.78
    unseen_skeptic_logprob = -15.6

    paranormal_logprobs = word_logprobs['paranormal']
    skeptic_logprobs = word_logprobs['skeptic']

    # Output is opened first so it is created/truncated even when the input
    # cannot be opened; both handles are closed when the block exits.
    with open(output, 'w') as output_file, \
            open(input, encoding='utf-8') as in_file:
        for line in in_file:
            text, _timestamp = line.rstrip('\n').split('\t')
            text = text.lower()
            text = re.sub(r'\\n+', " ", text)          # literal "\n" escape sequences
            text = re.sub(r'http\S+', " ", text)       # URLs
            text = re.sub(r'\/[a-z]\/', " ", text)     # "/r/"-style path prefixes
            text = re.sub(r'[^a-z]', " ", text)        # anything that is not a-z
            text = re.sub(r'\s{2,}', " ", text)        # collapse whitespace runs
            # Drop words of 1-3 characters.  A single pass consumes the
            # delimiter shared by two adjacent short words, so the same
            # substitution is applied three times.
            for _ in range(3):
                text = re.sub(r'\W\w{1,3}\W|\A\w{1,3}\W', " ", text)
            text = re.sub(r'^\s', "", text)
            tokens = text.split(' ')

            # Naive Bayes score: the class log-prior counted exactly once,
            # plus the log-probability of every token.
            paranormal_score = paranomal_class_logprob
            skeptic_score = skeptic_class_logprob
            for token in tokens:
                # .get() supplies the fallback without writing it into the
                # shared model dictionaries.
                paranormal_score += paranormal_logprobs.get(
                    token, unseen_paranormal_logprob)
                skeptic_score += skeptic_logprobs.get(
                    token, unseen_skeptic_logprob)

            if paranormal_score > skeptic_score:
                output_file.write('1\n')
            else:
                output_file.write('0\n')
##
def main():
    """Run prediction on the dev-0 set and then on the test-A set."""
    # (input path, output path) pairs, processed in this order.
    jobs = (
        ('dev-0/in.tsv', 'dev-0/out.tsv'),
        ('test-A/in.tsv/in.tsv', 'test-A/out.tsv'),
    )
    for source_path, target_path in jobs:
        prediction(source_path, target_path)


main()

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

BIN
model.pkl

Binary file not shown.

Binary file not shown.

1439
out.tsv

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +0,0 @@
Y_hat: -0.0005393564509196473
Y_hat: -3.9997270479357687
Y_hat: -4.058054620846149
Y_hat: -4.948997989328446
Y_hat: -5.607522681904628
Y_hat: -5.510496552820199
1 Y_hat: -0.0005393564509196473
2 Y_hat: -3.9997270479357687
3 Y_hat: -4.058054620846149
4 Y_hat: -4.948997989328446
5 Y_hat: -5.607522681904628
6 Y_hat: -5.510496552820199

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

View File

View File

@ -1,6 +0,0 @@
0
0
0
0
0
0
1 0
2 0
3 0
4 0
5 0
6 0

File diff suppressed because it is too large Load Diff

289579
train/in.tsv

File diff suppressed because one or more lines are too long