Compare commits


No commits in common. "ISI-2019-013" and "master" have entirely different histories.

18 changed files with 294908 additions and 605499 deletions

.idea/.gitignore (vendored, 2 changed lines)

@@ -1,2 +0,0 @@
# Default ignored files
/workspace.xml

.idea/Pierwsze.iml

@@ -1,8 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="jdk" jdkName="Python 3.7 (PyEnv)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>

.idea/inspectionProfiles/profiles_settings.xml

@@ -1,6 +0,0 @@
<component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
</component>

.idea/misc.xml

@@ -1,7 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="JavaScriptSettings">
    <option name="languageLevel" value="ES6" />
  </component>
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (PyEnv)" project-jdk-type="Python SDK" />
</project>

.idea/modules.xml

@@ -1,8 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/Pierwsze.iml" filepath="$PROJECT_DIR$/.idea/Pierwsze.iml" />
    </modules>
  </component>
</project>

.idea/vcs.xml

@@ -1,6 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$" vcs="Git" />
  </component>
</project>

code.py (new file, 57 lines)

@@ -0,0 +1,57 @@
from collections import defaultdict
import math
import pickle

def calc_class_logprob(expected_path):
    # Class priors from the expected labels ('P' = paranormal, 'S' = skeptic).
    paranormal_classcount = 0
    skeptic_classcount = 0
    with open(expected_path) as f:
        for line in f:
            if 'P' in line:
                paranormal_classcount += 1
            elif 'S' in line:
                skeptic_classcount += 1
    total = paranormal_classcount + skeptic_classcount
    return math.log(paranormal_classcount / total), math.log(skeptic_classcount / total)

def calc_word_count(in_path, expected_path):
    # Per-class token frequencies over the training corpus.
    word_counts = {'paranormal': defaultdict(int), 'skeptic': defaultdict(int)}
    with open(in_path) as in_file, open(expected_path) as expected_file:
        for line, exp in zip(in_file, expected_file):
            class_ = exp.rstrip('\n').replace(' ', '')
            text, timestamp = line.rstrip('\n').split('\t')
            tokens = text.lower().split(' ')
            for token in tokens:
                if class_ == 'P':
                    word_counts['paranormal'][token] += 1
                elif class_ == 'S':
                    word_counts['skeptic'][token] += 1
    return word_counts

def calc_word_logprobs(word_counts):
    # Add-one (Laplace) smoothing: the vocabulary size is added to each class total.
    total_skeptic = sum(word_counts['skeptic'].values()) + len(word_counts['skeptic'])
    total_paranormal = sum(word_counts['paranormal'].values()) + len(word_counts['paranormal'])
    word_logprobs = {'paranormal': {}, 'skeptic': {}}
    for class_ in word_counts.keys():
        for token, count in word_counts[class_].items():
            total = total_skeptic if class_ == 'skeptic' else total_paranormal
            word_logprobs[class_][token] = math.log((count + 1) / total)
    return word_logprobs

def main():
    paranormal_class_logprob, skeptic_class_logprob = calc_class_logprob(
        "F:/UAM/SEMESTR_I_MGR/SYSTEMY_INTELIGENTNE/ic4g/train/expected.tsv")
    word_counts = calc_word_count(
        "F:/UAM/SEMESTR_I_MGR/SYSTEMY_INTELIGENTNE/ic4g/train/in.tsv",
        "F:/UAM/SEMESTR_I_MGR/SYSTEMY_INTELIGENTNE/ic4g/train/expected.tsv")
    word_logprobs = calc_word_logprobs(word_counts)
    with open('naive_base_model.pkl', 'wb') as f:
        pickle.dump([paranormal_class_logprob, skeptic_class_logprob, word_logprobs], f)

main()
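
For orientation, here is a minimal sketch of how the pickled model above could be applied at prediction time. It assumes only the pickle layout written by code.py ([paranormal_class_logprob, skeptic_class_logprob, word_logprobs]); classify_line is a hypothetical helper, not part of this diff, and it simply skips tokens unseen in training:

import math
import pickle

# Assumed layout, as written by code.py above.
with open('naive_base_model.pkl', 'rb') as f:
    paranormal_logprob, skeptic_logprob, word_logprobs = pickle.load(f)

def classify_line(text):
    # Naive Bayes decision: log prior plus summed token log-likelihoods,
    # evaluated independently for each class.
    scores = {'paranormal': paranormal_logprob, 'skeptic': skeptic_logprob}
    for token in text.lower().split(' '):
        for class_ in scores:
            if token in word_logprobs[class_]:
                scores[class_] += word_logprobs[class_][token]
    return max(scores, key=scores.get)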

Deleted file:

@@ -1,123 +0,0 @@
import random
import re
from collections import defaultdict

def define_vocabulary(file_to_learn_new_words):
    # Raw token counts over the whole training corpus.
    word_counts = {'count': defaultdict(int)}
    with open(file_to_learn_new_words, encoding='utf-8') as in_file:
        for line in in_file:
            text, timestamp = line.rstrip('\n').split('\t')
            tokens = text.lower().split(' ')
            for token in tokens:
                word_counts['count'][token] += 1
    return word_counts

def tokenize_list(string_input):
    # Strip URLs, "\n" escapes, non-letters, short words and extra spaces,
    # then split on the remaining separators and drop empty or junk tokens.
    string = string_input.replace('\\n', ' ')
    text = re.sub(r'\\n+', " ", string)
    text = re.sub(r'http\S+', " ", text)
    text = re.sub(r'\/[a-z]\/', " ", text)
    text = re.sub(r'[^a-z]', " ", text)
    text = re.sub(r'\s{2,}', " ", text)
    text = re.sub(r'\W\w{1,3}\W|\A\w{1,3}\W', " ", text)
    text = re.sub(r'^\s', "", text)
    words = re.split(
        r';+|,+|\*+|\n+| +|\_+|\%+|\t+|\[+|\]+|\.+|\(+|\)+|\++|\\+|\/+'
        r'|[0-9]+|\#+|\'+|\"+|\-+|\=+|\&+|\:+|\?+|\!+|\^+|\·+', text)
    regex = re.compile(r'http|^[a-zA-Z]$|org')
    filtered_values = [word for word in words if not regex.match(word)]
    filtered_values[:] = (value.lower()
                          for value in filtered_values if len(value) != 0)
    return filtered_values

def read_words(input_path):
    # Unused by main(); counts tokens per file.
    vocabulary = {'count': defaultdict(int)}
    with open(input_path, encoding='utf-8') as infile:
        for line in infile:
            tokens = tokenize_list(line)
            for token in tokens:
                vocabulary['count'][token] += 1
    return vocabulary

def train(vocabulary, input_train, expected_train):
    learning_rate = 0.00001
    words_vocabulary = {}
    with open(input_train, encoding='utf-8') as input_file, \
            open(expected_train, encoding='utf-8') as expected_file:
        for line, exp in zip(input_file, expected_file):
            words_vocabulary[line] = int(exp)
    weights = {}
    weight = {}
    iteration = 0
    loss_sum = 0.0
    error = 10.0
    max_iteration = 10000
    for i in vocabulary['count'].keys():
        weights[i] = random.uniform(-0.01, 0.01)
    while iteration < max_iteration:
        # Pick one (document, label) pair and take a single SGD step on the
        # squared error of y_hat = sum(w[token] * count(token)).
        d, y = random.choice(list(words_vocabulary.items()))
        y_hat = 0
        tokens = tokenize_list(d)
        for token in tokens:
            if token in vocabulary['count'].keys():
                y_hat += weights[token] * tokens.count(token)
        delta = (y_hat - y) * learning_rate
        for word in tokens:
            if word in weights:  # was `words_vocabulary`, which never matched a token
                weights[word] -= tokens.count(word) * delta
        loss = (y_hat - y) ** 2.0
        loss_sum += loss
        if iteration % 1000 == 0:
            # Keep the best weights seen so far, by mean loss per 1000 steps.
            if error > (loss_sum / 1000):
                weight = weights.copy()  # snapshot, not a live reference
                error = loss_sum / 1000
            loss_sum = 0.0
        iteration += 1
    return weight, vocabulary

def prediction(input, output, weights, vocabulary):
    with open(input, encoding='utf-8') as input_file, \
            open(output, 'w+', encoding='utf-8') as output_file:
        for line in input_file:
            y_hat = 0
            tokens = tokenize_list(line)
            for token in tokens:
                if token in vocabulary['count'].keys():
                    y_hat += weights[token] * tokens.count(token)  # was token.count(token), always 1
            if y_hat > 0.5:
                output_file.write('1\n')
            else:
                output_file.write('0\n')

def main():
    vocabulary = define_vocabulary('train/in.tsv')
    weights, words = train(vocabulary, 'train/in.tsv', 'train/expected.tsv')
    prediction('dev-0/in.tsv', 'dev-0/out.tsv', weights, words)
    prediction('test-A/in.tsv', 'test-A/out.tsv', weights, words)

main()
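
As a side note, the training loop above is single-example stochastic gradient descent on squared error. A self-contained toy sketch of one update step, with made-up tokens and counts (not data from this repo):

import random

# y_hat = sum(w[t] * count(t)); loss = (y_hat - y)^2. The code above folds
# the gradient's constant factor of 2 into the learning rate.
learning_rate = 0.00001
weights = {'ghost': random.uniform(-0.01, 0.01),
           'science': random.uniform(-0.01, 0.01)}
sample_counts = {'ghost': 3, 'science': 1}  # hypothetical token counts
y = 1                                       # gold label for this sample

y_hat = sum(weights[t] * c for t, c in sample_counts.items())
delta = (y_hat - y) * learning_rate
for t, c in sample_counts.items():
    weights[t] -= c * delta                 # nudge y_hat toward y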

Deleted file:

@@ -1,61 +0,0 @@
import pickle
import re

def calculate_words(linetxt):
    # Token frequency for a single (already cleaned) line.
    word_counts = {}
    tokens = linetxt.split(' ')
    for token in tokens:
        if token in word_counts.keys():
            word_counts[token] += 1
        else:
            word_counts[token] = 1
    word_counts[''] = 1
    return word_counts

def tokenize_list(string_input):
    # Same cleanup pipeline as in the deleted training script: drop URLs,
    # "\n" escapes, non-letters, short words and redundant whitespace.
    string = string_input.replace('\\n', ' ')
    text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', string)
    text = re.sub(r'\\n+', " ", text)
    text = re.sub(r'http\S+', " ", text)
    text = re.sub(r'\/[a-z]\/', " ", text)
    text = re.sub(r'[^a-z]', " ", text)
    text = re.sub(r'\s{2,}', " ", text)
    text = re.sub(r'\W\w{1,3}\W|\A\w{1,3}\W', " ", text)
    text = re.sub(r'^\s', "", text)
    return text

def prediction(input, output):
    # The pickle holds a weight vector, a token-to-index map, and the
    # training vocabulary.
    loaded_model = pickle.load(open('model_linear_reg.pkl', 'rb'))
    weights, word, vocabulary = loaded_model
    output_f = open(output, 'w')
    with open(input, encoding='utf-8') as input_f:
        for line in input_f:
            text, timestamp = line.rstrip('\n').split('\t')
            tokens = tokenize_list(text.lower())
            line_vocabulary = calculate_words(tokens)
            tokens = tokens.split(' ')
            y_hat = weights[0]  # weights[0] acts as the bias term
            for token in tokens:
                if token in vocabulary.keys():
                    y_hat += weights[word[token]] * line_vocabulary[token]
            if y_hat > 0.5:
                output_f.write("1\n")
            else:
                output_f.write("0\n")
    output_f.close()

def main():
    prediction("dev-0/in.tsv", "dev-0/out.tsv")
    prediction("test-A/in.tsv", "test-A/out.tsv")

main()
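
The three objects unpickled above (weights, word, vocabulary) imply that weights[0] is a bias and word maps each token to its weight index. A minimal sketch of the same scoring logic on a hypothetical model (all values made up):

# Hypothetical model in the layout expected from model_linear_reg.pkl.
weights = [0.1, 0.7, -0.3]                 # weights[0] is the bias
word = {'ghost': 1, 'science': 2}          # token -> weight index
vocabulary = {'ghost': 12, 'science': 30}  # training-time counts

tokens = ['ghost', 'ghost', 'science']
line_counts = {t: tokens.count(t) for t in set(tokens)}

y_hat = weights[0]
for token, count in line_counts.items():
    if token in vocabulary:
        y_hat += weights[word[token]] * count
label = 1 if y_hat > 0.5 else 0            # same 0.5 threshold as above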

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

Binary file not shown.

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

File diff suppressed because it is too large

train/in.tsv (289579 changed lines)

File diff suppressed because one or more lines are too long