Compare commits

...

48 Commits

Author SHA1 Message Date
978a98936f laptop commit linear regression 2020-06-08 19:21:35 +02:00
53cd39d670 laptop commit linear regression 2020-06-08 19:11:20 +02:00
0c3e331712 laptop commit linear regression 2020-05-03 00:21:08 +02:00
25288337b6 laptop commit linear regression 2020-05-02 22:10:18 +02:00
7463017086 laptop commit linear regression 2020-05-02 21:41:03 +02:00
f01edb9559 laptop commit linear regression 2020-05-02 21:29:27 +02:00
5df01c9b41 laptop commit linear regression 2020-05-02 21:24:44 +02:00
dfa4304d9c laptop commit linear regression 2020-05-02 20:51:34 +02:00
689232cfef laptop commit linear regression 2020-05-02 20:49:03 +02:00
44cc9969ac laptop commit linear regression 2020-05-02 20:42:52 +02:00
71982de58b laptop commit linear regression 2020-05-02 20:40:03 +02:00
7ff1d64029 laptop commit linear regression 2020-05-02 20:35:07 +02:00
7305f82a9a laptop commit linear regression 2020-05-02 20:31:53 +02:00
468db5f757 laptop commit linear regression 2020-05-02 20:28:50 +02:00
37e5b270f2 laptop commit linear regression 2020-05-02 20:24:35 +02:00
1e6727b14d laptop commit linear regression 2020-05-02 20:21:22 +02:00
6340b70b31 laptop commit linear regression 2020-05-02 20:19:20 +02:00
40ad0c9b7b laptop commit linear regression 2020-05-02 20:18:40 +02:00
d42af9403b laptop commit linear regression 2020-05-02 20:17:32 +02:00
28cb1c6dc2 laptop commit linear regression 2020-05-02 20:15:48 +02:00
38cc40267c laptop commit linear regression 2020-05-02 20:13:19 +02:00
0e8e75e917 laptop commit linear regression 2020-05-02 20:10:21 +02:00
4e9e0f774c laptop commit fixed naive Bayes 2020-05-02 20:07:06 +02:00
Bartusiak
42b4d5a1ae Rewrite linear regression (0/1) 2020-04-20 19:09:43 +02:00
Bartusiak
f5b038ce64 Rewrite linear regression (0/1) 2020-04-20 19:07:31 +02:00
Bartusiak
8d2a814d44 Rewrite linear regression 2020-04-09 00:23:08 +02:00
Bartusiak
fa4c673309 Regression 2020-04-08 15:15:48 +02:00
Bartusiak
3cfd2bd792 Regression 2020-04-08 15:12:56 +02:00
Bartusiak
0e0d33afb4 Regression 2020-04-06 19:11:16 +02:00
Bartusiak
72f56d6b42 Regression 2020-04-05 20:10:04 +02:00
c7241d862d Commit to move project to PC 2020-04-05 00:34:05 +02:00
Bartusiak
a3ecdde87b Created vocabulary 2020-04-04 19:55:07 +02:00
Bartusiak
2dcb39fdde Created vocabulary 2020-04-02 20:01:33 +02:00
Bartusiak
a546cd9958 Created vocabulary 2020-04-02 18:29:06 +02:00
Bartusiak
65fbcd275f New branch 2020-04-02 18:28:01 +02:00
Bartusiak
db710a4df8 New branch 2020-04-02 12:44:08 +02:00
Bartusiak
caca661287 Create new prediction, because I forgot about the other in.tsv 2020-03-31 17:09:32 +02:00
Bartusiak
02b281edc8 Fixed problem with KeyError 2020-03-31 14:50:58 +02:00
Bartusiak
7ffbacd865 Fixed problem with KeyError 2020-03-31 14:47:37 +02:00
Bartusiak
5d7f903e18 Fixed problem with KeyError 2020-03-31 14:41:37 +02:00
Bartusiak
c6dff78d16 Fixed problem with KeyError 2020-03-31 01:34:48 +02:00
Bartusiak
d128feed46 Fixed problem with KeyError 2020-03-31 01:14:53 +02:00
Bartusiak
8946cee780 Fixed problem with KeyError 2020-03-31 01:10:38 +02:00
Bartusiak
f290964067 Added file code_prediction, which creates out.tsv and saves the results to the dev-0 and test-A folders 2020-03-29 01:04:18 +01:00
Bartusiak
082c69e025 Added file code_prediction, which creates out.tsv and saves the results to the dev-0 and test-A folders 2020-03-29 00:58:08 +01:00
Bartusiak
4951a7cc47 Added file code_prediction, which creates out.tsv and saves the results to the dev-0 and test-A folders 2020-03-29 00:37:57 +01:00
Bartusiak
5c036684c8 First comment 2020-03-28 20:40:28 +01:00
Bartusiak
0774e8bc17 First comment 2020-03-23 20:24:24 +01:00
18 changed files with 605499 additions and 294908 deletions

2
.idea/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
# Default ignored files
/workspace.xml

8
.idea/Pierwsze.iml Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.7 (PyEnv)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

View File

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

7
.idea/misc.xml Normal file
View File

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="JavaScriptSettings">
<option name="languageLevel" value="ES6" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (PyEnv)" project-jdk-type="Python SDK" />
</project>

8
.idea/modules.xml Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/Pierwsze.iml" filepath="$PROJECT_DIR$/.idea/Pierwsze.iml" />
</modules>
</component>
</project>

6
.idea/vcs.xml Normal file
View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

57
code.py
View File

@ -1,57 +0,0 @@
from collections import defaultdict
import math
import pickle
def calc_class_logprob(expected_path):
    """Estimate the log prior of both classes from a label file.

    Every line of *expected_path* is checked for the letters ``P``
    (paranormal) and ``S`` (skeptic); a line containing both letters is
    counted once for each class.

    Args:
        expected_path: path to the UTF-8 text file holding one label per line.

    Returns:
        A ``(paranormal_logprob, skeptic_logprob)`` tuple of natural logs.

    Raises:
        ValueError: if either class never occurs (this includes an empty
            file), because its log prior would be undefined.
    """
    paranormal_classcount = 0
    skeptic_classcount = 0
    with open(expected_path, encoding='utf-8') as f:
        for line in f:
            if 'P' in line:
                paranormal_classcount += 1
            if 'S' in line:
                skeptic_classcount += 1

    # Fail with a readable message instead of ZeroDivisionError (no labels
    # at all) or a bare "math domain error" (one class missing).
    if paranormal_classcount == 0 or skeptic_classcount == 0:
        raise ValueError(
            'expected file must contain both classes: found %d "P" and %d "S" lines'
            % (paranormal_classcount, skeptic_classcount)
        )

    total = paranormal_classcount + skeptic_classcount
    return (
        math.log(paranormal_classcount / total),
        math.log(skeptic_classcount / total),
    )
def calc_word_count(in_path, expected_path):
    """Count lower-cased tokens per class.

    The two files are read in lockstep: line *i* of *in_path* is a document
    and line *i* of *expected_path* is its label. Labels are compared after
    removing the newline and all spaces; ``P`` feeds the ``'paranormal'``
    counter, ``S`` the ``'skeptic'`` counter, anything else is skipped.

    The document text is everything before the last tab of the line (the
    trailing field is treated as the timestamp); a line without a tab is
    used whole. Tokens are produced by splitting the lower-cased text on
    single spaces.

    Args:
        in_path: path to the UTF-8 documents file.
        expected_path: path to the UTF-8 labels file.

    Returns:
        ``{'paranormal': defaultdict(int), 'skeptic': defaultdict(int)}``
        mapping each token to its number of occurrences in that class.
    """
    word_counts = {'paranormal': defaultdict(int), 'skeptic': defaultdict(int)}
    class_names = {'P': 'paranormal', 'S': 'skeptic'}
    with open(in_path, encoding='utf-8') as in_file, \
            open(expected_path, encoding='utf-8') as expected_file:
        for line, exp in zip(in_file, expected_file):
            class_name = class_names.get(exp.rstrip('\n').replace(' ', ''))
            if class_name is None:
                continue
            # rsplit tolerates lines with no tab or with tabs inside the text,
            # where a strict two-way unpack would raise ValueError.
            text = line.rstrip('\n').rsplit('\t', 1)[0]
            counts = word_counts[class_name]
            for token in text.lower().split(' '):
                counts[token] += 1
    return word_counts
def calc_word_logprobs(word_counts):
    """Turn per-class token counts into add-one smoothed log probabilities.

    For each class the denominator is the total number of token
    occurrences in that class plus the number of distinct tokens in that
    class, and each token gets ``log((count + 1) / denominator)``.

    Args:
        word_counts: mapping ``class name -> {token: count}``.

    Returns:
        ``{class name: {token: log probability}}``. The ``'paranormal'``
        and ``'skeptic'`` keys are always present, even when empty.
    """
    word_logprobs = {'paranormal': {}, 'skeptic': {}}
    for class_, counts in word_counts.items():
        denominator = sum(counts.values()) + len(counts)
        word_logprobs[class_] = {
            token: math.log((count + 1) / denominator)
            for token, count in counts.items()
        }
    return word_logprobs
def main():
    """Train the naive Bayes model and pickle it to ``naive_base_model.pkl``.

    The pickle holds the list
    ``[paranormal_class_logprob, skeptic_class_logprob, word_logprobs]``.
    """
    in_path = "F:/UAM/SEMESTR_I_MGR/SYSTEMY_INTELIGENTNE/ic4g/train/in.tsv"
    expected_path = "F:/UAM/SEMESTR_I_MGR/SYSTEMY_INTELIGENTNE/ic4g/train/expected.tsv"

    paranormal_class_logprob, skeptic_class_logprob = calc_class_logprob(expected_path)
    word_counts = calc_word_count(in_path, expected_path)
    word_logprobs = calc_word_logprobs(word_counts)

    # The context manager guarantees the model file is flushed and closed.
    with open('naive_base_model.pkl', 'wb') as model_file:
        pickle.dump(
            [paranormal_class_logprob, skeptic_class_logprob, word_logprobs],
            model_file,
        )


main()

123
code_regression.py Normal file
View File

@ -0,0 +1,123 @@
import random
import re
from _collections import defaultdict
def define_vocabulary(file_to_learn_new_words):
    """Count every lower-cased, space-separated token of a TSV file.

    For each line the text is everything before the last tab (the trailing
    field is treated as the timestamp); a line without a tab is used
    whole. The text is lower-cased and split on single spaces.

    Args:
        file_to_learn_new_words: path to the UTF-8 input file.

    Returns:
        ``{'count': defaultdict(int)}`` mapping each token to its number of
        occurrences.
    """
    word_counts = {'count': defaultdict(int)}
    counts = word_counts['count']
    with open(file_to_learn_new_words, encoding='utf-8') as in_file:
        for line in in_file:
            # rsplit tolerates lines with no tab or with tabs inside the text,
            # where a strict two-way unpack would raise ValueError.
            text = line.rstrip('\n').rsplit('\t', 1)[0]
            for token in text.lower().split(' '):
                counts[token] += 1
    return word_counts
def tokenize_list(string_input):
    """Split raw text into a list of cleaned, lower-case word tokens.

    Processing steps, in order:

    1. lower-case the input, so the letters-only filter below does not
       delete upper-case letters;
    2. turn literal backslash-n sequences into spaces;
    3. blank out anything starting with ``http`` and ``/x/``-style
       single-letter path segments;
    4. replace every character outside ``a-z`` with a space and collapse
       whitespace runs;
    5. blank out words of one to three characters that are delimited on
       both sides (or sit at the very start of the text);
    6. split on whitespace and drop tokens that start with ``http`` or
       ``org`` or that consist of a single letter.

    Args:
        string_input: the raw text to tokenize.

    Returns:
        The remaining tokens, in order of appearance.
    """
    text = string_input.lower().replace('\\n', ' ')
    text = re.sub(r'http\S+', " ", text)
    text = re.sub(r'\/[a-z]\/', " ", text)
    text = re.sub(r'[^a-z]', " ", text)
    text = re.sub(r'\s{2,}', " ", text)
    text = re.sub(r'\W\w{1,3}\W|\A\w{1,3}\W', " ", text)

    # Only a-z and spaces are left at this point, so a plain whitespace
    # split is sufficient and never yields empty tokens.
    rejected = re.compile(r'http|^[a-zA-Z]$|org')
    return [word for word in text.split() if not rejected.match(word)]
def read_words(input_path):
    """Build a token-count vocabulary from a file using ``tokenize_list``.

    Args:
        input_path: path to the UTF-8 input file; each line is tokenized
            independently.

    Returns:
        ``{'count': defaultdict(int)}`` mapping each token to its number of
        occurrences across the whole file.
    """
    vocabulary = {'count': defaultdict(int)}
    counts = vocabulary['count']
    with open(input_path, encoding='utf-8') as infile:
        for line in infile:
            for token in tokenize_list(line):
                # Counts go under 'count', the only key the dict is created
                # with; indexing any other key raises KeyError.
                counts[token] += 1
    return vocabulary
def train(vocabulary, input_train, expected_train,
          learning_rate=0.00001, max_iteration=10000):
    """Fit per-word weights of a linear model with stochastic gradient descent.

    Each step picks one random training line, predicts
    ``y_hat = sum(weight[word] * occurrences_of_word_in_line)`` over the
    words that are in the vocabulary, and moves those weights against the
    squared-error gradient. Every 1000 steps the mean loss of that window
    is compared with the best seen so far, and a copy of the weights is
    kept when it improved.

    Args:
        vocabulary: ``{'count': {word: ...}}``; its keys define which
            words get a weight.
        input_train: path to the UTF-8 training documents, one per line.
        expected_train: path to the UTF-8 labels, one integer per line.
        learning_rate: step size of each update.
        max_iteration: number of update steps to run.

    Returns:
        ``(weights, vocabulary)`` where *weights* maps each vocabulary word
        to a float. It is the best snapshot, or the final weights when no
        full 1000-step window was completed.

    Raises:
        ValueError: if the training files yield no examples, or a label is
            not an integer.
    """
    # Function-scope import: the file's import block does not provide Counter.
    from collections import Counter

    window = 1000
    known_words = vocabulary['count']

    # Keyed by line text, so identical lines collapse into a single example.
    words_vocabulary = {}
    with open(input_train, encoding='utf-8') as input_file, \
            open(expected_train, encoding='utf-8') as expected_file:
        for line, exp in zip(input_file, expected_file):
            words_vocabulary[line] = int(exp)
    if not words_vocabulary:
        raise ValueError('no training examples found in %r' % (input_train,))
    # Materialized once; rebuilding this list on every step is O(dataset).
    samples = list(words_vocabulary.items())

    weights = {word: random.uniform(-0.01, 0.01) for word in known_words}
    best_weights = None
    best_error = float('inf')
    loss_sum = 0.0

    for iteration in range(max_iteration):
        document, y = random.choice(samples)
        # One entry per distinct known word, so a repeated word contributes
        # weight * count exactly once.
        counts = Counter(
            token for token in tokenize_list(document) if token in known_words
        )
        y_hat = 0
        for token, count in counts.items():
            y_hat += weights[token] * count

        delta = (y_hat - y) * learning_rate
        for token, count in counts.items():
            weights[token] -= count * delta

        loss_sum += (y_hat - y) ** 2.0
        if (iteration + 1) % window == 0:
            mean_loss = loss_sum / window
            if mean_loss < best_error:
                # Copy: keeping a reference would let later steps overwrite
                # the checkpoint.
                best_weights = dict(weights)
                best_error = mean_loss
            loss_sum = 0.0

    if best_weights is None:
        best_weights = weights
    return best_weights, vocabulary
def prediction(input, output, weights, vocabulary):
    """Write a 0/1 prediction for every line of *input* to *output*.

    For each line the score is the sum of ``weights[token]`` over every
    token occurrence returned by ``tokenize_list`` that is present in
    ``vocabulary['count']``. ``1`` is written when the score exceeds 0.5,
    otherwise ``0``.

    Args:
        input: path to the UTF-8 file to classify, one document per line.
        output: path of the file to (over)write with one label per line.
        weights: mapping from word to its learned weight.
        vocabulary: ``{'count': {word: ...}}`` defining the known words.

    Raises:
        KeyError: if a vocabulary word has no entry in *weights*.
    """
    known_words = vocabulary['count']
    with open(input, encoding='utf-8') as input_file, \
            open(output, 'w', encoding='utf-8') as output_file:
        for line in input_file:
            y_hat = 0
            for token in tokenize_list(line):
                if token in known_words:
                    # Summed once per occurrence, so repeated words count
                    # proportionally to their frequency in the line.
                    y_hat += weights[token]
            output_file.write('1\n' if y_hat > 0.5 else '0\n')
def main():
    """Learn weights from ``train/`` and label the dev-0 and test-A sets."""
    vocabulary = define_vocabulary('train/in.tsv')
    weights, words = train(vocabulary, 'train/in.tsv', 'train/expected.tsv')
    datasets = (
        ('dev-0/in.tsv', 'dev-0/out.tsv'),
        ('test-A/in.tsv', 'test-A/out.tsv'),
    )
    for source_path, target_path in datasets:
        prediction(source_path, target_path, weights, words)


main()

61
code_regression2 Normal file
View File

@ -0,0 +1,61 @@
import pickle
import re
def calculate_words(linetxt):
    """Count the space-separated tokens of *linetxt*.

    Args:
        linetxt: text whose tokens are separated by single spaces.

    Returns:
        A dict mapping each token to its number of occurrences. The
        empty-string key is always present with the value 1, regardless of
        how many empty tokens the split produced (presumably it stands for
        a constant bias feature; confirm against the training code).
    """
    word_counts = {}
    for token in linetxt.split(' '):
        word_counts[token] = word_counts.get(token, 0) + 1
    word_counts[''] = 1
    return word_counts
def tokenize_list(string_input):
    """Normalize raw text to lower-case words separated by single spaces.

    Processing steps, in order:

    1. lower-case the input, so the letters-only filter below does not
       delete upper-case letters when the caller has not lower-cased;
    2. turn literal backslash-n sequences into spaces;
    3. delete ``scheme://host/path`` URLs and blank out anything else
       starting with ``http`` as well as ``/x/``-style single-letter path
       segments;
    4. replace every character outside ``a-z`` with a space and collapse
       whitespace runs;
    5. blank out words of one to three characters that are delimited on
       both sides (or sit at the very start of the text);
    6. drop one leading whitespace character, if any.

    Args:
        string_input: the raw text to clean.

    Returns:
        The cleaned text as a single string (it may end with a space).
    """
    text = string_input.lower().replace('\\n', ' ')
    text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', text)
    text = re.sub(r'http\S+', " ", text)
    text = re.sub(r'\/[a-z]\/', " ", text)
    text = re.sub(r'[^a-z]', " ", text)
    text = re.sub(r'\s{2,}', " ", text)
    text = re.sub(r'\W\w{1,3}\W|\A\w{1,3}\W', " ", text)
    text = re.sub(r'^\s', "", text)
    return text
def prediction(input, output):
    """Label every line of *input* with the pickled linear model.

    The model file ``model_linear_reg.pkl`` must unpickle to a
    ``(weights, word, vocabulary)`` triple: ``weights[0]`` is used as the
    starting score and ``weights[word[token]]`` as the weight of *token*.
    For each input line the text before the last tab is lower-cased and
    cleaned with ``tokenize_list``; ``1`` is written when the score
    exceeds 0.5, otherwise ``0``.

    Args:
        input: path to the UTF-8 TSV file to classify.
        output: path of the file to (over)write with one label per line.
    """
    # SECURITY: pickle.load can execute arbitrary code; only load model
    # files that come from a trusted source.
    with open('model_linear_reg.pkl', 'rb') as model_file:
        weights, word, vocabulary = pickle.load(model_file)

    with open(input, encoding='utf-8') as input_f, open(output, 'w') as output_f:
        for line in input_f:
            # rsplit tolerates lines with no tab or with tabs inside the text,
            # where a strict two-way unpack would raise ValueError.
            text = line.rstrip('\n').rsplit('\t', 1)[0]
            cleaned = tokenize_list(text.lower())
            line_vocabulary = calculate_words(cleaned)
            y_hat = weights[0]
            # NOTE(review): this iterates over every occurrence and also
            # multiplies by the per-line count, so a word repeated k times
            # contributes k*k times its weight. Kept as-is because it has
            # to match however the pickled model was trained; confirm.
            for token in cleaned.split(' '):
                if token in vocabulary:
                    y_hat += weights[word[token]] * line_vocabulary[token]
            output_f.write("1\n" if y_hat > 0.5 else "0\n")
def main():
    """Write predictions for the dev-0 and test-A datasets."""
    datasets = (
        ("dev-0/in.tsv", "dev-0/out.tsv"),
        ("test-A/in.tsv", "test-A/out.tsv"),
    )
    for source_path, target_path in datasets:
        prediction(source_path, target_path)


main()

File diff suppressed because it is too large Load Diff

5272
dev-0/in.tsv Normal file

File diff suppressed because one or more lines are too long

5272
dev-0/out.tsv Normal file

File diff suppressed because it is too large Load Diff

BIN
naive_base_model.pkl Normal file

Binary file not shown.

5152
test-A/in.tsv Normal file

File diff suppressed because one or more lines are too long

5152
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff

0
test.tsv Normal file
View File

File diff suppressed because it is too large Load Diff

289579
train/in.tsv Normal file

File diff suppressed because one or more lines are too long