laptop commit linear regression

This commit is contained in:
Bartosz Ogonowski 2020-06-08 19:11:20 +02:00
parent 0c3e331712
commit 53cd39d670
5 changed files with 3069 additions and 3008 deletions

View File

@ -2,7 +2,7 @@
<module type="PYTHON_MODULE" version="4"> <module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager"> <component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" /> <content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.8" jdkType="Python SDK" /> <orderEntry type="jdk" jdkName="Python 3.7 (PyEnv)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" /> <orderEntry type="sourceFolder" forTests="false" />
</component> </component>
</module> </module>

View File

@ -3,5 +3,5 @@
<component name="JavaScriptSettings"> <component name="JavaScriptSettings">
<option name="languageLevel" value="ES6" /> <option name="languageLevel" value="ES6" />
</component> </component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8" project-jdk-type="Python SDK" /> <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (PyEnv)" project-jdk-type="Python SDK" />
</project> </project>

61
code_regression2 Normal file
View File

@ -0,0 +1,61 @@
import pickle
import re
def calculate_words(linetxt):
    """Count how often each space-separated token occurs in *linetxt*.

    The text is split on single space characters, so consecutive spaces
    (and an empty input) produce empty-string tokens.

    Args:
        linetxt: Text to split and count.

    Returns:
        A dict mapping each token to its number of occurrences. The entry
        for the empty token ``''`` is always present and always equals 1.
    """
    word_counts = {}
    for token in linetxt.split(' '):
        word_counts[token] = word_counts.get(token, 0) + 1
    # Overwrite whatever was accumulated for the empty token: it is always
    # reported with a count of exactly 1, even if it never occurred.
    word_counts[''] = 1
    return word_counts
def tokenize_list(string_input):
    """Clean a raw text line into a space-separated string of tokens.

    Despite the name, the result is a single string, not a list; callers
    split it on spaces themselves.

    The input is expected to be lower-cased already (the visible caller
    does so): every character outside ``a-z`` is turned into a space, so
    upper-case letters, digits and punctuation are all dropped.

    Args:
        string_input: Text to clean. Literal backslash-n sequences (the two
            characters ``\\`` and ``n``) are treated as separators.

    Returns:
        The cleaned text, with at most one leading whitespace character
        removed.
    """
    # Literal "\n" escape sequences stand for line breaks in the data.
    text = string_input.replace('\\n', ' ')

    # Ordered clean-up pipeline; each step works on the previous result.
    substitutions = (
        # scheme://host/path style URLs are removed outright
        (r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', ''),
        # a backslash followed by one or more "n" characters
        (r'\\n+', " "),
        # anything still starting with "http"
        (r'http\S+', " "),
        # single-letter path fragments such as "/r/"
        (r'\/[a-z]\/', " "),
        # everything that is not a lower-case ASCII letter
        (r'[^a-z]', " "),
        # collapse runs of whitespace
        (r'\s{2,}', " "),
        # Words of 1-3 characters delimited by non-word characters (or at
        # the very start). Matches cannot overlap, so of two adjacent short
        # words only the first is removed, and a short word at the very end
        # of the text is kept.
        (r'\W\w{1,3}\W|\A\w{1,3}\W', " "),
        # one leading whitespace character
        (r'^\s', ""),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text
def prediction(input, output):
    """Score every line of a TSV file and write one 0/1 label per line.

    The model is read from ``model_linear_reg.pkl`` in the current working
    directory and unpacked into ``(weights, word, vocabulary)``. Each input
    line must consist of exactly two tab-separated fields; only the first
    (the text) is used. The raw score of every line is also printed.

    Args:
        input: Path of the UTF-8 encoded TSV file to read.
        output: Path of the file to write; receives ``"1"`` when the score
            is greater than 0.5 and ``"0"`` otherwise, one per input line.

    Raises:
        ValueError: If an input line does not split into exactly two
            tab-separated fields.
    """
    # NOTE: unpickling can execute arbitrary code - only load a model file
    # that comes from a trusted source.
    with open('model_linear_reg.pkl', 'rb') as model_f:
        weights, word, vocabulary = pickle.load(model_f)

    # The output file is opened before the input file, as before, so it is
    # created/truncated even when the input cannot be opened.
    with open(output, 'w') as output_f:
        with open(input, encoding='utf-8') as input_f:
            for line in input_f:
                # The second field is unused, but the two-name unpacking
                # still enforces the "exactly two fields" line format.
                text, _timestamp = line.rstrip('\n').split('\t')
                cleaned = tokenize_list(text.lower())
                line_vocabulary = calculate_words(cleaned)

                # weights[0] is the starting value (presumably the
                # intercept - confirm against the training code).
                y_hat = weights[0]
                # NOTE(review): a token occurring k times is visited k times
                # and every visit adds weight * k, so it contributes
                # k * k * weight. Kept as-is; confirm it matches training.
                for token in cleaned.split(' '):
                    if token in vocabulary:
                        y_hat += weights[word[token]] * line_vocabulary[token]

                output_f.write("1\n" if y_hat > 0.5 else "0\n")
                print(y_hat)
def main():
    """Run the prediction for the ``dev-0`` and ``test-A`` data sets."""
    prediction("dev-0/in.tsv", "dev-0/out.tsv")
    prediction("test-A/in.tsv", "test-A/out.tsv")


# Only run the predictions when executed as a script, not on import.
if __name__ == "__main__":
    main()

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff