laptop commit linear regression
This commit is contained in:
parent
0c3e331712
commit
53cd39d670
@ -2,7 +2,7 @@
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$" />
|
||||
<orderEntry type="jdk" jdkName="Python 3.8" jdkType="Python SDK" />
|
||||
<orderEntry type="jdk" jdkName="Python 3.7 (PyEnv)" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
@ -3,5 +3,5 @@
|
||||
<component name="JavaScriptSettings">
|
||||
<option name="languageLevel" value="ES6" />
|
||||
</component>
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8" project-jdk-type="Python SDK" />
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (PyEnv)" project-jdk-type="Python SDK" />
|
||||
</project>
|
61
code_regression2
Normal file
61
code_regression2
Normal file
@ -0,0 +1,61 @@
|
||||
import pickle
|
||||
import re
|
||||
|
||||
|
||||
def calculate_words(linetxt):
|
||||
word_counts = {}
|
||||
tokens = linetxt.split(' ')
|
||||
for token in tokens:
|
||||
if token in word_counts.keys():
|
||||
word_counts[token]+=1
|
||||
else:
|
||||
word_counts[token]=1
|
||||
word_counts[''] = 1
|
||||
return word_counts
|
||||
|
||||
def tokenize_list(string_input):
|
||||
string=string_input.replace('\\n',' ')
|
||||
text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', string)
|
||||
text = re.sub(r'\\n+', " ", text)
|
||||
text = re.sub(r'http\S+', " ", text)
|
||||
text = re.sub(r'\/[a-z]\/', " ", text)
|
||||
text = re.sub(r'[^a-z]', " ", text)
|
||||
text = re.sub(r'\s{2,}', " ", text)
|
||||
text = re.sub(r'\W\w{1,3}\W|\A\w{1,3}\W', " ", text)
|
||||
text = re.sub(r'^\s', "", text)
|
||||
|
||||
return text
|
||||
|
||||
def prediction(input,output):
|
||||
loaded_model = pickle.load(open('model_linear_reg.pkl','rb'))
|
||||
#print(loaded_model)
|
||||
weights, word, vocabulary = loaded_model
|
||||
#print("WORD: ")
|
||||
#print(word)
|
||||
#print(" WEIGHTS: ")
|
||||
#print(weights)
|
||||
output_f = open(output,'w')
|
||||
with open(input, encoding='utf-8') as input_f:
|
||||
for line in input_f:
|
||||
text, timestamp = line.rstrip('\n').split('\t')
|
||||
tokens = tokenize_list(text.lower())
|
||||
line_vocabulary = calculate_words(tokens)
|
||||
tokens = tokens.split(' ')
|
||||
y_hat = weights[0]
|
||||
for token in tokens:
|
||||
if token in vocabulary.keys():
|
||||
y_hat += weights[word[token]] * line_vocabulary[token]
|
||||
if y_hat > 0.5:
|
||||
output_f.write("1\n")
|
||||
print(y_hat)
|
||||
else:
|
||||
output_f.write("0\n")
|
||||
print(y_hat)
|
||||
output_f.close()
|
||||
|
||||
|
||||
def main():
|
||||
prediction("dev-0/in.tsv","dev-0/out.tsv")
|
||||
prediction("test-A/in.tsv","test-A/out.tsv")
|
||||
|
||||
main()
|
2978
dev-0/out.tsv
2978
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
3034
test-A/out.tsv
3034
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user