laptop commit linear regression
This commit is contained in:
parent
0c3e331712
commit
53cd39d670
@ -2,7 +2,7 @@
|
|||||||
<module type="PYTHON_MODULE" version="4">
|
<module type="PYTHON_MODULE" version="4">
|
||||||
<component name="NewModuleRootManager">
|
<component name="NewModuleRootManager">
|
||||||
<content url="file://$MODULE_DIR$" />
|
<content url="file://$MODULE_DIR$" />
|
||||||
<orderEntry type="jdk" jdkName="Python 3.8" jdkType="Python SDK" />
|
<orderEntry type="jdk" jdkName="Python 3.7 (PyEnv)" jdkType="Python SDK" />
|
||||||
<orderEntry type="sourceFolder" forTests="false" />
|
<orderEntry type="sourceFolder" forTests="false" />
|
||||||
</component>
|
</component>
|
||||||
</module>
|
</module>
|
@ -3,5 +3,5 @@
|
|||||||
<component name="JavaScriptSettings">
|
<component name="JavaScriptSettings">
|
||||||
<option name="languageLevel" value="ES6" />
|
<option name="languageLevel" value="ES6" />
|
||||||
</component>
|
</component>
|
||||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8" project-jdk-type="Python SDK" />
|
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (PyEnv)" project-jdk-type="Python SDK" />
|
||||||
</project>
|
</project>
|
61
code_regression2
Normal file
61
code_regression2
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
import pickle
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_words(linetxt):
    """Count occurrences of each single-space-separated token in ``linetxt``.

    The text is split on the literal ``' '`` character, so consecutive
    spaces (or leading/trailing ones) produce empty-string tokens.

    Returns:
        A plain ``dict`` mapping token -> number of occurrences.  The
        empty-string key is always present and its count is always 1,
        regardless of how many empty tokens the split produced.
    """
    frequencies = {}
    for piece in linetxt.split(' '):
        frequencies[piece] = frequencies.get(piece, 0) + 1
    # The '' entry is pinned to 1 no matter what the loop counted.
    frequencies[''] = 1
    return frequencies
|
||||||
|
|
||||||
|
def tokenize_list(string_input):
    """Clean ``string_input`` into a string of space-separated a-z runs.

    Despite the name, this returns a single string, not a list.  The
    function does not lowercase anything itself: every character outside
    ``a``-``z`` (including upper-case letters and digits) is replaced by a
    space, so callers wanting to keep such letters must lowercase first.

    The substitutions below are applied strictly in order; later steps
    rely on the output of earlier ones.
    """
    # (pattern, replacement) pairs, applied top to bottom.
    substitutions = (
        # Remove scheme://host/path style URLs entirely.
        (r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', ''),
        # Literal backslash followed by one or more 'n' characters.
        (r'\\n+', " "),
        # Anything still starting with "http" up to the next whitespace.
        (r'http\S+', " "),
        # Single lower-case letter wrapped in slashes, e.g. "/r/".
        (r'\/[a-z]\/', " "),
        # Every character that is not a lower-case ASCII letter.
        (r'[^a-z]', " "),
        # Collapse runs of whitespace to one space.
        (r'\s{2,}', " "),
        # Tokens of 1-3 word characters bounded by non-word characters (or
        # at the very start).  Matches do not overlap, so of two adjacent
        # short tokens only the first is removed.
        (r'\W\w{1,3}\W|\A\w{1,3}\W', " "),
        # Drop one leading whitespace character, if any.
        (r'^\s', ""),
    )

    # Escaped newlines (backslash + 'n' as two characters) become spaces
    # before any regex runs.
    cleaned = string_input.replace('\\n', ' ')
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned
|
||||||
|
|
||||||
|
def prediction(input, output):
    """Score every line of a TSV file with the pickled linear model.

    The model is read from ``model_linear_reg.pkl`` in the current working
    directory and unpacked as ``(weights, word, vocabulary)``.  For each
    input line the score starts at ``weights[0]`` and, for every token
    found in ``vocabulary``, adds ``weights[word[token]]`` multiplied by
    that token's count in the line.  ``"1"`` is written when the score is
    greater than 0.5, otherwise ``"0"``; the raw score is also printed.

    Args:
        input: Path of the UTF-8 input file.  Each line must contain
            exactly two tab-separated fields; only the first (the text)
            is used.
        output: Path of the file to (over)write with one label per line.

    Raises:
        ValueError: If a line does not split into exactly two fields.
        OSError: If the model, input or output file cannot be opened.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only use a
    # model file that comes from a trusted source.
    with open('model_linear_reg.pkl', 'rb') as model_f:
        weights, word, vocabulary = pickle.load(model_f)

    # Context managers guarantee both files are closed even when a
    # malformed line raises part-way through.  The output file is opened
    # first, as before, so it is truncated even if the input is missing.
    with open(output, 'w') as output_f:
        with open(input, encoding='utf-8') as input_f:
            for line in input_f:
                # Strict two-field unpacking is kept on purpose: a malformed
                # line should fail loudly rather than be silently scored.
                text, _timestamp = line.rstrip('\n').split('\t')
                cleaned = tokenize_list(text.lower())
                line_vocabulary = calculate_words(cleaned)

                y_hat = weights[0]
                # NOTE(review): this iterates over every occurrence, not
                # every distinct token, so a token seen k times adds
                # weight * k on each of its k visits.  Left unchanged
                # because the training code is not visible here - confirm
                # it matches how the weights were fitted.
                for token in cleaned.split(' '):
                    if token in vocabulary:
                        y_hat += weights[word[token]] * line_vocabulary[token]

                output_f.write("1\n" if y_hat > 0.5 else "0\n")
                print(y_hat)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Write predictions for dev-0/in.tsv and test-A/in.tsv.

    Each input is scored by ``prediction`` and the labels are written to
    the ``out.tsv`` file in the same directory.  Paths are relative to the
    current working directory.
    """
    prediction("dev-0/in.tsv", "dev-0/out.tsv")
    prediction("test-A/in.tsv", "test-A/out.tsv")


# Guarded so that importing this module (e.g. to reuse tokenize_list) does
# not immediately read the model and overwrite the output files.
if __name__ == "__main__":
    main()
|
2978
dev-0/out.tsv
2978
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
3034
test-A/out.tsv
3034
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user