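Fixed problem with code: tokenize() now only strips URLs from the raw line (lowercasing and the \\n+ and /[a-z]/ substitutions are commented out)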

This commit is contained in:
Bartusiak 2020-05-25 20:21:18 +02:00
parent 3d9f5c9950
commit 7b1a3da3a6
3 changed files with 8341 additions and 8341 deletions

View File

@@ -19,10 +19,10 @@ def create_train(input,output1,output0):
def tokenize(input,output):
with open(input,'rt',encoding="utf8") as input_f, open(output,'w',encoding="utf-8") as file:
for line in input_f:
text = line.lower()
text = re.sub(r'\\n+', " ", text)
text = re.sub(r'http\S+', " ", text)
text = re.sub(r'\/[a-z]\/', " ", text)
#text = line.lower()
#text = re.sub(r'\\n+', " ", text)
text = re.sub(r'http\S+', " ", line)
#text = re.sub(r'\/[a-z]\/', " ", text)
#text = re.sub(r'\s{2,}', " ", text)
#text = re.sub(r'\W\w{1,3}\W|\A\w{1,3}\W', " ", text)
#text = re.sub(r'\W\w{1,3}\W|\A\w{1,3}\W', " ", text)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff