This commit is contained in:
Bartosz Karwacki 2022-04-03 19:50:05 +02:00
parent a2064b7ed9
commit 2be2a96fe1
3 changed files with 16539 additions and 16546 deletions

File diff suppressed because it is too large Load Diff

23
run.py
View File

@@ -12,7 +12,6 @@ data = pd.read_csv(
error_bad_lines=False,
header=None,
quoting=csv.QUOTE_NONE,
nrows=250000
)
train_labels = pd.read_csv(
"train/expected.tsv",
@@ -20,7 +19,6 @@ train_labels = pd.read_csv(
error_bad_lines=False,
header=None,
quoting=csv.QUOTE_NONE,
nrows=250000
)
train_data = data[[6, 7]]
@@ -33,22 +31,17 @@ model = defaultdict(lambda: defaultdict(lambda: 0))
def clean(text):
text = str(text)
# normalize text
text = (
unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode(
'utf-8', 'ignore'))
# replace html chars with ' '
text = re.sub('<.*?>', ' ', text)
# remove punctuation
text = text.translate(str.maketrans(' ', ' ', string.punctuation))
# only alphabets and numerics
text = re.sub('[^a-zA-Z]', ' ', text)
# replace newline with space
unicodedata.normalize("NFKD", text)
.encode("ascii", "ignore")
.decode("utf-8", "ignore")
)
text = re.sub("<.*?>", " ", text)
text = text.translate(str.maketrans(" ", " ", string.punctuation))
text = re.sub("[^a-zA-Z]", " ", text)
text = re.sub("\n", " ", text)
# lower case
text = text.lower()
# split and join the words
text = ' '.join(text.split())
text = " ".join(text.split())
return text

File diff suppressed because it is too large Load Diff