This commit is contained in:
Iwona Christop 2022-04-25 01:15:46 +02:00
parent b775a221e6
commit 9ea0faed9b
2 changed files with 187 additions and 0 deletions

152
main.ipynb Executable file

File diff suppressed because one or more lines are too long

35
main.py Executable file
View File

@ -0,0 +1,35 @@
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat, lit, col
from pyspark.sql.types import StructType,StructField, StringType, IntegerType
if __name__ == '__main__':
    # Script entry point: load the training texts and their labels, pair
    # them row by row, and print both the raw texts and the paired frame.

    # Imported locally so this block is self-contained; these names are only
    # needed to pair the two files by row position.
    from pyspark.sql import Window
    from pyspark.sql.functions import monotonically_increasing_id, row_number

    spark = SparkSession\
        .builder\
        .appName('He Said She Said')\
        .getOrCreate()

    X_schema = StructType([StructField('X', StringType(), True)])
    Y_schema = StructType([StructField('Y', IntegerType(), True)])

    # The inputs are .tsv files, so read them tab-separated. With the default
    # ',' separator a line containing commas is split into several fields and
    # only the first one fits the single-column schema.
    # NOTE(review): assumes one field per line in both files; if the text can
    # contain '"' characters, quote='' may also be needed -- confirm on data.
    X_train = spark.read.csv('train/in.tsv', schema=X_schema, sep='\t')
    Y_train = spark.read.csv('train/expected.tsv', schema=Y_schema, sep='\t')
    X_train.show()

    # The two files share no key column; line i of one belongs to line i of
    # the other. Joining without a key pairs every X with every Y, so number
    # the rows of each frame in read order and join on that number instead.
    # An un-partitioned window pulls all rows into one partition, which is
    # acceptable for a dataset small enough to inspect with show().
    row_order = Window.orderBy(monotonically_increasing_id())
    X_indexed = X_train.withColumn('row_idx', row_number().over(row_order))
    Y_indexed = Y_train.withColumn('row_idx', row_number().over(row_order))

    # Keep the full outer join so a length mismatch between the files shows
    # up as rows with a null X or Y rather than being silently dropped.
    train = X_indexed\
        .join(Y_indexed, on='row_idx', how='full_outer')\
        .orderBy('row_idx')\
        .drop('row_idx')
    train.show()
# # $example on$
# # Load training data
# data = spark.read.format("libsvm") \
# .load("sample_libsvm_data.txt")
# print('data = ', data)
# # Split the data into train and test
# splits = data.randomSplit([0.6, 0.4], 1234)
# train = splits[0]
# test = splits[1]
# print('train = ', train)