from pyspark.sql import SparkSession
from pyspark.sql.functions import concat, lit, col
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    LongType,
)


def _with_row_index(spark, df, index_col='_row_idx'):
    """Return *df* with an extra 0-based positional column named *index_col*.

    The index comes from ``RDD.zipWithIndex``, which numbers rows
    consecutively across partitions, so two DataFrames read from parallel
    files can be paired row by row.

    Args:
        spark: active SparkSession used to rebuild the DataFrame.
        df: DataFrame whose rows should be numbered.
        index_col: name of the index column to append.

    Returns:
        A new DataFrame with df's columns followed by *index_col* (long).
    """
    # An explicit schema avoids type inference, which would fail or guess
    # wrongly when the first sampled rows contain nulls.
    indexed_schema = StructType(
        df.schema.fields + [StructField(index_col, LongType(), False)]
    )
    indexed_rows = df.rdd.zipWithIndex().map(
        lambda pair: tuple(pair[0]) + (pair[1],)
    )
    return spark.createDataFrame(indexed_rows, schema=indexed_schema)


if __name__ == '__main__':
    spark = SparkSession\
        .builder\
        .appName('He Said She Said')\
        .getOrCreate()

    X_schema = StructType([StructField('X', StringType(), True)])
    Y_schema = StructType([StructField('Y', IntegerType(), True)])

    # The inputs are TSV files: read them tab-separated. With the default
    # comma delimiter any value containing a comma is cut at the first comma.
    X_train = spark.read.csv('train/in.tsv', schema=X_schema, sep='\t')
    Y_train = spark.read.csv('train/expected.tsv', schema=Y_schema, sep='\t')
    X_train.show()

    # The two files share no key column, so a join without a condition
    # degenerates into a cartesian product (or is rejected by Spark as an
    # implicit cross join). Pair the rows by their position instead.
    row_idx = '_row_idx'
    train = (
        _with_row_index(spark, X_train, row_idx)
        # full_outer keeps unmatched rows visible if the files differ in length
        .join(_with_row_index(spark, Y_train, row_idx), on=row_idx, how='full_outer')
        .orderBy(row_idx)
        .drop(row_idx)
    )
    train.show()

    # # $example on$
    # # Load training data
    # data = spark.read.format("libsvm") \
    #     .load("sample_libsvm_data.txt")
    # print('data = ', data)
    # # Split the data into train and test
    # splits = data.randomSplit([0.6, 0.4], 1234)
    # train = splits[0]
    # test = splits[1]
    # print('train = ', train)