Create DataFrame by Defining the Schema
A schema defines the structure of your data. It can be defined implicitly (inferred at runtime) or explicitly (and known at compile time).
A schema is described using a StructType, which is a collection of StructField objects:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{DoubleType, IntegerType, StringType, StructField, StructType}

object DataFrameSchema {
  def main(args: Array[String]): Unit = {
    val inputFile = args(0)

    //Initialize SparkSession
    val sparkSession = SparkSession
      .builder()
      .appName("spark-read-csv")
      .master("local[*]")
      .getOrCreate()

    //Define the schema of the data; the third StructField argument marks the column as nullable
    val titanicSchema = StructType(Array(
      StructField("PassengerId", IntegerType, true),
      StructField("Survived", IntegerType, true),
      StructField("Pclass", IntegerType, true),
      StructField("Name", StringType, true),
      StructField("Sex", StringType, true),
      StructField("Age", DoubleType, true),
      StructField("SibSp", IntegerType, true),
      StructField("Parch", IntegerType, true),
      StructField("Ticket", StringType, true),
      StructField("Fare", DoubleType, true),
      StructField("Cabin", StringType, true),
      StructField("Embarked", StringType, true)))

    //Read the CSV file into a DataFrame, applying the schema on read
    val passengers = sparkSession.read
      .option("header", "true")
      .option("delimiter", "\t")
      .option("nullValue", "")
      .option("treatEmptyValuesAsNulls", "true")
      .schema(titanicSchema)
      .csv(inputFile)

    passengers.show(100)
    passengers.printSchema()

    sparkSession.stop()
  }
}
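
For contrast, the implicit route mentioned above lets Spark infer the schema by sampling the data at read time. Here is a minimal sketch, reusing the sparkSession and inputFile from the example above; note that inference costs an extra pass over the data, and the guessed types depend on the file contents:

//Read the same file and let Spark infer the schema from the data
val inferredPassengers = sparkSession.read
  .option("header", "true")
  .option("delimiter", "\t")
  .option("inferSchema", "true")
  .csv(inputFile)

//The inferred schema may differ from the hand-written one above
inferredPassengers.printSchema()

Since Spark 2.3 you can also pass the schema as a DDL-formatted string, for example .schema("PassengerId INT, Survived INT, Name STRING"), which avoids building the StructType by hand.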