1. SparkSession initiation
I am using PySpark version 3.2.0.
import logging
import sys

from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col,
    explode,
    floor,
    max,
    month,
    monotonically_increasing_id,
    row_number,
    unix_timestamp,
)
from pyspark.sql.types import ArrayType, StringType, StructField, StructType
from pyspark.sql.window import Window

from all_items_data_loropiana import items_data_all_orders
# Build (or reuse) the SparkSession for the book_and_pack job.
# The builder chain must be one expression: the original bare line breaks
# were a SyntaxError, so the whole chain is wrapped in parentheses.
# Named `spark` (not `spark1`) because the rest of the script reads
# `spark.read.json(...)`.
spark = (
    SparkSession.builder
    .appName("book_and_pack")
    .config("spark.mongodb.input.uri", "mongodb://localhost:27017/")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.caseSensitive", True)
    .config("inferSchema", True)
    .config("spark.mongodb.input.sampleSize", 50000)
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:2.4.2")
    .config("spark.driver.extraJavaOptions", "-Dlog4j.configuration=file:log4j.properties")
    .enableHiveSupport()
    .getOrCreate()
)

# Silence noisy Mongo schema-inference logging.  The original logger name
# "MongsoInferSchema" was misspelled, so setLevel configured a logger that
# nothing ever writes to.
logger = logging.getLogger("org.mongodb.spark.MongoInferSchema")
logger.setLevel(logging.ERROR)
2. Data schema
# Explicit schema for one order document.
# "modifiedAt" is a MongoDB extended-JSON date object, i.e. {"$date": "..."},
# hence the nested struct with a "$date" field.
# NOTE: these types (StructType, StructField, StringType, ArrayType) were
# never imported in the original script; the import of pyspark.sql.types at
# the top of the file is required for this to run.
_order_state_schema = StructType([
    StructField("id", StringType(), True),
    StructField("state", StringType(), True),
    StructField(
        "modifiedAt",
        StructType([StructField("$date", StringType(), True)]),
        True,
    ),
])

schema = StructType([
    StructField("orderNumber", StringType(), True),
    # One order carries a history of states, so this is an array of structs.
    StructField("Orderstate", ArrayType(_order_state_schema), True),
])
3. Selecting data from the JSON file
# Load the orders file.
# spark.read.json expects JSON Lines (one complete object per physical line).
# A pretty-printed / multi-line JSON file is silently parsed as corrupt
# records, which makes every selected column come back null — matching the
# symptom described below.  Enabling multiLine and supplying the explicit
# schema (instead of relying on inference over corrupt rows) addresses both.
ordersips_df = (
    spark.read
    .option("multiLine", True)  # assumes myfile.json is pretty-printed — confirm
    .schema(schema)
    .json("myfile.json")
)
df = ordersips_df.select("Orderstate")
The output is:
df.show()  # returns null
I haven't identified the cause of the problem yet. I also tried passing the schema manually, in case automatic schema inference was the issue, but that did not solve the problem. Thanks.
New contributor
zied salhi is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.