I am trying to parse and flatten nested JSON data using PySpark, where the unique_id*/random_code* keys are dynamic. Any suggestions on how to parse such a JSON file?
Here is the sample code I have tried so far, but it was not successful.
jsonData = """{
  "data": {
    "unique_id1": {
      "random_code1": {
        "name": "some_name",
        "status": "value1"
      },
      "random_code2": {
        "name": "some_name",
        "status": "value2"
      }
    },
    "unique_id2": {
      "random_code3": {
        "name": "some_name",
        "status": "value2"
      },
      "random_code4": {
        "name": "some_name",
        "status": "value2"
      }
    }
  }
}"""
import re

df = spark.read.option("multiLine", "true").json(spark.sparkContext.parallelize([jsonData]))
data_schema = df.schema["data"].dataType.simpleString()
data_schema = re.sub(r"([\w-]+)(?=:struct<name)", "_RandomCode", data_schema)
data_schema = re.sub(r"([\w-]+)(?=:struct<_RandomCode)", "_Ids", data_schema)
data_schema = re.sub(r"(?<=,|<)([^,<]+)(?=:)", r"`\1`", data_schema)
Expected output:

_Ids        _RandomCode    name
unique_id1  random_code1   some_name
unique_id1  random_code2   some_name
unique_id2  random_code3   some_name
unique_id2  random_code4   some_name
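One idea I am considering (not sure if it is the right approach) is to read "data" with an explicit schema as nested maps, so the dynamic unique_id*/random_code* keys become map keys, and then explode twice. A rough, untested sketch, reusing spark and jsonData from above; the names _Ids, _RandomCode, codes, and detail are just placeholders I made up:

from pyspark.sql.functions import col, explode
from pyspark.sql.types import MapType, StringType, StructField, StructType

# Leaf struct: the fields that look fixed in every record
inner = StructType([
    StructField("name", StringType()),
    StructField("status", StringType()),
])

# "data" as map<unique_id, map<random_code, struct<name, status>>>
map_schema = StructType([
    StructField("data", MapType(StringType(), MapType(StringType(), inner)))
])

df2 = spark.read.schema(map_schema).option("multiLine", "true").json(
    spark.sparkContext.parallelize([jsonData]))

# explode the outer map into (_Ids, codes), then the inner map into (_RandomCode, detail)
flat = (
    df2.select(explode("data").alias("_Ids", "codes"))
       .select("_Ids", explode("codes").alias("_RandomCode", "detail"))
       .select("_Ids", "_RandomCode", col("detail.name").alias("name"))
)
flat.show(truncate=False)

Would something like this be a reasonable way to flatten it, or is there a better-suited approach than regex-rewriting the schema string?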