I was trying to load a tsv files from urls (max file size was 1.05 GB or 1129672402 Bytes)
I used java.net.URL for it.
But, it throwed the below error (for the largest one)-
java.lang.OutOfMemoryError: UTF16 String size is 1129672402, should be less than 1073741823
Is there any way to increase the default String size in spark or any other solution for processing this ?
def getGeoFeedsDataNew(spark: SparkSession, url: String, schema: StructType): DataFrame = {
val inputStream = new URL(url).openStream()
val reader = new BufferedReader(new InputStreamReader(inputStream))
var data = reader.lines().collect(Collectors.joining("n")).split("\n").map("1t".concat(_).concat("t2")).map(_.split("t"))
inputStream.close()
val size = data.length
logger.info(s"records found: ${size-1}")
if(size < 2){
return spark.createDataFrame(spark.sparkContext.emptyRDD[Row], schema)
}
data = data.slice(1,size)
val rowsRDD = spark.sparkContext.parallelize(data).map(row => Row.fromSeq(row.toSeq))
val geoDataDF: DataFrame = spark.createDataFrame(rowsRDD, schema)
return geoDataDF
}