# Reported error (message kept verbatim):
# RuntimeError: It appears that you are attempting to reference SparkContext from a broadcast variable, action, or transformation. SparkContext can only be used on the driver, not in code that it run on workers. For more information, see SPARK-5063.
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
# Define the Operation class
class Operation:
    """Two-operand arithmetic helpers.

    The class defines no instance attributes; it only groups the
    ``add`` and ``sub`` operations under one object.
    """

    def add(self, a, b):
        """Return ``a + b``."""
        return a + b

    def sub(self, a, b):
        """Return ``a - b``."""
        return a - b
# Define another class to work with DataFrame
class DataFrameProcessor:
    """Apply :class:`Operation` arithmetic to DataFrame columns via UDFs.

    The instance keeps a reference to the Spark session, so the instance
    itself must never end up inside a function that Spark ships to the
    workers (see ``register_add_udf``).
    """

    def __init__(self, spark):
        """Store the session and create the arithmetic helper.

        Args:
            spark: The Spark session. It is stored on the instance; the
                methods of this class do not use it themselves.
        """
        self.spark = spark
        self.operation = Operation()

    def register_add_udf(self):
        """Build a UDF that adds its two column arguments.

        The UDF is created and returned; it is not registered by name
        with the session.

        Returns:
            A UDF declared with an ``IntegerType`` return type that calls
            ``Operation.add`` on its two arguments.
        """
        # Bind the helper to a local so the lambda closes over it alone.
        # Writing ``self.operation`` inside the lambda would capture
        # ``self`` -- and with it ``self.spark`` -- which Spark refuses to
        # serialise for the workers (SPARK-5063).
        operation = self.operation
        return udf(lambda a, b: operation.add(a, b), IntegerType())

    def apply_udf_on_df(self, df):
        """Return ``df`` with an extra ``"added"`` column.

        Args:
            df: A DataFrame that has ``col1`` and ``col2`` columns.

        Returns:
            A new DataFrame whose ``"added"`` column is the add UDF
            applied to ``col1`` and ``col2``.
        """
        add_udf = self.register_add_udf()
        return df.withColumn("added", add_udf(df["col1"], df["col2"]))
# Obtain the Spark session: getOrCreate() reuses an already-active session
# (e.g. the one a notebook or shell provides) and starts one otherwise.
spark = SparkSession.builder.getOrCreate()
# Sample DataFrame
data = [(1, 2), (3, 4), (5, 6)]
df = spark.createDataFrame(data, ["col1", "col2"])
# Initialize DataFrameProcessor
processor = DataFrameProcessor(spark)
# Apply UDF on DataFrame
result_df = processor.apply_udf_on_df(df)
result_df.show()