Thiết kế website giá rẻ

Question

I am using PySpark for data processing. I have tried it on both Windows 11 and WSL2 with Python version 3.10.12, java version 21.0.3 and 17, winutils and hadoop.dll for Hadoop 3.3.6, and Spark version 3.5.1. When I run my code, I usually run into either an EOF or a CRC Java exception. There are also a few other exceptions that occur, less frequently. The error that occurs changes depending on when I run it, even for the exact same code. I experienced the same errors on different code, and I was able to get it to run all the way through only by running each part in Jupyter notebooks until there wasn’t an error. Here is the code:

from pyspark.sql import SparkSession 
from pyspark.sql.functions import udf, explode, col, array_contains, collect_list
from pyspark.sql.types import StructType, StructField, StringType, MapType, IntegerType, ArrayType, BooleanType, FloatType

import nltk
from nltk.stem import WordNetLemmatizer
from spellchecker import SpellChecker
import re
import numpy as np
from numpy.linalg import norm

# Functions from bidir_edge_liwc_mrjob_2 (some are modified, but have the same functionality)
class Parser:
    """Helpers for summarising the attribute values (e.g. languages) of a user's tweets."""

    def __init__(self):
        # Init code no longer relevant
        print('filler')

    def count_user_langs(self, timeline):
        """Count how often each value occurs in ``timeline``.

        Args:
            timeline: Iterable of hashable values.

        Returns:
            dict mapping each distinct value to its number of occurrences,
            keyed in first-seen order.
        """
        counts = {}
        for value in timeline:
            counts[value] = counts.get(value, 0) + 1
        return counts

    def _n_most_values(self, d, n_values, proportion_threshold):
        """Return the most frequent keys of a value -> count mapping.

        The keys ``'und'`` (undefined language) and ``''`` are ignored. The
        input mapping is left untouched; filtering happens on a copy so the
        caller's counts are not silently modified.

        Args:
            d: Mapping of value -> count.
            n_values: Maximum number of top-ranked keys to consider. With
                ``1`` only the single most frequent key is returned.
            proportion_threshold: For every considered key after the first,
                the share ``count / total`` must be strictly greater than
                this value for the key to be kept.

        Returns:
            List of selected keys ordered by descending count (ties keep the
            mapping's insertion order); ``[]`` when nothing usable remains
            or ``n_values`` is not positive.
        """
        usable = {key: count for key, count in d.items() if key not in ('und', '')}
        if not usable:
            # No usable information at all
            return []

        # sorted() is stable, so equal counts keep their insertion order
        ranked = sorted(usable, key=usable.get, reverse=True)
        if n_values == 1:
            # Only the value with maximum frequency
            return ranked[:1]

        total = float(sum(usable.values()))
        selected = []
        for rank, key in enumerate(ranked[:max(n_values, 0)]):
            # The top-ranked key is always kept; the rest must clear the threshold
            if rank == 0 or usable[key] / total > proportion_threshold:
                selected.append(key)
        return selected

class Vectorizer:
    """Text clean-up and GloVe embedding helpers for tweet documents."""

    def __init__(self):
        # Download the NLTK resources used by the stop-word and lemmatizer steps
        nltk.download('stopwords')
        nltk.download('wordnet')
        nltk.download('omw-1.4')
        nltk.download('punkt')

        self.lemma = WordNetLemmatizer()
        self.spell = SpellChecker()
        # Raw string: in a normal literal "\U" starts a unicode escape and the
        # Windows path would be a SyntaxError.
        self.vector_path = r'C:\Users\jjhen\Personal Documents\research - Copy\glove_twitter\glove.twitter.27B.200d.txt'

    def remove_unwanted(self, document):
        """Strip e-mail addresses, mentions, URLs, hashtags and punctuation.

        Args:
            document: Text to clean.

        Returns:
            The text reduced to ASCII letters, digits and single spaces,
            without leading or trailing whitespace.
        """
        # remove email addresses (the dot before the TLD must be a literal dot)
        document = re.sub(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)", '', document)
        # remove user mentions
        document = re.sub(r"@[A-Za-z0-9_]+", "", document)
        # remove URLs: "http" followed by any run of non-whitespace
        document = re.sub(r'http\S+', '', document)
        # remove hashtags
        document = re.sub(r"#[A-Za-z0-9_]+", "", document)
        # remove unwanted characters
        document = re.sub(r"[^0-9A-Za-z ]", "", document)
        # collapse multiple spaces (needs a regex; str.replace matches ' +' literally)
        document = re.sub(r" +", " ", document)
        return document.strip()

    def remove_words(self, tokens):
        """Drop English stop words from ``tokens``.

        Args:
            tokens: List of already cleaned tokens.

        Returns:
            New list with the stop words removed, order preserved.
        """
        # The stop words get the same clean-up as the tokens (e.g. "don't" ->
        # "dont") so they can match; a set keeps each membership test O(1).
        stopwords = {
            self.remove_unwanted(word)
            for word in nltk.corpus.stopwords.words('english')
        }
        return [token for token in tokens if token not in stopwords]

    def lemmatize(self, tokens):
        """Lemmatize every token, treating each one as a verb (``pos='v'``)."""
        return [self.lemma.lemmatize(token, pos='v') for token in tokens]

    def preprocess(self, document):
        """Turn a raw document into a list of cleaned, lemmatized tokens.

        Args:
            document: Raw text.

        Returns:
            List of lower-cased tokens with noise and stop words removed.
        """
        document = document.lower()
        # Drop newline characters (not the letter "n")
        document = re.sub(r'\n', '', document)
        document = self.remove_unwanted(document)
        tokens = document.split()
        tokens = self.remove_words(tokens)
        # Spell correction is currently disabled:
        # tokens = correct(tokens)
        tokens = self.lemmatize(tokens)
        return tokens

    # Returns vector as an array for Spark
    def load_glove_vectors(self, path=None):
        """Load a whitespace-separated GloVe text file.

        Args:
            path: File to read; defaults to ``self.vector_path``.

        Returns:
            dict mapping each word to a 1-D float ``numpy`` array.
        """
        if path is None:
            path = self.vector_path
        embeddings = {}
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split()
                if not parts:
                    # Blank line (e.g. at the end of the file): nothing to parse
                    continue
                embeddings[parts[0]] = np.array(parts[1:], dtype=float)
        return embeddings

    # TODO handle case where no words
    def get_average_vector(self, document, embeddings):
        """Placeholder: returns a fixed vector and ignores both arguments.

        The intended implementation (mean of the GloVe vectors of the
        document's tokens) is kept below for reference.
        """
        # tokens = self.preprocess(document)
        # print(tokens)
        # vectors = [embeddings[token] for token in tokens if token in embeddings]
        # return np.array(np.mean(vectors, axis=0)).tolist()
        return [1.0, 2.0, 3.0]

    def cosine_similarity(self, vec_1, vec_2):
        """Return the cosine of the angle between ``vec_1`` and ``vec_2``.

        The dot product is divided by the product of the two norms, so a
        zero-length vector leads to a division by zero.
        """
        return np.dot(vec_1, vec_2) / (norm(vec_1) * norm(vec_2))


# Build the text vectorizer and load the GloVe table on the driver
vectorizer = Vectorizer()
embeddings = vectorizer.load_glove_vectors()
print(vectorizer.get_average_vector('', embeddings))

# One builder call per setting (the network timeout used to be set twice)
spark = (
    SparkSession.builder
    .appName('spark_parse')
    .config('spark.network.timeout', '800s')
    .config('spark.executor.heartbeatInterval', '120s')
    .config('spark.executor.memory', '30g')
    .config('spark.driver.memory', '30g')
    .getOrCreate()
)
spark.conf.set('spark.sql.debug.maxToStringFields', 1000)
spark.sparkContext.setLogLevel("WARN")

@udf(returnType = StringType())
def get_first_udf(list):
    """Return the first element of an array value, or None when there is none.

    Args:
        list: The Python value Spark passes in for the column cell (a
            sequence for array columns, ``None`` for NULL cells). The
            parameter name is kept for compatibility although it shadows
            the builtin.

    Returns:
        The first element, or ``None`` for a NULL or empty value.
    """
    # An uncaught IndexError/TypeError here would take down the Python worker
    # for the whole task, so NULL and empty arrays are mapped to NULL instead.
    if not list:
        return None
    return list[0]

# Change these paths to match your directory location.
# Raw strings are required: in a normal literal "\U" starts a unicode escape
# and these Windows paths would be a SyntaxError.
tweets_path = r'C:\Users\jjhen\Personal Documents\research - Copy\NZ'
triads_path = r'C:\Users\jjhen\Personal Documents\research - Copy\Triads_NZ'

# Read and format tweets, filter by language, ensure dominant language, and exclude retweets
tweets_df = spark.read.option('compression', 'bzip2').option('header', 'true').json(tweets_path)
# tweets_df = tweets_df.filter(has_dom_lang_udf(col('tweets.lang')))
tweets_df = tweets_df.select(
    col('user.id_str').alias('user_id'),
    explode(col('tweets')).alias('tweets'),
)
tweets_df = tweets_df.withColumn('mentioned_users', col('tweets.entities.user_mentions.id_str'))
tweets_df = tweets_df.withColumn('user_id', get_first_udf(col('user_id')))
# NOTE(review): contains('RT') drops every tweet with "RT" anywhere in its
# text, not only retweets - confirm this is the intended retweet filter.
tweets_df = tweets_df.filter((~col('tweets.text').contains('RT')) &
                             ((col('tweets.lang') == 'en') |
                              (col('tweets.lang') == 'und')))

# Read and format the triads (tab-separated files with a header row)
triads_df = spark.read.option('compression', 'bzip2').option('delimiter', '\t').option('header', 'true').csv(triads_path)
# NOTE(review): `triads` is not used below; the joins run on the full
# `triads_df` - confirm whether the three-column projection was meant to be used.
triads = triads_df.select(triads_df.columns[:3])


def _join_tweets_for_author(author_node, first_other, second_other):
    """Inner-join tweets to the triads whose ``author_node`` wrote them.

    A tweet matches a triad when its author is the triad's ``author_node``
    and, for each of the two other nodes, the tweet either mentions that
    node or is a reply to it.

    Args:
        author_node: Name of the triad column holding the tweet's author.
        first_other: Name of the triad column of one other member.
        second_other: Name of the triad column of the remaining member.

    Returns:
        The joined DataFrame (tweet columns followed by triad columns).
    """
    reply_target = col('tweets.in_reply_to_user_id_str')

    def addresses(node):
        # True when the tweet mentions the node or replies to it
        member = triads_df[node]
        return array_contains(tweets_df.mentioned_users, member) | (reply_target == member)

    condition = (
        (tweets_df.user_id == triads_df[author_node])
        & addresses(first_other)
        & addresses(second_other)
    )
    return tweets_df.join(triads_df, condition, 'inner')


# Match tweets with triads (user is triad's node_1)
joined_df1 = _join_tweets_for_author('node_1', 'node_2', 'node_3')

# Match tweets with triads (user is triad's node_2)
joined_df2 = _join_tweets_for_author('node_2', 'node_1', 'node_3')

# Match tweets with triads (user is triad's node_3)
joined_df3 = _join_tweets_for_author('node_3', 'node_1', 'node_2')

union_df = joined_df1.union(joined_df2).union(joined_df3)

# Group by triad and collect the text of every matching tweet
grouped_df = union_df.groupBy('node_1', 'node_2', 'node_3').agg(
    collect_list('tweets.text').alias('tweets')
)

def append_lists(lists):
    """Concatenate the strings in ``lists`` into one space-separated string."""
    separator = ' '
    return separator.join(lists)

# Join each triad's collected tweet texts into one space-separated document
append_udf = udf(append_lists, StringType())
grouped_df = grouped_df.withColumn('tweets', append_udf(col('tweets')))

# Share the embedding table through a broadcast variable. Referencing the
# module-level dict directly inside the lambda would pickle the whole table
# into the UDF closure that is sent to the Python workers.
embeddings_bc = spark.sparkContext.broadcast(embeddings)
vectorize_udf = udf(
    lambda tweet: vectorizer.get_average_vector(tweet, embeddings_bc.value),
    ArrayType(FloatType()),
)
vector_udf = grouped_df.select(
    col('node_1'),
    col('node_2'),
    col('node_3'),
    # The CSV data source cannot store array columns, so the vector is
    # written in its string form.
    vectorize_udf(col('tweets')).cast('string').alias('vector'),
)
# Raw string: "\U" in a normal literal is a SyntaxError, and "\r" / "\v"
# would be read as control characters.
vector_udf.write.option('header', 'true').csv(r'C:\Users\jjhen\Personal Documents\research - Copy\vectors')

The stacktraces for the 2 most common errors are given below:

Traceback (most recent call last):
  File "C:\Users\jjhen\Personal Documents\research - Copy\vector_triads.py", line 175, in <module>
    tweets_df.show()
  File "C:\Users\jjhen\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\pyspark\sql\dataframe.py", line 945, in show
    print(self._show_string(n, truncate, vertical))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\jjhen\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\pyspark\sql\dataframe.py", line 963, in _show_string
    return self._jdf.showString(n, 20, vertical)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\jjhen\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\py4j\java_gateway.py", line 1322, in __call__
    return_value = get_return_value(
                   ^^^^^^^^^^^^^^^^^
  File "C:\Users\jjhen\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\pyspark\errors\exceptions\captured.py", line 179, in deco
    return f(*a, **kw)
           ^^^^^^^^^^^
  File "C:\Users\jjhen\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\py4j\protocol.py", line 326, in get_return_value
    raise Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling o75.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 3.0 failed 1 times, most recent failure: Lost task 0.0 in stage 3.0 (TID 10481) (192.168.0.34 executor driver): org.apache.spark.SparkException: Python worker exited unexpectedly (crashed)
        at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:612)
        at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:594)
        at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
        at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:99)
        at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:75)
        at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
        at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
        at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
        at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
        at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
        at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
        at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
        at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
        at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
        at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:893)
        at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:893)
        at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
        at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
        at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
        at org.apache.spark.scheduler.Task.run(Task.scala:141)
        at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
        at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
        at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
        at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
        at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
        at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
        at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
        at java.base/java.lang.Thread.run(Thread.java:842)
Caused by: java.io.EOFException
        at java.base/java.io.DataInputStream.readInt(DataInputStream.java:398)
        at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:83)
        ... 26 more

The second error:

ERROR Executor: Exception in task 0.0 in stage 5.0 (TID 10483)
org.apache.spark.SparkException: Python worker exited unexpectedly (crashed)
        at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:612)
        at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:594)
        at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
        at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:99)
        at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:75)
        at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
        at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
        at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
        at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
        at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
        at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
        at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
        at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
        at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
        at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:893)
        at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:893)
        at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
        at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
        at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
        at org.apache.spark.scheduler.Task.run(Task.scala:141)
        at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
        at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
        at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
        at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
        at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
        at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
        at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
        at java.base/java.lang.Thread.run(Thread.java:842)
Caused by: java.io.EOFException
        at java.base/java.io.DataInputStream.readInt(DataInputStream.java:398)
        at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:83)
        ... 26 more
24/07/02 19:41:46 WARN TaskSetManager: Lost task 0.0 in stage 5.0 (TID 10483) (192.168.0.34 executor driver): org.apache.spark.SparkException: Python worker exited unexpectedly (crashed)
        at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:612)
        at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:594)
        at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
        at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:99)
        at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:75)
        at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
        at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
        at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
        at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
        at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
        at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
        at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
        at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
        at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
        at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:893)
        at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:893)
        at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
        at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
        at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
        at org.apache.spark.scheduler.Task.run(Task.scala:141)
        at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
        at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
        at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
        at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
        at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
        at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
        at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
        at java.base/java.lang.Thread.run(Thread.java:842)
Caused by: java.io.EOFException
        at java.base/java.io.DataInputStream.readInt(DataInputStream.java:398)
        at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:83)
        ... 26 more
...

A third error:

ERROR Executor: Exception in task 187.0 in stage 1.0 (TID 10187)
java.lang.ArrayIndexOutOfBoundsException: Index 134886018 out of bounds for length 900000
        at org.apache.hadoop.io.compress.bzip2.CBZip2InputStream.setupNoRandPartA(CBZip2InputStream.java:1095)
        at org.apache.hadoop.io.compress.bzip2.CBZip2InputStream.setupNoRandPartB(CBZip2InputStream.java:1158)
        at org.apache.hadoop.io.compress.bzip2.CBZip2InputStream.read0(CBZip2InputStream.java:464)
        at org.apache.hadoop.io.compress.bzip2.CBZip2InputStream.read(CBZip2InputStream.java:419)
        at org.apache.hadoop.io.compress.BZip2Codec$BZip2CompressionInputStream.read(BZip2Codec.java:490)
        at java.base/java.io.InputStream.read(InputStream.java:218)
        at org.apache.hadoop.mapreduce.lib.input.CompressedSplitLineReader.fillBuffer(CompressedSplitLineReader.java:130)
        at org.apache.hadoop.util.LineReader.readDefaultLine(LineReader.java:227)
        at org.apache.hadoop.util.LineReader.readLine(LineReader.java:185)
        at org.apache.hadoop.mapreduce.lib.input.CompressedSplitLineReader.readLine(CompressedSplitLineReader.java:159)
        at org.apache.hadoop.mapreduce.lib.input.LineRecordReader.nextKeyValue(LineRecordReader.java:200)
        at org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:39)
        at ...

Other forum posts online indicated that this may be caused by interrupts due to insufficient resources. I set the spark-defaults.conf file to allow 30g of memory for both the driver and the executor, and I also configured this in the code. I have also monitored memory usage while the job runs (just by using Task Manager), and it does not seem to get close to my computer’s maximum of 32 GB when running on Windows (it does get close on WSL).

I have also tried to verify that none of the compressed .json.bz2 files in the tweets_path folder were corrupted. I used:

bzip2 -tv *.bz2

to verify that every file was ok. I also wrote a python script to verify this.

I have tried running on WSL and windows, with multiple different versions of Java. I have also used multiple different winutils.exe and hadoop.dll sources to make sure it is not a configuration error.

Any help would be appreciated.

Thiết kế website giá rẻ

Danh mục

PySpark EOF and CRC Java errors