In a Python program, I have to load a couple of thousand files and I want to cache them. I am able to create the cache using joblib.Memory, but the program loads the data faster when I disable the caching.
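For context, the caching pattern I am testing is just joblib's Memory.cache wrapper around a file-reading function. A minimal, standalone sketch (the cache directory and file path below are placeholders):
<code>import time
import joblib

memory = joblib.Memory("./joblib_cache", verbose=0)  # placeholder cache directory

@memory.cache
def read_file(filepath):
    with open(filepath, 'r') as f:
        return f.read()

if __name__ == '__main__':
    path = "dummy_file.txt"  # placeholder file path
    t0 = time.time()
    read_file(path)   # first call: reads the file and writes the result to the cache
    t1 = time.time()
    read_file(path)   # second call: result is loaded back from the on-disk cache
    t2 = time.time()
    print(f"first call: {t1 - t0:.4f}s, second call: {t2 - t1:.4f}s")
</code>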
Steps to run the MWE:
- Download my dummy files from GitHub.
- Set the path to the dummy-file directory and a path to any directory for storing the cache in the variables dummy_files_basepath and cache_dir.

My MWE:
<code>import os
import sys
from concurrent.futures import ThreadPoolExecutor

import joblib


def __load_file(filepath):
    with open(filepath, 'r') as file:
        return filepath, file.read()


def _load_files_data(basepath, filenames, enable_cache=False, cache_dir=None):
    if enable_cache:
        # wrap the loader with joblib's on-disk cache
        memory = joblib.Memory(cache_dir, verbose=0)
        load_file_data = memory.cache(__load_file)
        with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
            data = list(executor.map(lambda filename: load_file_data(os.path.join(basepath, filename)), filenames))
    else:
        with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
            data = list(executor.map(lambda filename: __load_file(os.path.join(basepath, filename)), filenames))
    return data


class FilesLoader:

    @classmethod
    def load_files_data(cls, basepath, enable_cache=False, cache_dir=None):
        # find the files with extension .txt
        filenames = os.listdir(basepath)
        filenames = [filename for filename in filenames if filename.endswith('.txt')]
        # load the data using multiple threads
        data = _load_files_data(basepath, filenames, enable_cache, cache_dir)
        return data


####----------------------------------------Sample Run------------------------------------####
if __name__ == '__main__':
    dummy_files_basepath = "SET_PATH_TO_MY_DUMMY_FILES"
    cache_dir = "SET_ANY_PATH"

    # record the time taken to load the data
    import time
    start = time.time()
    data = FilesLoader.load_files_data(dummy_files_basepath, enable_cache=True, cache_dir=cache_dir)
    print(f"Time taken to load the data: {time.time() - start}")
    print(len(data))
    print(f"First data filepath: {data[0][0]}")
</code>
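For comparison, this is roughly how I time the two modes against each other (a sketch; FilesLoader is the class from the MWE above, and the paths are placeholders):
<code>import time

dummy_files_basepath = "SET_PATH_TO_MY_DUMMY_FILES"
cache_dir = "SET_ANY_PATH"

# run without caching
start = time.time()
FilesLoader.load_files_data(dummy_files_basepath, enable_cache=False)
print(f"without cache: {time.time() - start:.2f}s")

# run with caching, cold cache (first run has to write every cache entry)
start = time.time()
FilesLoader.load_files_data(dummy_files_basepath, enable_cache=True, cache_dir=cache_dir)
print(f"with cache (cold): {time.time() - start:.2f}s")

# run with caching, warm cache (second run reads the entries back from disk)
start = time.time()
FilesLoader.load_files_data(dummy_files_basepath, enable_cache=True, cache_dir=cache_dir)
print(f"with cache (warm): {time.time() - start:.2f}s")
</code>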
Question: Why is loading the data slower when caching is enabled? In this example I am loading .txt files, but in my actual program I have .pkl files.
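For reference, the loader in my actual program looks roughly like this; only the cached function changes (a sketch, not the exact code):
<code>import pickle

def __load_pickle_file(filepath):
    # same pattern as __load_file above, but deserializes a pickle instead of reading text
    with open(filepath, 'rb') as file:
        return filepath, pickle.load(file)
</code>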