In a Python program, I have to load a couple of thousand files and I want to cache them. I am able to create the cache using joblib.Memory, but the program loads the data faster when I disable the caching.
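For context, the caching pattern I am testing is just joblib's Memory.cache wrapper around a file-reading function. A minimal, standalone sketch (the cache directory and file path below are placeholders):
<code>import time
import joblib

memory = joblib.Memory("./joblib_cache", verbose=0)  # placeholder cache directory

@memory.cache
def read_file(filepath):
    with open(filepath, 'r') as f:
        return f.read()

if __name__ == '__main__':
    path = "dummy_file.txt"  # placeholder file path
    t0 = time.time()
    read_file(path)   # first call: reads the file and writes the result to the cache
    t1 = time.time()
    read_file(path)   # second call: result is loaded back from the on-disk cache
    t2 = time.time()
    print(f"first call: {t1 - t0:.4f}s, second call: {t2 - t1:.4f}s")
</code>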
Steps to run the MWE:
- Download my dummy files from GitHub.
- Set the path to the dummy-file directory and a path to any directory for storing the cache in the variables dummy_files_basepath and cache_dir.

My MWE:
<code>import os
import sys
from concurrent.futures import ThreadPoolExecutor

import joblib


def __load_file(filepath):
    with open(filepath, 'r') as file:
        return filepath, file.read()


def _load_files_data(basepath, filenames, enable_cache=False, cache_dir=None):
    if enable_cache:
        # wrap the loader with joblib's on-disk cache
        memory = joblib.Memory(cache_dir, verbose=0)
        load_file_data = memory.cache(__load_file)
        with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
            data = list(executor.map(lambda filename: load_file_data(os.path.join(basepath, filename)), filenames))
    else:
        with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
            data = list(executor.map(lambda filename: __load_file(os.path.join(basepath, filename)), filenames))
    return data


class FilesLoader:

    @classmethod
    def load_files_data(cls, basepath, enable_cache=False, cache_dir=None):
        # find the files with extension .txt
        filenames = os.listdir(basepath)
        filenames = [filename for filename in filenames if filename.endswith('.txt')]
        # load the data using multiple threads
        data = _load_files_data(basepath, filenames, enable_cache, cache_dir)
        return data


####----------------------------------------Sample Run------------------------------------####
if __name__ == '__main__':
    dummy_files_basepath = "SET_PATH_TO_MY_DUMMY_FILES"
    cache_dir = "SET_ANY_PATH"

    # record the time taken to load the data
    import time
    start = time.time()
    data = FilesLoader.load_files_data(dummy_files_basepath, enable_cache=True, cache_dir=cache_dir)
    print(f"Time taken to load the data: {time.time() - start}")
    print(len(data))
    print(f"First data filepath: {data[0][0]}")
</code>
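For comparison, this is roughly how I time the two modes against each other (a sketch; FilesLoader is the class from the MWE above, and the paths are placeholders):
<code>import time

dummy_files_basepath = "SET_PATH_TO_MY_DUMMY_FILES"
cache_dir = "SET_ANY_PATH"

# run without caching
start = time.time()
FilesLoader.load_files_data(dummy_files_basepath, enable_cache=False)
print(f"without cache: {time.time() - start:.2f}s")

# run with caching, cold cache (first run has to write every cache entry)
start = time.time()
FilesLoader.load_files_data(dummy_files_basepath, enable_cache=True, cache_dir=cache_dir)
print(f"with cache (cold): {time.time() - start:.2f}s")

# run with caching, warm cache (second run reads the entries back from disk)
start = time.time()
FilesLoader.load_files_data(dummy_files_basepath, enable_cache=True, cache_dir=cache_dir)
print(f"with cache (warm): {time.time() - start:.2f}s")
</code>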
Question: Why is loading the data slower when caching is enabled? In this example I am loading .txt files, but in my actual program I have .pkl files.
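For reference, the loader in my actual program looks roughly like this; only the cached function changes (a sketch, not the exact code):
<code>import pickle

def __load_pickle_file(filepath):
    # same pattern as __load_file above, but deserializes a pickle instead of reading text
    with open(filepath, 'rb') as file:
        return filepath, pickle.load(file)
</code>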