In the following code I have the three functions get_dataset1, get_dataset2 and get_dataset3 that are all very similar, only differ when they call len(dataset)
and os.path.join = tmp
. The functions get_dataset1 and get_dataset3 behave as attended, they load a dataset and it has a length greater 0. However, in the case of get_dataset2, dataset has length 0. Why is that? Clearly, pinecone’s load_dataset
and Dataset
class have some strange behavior, potentially a bug.
import copy
import os
import time
from pinecone_datasets import load_dataset
datasetName = "langchain-python-docs-text-embedding-ada-002"
def get_dataset1():
os.path.join = lambda *s: "/".join(s) # pinecone bug workaround
dataset = load_dataset(datasetName)
print("Dataset loaded:", len(dataset) != 0) # dataset has length greater than 0
def get_dataset2():
os.path.join = lambda *s: "/".join(s) # pinecone bug workaround
dataset = load_dataset(datasetName)
os.path.join = tmp
print("Dataset loaded:", len(dataset) != 0) # dataset has length 0
def get_dataset3():
os.path.join = lambda *s: "/".join(s) # pinecone bug workaround
dataset = load_dataset(datasetName)
print("Dataset loaded:", len(dataset) != 0) # dataset has length greater than 0
os.path.join = tmp
print("Dataset loaded:", len(dataset) != 0) # dataset has length greater than 0
def main():
get_dataset1()
get_dataset2()
get_dataset3()
if __name__ == "__main__":
tmp = copy.deepcopy(os.path.join)
main()