I’m loading some data using the MemoryMappedTensor class from tensordict (running PyTorch 2.4.0+cu118) and getting a Bus Error. I’m running the code under WSL2 on Windows 10:
- Operating System: Windows 10 Pro 64-bit (10.0, Build 19045) (19041.vb_release.191206-1406)
- Language: English (Regional Setting: English)
- System Manufacturer: ASUS
- BIOS: 3603 (type: UEFI)
- Processor: AMD Ryzen 9 5950X 16-Core Processor (32 CPUs), ~3.4GHz
- Memory: 65536MB RAM
- Available OS Memory: 65436MB RAM
- Page File: 7675MB used, 67487MB available
- DirectX Version: DirectX 12
- Card name: NVIDIA TITAN RTX
- Manufacturer: NVIDIA
- Chip type: NVIDIA TITAN RTX
- DAC type: Integrated RAMDAC
- Display Memory: 56985 MB
- Dedicated Memory: 24268 MB
- Shared Memory: 32717 MB
It seems most likely to me that this Bus Error is being caused by insufficient memory; however, I can’t understand why this would be an issue when all of my operations are on memory-mapped data, and I have plenty of memory to go around.
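(To test the insufficient-memory theory, I'm planning to log the process's resident memory once per batch. Below is a minimal sketch of that instrumentation; psutil is an extra dependency I'd add just for this check, and log_memory is a helper name of my own.)

import psutil

_proc = psutil.Process()

def log_memory(step: int) -> None:
    # Resident set size of this process vs. the RAM the OS still reports as available.
    rss_gb = _proc.memory_info().rss / 1024 ** 3
    avail_gb = psutil.virtual_memory().available / 1024 ** 3
    print(f"step {step}: rss={rss_gb:.2f} GiB, available={avail_gb:.2f} GiB")

I'd call log_memory(i) at the end of each iteration of the loop below.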
Here is the loop in which the error occurs:
# Populate the empty tensors from .npy files on disk.
batch = num_output_sensors // 4
dl = torch.utils.data.DataLoader(disk_dataset, batch_size=batch, num_workers=0)
i = 0
pbar = tqdm.tqdm(total=len(disk_dataset))
for c1, c2, f, kappa, output, sensor_coords in dl:
    _batch = c1.shape[0]
    pbar.update(_batch)
    #
    # =============
    # PROBLEM CODE:
    # =============
    #
    # data[i : i + _batch] = cls(
    #     f=f,
    #     c1=c1,
    #     c2=c2,
    #     kappa=kappa,
    #     output=output,
    #     sensor_coords=sensor_coords,
    #     batch_size=[_batch]
    # )
    i += _batch
and here it is in context:
from pathlib import Path

import numpy as np
import torch
import tqdm
import yaml
from tensordict import MemoryMappedTensor, tensorclass


@tensorclass
class MemoryMappedTensorDataset:
    # Tensors to be memmapped
    # =======================
    f: torch.Tensor
    kappa: torch.Tensor
    c1: torch.Tensor
    c2: torch.Tensor
    output: torch.Tensor
    sensor_coords: torch.Tensor

    # Dict keys for metadata
    # ======================
    NUM_DATA = "num_data"
    NUM_INPUT_SENSORS_2D = "num_input_sensors_2D"
    NUM_OUTPUT_SENSORS_1D = "num_output_sensors"
    NO_VELOCITY = "no_velocity"
    ALL_KEYWORDS = [NUM_DATA, NUM_INPUT_SENSORS_2D, NUM_OUTPUT_SENSORS_1D, NO_VELOCITY]
    @classmethod
    def from_data_directory(cls, data_dir: Path):
        ( num_data,
          num_input_sensors_2D,
          num_input_sensors_total,
          num_output_sensors,
          no_velocity ) = cls._extract_meta_data(data_dir)

        # Fix shape of data tensors
        # NOTE: If there is no velocity, c1 and c2 are identically zero;
        #       however, the current implementation still passes these
        #       variables. So for now, I'll include them as a constant,
        #       stored once per data point
        # NOTE: output_shape includes data for u, du/dx, and du/dy
        input_shape_constant = (num_data, 1)
        input_shape_2D = (num_data, *num_input_sensors_2D)
        input_shape_flattened = (num_data, num_input_sensors_total, 1)
        output_shape = (num_data, 3)
        coord_shape = (num_data, 2)
        c_shape = input_shape_constant if no_velocity else input_shape_2D

        # Creates empty memmapped tensors
        data = cls(
            f = MemoryMappedTensor.empty(input_shape_flattened, dtype=torch.float),
            kappa = MemoryMappedTensor.empty(input_shape_2D, dtype=torch.float),
            c1 = MemoryMappedTensor.empty(c_shape, dtype=torch.float),
            c2 = MemoryMappedTensor.empty(c_shape, dtype=torch.float),
            output = MemoryMappedTensor.empty(output_shape, dtype=torch.float),
            sensor_coords = MemoryMappedTensor.empty(coord_shape, dtype=torch.float),
            batch_size=[num_data]
        )
        # Locks the tensorclass and ensures that is_memmap() will return True.
        data.memmap_()

        # Collect data from disk
        disk_dataset = cls._extract_data(data_dir)

        # Populate the empty tensors from .npy files on disk.
        batch = num_output_sensors // 4
        dl = torch.utils.data.DataLoader(disk_dataset, batch_size=batch, num_workers=0)
        i = 0
        pbar = tqdm.tqdm(total=len(disk_dataset))
        for c1, c2, f, kappa, output, sensor_coords in dl:
            _batch = c1.shape[0]
            pbar.update(_batch)
            #
            # =============
            # PROBLEM CODE:
            # =============
            #
            # data[i : i + _batch] = cls(
            #     f=f,
            #     c1=c1,
            #     c2=c2,
            #     kappa=kappa,
            #     output=output,
            #     sensor_coords=sensor_coords,
            #     batch_size=[_batch]
            # )
            i += _batch

        return disk_dataset
    @classmethod
    def _extract_data(cls, data_dir: Path):
        npy_dir = data_dir / "npy_data"
        if not npy_dir.exists():
            raise RuntimeError(
                "No numpy data directory found for the dataset at %s. "
                "Aborting!" % npy_dir.as_posix()
            )
        for filename in ["c1", "c2", "f", "kappa", "output"]:
            data_path = npy_dir / ("%s.npy" % filename)
            if not data_path.exists():
                raise RuntimeError(
                    "No numpy data file found at %s. "
                    "Aborting!" % data_path.as_posix()
                )
        c1 = torch.from_numpy(np.load(npy_dir / "c1.npy", mmap_mode="r+"))
        c2 = torch.from_numpy(np.load(npy_dir / "c2.npy", mmap_mode="r+"))
        f = torch.from_numpy(np.load(npy_dir / "f.npy", mmap_mode="r+"))
        kappa = torch.from_numpy(np.load(npy_dir / "kappa.npy", mmap_mode="r+"))
        output = torch.from_numpy(np.load(npy_dir / "output.npy", mmap_mode="r+"))
        sensor_coords = torch.from_numpy(np.load(npy_dir / "sensor_coords.npy", mmap_mode="r+"))
        return torch.utils.data.TensorDataset(
            c1,
            c2,
            f,
            kappa,
            output,
            sensor_coords
        )
    @classmethod
    def _extract_meta_data(cls, data_dir: Path):
        meta_data_file_path = data_dir / "meta.yaml"
        if not meta_data_file_path.exists():
            raise RuntimeError(
                "No metadata found for the dataset at %s. "
                "Aborting!" % meta_data_file_path.as_posix()
            )
        meta_data: dict
        with meta_data_file_path.open(mode="r") as yml_file:
            meta_data = yaml.safe_load(yml_file)
        try:
            num_data = meta_data[cls.NUM_DATA]
            num_input_sensors_2D = meta_data[cls.NUM_INPUT_SENSORS_2D]
            num_output_sensors_1D = meta_data[cls.NUM_OUTPUT_SENSORS_1D]
            no_velocity = meta_data[cls.NO_VELOCITY]
        except KeyError:
            tabbed_keywords = "\n\t- ".join([''] + cls.ALL_KEYWORDS)
            raise RuntimeError(
                f"One or more of the required metadata are missing from {meta_data_file_path.as_posix()}:"
                f"{tabbed_keywords}\nAborting!"
            )
        try:
            num_input_sensors_total = num_input_sensors_2D[0] * num_input_sensors_2D[1]
        except IndexError:
            raise RuntimeError(
                f"Expected {cls.NUM_INPUT_SENSORS_2D} to be a tuple in {meta_data_file_path.as_posix()}, "
                f"but got {str(num_input_sensors_2D)} instead.\nAborting!"
            )
        return num_data, num_input_sensors_2D, num_input_sensors_total, num_output_sensors_1D, no_velocity
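For what it's worth, one workaround I'm considering is to skip constructing a new tensorclass per batch and instead write each batch directly into the individual memmapped fields. A rough, untested sketch of what the population loop would become (same variable names as above):

# Hypothetical alternative: write each batch directly into the memmapped
# fields instead of building a cls(...) instance per batch (untested).
i = 0
for c1, c2, f, kappa, output, sensor_coords in dl:
    _batch = c1.shape[0]
    pbar.update(_batch)
    data.f[i : i + _batch] = f
    data.c1[i : i + _batch] = c1
    data.c2[i : i + _batch] = c2
    data.kappa[i : i + _batch] = kappa
    data.output[i : i + _batch] = output
    data.sensor_coords[i : i + _batch] = sensor_coords
    i += _batch

I don't yet know whether this sidesteps the problem or just moves it, since the writes still go to the same memory-mapped storage.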
The bus error only occurs if I uncomment the “PROBLEM CODE” that I’ve called out above, and it always occurs at the 80% mark. If it is caused by insufficient memory, I’m at a loss as to why:
- Shouldn’t the memory use be fairly constant between iterations? Why would it increase?
- And for that matter, the memory required to store an individual batch should not be especially large; is there a reason the memory use might be larger than expected?
- Is it possible the Bus Error is being caused not by insufficient memory, but by insufficient virtual disk space on WSL2? (I sketch how I plan to check this after this list.) If so, why wouldn’t that happen when I first instantiate the MemoryMappedTensors?
- Is there another obvious possibility that I’m overlooking?
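Regarding the disk-space question above, here is roughly how I plan to check it: estimate the on-disk footprint of the six memmapped tensors (assuming float32, i.e. 4 bytes per element) and compare it against the free space where the backing files are created. I'm assuming the backing files land in the system temp directory since I don't pass explicit filenames; that part is unverified, and estimate_memmap_bytes is a helper name of my own.

import math
import shutil
import tempfile

def estimate_memmap_bytes(num_data, num_input_sensors_2D, num_input_sensors_total):
    # Rough on-disk footprint of the six float32 memmapped tensors.
    elems = (
        num_data * num_input_sensors_total * 1             # f
        + num_data * math.prod(num_input_sensors_2D)        # kappa
        + 2 * num_data * math.prod(num_input_sensors_2D)    # c1, c2 (worst case: full 2D shape)
        + num_data * 3                                      # output
        + num_data * 2                                      # sensor_coords
    )
    return elems * 4  # float32 = 4 bytes per element

# Compare against free space where I believe the memmap backing files are created.
needed = estimate_memmap_bytes(num_data, num_input_sensors_2D, num_input_sensors_total)
free = shutil.disk_usage(tempfile.gettempdir()).free
print(f"need ~{needed / 1024**3:.1f} GiB, free {free / 1024**3:.1f} GiB")

If the estimate comes out close to (or above) the free space, that would point to the WSL2 virtual disk rather than RAM.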