error
RuntimeError: ptxas failed with error code 4294967295
log (already streamlined)
E1207 21:35:09.316000 19144 site-packages\torch\_inductor\runtime\triton_heuristics.py:445] Triton compilation failed: triton_red_fused__to_copy_add_arange_embedding_native_layer_norm_0
E1207 21:35:09.316000 19144 site-packages\torch\_inductor\runtime\triton_heuristics.py:445] def triton_(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
E1207 21:35:09.316000 19144 site-packages\torch\_inductor\runtime\triton_heuristics.py:445] xnumel = 24576
E1207 21:35:09.316000 19144 site-packages\torch\_inductor\runtime\triton_heuristics.py:445] rnumel = 768
E1207 21:35:09.316000 19144 site-packages\torch\_inductor\runtime\triton_heuristics.py:445] xoffset = tl.program_id(0) * XBLOCK
E1207 21:35:09.316000 19144 site-packages\torch\_inductor\runtime\triton_heuristics.py:445] xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
E1207 21:35:09.316000 19144 site-packages\torch\_inductor\runtime\triton_heuristics.py:445] xmask = tl.full([XBLOCK, RBLOCK], True, tl.int1)
E1207 21:35:09.316000 19144 site-packages\torch\_inductor\runtime\triton_heuristics.py:445] rbase = tl.arange(0, RBLOCK)[None, :]
E1207 21:35:09.316000 19144 site-packages\torch\_inductor\runtime\triton_heuristics.py:445] x3 = xindex
E1207 21:35:09.316000 19144 site-packages\torch\_inductor\runtime\triton_heuristics.py:445] tmp0 = tl.load(in_ptr0 + (x3), None, eviction_policy='evict_last')
E1207 21:35:09.316000 19144 site-packages\torch\_inductor\runtime\triton_heuristics.py:445] tmp10_mean = tl.zeros([XBLOCK, RBLOCK], tl.float32)
E1207 21:35:09.316000 19144 site-packages\torch\_inductor\runtime\triton_heuristics.py:445] for roffset in range(0, rnumel, RBLOCK):
E1207 21:35:09.316000 19144 site-packages\torch\_inductor\runtime\triton_heuristics.py:445] rindex = roffset + rbase
E1207 21:35:09.316000 19144 site-packages\torch\_inductor\runtime\triton_heuristics.py:445] rmask = rindex < rnumel
E1207 21:35:09.316000 19144 site-packages\torch\_inductor\runtime\triton_heuristics.py:445] tmp7 = tl.load(in_ptr2 + (r2 + (768*x0)), rmask, eviction_policy='evict_last', other=0.0)
E1207 21:35:09.316000 19144 site-packages\torch\_inductor\runtime\triton_heuristics.py:445] tmp10_mean_next, tmp10_m2_next, tmp10_weight_next = triton_helpers.welford_reduce(tmp9, tmp10_mean, tmp10_m2, tmp10_weight, roffset == 0)
E1207 21:35:09.316000 19144 site-packages\torch\_inductor\runtime\triton_heuristics.py:445] metadata: {'signature': {0: '*i64', 1: '*fp32', 2: '*fp32', 3: '*fp32', 4: '*fp16', 5: 'i32', 6: 'i32'}, 'device': 0, 'constants': {7: 1, 8: 1024}, 'configs': [AttrsDescriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())], 'device_type': 'cuda', 'num_warps': 8, 'num_stages': 1, 'debug': True, 'cc': 75}
E1207 21:35:09.316000 19144 site-packages\torch\_inductor\runtime\triton_heuristics.py:445] Traceback (most recent call last):
E1207 21:35:09.316000 19144 site-packages\torch\_inductor\runtime\triton_heuristics.py:445] File "D:\Code\Programs\anaconda3\Lib\site-packages\triton\backends\nvidia\compiler.py", line 295, in make_cubin
E1207 21:35:09.316000 19144 site-packages\torch\_inductor\runtime\triton_heuristics.py:445] subprocess.run(ptxas_cmd, check=True, close_fds=False, stderr=flog)
E1207 21:35:09.316000 19144 site-packages\torch\_inductor\runtime\triton_heuristics.py:445] File "D:\Code\Programs\anaconda3\Lib\subprocess.py", line 571, in run
E1207 21:35:09.316000 19144 site-packages\torch\_inductor\runtime\triton_heuristics.py:445] raise CalledProcessError(retcode, process.args,
E1207 21:35:09.316000 19144 site-packages\torch\_inductor\runtime\triton_heuristics.py:445] subprocess.CalledProcessError: Command '['C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6\bin\ptxas.exe', '-lineinfo', '-v', '--gpu-name=sm_75', 'C:\Users\17082\AppData\Local\Temp\tmpm5tmi0vi.ptx', '-o', 'C:\Users\17082\AppData\Local\Temp\tmpm5tmi0vi.ptx.o']' returned non-zero exit status 4294967295.
E1207 21:35:09.316000 19144 site-packages\torch\_inductor\runtime\triton_heuristics.py:445] During handling of the above exception, another exception occurred:
E1207 21:35:09.316000 19144 site-packages\torch\_inductor\runtime\triton_heuristics.py:445] Traceback (most recent call last):
E1207 21:35:09.316000 19144 site-packages\torch\_inductor\runtime\triton_heuristics.py:445] File "D:\Code\Programs\anaconda3\Lib\site-packages\torch\_inductor\runtime\triton_heuristics.py", line 443, in _precompile_config
E1207 21:35:09.316000 19144 site-packages\torch\_inductor\runtime\triton_heuristics.py:445] binary = triton.compile(*compile_args, **compile_kwargs)
E1207 21:35:09.316000 19144 site-packages\torch\_inductor\runtime\triton_heuristics.py:445] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E1207 21:35:09.316000 19144 site-packages\torch\_inductor\runtime\triton_heuristics.py:445] File "D:\Code\Programs\anaconda3\Lib\site-packages\triton\compiler\compiler.py", line 286, in compile
E1207 21:35:09.316000 19144 site-packages\torch\_inductor\runtime\triton_heuristics.py:445] next_module = compile_ir(module, metadata)
E1207 21:35:09.316000 19144 site-packages\torch\_inductor\runtime\triton_heuristics.py:445] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
E1207 21:35:09.316000 19144 site-packages\torch\_inductor\runtime\triton_heuristics.py:445] File "D:\Code\Programs\anaconda3\Lib\site-packages\triton\backends\nvidia\compiler.py", line 329, in <lambda>
E1207 21:35:09.316000 19144 site-packages\torch\_inductor\runtime\triton_heuristics.py:445] stages["cubin"] = lambda src, metadata: self.make_cubin(src, metadata, options, self.capability)
E1207 21:35:09.316000 19144 site-packages\torch\_inductor\runtime\triton_heuristics.py:445] File "D:\Code\Programs\anaconda3\Lib\site-packages\triton\backends\nvidia\compiler.py", line 309, in make_cubin
E1207 21:35:09.316000 19144 site-packages\torch\_inductor\runtime\triton_heuristics.py:445] raise RuntimeError(f'ptxas failed with error code {e.returncode}: \n{log}')
E1207 21:35:09.316000 19144 site-packages\torch\_inductor\runtime\triton_heuristics.py:445] RuntimeError: ptxas failed with error code 4294967295:
W1207 21:35:09.332000 19144 site-packages\torch\_dynamo\convert_frame.py:1125] WON'T CONVERT forward D:\code\poetry\lib\model.py line 170
W1207 21:35:09.332000 19144 site-packages\torch\_dynamo\convert_frame.py:1125] due to:
W1207 21:35:09.332000 19144 site-packages\torch\_dynamo\convert_frame.py:1125] Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information
complete log
nvidia-smi
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 566.14 Driver Version: 566.14 CUDA Version: 12.7 |
|-----------------------------------------+------------------------+----------------------+
| GPU Name Driver-Model | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA T600 WDDM | 00000000:01:00.0 On | N/A |
| 41% 53C P8 N/A / 41W | 704MiB / 4096MiB | 3% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+
nvcc -V
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Wed_Oct_30_01:18:48_Pacific_Daylight_Time_2024
Cuda compilation tools, release 12.6, V12.6.85
Build cuda_12.6.r12.6/compiler.35059454_0
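A quick check of what PyTorch itself sees for this GPU and toolchain (a minimal sketch, not output pasted from a run; the comments are what I expect for a T600):

import torch
print(torch.__version__)                    # installed torch build, 2.5.1+cu124 here
print(torch.version.cuda)                   # CUDA version the torch wheel was built against
print(torch.cuda.get_device_name(0))        # NVIDIA T600
print(torch.cuda.get_device_capability(0))  # expected (7, 5), i.e. the sm_75 target ptxas is given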
command
python train.py
train.py
import os
import time
import math
import pickle
from contextlib import nullcontext
import numpy as np
import torch
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group
from lib.model import GPTConfig, GPT
import torch._dynamo
torch._dynamo.config.suppress_errors = True
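# suppress_errors makes TorchDynamo fall back to eager execution when backend compilation
# fails; the failure is still reported, which is the "WON'T CONVERT" warning in the log above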
out_dir = 'out-poetry'
eval_interval = 2000
log_interval = 1
eval_iters = 200
eval_only = False
always_save_checkpoint = True
init_from = 'scratch'
wandb_log = False
wandb_project = 'owt'
wandb_run_name = 'gpt2'
dataset = 'poetry'
gradient_accumulation_steps = 5 * 8
batch_size = 12
block_size = 2048
n_layer = 12
n_head = 12
n_embd = 768
dropout = 0.0
bias = False
learning_rate = 6e-4
max_iters = 600000
weight_decay = 1e-1
beta1 = 0.9
beta2 = 0.95
grad_clip = 1.0
decay_lr = True
warmup_iters = 2000
lr_decay_iters = 600000
min_lr = 6e-5
backend = 'nccl'
device = 'cuda'
#dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16'
dtype = 'float16'
#compile = True
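# note: with the line above commented out, the name `compile` checked further down
# (`if compile:`) falls back to the Python builtin, which is truthy, so torch.compile()
# still runs unless lib/configurator.py assigns it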
config_keys = [k for k,v in globals().items() if not k.startswith('_') and isinstance(v, (int, float, bool, str))]
exec(open('lib/configurator.py').read())
config = {k: globals()[k] for k in config_keys}
ddp = int(os.environ.get('RANK', -1)) != -1
if ddp:
init_process_group(backend=backend)
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
device = f'cuda:{ddp_local_rank}'
torch.cuda.set_device(device)
master_process = ddp_rank == 0
seed_offset = ddp_rank
assert gradient_accumulation_steps % ddp_world_size == 0
gradient_accumulation_steps //= ddp_world_size
else:
master_process = True
seed_offset = 0
ddp_world_size = 1
tokens_per_iter = gradient_accumulation_steps * ddp_world_size * batch_size * block_size
print(f"tokens per iteration will be: {tokens_per_iter:,}")
if master_process:
os.makedirs(out_dir, exist_ok=True)
torch.manual_seed(1337 + seed_offset)
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
device_type = 'cuda' if 'cuda' in device else 'cpu'
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)
data_dir = os.path.join('data', dataset)
def get_batch(split):
if split == 'train':
data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
else:
data = np.memmap(os.path.join(data_dir, 'val.bin'), dtype=np.uint16, mode='r')
ix = torch.randint(len(data) - block_size, (batch_size,))
x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
if device_type == 'cuda':
x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
else:
x, y = x.to(device), y.to(device)
return x, y
iter_num = 0
best_val_loss = 1e9
meta_path = os.path.join(data_dir, 'meta.pkl')
meta_vocab_size = None
if os.path.exists(meta_path):
with open(meta_path, 'rb') as f:
meta = pickle.load(f)
meta_vocab_size = meta['vocab_size']
print(f"found vocab_size = {meta_vocab_size} (inside {meta_path})")
model_args = dict(n_layer=n_layer, n_head=n_head, n_embd=n_embd, block_size=block_size,
bias=bias, vocab_size=None, dropout=dropout)
if init_from == 'scratch':
print("Initializing a new model from scratch")
if meta_vocab_size is None:
print("defaulting to vocab_size of GPT-2 to 50304 (50257 rounded up for efficiency)")
model_args['vocab_size'] = meta_vocab_size if meta_vocab_size is not None else 50304
gptconf = GPTConfig(**model_args)
model = GPT(gptconf)
elif init_from == 'resume':
print(f"Resuming training from {out_dir}")
ckpt_path = os.path.join(out_dir, 'ckpt.pt')
checkpoint = torch.load(ckpt_path, map_location=device)
checkpoint_model_args = checkpoint['model_args']
for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
model_args[k] = checkpoint_model_args[k]
gptconf = GPTConfig(**model_args)
model = GPT(gptconf)
state_dict = checkpoint['model']
unwanted_prefix = '_orig_mod.'
for k,v in list(state_dict.items()):
if k.startswith(unwanted_prefix):
state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
model.load_state_dict(state_dict)
iter_num = checkpoint['iter_num']
best_val_loss = checkpoint['best_val_loss']
elif init_from.startswith('gpt2'):
print(f"Initializing from OpenAI GPT-2 weights: {init_from}")
override_args = dict(dropout=dropout)
model = GPT.from_pretrained(init_from, override_args)
for k in ['n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size']:
model_args[k] = getattr(model.config, k)
if block_size < model.config.block_size:
model.crop_block_size(block_size)
model_args['block_size'] = block_size
model.to(device)
#scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))
# out of date (torch.cuda.amp.GradScaler is deprecated), replaced with torch.amp.GradScaler below
scaler = torch.amp.GradScaler('cuda', enabled=(dtype == 'float16'))
optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type)
if init_from == 'resume':
optimizer.load_state_dict(checkpoint['optimizer'])
checkpoint = None
if compile:
print("compiling the model... (takes a ~minute)")
unoptimized_model = model
model = torch.compile(model)
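    # torch.compile only wraps the model here; the actual Triton/ptxas compilation
    # happens on the first forward pass, which is where the error in the log above is raised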
if ddp:
model = DDP(model, device_ids=[ddp_local_rank])
@torch.no_grad()
def estimate_loss():
out = {}
model.eval()
for split in ['train', 'val']:
losses = torch.zeros(eval_iters)
for k in range(eval_iters):
X, Y = get_batch(split)
with ctx:
logits, loss = model(X, Y)
losses[k] = loss.item()
out[split] = losses.mean()
model.train()
return out
def get_lr(it):
if it < warmup_iters:
return learning_rate * it / warmup_iters
if it > lr_decay_iters:
return min_lr
decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
assert 0 <= decay_ratio <= 1
coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
return min_lr + coeff * (learning_rate - min_lr)
if wandb_log and master_process:
import wandb
wandb.init(project=wandb_project, name=wandb_run_name, config=config)
X, Y = get_batch('train')
t0 = time.time()
local_iter_num = 0
raw_model = model.module if ddp else model
running_mfu = -1.0
while True:
lr = get_lr(iter_num) if decay_lr else learning_rate
for param_group in optimizer.param_groups:
param_group['lr'] = lr
if iter_num % eval_interval == 0 and master_process:
losses = estimate_loss()
print(f"step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
if wandb_log:
wandb.log({
"iter": iter_num,
"train/loss": losses['train'],
"val/loss": losses['val'],
"lr": lr,
"mfu": running_mfu*100,
})
if losses['val'] < best_val_loss or always_save_checkpoint:
best_val_loss = losses['val']
if iter_num > 0:
checkpoint = {
'model': raw_model.state_dict(),
'optimizer': optimizer.state_dict(),
'model_args': model_args,
'iter_num': iter_num,
'best_val_loss': best_val_loss,
'config': config,
}
print(f"saving checkpoint to {out_dir}")
torch.save(checkpoint, os.path.join(out_dir, 'ckpt.pt'))
if iter_num == 0 and eval_only:
break
for micro_step in range(gradient_accumulation_steps):
if ddp:
model.require_backward_grad_sync = (micro_step == gradient_accumulation_steps - 1)
with ctx:
logits, loss = model(X, Y)
loss = loss / gradient_accumulation_steps
X, Y = get_batch('train')
scaler.scale(loss).backward()
if grad_clip != 0.0:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
scaler.step(optimizer)
scaler.update()
optimizer.zero_grad(set_to_none=True)
t1 = time.time()
dt = t1 - t0
t0 = t1
if iter_num % log_interval == 0 and master_process:
lossf = loss.item() * gradient_accumulation_steps
if local_iter_num >= 5:
mfu = raw_model.estimate_mfu(batch_size * gradient_accumulation_steps, dt)
running_mfu = mfu if running_mfu == -1.0 else 0.9*running_mfu + 0.1*mfu
print(f"iter {iter_num}: loss {lossf:.4f}, time {dt*1000:.2f}ms, mfu {running_mfu*100:.2f}%")
iter_num += 1
local_iter_num += 1
if iter_num > max_iters:
break
if ddp:
destroy_process_group()
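To narrow this down, here is a minimal sketch (a hypothetical repro.py, not part of the project) that should exercise the same fused arange + embedding + LayerNorm kernel that Inductor reports in the log above; the kernel's xnumel = 24576 matches batch_size 12 × block_size 2048 and rnumel = 768 matches n_embd:

import torch
import torch.nn as nn

class Tiny(nn.Module):
    # token embedding + position embedding + LayerNorm, the pattern Inductor fuses
    def __init__(self, vocab_size=50304, n_embd=768, block_size=2048):
        super().__init__()
        self.wte = nn.Embedding(vocab_size, n_embd)
        self.wpe = nn.Embedding(block_size, n_embd)
        self.ln = nn.LayerNorm(n_embd)

    def forward(self, idx):
        pos = torch.arange(idx.size(1), device=idx.device)
        return self.ln(self.wte(idx) + self.wpe(pos))

model = torch.compile(Tiny().to('cuda'))
x = torch.randint(0, 50304, (12, 2048), device='cuda')
with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
    out = model(x)  # Inductor/Triton/ptxas compilation is triggered here
print(out.shape)

If this small script fails with the same ptxas exit status 4294967295, the problem presumably lies in the Triton/ptxas toolchain on this machine rather than in the model code.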
hardware info
GPU: NVIDIA T600 4GB
System: Windows 11 Pro 24H2 (build 26100.2314), Windows Feature Experience Pack 1000.26100.32.0
Memory: KF3600C17D4/8GX ×4 (32GB total), DDR4 overclocked to 4000MHz
CPU: 11th Gen Intel(R) Core(TM) i5-11600KF
pytorch
Name: torch
Version: 2.5.1+cu124
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: [email protected]
License: BSD-3-Clause
Location: D:\Code\Programs\anaconda3\Lib\site-packages
Requires: filelock, fsspec, jinja2, networkx, setuptools, sympy, typing-extensions
Required-by: torchaudio, torchvision
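If it helps, a full environment report can be generated with PyTorch's built-in collector (output omitted here):

python -m torch.utils.collect_env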