Skip to content

Error when training with Mosaic-Bert #447

@naveenkumar2703

Description

@naveenkumar2703

I used the Docker image from the README and installed the dependencies from requirements.txt. One difference is that I'm using Singularity instead of Docker.

I get the following error only when running main.py with mosaic-bert-base-uncased.yaml (hf_bert works fine).

Here is the error I see after tokenization. I would appreciate any guidance you can give me. Thanks for the amazing work!

Traceback (most recent call last):
File "<string>", line 21, in _fwd_kernel
KeyError: ('2-.-0-.-0-d82511111ad128294e9d31a6ac684238-7929002797455b30efce6e41eddc6b57-3aa563e00c5c695dd945e23b09a86848-d962222789c30252d492a16cca3bf467-ff946bd4b3b4a4cbdf8cedc6e1c658e0-5c5e32ff210f3b7f56c98ca29917c25e-06f0df2d61979d629033f4a22eff5198-0dd03b0bd512a184b3512b278d9dfa59-d35ab04ae841e2714a253c523530b071', (torch.bfloat16, torch.bfloat16, torch.bfloat16, torch.float32, torch.bfloat16, torch.float32, torch.float32, 'fp32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32'), ('matrix', False, 64, True, True, True, 128, 128), (True, True, True, True, True, True, True, (False,), (True, False), (True, False), (True, False), (True, False), (True, False), (True, False), (True, False), (True, False), (True, False), (True, False), (True, False), (True, False), (True, False), (True, False), (True, False), (False, False), (True, False), (True, False), (True, False), (True, False), (False, False), (False, False)))

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
File "/u/user/examples/examples/benchmarks/bert/main.py", line 269, in <module>
main(cfg)
File "/u/user/examples/examples/benchmarks/bert/main.py", line 256, in main
trainer.fit()
File "/u/user/.local/lib/python3.10/site-packages/composer/trainer/trainer.py", line 1766, in fit
self._train_loop()
File "/u/user/.local/lib/python3.10/site-packages/composer/trainer/trainer.py", line 1940, in _train_loop
total_loss_dict = self._train_batch(use_grad_scaling)
File "/u/user/.local/lib/python3.10/site-packages/composer/trainer/trainer.py", line 2115, in _train_batch
optimizer.step(closure=lambda **kwargs: self._train_microbatches(
File "/usr/lib/python3/dist-packages/torch/optim/lr_scheduler.py", line 68, in wrapper
return wrapped(*args, **kwargs)
File "/usr/lib/python3/dist-packages/torch/optim/optimizer.py", line 140, in wrapper
out = func(*args, **kwargs)
File "/usr/lib/python3/dist-packages/torch/autograd/grad_mode.py", line 27, in decorate_context
return func(*args, **kwargs)
File "/u/user/.local/lib/python3.10/site-packages/composer/optim/decoupled_weight_decay.py", line 288, in step
loss = closure()
File "/u/user/.local/lib/python3.10/site-packages/composer/trainer/trainer.py", line 2115, in <lambda>
optimizer.step(closure=lambda **kwargs: self._train_microbatches(
File "/u/user/.local/lib/python3.10/site-packages/composer/trainer/trainer.py", line 2213, in _train_microbatches
microbatch_loss_dict = self._train_microbatch(use_grad_scaling, current_batch_size, is_final_microbatch)
File "/u/user/.local/lib/python3.10/site-packages/composer/trainer/trainer.py", line 2276, in _train_microbatch
self.state.outputs = self.state.model(self.state.batch)
File "/usr/lib/python3/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/u/user/.local/lib/python3.10/site-packages/composer/models/huggingface.py", line 314, in forward
output = self.model(**batch) # type: ignore (thirdparty)
File "/usr/lib/python3/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/u/user/examples/examples/benchmarks/bert/src/bert_layers.py", line 858, in forward
outputs = self.bert(
File "/usr/lib/python3/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/u/user/examples/examples/benchmarks/bert/src/bert_layers.py", line 677, in forward
encoder_outputs = self.encoder(
File "/usr/lib/python3/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/u/user/examples/examples/benchmarks/bert/src/bert_layers.py", line 533, in forward
hidden_states = layer_module(hidden_states,
File "/usr/lib/python3/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/u/user/examples/examples/benchmarks/bert/src/bert_layers.py", line 395, in forward
attention_output = self.attention(hidden_states, cu_seqlens, seqlen,
File "/usr/lib/python3/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/u/user/examples/examples/benchmarks/bert/src/bert_layers.py", line 307, in forward
self_output = self.self(input_tensor, cu_seqlens, max_s, indices,
File "/usr/lib/python3/dist-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/u/user/examples/examples/benchmarks/bert/src/bert_layers.py", line 241, in forward
attention = flash_attn_qkvpacked_func(qkv, bias)
File "/u/user/examples/examples/benchmarks/bert/src/flash_attn_triton.py", line 1021, in forward
o, lse, ctx.softmax_scale = _flash_attn_forward(
File "/u/user/examples/examples/benchmarks/bert/src/flash_attn_triton.py", line 826, in _flash_attn_forward
_fwd_kernel[grid]( # type: ignore
File "/u/user/.local/lib/python3.10/site-packages/triton/runtime/jit.py", line 106, in launcher
return self.run(*args, grid=grid, **kwargs)
File "/u/user/.local/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 86, in run
return self.fn.run(*args, num_warps=config.num_warps, num_stages=config.num_stages, **kwargs, **config.kwargs)
File "/u/user/.local/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 200, in run
return self.fn.run(*args, **kwargs)
File "<string>", line 41, in _fwd_kernel
File "/u/user/.local/lib/python3.10/site-packages/triton/compiler.py", line 1239, in compile
so = _build(fn.name, src_path, tmpdir)
File "/u/user/.local/lib/python3.10/site-packages/triton/compiler.py", line 1169, in _build
ret = subprocess.check_call(cc_cmd)
File "/usr/lib/python3.10/subprocess.py", line 364, in check_call
retcode = call(*popenargs, **kwargs)
File "/usr/lib/python3.10/subprocess.py", line 345, in call
with Popen(*popenargs, **kwargs) as p:
File "/usr/lib/python3.10/subprocess.py", line 971, in __init__
self._execute_child(args, executable, preexec_fn, close_fds,
File "/usr/lib/python3.10/subprocess.py", line 1863, in _execute_child
raise child_exception_type(errno_num, err_msg, err_filename)
FileNotFoundError: [Errno 2] No such file or directory: '/sw/spack/sys11-2023-03/apps/linux-rhel8-x86_64/gcc-8.5.0/gcc-11.4.0-yycklku/bin/gcc'

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions