Operator: aten._log_softmax.default
cnt: 1, ((T([127, 50257], f32), 1, False), {})
Operator: aten._log_softmax_backward_data.default
cnt: 1, ((T([127, 50257], f32), T([127, 50257], f32), 1, f32), {})
Operator: aten._softmax.default
cnt: 24, ((T([1, 16, 128, 128], f32), -1, False), {})
Operator: aten._softmax_backward_data.default
cnt: 24, ((T([1, 16, 128, 128], f32), T([1, 16, 128, 128], f32), -1, f32), {})
Operator: aten._to_copy.default
cnt: 48, ((T([1, 16, 128, 128], f16, stride=(262144, 128, 2048, 1)),), {'dtype': f32})
cnt: 24, ((T([1, 1, 128, 128], u8, stride=(4194304, 4194304, 2048, 1)),), {'dtype': torch.bool})
cnt: 24, ((T([], f32),), {'dtype': f32, 'layout': torch.strided, 'device': 'cuda'})
cnt: 24, ((T([1, 16, 128, 128], f32),), {'dtype': f16})
cnt: 1, ((T([1, 128, 50257], f16),), {'dtype': f32})
cnt: 1, ((T([1, 128, 50257], f32),), {'dtype': f16})
cnt: 1, ((T([], f32),), {'dtype': f16})
cnt: 1, ((T([], f16),), {'dtype': f32, 'layout': torch.strided, 'device': 'cuda'})
cnt: 1, ((T([1, 128, 50257], f32),), {'dtype': f16, 'layout': torch.strided, 'device': 'cuda'})
cnt: 24, ((T([1, 16, 128, 128], f16),), {'dtype': f32, 'layout': torch.strided, 'device': 'cuda'})
cnt: 24, ((T([1, 16, 128, 128], f32, stride=(262144, 16384, 1, 128)),), {'dtype': f16, 'layout': torch.strided, 'device': 'cuda'})
cnt: 24, ((T([1, 16, 128, 128], f32),), {'dtype': f16, 'layout': torch.strided, 'device': 'cuda'})
Operator: aten._unsafe_view.default
cnt: 72, ((T([128, 2048], f16), [1, 128, 2048]), {})
cnt: 24, ((T([16, 128, 128], f32), [1, 16, 128, 128]), {})
cnt: 24, ((T([16, 128, 128], f16), [1, 16, 128, 128]), {})
cnt: 1, ((T([128, 50257], f16), [1, 128, 50257]), {})
cnt: 48, ((T([1, 128, 16, 128], f16), [1, 128, 2048]), {})
Operator: aten.add.Tensor
cnt: 145, ((T([1, 128, 2048], f16), T([1, 128, 2048], f16)), {})
cnt: 72, ((T([1, 128, 8192], f16), T([1, 128, 8192], f16)), {})
cnt: 24, ((T([1, 128, 8192], f16), 1.0), {})
cnt: 1, ((T([50257, 2048], f16), T([50257, 2048], f16)), {})
Operator: aten.addmm.default
cnt: 24, ((T([2048], f16), T([128, 2048], f16), T([2048, 2048], f16, stride=(1, 2048))), {})
cnt: 24, ((T([8192], f16), T([128, 2048], f16), T([2048, 8192], f16, stride=(1, 2048))), {})
cnt: 24, ((T([2048], f16), T([128, 8192], f16), T([8192, 2048], f16, stride=(1, 8192))), {})
Operator: aten.bmm.default
cnt: 24, ((T([16, 128, 128], f32, stride=(128, 2048, 1)), T([16, 128, 128], f32, stride=(128, 1, 2048))), {})
cnt: 24, ((T([16, 128, 128], f16), T([16, 128, 128], f16, stride=(128, 2048, 1))), {})
cnt: 24, ((T([16, 128, 128], f16, stride=(16384, 1, 128)), T([16, 128, 128], f16, stride=(128, 2048, 1))), {})
cnt: 24, ((T([16, 128, 128], f16, stride=(128, 2048, 1)), T([16, 128, 128], f16, stride=(128, 1, 2048))), {})
cnt: 24, ((T([16, 128, 128], f32, stride=(128, 1, 2048)), T([16, 128, 128], f32)), {})
cnt: 24, ((T([16, 128, 128], f32), T([16, 128, 128], f32, stride=(128, 2048, 1))), {})
Operator: aten.clone.default
cnt: 2, ((T([1, 128], i64),), {})
Operator: aten.copy_.default
cnt: 2, ((T([1, 128], i64), T([1, 128], i64)), {})
Operator: aten.embedding.default
cnt: 1, ((T([50257, 2048], f16), T([1, 128], i64)), {})
cnt: 1, ((T([2048, 2048], f16), T([1, 128], i64)), {})
Operator: aten.embedding_dense_backward.default
cnt: 1, ((T([1, 128, 2048], f16), T([1, 128], i64), 2048, -1, False), {})
cnt: 1, ((T([1, 128, 2048], f16), T([1, 128], i64), 50257, -1, False), {})
Operator: aten.mm.default
cnt: 72, ((T([128, 2048], f16), T([2048, 2048], f16, stride=(1, 2048))), {})
cnt: 1, ((T([128, 2048], f16), T([2048, 50257], f16, stride=(1, 2048))), {})
cnt: 1, ((T([50257, 128], f16, stride=(1, 50257)), T([128, 2048], f16)), {})
cnt: 1, ((T([128, 50257], f16), T([50257, 2048], f16)), {})
cnt: 24, ((T([128, 2048], f16), T([2048, 8192], f16)), {})
cnt: 24, ((T([2048, 128], f16, stride=(1, 2048)), T([128, 8192], f16)), {})
cnt: 24, ((T([128, 8192], f16), T([8192, 2048], f16)), {})
cnt: 24, ((T([8192, 128], f16, stride=(1, 8192)), T([128, 2048], f16)), {})
cnt: 72, ((T([128, 2048], f16), T([2048, 2048], f16)), {})
cnt: 72, ((T([2048, 128], f16, stride=(1, 2048)), T([128, 2048], f16)), {})
cnt: 24, ((T([2048, 128], f16), T([128, 2048], f16)), {})
cnt: 24, ((T([128, 2048], f16, stride=(1, 128)), T([2048, 2048], f16)), {})
Operator: aten.mul.Scalar
cnt: 24, ((T([1, 128, 8192], f16), 3.0), {})
Operator: aten.mul.Tensor
cnt: 48, ((T([1, 128, 8192], f16), 0.5), {})
cnt: 48, ((T([1, 128, 8192], f16), 0.044715), {})
cnt: 48, ((T([1, 128, 8192], f16), 0.7978845608028654), {})
cnt: 96, ((T([1, 128, 8192], f16), T([1, 128, 8192], f16)), {})
Operator: aten.native_layer_norm.default
cnt: 49, ((T([1, 128, 2048], f16), [2048], T([2048], f16), T([2048], f16), 1e-05), {})
Operator: aten.native_layer_norm_backward.default
cnt: 49, ((T([1, 128, 2048], f16), T([1, 128, 2048], f16), [2048], T([1, 128, 1], f32), T([1, 128, 1], f32), T([2048], f16), T([2048], f16), [True, True, True]), {})
Operator: aten.nll_loss_backward.default
cnt: 1, ((T([], f32), T([127, 50257], f32), T([127], i64), None, 1, -100, T([], f32)), {})
Operator: aten.nll_loss_forward.default
cnt: 1, ((T([127, 50257], f32), T([127], i64), None, 1, -100), {})
Operator: aten.pow.Tensor_Scalar
cnt: 24, ((T([1, 128, 8192], f16), 3.0), {})
cnt: 24, ((T([1, 128, 8192], f16), 2.0), {})
Operator: aten.slice_backward.default
cnt: 1, ((T([1, 127, 50257], f32), [1, 127, 50257], 2, 0, 9223372036854775807, 1), {})
cnt: 1, ((T([1, 127, 50257], f32), [1, 128, 50257], 1, 0, -1, 1), {})
Operator: aten.sum.SymInt
cnt: 48, ((T([128, 2048], f16), [0], True), {})
cnt: 24, ((T([128, 8192], f16), [0], True), {})
Operator: aten.tanh.default
cnt: 24, ((T([1, 128, 8192], f16),), {})
Operator: aten.tanh_backward.default
cnt: 24, ((T([1, 128, 8192], f16), T([1, 128, 8192], f16)), {})
Operator: aten.where.self
cnt: 48, ((T([1, 1, 128, 128], b8), T([1, 16, 128, 128], f32), T([], f32)), {})
