module @compiled_flux_sampler {
  // ---- sampler input layers: img_in, time_in, guidance_in, vector_in, txt_in ----
  // Each global below is private, has no `mutable` keyword, and takes its
  // initial value from a `#stream.parameter.named` attribute in scope "model".
  // The parameter key is the symbol name without the `__auto.` prefix.
  // All tensors are f16 with fully static shapes.
  // NOTE(review): 2-D weights look like (out_features, in_features) — confirm
  // against the code that consumes them.

  // img_in: 3072x64 weight with a 3072 bias.
  util.global private @__auto.sampler.img_in.weight = #stream.parameter.named<"model"::"sampler.img_in.weight"> : tensor<3072x64xf16>
  util.global private @__auto.sampler.img_in.bias = #stream.parameter.named<"model"::"sampler.img_in.bias"> : tensor<3072xf16>
  // time_in: in_layer (3072x256) followed by out_layer (3072x3072), each with a 3072 bias.
  util.global private @__auto.sampler.time_in.in_layer.weight = #stream.parameter.named<"model"::"sampler.time_in.in_layer.weight"> : tensor<3072x256xf16>
  util.global private @__auto.sampler.time_in.in_layer.bias = #stream.parameter.named<"model"::"sampler.time_in.in_layer.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.time_in.out_layer.weight = #stream.parameter.named<"model"::"sampler.time_in.out_layer.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.time_in.out_layer.bias = #stream.parameter.named<"model"::"sampler.time_in.out_layer.bias"> : tensor<3072xf16>
  // guidance_in: same shapes as time_in.
  util.global private @__auto.sampler.guidance_in.in_layer.weight = #stream.parameter.named<"model"::"sampler.guidance_in.in_layer.weight"> : tensor<3072x256xf16>
  util.global private @__auto.sampler.guidance_in.in_layer.bias = #stream.parameter.named<"model"::"sampler.guidance_in.in_layer.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.guidance_in.out_layer.weight = #stream.parameter.named<"model"::"sampler.guidance_in.out_layer.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.guidance_in.out_layer.bias = #stream.parameter.named<"model"::"sampler.guidance_in.out_layer.bias"> : tensor<3072xf16>
  // vector_in: in_layer is 3072x768, out_layer is 3072x3072.
  util.global private @__auto.sampler.vector_in.in_layer.weight = #stream.parameter.named<"model"::"sampler.vector_in.in_layer.weight"> : tensor<3072x768xf16>
  util.global private @__auto.sampler.vector_in.in_layer.bias = #stream.parameter.named<"model"::"sampler.vector_in.in_layer.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.vector_in.out_layer.weight = #stream.parameter.named<"model"::"sampler.vector_in.out_layer.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.vector_in.out_layer.bias = #stream.parameter.named<"model"::"sampler.vector_in.out_layer.bias"> : tensor<3072xf16>
  // txt_in: 3072x4096 weight with a 3072 bias.
  util.global private @__auto.sampler.txt_in.weight = #stream.parameter.named<"model"::"sampler.txt_in.weight"> : tensor<3072x4096xf16>
  util.global private @__auto.sampler.txt_in.bias = #stream.parameter.named<"model"::"sampler.txt_in.bias"> : tensor<3072xf16>
  // ---- sampler.double_blocks.0 ----
  // 24 private globals, each initialised from a named parameter in scope
  // "model" whose key is the symbol name without the `__auto.` prefix.
  // All tensors are f16 with static shapes. The same set of entries is declared
  // for both the img_* and txt_* prefixes.
  // NOTE(review): 18432 = 6 * 3072 and 9216 = 3 * 3072 — presumably packed
  // modulation terms and a fused q/k/v projection; confirm against the exporter.

  // mod.lin weights and biases (img, then txt).
  util.global private @__auto.sampler.double_blocks.0.img_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.0.img_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.0.img_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.0.img_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.0.txt_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.0.txt_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.0.txt_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.0.txt_mod.lin.bias"> : tensor<18432xf16>
  // attn.qkv weights/biases and the 128-element query/key norm scales (img, then txt).
  util.global private @__auto.sampler.double_blocks.0.img_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.0.img_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.0.img_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.0.img_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.0.img_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.0.img_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.0.img_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.0.img_attn.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.0.txt_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.0.txt_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.0.txt_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.0.txt_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.0.txt_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.0.txt_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.0.txt_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.0.txt_attn.norm.key_norm.scale"> : tensor<128xf16>
  // img: attn.proj, then mlp.0 (12288x3072) and mlp.2 (3072x12288).
  util.global private @__auto.sampler.double_blocks.0.img_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.0.img_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.0.img_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.0.img_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.0.img_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.0.img_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.0.img_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.0.img_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.0.img_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.0.img_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.0.img_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.0.img_mlp.2.bias"> : tensor<3072xf16>
  // txt: attn.proj, then mlp.0 and mlp.2 with the same shapes as the img entries.
  util.global private @__auto.sampler.double_blocks.0.txt_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.0.txt_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.0.txt_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.0.txt_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.0.txt_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.0.txt_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.0.txt_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.0.txt_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.0.txt_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.0.txt_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.0.txt_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.0.txt_mlp.2.bias"> : tensor<3072xf16>
  // ---- sampler.double_blocks.1 ----
  // 24 private globals, each initialised from a named parameter in scope
  // "model" whose key is the symbol name without the `__auto.` prefix.
  // All tensors are f16 with static shapes. The same set of entries is declared
  // for both the img_* and txt_* prefixes.
  // NOTE(review): 18432 = 6 * 3072 and 9216 = 3 * 3072 — presumably packed
  // modulation terms and a fused q/k/v projection; confirm against the exporter.

  // mod.lin weights and biases (img, then txt).
  util.global private @__auto.sampler.double_blocks.1.img_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.1.img_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.1.img_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.1.img_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.1.txt_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.1.txt_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.1.txt_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.1.txt_mod.lin.bias"> : tensor<18432xf16>
  // attn.qkv weights/biases and the 128-element query/key norm scales (img, then txt).
  util.global private @__auto.sampler.double_blocks.1.img_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.1.img_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.1.img_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.1.img_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.1.img_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.1.img_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.1.img_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.1.img_attn.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.1.txt_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.1.txt_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.1.txt_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.1.txt_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.1.txt_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.1.txt_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.1.txt_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.1.txt_attn.norm.key_norm.scale"> : tensor<128xf16>
  // img: attn.proj, then mlp.0 (12288x3072) and mlp.2 (3072x12288).
  util.global private @__auto.sampler.double_blocks.1.img_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.1.img_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.1.img_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.1.img_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.1.img_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.1.img_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.1.img_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.1.img_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.1.img_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.1.img_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.1.img_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.1.img_mlp.2.bias"> : tensor<3072xf16>
  // txt: attn.proj, then mlp.0 and mlp.2 with the same shapes as the img entries.
  util.global private @__auto.sampler.double_blocks.1.txt_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.1.txt_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.1.txt_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.1.txt_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.1.txt_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.1.txt_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.1.txt_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.1.txt_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.1.txt_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.1.txt_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.1.txt_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.1.txt_mlp.2.bias"> : tensor<3072xf16>
  // ---- sampler.double_blocks.2 ----
  // 24 private globals, each initialised from a named parameter in scope
  // "model" whose key is the symbol name without the `__auto.` prefix.
  // All tensors are f16 with static shapes. The same set of entries is declared
  // for both the img_* and txt_* prefixes.
  // NOTE(review): 18432 = 6 * 3072 and 9216 = 3 * 3072 — presumably packed
  // modulation terms and a fused q/k/v projection; confirm against the exporter.

  // mod.lin weights and biases (img, then txt).
  util.global private @__auto.sampler.double_blocks.2.img_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.2.img_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.2.img_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.2.img_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.2.txt_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.2.txt_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.2.txt_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.2.txt_mod.lin.bias"> : tensor<18432xf16>
  // attn.qkv weights/biases and the 128-element query/key norm scales (img, then txt).
  util.global private @__auto.sampler.double_blocks.2.img_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.2.img_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.2.img_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.2.img_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.2.img_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.2.img_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.2.img_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.2.img_attn.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.2.txt_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.2.txt_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.2.txt_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.2.txt_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.2.txt_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.2.txt_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.2.txt_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.2.txt_attn.norm.key_norm.scale"> : tensor<128xf16>
  // img: attn.proj, then mlp.0 (12288x3072) and mlp.2 (3072x12288).
  util.global private @__auto.sampler.double_blocks.2.img_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.2.img_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.2.img_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.2.img_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.2.img_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.2.img_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.2.img_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.2.img_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.2.img_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.2.img_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.2.img_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.2.img_mlp.2.bias"> : tensor<3072xf16>
  // txt: attn.proj, then mlp.0 and mlp.2 with the same shapes as the img entries.
  util.global private @__auto.sampler.double_blocks.2.txt_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.2.txt_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.2.txt_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.2.txt_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.2.txt_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.2.txt_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.2.txt_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.2.txt_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.2.txt_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.2.txt_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.2.txt_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.2.txt_mlp.2.bias"> : tensor<3072xf16>
  // ---- sampler.double_blocks.3 ----
  // 24 private globals, each initialised from a named parameter in scope
  // "model" whose key is the symbol name without the `__auto.` prefix.
  // All tensors are f16 with static shapes. The same set of entries is declared
  // for both the img_* and txt_* prefixes.
  // NOTE(review): 18432 = 6 * 3072 and 9216 = 3 * 3072 — presumably packed
  // modulation terms and a fused q/k/v projection; confirm against the exporter.

  // mod.lin weights and biases (img, then txt).
  util.global private @__auto.sampler.double_blocks.3.img_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.3.img_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.3.img_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.3.img_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.3.txt_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.3.txt_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.3.txt_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.3.txt_mod.lin.bias"> : tensor<18432xf16>
  // attn.qkv weights/biases and the 128-element query/key norm scales (img, then txt).
  util.global private @__auto.sampler.double_blocks.3.img_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.3.img_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.3.img_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.3.img_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.3.img_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.3.img_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.3.img_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.3.img_attn.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.3.txt_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.3.txt_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.3.txt_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.3.txt_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.3.txt_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.3.txt_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.3.txt_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.3.txt_attn.norm.key_norm.scale"> : tensor<128xf16>
  // img: attn.proj, then mlp.0 (12288x3072) and mlp.2 (3072x12288).
  util.global private @__auto.sampler.double_blocks.3.img_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.3.img_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.3.img_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.3.img_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.3.img_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.3.img_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.3.img_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.3.img_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.3.img_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.3.img_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.3.img_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.3.img_mlp.2.bias"> : tensor<3072xf16>
  // txt: attn.proj, then mlp.0 and mlp.2 with the same shapes as the img entries.
  util.global private @__auto.sampler.double_blocks.3.txt_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.3.txt_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.3.txt_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.3.txt_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.3.txt_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.3.txt_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.3.txt_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.3.txt_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.3.txt_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.3.txt_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.3.txt_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.3.txt_mlp.2.bias"> : tensor<3072xf16>
  // ---- sampler.double_blocks.4 ----
  // 24 private globals, each initialised from a named parameter in scope
  // "model" whose key is the symbol name without the `__auto.` prefix.
  // All tensors are f16 with static shapes. The same set of entries is declared
  // for both the img_* and txt_* prefixes.
  // NOTE(review): 18432 = 6 * 3072 and 9216 = 3 * 3072 — presumably packed
  // modulation terms and a fused q/k/v projection; confirm against the exporter.

  // mod.lin weights and biases (img, then txt).
  util.global private @__auto.sampler.double_blocks.4.img_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.4.img_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.4.img_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.4.img_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.4.txt_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.4.txt_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.4.txt_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.4.txt_mod.lin.bias"> : tensor<18432xf16>
  // attn.qkv weights/biases and the 128-element query/key norm scales (img, then txt).
  util.global private @__auto.sampler.double_blocks.4.img_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.4.img_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.4.img_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.4.img_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.4.img_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.4.img_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.4.img_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.4.img_attn.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.4.txt_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.4.txt_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.4.txt_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.4.txt_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.4.txt_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.4.txt_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.4.txt_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.4.txt_attn.norm.key_norm.scale"> : tensor<128xf16>
  // img: attn.proj, then mlp.0 (12288x3072) and mlp.2 (3072x12288).
  util.global private @__auto.sampler.double_blocks.4.img_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.4.img_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.4.img_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.4.img_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.4.img_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.4.img_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.4.img_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.4.img_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.4.img_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.4.img_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.4.img_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.4.img_mlp.2.bias"> : tensor<3072xf16>
  // txt: attn.proj, then mlp.0 and mlp.2 with the same shapes as the img entries.
  util.global private @__auto.sampler.double_blocks.4.txt_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.4.txt_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.4.txt_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.4.txt_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.4.txt_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.4.txt_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.4.txt_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.4.txt_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.4.txt_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.4.txt_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.4.txt_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.4.txt_mlp.2.bias"> : tensor<3072xf16>
  // ---- sampler.double_blocks.5 ----
  // 24 private globals, each initialised from a named parameter in scope
  // "model" whose key is the symbol name without the `__auto.` prefix.
  // All tensors are f16 with static shapes. The same set of entries is declared
  // for both the img_* and txt_* prefixes.
  // NOTE(review): 18432 = 6 * 3072 and 9216 = 3 * 3072 — presumably packed
  // modulation terms and a fused q/k/v projection; confirm against the exporter.

  // mod.lin weights and biases (img, then txt).
  util.global private @__auto.sampler.double_blocks.5.img_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.5.img_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.5.img_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.5.img_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.5.txt_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.5.txt_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.5.txt_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.5.txt_mod.lin.bias"> : tensor<18432xf16>
  // attn.qkv weights/biases and the 128-element query/key norm scales (img, then txt).
  util.global private @__auto.sampler.double_blocks.5.img_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.5.img_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.5.img_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.5.img_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.5.img_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.5.img_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.5.img_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.5.img_attn.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.5.txt_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.5.txt_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.5.txt_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.5.txt_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.5.txt_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.5.txt_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.5.txt_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.5.txt_attn.norm.key_norm.scale"> : tensor<128xf16>
  // img: attn.proj, then mlp.0 (12288x3072) and mlp.2 (3072x12288).
  util.global private @__auto.sampler.double_blocks.5.img_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.5.img_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.5.img_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.5.img_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.5.img_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.5.img_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.5.img_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.5.img_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.5.img_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.5.img_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.5.img_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.5.img_mlp.2.bias"> : tensor<3072xf16>
  // txt: attn.proj, then mlp.0 and mlp.2 with the same shapes as the img entries.
  util.global private @__auto.sampler.double_blocks.5.txt_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.5.txt_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.5.txt_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.5.txt_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.5.txt_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.5.txt_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.5.txt_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.5.txt_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.5.txt_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.5.txt_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.5.txt_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.5.txt_mlp.2.bias"> : tensor<3072xf16>
  // ---- sampler.double_blocks.6 ----
  // 24 private globals, each initialised from a named parameter in scope
  // "model" whose key is the symbol name without the `__auto.` prefix.
  // All tensors are f16 with static shapes. The same set of entries is declared
  // for both the img_* and txt_* prefixes.
  // NOTE(review): 18432 = 6 * 3072 and 9216 = 3 * 3072 — presumably packed
  // modulation terms and a fused q/k/v projection; confirm against the exporter.

  // mod.lin weights and biases (img, then txt).
  util.global private @__auto.sampler.double_blocks.6.img_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.6.img_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.6.img_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.6.img_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.6.txt_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.6.txt_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.6.txt_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.6.txt_mod.lin.bias"> : tensor<18432xf16>
  // attn.qkv weights/biases and the 128-element query/key norm scales (img, then txt).
  util.global private @__auto.sampler.double_blocks.6.img_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.6.img_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.6.img_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.6.img_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.6.img_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.6.img_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.6.img_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.6.img_attn.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.6.txt_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.6.txt_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.6.txt_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.6.txt_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.6.txt_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.6.txt_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.6.txt_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.6.txt_attn.norm.key_norm.scale"> : tensor<128xf16>
  // img: attn.proj, then mlp.0 (12288x3072) and mlp.2 (3072x12288).
  util.global private @__auto.sampler.double_blocks.6.img_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.6.img_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.6.img_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.6.img_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.6.img_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.6.img_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.6.img_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.6.img_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.6.img_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.6.img_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.6.img_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.6.img_mlp.2.bias"> : tensor<3072xf16>
  // txt: attn.proj, then mlp.0 and mlp.2 with the same shapes as the img entries.
  util.global private @__auto.sampler.double_blocks.6.txt_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.6.txt_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.6.txt_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.6.txt_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.6.txt_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.6.txt_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.6.txt_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.6.txt_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.6.txt_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.6.txt_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.6.txt_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.6.txt_mlp.2.bias"> : tensor<3072xf16>
  // ---- sampler.double_blocks.7 ----
  // 24 private globals, each initialized from the named parameter in scope "model" whose key is
  // the symbol name without the `@__auto.` prefix. All tensors are f16.
  // Order: img_mod/txt_mod lin (weight, bias); img_attn then txt_attn qkv (weight, bias) with
  // query/key norm scales; img attn.proj + mlp.0/mlp.2; txt attn.proj + mlp.0/mlp.2.
  util.global private @__auto.sampler.double_blocks.7.img_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.7.img_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.7.img_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.7.img_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.7.txt_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.7.txt_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.7.txt_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.7.txt_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.7.img_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.7.img_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.7.img_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.7.img_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.7.img_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.7.img_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.7.img_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.7.img_attn.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.7.txt_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.7.txt_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.7.txt_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.7.txt_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.7.txt_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.7.txt_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.7.txt_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.7.txt_attn.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.7.img_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.7.img_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.7.img_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.7.img_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.7.img_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.7.img_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.7.img_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.7.img_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.7.img_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.7.img_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.7.img_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.7.img_mlp.2.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.7.txt_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.7.txt_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.7.txt_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.7.txt_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.7.txt_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.7.txt_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.7.txt_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.7.txt_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.7.txt_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.7.txt_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.7.txt_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.7.txt_mlp.2.bias"> : tensor<3072xf16>
  // ---- sampler.double_blocks.8 ----
  // 24 private globals, each initialized from the named parameter in scope "model" whose key is
  // the symbol name without the `@__auto.` prefix. All tensors are f16.
  // Order: img_mod/txt_mod lin (weight, bias); img_attn then txt_attn qkv (weight, bias) with
  // query/key norm scales; img attn.proj + mlp.0/mlp.2; txt attn.proj + mlp.0/mlp.2.
  util.global private @__auto.sampler.double_blocks.8.img_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.8.img_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.8.img_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.8.img_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.8.txt_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.8.txt_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.8.txt_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.8.txt_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.8.img_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.8.img_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.8.img_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.8.img_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.8.img_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.8.img_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.8.img_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.8.img_attn.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.8.txt_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.8.txt_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.8.txt_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.8.txt_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.8.txt_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.8.txt_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.8.txt_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.8.txt_attn.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.8.img_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.8.img_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.8.img_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.8.img_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.8.img_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.8.img_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.8.img_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.8.img_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.8.img_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.8.img_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.8.img_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.8.img_mlp.2.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.8.txt_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.8.txt_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.8.txt_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.8.txt_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.8.txt_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.8.txt_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.8.txt_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.8.txt_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.8.txt_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.8.txt_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.8.txt_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.8.txt_mlp.2.bias"> : tensor<3072xf16>
  // ---- sampler.double_blocks.9 ----
  // 24 private globals, each initialized from the named parameter in scope "model" whose key is
  // the symbol name without the `@__auto.` prefix. All tensors are f16.
  // Order: img_mod/txt_mod lin (weight, bias); img_attn then txt_attn qkv (weight, bias) with
  // query/key norm scales; img attn.proj + mlp.0/mlp.2; txt attn.proj + mlp.0/mlp.2.
  util.global private @__auto.sampler.double_blocks.9.img_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.9.img_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.9.img_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.9.img_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.9.txt_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.9.txt_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.9.txt_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.9.txt_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.9.img_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.9.img_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.9.img_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.9.img_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.9.img_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.9.img_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.9.img_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.9.img_attn.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.9.txt_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.9.txt_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.9.txt_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.9.txt_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.9.txt_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.9.txt_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.9.txt_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.9.txt_attn.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.9.img_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.9.img_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.9.img_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.9.img_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.9.img_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.9.img_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.9.img_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.9.img_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.9.img_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.9.img_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.9.img_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.9.img_mlp.2.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.9.txt_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.9.txt_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.9.txt_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.9.txt_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.9.txt_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.9.txt_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.9.txt_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.9.txt_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.9.txt_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.9.txt_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.9.txt_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.9.txt_mlp.2.bias"> : tensor<3072xf16>
  // ---- sampler.double_blocks.10 ----
  // 24 private globals, each initialized from the named parameter in scope "model" whose key is
  // the symbol name without the `@__auto.` prefix. All tensors are f16.
  // Order: img_mod/txt_mod lin (weight, bias); img_attn then txt_attn qkv (weight, bias) with
  // query/key norm scales; img attn.proj + mlp.0/mlp.2; txt attn.proj + mlp.0/mlp.2.
  util.global private @__auto.sampler.double_blocks.10.img_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.10.img_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.10.img_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.10.img_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.10.txt_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.10.txt_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.10.txt_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.10.txt_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.10.img_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.10.img_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.10.img_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.10.img_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.10.img_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.10.img_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.10.img_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.10.img_attn.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.10.txt_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.10.txt_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.10.txt_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.10.txt_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.10.txt_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.10.txt_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.10.txt_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.10.txt_attn.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.10.img_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.10.img_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.10.img_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.10.img_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.10.img_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.10.img_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.10.img_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.10.img_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.10.img_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.10.img_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.10.img_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.10.img_mlp.2.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.10.txt_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.10.txt_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.10.txt_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.10.txt_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.10.txt_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.10.txt_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.10.txt_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.10.txt_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.10.txt_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.10.txt_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.10.txt_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.10.txt_mlp.2.bias"> : tensor<3072xf16>
  // ---- sampler.double_blocks.11 ----
  // 24 private globals, each initialized from the named parameter in scope "model" whose key is
  // the symbol name without the `@__auto.` prefix. All tensors are f16.
  // Order: img_mod/txt_mod lin (weight, bias); img_attn then txt_attn qkv (weight, bias) with
  // query/key norm scales; img attn.proj + mlp.0/mlp.2; txt attn.proj + mlp.0/mlp.2.
  util.global private @__auto.sampler.double_blocks.11.img_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.11.img_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.11.img_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.11.img_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.11.txt_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.11.txt_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.11.txt_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.11.txt_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.11.img_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.11.img_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.11.img_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.11.img_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.11.img_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.11.img_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.11.img_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.11.img_attn.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.11.txt_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.11.txt_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.11.txt_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.11.txt_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.11.txt_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.11.txt_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.11.txt_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.11.txt_attn.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.11.img_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.11.img_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.11.img_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.11.img_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.11.img_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.11.img_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.11.img_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.11.img_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.11.img_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.11.img_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.11.img_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.11.img_mlp.2.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.11.txt_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.11.txt_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.11.txt_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.11.txt_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.11.txt_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.11.txt_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.11.txt_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.11.txt_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.11.txt_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.11.txt_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.11.txt_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.11.txt_mlp.2.bias"> : tensor<3072xf16>
  // ---- sampler.double_blocks.12 ----
  // 24 private globals, each initialized from the named parameter in scope "model" whose key is
  // the symbol name without the `@__auto.` prefix. All tensors are f16.
  // Order: img_mod/txt_mod lin (weight, bias); img_attn then txt_attn qkv (weight, bias) with
  // query/key norm scales; img attn.proj + mlp.0/mlp.2; txt attn.proj + mlp.0/mlp.2.
  util.global private @__auto.sampler.double_blocks.12.img_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.12.img_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.12.img_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.12.img_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.12.txt_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.12.txt_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.12.txt_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.12.txt_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.12.img_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.12.img_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.12.img_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.12.img_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.12.img_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.12.img_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.12.img_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.12.img_attn.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.12.txt_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.12.txt_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.12.txt_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.12.txt_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.12.txt_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.12.txt_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.12.txt_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.12.txt_attn.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.12.img_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.12.img_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.12.img_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.12.img_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.12.img_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.12.img_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.12.img_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.12.img_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.12.img_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.12.img_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.12.img_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.12.img_mlp.2.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.12.txt_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.12.txt_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.12.txt_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.12.txt_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.12.txt_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.12.txt_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.12.txt_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.12.txt_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.12.txt_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.12.txt_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.12.txt_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.12.txt_mlp.2.bias"> : tensor<3072xf16>
  // ---- sampler.double_blocks.13 ----
  // 24 private globals, each initialized from the named parameter in scope "model" whose key is
  // the symbol name without the `@__auto.` prefix. All tensors are f16.
  // Order: img_mod/txt_mod lin (weight, bias); img_attn then txt_attn qkv (weight, bias) with
  // query/key norm scales; img attn.proj + mlp.0/mlp.2; txt attn.proj + mlp.0/mlp.2.
  util.global private @__auto.sampler.double_blocks.13.img_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.13.img_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.13.img_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.13.img_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.13.txt_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.13.txt_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.13.txt_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.13.txt_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.13.img_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.13.img_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.13.img_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.13.img_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.13.img_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.13.img_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.13.img_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.13.img_attn.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.13.txt_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.13.txt_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.13.txt_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.13.txt_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.13.txt_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.13.txt_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.13.txt_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.13.txt_attn.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.13.img_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.13.img_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.13.img_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.13.img_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.13.img_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.13.img_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.13.img_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.13.img_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.13.img_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.13.img_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.13.img_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.13.img_mlp.2.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.13.txt_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.13.txt_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.13.txt_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.13.txt_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.13.txt_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.13.txt_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.13.txt_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.13.txt_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.13.txt_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.13.txt_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.13.txt_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.13.txt_mlp.2.bias"> : tensor<3072xf16>
  // ---- sampler.double_blocks.14 (head of the group visible from here) ----
  // Private globals, each initialized from the named parameter in scope "model" whose key is
  // the symbol name without the `@__auto.` prefix. All tensors below are f16.
  // Order: img_mod/txt_mod lin (weight, bias); img_attn then txt_attn qkv (weight, bias) with
  // query/key norm scales; img attn.proj + mlp.0/mlp.2; txt attn.proj + mlp.0 and mlp.2.weight.
  // NOTE(review): the group presumably continues after this point (e.g. a txt_mlp.2.bias entry) — confirm.
  util.global private @__auto.sampler.double_blocks.14.img_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.14.img_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.14.img_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.14.img_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.14.txt_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.14.txt_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.14.txt_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.14.txt_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.14.img_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.14.img_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.14.img_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.14.img_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.14.img_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.14.img_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.14.img_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.14.img_attn.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.14.txt_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.14.txt_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.14.txt_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.14.txt_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.14.txt_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.14.txt_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.14.txt_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.14.txt_attn.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.14.img_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.14.img_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.14.img_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.14.img_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.14.img_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.14.img_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.14.img_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.14.img_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.14.img_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.14.img_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.14.img_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.14.img_mlp.2.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.14.txt_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.14.txt_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.14.txt_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.14.txt_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.14.txt_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.14.txt_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.14.txt_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.14.txt_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.14.txt_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.14.txt_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.14.txt_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.14.txt_mlp.2.bias"> : tensor<3072xf16>
  // Parameter globals for double_blocks.15 (24 private globals, all f16 tensors).
  // Each global's initial value is the named parameter in scope "model" whose key
  // equals the symbol name without the `@__auto.` prefix.
  // Declared shapes, in declaration order:
  //   img_mod.lin, txt_mod.lin:              weight 18432x3072, bias 18432
  //   img_attn.qkv, txt_attn.qkv:            weight 9216x3072,  bias 9216
  //   {img,txt}_attn.norm.{query,key}_norm:  scale 128
  //   img_attn.proj, txt_attn.proj:          weight 3072x3072,  bias 3072
  //   img_mlp.0, txt_mlp.0:                  weight 12288x3072, bias 12288
  //   img_mlp.2, txt_mlp.2:                  weight 3072x12288, bias 3072
  // NOTE(review): 18432 = 6*3072 and 9216 = 3*3072; presumably six modulation
  // vectors and a fused q/k/v projection - confirm against the model definition.
  util.global private @__auto.sampler.double_blocks.15.img_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.15.img_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.15.img_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.15.img_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.15.txt_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.15.txt_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.15.txt_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.15.txt_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.15.img_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.15.img_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.15.img_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.15.img_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.15.img_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.15.img_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.15.img_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.15.img_attn.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.15.txt_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.15.txt_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.15.txt_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.15.txt_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.15.txt_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.15.txt_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.15.txt_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.15.txt_attn.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.15.img_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.15.img_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.15.img_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.15.img_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.15.img_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.15.img_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.15.img_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.15.img_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.15.img_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.15.img_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.15.img_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.15.img_mlp.2.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.15.txt_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.15.txt_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.15.txt_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.15.txt_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.15.txt_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.15.txt_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.15.txt_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.15.txt_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.15.txt_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.15.txt_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.15.txt_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.15.txt_mlp.2.bias"> : tensor<3072xf16>
  // Parameter globals for double_blocks.16 (24 private globals, all f16 tensors).
  // Each global's initial value is the named parameter in scope "model" whose key
  // equals the symbol name without the `@__auto.` prefix.
  // Declared shapes, in declaration order:
  //   img_mod.lin, txt_mod.lin:              weight 18432x3072, bias 18432
  //   img_attn.qkv, txt_attn.qkv:            weight 9216x3072,  bias 9216
  //   {img,txt}_attn.norm.{query,key}_norm:  scale 128
  //   img_attn.proj, txt_attn.proj:          weight 3072x3072,  bias 3072
  //   img_mlp.0, txt_mlp.0:                  weight 12288x3072, bias 12288
  //   img_mlp.2, txt_mlp.2:                  weight 3072x12288, bias 3072
  // NOTE(review): 18432 = 6*3072 and 9216 = 3*3072; presumably six modulation
  // vectors and a fused q/k/v projection - confirm against the model definition.
  util.global private @__auto.sampler.double_blocks.16.img_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.16.img_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.16.img_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.16.img_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.16.txt_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.16.txt_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.16.txt_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.16.txt_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.16.img_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.16.img_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.16.img_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.16.img_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.16.img_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.16.img_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.16.img_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.16.img_attn.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.16.txt_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.16.txt_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.16.txt_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.16.txt_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.16.txt_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.16.txt_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.16.txt_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.16.txt_attn.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.16.img_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.16.img_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.16.img_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.16.img_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.16.img_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.16.img_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.16.img_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.16.img_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.16.img_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.16.img_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.16.img_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.16.img_mlp.2.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.16.txt_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.16.txt_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.16.txt_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.16.txt_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.16.txt_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.16.txt_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.16.txt_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.16.txt_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.16.txt_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.16.txt_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.16.txt_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.16.txt_mlp.2.bias"> : tensor<3072xf16>
  // Parameter globals for double_blocks.17 (24 private globals, all f16 tensors).
  // Each global's initial value is the named parameter in scope "model" whose key
  // equals the symbol name without the `@__auto.` prefix.
  // Declared shapes, in declaration order:
  //   img_mod.lin, txt_mod.lin:              weight 18432x3072, bias 18432
  //   img_attn.qkv, txt_attn.qkv:            weight 9216x3072,  bias 9216
  //   {img,txt}_attn.norm.{query,key}_norm:  scale 128
  //   img_attn.proj, txt_attn.proj:          weight 3072x3072,  bias 3072
  //   img_mlp.0, txt_mlp.0:                  weight 12288x3072, bias 12288
  //   img_mlp.2, txt_mlp.2:                  weight 3072x12288, bias 3072
  // NOTE(review): 18432 = 6*3072 and 9216 = 3*3072; presumably six modulation
  // vectors and a fused q/k/v projection - confirm against the model definition.
  util.global private @__auto.sampler.double_blocks.17.img_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.17.img_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.17.img_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.17.img_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.17.txt_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.17.txt_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.17.txt_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.17.txt_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.17.img_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.17.img_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.17.img_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.17.img_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.17.img_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.17.img_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.17.img_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.17.img_attn.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.17.txt_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.17.txt_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.17.txt_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.17.txt_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.17.txt_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.17.txt_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.17.txt_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.17.txt_attn.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.17.img_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.17.img_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.17.img_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.17.img_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.17.img_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.17.img_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.17.img_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.17.img_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.17.img_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.17.img_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.17.img_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.17.img_mlp.2.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.17.txt_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.17.txt_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.17.txt_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.17.txt_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.17.txt_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.17.txt_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.17.txt_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.17.txt_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.17.txt_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.17.txt_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.17.txt_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.17.txt_mlp.2.bias"> : tensor<3072xf16>
  // Parameter globals for double_blocks.18 (24 private globals, all f16 tensors).
  // Each global's initial value is the named parameter in scope "model" whose key
  // equals the symbol name without the `@__auto.` prefix.
  // Declared shapes, in declaration order:
  //   img_mod.lin, txt_mod.lin:              weight 18432x3072, bias 18432
  //   img_attn.qkv, txt_attn.qkv:            weight 9216x3072,  bias 9216
  //   {img,txt}_attn.norm.{query,key}_norm:  scale 128
  //   img_attn.proj, txt_attn.proj:          weight 3072x3072,  bias 3072
  //   img_mlp.0, txt_mlp.0:                  weight 12288x3072, bias 12288
  //   img_mlp.2, txt_mlp.2:                  weight 3072x12288, bias 3072
  // NOTE(review): 18432 = 6*3072 and 9216 = 3*3072; presumably six modulation
  // vectors and a fused q/k/v projection - confirm against the model definition.
  util.global private @__auto.sampler.double_blocks.18.img_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.18.img_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.18.img_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.18.img_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.18.txt_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.18.txt_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.18.txt_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.18.txt_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.18.img_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.18.img_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.18.img_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.18.img_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.18.img_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.18.img_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.18.img_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.18.img_attn.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.18.txt_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.18.txt_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.18.txt_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.18.txt_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.18.txt_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.18.txt_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.18.txt_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.18.txt_attn.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.18.img_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.18.img_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.18.img_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.18.img_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.18.img_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.18.img_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.18.img_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.18.img_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.18.img_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.18.img_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.18.img_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.18.img_mlp.2.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.18.txt_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.18.txt_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.18.txt_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.18.txt_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.18.txt_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.18.txt_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.18.txt_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.18.txt_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.18.txt_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.18.txt_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.18.txt_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.18.txt_mlp.2.bias"> : tensor<3072xf16>
  // Parameter globals for single_blocks.0 (8 private globals, all f16 tensors).
  // Each global's initial value is the named parameter in scope "model" whose key
  // equals the symbol name without the `@__auto.` prefix.
  // Declared shapes:
  //   modulation.lin:          weight 9216x3072,  bias 9216
  //   linear1:                 weight 21504x3072, bias 21504
  //   norm.{query,key}_norm:   scale 128
  //   linear2:                 weight 3072x15360, bias 3072
  // NOTE(review): 21504 = 3*3072 + 12288 and 15360 = 3072 + 12288; this looks like
  // a fused qkv+MLP input/output projection - confirm against the model definition.
  util.global private @__auto.sampler.single_blocks.0.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.0.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.0.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.0.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.0.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.0.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.0.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.0.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.0.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.0.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.0.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.0.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.0.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.0.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.0.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.0.linear2.bias"> : tensor<3072xf16>
  // Parameter globals for single_blocks.1 (8 private globals, all f16 tensors).
  // Each global's initial value is the named parameter in scope "model" whose key
  // equals the symbol name without the `@__auto.` prefix.
  // Declared shapes:
  //   modulation.lin:          weight 9216x3072,  bias 9216
  //   linear1:                 weight 21504x3072, bias 21504
  //   norm.{query,key}_norm:   scale 128
  //   linear2:                 weight 3072x15360, bias 3072
  // NOTE(review): 21504 = 3*3072 + 12288 and 15360 = 3072 + 12288; this looks like
  // a fused qkv+MLP input/output projection - confirm against the model definition.
  util.global private @__auto.sampler.single_blocks.1.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.1.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.1.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.1.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.1.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.1.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.1.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.1.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.1.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.1.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.1.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.1.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.1.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.1.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.1.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.1.linear2.bias"> : tensor<3072xf16>
  // Parameter globals for single_blocks.2 (8 private globals, all f16 tensors).
  // Each global's initial value is the named parameter in scope "model" whose key
  // equals the symbol name without the `@__auto.` prefix.
  // Declared shapes:
  //   modulation.lin:          weight 9216x3072,  bias 9216
  //   linear1:                 weight 21504x3072, bias 21504
  //   norm.{query,key}_norm:   scale 128
  //   linear2:                 weight 3072x15360, bias 3072
  // NOTE(review): 21504 = 3*3072 + 12288 and 15360 = 3072 + 12288; this looks like
  // a fused qkv+MLP input/output projection - confirm against the model definition.
  util.global private @__auto.sampler.single_blocks.2.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.2.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.2.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.2.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.2.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.2.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.2.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.2.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.2.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.2.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.2.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.2.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.2.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.2.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.2.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.2.linear2.bias"> : tensor<3072xf16>
  // Parameter globals for single_blocks.3 (8 private globals, all f16 tensors).
  // Each global's initial value is the named parameter in scope "model" whose key
  // equals the symbol name without the `@__auto.` prefix.
  // Declared shapes:
  //   modulation.lin:          weight 9216x3072,  bias 9216
  //   linear1:                 weight 21504x3072, bias 21504
  //   norm.{query,key}_norm:   scale 128
  //   linear2:                 weight 3072x15360, bias 3072
  // NOTE(review): 21504 = 3*3072 + 12288 and 15360 = 3072 + 12288; this looks like
  // a fused qkv+MLP input/output projection - confirm against the model definition.
  util.global private @__auto.sampler.single_blocks.3.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.3.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.3.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.3.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.3.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.3.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.3.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.3.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.3.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.3.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.3.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.3.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.3.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.3.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.3.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.3.linear2.bias"> : tensor<3072xf16>
  // Parameter globals for single_blocks.4 (8 private globals, all f16 tensors).
  // Each global's initial value is the named parameter in scope "model" whose key
  // equals the symbol name without the `@__auto.` prefix.
  // Declared shapes:
  //   modulation.lin:          weight 9216x3072,  bias 9216
  //   linear1:                 weight 21504x3072, bias 21504
  //   norm.{query,key}_norm:   scale 128
  //   linear2:                 weight 3072x15360, bias 3072
  // NOTE(review): 21504 = 3*3072 + 12288 and 15360 = 3072 + 12288; this looks like
  // a fused qkv+MLP input/output projection - confirm against the model definition.
  util.global private @__auto.sampler.single_blocks.4.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.4.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.4.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.4.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.4.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.4.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.4.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.4.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.4.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.4.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.4.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.4.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.4.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.4.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.4.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.4.linear2.bias"> : tensor<3072xf16>
  // Parameter globals for single_blocks.5 (8 private globals, all f16 tensors).
  // Each global's initial value is the named parameter in scope "model" whose key
  // equals the symbol name without the `@__auto.` prefix.
  // Declared shapes:
  //   modulation.lin:          weight 9216x3072,  bias 9216
  //   linear1:                 weight 21504x3072, bias 21504
  //   norm.{query,key}_norm:   scale 128
  //   linear2:                 weight 3072x15360, bias 3072
  // NOTE(review): 21504 = 3*3072 + 12288 and 15360 = 3072 + 12288; this looks like
  // a fused qkv+MLP input/output projection - confirm against the model definition.
  util.global private @__auto.sampler.single_blocks.5.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.5.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.5.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.5.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.5.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.5.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.5.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.5.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.5.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.5.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.5.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.5.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.5.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.5.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.5.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.5.linear2.bias"> : tensor<3072xf16>
  // Parameter globals for single_blocks.6 (8 private globals, all f16 tensors).
  // Each global's initial value is the named parameter in scope "model" whose key
  // equals the symbol name without the `@__auto.` prefix.
  // Declared shapes:
  //   modulation.lin:          weight 9216x3072,  bias 9216
  //   linear1:                 weight 21504x3072, bias 21504
  //   norm.{query,key}_norm:   scale 128
  //   linear2:                 weight 3072x15360, bias 3072
  // NOTE(review): 21504 = 3*3072 + 12288 and 15360 = 3072 + 12288; this looks like
  // a fused qkv+MLP input/output projection - confirm against the model definition.
  util.global private @__auto.sampler.single_blocks.6.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.6.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.6.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.6.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.6.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.6.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.6.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.6.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.6.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.6.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.6.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.6.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.6.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.6.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.6.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.6.linear2.bias"> : tensor<3072xf16>
  // Parameter globals for single_blocks.7 (8 private globals, all f16 tensors).
  // Each global's initial value is the named parameter in scope "model" whose key
  // equals the symbol name without the `@__auto.` prefix.
  // Declared shapes:
  //   modulation.lin:          weight 9216x3072,  bias 9216
  //   linear1:                 weight 21504x3072, bias 21504
  //   norm.{query,key}_norm:   scale 128
  //   linear2:                 weight 3072x15360, bias 3072
  // NOTE(review): 21504 = 3*3072 + 12288 and 15360 = 3072 + 12288; this looks like
  // a fused qkv+MLP input/output projection - confirm against the model definition.
  util.global private @__auto.sampler.single_blocks.7.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.7.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.7.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.7.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.7.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.7.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.7.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.7.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.7.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.7.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.7.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.7.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.7.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.7.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.7.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.7.linear2.bias"> : tensor<3072xf16>
  // Parameter globals for single_blocks.8 (8 private globals, all f16 tensors).
  // Each global's initial value is the named parameter in scope "model" whose key
  // equals the symbol name without the `@__auto.` prefix.
  // Declared shapes:
  //   modulation.lin:          weight 9216x3072,  bias 9216
  //   linear1:                 weight 21504x3072, bias 21504
  //   norm.{query,key}_norm:   scale 128
  //   linear2:                 weight 3072x15360, bias 3072
  // NOTE(review): 21504 = 3*3072 + 12288 and 15360 = 3072 + 12288; this looks like
  // a fused qkv+MLP input/output projection - confirm against the model definition.
  util.global private @__auto.sampler.single_blocks.8.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.8.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.8.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.8.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.8.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.8.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.8.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.8.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.8.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.8.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.8.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.8.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.8.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.8.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.8.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.8.linear2.bias"> : tensor<3072xf16>
  // Parameter globals for single_blocks.9 (8 private globals, all f16 tensors).
  // Each global's initial value is the named parameter in scope "model" whose key
  // equals the symbol name without the `@__auto.` prefix.
  // Declared shapes:
  //   modulation.lin:          weight 9216x3072,  bias 9216
  //   linear1:                 weight 21504x3072, bias 21504
  //   norm.{query,key}_norm:   scale 128
  //   linear2:                 weight 3072x15360, bias 3072
  // NOTE(review): 21504 = 3*3072 + 12288 and 15360 = 3072 + 12288; this looks like
  // a fused qkv+MLP input/output projection - confirm against the model definition.
  util.global private @__auto.sampler.single_blocks.9.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.9.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.9.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.9.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.9.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.9.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.9.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.9.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.9.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.9.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.9.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.9.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.9.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.9.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.9.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.9.linear2.bias"> : tensor<3072xf16>
  // single_blocks.10 parameters: eight immutable (no `mutable` keyword) private f16
  // globals, each initialized from the named parameter
  // "sampler.single_blocks.10.<suffix>" in the "model" scope.
  // Order: modulation.lin, linear1, query/key norm scales, linear2.
  // NOTE(review): each bias length equals dim 0 of its weight, which suggests an
  // [out, in] weight layout — confirm against the exporter.
  util.global private @__auto.sampler.single_blocks.10.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.10.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.10.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.10.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.10.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.10.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.10.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.10.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.10.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.10.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.10.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.10.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.10.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.10.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.10.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.10.linear2.bias"> : tensor<3072xf16>
  // single_blocks.11 parameters: eight immutable (no `mutable` keyword) private f16
  // globals, each initialized from the named parameter
  // "sampler.single_blocks.11.<suffix>" in the "model" scope.
  // Order: modulation.lin, linear1, query/key norm scales, linear2.
  // NOTE(review): each bias length equals dim 0 of its weight, which suggests an
  // [out, in] weight layout — confirm against the exporter.
  util.global private @__auto.sampler.single_blocks.11.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.11.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.11.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.11.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.11.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.11.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.11.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.11.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.11.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.11.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.11.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.11.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.11.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.11.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.11.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.11.linear2.bias"> : tensor<3072xf16>
  // single_blocks.12 parameters: eight immutable (no `mutable` keyword) private f16
  // globals, each initialized from the named parameter
  // "sampler.single_blocks.12.<suffix>" in the "model" scope.
  // Order: modulation.lin, linear1, query/key norm scales, linear2.
  // NOTE(review): each bias length equals dim 0 of its weight, which suggests an
  // [out, in] weight layout — confirm against the exporter.
  util.global private @__auto.sampler.single_blocks.12.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.12.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.12.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.12.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.12.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.12.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.12.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.12.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.12.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.12.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.12.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.12.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.12.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.12.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.12.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.12.linear2.bias"> : tensor<3072xf16>
  // single_blocks.13 parameters: eight immutable (no `mutable` keyword) private f16
  // globals, each initialized from the named parameter
  // "sampler.single_blocks.13.<suffix>" in the "model" scope.
  // Order: modulation.lin, linear1, query/key norm scales, linear2.
  // NOTE(review): each bias length equals dim 0 of its weight, which suggests an
  // [out, in] weight layout — confirm against the exporter.
  util.global private @__auto.sampler.single_blocks.13.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.13.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.13.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.13.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.13.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.13.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.13.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.13.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.13.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.13.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.13.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.13.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.13.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.13.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.13.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.13.linear2.bias"> : tensor<3072xf16>
  // single_blocks.14 parameters: eight immutable (no `mutable` keyword) private f16
  // globals, each initialized from the named parameter
  // "sampler.single_blocks.14.<suffix>" in the "model" scope.
  // Order: modulation.lin, linear1, query/key norm scales, linear2.
  // NOTE(review): each bias length equals dim 0 of its weight, which suggests an
  // [out, in] weight layout — confirm against the exporter.
  util.global private @__auto.sampler.single_blocks.14.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.14.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.14.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.14.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.14.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.14.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.14.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.14.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.14.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.14.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.14.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.14.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.14.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.14.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.14.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.14.linear2.bias"> : tensor<3072xf16>
  // single_blocks.15 parameters: eight immutable (no `mutable` keyword) private f16
  // globals, each initialized from the named parameter
  // "sampler.single_blocks.15.<suffix>" in the "model" scope.
  // Order: modulation.lin, linear1, query/key norm scales, linear2.
  // NOTE(review): each bias length equals dim 0 of its weight, which suggests an
  // [out, in] weight layout — confirm against the exporter.
  util.global private @__auto.sampler.single_blocks.15.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.15.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.15.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.15.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.15.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.15.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.15.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.15.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.15.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.15.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.15.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.15.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.15.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.15.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.15.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.15.linear2.bias"> : tensor<3072xf16>
  // single_blocks.16 parameters: eight immutable (no `mutable` keyword) private f16
  // globals, each initialized from the named parameter
  // "sampler.single_blocks.16.<suffix>" in the "model" scope.
  // Order: modulation.lin, linear1, query/key norm scales, linear2.
  // NOTE(review): each bias length equals dim 0 of its weight, which suggests an
  // [out, in] weight layout — confirm against the exporter.
  util.global private @__auto.sampler.single_blocks.16.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.16.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.16.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.16.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.16.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.16.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.16.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.16.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.16.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.16.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.16.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.16.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.16.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.16.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.16.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.16.linear2.bias"> : tensor<3072xf16>
  // single_blocks.17 parameters: eight immutable (no `mutable` keyword) private f16
  // globals, each initialized from the named parameter
  // "sampler.single_blocks.17.<suffix>" in the "model" scope.
  // Order: modulation.lin, linear1, query/key norm scales, linear2.
  // NOTE(review): each bias length equals dim 0 of its weight, which suggests an
  // [out, in] weight layout — confirm against the exporter.
  util.global private @__auto.sampler.single_blocks.17.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.17.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.17.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.17.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.17.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.17.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.17.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.17.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.17.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.17.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.17.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.17.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.17.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.17.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.17.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.17.linear2.bias"> : tensor<3072xf16>
  // single_blocks.18 parameters: eight immutable (no `mutable` keyword) private f16
  // globals, each initialized from the named parameter
  // "sampler.single_blocks.18.<suffix>" in the "model" scope.
  // Order: modulation.lin, linear1, query/key norm scales, linear2.
  // NOTE(review): each bias length equals dim 0 of its weight, which suggests an
  // [out, in] weight layout — confirm against the exporter.
  util.global private @__auto.sampler.single_blocks.18.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.18.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.18.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.18.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.18.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.18.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.18.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.18.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.18.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.18.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.18.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.18.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.18.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.18.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.18.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.18.linear2.bias"> : tensor<3072xf16>
  // single_blocks.19 parameters: eight immutable (no `mutable` keyword) private f16
  // globals, each initialized from the named parameter
  // "sampler.single_blocks.19.<suffix>" in the "model" scope.
  // Order: modulation.lin, linear1, query/key norm scales, linear2.
  // NOTE(review): each bias length equals dim 0 of its weight, which suggests an
  // [out, in] weight layout — confirm against the exporter.
  util.global private @__auto.sampler.single_blocks.19.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.19.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.19.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.19.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.19.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.19.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.19.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.19.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.19.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.19.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.19.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.19.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.19.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.19.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.19.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.19.linear2.bias"> : tensor<3072xf16>
  // single_blocks.20 parameters: eight immutable (no `mutable` keyword) private f16
  // globals, each initialized from the named parameter
  // "sampler.single_blocks.20.<suffix>" in the "model" scope.
  // Order: modulation.lin, linear1, query/key norm scales, linear2.
  // NOTE(review): each bias length equals dim 0 of its weight, which suggests an
  // [out, in] weight layout — confirm against the exporter.
  util.global private @__auto.sampler.single_blocks.20.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.20.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.20.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.20.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.20.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.20.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.20.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.20.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.20.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.20.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.20.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.20.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.20.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.20.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.20.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.20.linear2.bias"> : tensor<3072xf16>
  // single_blocks.21 parameters: eight immutable (no `mutable` keyword) private f16
  // globals, each initialized from the named parameter
  // "sampler.single_blocks.21.<suffix>" in the "model" scope.
  // Order: modulation.lin, linear1, query/key norm scales, linear2.
  // NOTE(review): each bias length equals dim 0 of its weight, which suggests an
  // [out, in] weight layout — confirm against the exporter.
  util.global private @__auto.sampler.single_blocks.21.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.21.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.21.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.21.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.21.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.21.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.21.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.21.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.21.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.21.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.21.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.21.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.21.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.21.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.21.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.21.linear2.bias"> : tensor<3072xf16>
  // single_blocks.22 parameters: eight immutable (no `mutable` keyword) private f16
  // globals, each initialized from the named parameter
  // "sampler.single_blocks.22.<suffix>" in the "model" scope.
  // Order: modulation.lin, linear1, query/key norm scales, linear2.
  // NOTE(review): each bias length equals dim 0 of its weight, which suggests an
  // [out, in] weight layout — confirm against the exporter.
  util.global private @__auto.sampler.single_blocks.22.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.22.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.22.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.22.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.22.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.22.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.22.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.22.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.22.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.22.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.22.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.22.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.22.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.22.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.22.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.22.linear2.bias"> : tensor<3072xf16>
  // single_blocks.23 parameters: eight immutable (no `mutable` keyword) private f16
  // globals, each initialized from the named parameter
  // "sampler.single_blocks.23.<suffix>" in the "model" scope.
  // Order: modulation.lin, linear1, query/key norm scales, linear2.
  // NOTE(review): each bias length equals dim 0 of its weight, which suggests an
  // [out, in] weight layout — confirm against the exporter.
  util.global private @__auto.sampler.single_blocks.23.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.23.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.23.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.23.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.23.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.23.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.23.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.23.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.23.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.23.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.23.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.23.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.23.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.23.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.23.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.23.linear2.bias"> : tensor<3072xf16>
  // single_blocks.24 parameters: eight immutable (no `mutable` keyword) private f16
  // globals, each initialized from the named parameter
  // "sampler.single_blocks.24.<suffix>" in the "model" scope.
  // Order: modulation.lin, linear1, query/key norm scales, linear2.
  // NOTE(review): each bias length equals dim 0 of its weight, which suggests an
  // [out, in] weight layout — confirm against the exporter.
  util.global private @__auto.sampler.single_blocks.24.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.24.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.24.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.24.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.24.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.24.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.24.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.24.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.24.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.24.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.24.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.24.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.24.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.24.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.24.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.24.linear2.bias"> : tensor<3072xf16>
  // single_blocks.25 parameters: eight immutable (no `mutable` keyword) private f16
  // globals, each initialized from the named parameter
  // "sampler.single_blocks.25.<suffix>" in the "model" scope.
  // Order: modulation.lin, linear1, query/key norm scales, linear2.
  // NOTE(review): each bias length equals dim 0 of its weight, which suggests an
  // [out, in] weight layout — confirm against the exporter.
  util.global private @__auto.sampler.single_blocks.25.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.25.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.25.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.25.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.25.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.25.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.25.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.25.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.25.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.25.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.25.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.25.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.25.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.25.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.25.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.25.linear2.bias"> : tensor<3072xf16>
  // single_blocks.26 parameters: eight immutable (no `mutable` keyword) private f16
  // globals, each initialized from the named parameter
  // "sampler.single_blocks.26.<suffix>" in the "model" scope.
  // Order: modulation.lin, linear1, query/key norm scales, linear2.
  // NOTE(review): each bias length equals dim 0 of its weight, which suggests an
  // [out, in] weight layout — confirm against the exporter.
  util.global private @__auto.sampler.single_blocks.26.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.26.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.26.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.26.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.26.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.26.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.26.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.26.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.26.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.26.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.26.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.26.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.26.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.26.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.26.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.26.linear2.bias"> : tensor<3072xf16>
  // single_blocks.27 parameters: eight immutable (no `mutable` keyword) private f16
  // globals, each initialized from the named parameter
  // "sampler.single_blocks.27.<suffix>" in the "model" scope.
  // Order: modulation.lin, linear1, query/key norm scales, linear2.
  // NOTE(review): each bias length equals dim 0 of its weight, which suggests an
  // [out, in] weight layout — confirm against the exporter.
  util.global private @__auto.sampler.single_blocks.27.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.27.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.27.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.27.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.27.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.27.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.27.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.27.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.27.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.27.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.27.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.27.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.27.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.27.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.27.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.27.linear2.bias"> : tensor<3072xf16>
  // single_blocks.28 parameters: eight immutable (no `mutable` keyword) private f16
  // globals, each initialized from the named parameter
  // "sampler.single_blocks.28.<suffix>" in the "model" scope.
  // Order: modulation.lin, linear1, query/key norm scales, linear2.
  // NOTE(review): each bias length equals dim 0 of its weight, which suggests an
  // [out, in] weight layout — confirm against the exporter.
  util.global private @__auto.sampler.single_blocks.28.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.28.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.28.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.28.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.28.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.28.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.28.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.28.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.28.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.28.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.28.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.28.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.28.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.28.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.28.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.28.linear2.bias"> : tensor<3072xf16>
  // single_blocks.29 parameters: eight immutable (no `mutable` keyword) private f16
  // globals, each initialized from the named parameter
  // "sampler.single_blocks.29.<suffix>" in the "model" scope.
  // Order: modulation.lin, linear1, query/key norm scales, linear2.
  // NOTE(review): each bias length equals dim 0 of its weight, which suggests an
  // [out, in] weight layout — confirm against the exporter.
  util.global private @__auto.sampler.single_blocks.29.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.29.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.29.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.29.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.29.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.29.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.29.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.29.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.29.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.29.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.29.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.29.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.29.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.29.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.29.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.29.linear2.bias"> : tensor<3072xf16>
  // single_blocks.30 parameters: eight immutable (no `mutable` keyword) private f16
  // globals, each initialized from the named parameter
  // "sampler.single_blocks.30.<suffix>" in the "model" scope.
  // Order: modulation.lin, linear1, query/key norm scales, linear2.
  // NOTE(review): each bias length equals dim 0 of its weight, which suggests an
  // [out, in] weight layout — confirm against the exporter.
  util.global private @__auto.sampler.single_blocks.30.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.30.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.30.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.30.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.30.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.30.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.30.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.30.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.30.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.30.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.30.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.30.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.30.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.30.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.30.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.30.linear2.bias"> : tensor<3072xf16>
  // single_blocks.31 parameters: eight immutable (no `mutable` keyword) private f16
  // globals, each initialized from the named parameter
  // "sampler.single_blocks.31.<suffix>" in the "model" scope.
  // Order: modulation.lin, linear1, query/key norm scales, linear2.
  // NOTE(review): each bias length equals dim 0 of its weight, which suggests an
  // [out, in] weight layout — confirm against the exporter.
  util.global private @__auto.sampler.single_blocks.31.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.31.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.31.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.31.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.31.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.31.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.31.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.31.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.31.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.31.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.31.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.31.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.31.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.31.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.31.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.31.linear2.bias"> : tensor<3072xf16>
  // single_blocks.32 parameters: eight immutable (no `mutable` keyword) private f16
  // globals, each initialized from the named parameter
  // "sampler.single_blocks.32.<suffix>" in the "model" scope.
  // Order: modulation.lin, linear1, query/key norm scales, linear2.
  // NOTE(review): each bias length equals dim 0 of its weight, which suggests an
  // [out, in] weight layout — confirm against the exporter.
  util.global private @__auto.sampler.single_blocks.32.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.32.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.32.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.32.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.32.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.32.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.32.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.32.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.32.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.32.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.32.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.32.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.32.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.32.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.32.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.32.linear2.bias"> : tensor<3072xf16>
  // single_blocks.33 parameters. Every global below is private, f16, and takes
  // its initial value from the named parameter
  // "model"::"sampler.single_blocks.33.<suffix>" matching its symbol name.
  //   modulation.lin              : weight 9216x3072, bias 9216 (9216 = 3 * 3072)
  //   linear1                     : weight 21504x3072, bias 21504
  //   norm.{query,key}_norm.scale : 128 elements each
  //   linear2                     : weight 3072x15360, bias 3072
  // NOTE(review): 21504 = 3*3072 + 12288 and 15360 = 3072 + 12288; presumably a
  // fused attention+MLP projection pair. Confirm against the model definition.
  util.global private @__auto.sampler.single_blocks.33.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.33.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.33.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.33.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.33.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.33.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.33.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.33.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.33.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.33.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.33.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.33.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.33.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.33.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.33.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.33.linear2.bias"> : tensor<3072xf16>
  // single_blocks.34 parameters. Every global below is private, f16, and takes
  // its initial value from the named parameter
  // "model"::"sampler.single_blocks.34.<suffix>" matching its symbol name.
  //   modulation.lin              : weight 9216x3072, bias 9216 (9216 = 3 * 3072)
  //   linear1                     : weight 21504x3072, bias 21504
  //   norm.{query,key}_norm.scale : 128 elements each
  //   linear2                     : weight 3072x15360, bias 3072
  // NOTE(review): 21504 = 3*3072 + 12288 and 15360 = 3072 + 12288; presumably a
  // fused attention+MLP projection pair. Confirm against the model definition.
  util.global private @__auto.sampler.single_blocks.34.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.34.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.34.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.34.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.34.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.34.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.34.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.34.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.34.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.34.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.34.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.34.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.34.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.34.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.34.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.34.linear2.bias"> : tensor<3072xf16>
  // single_blocks.35 parameters. Every global below is private, f16, and takes
  // its initial value from the named parameter
  // "model"::"sampler.single_blocks.35.<suffix>" matching its symbol name.
  //   modulation.lin              : weight 9216x3072, bias 9216 (9216 = 3 * 3072)
  //   linear1                     : weight 21504x3072, bias 21504
  //   norm.{query,key}_norm.scale : 128 elements each
  //   linear2                     : weight 3072x15360, bias 3072
  // NOTE(review): 21504 = 3*3072 + 12288 and 15360 = 3072 + 12288; presumably a
  // fused attention+MLP projection pair. Confirm against the model definition.
  util.global private @__auto.sampler.single_blocks.35.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.35.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.35.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.35.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.35.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.35.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.35.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.35.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.35.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.35.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.35.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.35.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.35.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.35.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.35.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.35.linear2.bias"> : tensor<3072xf16>
  // single_blocks.36 parameters. Every global below is private, f16, and takes
  // its initial value from the named parameter
  // "model"::"sampler.single_blocks.36.<suffix>" matching its symbol name.
  //   modulation.lin              : weight 9216x3072, bias 9216 (9216 = 3 * 3072)
  //   linear1                     : weight 21504x3072, bias 21504
  //   norm.{query,key}_norm.scale : 128 elements each
  //   linear2                     : weight 3072x15360, bias 3072
  // NOTE(review): 21504 = 3*3072 + 12288 and 15360 = 3072 + 12288; presumably a
  // fused attention+MLP projection pair. Confirm against the model definition.
  util.global private @__auto.sampler.single_blocks.36.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.36.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.36.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.36.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.36.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.36.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.36.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.36.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.36.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.36.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.36.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.36.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.36.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.36.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.36.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.36.linear2.bias"> : tensor<3072xf16>
  // single_blocks.37 parameters; this is the last single_blocks group declared
  // before the final_layer globals. Every global below is private, f16, and
  // takes its initial value from the named parameter
  // "model"::"sampler.single_blocks.37.<suffix>" matching its symbol name.
  //   modulation.lin              : weight 9216x3072, bias 9216 (9216 = 3 * 3072)
  //   linear1                     : weight 21504x3072, bias 21504
  //   norm.{query,key}_norm.scale : 128 elements each
  //   linear2                     : weight 3072x15360, bias 3072
  // NOTE(review): 21504 = 3*3072 + 12288 and 15360 = 3072 + 12288; presumably a
  // fused attention+MLP projection pair. Confirm against the model definition.
  util.global private @__auto.sampler.single_blocks.37.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.37.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.37.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.37.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.37.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.37.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.37.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.37.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.37.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.37.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.37.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.37.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.37.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.37.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.37.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.37.linear2.bias"> : tensor<3072xf16>
  // final_layer parameters. Each global is private, f16, and takes its initial
  // value from the named parameter "model"::"sampler.final_layer.<suffix>".
  //   adaLN_modulation.1 : weight 6144x3072, bias 6144 (6144 = 2 * 3072)
  //   linear             : weight 64x3072, bias 64
  // NOTE(review): the 64-wide linear output matches the trailing dimension of
  // @run_forward's result type ([1,4096,64]); presumably this is the last
  // projection before the return. Confirm at the use site.
  util.global private @__auto.sampler.final_layer.adaLN_modulation.1.weight = #stream.parameter.named<"model"::"sampler.final_layer.adaLN_modulation.1.weight"> : tensor<6144x3072xf16>
  util.global private @__auto.sampler.final_layer.adaLN_modulation.1.bias = #stream.parameter.named<"model"::"sampler.final_layer.adaLN_modulation.1.bias"> : tensor<6144xf16>
  util.global private @__auto.sampler.final_layer.linear.weight = #stream.parameter.named<"model"::"sampler.final_layer.linear.weight"> : tensor<64x3072xf16>
  util.global private @__auto.sampler.final_layer.linear.bias = #stream.parameter.named<"model"::"sampler.final_layer.linear.bias"> : tensor<64xf16>
  func.func @run_forward(%arg0: !torch.vtensor<[1,4096,64],f16>, %arg1: !torch.vtensor<[1,4096,3],f16>, %arg2: !torch.vtensor<[1,512,4096],f16>, %arg3: !torch.vtensor<[1,512,3],f16>, %arg4: !torch.vtensor<[1,768],f16>, %arg5: !torch.vtensor<[1],f16>, %arg6: !torch.vtensor<[1],f16>, %arg7: !torch.vtensor<[1],f16>) -> !torch.vtensor<[1,4096,64],f16> attributes {iree.reflection = {model_name = "flux_sampler"}, torch.assume_strict_symbolic_shapes} {
    %int1 = torch.constant.int 1
    %0 = torch.prim.ListConstruct %int1 : (!torch.int) -> !torch.list<int>
    %false = torch.constant.bool false
    %1 = torch.aten.expand %arg5, %0, %false : !torch.vtensor<[1],f16>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1],f16>
    %int4096 = torch.constant.int 4096
    %int64 = torch.constant.int 64
    %2 = torch.prim.ListConstruct %int4096, %int64 : (!torch.int, !torch.int) -> !torch.list<int>
    %3 = torch.aten.view %arg0, %2 : !torch.vtensor<[1,4096,64],f16>, !torch.list<int> -> !torch.vtensor<[4096,64],f16>
    %__auto.sampler.img_in.weight = util.global.load @__auto.sampler.img_in.weight : tensor<3072x64xf16>
    %4 = torch_c.from_builtin_tensor %__auto.sampler.img_in.weight : tensor<3072x64xf16> -> !torch.vtensor<[3072,64],f16>
    %int0 = torch.constant.int 0
    %int1_0 = torch.constant.int 1
    %5 = torch.aten.transpose.int %4, %int0, %int1_0 : !torch.vtensor<[3072,64],f16>, !torch.int, !torch.int -> !torch.vtensor<[64,3072],f16>
    %__auto.sampler.img_in.bias = util.global.load @__auto.sampler.img_in.bias : tensor<3072xf16>
    %6 = torch_c.from_builtin_tensor %__auto.sampler.img_in.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6 = torch.constant.int 6
    %7 = torch.prims.convert_element_type %6, %int6 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_1 = torch.constant.int 6
    %8 = torch.prims.convert_element_type %3, %int6_1 : !torch.vtensor<[4096,64],f16>, !torch.int -> !torch.vtensor<[4096,64],f32>
    %int6_2 = torch.constant.int 6
    %9 = torch.prims.convert_element_type %5, %int6_2 : !torch.vtensor<[64,3072],f16>, !torch.int -> !torch.vtensor<[64,3072],f32>
    %10 = torch.aten.mm %8, %9 : !torch.vtensor<[4096,64],f32>, !torch.vtensor<[64,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    %int1_3 = torch.constant.int 1
    %11 = torch.aten.mul.Scalar %10, %int1_3 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_4 = torch.constant.int 1
    %12 = torch.aten.mul.Scalar %7, %int1_4 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_5 = torch.constant.int 1
    %13 = torch.aten.add.Tensor %11, %12, %int1_5 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5 = torch.constant.int 5
    %14 = torch.prims.convert_element_type %13, %int5 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_6 = torch.constant.int 1
    %int4096_7 = torch.constant.int 4096
    %int3072 = torch.constant.int 3072
    %15 = torch.prim.ListConstruct %int1_6, %int4096_7, %int3072 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16 = torch.aten.view %14, %15 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    %float1.000000e03 = torch.constant.float 1.000000e+03
    %17 = torch.aten.mul.Scalar %1, %float1.000000e03 : !torch.vtensor<[1],f16>, !torch.float -> !torch.vtensor<[1],f16>
    %int0_8 = torch.constant.int 0
    %int128 = torch.constant.int 128
    %int6_9 = torch.constant.int 6
    %none = torch.constant.none
    %cpu = torch.constant.device "cpu"
    %false_10 = torch.constant.bool false
    %18 = torch.aten.arange.start %int0_8, %int128, %int6_9, %none, %cpu, %false_10 : !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],f32>
    %float-9.210340e00 = torch.constant.float -9.2103403719761836
    %19 = torch.aten.mul.Scalar %18, %float-9.210340e00 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
    %int128_11 = torch.constant.int 128
    %20 = torch.aten.div.Scalar %19, %int128_11 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
    %21 = torch.aten.exp %20 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
    %int6_12 = torch.constant.int 6
    %22 = torch.prims.convert_element_type %21, %int6_12 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
    %int0_13 = torch.constant.int 0
    %int0_14 = torch.constant.int 0
    %int9223372036854775807 = torch.constant.int 9223372036854775807
    %int1_15 = torch.constant.int 1
    %23 = torch.aten.slice.Tensor %17, %int0_13, %int0_14, %int9223372036854775807, %int1_15 : !torch.vtensor<[1],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1],f16>
    %int1_16 = torch.constant.int 1
    %24 = torch.aten.unsqueeze %23, %int1_16 : !torch.vtensor<[1],f16>, !torch.int -> !torch.vtensor<[1,1],f16>
    %int6_17 = torch.constant.int 6
    %25 = torch.prims.convert_element_type %24, %int6_17 : !torch.vtensor<[1,1],f16>, !torch.int -> !torch.vtensor<[1,1],f32>
    %int0_18 = torch.constant.int 0
    %26 = torch.aten.unsqueeze %22, %int0_18 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[1,128],f32>
    %27 = torch.aten.mul.Tensor %25, %26 : !torch.vtensor<[1,1],f32>, !torch.vtensor<[1,128],f32> -> !torch.vtensor<[1,128],f32>
    %28 = torch.aten.cos %27 : !torch.vtensor<[1,128],f32> -> !torch.vtensor<[1,128],f32>
    %29 = torch.aten.sin %27 : !torch.vtensor<[1,128],f32> -> !torch.vtensor<[1,128],f32>
    %30 = torch.prim.ListConstruct %28, %29 : (!torch.vtensor<[1,128],f32>, !torch.vtensor<[1,128],f32>) -> !torch.list<vtensor>
    %int-1 = torch.constant.int -1
    %31 = torch.aten.cat %30, %int-1 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,256],f32>
    %int5_19 = torch.constant.int 5
    %32 = torch.prims.convert_element_type %31, %int5_19 : !torch.vtensor<[1,256],f32>, !torch.int -> !torch.vtensor<[1,256],f16>
    %__auto.sampler.time_in.in_layer.weight = util.global.load @__auto.sampler.time_in.in_layer.weight : tensor<3072x256xf16>
    %33 = torch_c.from_builtin_tensor %__auto.sampler.time_in.in_layer.weight : tensor<3072x256xf16> -> !torch.vtensor<[3072,256],f16>
    %int0_20 = torch.constant.int 0
    %int1_21 = torch.constant.int 1
    %34 = torch.aten.transpose.int %33, %int0_20, %int1_21 : !torch.vtensor<[3072,256],f16>, !torch.int, !torch.int -> !torch.vtensor<[256,3072],f16>
    %__auto.sampler.time_in.in_layer.bias = util.global.load @__auto.sampler.time_in.in_layer.bias : tensor<3072xf16>
    %35 = torch_c.from_builtin_tensor %__auto.sampler.time_in.in_layer.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_22 = torch.constant.int 6
    %36 = torch.prims.convert_element_type %35, %int6_22 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_23 = torch.constant.int 6
    %37 = torch.prims.convert_element_type %32, %int6_23 : !torch.vtensor<[1,256],f16>, !torch.int -> !torch.vtensor<[1,256],f32>
    %int6_24 = torch.constant.int 6
    %38 = torch.prims.convert_element_type %34, %int6_24 : !torch.vtensor<[256,3072],f16>, !torch.int -> !torch.vtensor<[256,3072],f32>
    %39 = torch.aten.mm %37, %38 : !torch.vtensor<[1,256],f32>, !torch.vtensor<[256,3072],f32> -> !torch.vtensor<[1,3072],f32>
    %int1_25 = torch.constant.int 1
    %40 = torch.aten.mul.Scalar %39, %int1_25 : !torch.vtensor<[1,3072],f32>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int1_26 = torch.constant.int 1
    %41 = torch.aten.mul.Scalar %36, %int1_26 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_27 = torch.constant.int 1
    %42 = torch.aten.add.Tensor %40, %41, %int1_27 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int5_28 = torch.constant.int 5
    %43 = torch.prims.convert_element_type %42, %int5_28 : !torch.vtensor<[1,3072],f32>, !torch.int -> !torch.vtensor<[1,3072],f16>
    %44 = torch.aten.silu %43 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.time_in.out_layer.weight = util.global.load @__auto.sampler.time_in.out_layer.weight : tensor<3072x3072xf16>
    %45 = torch_c.from_builtin_tensor %__auto.sampler.time_in.out_layer.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_29 = torch.constant.int 0
    %int1_30 = torch.constant.int 1
    %46 = torch.aten.transpose.int %45, %int0_29, %int1_30 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.time_in.out_layer.bias = util.global.load @__auto.sampler.time_in.out_layer.bias : tensor<3072xf16>
    %47 = torch_c.from_builtin_tensor %__auto.sampler.time_in.out_layer.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_31 = torch.constant.int 6
    %48 = torch.prims.convert_element_type %47, %int6_31 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_32 = torch.constant.int 6
    %49 = torch.prims.convert_element_type %44, %int6_32 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_33 = torch.constant.int 6
    %50 = torch.prims.convert_element_type %46, %int6_33 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %51 = torch.aten.mm %49, %50 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[1,3072],f32>
    %int1_34 = torch.constant.int 1
    %52 = torch.aten.mul.Scalar %51, %int1_34 : !torch.vtensor<[1,3072],f32>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int1_35 = torch.constant.int 1
    %53 = torch.aten.mul.Scalar %48, %int1_35 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_36 = torch.constant.int 1
    %54 = torch.aten.add.Tensor %52, %53, %int1_36 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int5_37 = torch.constant.int 5
    %55 = torch.prims.convert_element_type %54, %int5_37 : !torch.vtensor<[1,3072],f32>, !torch.int -> !torch.vtensor<[1,3072],f16>
    %float1.000000e03_38 = torch.constant.float 1.000000e+03
    %56 = torch.aten.mul.Scalar %arg7, %float1.000000e03_38 : !torch.vtensor<[1],f16>, !torch.float -> !torch.vtensor<[1],f16>
    %int0_39 = torch.constant.int 0
    %int128_40 = torch.constant.int 128
    %int6_41 = torch.constant.int 6
    %none_42 = torch.constant.none
    %cpu_43 = torch.constant.device "cpu"
    %false_44 = torch.constant.bool false
    %57 = torch.aten.arange.start %int0_39, %int128_40, %int6_41, %none_42, %cpu_43, %false_44 : !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],f32>
    %float-9.210340e00_45 = torch.constant.float -9.2103403719761836
    %58 = torch.aten.mul.Scalar %57, %float-9.210340e00_45 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
    %int128_46 = torch.constant.int 128
    %59 = torch.aten.div.Scalar %58, %int128_46 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
    %60 = torch.aten.exp %59 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
    %int6_47 = torch.constant.int 6
    %61 = torch.prims.convert_element_type %60, %int6_47 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
    %int0_48 = torch.constant.int 0
    %int0_49 = torch.constant.int 0
    %int9223372036854775807_50 = torch.constant.int 9223372036854775807
    %int1_51 = torch.constant.int 1
    %62 = torch.aten.slice.Tensor %56, %int0_48, %int0_49, %int9223372036854775807_50, %int1_51 : !torch.vtensor<[1],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1],f16>
    %int1_52 = torch.constant.int 1
    %63 = torch.aten.unsqueeze %62, %int1_52 : !torch.vtensor<[1],f16>, !torch.int -> !torch.vtensor<[1,1],f16>
    %int6_53 = torch.constant.int 6
    %64 = torch.prims.convert_element_type %63, %int6_53 : !torch.vtensor<[1,1],f16>, !torch.int -> !torch.vtensor<[1,1],f32>
    %int0_54 = torch.constant.int 0
    %65 = torch.aten.unsqueeze %61, %int0_54 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[1,128],f32>
    %66 = torch.aten.mul.Tensor %64, %65 : !torch.vtensor<[1,1],f32>, !torch.vtensor<[1,128],f32> -> !torch.vtensor<[1,128],f32>
    %67 = torch.aten.cos %66 : !torch.vtensor<[1,128],f32> -> !torch.vtensor<[1,128],f32>
    %68 = torch.aten.sin %66 : !torch.vtensor<[1,128],f32> -> !torch.vtensor<[1,128],f32>
    %69 = torch.prim.ListConstruct %67, %68 : (!torch.vtensor<[1,128],f32>, !torch.vtensor<[1,128],f32>) -> !torch.list<vtensor>
    %int-1_55 = torch.constant.int -1
    %70 = torch.aten.cat %69, %int-1_55 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,256],f32>
    %int5_56 = torch.constant.int 5
    %71 = torch.prims.convert_element_type %70, %int5_56 : !torch.vtensor<[1,256],f32>, !torch.int -> !torch.vtensor<[1,256],f16>
    %__auto.sampler.guidance_in.in_layer.weight = util.global.load @__auto.sampler.guidance_in.in_layer.weight : tensor<3072x256xf16>
    %72 = torch_c.from_builtin_tensor %__auto.sampler.guidance_in.in_layer.weight : tensor<3072x256xf16> -> !torch.vtensor<[3072,256],f16>
    %int0_57 = torch.constant.int 0
    %int1_58 = torch.constant.int 1
    %73 = torch.aten.transpose.int %72, %int0_57, %int1_58 : !torch.vtensor<[3072,256],f16>, !torch.int, !torch.int -> !torch.vtensor<[256,3072],f16>
    %__auto.sampler.guidance_in.in_layer.bias = util.global.load @__auto.sampler.guidance_in.in_layer.bias : tensor<3072xf16>
    %74 = torch_c.from_builtin_tensor %__auto.sampler.guidance_in.in_layer.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_59 = torch.constant.int 6
    %75 = torch.prims.convert_element_type %74, %int6_59 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_60 = torch.constant.int 6
    %76 = torch.prims.convert_element_type %71, %int6_60 : !torch.vtensor<[1,256],f16>, !torch.int -> !torch.vtensor<[1,256],f32>
    %int6_61 = torch.constant.int 6
    %77 = torch.prims.convert_element_type %73, %int6_61 : !torch.vtensor<[256,3072],f16>, !torch.int -> !torch.vtensor<[256,3072],f32>
    %78 = torch.aten.mm %76, %77 : !torch.vtensor<[1,256],f32>, !torch.vtensor<[256,3072],f32> -> !torch.vtensor<[1,3072],f32>
    %int1_62 = torch.constant.int 1
    %79 = torch.aten.mul.Scalar %78, %int1_62 : !torch.vtensor<[1,3072],f32>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int1_63 = torch.constant.int 1
    %80 = torch.aten.mul.Scalar %75, %int1_63 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_64 = torch.constant.int 1
    %81 = torch.aten.add.Tensor %79, %80, %int1_64 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int5_65 = torch.constant.int 5
    %82 = torch.prims.convert_element_type %81, %int5_65 : !torch.vtensor<[1,3072],f32>, !torch.int -> !torch.vtensor<[1,3072],f16>
    %83 = torch.aten.silu %82 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.guidance_in.out_layer.weight = util.global.load @__auto.sampler.guidance_in.out_layer.weight : tensor<3072x3072xf16>
    %84 = torch_c.from_builtin_tensor %__auto.sampler.guidance_in.out_layer.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_66 = torch.constant.int 0
    %int1_67 = torch.constant.int 1
    %85 = torch.aten.transpose.int %84, %int0_66, %int1_67 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.guidance_in.out_layer.bias = util.global.load @__auto.sampler.guidance_in.out_layer.bias : tensor<3072xf16>
    %86 = torch_c.from_builtin_tensor %__auto.sampler.guidance_in.out_layer.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_68 = torch.constant.int 6
    %87 = torch.prims.convert_element_type %86, %int6_68 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_69 = torch.constant.int 6
    %88 = torch.prims.convert_element_type %83, %int6_69 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_70 = torch.constant.int 6
    %89 = torch.prims.convert_element_type %85, %int6_70 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %90 = torch.aten.mm %88, %89 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[1,3072],f32>
    %int1_71 = torch.constant.int 1
    %91 = torch.aten.mul.Scalar %90, %int1_71 : !torch.vtensor<[1,3072],f32>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int1_72 = torch.constant.int 1
    %92 = torch.aten.mul.Scalar %87, %int1_72 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_73 = torch.constant.int 1
    %93 = torch.aten.add.Tensor %91, %92, %int1_73 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int5_74 = torch.constant.int 5
    %94 = torch.prims.convert_element_type %93, %int5_74 : !torch.vtensor<[1,3072],f32>, !torch.int -> !torch.vtensor<[1,3072],f16>
    %int1_75 = torch.constant.int 1
    %95 = torch.aten.add.Tensor %55, %94, %int1_75 : !torch.vtensor<[1,3072],f16>, !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.vector_in.in_layer.weight = util.global.load @__auto.sampler.vector_in.in_layer.weight : tensor<3072x768xf16>
    %96 = torch_c.from_builtin_tensor %__auto.sampler.vector_in.in_layer.weight : tensor<3072x768xf16> -> !torch.vtensor<[3072,768],f16>
    %int0_76 = torch.constant.int 0
    %int1_77 = torch.constant.int 1
    %97 = torch.aten.transpose.int %96, %int0_76, %int1_77 : !torch.vtensor<[3072,768],f16>, !torch.int, !torch.int -> !torch.vtensor<[768,3072],f16>
    %__auto.sampler.vector_in.in_layer.bias = util.global.load @__auto.sampler.vector_in.in_layer.bias : tensor<3072xf16>
    %98 = torch_c.from_builtin_tensor %__auto.sampler.vector_in.in_layer.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_78 = torch.constant.int 6
    %99 = torch.prims.convert_element_type %98, %int6_78 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_79 = torch.constant.int 6
    %100 = torch.prims.convert_element_type %arg4, %int6_79 : !torch.vtensor<[1,768],f16>, !torch.int -> !torch.vtensor<[1,768],f32>
    %int6_80 = torch.constant.int 6
    %101 = torch.prims.convert_element_type %97, %int6_80 : !torch.vtensor<[768,3072],f16>, !torch.int -> !torch.vtensor<[768,3072],f32>
    %102 = torch.aten.mm %100, %101 : !torch.vtensor<[1,768],f32>, !torch.vtensor<[768,3072],f32> -> !torch.vtensor<[1,3072],f32>
    %int1_81 = torch.constant.int 1
    %103 = torch.aten.mul.Scalar %102, %int1_81 : !torch.vtensor<[1,3072],f32>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int1_82 = torch.constant.int 1
    %104 = torch.aten.mul.Scalar %99, %int1_82 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_83 = torch.constant.int 1
    %105 = torch.aten.add.Tensor %103, %104, %int1_83 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int5_84 = torch.constant.int 5
    %106 = torch.prims.convert_element_type %105, %int5_84 : !torch.vtensor<[1,3072],f32>, !torch.int -> !torch.vtensor<[1,3072],f16>
    %107 = torch.aten.silu %106 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.vector_in.out_layer.weight = util.global.load @__auto.sampler.vector_in.out_layer.weight : tensor<3072x3072xf16>
    %108 = torch_c.from_builtin_tensor %__auto.sampler.vector_in.out_layer.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_85 = torch.constant.int 0
    %int1_86 = torch.constant.int 1
    %109 = torch.aten.transpose.int %108, %int0_85, %int1_86 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.vector_in.out_layer.bias = util.global.load @__auto.sampler.vector_in.out_layer.bias : tensor<3072xf16>
    %110 = torch_c.from_builtin_tensor %__auto.sampler.vector_in.out_layer.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_87 = torch.constant.int 6
    %111 = torch.prims.convert_element_type %110, %int6_87 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_88 = torch.constant.int 6
    %112 = torch.prims.convert_element_type %107, %int6_88 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_89 = torch.constant.int 6
    %113 = torch.prims.convert_element_type %109, %int6_89 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %114 = torch.aten.mm %112, %113 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[1,3072],f32>
    %int1_90 = torch.constant.int 1
    %115 = torch.aten.mul.Scalar %114, %int1_90 : !torch.vtensor<[1,3072],f32>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int1_91 = torch.constant.int 1
    %116 = torch.aten.mul.Scalar %111, %int1_91 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_92 = torch.constant.int 1
    %117 = torch.aten.add.Tensor %115, %116, %int1_92 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int5_93 = torch.constant.int 5
    %118 = torch.prims.convert_element_type %117, %int5_93 : !torch.vtensor<[1,3072],f32>, !torch.int -> !torch.vtensor<[1,3072],f16>
    %int1_94 = torch.constant.int 1
    %119 = torch.aten.add.Tensor %95, %118, %int1_94 : !torch.vtensor<[1,3072],f16>, !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f16>
    // Linear projection using the `sampler.txt_in` weight/bias applied to %arg2:
    // flatten to 2-D, matmul against the transposed weight in f32, add the bias,
    // cast back to f16 and restore the leading batch dim.
    %int512 = torch.constant.int 512
    %int4096_95 = torch.constant.int 4096
    %120 = torch.prim.ListConstruct %int512, %int4096_95 : (!torch.int, !torch.int) -> !torch.list<int>
    // [1,512,4096] -> [512,4096] so a plain 2-D mm can be used.
    %121 = torch.aten.view %arg2, %120 : !torch.vtensor<[1,512,4096],f16>, !torch.list<int> -> !torch.vtensor<[512,4096],f16>
    %__auto.sampler.txt_in.weight = util.global.load @__auto.sampler.txt_in.weight : tensor<3072x4096xf16>
    %122 = torch_c.from_builtin_tensor %__auto.sampler.txt_in.weight : tensor<3072x4096xf16> -> !torch.vtensor<[3072,4096],f16>
    %int0_96 = torch.constant.int 0
    %int1_97 = torch.constant.int 1
    // Swap dims 0 and 1: [3072,4096] -> [4096,3072], matching the mm right-hand side.
    %123 = torch.aten.transpose.int %122, %int0_96, %int1_97 : !torch.vtensor<[3072,4096],f16>, !torch.int, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.txt_in.bias = util.global.load @__auto.sampler.txt_in.bias : tensor<3072xf16>
    %124 = torch_c.from_builtin_tensor %__auto.sampler.txt_in.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, input and weight to f32 (dtype code 6) so the matmul and the
    // bias add are carried out in f32.
    %int6_98 = torch.constant.int 6
    %125 = torch.prims.convert_element_type %124, %int6_98 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_99 = torch.constant.int 6
    %126 = torch.prims.convert_element_type %121, %int6_99 : !torch.vtensor<[512,4096],f16>, !torch.int -> !torch.vtensor<[512,4096],f32>
    %int6_100 = torch.constant.int 6
    %127 = torch.prims.convert_element_type %123, %int6_100 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %128 = torch.aten.mm %126, %127 : !torch.vtensor<[512,4096],f32>, !torch.vtensor<[4096,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // The two multiplications by 1 and the alpha = 1 on the add leave values unscaled.
    %int1_101 = torch.constant.int 1
    %129 = torch.aten.mul.Scalar %128, %int1_101 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_102 = torch.constant.int 1
    %130 = torch.aten.mul.Scalar %125, %int1_102 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_103 = torch.constant.int 1
    %131 = torch.aten.add.Tensor %129, %130, %int1_103 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    // Back to f16 (dtype code 5).
    %int5_104 = torch.constant.int 5
    %132 = torch.prims.convert_element_type %131, %int5_104 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    // Restore the leading dim: [512,3072] -> [1,512,3072].
    %int1_105 = torch.constant.int 1
    %int512_106 = torch.constant.int 512
    %int3072_107 = torch.constant.int 3072
    %133 = torch.prim.ListConstruct %int1_105, %int512_106, %int3072_107 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %134 = torch.aten.view %132, %133 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Concatenate %arg3 ([1,512,3]) and %arg1 ([1,4096,3]) along dim 1 -> [1,4608,3].
    // NOTE(review): presumably per-token position ids (text first, then image);
    // confirm against the function signature / caller.
    %135 = torch.prim.ListConstruct %arg3, %arg1 : (!torch.vtensor<[1,512,3],f16>, !torch.vtensor<[1,4096,3],f16>) -> !torch.list<vtensor>
    %int1_108 = torch.constant.int 1
    %136 = torch.aten.cat %135, %int1_108 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,3],f16>
    // ---- Rotation tables for component 0 of the last dim (8 frequencies) ----
    // Pick index 0 along dim 2 -> [1,4608].
    %int2 = torch.constant.int 2
    %int0_109 = torch.constant.int 0
    %137 = torch.aten.select.int %136, %int2, %int0_109 : !torch.vtensor<[1,4608,3],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,4608],f16>
    // arange(0, 16, step 2) in f64 (dtype code 7) -> 8 values: 0, 2, ..., 14.
    %int0_110 = torch.constant.int 0
    %int16 = torch.constant.int 16
    %int2_111 = torch.constant.int 2
    %int7 = torch.constant.int 7
    %none_112 = torch.constant.none
    %cpu_113 = torch.constant.device "cpu"
    %false_114 = torch.constant.bool false
    %138 = torch.aten.arange.start_step %int0_110, %int16, %int2_111, %int7, %none_112, %cpu_113, %false_114 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[8],f64>
    // Frequencies: 1 / 10000^(k / 16) for each k produced by the arange.
    %int16_115 = torch.constant.int 16
    %139 = torch.aten.div.Scalar %138, %int16_115 : !torch.vtensor<[8],f64>, !torch.int -> !torch.vtensor<[8],f64>
    %int10000 = torch.constant.int 10000
    %140 = torch.aten.pow.Scalar %int10000, %139 : !torch.int, !torch.vtensor<[8],f64> -> !torch.vtensor<[8],f64>
    %141 = torch.aten.reciprocal %140 : !torch.vtensor<[8],f64> -> !torch.vtensor<[8],f64>
    // Multiplication by 1.0 leaves the frequencies unchanged.
    %float1.000000e00 = torch.constant.float 1.000000e+00
    %142 = torch.aten.mul.Scalar %141, %float1.000000e00 : !torch.vtensor<[8],f64>, !torch.float -> !torch.vtensor<[8],f64>
    // Positions: [1,4608] -> [1,4608,1]; the permute uses the identity order [0,1,2].
    %int2_116 = torch.constant.int 2
    %143 = torch.aten.unsqueeze %137, %int2_116 : !torch.vtensor<[1,4608],f16>, !torch.int -> !torch.vtensor<[1,4608,1],f16>
    %int0_117 = torch.constant.int 0
    %int1_118 = torch.constant.int 1
    %int2_119 = torch.constant.int 2
    %144 = torch.prim.ListConstruct %int0_117, %int1_118, %int2_119 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %145 = torch.aten.permute %143, %144 : !torch.vtensor<[1,4608,1],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,1],f16>
    // Frequencies: [8] -> [8,1] -> [8,1,1] -> permute [1,2,0] -> [1,1,8].
    %int1_120 = torch.constant.int 1
    %146 = torch.aten.unsqueeze %142, %int1_120 : !torch.vtensor<[8],f64>, !torch.int -> !torch.vtensor<[8,1],f64>
    %int2_121 = torch.constant.int 2
    %147 = torch.aten.unsqueeze %146, %int2_121 : !torch.vtensor<[8,1],f64>, !torch.int -> !torch.vtensor<[8,1,1],f64>
    %int1_122 = torch.constant.int 1
    %int2_123 = torch.constant.int 2
    %int0_124 = torch.constant.int 0
    %148 = torch.prim.ListConstruct %int1_122, %int2_123, %int0_124 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %149 = torch.aten.permute %147, %148 : !torch.vtensor<[8,1,1],f64>, !torch.list<int> -> !torch.vtensor<[1,1,8],f64>
    // Broadcast product position * frequency -> angles [1,4608,8]; the f16 operand
    // is promoted and the result is f64.
    %150 = torch.aten.mul.Tensor %145, %149 : !torch.vtensor<[1,4608,1],f16>, !torch.vtensor<[1,1,8],f64> -> !torch.vtensor<[1,4608,8],f64>
    // Entries cos, -sin, sin, cos of the angle. sin and cos are each emitted twice
    // on the same operand (%154 repeats %152's op, %155 repeats %151's).
    %151 = torch.aten.cos %150 : !torch.vtensor<[1,4608,8],f64> -> !torch.vtensor<[1,4608,8],f64>
    %152 = torch.aten.sin %150 : !torch.vtensor<[1,4608,8],f64> -> !torch.vtensor<[1,4608,8],f64>
    %153 = torch.aten.neg %152 : !torch.vtensor<[1,4608,8],f64> -> !torch.vtensor<[1,4608,8],f64>
    %154 = torch.aten.sin %150 : !torch.vtensor<[1,4608,8],f64> -> !torch.vtensor<[1,4608,8],f64>
    %155 = torch.aten.cos %150 : !torch.vtensor<[1,4608,8],f64> -> !torch.vtensor<[1,4608,8],f64>
    %156 = torch.prim.ListConstruct %151, %153, %154, %155 : (!torch.vtensor<[1,4608,8],f64>, !torch.vtensor<[1,4608,8],f64>, !torch.vtensor<[1,4608,8],f64>, !torch.vtensor<[1,4608,8],f64>) -> !torch.list<vtensor>
    // Stack on a new trailing dim, then view the 4 entries as a 2x2 matrix
    // [[cos, -sin], [sin, cos]] per (position, frequency).
    %int-1_125 = torch.constant.int -1
    %157 = torch.aten.stack %156, %int-1_125 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,8,4],f64>
    %int1_126 = torch.constant.int 1
    %int4608 = torch.constant.int 4608
    %int8 = torch.constant.int 8
    %int2_127 = torch.constant.int 2
    %int2_128 = torch.constant.int 2
    %158 = torch.prim.ListConstruct %int1_126, %int4608, %int8, %int2_127, %int2_128 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %159 = torch.aten.view %157, %158 : !torch.vtensor<[1,4608,8,4],f64>, !torch.list<int> -> !torch.vtensor<[1,4608,8,2,2],f64>
    // Narrow from f64 to f32 (dtype code 6).
    %int6_129 = torch.constant.int 6
    %160 = torch.prims.convert_element_type %159, %int6_129 : !torch.vtensor<[1,4608,8,2,2],f64>, !torch.int -> !torch.vtensor<[1,4608,8,2,2],f32>
    // ---- Rotation tables for component 1 of %136's last dim (28 frequencies) ----
    // Pick index 1 along dim 2 of %136 -> [1,4608].
    %int2_130 = torch.constant.int 2
    %int1_131 = torch.constant.int 1
    %161 = torch.aten.select.int %136, %int2_130, %int1_131 : !torch.vtensor<[1,4608,3],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,4608],f16>
    // arange(0, 56, step 2) in f64 (dtype code 7) -> 28 values: 0, 2, ..., 54.
    %int0_132 = torch.constant.int 0
    %int56 = torch.constant.int 56
    %int2_133 = torch.constant.int 2
    %int7_134 = torch.constant.int 7
    %none_135 = torch.constant.none
    %cpu_136 = torch.constant.device "cpu"
    %false_137 = torch.constant.bool false
    %162 = torch.aten.arange.start_step %int0_132, %int56, %int2_133, %int7_134, %none_135, %cpu_136, %false_137 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[28],f64>
    // Frequencies: 1 / 10000^(k / 56) for each k produced by the arange.
    %int56_138 = torch.constant.int 56
    %163 = torch.aten.div.Scalar %162, %int56_138 : !torch.vtensor<[28],f64>, !torch.int -> !torch.vtensor<[28],f64>
    %int10000_139 = torch.constant.int 10000
    %164 = torch.aten.pow.Scalar %int10000_139, %163 : !torch.int, !torch.vtensor<[28],f64> -> !torch.vtensor<[28],f64>
    %165 = torch.aten.reciprocal %164 : !torch.vtensor<[28],f64> -> !torch.vtensor<[28],f64>
    // Multiplication by 1.0 leaves the frequencies unchanged.
    %float1.000000e00_140 = torch.constant.float 1.000000e+00
    %166 = torch.aten.mul.Scalar %165, %float1.000000e00_140 : !torch.vtensor<[28],f64>, !torch.float -> !torch.vtensor<[28],f64>
    // Positions: [1,4608] -> [1,4608,1]; the permute uses the identity order [0,1,2].
    %int2_141 = torch.constant.int 2
    %167 = torch.aten.unsqueeze %161, %int2_141 : !torch.vtensor<[1,4608],f16>, !torch.int -> !torch.vtensor<[1,4608,1],f16>
    %int0_142 = torch.constant.int 0
    %int1_143 = torch.constant.int 1
    %int2_144 = torch.constant.int 2
    %168 = torch.prim.ListConstruct %int0_142, %int1_143, %int2_144 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %169 = torch.aten.permute %167, %168 : !torch.vtensor<[1,4608,1],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,1],f16>
    // Frequencies: [28] -> [28,1] -> [28,1,1] -> permute [1,2,0] -> [1,1,28].
    %int1_145 = torch.constant.int 1
    %170 = torch.aten.unsqueeze %166, %int1_145 : !torch.vtensor<[28],f64>, !torch.int -> !torch.vtensor<[28,1],f64>
    %int2_146 = torch.constant.int 2
    %171 = torch.aten.unsqueeze %170, %int2_146 : !torch.vtensor<[28,1],f64>, !torch.int -> !torch.vtensor<[28,1,1],f64>
    %int1_147 = torch.constant.int 1
    %int2_148 = torch.constant.int 2
    %int0_149 = torch.constant.int 0
    %172 = torch.prim.ListConstruct %int1_147, %int2_148, %int0_149 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %173 = torch.aten.permute %171, %172 : !torch.vtensor<[28,1,1],f64>, !torch.list<int> -> !torch.vtensor<[1,1,28],f64>
    // Broadcast product position * frequency -> angles [1,4608,28] in f64.
    %174 = torch.aten.mul.Tensor %169, %173 : !torch.vtensor<[1,4608,1],f16>, !torch.vtensor<[1,1,28],f64> -> !torch.vtensor<[1,4608,28],f64>
    // Entries cos, -sin, sin, cos of the angle (sin and cos are each emitted twice).
    %175 = torch.aten.cos %174 : !torch.vtensor<[1,4608,28],f64> -> !torch.vtensor<[1,4608,28],f64>
    %176 = torch.aten.sin %174 : !torch.vtensor<[1,4608,28],f64> -> !torch.vtensor<[1,4608,28],f64>
    %177 = torch.aten.neg %176 : !torch.vtensor<[1,4608,28],f64> -> !torch.vtensor<[1,4608,28],f64>
    %178 = torch.aten.sin %174 : !torch.vtensor<[1,4608,28],f64> -> !torch.vtensor<[1,4608,28],f64>
    %179 = torch.aten.cos %174 : !torch.vtensor<[1,4608,28],f64> -> !torch.vtensor<[1,4608,28],f64>
    %180 = torch.prim.ListConstruct %175, %177, %178, %179 : (!torch.vtensor<[1,4608,28],f64>, !torch.vtensor<[1,4608,28],f64>, !torch.vtensor<[1,4608,28],f64>, !torch.vtensor<[1,4608,28],f64>) -> !torch.list<vtensor>
    // Stack on a new trailing dim, then view the 4 entries as a 2x2 matrix
    // [[cos, -sin], [sin, cos]] per (position, frequency).
    %int-1_150 = torch.constant.int -1
    %181 = torch.aten.stack %180, %int-1_150 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,28,4],f64>
    %int1_151 = torch.constant.int 1
    %int4608_152 = torch.constant.int 4608
    %int28 = torch.constant.int 28
    %int2_153 = torch.constant.int 2
    %int2_154 = torch.constant.int 2
    %182 = torch.prim.ListConstruct %int1_151, %int4608_152, %int28, %int2_153, %int2_154 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %183 = torch.aten.view %181, %182 : !torch.vtensor<[1,4608,28,4],f64>, !torch.list<int> -> !torch.vtensor<[1,4608,28,2,2],f64>
    // Narrow from f64 to f32 (dtype code 6).
    %int6_155 = torch.constant.int 6
    %184 = torch.prims.convert_element_type %183, %int6_155 : !torch.vtensor<[1,4608,28,2,2],f64>, !torch.int -> !torch.vtensor<[1,4608,28,2,2],f32>
    // ---- Rotation tables for component 2 of %136's last dim (28 frequencies) ----
    // Pick index 2 along dim 2 of %136 -> [1,4608].
    %int2_156 = torch.constant.int 2
    %int2_157 = torch.constant.int 2
    %185 = torch.aten.select.int %136, %int2_156, %int2_157 : !torch.vtensor<[1,4608,3],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,4608],f16>
    // arange(0, 56, step 2) in f64 (dtype code 7) -> 28 values: 0, 2, ..., 54.
    %int0_158 = torch.constant.int 0
    %int56_159 = torch.constant.int 56
    %int2_160 = torch.constant.int 2
    %int7_161 = torch.constant.int 7
    %none_162 = torch.constant.none
    %cpu_163 = torch.constant.device "cpu"
    %false_164 = torch.constant.bool false
    %186 = torch.aten.arange.start_step %int0_158, %int56_159, %int2_160, %int7_161, %none_162, %cpu_163, %false_164 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[28],f64>
    // Frequencies: 1 / 10000^(k / 56) for each k produced by the arange.
    %int56_165 = torch.constant.int 56
    %187 = torch.aten.div.Scalar %186, %int56_165 : !torch.vtensor<[28],f64>, !torch.int -> !torch.vtensor<[28],f64>
    %int10000_166 = torch.constant.int 10000
    %188 = torch.aten.pow.Scalar %int10000_166, %187 : !torch.int, !torch.vtensor<[28],f64> -> !torch.vtensor<[28],f64>
    %189 = torch.aten.reciprocal %188 : !torch.vtensor<[28],f64> -> !torch.vtensor<[28],f64>
    // Multiplication by 1.0 leaves the frequencies unchanged.
    %float1.000000e00_167 = torch.constant.float 1.000000e+00
    %190 = torch.aten.mul.Scalar %189, %float1.000000e00_167 : !torch.vtensor<[28],f64>, !torch.float -> !torch.vtensor<[28],f64>
    // Positions: [1,4608] -> [1,4608,1]; the permute uses the identity order [0,1,2].
    %int2_168 = torch.constant.int 2
    %191 = torch.aten.unsqueeze %185, %int2_168 : !torch.vtensor<[1,4608],f16>, !torch.int -> !torch.vtensor<[1,4608,1],f16>
    %int0_169 = torch.constant.int 0
    %int1_170 = torch.constant.int 1
    %int2_171 = torch.constant.int 2
    %192 = torch.prim.ListConstruct %int0_169, %int1_170, %int2_171 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %193 = torch.aten.permute %191, %192 : !torch.vtensor<[1,4608,1],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,1],f16>
    // Frequencies: [28] -> [28,1] -> [28,1,1] -> permute [1,2,0] -> [1,1,28].
    %int1_172 = torch.constant.int 1
    %194 = torch.aten.unsqueeze %190, %int1_172 : !torch.vtensor<[28],f64>, !torch.int -> !torch.vtensor<[28,1],f64>
    %int2_173 = torch.constant.int 2
    %195 = torch.aten.unsqueeze %194, %int2_173 : !torch.vtensor<[28,1],f64>, !torch.int -> !torch.vtensor<[28,1,1],f64>
    %int1_174 = torch.constant.int 1
    %int2_175 = torch.constant.int 2
    %int0_176 = torch.constant.int 0
    %196 = torch.prim.ListConstruct %int1_174, %int2_175, %int0_176 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %197 = torch.aten.permute %195, %196 : !torch.vtensor<[28,1,1],f64>, !torch.list<int> -> !torch.vtensor<[1,1,28],f64>
    // Broadcast product position * frequency -> angles [1,4608,28] in f64.
    %198 = torch.aten.mul.Tensor %193, %197 : !torch.vtensor<[1,4608,1],f16>, !torch.vtensor<[1,1,28],f64> -> !torch.vtensor<[1,4608,28],f64>
    // Entries cos, -sin, sin, cos of the angle (sin and cos are each emitted twice).
    %199 = torch.aten.cos %198 : !torch.vtensor<[1,4608,28],f64> -> !torch.vtensor<[1,4608,28],f64>
    %200 = torch.aten.sin %198 : !torch.vtensor<[1,4608,28],f64> -> !torch.vtensor<[1,4608,28],f64>
    %201 = torch.aten.neg %200 : !torch.vtensor<[1,4608,28],f64> -> !torch.vtensor<[1,4608,28],f64>
    %202 = torch.aten.sin %198 : !torch.vtensor<[1,4608,28],f64> -> !torch.vtensor<[1,4608,28],f64>
    %203 = torch.aten.cos %198 : !torch.vtensor<[1,4608,28],f64> -> !torch.vtensor<[1,4608,28],f64>
    %204 = torch.prim.ListConstruct %199, %201, %202, %203 : (!torch.vtensor<[1,4608,28],f64>, !torch.vtensor<[1,4608,28],f64>, !torch.vtensor<[1,4608,28],f64>, !torch.vtensor<[1,4608,28],f64>) -> !torch.list<vtensor>
    // Stack on a new trailing dim, then view the 4 entries as a 2x2 matrix
    // [[cos, -sin], [sin, cos]] per (position, frequency).
    %int-1_177 = torch.constant.int -1
    %205 = torch.aten.stack %204, %int-1_177 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,28,4],f64>
    %int1_178 = torch.constant.int 1
    %int4608_179 = torch.constant.int 4608
    %int28_180 = torch.constant.int 28
    %int2_181 = torch.constant.int 2
    %int2_182 = torch.constant.int 2
    %206 = torch.prim.ListConstruct %int1_178, %int4608_179, %int28_180, %int2_181, %int2_182 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %207 = torch.aten.view %205, %206 : !torch.vtensor<[1,4608,28,4],f64>, !torch.list<int> -> !torch.vtensor<[1,4608,28,2,2],f64>
    // Narrow from f64 to f32 (dtype code 6).
    %int6_183 = torch.constant.int 6
    %208 = torch.prims.convert_element_type %207, %int6_183 : !torch.vtensor<[1,4608,28,2,2],f64>, !torch.int -> !torch.vtensor<[1,4608,28,2,2],f32>
    // Join the three per-component rotation tables along dim -3 (the frequency dim):
    // 8 + 28 + 28 = 64 -> [1,4608,64,2,2].
    %209 = torch.prim.ListConstruct %160, %184, %208 : (!torch.vtensor<[1,4608,8,2,2],f32>, !torch.vtensor<[1,4608,28,2,2],f32>, !torch.vtensor<[1,4608,28,2,2],f32>) -> !torch.list<vtensor>
    %int-3 = torch.constant.int -3
    %210 = torch.aten.cat %209, %int-3 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,64,2,2],f32>
    // Insert a size-1 dim at position 1 -> [1,1,4608,64,2,2].
    // NOTE(review): presumably a broadcast dim over attention heads; confirm at the use site.
    %int1_184 = torch.constant.int 1
    %211 = torch.aten.unsqueeze %210, %int1_184 : !torch.vtensor<[1,4608,64,2,2],f32>, !torch.int -> !torch.vtensor<[1,1,4608,64,2,2],f32>
    // `double_blocks.0.img_mod.lin`: silu(%119) -> linear to 18432 features (matmul
    // and bias add in f32, result cast to f16) -> split into six 3072-wide chunks.
    %212 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.double_blocks.0.img_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.0.img_mod.lin.weight : tensor<18432x3072xf16>
    %213 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.img_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_185 = torch.constant.int 0
    %int1_186 = torch.constant.int 1
    // Swap dims 0 and 1: [18432,3072] -> [3072,18432], matching the mm right-hand side.
    %214 = torch.aten.transpose.int %213, %int0_185, %int1_186 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.0.img_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.0.img_mod.lin.bias : tensor<18432xf16>
    %215 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.img_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Upcast bias, activation and weight to f32 (dtype code 6).
    %int6_187 = torch.constant.int 6
    %216 = torch.prims.convert_element_type %215, %int6_187 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_188 = torch.constant.int 6
    %217 = torch.prims.convert_element_type %212, %int6_188 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_189 = torch.constant.int 6
    %218 = torch.prims.convert_element_type %214, %int6_189 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %219 = torch.aten.mm %217, %218 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // The two multiplications by 1 and the alpha = 1 on the add leave values unscaled.
    %int1_190 = torch.constant.int 1
    %220 = torch.aten.mul.Scalar %219, %int1_190 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_191 = torch.constant.int 1
    %221 = torch.aten.mul.Scalar %216, %int1_191 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_192 = torch.constant.int 1
    %222 = torch.aten.add.Tensor %220, %221, %int1_192 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    // Back to f16 (dtype code 5).
    %int5_193 = torch.constant.int 5
    %223 = torch.prims.convert_element_type %222, %int5_193 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Slice of dim 0 from 0 to INT64_MAX with step 1: the shape is unchanged.
    %int0_194 = torch.constant.int 0
    %int0_195 = torch.constant.int 0
    %int9223372036854775807_196 = torch.constant.int 9223372036854775807
    %int1_197 = torch.constant.int 1
    %224 = torch.aten.slice.Tensor %223, %int0_194, %int0_195, %int9223372036854775807_196, %int1_197 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a size-1 middle dim -> [1,1,18432] so the chunks broadcast against
    // [1,N,3072] tensors.
    %int1_198 = torch.constant.int 1
    %225 = torch.aten.unsqueeze %224, %int1_198 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Slice of dim 2 from 0 to INT64_MAX with step 1: the shape is unchanged.
    %int2_199 = torch.constant.int 2
    %int0_200 = torch.constant.int 0
    %int9223372036854775807_201 = torch.constant.int 9223372036854775807
    %int1_202 = torch.constant.int 1
    %226 = torch.aten.slice.Tensor %225, %int2_199, %int0_200, %int9223372036854775807_201, %int1_202 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Six consecutive 3072-wide slices of the last dim.
    // Chunk 0: [0, 3072) -- added after the scale multiply at %263 below.
    %int-1_203 = torch.constant.int -1
    %int0_204 = torch.constant.int 0
    %int3072_205 = torch.constant.int 3072
    %int1_206 = torch.constant.int 1
    %227 = torch.aten.slice.Tensor %226, %int-1_203, %int0_204, %int3072_205, %int1_206 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1: [3072, 6144) -- used as (1 + chunk) multiplier at %261/%262 below.
    %int-1_207 = torch.constant.int -1
    %int3072_208 = torch.constant.int 3072
    %int6144 = torch.constant.int 6144
    %int1_209 = torch.constant.int 1
    %228 = torch.aten.slice.Tensor %226, %int-1_207, %int3072_208, %int6144, %int1_209 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2: [6144, 9216). Chunks 2-5 are not consumed within this excerpt.
    %int-1_210 = torch.constant.int -1
    %int6144_211 = torch.constant.int 6144
    %int9216 = torch.constant.int 9216
    %int1_212 = torch.constant.int 1
    %229 = torch.aten.slice.Tensor %226, %int-1_210, %int6144_211, %int9216, %int1_212 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 3: [9216, 12288).
    %int-1_213 = torch.constant.int -1
    %int9216_214 = torch.constant.int 9216
    %int12288 = torch.constant.int 12288
    %int1_215 = torch.constant.int 1
    %230 = torch.aten.slice.Tensor %226, %int-1_213, %int9216_214, %int12288, %int1_215 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 4: [12288, 15360).
    %int-1_216 = torch.constant.int -1
    %int12288_217 = torch.constant.int 12288
    %int15360 = torch.constant.int 15360
    %int1_218 = torch.constant.int 1
    %231 = torch.aten.slice.Tensor %226, %int-1_216, %int12288_217, %int15360, %int1_218 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 5: [15360, 18432).
    %int-1_219 = torch.constant.int -1
    %int15360_220 = torch.constant.int 15360
    %int18432 = torch.constant.int 18432
    %int1_221 = torch.constant.int 1
    %232 = torch.aten.slice.Tensor %226, %int-1_219, %int15360_220, %int18432, %int1_221 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // `double_blocks.0.txt_mod.lin`: silu(%119) -> linear to 18432 features (matmul
    // and bias add in f32, result cast to f16) -> split into six 3072-wide chunks.
    // The silu is applied to the same operand (%119) as the one at %212.
    %233 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.double_blocks.0.txt_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.0.txt_mod.lin.weight : tensor<18432x3072xf16>
    %234 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.txt_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_222 = torch.constant.int 0
    %int1_223 = torch.constant.int 1
    // Swap dims 0 and 1: [18432,3072] -> [3072,18432], matching the mm right-hand side.
    %235 = torch.aten.transpose.int %234, %int0_222, %int1_223 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.0.txt_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.0.txt_mod.lin.bias : tensor<18432xf16>
    %236 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.txt_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Upcast bias, activation and weight to f32 (dtype code 6).
    %int6_224 = torch.constant.int 6
    %237 = torch.prims.convert_element_type %236, %int6_224 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_225 = torch.constant.int 6
    %238 = torch.prims.convert_element_type %233, %int6_225 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_226 = torch.constant.int 6
    %239 = torch.prims.convert_element_type %235, %int6_226 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %240 = torch.aten.mm %238, %239 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // The two multiplications by 1 and the alpha = 1 on the add leave values unscaled.
    %int1_227 = torch.constant.int 1
    %241 = torch.aten.mul.Scalar %240, %int1_227 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_228 = torch.constant.int 1
    %242 = torch.aten.mul.Scalar %237, %int1_228 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_229 = torch.constant.int 1
    %243 = torch.aten.add.Tensor %241, %242, %int1_229 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    // Back to f16 (dtype code 5).
    %int5_230 = torch.constant.int 5
    %244 = torch.prims.convert_element_type %243, %int5_230 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Slice of dim 0 from 0 to INT64_MAX with step 1: the shape is unchanged.
    %int0_231 = torch.constant.int 0
    %int0_232 = torch.constant.int 0
    %int9223372036854775807_233 = torch.constant.int 9223372036854775807
    %int1_234 = torch.constant.int 1
    %245 = torch.aten.slice.Tensor %244, %int0_231, %int0_232, %int9223372036854775807_233, %int1_234 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a size-1 middle dim -> [1,1,18432].
    %int1_235 = torch.constant.int 1
    %246 = torch.aten.unsqueeze %245, %int1_235 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Slice of dim 2 from 0 to INT64_MAX with step 1: the shape is unchanged.
    %int2_236 = torch.constant.int 2
    %int0_237 = torch.constant.int 0
    %int9223372036854775807_238 = torch.constant.int 9223372036854775807
    %int1_239 = torch.constant.int 1
    %247 = torch.aten.slice.Tensor %246, %int2_236, %int0_237, %int9223372036854775807_238, %int1_239 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Six consecutive 3072-wide slices of the last dim; none of them is consumed
    // within this excerpt.
    // Chunk 0: [0, 3072).
    %int-1_240 = torch.constant.int -1
    %int0_241 = torch.constant.int 0
    %int3072_242 = torch.constant.int 3072
    %int1_243 = torch.constant.int 1
    %248 = torch.aten.slice.Tensor %247, %int-1_240, %int0_241, %int3072_242, %int1_243 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1: [3072, 6144).
    %int-1_244 = torch.constant.int -1
    %int3072_245 = torch.constant.int 3072
    %int6144_246 = torch.constant.int 6144
    %int1_247 = torch.constant.int 1
    %249 = torch.aten.slice.Tensor %247, %int-1_244, %int3072_245, %int6144_246, %int1_247 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2: [6144, 9216).
    %int-1_248 = torch.constant.int -1
    %int6144_249 = torch.constant.int 6144
    %int9216_250 = torch.constant.int 9216
    %int1_251 = torch.constant.int 1
    %250 = torch.aten.slice.Tensor %247, %int-1_248, %int6144_249, %int9216_250, %int1_251 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 3: [9216, 12288).
    %int-1_252 = torch.constant.int -1
    %int9216_253 = torch.constant.int 9216
    %int12288_254 = torch.constant.int 12288
    %int1_255 = torch.constant.int 1
    %251 = torch.aten.slice.Tensor %247, %int-1_252, %int9216_253, %int12288_254, %int1_255 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 4: [12288, 15360).
    %int-1_256 = torch.constant.int -1
    %int12288_257 = torch.constant.int 12288
    %int15360_258 = torch.constant.int 15360
    %int1_259 = torch.constant.int 1
    %252 = torch.aten.slice.Tensor %247, %int-1_256, %int12288_257, %int15360_258, %int1_259 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 5: [15360, 18432).
    %int-1_260 = torch.constant.int -1
    %int15360_261 = torch.constant.int 15360
    %int18432_262 = torch.constant.int 18432
    %int1_263 = torch.constant.int 1
    %253 = torch.aten.slice.Tensor %247, %int-1_260, %int15360_261, %int18432_262, %int1_263 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %16 (defined above this excerpt) over its last dim, then apply
    // (1 + %228) * normalized + %227 and flatten to 2-D. No learned scale/bias is
    // loaded for the normalization in these lines.
    // Statistics are computed on an f32 copy (dtype code 6).
    %int6_264 = torch.constant.int 6
    %254 = torch.prims.convert_element_type %16, %int6_264 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %int2_265 = torch.constant.int 2
    %255 = torch.prim.ListConstruct %int2_265 : (!torch.int) -> !torch.list<int>
    %int0_266 = torch.constant.int 0
    %true = torch.constant.bool true
    // var_mean over dim 2 with correction = 0 and keepdim = true:
    // %result0 is the variance, %result1 the mean, both [1,4096,1].
    %result0, %result1 = torch.aten.var_mean.correction %254, %255, %int0_266, %true : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // Epsilon (~1e-6) added to the variance before the reciprocal square root.
    %float9.999990e-07 = torch.constant.float 9.9999999999999995E-7
    %int1_267 = torch.constant.int 1
    %256 = torch.aten.add.Scalar %result0, %float9.999990e-07, %int1_267 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %257 = torch.aten.rsqrt %256 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    %int1_268 = torch.constant.int 1
    // Subtract the mean. The left operand is the original f16 %16 (not the f32
    // copy %254); the result type is f32.
    %258 = torch.aten.sub.Tensor %16, %result1, %int1_268 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %259 = torch.aten.mul.Tensor %258, %257 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Back to f16 (dtype code 5).
    %int5_269 = torch.constant.int 5
    %260 = torch.prims.convert_element_type %259, %int5_269 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %int1_270 = torch.constant.int 1
    %int1_271 = torch.constant.int 1
    // (1 + %228) * normalized, broadcasting [1,1,3072] over the 4096 rows.
    %261 = torch.aten.add.Scalar %228, %int1_270, %int1_271 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %262 = torch.aten.mul.Tensor %261, %260 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_272 = torch.constant.int 1
    // ... + %227 (also broadcast over the rows).
    %263 = torch.aten.add.Tensor %262, %227, %int1_272 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // [1,4096,3072] -> [4096,3072] for the 2-D matmul that follows.
    %int4096_273 = torch.constant.int 4096
    %int3072_274 = torch.constant.int 3072
    %264 = torch.prim.ListConstruct %int4096_273, %int3072_274 : (!torch.int, !torch.int) -> !torch.list<int>
    %265 = torch.aten.view %263, %264 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // `double_blocks.0.img_attn.qkv`: linear layer 3072 -> 9216 applied to %265
    // (matmul and bias add in f32, result cast to f16 and reshaped to 3-D).
    %__auto.sampler.double_blocks.0.img_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.0.img_attn.qkv.weight : tensor<9216x3072xf16>
    %266 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.img_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_275 = torch.constant.int 0
    %int1_276 = torch.constant.int 1
    // Swap dims 0 and 1: [9216,3072] -> [3072,9216], matching the mm right-hand side.
    %267 = torch.aten.transpose.int %266, %int0_275, %int1_276 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.0.img_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.0.img_attn.qkv.bias : tensor<9216xf16>
    %268 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.img_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, activation and weight to f32 (dtype code 6).
    %int6_277 = torch.constant.int 6
    %269 = torch.prims.convert_element_type %268, %int6_277 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_278 = torch.constant.int 6
    %270 = torch.prims.convert_element_type %265, %int6_278 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_279 = torch.constant.int 6
    %271 = torch.prims.convert_element_type %267, %int6_279 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %272 = torch.aten.mm %270, %271 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[4096,9216],f32>
    // The two multiplications by 1 and the alpha = 1 on the add leave values unscaled.
    %int1_280 = torch.constant.int 1
    %273 = torch.aten.mul.Scalar %272, %int1_280 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int1_281 = torch.constant.int 1
    %274 = torch.aten.mul.Scalar %269, %int1_281 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_282 = torch.constant.int 1
    %275 = torch.aten.add.Tensor %273, %274, %int1_282 : !torch.vtensor<[4096,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    // Back to f16 (dtype code 5).
    %int5_283 = torch.constant.int 5
    %276 = torch.prims.convert_element_type %275, %int5_283 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f16>
    // Restore the leading dim: [4096,9216] -> [1,4096,9216].
    %int1_284 = torch.constant.int 1
    %int4096_285 = torch.constant.int 4096
    %int9216_286 = torch.constant.int 9216
    %277 = torch.prim.ListConstruct %int1_284, %int4096_285, %int9216_286 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %278 = torch.aten.view %276, %277 : !torch.vtensor<[4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,9216],f16>
    // Debug trace point "img_qkv": emits %278 through flow.tensor.trace, then
    // passes the same tensor on as %280.
    // Leave the torch type system so the tensor can be handed to the flow op.
    %279 = torch_c.to_builtin_tensor %278 : !torch.vtensor<[1,4096,9216],f16> -> tensor<1x4096x9216xf16>
    // Erase the static shape; the trace operand is typed with dynamic dims and
    // takes each extent as an explicit index value.
    %cast = tensor.cast %279 : tensor<1x4096x9216xf16> to tensor<?x?x?xf16>
    %c0 = arith.constant 0 : index
    %dim = tensor.dim %cast, %c0 : tensor<?x?x?xf16>
    %c1 = arith.constant 1 : index
    %dim_287 = tensor.dim %cast, %c1 : tensor<?x?x?xf16>
    %c2 = arith.constant 2 : index
    %dim_288 = tensor.dim %cast, %c2 : tensor<?x?x?xf16>
    flow.tensor.trace "img_qkv" = [%cast : tensor<?x?x?xf16>{%dim, %dim_287, %dim_288}]
    // Restore the static shape and return to the torch type system.
    %cast_289 = tensor.cast %cast : tensor<?x?x?xf16> to tensor<1x4096x9216xf16>
    %280 = torch_c.from_builtin_tensor %cast_289 : tensor<1x4096x9216xf16> -> !torch.vtensor<[1,4096,9216],f16>
    // Image query: split the fused 9216-wide QKV (%280) into [3, 24, 128], move the
    // 3-way axis to the front, and take slot 0. The slice is then RMS-normalised over
    // its last dimension in f32 and multiplied by the learned query_norm scale.
    %int1_290 = torch.constant.int 1
    %int4096_291 = torch.constant.int 4096
    %int3 = torch.constant.int 3
    %int24 = torch.constant.int 24
    %int128_292 = torch.constant.int 128
    %281 = torch.prim.ListConstruct %int1_290, %int4096_291, %int3, %int24, %int128_292 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %282 = torch.aten.view %280, %281 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    // Permutation (2, 0, 3, 1, 4): [1,4096,3,24,128] -> [3,1,24,4096,128].
    %int2_293 = torch.constant.int 2
    %int0_294 = torch.constant.int 0
    %int3_295 = torch.constant.int 3
    %int1_296 = torch.constant.int 1
    %int4 = torch.constant.int 4
    %283 = torch.prim.ListConstruct %int2_293, %int0_294, %int3_295, %int1_296, %int4 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %284 = torch.aten.permute %282, %283 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // Index 0 along dim 0 selects the slice that is scaled by query_norm below.
    %int0_297 = torch.constant.int 0
    %int0_298 = torch.constant.int 0
    %285 = torch.aten.select.int %284, %int0_297, %int0_298 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Upcast to f32 (dtype code 6) so the reduction is not done in half precision.
    %int6_299 = torch.constant.int 6
    %286 = torch.prims.convert_element_type %285, %int6_299 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    // x * rsqrt(mean(x^2, dim=-1, keepdim=true) + 1e-6)
    %int2_300 = torch.constant.int 2
    %287 = torch.aten.pow.Tensor_Scalar %286, %int2_300 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_301 = torch.constant.int -1
    %288 = torch.prim.ListConstruct %int-1_301 : (!torch.int) -> !torch.list<int>
    %true_302 = torch.constant.bool true
    %none_303 = torch.constant.none
    %289 = torch.aten.mean.dim %287, %288, %true_302, %none_303 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_304 = torch.constant.float 9.9999999999999995E-7
    %int1_305 = torch.constant.int 1
    %290 = torch.aten.add.Scalar %289, %float9.999990e-07_304, %int1_305 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %291 = torch.aten.rsqrt %290 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %292 = torch.aten.mul.Tensor %286, %291 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Back to f16 (dtype code 5), then apply the per-channel [128] scale parameter.
    %int5_306 = torch.constant.int 5
    %293 = torch.prims.convert_element_type %292, %int5_306 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.0.img_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.0.img_attn.norm.query_norm.scale : tensor<128xf16>
    %294 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.img_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %295 = torch.aten.mul.Tensor %293, %294 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // Image key: same view/permute of the fused QKV (%280) as for the query, but
    // slot 1 along the leading 3-way axis is selected. The slice is RMS-normalised
    // over its last dimension in f32 and multiplied by the learned key_norm scale.
    %int1_307 = torch.constant.int 1
    %int4096_308 = torch.constant.int 4096
    %int3_309 = torch.constant.int 3
    %int24_310 = torch.constant.int 24
    %int128_311 = torch.constant.int 128
    %296 = torch.prim.ListConstruct %int1_307, %int4096_308, %int3_309, %int24_310, %int128_311 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %297 = torch.aten.view %280, %296 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    // Permutation (2, 0, 3, 1, 4): [1,4096,3,24,128] -> [3,1,24,4096,128].
    %int2_312 = torch.constant.int 2
    %int0_313 = torch.constant.int 0
    %int3_314 = torch.constant.int 3
    %int1_315 = torch.constant.int 1
    %int4_316 = torch.constant.int 4
    %298 = torch.prim.ListConstruct %int2_312, %int0_313, %int3_314, %int1_315, %int4_316 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %299 = torch.aten.permute %297, %298 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // Index 1 along dim 0 selects the slice that is scaled by key_norm below.
    %int0_317 = torch.constant.int 0
    %int1_318 = torch.constant.int 1
    %300 = torch.aten.select.int %299, %int0_317, %int1_318 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Upcast to f32 (dtype code 6) for the reduction.
    %int6_319 = torch.constant.int 6
    %301 = torch.prims.convert_element_type %300, %int6_319 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    // x * rsqrt(mean(x^2, dim=-1, keepdim=true) + 1e-6)
    %int2_320 = torch.constant.int 2
    %302 = torch.aten.pow.Tensor_Scalar %301, %int2_320 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_321 = torch.constant.int -1
    %303 = torch.prim.ListConstruct %int-1_321 : (!torch.int) -> !torch.list<int>
    %true_322 = torch.constant.bool true
    %none_323 = torch.constant.none
    %304 = torch.aten.mean.dim %302, %303, %true_322, %none_323 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_324 = torch.constant.float 9.9999999999999995E-7
    %int1_325 = torch.constant.int 1
    %305 = torch.aten.add.Scalar %304, %float9.999990e-07_324, %int1_325 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %306 = torch.aten.rsqrt %305 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %307 = torch.aten.mul.Tensor %301, %306 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Back to f16 (dtype code 5), then apply the per-channel [128] scale parameter.
    %int5_326 = torch.constant.int 5
    %308 = torch.prims.convert_element_type %307, %int5_326 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.0.img_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.0.img_attn.norm.key_norm.scale : tensor<128xf16>
    %309 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.img_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %310 = torch.aten.mul.Tensor %308, %309 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // Casts of the normalised image query (%295) and key (%310) to dtype code 5.
    // Operand and result types are both f16, so the element type does not change;
    // %311 and %312 are the names the later concatenations consume.
    %int5_327 = torch.constant.int 5
    %311 = torch.prims.convert_element_type %295, %int5_327 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_328 = torch.constant.int 5
    %312 = torch.prims.convert_element_type %310, %int5_328 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Normalise %134 over its last dimension (size 3072) without learned affine
    // parameters, then modulate: (1 + %249) * normalised + %248.
    // NOTE(review): %134, %248 and %249 are defined above this chunk; presumably the
    // text-stream hidden states and the modulation shift / scale - confirm.
    %int6_329 = torch.constant.int 6
    %313 = torch.prims.convert_element_type %134, %int6_329 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // var_mean over dim 2 with correction = 0 and keepdim = true;
    // %result0_333 is the variance, %result1_334 the mean.
    %int2_330 = torch.constant.int 2
    %314 = torch.prim.ListConstruct %int2_330 : (!torch.int) -> !torch.list<int>
    %int0_331 = torch.constant.int 0
    %true_332 = torch.constant.bool true
    %result0_333, %result1_334 = torch.aten.var_mean.correction %313, %314, %int0_331, %true_332 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(var + 1e-6)
    %float9.999990e-07_335 = torch.constant.float 9.9999999999999995E-7
    %int1_336 = torch.constant.int 1
    %315 = torch.aten.add.Scalar %result0_333, %float9.999990e-07_335, %int1_336 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %316 = torch.aten.rsqrt %315 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // The subtraction takes the f16 input %134 and the f32 mean; the result type is f32.
    %int1_337 = torch.constant.int 1
    %317 = torch.aten.sub.Tensor %134, %result1_334, %int1_337 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %318 = torch.aten.mul.Tensor %317, %316 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_338 = torch.constant.int 5
    %319 = torch.prims.convert_element_type %318, %int5_338 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Modulation in f16: the [1,1,3072] operands broadcast across the 512 rows.
    %int1_339 = torch.constant.int 1
    %int1_340 = torch.constant.int 1
    %320 = torch.aten.add.Scalar %249, %int1_339, %int1_340 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %321 = torch.aten.mul.Tensor %320, %319 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_341 = torch.constant.int 1
    %322 = torch.aten.add.Tensor %321, %248, %int1_341 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Text-stream fused QKV linear: flatten %322 to [512,3072], multiply by the
    // transposed txt_attn.qkv weight and add the bias, with the arithmetic done in
    // f32; the result is cast back to f16 and reshaped to [1,512,9216].
    %int512_342 = torch.constant.int 512
    %int3072_343 = torch.constant.int 3072
    %323 = torch.prim.ListConstruct %int512_342, %int3072_343 : (!torch.int, !torch.int) -> !torch.list<int>
    %324 = torch.aten.view %322, %323 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Weight is stored as [9216,3072]; transposing dims 0 and 1 gives [3072,9216].
    %__auto.sampler.double_blocks.0.txt_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.0.txt_attn.qkv.weight : tensor<9216x3072xf16>
    %325 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.txt_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_344 = torch.constant.int 0
    %int1_345 = torch.constant.int 1
    %326 = torch.aten.transpose.int %325, %int0_344, %int1_345 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.0.txt_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.0.txt_attn.qkv.bias : tensor<9216xf16>
    %327 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.txt_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, input and weight to f32 (dtype code 6) before the matmul.
    %int6_346 = torch.constant.int 6
    %328 = torch.prims.convert_element_type %327, %int6_346 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_347 = torch.constant.int 6
    %329 = torch.prims.convert_element_type %324, %int6_347 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_348 = torch.constant.int 6
    %330 = torch.prims.convert_element_type %326, %int6_348 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %331 = torch.aten.mm %329, %330 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[512,9216],f32>
    // Both scalings are by the integer constant 1.
    // NOTE(review): presumably the alpha/beta factors of a decomposed addmm - confirm.
    %int1_349 = torch.constant.int 1
    %332 = torch.aten.mul.Scalar %331, %int1_349 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int1_350 = torch.constant.int 1
    %333 = torch.aten.mul.Scalar %328, %int1_350 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_351 = torch.constant.int 1
    %334 = torch.aten.add.Tensor %332, %333, %int1_351 : !torch.vtensor<[512,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    // Downcast to f16 (dtype code 5) and restore the leading unit dimension.
    %int5_352 = torch.constant.int 5
    %335 = torch.prims.convert_element_type %334, %int5_352 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f16>
    %int1_353 = torch.constant.int 1
    %int512_354 = torch.constant.int 512
    %int9216_355 = torch.constant.int 9216
    %336 = torch.prim.ListConstruct %int1_353, %int512_354, %int9216_355 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %337 = torch.aten.view %335, %336 : !torch.vtensor<[512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,9216],f16>
    // Trace point "txt_qkv": convert to a builtin tensor, erase the static shape so
    // the three extents can be passed to flow.tensor.trace as dynamic dims, then
    // cast back to the static shape and re-enter the torch type system as %339.
    // The tensor data itself is not modified by any op in this group.
    %338 = torch_c.to_builtin_tensor %337 : !torch.vtensor<[1,512,9216],f16> -> tensor<1x512x9216xf16>
    %cast_356 = tensor.cast %338 : tensor<1x512x9216xf16> to tensor<?x?x?xf16>
    %c0_357 = arith.constant 0 : index
    %dim_358 = tensor.dim %cast_356, %c0_357 : tensor<?x?x?xf16>
    %c1_359 = arith.constant 1 : index
    %dim_360 = tensor.dim %cast_356, %c1_359 : tensor<?x?x?xf16>
    %c2_361 = arith.constant 2 : index
    %dim_362 = tensor.dim %cast_356, %c2_361 : tensor<?x?x?xf16>
    flow.tensor.trace "txt_qkv" = [%cast_356 : tensor<?x?x?xf16>{%dim_358, %dim_360, %dim_362}]
    %cast_363 = tensor.cast %cast_356 : tensor<?x?x?xf16> to tensor<1x512x9216xf16>
    %339 = torch_c.from_builtin_tensor %cast_363 : tensor<1x512x9216xf16> -> !torch.vtensor<[1,512,9216],f16>
    // Text query: split the fused 9216-wide QKV (%339) into [3, 24, 128], move the
    // 3-way axis to the front, and take slot 0. The slice is then RMS-normalised over
    // its last dimension in f32 and multiplied by the learned query_norm scale.
    %int1_364 = torch.constant.int 1
    %int512_365 = torch.constant.int 512
    %int3_366 = torch.constant.int 3
    %int24_367 = torch.constant.int 24
    %int128_368 = torch.constant.int 128
    %340 = torch.prim.ListConstruct %int1_364, %int512_365, %int3_366, %int24_367, %int128_368 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %341 = torch.aten.view %339, %340 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    // Permutation (2, 0, 3, 1, 4): [1,512,3,24,128] -> [3,1,24,512,128].
    %int2_369 = torch.constant.int 2
    %int0_370 = torch.constant.int 0
    %int3_371 = torch.constant.int 3
    %int1_372 = torch.constant.int 1
    %int4_373 = torch.constant.int 4
    %342 = torch.prim.ListConstruct %int2_369, %int0_370, %int3_371, %int1_372, %int4_373 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %343 = torch.aten.permute %341, %342 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // Index 0 along dim 0 selects the slice that is scaled by query_norm below.
    %int0_374 = torch.constant.int 0
    %int0_375 = torch.constant.int 0
    %344 = torch.aten.select.int %343, %int0_374, %int0_375 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Upcast to f32 (dtype code 6) for the reduction.
    %int6_376 = torch.constant.int 6
    %345 = torch.prims.convert_element_type %344, %int6_376 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    // x * rsqrt(mean(x^2, dim=-1, keepdim=true) + 1e-6)
    %int2_377 = torch.constant.int 2
    %346 = torch.aten.pow.Tensor_Scalar %345, %int2_377 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_378 = torch.constant.int -1
    %347 = torch.prim.ListConstruct %int-1_378 : (!torch.int) -> !torch.list<int>
    %true_379 = torch.constant.bool true
    %none_380 = torch.constant.none
    %348 = torch.aten.mean.dim %346, %347, %true_379, %none_380 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_381 = torch.constant.float 9.9999999999999995E-7
    %int1_382 = torch.constant.int 1
    %349 = torch.aten.add.Scalar %348, %float9.999990e-07_381, %int1_382 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %350 = torch.aten.rsqrt %349 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %351 = torch.aten.mul.Tensor %345, %350 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    // Back to f16 (dtype code 5), then apply the per-channel [128] scale parameter.
    %int5_383 = torch.constant.int 5
    %352 = torch.prims.convert_element_type %351, %int5_383 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.0.txt_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.0.txt_attn.norm.query_norm.scale : tensor<128xf16>
    %353 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.txt_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %354 = torch.aten.mul.Tensor %352, %353 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // Text key: same view/permute of the fused QKV (%339) as for the query, but
    // slot 1 along the leading 3-way axis is selected. The slice is RMS-normalised
    // over its last dimension in f32 and multiplied by the learned key_norm scale.
    %int1_384 = torch.constant.int 1
    %int512_385 = torch.constant.int 512
    %int3_386 = torch.constant.int 3
    %int24_387 = torch.constant.int 24
    %int128_388 = torch.constant.int 128
    %355 = torch.prim.ListConstruct %int1_384, %int512_385, %int3_386, %int24_387, %int128_388 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %356 = torch.aten.view %339, %355 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    // Permutation (2, 0, 3, 1, 4): [1,512,3,24,128] -> [3,1,24,512,128].
    %int2_389 = torch.constant.int 2
    %int0_390 = torch.constant.int 0
    %int3_391 = torch.constant.int 3
    %int1_392 = torch.constant.int 1
    %int4_393 = torch.constant.int 4
    %357 = torch.prim.ListConstruct %int2_389, %int0_390, %int3_391, %int1_392, %int4_393 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %358 = torch.aten.permute %356, %357 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // Index 1 along dim 0 selects the slice that is scaled by key_norm below.
    %int0_394 = torch.constant.int 0
    %int1_395 = torch.constant.int 1
    %359 = torch.aten.select.int %358, %int0_394, %int1_395 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Upcast to f32 (dtype code 6) for the reduction.
    %int6_396 = torch.constant.int 6
    %360 = torch.prims.convert_element_type %359, %int6_396 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    // x * rsqrt(mean(x^2, dim=-1, keepdim=true) + 1e-6)
    %int2_397 = torch.constant.int 2
    %361 = torch.aten.pow.Tensor_Scalar %360, %int2_397 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_398 = torch.constant.int -1
    %362 = torch.prim.ListConstruct %int-1_398 : (!torch.int) -> !torch.list<int>
    %true_399 = torch.constant.bool true
    %none_400 = torch.constant.none
    %363 = torch.aten.mean.dim %361, %362, %true_399, %none_400 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_401 = torch.constant.float 9.9999999999999995E-7
    %int1_402 = torch.constant.int 1
    %364 = torch.aten.add.Scalar %363, %float9.999990e-07_401, %int1_402 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %365 = torch.aten.rsqrt %364 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %366 = torch.aten.mul.Tensor %360, %365 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    // Back to f16 (dtype code 5), then apply the per-channel [128] scale parameter.
    %int5_403 = torch.constant.int 5
    %367 = torch.prims.convert_element_type %366, %int5_403 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.0.txt_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.0.txt_attn.norm.key_norm.scale : tensor<128xf16>
    %368 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.txt_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %369 = torch.aten.mul.Tensor %367, %368 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // Build the joint query and key: concatenate the text tensor (512 rows) followed
    // by the image tensor (4096 rows) along dim 2, giving [1,24,4608,128].
    // The two leading casts have f16 operand and result types, so the element type
    // is unchanged.
    %int5_404 = torch.constant.int 5
    %370 = torch.prims.convert_element_type %354, %int5_404 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int5_405 = torch.constant.int 5
    %371 = torch.prims.convert_element_type %369, %int5_405 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Joint query: [text query %370, image query %311].
    %372 = torch.prim.ListConstruct %370, %311 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_406 = torch.constant.int 2
    %373 = torch.aten.cat %372, %int2_406 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Joint key: [text key %371, image key %312].
    %374 = torch.prim.ListConstruct %371, %312 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_407 = torch.constant.int 2
    %375 = torch.aten.cat %374, %int2_407 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Build the joint value tensor: take slot 2 of the fused QKV for the text stream
    // (%339) and for the image stream (%280), using the same view/permute as for the
    // query and key, and concatenate text then image along dim 2. No normalisation
    // is applied to these slices.
    %int1_408 = torch.constant.int 1
    %int512_409 = torch.constant.int 512
    %int3_410 = torch.constant.int 3
    %int24_411 = torch.constant.int 24
    %int128_412 = torch.constant.int 128
    %376 = torch.prim.ListConstruct %int1_408, %int512_409, %int3_410, %int24_411, %int128_412 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %377 = torch.aten.view %339, %376 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_413 = torch.constant.int 2
    %int0_414 = torch.constant.int 0
    %int3_415 = torch.constant.int 3
    %int1_416 = torch.constant.int 1
    %int4_417 = torch.constant.int 4
    %378 = torch.prim.ListConstruct %int2_413, %int0_414, %int3_415, %int1_416, %int4_417 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %379 = torch.aten.permute %377, %378 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // Text slice: index 2 along dim 0.
    %int0_418 = torch.constant.int 0
    %int2_419 = torch.constant.int 2
    %380 = torch.aten.select.int %379, %int0_418, %int2_419 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int1_420 = torch.constant.int 1
    %int4096_421 = torch.constant.int 4096
    %int3_422 = torch.constant.int 3
    %int24_423 = torch.constant.int 24
    %int128_424 = torch.constant.int 128
    %381 = torch.prim.ListConstruct %int1_420, %int4096_421, %int3_422, %int24_423, %int128_424 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %382 = torch.aten.view %280, %381 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_425 = torch.constant.int 2
    %int0_426 = torch.constant.int 0
    %int3_427 = torch.constant.int 3
    %int1_428 = torch.constant.int 1
    %int4_429 = torch.constant.int 4
    %383 = torch.prim.ListConstruct %int2_425, %int0_426, %int3_427, %int1_428, %int4_429 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %384 = torch.aten.permute %382, %383 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // Image slice: index 2 along dim 0.
    %int0_430 = torch.constant.int 0
    %int2_431 = torch.constant.int 2
    %385 = torch.aten.select.int %384, %int0_430, %int2_431 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Same ordering as the joint query/key: text rows first, then image rows.
    %386 = torch.prim.ListConstruct %380, %385 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_432 = torch.constant.int 2
    %387 = torch.aten.cat %386, %int2_432 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Trace points "q", "k" and "v" for the joint [1,24,4608,128] tensors %373, %375
    // and %387. Each one follows the same pattern: convert to a builtin tensor, erase
    // the static shape, read the four extents, emit flow.tensor.trace, cast back to
    // the static shape and return to the torch type system (%389, %391, %393).
    %388 = torch_c.to_builtin_tensor %373 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_433 = tensor.cast %388 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_434 = arith.constant 0 : index
    %dim_435 = tensor.dim %cast_433, %c0_434 : tensor<?x?x?x?xf16>
    %c1_436 = arith.constant 1 : index
    %dim_437 = tensor.dim %cast_433, %c1_436 : tensor<?x?x?x?xf16>
    %c2_438 = arith.constant 2 : index
    %dim_439 = tensor.dim %cast_433, %c2_438 : tensor<?x?x?x?xf16>
    %c3 = arith.constant 3 : index
    %dim_440 = tensor.dim %cast_433, %c3 : tensor<?x?x?x?xf16>
    flow.tensor.trace "q" = [%cast_433 : tensor<?x?x?x?xf16>{%dim_435, %dim_437, %dim_439, %dim_440}]
    %cast_441 = tensor.cast %cast_433 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %389 = torch_c.from_builtin_tensor %cast_441 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Joint key.
    %390 = torch_c.to_builtin_tensor %375 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_442 = tensor.cast %390 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_443 = arith.constant 0 : index
    %dim_444 = tensor.dim %cast_442, %c0_443 : tensor<?x?x?x?xf16>
    %c1_445 = arith.constant 1 : index
    %dim_446 = tensor.dim %cast_442, %c1_445 : tensor<?x?x?x?xf16>
    %c2_447 = arith.constant 2 : index
    %dim_448 = tensor.dim %cast_442, %c2_447 : tensor<?x?x?x?xf16>
    %c3_449 = arith.constant 3 : index
    %dim_450 = tensor.dim %cast_442, %c3_449 : tensor<?x?x?x?xf16>
    flow.tensor.trace "k" = [%cast_442 : tensor<?x?x?x?xf16>{%dim_444, %dim_446, %dim_448, %dim_450}]
    %cast_451 = tensor.cast %cast_442 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %391 = torch_c.from_builtin_tensor %cast_451 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Joint value.
    %392 = torch_c.to_builtin_tensor %387 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_452 = tensor.cast %392 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_453 = arith.constant 0 : index
    %dim_454 = tensor.dim %cast_452, %c0_453 : tensor<?x?x?x?xf16>
    %c1_455 = arith.constant 1 : index
    %dim_456 = tensor.dim %cast_452, %c1_455 : tensor<?x?x?x?xf16>
    %c2_457 = arith.constant 2 : index
    %dim_458 = tensor.dim %cast_452, %c2_457 : tensor<?x?x?x?xf16>
    %c3_459 = arith.constant 3 : index
    %dim_460 = tensor.dim %cast_452, %c3_459 : tensor<?x?x?x?xf16>
    flow.tensor.trace "v" = [%cast_452 : tensor<?x?x?x?xf16>{%dim_454, %dim_456, %dim_458, %dim_460}]
    %cast_461 = tensor.cast %cast_452 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %393 = torch_c.from_builtin_tensor %cast_461 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Combine the joint query (%389) and key (%391) with the [1,1,4608,64,2,2] f32
    // table %211. Each 128-wide head vector is viewed in f32 as 64 pairs
    // ([...,64,1,2]); for every pair the result is
    //     %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]
    // where the index is taken on the last dimension, the [...,64,1] slice of x
    // broadcasts against the [...,64,2] slice of %211, and the table's unit dim 1
    // broadcasts across the 24 heads. Results are flattened back to 128 and cast
    // to f16.
    // NOTE(review): %211 is defined above this chunk; presumably the rotary position
    // embedding table - confirm.
    %int6_462 = torch.constant.int 6
    %394 = torch.prims.convert_element_type %389, %int6_462 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // View q as [1,24,4608,-1,1,2]; the -1 resolves to 64.
    %int1_463 = torch.constant.int 1
    %int24_464 = torch.constant.int 24
    %int4608_465 = torch.constant.int 4608
    %int-1_466 = torch.constant.int -1
    %int1_467 = torch.constant.int 1
    %int2_468 = torch.constant.int 2
    %395 = torch.prim.ListConstruct %int1_463, %int24_464, %int4608_465, %int-1_466, %int1_467, %int2_468 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %396 = torch.aten.view %394, %395 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Same upcast and view for k.
    %int6_469 = torch.constant.int 6
    %397 = torch.prims.convert_element_type %391, %int6_469 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_470 = torch.constant.int 1
    %int24_471 = torch.constant.int 24
    %int4608_472 = torch.constant.int 4608
    %int-1_473 = torch.constant.int -1
    %int1_474 = torch.constant.int 1
    %int2_475 = torch.constant.int 2
    %398 = torch.prim.ListConstruct %int1_470, %int24_471, %int4608_472, %int-1_473, %int1_474, %int2_475 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %399 = torch.aten.view %397, %398 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // q, first term: %211[..., 0] * q[..., 0]
    %int5_476 = torch.constant.int 5
    %int0_477 = torch.constant.int 0
    %400 = torch.aten.select.int %211, %int5_476, %int0_477 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_478 = torch.constant.int 5
    %int0_479 = torch.constant.int 0
    %401 = torch.aten.select.int %396, %int5_478, %int0_479 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %402 = torch.aten.mul.Tensor %400, %401 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // q, second term: %211[..., 1] * q[..., 1]
    %int5_480 = torch.constant.int 5
    %int1_481 = torch.constant.int 1
    %403 = torch.aten.select.int %211, %int5_480, %int1_481 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_482 = torch.constant.int 5
    %int1_483 = torch.constant.int 1
    %404 = torch.aten.select.int %396, %int5_482, %int1_483 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %405 = torch.aten.mul.Tensor %403, %404 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_484 = torch.constant.int 1
    %406 = torch.aten.add.Tensor %402, %405, %int1_484 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k, first term: %211[..., 0] * k[..., 0]
    %int5_485 = torch.constant.int 5
    %int0_486 = torch.constant.int 0
    %407 = torch.aten.select.int %211, %int5_485, %int0_486 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_487 = torch.constant.int 5
    %int0_488 = torch.constant.int 0
    %408 = torch.aten.select.int %399, %int5_487, %int0_488 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %409 = torch.aten.mul.Tensor %407, %408 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k, second term: %211[..., 1] * k[..., 1]
    %int5_489 = torch.constant.int 5
    %int1_490 = torch.constant.int 1
    %410 = torch.aten.select.int %211, %int5_489, %int1_490 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_491 = torch.constant.int 5
    %int1_492 = torch.constant.int 1
    %411 = torch.aten.select.int %399, %int5_491, %int1_492 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %412 = torch.aten.mul.Tensor %410, %411 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_493 = torch.constant.int 1
    %413 = torch.aten.add.Tensor %409, %412, %int1_493 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Flatten the (64, 2) pairs of q back to 128 and downcast to f16 (dtype code 5).
    %int1_494 = torch.constant.int 1
    %int24_495 = torch.constant.int 24
    %int4608_496 = torch.constant.int 4608
    %int128_497 = torch.constant.int 128
    %414 = torch.prim.ListConstruct %int1_494, %int24_495, %int4608_496, %int128_497 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %415 = torch.aten.view %406, %414 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_498 = torch.constant.int 5
    %416 = torch.prims.convert_element_type %415, %int5_498 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Same flatten and downcast for k.
    %int1_499 = torch.constant.int 1
    %int24_500 = torch.constant.int 24
    %int4608_501 = torch.constant.int 4608
    %int128_502 = torch.constant.int 128
    %417 = torch.prim.ListConstruct %int1_499, %int24_500, %int4608_501, %int128_502 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %418 = torch.aten.view %413, %417 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_503 = torch.constant.int 5
    %419 = torch.prims.convert_element_type %418, %int5_503 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Joint attention over all 4608 rows, then merge the heads.
    // Operands in order: query %416, key %419, value %393, then the float 0.0, the
    // bool false and two `none` values.
    // NOTE(review): per the aten schema these trailing operands should be dropout_p,
    // is_causal, attn_mask and scale - confirm against the PyTorch op definition.
    // Only result #0 (the [1,24,4608,128] f16 tensor) is consumed in the visible code.
    %float0.000000e00 = torch.constant.float 0.000000e+00
    %false_504 = torch.constant.bool false
    %none_505 = torch.constant.none
    %none_506 = torch.constant.none
    %420:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%416, %419, %393, %float0.000000e00, %false_504, %none_505, %none_506) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Permutation (0, 2, 1, 3): [1,24,4608,128] -> [1,4608,24,128].
    %int0_507 = torch.constant.int 0
    %int2_508 = torch.constant.int 2
    %int1_509 = torch.constant.int 1
    %int3_510 = torch.constant.int 3
    %421 = torch.prim.ListConstruct %int0_507, %int2_508, %int1_509, %int3_510 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %422 = torch.aten.permute %420#0, %421 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Collapse the (24, 128) axes into a single 3072-wide axis.
    %int1_511 = torch.constant.int 1
    %int4608_512 = torch.constant.int 4608
    %int3072_513 = torch.constant.int 3072
    %423 = torch.prim.ListConstruct %int1_511, %int4608_512, %int3072_513 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %424 = torch.aten.view %422, %423 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Split the attention output %424 back into its two row ranges along dim 1:
    // rows [0, 512) -> %426 and rows [512, end) -> %428 (4096 rows). Because the
    // joint q/k/v were concatenated text-first, the first range corresponds to the
    // text rows and the second to the image rows.
    // The dim-0 slices use start 0, end INT64_MAX and step 1; the result shape
    // equals the operand shape.
    %int0_514 = torch.constant.int 0
    %int0_515 = torch.constant.int 0
    %int9223372036854775807_516 = torch.constant.int 9223372036854775807
    %int1_517 = torch.constant.int 1
    %425 = torch.aten.slice.Tensor %424, %int0_514, %int0_515, %int9223372036854775807_516, %int1_517 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // First 512 rows.
    %int1_518 = torch.constant.int 1
    %int0_519 = torch.constant.int 0
    %int512_520 = torch.constant.int 512
    %int1_521 = torch.constant.int 1
    %426 = torch.aten.slice.Tensor %425, %int1_518, %int0_519, %int512_520, %int1_521 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %int0_522 = torch.constant.int 0
    %int0_523 = torch.constant.int 0
    %int9223372036854775807_524 = torch.constant.int 9223372036854775807
    %int1_525 = torch.constant.int 1
    %427 = torch.aten.slice.Tensor %424, %int0_522, %int0_523, %int9223372036854775807_524, %int1_525 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Remaining rows from 512 to the end.
    %int1_526 = torch.constant.int 1
    %int512_527 = torch.constant.int 512
    %int9223372036854775807_528 = torch.constant.int 9223372036854775807
    %int1_529 = torch.constant.int 1
    %428 = torch.aten.slice.Tensor %427, %int1_526, %int512_527, %int9223372036854775807_528, %int1_529 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer double_blocks.0.img_attn.proj applied to %428: x @ W^T + b.
    // Drop the leading unit dim so the input is a 2-D [4096,3072] matrix for mm.
    %int4096_530 = torch.constant.int 4096
    %int3072_531 = torch.constant.int 3072
    %429 = torch.prim.ListConstruct %int4096_530, %int3072_531 : (!torch.int, !torch.int) -> !torch.list<int>
    %430 = torch.aten.view %428, %429 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Load the weight and transpose dims 0 and 1.
    %__auto.sampler.double_blocks.0.img_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.0.img_attn.proj.weight : tensor<3072x3072xf16>
    %431 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.img_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_532 = torch.constant.int 0
    %int1_533 = torch.constant.int 1
    %432 = torch.aten.transpose.int %431, %int0_532, %int1_533 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.0.img_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.0.img_attn.proj.bias : tensor<3072xf16>
    %433 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.img_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Promote bias, input and weight to f32 (dtype code 6); the matmul and bias add run in f32.
    %int6_534 = torch.constant.int 6
    %434 = torch.prims.convert_element_type %433, %int6_534 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_535 = torch.constant.int 6
    %435 = torch.prims.convert_element_type %430, %int6_535 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_536 = torch.constant.int 6
    %436 = torch.prims.convert_element_type %432, %int6_536 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %437 = torch.aten.mm %435, %436 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Both operands are multiplied by the integer constant 1, which leaves the values
    // unchanged (presumably alpha/beta left over from an addmm decomposition).
    %int1_537 = torch.constant.int 1
    %438 = torch.aten.mul.Scalar %437, %int1_537 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_538 = torch.constant.int 1
    %439 = torch.aten.mul.Scalar %434, %int1_538 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_539 = torch.constant.int 1
    %440 = torch.aten.add.Tensor %438, %439, %int1_539 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    // Cast back to f16 (dtype code 5) and restore the leading unit dim.
    %int5_540 = torch.constant.int 5
    %441 = torch.prims.convert_element_type %440, %int5_540 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_541 = torch.constant.int 1
    %int4096_542 = torch.constant.int 4096
    %int3072_543 = torch.constant.int 3072
    %442 = torch.prim.ListConstruct %int1_541, %int4096_542, %int3072_543 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %443 = torch.aten.view %441, %442 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Residual update: %445 = %16 + %229 * proj_out, with %229 broadcast from [1,1,3072].
    // NOTE(review): %16 and %229 are defined before this excerpt; %229 looks like a
    // per-channel gate from the block's img modulation -- confirm.
    %444 = torch.aten.mul.Tensor %229, %443 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_544 = torch.constant.int 1
    %445 = torch.aten.add.Tensor %16, %444, %int1_544 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // %446 = %231 + 1. NOTE(review): %231 is presumably the modulation scale -- confirm.
    %int1_545 = torch.constant.int 1
    %int1_546 = torch.constant.int 1
    %446 = torch.aten.add.Scalar %231, %int1_545, %int1_546 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalise %445 over dim 2 without learned affine parameters, computed in f32:
    // variance (correction 0) and mean with keepdim=true, then (x - mean) * rsqrt(var + 1e-6).
    %int6_547 = torch.constant.int 6
    %447 = torch.prims.convert_element_type %445, %int6_547 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %int2_548 = torch.constant.int 2
    %448 = torch.prim.ListConstruct %int2_548 : (!torch.int) -> !torch.list<int>
    %int0_549 = torch.constant.int 0
    %true_550 = torch.constant.bool true
    %result0_551, %result1_552 = torch.aten.var_mean.correction %447, %448, %int0_549, %true_550 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    %float9.999990e-07_553 = torch.constant.float 9.9999999999999995E-7
    %int1_554 = torch.constant.int 1
    %449 = torch.aten.add.Scalar %result0_551, %float9.999990e-07_553, %int1_554 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %450 = torch.aten.rsqrt %449 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // The subtraction takes the f16 tensor %445 and the f32 mean; the result type is f32.
    %int1_555 = torch.constant.int 1
    %451 = torch.aten.sub.Tensor %445, %result1_552, %int1_555 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %452 = torch.aten.mul.Tensor %451, %450 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    %int5_556 = torch.constant.int 5
    %453 = torch.prims.convert_element_type %452, %int5_556 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Modulate the normalised tensor: %455 = (%231 + 1) * norm + %230.
    // NOTE(review): %230 is presumably the modulation shift -- confirm.
    %454 = torch.aten.mul.Tensor %446, %453 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_557 = torch.constant.int 1
    %455 = torch.aten.add.Tensor %454, %230, %int1_557 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer double_blocks.0.img_mlp.0 applied to %455: x @ W^T + b, 3072 -> 12288.
    // Flatten the input to a 2-D [4096,3072] matrix for mm.
    %int4096_558 = torch.constant.int 4096
    %int3072_559 = torch.constant.int 3072
    %456 = torch.prim.ListConstruct %int4096_558, %int3072_559 : (!torch.int, !torch.int) -> !torch.list<int>
    %457 = torch.aten.view %455, %456 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Load the [12288,3072] weight and transpose it to [3072,12288].
    %__auto.sampler.double_blocks.0.img_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.0.img_mlp.0.weight : tensor<12288x3072xf16>
    %458 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.img_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_560 = torch.constant.int 0
    %int1_561 = torch.constant.int 1
    %459 = torch.aten.transpose.int %458, %int0_560, %int1_561 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.0.img_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.0.img_mlp.0.bias : tensor<12288xf16>
    %460 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.img_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Promote bias, input and weight to f32 (dtype code 6) for the matmul and bias add.
    %int6_562 = torch.constant.int 6
    %461 = torch.prims.convert_element_type %460, %int6_562 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_563 = torch.constant.int 6
    %462 = torch.prims.convert_element_type %457, %int6_563 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_564 = torch.constant.int 6
    %463 = torch.prims.convert_element_type %459, %int6_564 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %464 = torch.aten.mm %462, %463 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[4096,12288],f32>
    // Multiplications by the integer constant 1 leave the values unchanged.
    %int1_565 = torch.constant.int 1
    %465 = torch.aten.mul.Scalar %464, %int1_565 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int1_566 = torch.constant.int 1
    %466 = torch.aten.mul.Scalar %461, %int1_566 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_567 = torch.constant.int 1
    %467 = torch.aten.add.Tensor %465, %466, %int1_567 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    // Cast back to f16 (dtype code 5) and restore the leading unit dim.
    %int5_568 = torch.constant.int 5
    %468 = torch.prims.convert_element_type %467, %int5_568 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f16>
    %int1_569 = torch.constant.int 1
    %int4096_570 = torch.constant.int 4096
    %int12288_571 = torch.constant.int 12288
    %469 = torch.prim.ListConstruct %int1_569, %int4096_570, %int12288_571 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %470 = torch.aten.view %468, %469 : !torch.vtensor<[4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,12288],f16>
    // GELU with the "tanh" approximation mode applied to the img_mlp.0 output in f16,
    // then flattened to [4096,12288] as input for the next mm.
    %str = torch.constant.str "tanh"
    %471 = torch.aten.gelu %470, %str : !torch.vtensor<[1,4096,12288],f16>, !torch.str -> !torch.vtensor<[1,4096,12288],f16>
    %int4096_572 = torch.constant.int 4096
    %int12288_573 = torch.constant.int 12288
    %472 = torch.prim.ListConstruct %int4096_572, %int12288_573 : (!torch.int, !torch.int) -> !torch.list<int>
    %473 = torch.aten.view %471, %472 : !torch.vtensor<[1,4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[4096,12288],f16>
    // Linear layer double_blocks.0.img_mlp.2 applied to %473: x @ W^T + b, 12288 -> 3072.
    // Load the [3072,12288] weight and transpose it to [12288,3072].
    %__auto.sampler.double_blocks.0.img_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.0.img_mlp.2.weight : tensor<3072x12288xf16>
    %474 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.img_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_574 = torch.constant.int 0
    %int1_575 = torch.constant.int 1
    %475 = torch.aten.transpose.int %474, %int0_574, %int1_575 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.0.img_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.0.img_mlp.2.bias : tensor<3072xf16>
    %476 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.img_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Promote bias, input and weight to f32 (dtype code 6) for the matmul and bias add.
    %int6_576 = torch.constant.int 6
    %477 = torch.prims.convert_element_type %476, %int6_576 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_577 = torch.constant.int 6
    %478 = torch.prims.convert_element_type %473, %int6_577 : !torch.vtensor<[4096,12288],f16>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int6_578 = torch.constant.int 6
    %479 = torch.prims.convert_element_type %475, %int6_578 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %480 = torch.aten.mm %478, %479 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Multiplications by the integer constant 1 leave the values unchanged.
    %int1_579 = torch.constant.int 1
    %481 = torch.aten.mul.Scalar %480, %int1_579 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_580 = torch.constant.int 1
    %482 = torch.aten.mul.Scalar %477, %int1_580 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_581 = torch.constant.int 1
    %483 = torch.aten.add.Tensor %481, %482, %int1_581 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    // Cast back to f16 (dtype code 5) and restore the leading unit dim.
    %int5_582 = torch.constant.int 5
    %484 = torch.prims.convert_element_type %483, %int5_582 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_583 = torch.constant.int 1
    %int4096_584 = torch.constant.int 4096
    %int3072_585 = torch.constant.int 3072
    %485 = torch.prim.ListConstruct %int1_583, %int4096_584, %int3072_585 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %486 = torch.aten.view %484, %485 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Residual update: %488 = %445 + %232 * mlp_out, with %232 broadcast from [1,1,3072].
    // NOTE(review): %232 is defined before this excerpt; presumably the MLP gate -- confirm.
    %487 = torch.aten.mul.Tensor %232, %486 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_586 = torch.constant.int 1
    %488 = torch.aten.add.Tensor %445, %487, %int1_586 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer double_blocks.0.txt_attn.proj applied to %426 (the 512-row slice of the
    // attention output): x @ W^T + b.
    // Drop the leading unit dim so the input is a 2-D [512,3072] matrix for mm.
    %int512_587 = torch.constant.int 512
    %int3072_588 = torch.constant.int 3072
    %489 = torch.prim.ListConstruct %int512_587, %int3072_588 : (!torch.int, !torch.int) -> !torch.list<int>
    %490 = torch.aten.view %426, %489 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Load the weight and transpose dims 0 and 1.
    %__auto.sampler.double_blocks.0.txt_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.0.txt_attn.proj.weight : tensor<3072x3072xf16>
    %491 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.txt_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_589 = torch.constant.int 0
    %int1_590 = torch.constant.int 1
    %492 = torch.aten.transpose.int %491, %int0_589, %int1_590 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.0.txt_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.0.txt_attn.proj.bias : tensor<3072xf16>
    %493 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.txt_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Promote bias, input and weight to f32 (dtype code 6) for the matmul and bias add.
    %int6_591 = torch.constant.int 6
    %494 = torch.prims.convert_element_type %493, %int6_591 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_592 = torch.constant.int 6
    %495 = torch.prims.convert_element_type %490, %int6_592 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_593 = torch.constant.int 6
    %496 = torch.prims.convert_element_type %492, %int6_593 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %497 = torch.aten.mm %495, %496 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Multiplications by the integer constant 1 leave the values unchanged.
    %int1_594 = torch.constant.int 1
    %498 = torch.aten.mul.Scalar %497, %int1_594 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_595 = torch.constant.int 1
    %499 = torch.aten.mul.Scalar %494, %int1_595 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_596 = torch.constant.int 1
    %500 = torch.aten.add.Tensor %498, %499, %int1_596 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    // Cast back to f16 (dtype code 5) and restore the leading unit dim.
    %int5_597 = torch.constant.int 5
    %501 = torch.prims.convert_element_type %500, %int5_597 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_598 = torch.constant.int 1
    %int512_599 = torch.constant.int 512
    %int3072_600 = torch.constant.int 3072
    %502 = torch.prim.ListConstruct %int1_598, %int512_599, %int3072_600 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %503 = torch.aten.view %501, %502 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Residual update: %505 = %134 + %250 * proj_out, with %250 broadcast from [1,1,3072].
    // NOTE(review): %134 and %250 are defined before this excerpt; %250 looks like a
    // per-channel gate from the block's txt modulation -- confirm.
    %504 = torch.aten.mul.Tensor %250, %503 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_601 = torch.constant.int 1
    %505 = torch.aten.add.Tensor %134, %504, %int1_601 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // %506 = %252 + 1. NOTE(review): %252 is presumably the modulation scale -- confirm.
    %int1_602 = torch.constant.int 1
    %int1_603 = torch.constant.int 1
    %506 = torch.aten.add.Scalar %252, %int1_602, %int1_603 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalise %505 over dim 2 without learned affine parameters, computed in f32:
    // variance (correction 0) and mean with keepdim=true, then (x - mean) * rsqrt(var + 1e-6).
    %int6_604 = torch.constant.int 6
    %507 = torch.prims.convert_element_type %505, %int6_604 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %int2_605 = torch.constant.int 2
    %508 = torch.prim.ListConstruct %int2_605 : (!torch.int) -> !torch.list<int>
    %int0_606 = torch.constant.int 0
    %true_607 = torch.constant.bool true
    %result0_608, %result1_609 = torch.aten.var_mean.correction %507, %508, %int0_606, %true_607 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    %float9.999990e-07_610 = torch.constant.float 9.9999999999999995E-7
    %int1_611 = torch.constant.int 1
    %509 = torch.aten.add.Scalar %result0_608, %float9.999990e-07_610, %int1_611 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %510 = torch.aten.rsqrt %509 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // The subtraction takes the f16 tensor %505 and the f32 mean; the result type is f32.
    %int1_612 = torch.constant.int 1
    %511 = torch.aten.sub.Tensor %505, %result1_609, %int1_612 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %512 = torch.aten.mul.Tensor %511, %510 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_613 = torch.constant.int 5
    %513 = torch.prims.convert_element_type %512, %int5_613 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Modulate the normalised tensor: %515 = (%252 + 1) * norm + %251.
    // NOTE(review): %251 is presumably the modulation shift -- confirm.
    %514 = torch.aten.mul.Tensor %506, %513 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_614 = torch.constant.int 1
    %515 = torch.aten.add.Tensor %514, %251, %int1_614 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Linear layer double_blocks.0.txt_mlp.0 applied to %515: x @ W^T + b, 3072 -> 12288.
    // Flatten the input to a 2-D [512,3072] matrix for mm.
    %int512_615 = torch.constant.int 512
    %int3072_616 = torch.constant.int 3072
    %516 = torch.prim.ListConstruct %int512_615, %int3072_616 : (!torch.int, !torch.int) -> !torch.list<int>
    %517 = torch.aten.view %515, %516 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Load the [12288,3072] weight and transpose it to [3072,12288].
    %__auto.sampler.double_blocks.0.txt_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.0.txt_mlp.0.weight : tensor<12288x3072xf16>
    %518 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.txt_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_617 = torch.constant.int 0
    %int1_618 = torch.constant.int 1
    %519 = torch.aten.transpose.int %518, %int0_617, %int1_618 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.0.txt_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.0.txt_mlp.0.bias : tensor<12288xf16>
    %520 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.txt_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Promote bias, input and weight to f32 (dtype code 6) for the matmul and bias add.
    %int6_619 = torch.constant.int 6
    %521 = torch.prims.convert_element_type %520, %int6_619 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_620 = torch.constant.int 6
    %522 = torch.prims.convert_element_type %517, %int6_620 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_621 = torch.constant.int 6
    %523 = torch.prims.convert_element_type %519, %int6_621 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %524 = torch.aten.mm %522, %523 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[512,12288],f32>
    // Multiplications by the integer constant 1 leave the values unchanged.
    %int1_622 = torch.constant.int 1
    %525 = torch.aten.mul.Scalar %524, %int1_622 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int1_623 = torch.constant.int 1
    %526 = torch.aten.mul.Scalar %521, %int1_623 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_624 = torch.constant.int 1
    %527 = torch.aten.add.Tensor %525, %526, %int1_624 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    // Cast back to f16 (dtype code 5) and restore the leading unit dim.
    %int5_625 = torch.constant.int 5
    %528 = torch.prims.convert_element_type %527, %int5_625 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f16>
    %int1_626 = torch.constant.int 1
    %int512_627 = torch.constant.int 512
    %int12288_628 = torch.constant.int 12288
    %529 = torch.prim.ListConstruct %int1_626, %int512_627, %int12288_628 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %530 = torch.aten.view %528, %529 : !torch.vtensor<[512,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,512,12288],f16>
    // GELU with the "tanh" approximation mode applied to the txt_mlp.0 output in f16,
    // then flattened to [512,12288] as input for the next mm.
    %str_629 = torch.constant.str "tanh"
    %531 = torch.aten.gelu %530, %str_629 : !torch.vtensor<[1,512,12288],f16>, !torch.str -> !torch.vtensor<[1,512,12288],f16>
    %int512_630 = torch.constant.int 512
    %int12288_631 = torch.constant.int 12288
    %532 = torch.prim.ListConstruct %int512_630, %int12288_631 : (!torch.int, !torch.int) -> !torch.list<int>
    %533 = torch.aten.view %531, %532 : !torch.vtensor<[1,512,12288],f16>, !torch.list<int> -> !torch.vtensor<[512,12288],f16>
    // Linear layer double_blocks.0.txt_mlp.2 applied to %533: x @ W^T + b, 12288 -> 3072.
    // Load the [3072,12288] weight and transpose it to [12288,3072].
    %__auto.sampler.double_blocks.0.txt_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.0.txt_mlp.2.weight : tensor<3072x12288xf16>
    %534 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.txt_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_632 = torch.constant.int 0
    %int1_633 = torch.constant.int 1
    %535 = torch.aten.transpose.int %534, %int0_632, %int1_633 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.0.txt_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.0.txt_mlp.2.bias : tensor<3072xf16>
    %536 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.txt_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Promote bias, input and weight to f32 (dtype code 6) for the matmul and bias add.
    %int6_634 = torch.constant.int 6
    %537 = torch.prims.convert_element_type %536, %int6_634 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_635 = torch.constant.int 6
    %538 = torch.prims.convert_element_type %533, %int6_635 : !torch.vtensor<[512,12288],f16>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int6_636 = torch.constant.int 6
    %539 = torch.prims.convert_element_type %535, %int6_636 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %540 = torch.aten.mm %538, %539 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Multiplications by the integer constant 1 leave the values unchanged.
    %int1_637 = torch.constant.int 1
    %541 = torch.aten.mul.Scalar %540, %int1_637 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_638 = torch.constant.int 1
    %542 = torch.aten.mul.Scalar %537, %int1_638 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_639 = torch.constant.int 1
    %543 = torch.aten.add.Tensor %541, %542, %int1_639 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    // Cast back to f16 (dtype code 5) and restore the leading unit dim.
    %int5_640 = torch.constant.int 5
    %544 = torch.prims.convert_element_type %543, %int5_640 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_641 = torch.constant.int 1
    %int512_642 = torch.constant.int 512
    %int3072_643 = torch.constant.int 3072
    %545 = torch.prim.ListConstruct %int1_641, %int512_642, %int3072_643 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %546 = torch.aten.view %544, %545 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Residual update: %548 = %505 + %253 * mlp_out, with %253 broadcast from [1,1,3072].
    // NOTE(review): %253 is defined before this excerpt; presumably the MLP gate -- confirm.
    %547 = torch.aten.mul.Tensor %253, %546 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_644 = torch.constant.int 1
    %548 = torch.aten.add.Tensor %505, %547, %int1_644 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Linear layer double_blocks.1.img_mod.lin applied to silu(%119): x @ W^T + b,
    // 3072 -> 18432.
    // NOTE(review): %119 is defined before this excerpt (a [1,3072] f16 vector) -- confirm
    // its meaning against the producer. The same silu(%119) is recomputed as %570 below.
    %549 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the [18432,3072] weight and transpose it to [3072,18432].
    %__auto.sampler.double_blocks.1.img_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.1.img_mod.lin.weight : tensor<18432x3072xf16>
    %550 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.img_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_645 = torch.constant.int 0
    %int1_646 = torch.constant.int 1
    %551 = torch.aten.transpose.int %550, %int0_645, %int1_646 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.1.img_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.1.img_mod.lin.bias : tensor<18432xf16>
    %552 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.img_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Promote bias, input and weight to f32 (dtype code 6) for the matmul and bias add.
    %int6_647 = torch.constant.int 6
    %553 = torch.prims.convert_element_type %552, %int6_647 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_648 = torch.constant.int 6
    %554 = torch.prims.convert_element_type %549, %int6_648 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_649 = torch.constant.int 6
    %555 = torch.prims.convert_element_type %551, %int6_649 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %556 = torch.aten.mm %554, %555 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Multiplications by the integer constant 1 leave the values unchanged.
    %int1_650 = torch.constant.int 1
    %557 = torch.aten.mul.Scalar %556, %int1_650 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_651 = torch.constant.int 1
    %558 = torch.aten.mul.Scalar %553, %int1_651 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_652 = torch.constant.int 1
    %559 = torch.aten.add.Tensor %557, %558, %int1_652 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    // Cast the result back to f16 (dtype code 5).
    %int5_653 = torch.constant.int 5
    %560 = torch.prims.convert_element_type %559, %int5_653 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Reshape the img_mod output to [1,1,18432] and cut the last dim into six
    // consecutive 3072-wide chunks (%564 .. %569).
    // NOTE(review): the consumers are past the end of this excerpt; the chunks are
    // presumably shift/scale/gate triples for the attention and MLP branches -- confirm.
    // The full-range slices (start 0, end INT64_MAX, step 1) leave the shape unchanged.
    %int0_654 = torch.constant.int 0
    %int0_655 = torch.constant.int 0
    %int9223372036854775807_656 = torch.constant.int 9223372036854775807
    %int1_657 = torch.constant.int 1
    %561 = torch.aten.slice.Tensor %560, %int0_654, %int0_655, %int9223372036854775807_656, %int1_657 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a unit dim at position 1 so the chunks broadcast against [1,N,3072] tensors.
    %int1_658 = torch.constant.int 1
    %562 = torch.aten.unsqueeze %561, %int1_658 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    %int2_659 = torch.constant.int 2
    %int0_660 = torch.constant.int 0
    %int9223372036854775807_661 = torch.constant.int 9223372036854775807
    %int1_662 = torch.constant.int 1
    %563 = torch.aten.slice.Tensor %562, %int2_659, %int0_660, %int9223372036854775807_661, %int1_662 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Chunk 0: columns [0, 3072).
    %int-1_663 = torch.constant.int -1
    %int0_664 = torch.constant.int 0
    %int3072_665 = torch.constant.int 3072
    %int1_666 = torch.constant.int 1
    %564 = torch.aten.slice.Tensor %563, %int-1_663, %int0_664, %int3072_665, %int1_666 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1: columns [3072, 6144).
    %int-1_667 = torch.constant.int -1
    %int3072_668 = torch.constant.int 3072
    %int6144_669 = torch.constant.int 6144
    %int1_670 = torch.constant.int 1
    %565 = torch.aten.slice.Tensor %563, %int-1_667, %int3072_668, %int6144_669, %int1_670 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2: columns [6144, 9216).
    %int-1_671 = torch.constant.int -1
    %int6144_672 = torch.constant.int 6144
    %int9216_673 = torch.constant.int 9216
    %int1_674 = torch.constant.int 1
    %566 = torch.aten.slice.Tensor %563, %int-1_671, %int6144_672, %int9216_673, %int1_674 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 3: columns [9216, 12288).
    %int-1_675 = torch.constant.int -1
    %int9216_676 = torch.constant.int 9216
    %int12288_677 = torch.constant.int 12288
    %int1_678 = torch.constant.int 1
    %567 = torch.aten.slice.Tensor %563, %int-1_675, %int9216_676, %int12288_677, %int1_678 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 4: columns [12288, 15360).
    %int-1_679 = torch.constant.int -1
    %int12288_680 = torch.constant.int 12288
    %int15360_681 = torch.constant.int 15360
    %int1_682 = torch.constant.int 1
    %568 = torch.aten.slice.Tensor %563, %int-1_679, %int12288_680, %int15360_681, %int1_682 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 5: columns [15360, 18432).
    %int-1_683 = torch.constant.int -1
    %int15360_684 = torch.constant.int 15360
    %int18432_685 = torch.constant.int 18432
    %int1_686 = torch.constant.int 1
    %569 = torch.aten.slice.Tensor %563, %int-1_683, %int15360_684, %int18432_685, %int1_686 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Linear layer double_blocks.1.txt_mod.lin applied to silu(%119): x @ W^T + b,
    // 3072 -> 18432.
    // NOTE(review): this silu has the same operand (%119) as %549 above, so the two
    // results are identical; a CSE pass is expected to merge them.
    %570 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the [18432,3072] weight and transpose it to [3072,18432].
    %__auto.sampler.double_blocks.1.txt_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.1.txt_mod.lin.weight : tensor<18432x3072xf16>
    %571 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.txt_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_687 = torch.constant.int 0
    %int1_688 = torch.constant.int 1
    %572 = torch.aten.transpose.int %571, %int0_687, %int1_688 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.1.txt_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.1.txt_mod.lin.bias : tensor<18432xf16>
    %573 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.txt_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Promote bias, input and weight to f32 (dtype code 6) for the matmul and bias add.
    %int6_689 = torch.constant.int 6
    %574 = torch.prims.convert_element_type %573, %int6_689 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_690 = torch.constant.int 6
    %575 = torch.prims.convert_element_type %570, %int6_690 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_691 = torch.constant.int 6
    %576 = torch.prims.convert_element_type %572, %int6_691 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %577 = torch.aten.mm %575, %576 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Multiplications by the integer constant 1 leave the values unchanged.
    %int1_692 = torch.constant.int 1
    %578 = torch.aten.mul.Scalar %577, %int1_692 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_693 = torch.constant.int 1
    %579 = torch.aten.mul.Scalar %574, %int1_693 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_694 = torch.constant.int 1
    %580 = torch.aten.add.Tensor %578, %579, %int1_694 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    // Cast the result back to f16 (dtype code 5).
    %int5_695 = torch.constant.int 5
    %581 = torch.prims.convert_element_type %580, %int5_695 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // ---- Split the [1,18432] projection into six [1,1,3072] chunks -----------
    // The slice on dim 0 spans 0..INT64_MAX with step 1 and keeps the full
    // [1,18432] shape; unsqueeze then inserts a size-1 axis at position 1.
    %int0_696 = torch.constant.int 0
    %int0_697 = torch.constant.int 0
    %int9223372036854775807_698 = torch.constant.int 9223372036854775807
    %int1_699 = torch.constant.int 1
    %582 = torch.aten.slice.Tensor %581, %int0_696, %int0_697, %int9223372036854775807_698, %int1_699 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    %int1_700 = torch.constant.int 1
    %583 = torch.aten.unsqueeze %582, %int1_700 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Full-range slice on dim 2; the shape stays [1,1,18432].
    %int2_701 = torch.constant.int 2
    %int0_702 = torch.constant.int 0
    %int9223372036854775807_703 = torch.constant.int 9223372036854775807
    %int1_704 = torch.constant.int 1
    %584 = torch.aten.slice.Tensor %583, %int2_701, %int0_702, %int9223372036854775807_703, %int1_704 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Chunk 0: last-dim range [0, 3072). Used below as the additive term of the
    // txt modulation.
    %int-1_705 = torch.constant.int -1
    %int0_706 = torch.constant.int 0
    %int3072_707 = torch.constant.int 3072
    %int1_708 = torch.constant.int 1
    %585 = torch.aten.slice.Tensor %584, %int-1_705, %int0_706, %int3072_707, %int1_708 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1: last-dim range [3072, 6144). Used below as the (1 + x)
    // multiplicative term of the txt modulation.
    %int-1_709 = torch.constant.int -1
    %int3072_710 = torch.constant.int 3072
    %int6144_711 = torch.constant.int 6144
    %int1_712 = torch.constant.int 1
    %586 = torch.aten.slice.Tensor %584, %int-1_709, %int3072_710, %int6144_711, %int1_712 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunks 2-5 cover [6144, 9216), [9216, 12288), [12288, 15360) and
    // [15360, 18432). Their consumers are not in this part of the function.
    // NOTE(review): presumably gate plus a second shift/scale/gate triple - confirm.
    %int-1_713 = torch.constant.int -1
    %int6144_714 = torch.constant.int 6144
    %int9216_715 = torch.constant.int 9216
    %int1_716 = torch.constant.int 1
    %587 = torch.aten.slice.Tensor %584, %int-1_713, %int6144_714, %int9216_715, %int1_716 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_717 = torch.constant.int -1
    %int9216_718 = torch.constant.int 9216
    %int12288_719 = torch.constant.int 12288
    %int1_720 = torch.constant.int 1
    %588 = torch.aten.slice.Tensor %584, %int-1_717, %int9216_718, %int12288_719, %int1_720 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_721 = torch.constant.int -1
    %int12288_722 = torch.constant.int 12288
    %int15360_723 = torch.constant.int 15360
    %int1_724 = torch.constant.int 1
    %589 = torch.aten.slice.Tensor %584, %int-1_721, %int12288_722, %int15360_723, %int1_724 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_725 = torch.constant.int -1
    %int15360_726 = torch.constant.int 15360
    %int18432_727 = torch.constant.int 18432
    %int1_728 = torch.constant.int 1
    %590 = torch.aten.slice.Tensor %584, %int-1_725, %int15360_726, %int18432_727, %int1_728 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // ---- img stream: normalize over the last axis, then modulate -------------
    // %488 ([1,4096,3072], f16) is defined earlier in the function.
    // Variance and mean are computed in f32 over dim 2 with correction = 0 and
    // keepdim = true, giving [1,4096,1] statistics.
    %int6_729 = torch.constant.int 6
    %591 = torch.prims.convert_element_type %488, %int6_729 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %int2_730 = torch.constant.int 2
    %592 = torch.prim.ListConstruct %int2_730 : (!torch.int) -> !torch.list<int>
    %int0_731 = torch.constant.int 0
    %true_732 = torch.constant.bool true
    %result0_733, %result1_734 = torch.aten.var_mean.correction %591, %592, %int0_731, %true_732 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // normalized = (x - mean) * rsqrt(var + eps), with eps ~= 1e-6. No learned
    // scale or bias is applied at this step.
    %float9.999990e-07_735 = torch.constant.float 9.9999999999999995E-7
    %int1_736 = torch.constant.int 1
    %593 = torch.aten.add.Scalar %result0_733, %float9.999990e-07_735, %int1_736 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %594 = torch.aten.rsqrt %593 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    %int1_737 = torch.constant.int 1
    %595 = torch.aten.sub.Tensor %488, %result1_734, %int1_737 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %596 = torch.aten.mul.Tensor %595, %594 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    %int5_738 = torch.constant.int 5
    %597 = torch.prims.convert_element_type %596, %int5_738 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Modulation in f16: (1 + %565) * normalized + %564, broadcast over the 4096
    // axis. %564 and %565 are [1,1,3072] values defined earlier in the function.
    // NOTE(review): presumably the img_mod shift and scale chunks - confirm.
    %int1_739 = torch.constant.int 1
    %int1_740 = torch.constant.int 1
    %598 = torch.aten.add.Scalar %565, %int1_739, %int1_740 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %599 = torch.aten.mul.Tensor %598, %597 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_741 = torch.constant.int 1
    %600 = torch.aten.add.Tensor %599, %564, %int1_741 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Drop the leading size-1 axis so the result can feed a 2-D matmul.
    %int4096_742 = torch.constant.int 4096
    %int3072_743 = torch.constant.int 3072
    %601 = torch.prim.ListConstruct %int4096_742, %int3072_743 : (!torch.int, !torch.int) -> !torch.list<int>
    %602 = torch.aten.view %600, %601 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // ---- double_blocks.1.img_attn.qkv: linear projection ---------------------
    // Computes %602 x transpose(weight) + bias, mapping [4096,3072] to
    // [4096,9216]. Operands are widened to f32 (dtype code 6) for the matmul and
    // bias add, and the result is narrowed back to f16 (dtype code 5).
    %__auto.sampler.double_blocks.1.img_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.1.img_attn.qkv.weight : tensor<9216x3072xf16>
    %603 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.img_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_744 = torch.constant.int 0
    %int1_745 = torch.constant.int 1
    %604 = torch.aten.transpose.int %603, %int0_744, %int1_745 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.1.img_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.1.img_attn.qkv.bias : tensor<9216xf16>
    %605 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.img_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    %int6_746 = torch.constant.int 6
    %606 = torch.prims.convert_element_type %605, %int6_746 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_747 = torch.constant.int 6
    %607 = torch.prims.convert_element_type %602, %int6_747 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_748 = torch.constant.int 6
    %608 = torch.prims.convert_element_type %604, %int6_748 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %609 = torch.aten.mm %607, %608 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[4096,9216],f32>
    // Both scalings below multiply by the constant 1.
    %int1_749 = torch.constant.int 1
    %610 = torch.aten.mul.Scalar %609, %int1_749 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int1_750 = torch.constant.int 1
    %611 = torch.aten.mul.Scalar %606, %int1_750 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_751 = torch.constant.int 1
    %612 = torch.aten.add.Tensor %610, %611, %int1_751 : !torch.vtensor<[4096,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int5_752 = torch.constant.int 5
    %613 = torch.prims.convert_element_type %612, %int5_752 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f16>
    // Restore the leading size-1 axis: [4096,9216] -> [1,4096,9216].
    %int1_753 = torch.constant.int 1
    %int4096_754 = torch.constant.int 4096
    %int9216_755 = torch.constant.int 9216
    %614 = torch.prim.ListConstruct %int1_753, %int4096_754, %int9216_755 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %615 = torch.aten.view %613, %614 : !torch.vtensor<[4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,9216],f16>
    // Trace round-trip: cast to a fully dynamic shape, query its three dims,
    // pass the value to flow.tensor.trace under the key "img_qkv", then cast back
    // to the static shape. %617 carries the same values as %615.
    %616 = torch_c.to_builtin_tensor %615 : !torch.vtensor<[1,4096,9216],f16> -> tensor<1x4096x9216xf16>
    %cast_756 = tensor.cast %616 : tensor<1x4096x9216xf16> to tensor<?x?x?xf16>
    %c0_757 = arith.constant 0 : index
    %dim_758 = tensor.dim %cast_756, %c0_757 : tensor<?x?x?xf16>
    %c1_759 = arith.constant 1 : index
    %dim_760 = tensor.dim %cast_756, %c1_759 : tensor<?x?x?xf16>
    %c2_761 = arith.constant 2 : index
    %dim_762 = tensor.dim %cast_756, %c2_761 : tensor<?x?x?xf16>
    flow.tensor.trace "img_qkv" = [%cast_756 : tensor<?x?x?xf16>{%dim_758, %dim_760, %dim_762}]
    %cast_763 = tensor.cast %cast_756 : tensor<?x?x?xf16> to tensor<1x4096x9216xf16>
    %617 = torch_c.from_builtin_tensor %cast_763 : tensor<1x4096x9216xf16> -> !torch.vtensor<[1,4096,9216],f16>
    // ---- img stream: slice 0 of the qkv projection, then query_norm ----------
    // [1,4096,9216] is viewed as [1,4096,3,24,128] and permuted with order
    // (2,0,3,1,4) to [3,1,24,4096,128]; index 0 on dim 0 is then selected.
    %int1_764 = torch.constant.int 1
    %int4096_765 = torch.constant.int 4096
    %int3_766 = torch.constant.int 3
    %int24_767 = torch.constant.int 24
    %int128_768 = torch.constant.int 128
    %618 = torch.prim.ListConstruct %int1_764, %int4096_765, %int3_766, %int24_767, %int128_768 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %619 = torch.aten.view %617, %618 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_769 = torch.constant.int 2
    %int0_770 = torch.constant.int 0
    %int3_771 = torch.constant.int 3
    %int1_772 = torch.constant.int 1
    %int4_773 = torch.constant.int 4
    %620 = torch.prim.ListConstruct %int2_769, %int0_770, %int3_771, %int1_772, %int4_773 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %621 = torch.aten.permute %619, %620 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    %int0_774 = torch.constant.int 0
    %int0_775 = torch.constant.int 0
    %622 = torch.aten.select.int %621, %int0_774, %int0_775 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Root-mean-square normalization over the last axis, computed in f32:
    //   y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps), eps ~= 1e-6.
    %int6_776 = torch.constant.int 6
    %623 = torch.prims.convert_element_type %622, %int6_776 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_777 = torch.constant.int 2
    %624 = torch.aten.pow.Tensor_Scalar %623, %int2_777 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_778 = torch.constant.int -1
    %625 = torch.prim.ListConstruct %int-1_778 : (!torch.int) -> !torch.list<int>
    %true_779 = torch.constant.bool true
    %none_780 = torch.constant.none
    %626 = torch.aten.mean.dim %624, %625, %true_779, %none_780 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_781 = torch.constant.float 9.9999999999999995E-7
    %int1_782 = torch.constant.int 1
    %627 = torch.aten.add.Scalar %626, %float9.999990e-07_781, %int1_782 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %628 = torch.aten.rsqrt %627 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %629 = torch.aten.mul.Tensor %623, %628 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Narrow to f16, then multiply by the 128-element query_norm.scale parameter.
    %int5_783 = torch.constant.int 5
    %630 = torch.prims.convert_element_type %629, %int5_783 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.1.img_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.1.img_attn.norm.query_norm.scale : tensor<128xf16>
    %631 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.img_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %632 = torch.aten.mul.Tensor %630, %631 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // ---- img stream: slice 1 of the qkv projection, then key_norm ------------
    // Repeats the view/permute of %617 used for the query path (same shapes and
    // permutation order), but selects index 1 on dim 0.
    %int1_784 = torch.constant.int 1
    %int4096_785 = torch.constant.int 4096
    %int3_786 = torch.constant.int 3
    %int24_787 = torch.constant.int 24
    %int128_788 = torch.constant.int 128
    %633 = torch.prim.ListConstruct %int1_784, %int4096_785, %int3_786, %int24_787, %int128_788 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %634 = torch.aten.view %617, %633 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_789 = torch.constant.int 2
    %int0_790 = torch.constant.int 0
    %int3_791 = torch.constant.int 3
    %int1_792 = torch.constant.int 1
    %int4_793 = torch.constant.int 4
    %635 = torch.prim.ListConstruct %int2_789, %int0_790, %int3_791, %int1_792, %int4_793 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %636 = torch.aten.permute %634, %635 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    %int0_794 = torch.constant.int 0
    %int1_795 = torch.constant.int 1
    %637 = torch.aten.select.int %636, %int0_794, %int1_795 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Root-mean-square normalization over the last axis, computed in f32:
    //   y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps), eps ~= 1e-6.
    %int6_796 = torch.constant.int 6
    %638 = torch.prims.convert_element_type %637, %int6_796 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_797 = torch.constant.int 2
    %639 = torch.aten.pow.Tensor_Scalar %638, %int2_797 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_798 = torch.constant.int -1
    %640 = torch.prim.ListConstruct %int-1_798 : (!torch.int) -> !torch.list<int>
    %true_799 = torch.constant.bool true
    %none_800 = torch.constant.none
    %641 = torch.aten.mean.dim %639, %640, %true_799, %none_800 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_801 = torch.constant.float 9.9999999999999995E-7
    %int1_802 = torch.constant.int 1
    %642 = torch.aten.add.Scalar %641, %float9.999990e-07_801, %int1_802 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %643 = torch.aten.rsqrt %642 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %644 = torch.aten.mul.Tensor %638, %643 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Narrow to f16, then multiply by the 128-element key_norm.scale parameter.
    %int5_803 = torch.constant.int 5
    %645 = torch.prims.convert_element_type %644, %int5_803 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.1.img_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.1.img_attn.norm.key_norm.scale : tensor<128xf16>
    %646 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.img_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %647 = torch.aten.mul.Tensor %645, %646 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // Casts of the normalized img query (%632) and key (%647) to dtype code 5;
    // source and result types are both f16, so the element type is unchanged.
    %int5_804 = torch.constant.int 5
    %648 = torch.prims.convert_element_type %632, %int5_804 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_805 = torch.constant.int 5
    %649 = torch.prims.convert_element_type %647, %int5_805 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // ---- txt stream: normalize over the last axis, then modulate -------------
    // %548 ([1,512,3072], f16) is defined earlier in the function.
    // Variance and mean are computed in f32 over dim 2 with correction = 0 and
    // keepdim = true, giving [1,512,1] statistics.
    %int6_806 = torch.constant.int 6
    %650 = torch.prims.convert_element_type %548, %int6_806 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %int2_807 = torch.constant.int 2
    %651 = torch.prim.ListConstruct %int2_807 : (!torch.int) -> !torch.list<int>
    %int0_808 = torch.constant.int 0
    %true_809 = torch.constant.bool true
    %result0_810, %result1_811 = torch.aten.var_mean.correction %650, %651, %int0_808, %true_809 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // normalized = (x - mean) * rsqrt(var + eps), with eps ~= 1e-6. No learned
    // scale or bias is applied at this step.
    %float9.999990e-07_812 = torch.constant.float 9.9999999999999995E-7
    %int1_813 = torch.constant.int 1
    %652 = torch.aten.add.Scalar %result0_810, %float9.999990e-07_812, %int1_813 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %653 = torch.aten.rsqrt %652 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    %int1_814 = torch.constant.int 1
    %654 = torch.aten.sub.Tensor %548, %result1_811, %int1_814 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %655 = torch.aten.mul.Tensor %654, %653 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_815 = torch.constant.int 5
    %656 = torch.prims.convert_element_type %655, %int5_815 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Modulation in f16: (1 + %586) * normalized + %585, broadcast over the 512
    // axis. %585 and %586 are the [0,3072) and [3072,6144) chunks of the
    // txt_mod.lin projection.
    %int1_816 = torch.constant.int 1
    %int1_817 = torch.constant.int 1
    %657 = torch.aten.add.Scalar %586, %int1_816, %int1_817 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %658 = torch.aten.mul.Tensor %657, %656 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_818 = torch.constant.int 1
    %659 = torch.aten.add.Tensor %658, %585, %int1_818 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Drop the leading size-1 axis so the result can feed a 2-D matmul.
    %int512_819 = torch.constant.int 512
    %int3072_820 = torch.constant.int 3072
    %660 = torch.prim.ListConstruct %int512_819, %int3072_820 : (!torch.int, !torch.int) -> !torch.list<int>
    %661 = torch.aten.view %659, %660 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // ---- double_blocks.1.txt_attn.qkv: linear projection ---------------------
    // Computes %661 x transpose(weight) + bias, mapping [512,3072] to
    // [512,9216]. Operands are widened to f32 (dtype code 6) for the matmul and
    // bias add, and the result is narrowed back to f16 (dtype code 5).
    %__auto.sampler.double_blocks.1.txt_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.1.txt_attn.qkv.weight : tensor<9216x3072xf16>
    %662 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.txt_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_821 = torch.constant.int 0
    %int1_822 = torch.constant.int 1
    %663 = torch.aten.transpose.int %662, %int0_821, %int1_822 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.1.txt_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.1.txt_attn.qkv.bias : tensor<9216xf16>
    %664 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.txt_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    %int6_823 = torch.constant.int 6
    %665 = torch.prims.convert_element_type %664, %int6_823 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_824 = torch.constant.int 6
    %666 = torch.prims.convert_element_type %661, %int6_824 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_825 = torch.constant.int 6
    %667 = torch.prims.convert_element_type %663, %int6_825 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %668 = torch.aten.mm %666, %667 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[512,9216],f32>
    // Both scalings below multiply by the constant 1.
    %int1_826 = torch.constant.int 1
    %669 = torch.aten.mul.Scalar %668, %int1_826 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int1_827 = torch.constant.int 1
    %670 = torch.aten.mul.Scalar %665, %int1_827 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_828 = torch.constant.int 1
    %671 = torch.aten.add.Tensor %669, %670, %int1_828 : !torch.vtensor<[512,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int5_829 = torch.constant.int 5
    %672 = torch.prims.convert_element_type %671, %int5_829 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f16>
    // Restore the leading size-1 axis: [512,9216] -> [1,512,9216].
    %int1_830 = torch.constant.int 1
    %int512_831 = torch.constant.int 512
    %int9216_832 = torch.constant.int 9216
    %673 = torch.prim.ListConstruct %int1_830, %int512_831, %int9216_832 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %674 = torch.aten.view %672, %673 : !torch.vtensor<[512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,9216],f16>
    // Trace round-trip: cast to a fully dynamic shape, query its three dims,
    // pass the value to flow.tensor.trace under the key "txt_qkv", then cast back
    // to the static shape. %676 carries the same values as %674.
    %675 = torch_c.to_builtin_tensor %674 : !torch.vtensor<[1,512,9216],f16> -> tensor<1x512x9216xf16>
    %cast_833 = tensor.cast %675 : tensor<1x512x9216xf16> to tensor<?x?x?xf16>
    %c0_834 = arith.constant 0 : index
    %dim_835 = tensor.dim %cast_833, %c0_834 : tensor<?x?x?xf16>
    %c1_836 = arith.constant 1 : index
    %dim_837 = tensor.dim %cast_833, %c1_836 : tensor<?x?x?xf16>
    %c2_838 = arith.constant 2 : index
    %dim_839 = tensor.dim %cast_833, %c2_838 : tensor<?x?x?xf16>
    flow.tensor.trace "txt_qkv" = [%cast_833 : tensor<?x?x?xf16>{%dim_835, %dim_837, %dim_839}]
    %cast_840 = tensor.cast %cast_833 : tensor<?x?x?xf16> to tensor<1x512x9216xf16>
    %676 = torch_c.from_builtin_tensor %cast_840 : tensor<1x512x9216xf16> -> !torch.vtensor<[1,512,9216],f16>
    // ---- txt stream: slice 0 of the qkv projection, then query_norm ----------
    // [1,512,9216] is viewed as [1,512,3,24,128] and permuted with order
    // (2,0,3,1,4) to [3,1,24,512,128]; index 0 on dim 0 is then selected.
    %int1_841 = torch.constant.int 1
    %int512_842 = torch.constant.int 512
    %int3_843 = torch.constant.int 3
    %int24_844 = torch.constant.int 24
    %int128_845 = torch.constant.int 128
    %677 = torch.prim.ListConstruct %int1_841, %int512_842, %int3_843, %int24_844, %int128_845 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %678 = torch.aten.view %676, %677 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_846 = torch.constant.int 2
    %int0_847 = torch.constant.int 0
    %int3_848 = torch.constant.int 3
    %int1_849 = torch.constant.int 1
    %int4_850 = torch.constant.int 4
    %679 = torch.prim.ListConstruct %int2_846, %int0_847, %int3_848, %int1_849, %int4_850 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %680 = torch.aten.permute %678, %679 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    %int0_851 = torch.constant.int 0
    %int0_852 = torch.constant.int 0
    %681 = torch.aten.select.int %680, %int0_851, %int0_852 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Root-mean-square normalization over the last axis, computed in f32:
    //   y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps), eps ~= 1e-6.
    %int6_853 = torch.constant.int 6
    %682 = torch.prims.convert_element_type %681, %int6_853 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_854 = torch.constant.int 2
    %683 = torch.aten.pow.Tensor_Scalar %682, %int2_854 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_855 = torch.constant.int -1
    %684 = torch.prim.ListConstruct %int-1_855 : (!torch.int) -> !torch.list<int>
    %true_856 = torch.constant.bool true
    %none_857 = torch.constant.none
    %685 = torch.aten.mean.dim %683, %684, %true_856, %none_857 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_858 = torch.constant.float 9.9999999999999995E-7
    %int1_859 = torch.constant.int 1
    %686 = torch.aten.add.Scalar %685, %float9.999990e-07_858, %int1_859 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %687 = torch.aten.rsqrt %686 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %688 = torch.aten.mul.Tensor %682, %687 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    // Narrow to f16, then multiply by the 128-element query_norm.scale parameter.
    %int5_860 = torch.constant.int 5
    %689 = torch.prims.convert_element_type %688, %int5_860 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.1.txt_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.1.txt_attn.norm.query_norm.scale : tensor<128xf16>
    %690 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.txt_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %691 = torch.aten.mul.Tensor %689, %690 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // ---- txt stream: slice 1 of the qkv projection, then key_norm ------------
    // Repeats the view/permute of %676 used for the query path (same shapes and
    // permutation order), but selects index 1 on dim 0.
    %int1_861 = torch.constant.int 1
    %int512_862 = torch.constant.int 512
    %int3_863 = torch.constant.int 3
    %int24_864 = torch.constant.int 24
    %int128_865 = torch.constant.int 128
    %692 = torch.prim.ListConstruct %int1_861, %int512_862, %int3_863, %int24_864, %int128_865 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %693 = torch.aten.view %676, %692 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_866 = torch.constant.int 2
    %int0_867 = torch.constant.int 0
    %int3_868 = torch.constant.int 3
    %int1_869 = torch.constant.int 1
    %int4_870 = torch.constant.int 4
    %694 = torch.prim.ListConstruct %int2_866, %int0_867, %int3_868, %int1_869, %int4_870 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %695 = torch.aten.permute %693, %694 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    %int0_871 = torch.constant.int 0
    %int1_872 = torch.constant.int 1
    %696 = torch.aten.select.int %695, %int0_871, %int1_872 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Root-mean-square normalization over the last axis, computed in f32:
    //   y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps), eps ~= 1e-6.
    %int6_873 = torch.constant.int 6
    %697 = torch.prims.convert_element_type %696, %int6_873 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_874 = torch.constant.int 2
    %698 = torch.aten.pow.Tensor_Scalar %697, %int2_874 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_875 = torch.constant.int -1
    %699 = torch.prim.ListConstruct %int-1_875 : (!torch.int) -> !torch.list<int>
    %true_876 = torch.constant.bool true
    %none_877 = torch.constant.none
    %700 = torch.aten.mean.dim %698, %699, %true_876, %none_877 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_878 = torch.constant.float 9.9999999999999995E-7
    %int1_879 = torch.constant.int 1
    %701 = torch.aten.add.Scalar %700, %float9.999990e-07_878, %int1_879 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %702 = torch.aten.rsqrt %701 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %703 = torch.aten.mul.Tensor %697, %702 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    // Narrow to f16, then multiply by the 128-element key_norm.scale parameter.
    %int5_880 = torch.constant.int 5
    %704 = torch.prims.convert_element_type %703, %int5_880 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.1.txt_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.1.txt_attn.norm.key_norm.scale : tensor<128xf16>
    %705 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.txt_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %706 = torch.aten.mul.Tensor %704, %705 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // Casts of the normalized txt query (%691) and key (%706) to dtype code 5;
    // source and result types are both f16, so the element type is unchanged.
    %int5_881 = torch.constant.int 5
    %707 = torch.prims.convert_element_type %691, %int5_881 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int5_882 = torch.constant.int 5
    %708 = torch.prims.convert_element_type %706, %int5_882 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // ---- Join the txt and img streams along dim 2 ----------------------------
    // Queries: txt (512 rows) first, then img (4096 rows) -> [1,24,4608,128].
    %709 = torch.prim.ListConstruct %707, %648 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_883 = torch.constant.int 2
    %710 = torch.aten.cat %709, %int2_883 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Keys: same ordering (txt, then img) -> [1,24,4608,128].
    %711 = torch.prim.ListConstruct %708, %649 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_884 = torch.constant.int 2
    %712 = torch.aten.cat %711, %int2_884 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- Slice 2 of both qkv projections, joined along dim 2 -----------------
    // No normalization is applied to this slice, unlike slices 0 and 1.
    // NOTE(review): presumably the attention value tensor - confirm downstream.
    // txt: view %676 as [1,512,3,24,128], permute to [3,1,24,512,128], select
    // index 2 on dim 0.
    %int1_885 = torch.constant.int 1
    %int512_886 = torch.constant.int 512
    %int3_887 = torch.constant.int 3
    %int24_888 = torch.constant.int 24
    %int128_889 = torch.constant.int 128
    %713 = torch.prim.ListConstruct %int1_885, %int512_886, %int3_887, %int24_888, %int128_889 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %714 = torch.aten.view %676, %713 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_890 = torch.constant.int 2
    %int0_891 = torch.constant.int 0
    %int3_892 = torch.constant.int 3
    %int1_893 = torch.constant.int 1
    %int4_894 = torch.constant.int 4
    %715 = torch.prim.ListConstruct %int2_890, %int0_891, %int3_892, %int1_893, %int4_894 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %716 = torch.aten.permute %714, %715 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    %int0_895 = torch.constant.int 0
    %int2_896 = torch.constant.int 2
    %717 = torch.aten.select.int %716, %int0_895, %int2_896 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // img: view %617 as [1,4096,3,24,128], permute to [3,1,24,4096,128], select
    // index 2 on dim 0.
    %int1_897 = torch.constant.int 1
    %int4096_898 = torch.constant.int 4096
    %int3_899 = torch.constant.int 3
    %int24_900 = torch.constant.int 24
    %int128_901 = torch.constant.int 128
    %718 = torch.prim.ListConstruct %int1_897, %int4096_898, %int3_899, %int24_900, %int128_901 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %719 = torch.aten.view %617, %718 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_902 = torch.constant.int 2
    %int0_903 = torch.constant.int 0
    %int3_904 = torch.constant.int 3
    %int1_905 = torch.constant.int 1
    %int4_906 = torch.constant.int 4
    %720 = torch.prim.ListConstruct %int2_902, %int0_903, %int3_904, %int1_905, %int4_906 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %721 = torch.aten.permute %719, %720 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    %int0_907 = torch.constant.int 0
    %int2_908 = torch.constant.int 2
    %722 = torch.aten.select.int %721, %int0_907, %int2_908 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Concatenate txt (512 rows) then img (4096 rows) along dim 2, matching the
    // ordering used for the query and key concatenations.
    %723 = torch.prim.ListConstruct %717, %722 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_909 = torch.constant.int 2
    %724 = torch.aten.cat %723, %int2_909 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- Trace the concatenated query tensor under the key "q" ---------------
    // Cast %710 to a fully dynamic 4-D shape, query all four dims, pass the value
    // to flow.tensor.trace, then cast back to the static shape. %726 carries the
    // same values as %710.
    %725 = torch_c.to_builtin_tensor %710 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_910 = tensor.cast %725 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_911 = arith.constant 0 : index
    %dim_912 = tensor.dim %cast_910, %c0_911 : tensor<?x?x?x?xf16>
    %c1_913 = arith.constant 1 : index
    %dim_914 = tensor.dim %cast_910, %c1_913 : tensor<?x?x?x?xf16>
    %c2_915 = arith.constant 2 : index
    %dim_916 = tensor.dim %cast_910, %c2_915 : tensor<?x?x?x?xf16>
    %c3_917 = arith.constant 3 : index
    %dim_918 = tensor.dim %cast_910, %c3_917 : tensor<?x?x?x?xf16>
    flow.tensor.trace "q" = [%cast_910 : tensor<?x?x?x?xf16>{%dim_912, %dim_914, %dim_916, %dim_918}]
    %cast_919 = tensor.cast %cast_910 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %726 = torch_c.from_builtin_tensor %cast_919 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- Trace the concatenated key tensor under the key "k" -----------------
    // Same round-trip applied to %712. %728 carries the same values as %712.
    %727 = torch_c.to_builtin_tensor %712 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_920 = tensor.cast %727 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_921 = arith.constant 0 : index
    %dim_922 = tensor.dim %cast_920, %c0_921 : tensor<?x?x?x?xf16>
    %c1_923 = arith.constant 1 : index
    %dim_924 = tensor.dim %cast_920, %c1_923 : tensor<?x?x?x?xf16>
    %c2_925 = arith.constant 2 : index
    %dim_926 = tensor.dim %cast_920, %c2_925 : tensor<?x?x?x?xf16>
    %c3_927 = arith.constant 3 : index
    %dim_928 = tensor.dim %cast_920, %c3_927 : tensor<?x?x?x?xf16>
    flow.tensor.trace "k" = [%cast_920 : tensor<?x?x?x?xf16>{%dim_922, %dim_924, %dim_926, %dim_928}]
    %cast_929 = tensor.cast %cast_920 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %728 = torch_c.from_builtin_tensor %cast_929 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    %729 = torch_c.to_builtin_tensor %724 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_930 = tensor.cast %729 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_931 = arith.constant 0 : index
    %dim_932 = tensor.dim %cast_930, %c0_931 : tensor<?x?x?x?xf16>
    %c1_933 = arith.constant 1 : index
    %dim_934 = tensor.dim %cast_930, %c1_933 : tensor<?x?x?x?xf16>
    %c2_935 = arith.constant 2 : index
    %dim_936 = tensor.dim %cast_930, %c2_935 : tensor<?x?x?x?xf16>
    %c3_937 = arith.constant 3 : index
    %dim_938 = tensor.dim %cast_930, %c3_937 : tensor<?x?x?x?xf16>
    flow.tensor.trace "v" = [%cast_930 : tensor<?x?x?x?xf16>{%dim_932, %dim_934, %dim_936, %dim_938}]
    %cast_939 = tensor.cast %cast_930 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %730 = torch_c.from_builtin_tensor %cast_939 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Combine the traced "q" (%726) and "k" (%728) tensors with %211 ([1,1,4608,64,2,2], f32, produced before this point).
    // For x in {q, k}, with x viewed as [1,24,4608,64,1,2]:
    //     out = %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]
    // The size-1 axes broadcast (1 -> 24 on dim 1 of %211, 1 -> 2 on dim 4 of x), so out is [1,24,4608,64,2].
    // NOTE(review): this has the shape of a rotary position-embedding application with %211 as the
    // per-position 2x2 table - confirm against the producer of %211.

    // q: upcast to f32 (dtype code 6 gives f32 per the result type) and split the last axis 128 -> 64 x 1 x 2.
    %int6_940 = torch.constant.int 6
    %731 = torch.prims.convert_element_type %726, %int6_940 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_941 = torch.constant.int 1
    %int24_942 = torch.constant.int 24
    %int4608_943 = torch.constant.int 4608
    %int-1_944 = torch.constant.int -1
    %int1_945 = torch.constant.int 1
    %int2_946 = torch.constant.int 2
    %732 = torch.prim.ListConstruct %int1_941, %int24_942, %int4608_943, %int-1_944, %int1_945, %int2_946 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // The -1 entry resolves to 64 (see result type).
    %733 = torch.aten.view %731, %732 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // k: same upcast and view as q.
    %int6_947 = torch.constant.int 6
    %734 = torch.prims.convert_element_type %728, %int6_947 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_948 = torch.constant.int 1
    %int24_949 = torch.constant.int 24
    %int4608_950 = torch.constant.int 4608
    %int-1_951 = torch.constant.int -1
    %int1_952 = torch.constant.int 1
    %int2_953 = torch.constant.int 2
    %735 = torch.prim.ListConstruct %int1_948, %int24_949, %int4608_950, %int-1_951, %int1_952, %int2_953 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %736 = torch.aten.view %734, %735 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // q, first term: %211[..., 0] * q[..., 0].
    %int5_954 = torch.constant.int 5
    %int0_955 = torch.constant.int 0
    %737 = torch.aten.select.int %211, %int5_954, %int0_955 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_956 = torch.constant.int 5
    %int0_957 = torch.constant.int 0
    %738 = torch.aten.select.int %733, %int5_956, %int0_957 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %739 = torch.aten.mul.Tensor %737, %738 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // q, second term: %211[..., 1] * q[..., 1].
    %int5_958 = torch.constant.int 5
    %int1_959 = torch.constant.int 1
    %740 = torch.aten.select.int %211, %int5_958, %int1_959 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_960 = torch.constant.int 5
    %int1_961 = torch.constant.int 1
    %741 = torch.aten.select.int %733, %int5_960, %int1_961 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %742 = torch.aten.mul.Tensor %740, %741 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // q: sum of the two terms (alpha = 1).
    %int1_962 = torch.constant.int 1
    %743 = torch.aten.add.Tensor %739, %742, %int1_962 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k, first term: %211[..., 0] * k[..., 0].
    %int5_963 = torch.constant.int 5
    %int0_964 = torch.constant.int 0
    %744 = torch.aten.select.int %211, %int5_963, %int0_964 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_965 = torch.constant.int 5
    %int0_966 = torch.constant.int 0
    %745 = torch.aten.select.int %736, %int5_965, %int0_966 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %746 = torch.aten.mul.Tensor %744, %745 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k, second term: %211[..., 1] * k[..., 1].
    %int5_967 = torch.constant.int 5
    %int1_968 = torch.constant.int 1
    %747 = torch.aten.select.int %211, %int5_967, %int1_968 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_969 = torch.constant.int 5
    %int1_970 = torch.constant.int 1
    %748 = torch.aten.select.int %736, %int5_969, %int1_970 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %749 = torch.aten.mul.Tensor %747, %748 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k: sum of the two terms (alpha = 1).
    %int1_971 = torch.constant.int 1
    %750 = torch.aten.add.Tensor %746, %749, %int1_971 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // q: merge the trailing 64 x 2 axes back to 128 and downcast to f16 (dtype code 5 gives f16 per the result type).
    %int1_972 = torch.constant.int 1
    %int24_973 = torch.constant.int 24
    %int4608_974 = torch.constant.int 4608
    %int128_975 = torch.constant.int 128
    %751 = torch.prim.ListConstruct %int1_972, %int24_973, %int4608_974, %int128_975 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %752 = torch.aten.view %743, %751 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_976 = torch.constant.int 5
    %753 = torch.prims.convert_element_type %752, %int5_976 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // k: same merge and downcast.
    %int1_977 = torch.constant.int 1
    %int24_978 = torch.constant.int 24
    %int4608_979 = torch.constant.int 4608
    %int128_980 = torch.constant.int 128
    %754 = torch.prim.ListConstruct %int1_977, %int24_978, %int4608_979, %int128_980 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %755 = torch.aten.view %750, %754 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_981 = torch.constant.int 5
    %756 = torch.prims.convert_element_type %755, %int5_981 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Attention over the full 4608-row sequence, using the processed q (%753), processed k (%756) and traced v (%730).
    // Trailing operands are the constants 0.0, false, none, none.
    // NOTE(review): per the ATen signature these are presumably dropout_p, is_causal, attn_mask and scale - confirm against the PyTorch op schema.
    // Result #0 is the [1,24,4608,128] f16 output; result #1 ([1,24,4608], f32) is not used within this visible span.
    %float0.000000e00_982 = torch.constant.float 0.000000e+00
    %false_983 = torch.constant.bool false
    %none_984 = torch.constant.none
    %none_985 = torch.constant.none
    %757:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%753, %756, %730, %float0.000000e00_982, %false_983, %none_984, %none_985) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Rearrange the attention output and split it back into its 512-row and 4096-row parts.
    // Swap dims 1 and 2: [1,24,4608,128] -> [1,4608,24,128].
    %int0_986 = torch.constant.int 0
    %int2_987 = torch.constant.int 2
    %int1_988 = torch.constant.int 1
    %int3_989 = torch.constant.int 3
    %758 = torch.prim.ListConstruct %int0_986, %int2_987, %int1_988, %int3_989 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %759 = torch.aten.permute %757#0, %758 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the trailing 24 x 128 axes into 3072.
    %int1_990 = torch.constant.int 1
    %int4608_991 = torch.constant.int 4608
    %int3072_992 = torch.constant.int 3072
    %760 = torch.prim.ListConstruct %int1_990, %int4608_991, %int3072_992 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %761 = torch.aten.view %759, %760 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1); the shape is unchanged.
    %int0_993 = torch.constant.int 0
    %int0_994 = torch.constant.int 0
    %int9223372036854775807_995 = torch.constant.int 9223372036854775807
    %int1_996 = torch.constant.int 1
    %762 = torch.aten.slice.Tensor %761, %int0_993, %int0_994, %int9223372036854775807_995, %int1_996 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Rows [0, 512) along dim 1; this part later feeds double_blocks.1.txt_attn.proj.
    %int1_997 = torch.constant.int 1
    %int0_998 = torch.constant.int 0
    %int512_999 = torch.constant.int 512
    %int1_1000 = torch.constant.int 1
    %763 = torch.aten.slice.Tensor %762, %int1_997, %int0_998, %int512_999, %int1_1000 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Second full-range slice on dim 0, again shape-preserving.
    %int0_1001 = torch.constant.int 0
    %int0_1002 = torch.constant.int 0
    %int9223372036854775807_1003 = torch.constant.int 9223372036854775807
    %int1_1004 = torch.constant.int 1
    %764 = torch.aten.slice.Tensor %761, %int0_1001, %int0_1002, %int9223372036854775807_1003, %int1_1004 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Rows [512, end) along dim 1 (4096 rows); this part feeds double_blocks.1.img_attn.proj.
    %int1_1005 = torch.constant.int 1
    %int512_1006 = torch.constant.int 512
    %int9223372036854775807_1007 = torch.constant.int 9223372036854775807
    %int1_1008 = torch.constant.int 1
    %765 = torch.aten.slice.Tensor %764, %int1_1005, %int512_1006, %int9223372036854775807_1007, %int1_1008 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer double_blocks.1.img_attn.proj applied to the 4096-row attention slice (%765), then a scaled residual add.
    // Drop the leading unit dim so the input is a 2-D matrix for mm.
    %int4096_1009 = torch.constant.int 4096
    %int3072_1010 = torch.constant.int 3072
    %766 = torch.prim.ListConstruct %int4096_1009, %int3072_1010 : (!torch.int, !torch.int) -> !torch.list<int>
    %767 = torch.aten.view %765, %766 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // The weight is loaded as [3072,3072] and transposed before the matmul.
    %__auto.sampler.double_blocks.1.img_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.1.img_attn.proj.weight : tensor<3072x3072xf16>
    %768 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.img_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_1011 = torch.constant.int 0
    %int1_1012 = torch.constant.int 1
    %769 = torch.aten.transpose.int %768, %int0_1011, %int1_1012 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.1.img_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.1.img_attn.proj.bias : tensor<3072xf16>
    %770 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.img_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Bias, input and weight are upcast to f32, so the matmul and bias add run in f32.
    %int6_1013 = torch.constant.int 6
    %771 = torch.prims.convert_element_type %770, %int6_1013 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_1014 = torch.constant.int 6
    %772 = torch.prims.convert_element_type %767, %int6_1014 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_1015 = torch.constant.int 6
    %773 = torch.prims.convert_element_type %769, %int6_1015 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %774 = torch.aten.mm %772, %773 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Both mul.Scalar ops multiply by the constant 1, and the add uses alpha = 1.
    %int1_1016 = torch.constant.int 1
    %775 = torch.aten.mul.Scalar %774, %int1_1016 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_1017 = torch.constant.int 1
    %776 = torch.aten.mul.Scalar %771, %int1_1017 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_1018 = torch.constant.int 1
    %777 = torch.aten.add.Tensor %775, %776, %int1_1018 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    // Downcast to f16 and restore the leading unit dim.
    %int5_1019 = torch.constant.int 5
    %778 = torch.prims.convert_element_type %777, %int5_1019 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_1020 = torch.constant.int 1
    %int4096_1021 = torch.constant.int 4096
    %int3072_1022 = torch.constant.int 3072
    %779 = torch.prim.ListConstruct %int1_1020, %int4096_1021, %int3072_1022 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %780 = torch.aten.view %778, %779 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Scale by %566 ([1,1,3072], broadcast over the 4096 rows) and add to %488; both come from before this point.
    // NOTE(review): %566 is presumably a modulation gate and %488 the image residual stream - confirm at their producers.
    %781 = torch.aten.mul.Tensor %566, %780 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_1023 = torch.constant.int 1
    %782 = torch.aten.add.Tensor %488, %781, %int1_1023 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Normalise %782 over its last axis, then apply an elementwise scale and shift:
    //     %792 = (1 + %568) * ((%782 - mean) * rsqrt(var + eps)) + %567
    // No learned normalisation weight or bias appears in these ops.
    // NOTE(review): %568 and %567 ([1,1,3072], from before this point) are presumably modulation scale and shift - confirm.
    %int1_1024 = torch.constant.int 1
    %int1_1025 = torch.constant.int 1
    %783 = torch.aten.add.Scalar %568, %int1_1024, %int1_1025 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Statistics are taken on an f32 copy of %782.
    %int6_1026 = torch.constant.int 6
    %784 = torch.prims.convert_element_type %782, %int6_1026 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %int2_1027 = torch.constant.int 2
    %785 = torch.prim.ListConstruct %int2_1027 : (!torch.int) -> !torch.list<int>
    %int0_1028 = torch.constant.int 0
    %true_1029 = torch.constant.bool true
    // Variance (correction = 0) and mean over dim 2 with keepdim = true; result0 is the variance, result1 the mean.
    %result0_1030, %result1_1031 = torch.aten.var_mean.correction %784, %785, %int0_1028, %true_1029 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // eps is the constant 9.9999999999999995E-7.
    %float9.999990e-07_1032 = torch.constant.float 9.9999999999999995E-7
    %int1_1033 = torch.constant.int 1
    %786 = torch.aten.add.Scalar %result0_1030, %float9.999990e-07_1032, %int1_1033 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %787 = torch.aten.rsqrt %786 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    %int1_1034 = torch.constant.int 1
    // The subtraction takes the f16 tensor %782 and the f32 mean; the result type is f32.
    %788 = torch.aten.sub.Tensor %782, %result1_1031, %int1_1034 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %789 = torch.aten.mul.Tensor %788, %787 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Back to f16 before the scale and shift.
    %int5_1035 = torch.constant.int 5
    %790 = torch.prims.convert_element_type %789, %int5_1035 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %791 = torch.aten.mul.Tensor %783, %790 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_1036 = torch.constant.int 1
    %792 = torch.aten.add.Tensor %791, %567, %int1_1036 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer double_blocks.1.img_mlp.0 (3072 -> 12288) on %792, followed by GELU with the "tanh" approximation.
    // Drop the leading unit dim for mm.
    %int4096_1037 = torch.constant.int 4096
    %int3072_1038 = torch.constant.int 3072
    %793 = torch.prim.ListConstruct %int4096_1037, %int3072_1038 : (!torch.int, !torch.int) -> !torch.list<int>
    %794 = torch.aten.view %792, %793 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // The weight is loaded as [12288,3072] and transposed to [3072,12288].
    %__auto.sampler.double_blocks.1.img_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.1.img_mlp.0.weight : tensor<12288x3072xf16>
    %795 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.img_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_1039 = torch.constant.int 0
    %int1_1040 = torch.constant.int 1
    %796 = torch.aten.transpose.int %795, %int0_1039, %int1_1040 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.1.img_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.1.img_mlp.0.bias : tensor<12288xf16>
    %797 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.img_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Upcast bias, input and weight to f32; the matmul and bias add run in f32.
    %int6_1041 = torch.constant.int 6
    %798 = torch.prims.convert_element_type %797, %int6_1041 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_1042 = torch.constant.int 6
    %799 = torch.prims.convert_element_type %794, %int6_1042 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_1043 = torch.constant.int 6
    %800 = torch.prims.convert_element_type %796, %int6_1043 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %801 = torch.aten.mm %799, %800 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[4096,12288],f32>
    // Both mul.Scalar ops multiply by the constant 1, and the add uses alpha = 1.
    %int1_1044 = torch.constant.int 1
    %802 = torch.aten.mul.Scalar %801, %int1_1044 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int1_1045 = torch.constant.int 1
    %803 = torch.aten.mul.Scalar %798, %int1_1045 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_1046 = torch.constant.int 1
    %804 = torch.aten.add.Tensor %802, %803, %int1_1046 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    // Downcast to f16 and restore the leading unit dim before the activation.
    %int5_1047 = torch.constant.int 5
    %805 = torch.prims.convert_element_type %804, %int5_1047 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f16>
    %int1_1048 = torch.constant.int 1
    %int4096_1049 = torch.constant.int 4096
    %int12288_1050 = torch.constant.int 12288
    %806 = torch.prim.ListConstruct %int1_1048, %int4096_1049, %int12288_1050 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %807 = torch.aten.view %805, %806 : !torch.vtensor<[4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,12288],f16>
    // The GELU is applied to the f16 tensor.
    %str_1051 = torch.constant.str "tanh"
    %808 = torch.aten.gelu %807, %str_1051 : !torch.vtensor<[1,4096,12288],f16>, !torch.str -> !torch.vtensor<[1,4096,12288],f16>
    // Linear layer double_blocks.1.img_mlp.2 (12288 -> 3072) on the GELU output (%808), then a scaled residual add onto %782.
    // Drop the leading unit dim for mm.
    %int4096_1052 = torch.constant.int 4096
    %int12288_1053 = torch.constant.int 12288
    %809 = torch.prim.ListConstruct %int4096_1052, %int12288_1053 : (!torch.int, !torch.int) -> !torch.list<int>
    %810 = torch.aten.view %808, %809 : !torch.vtensor<[1,4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[4096,12288],f16>
    // The weight is loaded as [3072,12288] and transposed to [12288,3072].
    %__auto.sampler.double_blocks.1.img_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.1.img_mlp.2.weight : tensor<3072x12288xf16>
    %811 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.img_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_1054 = torch.constant.int 0
    %int1_1055 = torch.constant.int 1
    %812 = torch.aten.transpose.int %811, %int0_1054, %int1_1055 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.1.img_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.1.img_mlp.2.bias : tensor<3072xf16>
    %813 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.img_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, input and weight to f32; the matmul and bias add run in f32.
    %int6_1056 = torch.constant.int 6
    %814 = torch.prims.convert_element_type %813, %int6_1056 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_1057 = torch.constant.int 6
    %815 = torch.prims.convert_element_type %810, %int6_1057 : !torch.vtensor<[4096,12288],f16>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int6_1058 = torch.constant.int 6
    %816 = torch.prims.convert_element_type %812, %int6_1058 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %817 = torch.aten.mm %815, %816 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Both mul.Scalar ops multiply by the constant 1, and the add uses alpha = 1.
    %int1_1059 = torch.constant.int 1
    %818 = torch.aten.mul.Scalar %817, %int1_1059 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_1060 = torch.constant.int 1
    %819 = torch.aten.mul.Scalar %814, %int1_1060 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_1061 = torch.constant.int 1
    %820 = torch.aten.add.Tensor %818, %819, %int1_1061 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    // Downcast to f16 and restore the leading unit dim.
    %int5_1062 = torch.constant.int 5
    %821 = torch.prims.convert_element_type %820, %int5_1062 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_1063 = torch.constant.int 1
    %int4096_1064 = torch.constant.int 4096
    %int3072_1065 = torch.constant.int 3072
    %822 = torch.prim.ListConstruct %int1_1063, %int4096_1064, %int3072_1065 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %823 = torch.aten.view %821, %822 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Scale by %569 ([1,1,3072], broadcast over rows) and add to %782, the result of the img_attn.proj residual add.
    // NOTE(review): %569 is presumably a modulation gate defined before this point - confirm at its producer.
    %824 = torch.aten.mul.Tensor %569, %823 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_1066 = torch.constant.int 1
    %825 = torch.aten.add.Tensor %782, %824, %int1_1066 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer double_blocks.1.txt_attn.proj applied to the 512-row attention slice (%763), then a scaled residual add.
    // The op sequence mirrors the img_attn.proj path, with 512 rows instead of 4096.
    // Drop the leading unit dim for mm.
    %int512_1067 = torch.constant.int 512
    %int3072_1068 = torch.constant.int 3072
    %826 = torch.prim.ListConstruct %int512_1067, %int3072_1068 : (!torch.int, !torch.int) -> !torch.list<int>
    %827 = torch.aten.view %763, %826 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // The weight is loaded as [3072,3072] and transposed before the matmul.
    %__auto.sampler.double_blocks.1.txt_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.1.txt_attn.proj.weight : tensor<3072x3072xf16>
    %828 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.txt_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_1069 = torch.constant.int 0
    %int1_1070 = torch.constant.int 1
    %829 = torch.aten.transpose.int %828, %int0_1069, %int1_1070 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.1.txt_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.1.txt_attn.proj.bias : tensor<3072xf16>
    %830 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.txt_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, input and weight to f32; the matmul and bias add run in f32.
    %int6_1071 = torch.constant.int 6
    %831 = torch.prims.convert_element_type %830, %int6_1071 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_1072 = torch.constant.int 6
    %832 = torch.prims.convert_element_type %827, %int6_1072 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_1073 = torch.constant.int 6
    %833 = torch.prims.convert_element_type %829, %int6_1073 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %834 = torch.aten.mm %832, %833 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Both mul.Scalar ops multiply by the constant 1, and the add uses alpha = 1.
    %int1_1074 = torch.constant.int 1
    %835 = torch.aten.mul.Scalar %834, %int1_1074 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_1075 = torch.constant.int 1
    %836 = torch.aten.mul.Scalar %831, %int1_1075 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_1076 = torch.constant.int 1
    %837 = torch.aten.add.Tensor %835, %836, %int1_1076 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    // Downcast to f16 and restore the leading unit dim.
    %int5_1077 = torch.constant.int 5
    %838 = torch.prims.convert_element_type %837, %int5_1077 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_1078 = torch.constant.int 1
    %int512_1079 = torch.constant.int 512
    %int3072_1080 = torch.constant.int 3072
    %839 = torch.prim.ListConstruct %int1_1078, %int512_1079, %int3072_1080 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %840 = torch.aten.view %838, %839 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Scale by %587 ([1,1,3072], broadcast over the 512 rows) and add to %548; both come from before this point.
    // NOTE(review): %587 is presumably a modulation gate and %548 the text residual stream - confirm at their producers.
    %841 = torch.aten.mul.Tensor %587, %840 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_1081 = torch.constant.int 1
    %842 = torch.aten.add.Tensor %548, %841, %int1_1081 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Normalise %842 over its last axis, then apply an elementwise scale and shift:
    //     %852 = (1 + %589) * ((%842 - mean) * rsqrt(var + eps)) + %588
    // No learned normalisation weight or bias appears in these ops.
    // NOTE(review): %589 and %588 ([1,1,3072], from before this point) are presumably modulation scale and shift - confirm.
    %int1_1082 = torch.constant.int 1
    %int1_1083 = torch.constant.int 1
    %843 = torch.aten.add.Scalar %589, %int1_1082, %int1_1083 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Statistics are taken on an f32 copy of %842.
    %int6_1084 = torch.constant.int 6
    %844 = torch.prims.convert_element_type %842, %int6_1084 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %int2_1085 = torch.constant.int 2
    %845 = torch.prim.ListConstruct %int2_1085 : (!torch.int) -> !torch.list<int>
    %int0_1086 = torch.constant.int 0
    %true_1087 = torch.constant.bool true
    // Variance (correction = 0) and mean over dim 2 with keepdim = true; result0 is the variance, result1 the mean.
    %result0_1088, %result1_1089 = torch.aten.var_mean.correction %844, %845, %int0_1086, %true_1087 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // eps is the constant 9.9999999999999995E-7.
    %float9.999990e-07_1090 = torch.constant.float 9.9999999999999995E-7
    %int1_1091 = torch.constant.int 1
    %846 = torch.aten.add.Scalar %result0_1088, %float9.999990e-07_1090, %int1_1091 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %847 = torch.aten.rsqrt %846 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    %int1_1092 = torch.constant.int 1
    // The subtraction takes the f16 tensor %842 and the f32 mean; the result type is f32.
    %848 = torch.aten.sub.Tensor %842, %result1_1089, %int1_1092 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %849 = torch.aten.mul.Tensor %848, %847 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    // Back to f16 before the scale and shift.
    %int5_1093 = torch.constant.int 5
    %850 = torch.prims.convert_element_type %849, %int5_1093 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %851 = torch.aten.mul.Tensor %843, %850 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_1094 = torch.constant.int 1
    %852 = torch.aten.add.Tensor %851, %588, %int1_1094 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Linear layer double_blocks.1.txt_mlp.0 (3072 -> 12288) on %852, followed by GELU with the "tanh" approximation.
    // Drop the leading unit dim for mm.
    %int512_1095 = torch.constant.int 512
    %int3072_1096 = torch.constant.int 3072
    %853 = torch.prim.ListConstruct %int512_1095, %int3072_1096 : (!torch.int, !torch.int) -> !torch.list<int>
    %854 = torch.aten.view %852, %853 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // The weight is loaded as [12288,3072] and transposed to [3072,12288].
    %__auto.sampler.double_blocks.1.txt_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.1.txt_mlp.0.weight : tensor<12288x3072xf16>
    %855 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.txt_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_1097 = torch.constant.int 0
    %int1_1098 = torch.constant.int 1
    %856 = torch.aten.transpose.int %855, %int0_1097, %int1_1098 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.1.txt_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.1.txt_mlp.0.bias : tensor<12288xf16>
    %857 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.txt_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Upcast bias, input and weight to f32; the matmul and bias add run in f32.
    %int6_1099 = torch.constant.int 6
    %858 = torch.prims.convert_element_type %857, %int6_1099 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_1100 = torch.constant.int 6
    %859 = torch.prims.convert_element_type %854, %int6_1100 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_1101 = torch.constant.int 6
    %860 = torch.prims.convert_element_type %856, %int6_1101 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %861 = torch.aten.mm %859, %860 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[512,12288],f32>
    // Both mul.Scalar ops multiply by the constant 1, and the add uses alpha = 1.
    %int1_1102 = torch.constant.int 1
    %862 = torch.aten.mul.Scalar %861, %int1_1102 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int1_1103 = torch.constant.int 1
    %863 = torch.aten.mul.Scalar %858, %int1_1103 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_1104 = torch.constant.int 1
    %864 = torch.aten.add.Tensor %862, %863, %int1_1104 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    // Downcast to f16 and restore the leading unit dim before the activation.
    %int5_1105 = torch.constant.int 5
    %865 = torch.prims.convert_element_type %864, %int5_1105 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f16>
    %int1_1106 = torch.constant.int 1
    %int512_1107 = torch.constant.int 512
    %int12288_1108 = torch.constant.int 12288
    %866 = torch.prim.ListConstruct %int1_1106, %int512_1107, %int12288_1108 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %867 = torch.aten.view %865, %866 : !torch.vtensor<[512,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,512,12288],f16>
    // The GELU is applied to the f16 tensor.
    %str_1109 = torch.constant.str "tanh"
    %868 = torch.aten.gelu %867, %str_1109 : !torch.vtensor<[1,512,12288],f16>, !torch.str -> !torch.vtensor<[1,512,12288],f16>
    // Linear layer double_blocks.1.txt_mlp.2 (12288 -> 3072) on the GELU output (%868).
    // The result %881 is a [512,3072] f16 tensor.
    // Drop the leading unit dim for mm.
    %int512_1110 = torch.constant.int 512
    %int12288_1111 = torch.constant.int 12288
    %869 = torch.prim.ListConstruct %int512_1110, %int12288_1111 : (!torch.int, !torch.int) -> !torch.list<int>
    %870 = torch.aten.view %868, %869 : !torch.vtensor<[1,512,12288],f16>, !torch.list<int> -> !torch.vtensor<[512,12288],f16>
    // The weight is loaded as [3072,12288] and transposed to [12288,3072].
    %__auto.sampler.double_blocks.1.txt_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.1.txt_mlp.2.weight : tensor<3072x12288xf16>
    %871 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.txt_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_1112 = torch.constant.int 0
    %int1_1113 = torch.constant.int 1
    %872 = torch.aten.transpose.int %871, %int0_1112, %int1_1113 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.1.txt_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.1.txt_mlp.2.bias : tensor<3072xf16>
    %873 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.txt_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, input and weight to f32; the matmul and bias add run in f32.
    %int6_1114 = torch.constant.int 6
    %874 = torch.prims.convert_element_type %873, %int6_1114 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_1115 = torch.constant.int 6
    %875 = torch.prims.convert_element_type %870, %int6_1115 : !torch.vtensor<[512,12288],f16>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int6_1116 = torch.constant.int 6
    %876 = torch.prims.convert_element_type %872, %int6_1116 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %877 = torch.aten.mm %875, %876 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Both mul.Scalar ops multiply by the constant 1, and the add uses alpha = 1.
    %int1_1117 = torch.constant.int 1
    %878 = torch.aten.mul.Scalar %877, %int1_1117 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_1118 = torch.constant.int 1
    %879 = torch.aten.mul.Scalar %874, %int1_1118 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_1119 = torch.constant.int 1
    %880 = torch.aten.add.Tensor %878, %879, %int1_1119 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    // Downcast the f32 result to f16.
    %int5_1120 = torch.constant.int 5
    %881 = torch.prims.convert_element_type %880, %int5_1120 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_1121 = torch.constant.int 1
    %int512_1122 = torch.constant.int 512
    %int3072_1123 = torch.constant.int 3072
    %882 = torch.prim.ListConstruct %int1_1121, %int512_1122, %int3072_1123 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %883 = torch.aten.view %881, %882 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    %884 = torch.aten.mul.Tensor %590, %883 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_1124 = torch.constant.int 1
    %885 = torch.aten.add.Tensor %842, %884, %int1_1124 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Modulation parameters from double_blocks.2.img_mod.lin:
    // silu(%119) ([1,3072]) is projected to [1,18432] and split along the last
    // dimension into six [1,1,3072] slices (%901..%906). %119 is defined above this chunk.
    // NOTE(review): the six slices presumably form two (shift, scale, gate) triples;
    // within this chunk only %901 (added) and %902 (used as 1 + %902 multiplier) are consumed.
    %886 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the [18432,3072] weight and transpose it to [3072,18432].
    %__auto.sampler.double_blocks.2.img_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.2.img_mod.lin.weight : tensor<18432x3072xf16>
    %887 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.img_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_1125 = torch.constant.int 0
    %int1_1126 = torch.constant.int 1
    %888 = torch.aten.transpose.int %887, %int0_1125, %int1_1126 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    // Load the [18432] bias.
    %__auto.sampler.double_blocks.2.img_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.2.img_mod.lin.bias : tensor<18432xf16>
    %889 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.img_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Upcast bias, activation and weight to f32 for the matmul and bias add.
    %int6_1127 = torch.constant.int 6
    %890 = torch.prims.convert_element_type %889, %int6_1127 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_1128 = torch.constant.int 6
    %891 = torch.prims.convert_element_type %886, %int6_1128 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_1129 = torch.constant.int 6
    %892 = torch.prims.convert_element_type %888, %int6_1129 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %893 = torch.aten.mm %891, %892 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Multiplications by the constant 1 leave the values unchanged; then bias add.
    %int1_1130 = torch.constant.int 1
    %894 = torch.aten.mul.Scalar %893, %int1_1130 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_1131 = torch.constant.int 1
    %895 = torch.aten.mul.Scalar %890, %int1_1131 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_1132 = torch.constant.int 1
    %896 = torch.aten.add.Tensor %894, %895, %int1_1132 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    // Cast the projection back to f16.
    %int5_1133 = torch.constant.int 5
    %897 = torch.prims.convert_element_type %896, %int5_1133 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): shape is unchanged.
    %int0_1134 = torch.constant.int 0
    %int0_1135 = torch.constant.int 0
    %int9223372036854775807_1136 = torch.constant.int 9223372036854775807
    %int1_1137 = torch.constant.int 1
    %898 = torch.aten.slice.Tensor %897, %int0_1134, %int0_1135, %int9223372036854775807_1136, %int1_1137 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a unit dimension at position 1: [1,18432] -> [1,1,18432].
    %int1_1138 = torch.constant.int 1
    %899 = torch.aten.unsqueeze %898, %int1_1138 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Full-range slice on dim 2: shape is unchanged.
    %int2_1139 = torch.constant.int 2
    %int0_1140 = torch.constant.int 0
    %int9223372036854775807_1141 = torch.constant.int 9223372036854775807
    %int1_1142 = torch.constant.int 1
    %900 = torch.aten.slice.Tensor %899, %int2_1139, %int0_1140, %int9223372036854775807_1141, %int1_1142 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Slice 0: columns [0, 3072) of the last dimension.
    %int-1_1143 = torch.constant.int -1
    %int0_1144 = torch.constant.int 0
    %int3072_1145 = torch.constant.int 3072
    %int1_1146 = torch.constant.int 1
    %901 = torch.aten.slice.Tensor %900, %int-1_1143, %int0_1144, %int3072_1145, %int1_1146 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice 1: columns [3072, 6144).
    %int-1_1147 = torch.constant.int -1
    %int3072_1148 = torch.constant.int 3072
    %int6144_1149 = torch.constant.int 6144
    %int1_1150 = torch.constant.int 1
    %902 = torch.aten.slice.Tensor %900, %int-1_1147, %int3072_1148, %int6144_1149, %int1_1150 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice 2: columns [6144, 9216).
    %int-1_1151 = torch.constant.int -1
    %int6144_1152 = torch.constant.int 6144
    %int9216_1153 = torch.constant.int 9216
    %int1_1154 = torch.constant.int 1
    %903 = torch.aten.slice.Tensor %900, %int-1_1151, %int6144_1152, %int9216_1153, %int1_1154 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice 3: columns [9216, 12288).
    %int-1_1155 = torch.constant.int -1
    %int9216_1156 = torch.constant.int 9216
    %int12288_1157 = torch.constant.int 12288
    %int1_1158 = torch.constant.int 1
    %904 = torch.aten.slice.Tensor %900, %int-1_1155, %int9216_1156, %int12288_1157, %int1_1158 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice 4: columns [12288, 15360).
    %int-1_1159 = torch.constant.int -1
    %int12288_1160 = torch.constant.int 12288
    %int15360_1161 = torch.constant.int 15360
    %int1_1162 = torch.constant.int 1
    %905 = torch.aten.slice.Tensor %900, %int-1_1159, %int12288_1160, %int15360_1161, %int1_1162 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice 5: columns [15360, 18432).
    %int-1_1163 = torch.constant.int -1
    %int15360_1164 = torch.constant.int 15360
    %int18432_1165 = torch.constant.int 18432
    %int1_1166 = torch.constant.int 1
    %906 = torch.aten.slice.Tensor %900, %int-1_1163, %int15360_1164, %int18432_1165, %int1_1166 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Modulation parameters from double_blocks.2.txt_mod.lin:
    // silu(%119) ([1,3072]) is projected to [1,18432] and split along the last
    // dimension into six [1,1,3072] slices (%922..%927). %119 is defined above this chunk.
    // NOTE(review): the six slices presumably form two (shift, scale, gate) triples;
    // within this chunk only %922 (added) and %923 (used as 1 + %923 multiplier) are consumed.
    %907 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the [18432,3072] weight and transpose it to [3072,18432].
    %__auto.sampler.double_blocks.2.txt_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.2.txt_mod.lin.weight : tensor<18432x3072xf16>
    %908 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.txt_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_1167 = torch.constant.int 0
    %int1_1168 = torch.constant.int 1
    %909 = torch.aten.transpose.int %908, %int0_1167, %int1_1168 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    // Load the [18432] bias.
    %__auto.sampler.double_blocks.2.txt_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.2.txt_mod.lin.bias : tensor<18432xf16>
    %910 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.txt_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Upcast bias, activation and weight to f32 for the matmul and bias add.
    %int6_1169 = torch.constant.int 6
    %911 = torch.prims.convert_element_type %910, %int6_1169 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_1170 = torch.constant.int 6
    %912 = torch.prims.convert_element_type %907, %int6_1170 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_1171 = torch.constant.int 6
    %913 = torch.prims.convert_element_type %909, %int6_1171 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %914 = torch.aten.mm %912, %913 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Multiplications by the constant 1 leave the values unchanged; then bias add.
    %int1_1172 = torch.constant.int 1
    %915 = torch.aten.mul.Scalar %914, %int1_1172 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_1173 = torch.constant.int 1
    %916 = torch.aten.mul.Scalar %911, %int1_1173 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_1174 = torch.constant.int 1
    %917 = torch.aten.add.Tensor %915, %916, %int1_1174 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    // Cast the projection back to f16.
    %int5_1175 = torch.constant.int 5
    %918 = torch.prims.convert_element_type %917, %int5_1175 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): shape is unchanged.
    %int0_1176 = torch.constant.int 0
    %int0_1177 = torch.constant.int 0
    %int9223372036854775807_1178 = torch.constant.int 9223372036854775807
    %int1_1179 = torch.constant.int 1
    %919 = torch.aten.slice.Tensor %918, %int0_1176, %int0_1177, %int9223372036854775807_1178, %int1_1179 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a unit dimension at position 1: [1,18432] -> [1,1,18432].
    %int1_1180 = torch.constant.int 1
    %920 = torch.aten.unsqueeze %919, %int1_1180 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Full-range slice on dim 2: shape is unchanged.
    %int2_1181 = torch.constant.int 2
    %int0_1182 = torch.constant.int 0
    %int9223372036854775807_1183 = torch.constant.int 9223372036854775807
    %int1_1184 = torch.constant.int 1
    %921 = torch.aten.slice.Tensor %920, %int2_1181, %int0_1182, %int9223372036854775807_1183, %int1_1184 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Slice 0: columns [0, 3072) of the last dimension.
    %int-1_1185 = torch.constant.int -1
    %int0_1186 = torch.constant.int 0
    %int3072_1187 = torch.constant.int 3072
    %int1_1188 = torch.constant.int 1
    %922 = torch.aten.slice.Tensor %921, %int-1_1185, %int0_1186, %int3072_1187, %int1_1188 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice 1: columns [3072, 6144).
    %int-1_1189 = torch.constant.int -1
    %int3072_1190 = torch.constant.int 3072
    %int6144_1191 = torch.constant.int 6144
    %int1_1192 = torch.constant.int 1
    %923 = torch.aten.slice.Tensor %921, %int-1_1189, %int3072_1190, %int6144_1191, %int1_1192 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice 2: columns [6144, 9216).
    %int-1_1193 = torch.constant.int -1
    %int6144_1194 = torch.constant.int 6144
    %int9216_1195 = torch.constant.int 9216
    %int1_1196 = torch.constant.int 1
    %924 = torch.aten.slice.Tensor %921, %int-1_1193, %int6144_1194, %int9216_1195, %int1_1196 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice 3: columns [9216, 12288).
    %int-1_1197 = torch.constant.int -1
    %int9216_1198 = torch.constant.int 9216
    %int12288_1199 = torch.constant.int 12288
    %int1_1200 = torch.constant.int 1
    %925 = torch.aten.slice.Tensor %921, %int-1_1197, %int9216_1198, %int12288_1199, %int1_1200 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice 4: columns [12288, 15360).
    %int-1_1201 = torch.constant.int -1
    %int12288_1202 = torch.constant.int 12288
    %int15360_1203 = torch.constant.int 15360
    %int1_1204 = torch.constant.int 1
    %926 = torch.aten.slice.Tensor %921, %int-1_1201, %int12288_1202, %int15360_1203, %int1_1204 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice 5: columns [15360, 18432).
    %int-1_1205 = torch.constant.int -1
    %int15360_1206 = torch.constant.int 15360
    %int18432_1207 = torch.constant.int 18432
    %int1_1208 = torch.constant.int 1
    %927 = torch.aten.slice.Tensor %921, %int-1_1205, %int15360_1206, %int18432_1207, %int1_1208 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalise %825 ([1,4096,3072], defined above this chunk) over its last dimension
    // without learned scale or bias, then modulate: (1 + %902) * normed + %901.
    // Statistics are computed in f32 on an upcast copy of %825.
    %int6_1209 = torch.constant.int 6
    %928 = torch.prims.convert_element_type %825, %int6_1209 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // Variance (%result0) and mean (%result1) over dim 2 with correction 0 and
    // keepdim = true, giving [1,4096,1] results.
    %int2_1210 = torch.constant.int 2
    %929 = torch.prim.ListConstruct %int2_1210 : (!torch.int) -> !torch.list<int>
    %int0_1211 = torch.constant.int 0
    %true_1212 = torch.constant.bool true
    %result0_1213, %result1_1214 = torch.aten.var_mean.correction %928, %929, %int0_1211, %true_1212 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rstd = rsqrt(var + eps), with eps ~= 1e-6.
    %float9.999990e-07_1215 = torch.constant.float 9.9999999999999995E-7
    %int1_1216 = torch.constant.int 1
    %930 = torch.aten.add.Scalar %result0_1213, %float9.999990e-07_1215, %int1_1216 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %931 = torch.aten.rsqrt %930 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // (x - mean) * rstd. The subtraction takes the f16 %825 and the f32 mean and
    // produces an f32 result.
    %int1_1217 = torch.constant.int 1
    %932 = torch.aten.sub.Tensor %825, %result1_1214, %int1_1217 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %933 = torch.aten.mul.Tensor %932, %931 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Cast the normalised activations back to f16.
    %int5_1218 = torch.constant.int 5
    %934 = torch.prims.convert_element_type %933, %int5_1218 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Modulation in f16: multiply by (1 + %902), then add %901; both [1,1,3072]
    // operands broadcast over the 4096 positions.
    %int1_1219 = torch.constant.int 1
    %int1_1220 = torch.constant.int 1
    %935 = torch.aten.add.Scalar %902, %int1_1219, %int1_1220 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %936 = torch.aten.mul.Tensor %935, %934 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_1221 = torch.constant.int 1
    %937 = torch.aten.add.Tensor %936, %901, %int1_1221 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Flatten to 2-D ([4096,3072]) for the following matmul.
    %int4096_1222 = torch.constant.int 4096
    %int3072_1223 = torch.constant.int 3072
    %938 = torch.prim.ListConstruct %int4096_1222, %int3072_1223 : (!torch.int, !torch.int) -> !torch.list<int>
    %939 = torch.aten.view %937, %938 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Linear projection with double_blocks.2.img_attn.qkv: maps %939 ([4096,3072], f16)
    // to [4096,9216]. The matmul and bias add run in f32 and the result is cast back
    // to f16, reshaped to [1,4096,9216] and passed through a trace point named "img_qkv".
    // Load the [9216,3072] weight and transpose it to [3072,9216].
    %__auto.sampler.double_blocks.2.img_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.2.img_attn.qkv.weight : tensor<9216x3072xf16>
    %940 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.img_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_1224 = torch.constant.int 0
    %int1_1225 = torch.constant.int 1
    %941 = torch.aten.transpose.int %940, %int0_1224, %int1_1225 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    // Load the [9216] bias.
    %__auto.sampler.double_blocks.2.img_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.2.img_attn.qkv.bias : tensor<9216xf16>
    %942 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.img_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, input and transposed weight to f32.
    %int6_1226 = torch.constant.int 6
    %943 = torch.prims.convert_element_type %942, %int6_1226 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_1227 = torch.constant.int 6
    %944 = torch.prims.convert_element_type %939, %int6_1227 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_1228 = torch.constant.int 6
    %945 = torch.prims.convert_element_type %941, %int6_1228 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %946 = torch.aten.mm %944, %945 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[4096,9216],f32>
    // Multiplications by the constant 1 leave the values unchanged; then bias add.
    %int1_1229 = torch.constant.int 1
    %947 = torch.aten.mul.Scalar %946, %int1_1229 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int1_1230 = torch.constant.int 1
    %948 = torch.aten.mul.Scalar %943, %int1_1230 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_1231 = torch.constant.int 1
    %949 = torch.aten.add.Tensor %947, %948, %int1_1231 : !torch.vtensor<[4096,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    // Cast back to f16 and restore the leading unit dimension: [4096,9216] -> [1,4096,9216].
    %int5_1232 = torch.constant.int 5
    %950 = torch.prims.convert_element_type %949, %int5_1232 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f16>
    %int1_1233 = torch.constant.int 1
    %int4096_1234 = torch.constant.int 4096
    %int9216_1235 = torch.constant.int 9216
    %951 = torch.prim.ListConstruct %int1_1233, %int4096_1234, %int9216_1235 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %952 = torch.aten.view %950, %951 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,9216],f16>
    // Trace point: leave the torch type system, erase the static shape, query the
    // three dimensions, and hand the tensor to flow.tensor.trace under the key "img_qkv".
    %953 = torch_c.to_builtin_tensor %952 : !torch.vtensor<[1,4096,9216],f16> -> tensor<1x4096x9216xf16>
    %cast_1236 = tensor.cast %953 : tensor<1x4096x9216xf16> to tensor<?x?x?xf16>
    %c0_1237 = arith.constant 0 : index
    %dim_1238 = tensor.dim %cast_1236, %c0_1237 : tensor<?x?x?xf16>
    %c1_1239 = arith.constant 1 : index
    %dim_1240 = tensor.dim %cast_1236, %c1_1239 : tensor<?x?x?xf16>
    %c2_1241 = arith.constant 2 : index
    %dim_1242 = tensor.dim %cast_1236, %c2_1241 : tensor<?x?x?xf16>
    flow.tensor.trace "img_qkv" = [%cast_1236 : tensor<?x?x?xf16>{%dim_1238, %dim_1240, %dim_1242}]
    // Restore the static shape and return to a torch value tensor; %954 carries the
    // same data as %952.
    %cast_1243 = tensor.cast %cast_1236 : tensor<?x?x?xf16> to tensor<1x4096x9216xf16>
    %954 = torch_c.from_builtin_tensor %cast_1243 : tensor<1x4096x9216xf16> -> !torch.vtensor<[1,4096,9216],f16>
    // Split %954 ([1,4096,9216]) into [1,4096,3,24,128], permute to [3,1,24,4096,128],
    // take index 0 of the leading dimension, and normalise it over the last dimension:
    // x * rsqrt(mean(x^2) + eps), scaled by img_attn.norm.query_norm.scale.
    // NOTE(review): index 0 is presumably the query, with 24 heads of width 128 - the
    // parameter name supports this, but confirm against the model definition.
    %int1_1244 = torch.constant.int 1
    %int4096_1245 = torch.constant.int 4096
    %int3_1246 = torch.constant.int 3
    %int24_1247 = torch.constant.int 24
    %int128_1248 = torch.constant.int 128
    %955 = torch.prim.ListConstruct %int1_1244, %int4096_1245, %int3_1246, %int24_1247, %int128_1248 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %956 = torch.aten.view %954, %955 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    // Permutation (2, 0, 3, 1, 4) moves the size-3 axis to the front.
    %int2_1249 = torch.constant.int 2
    %int0_1250 = torch.constant.int 0
    %int3_1251 = torch.constant.int 3
    %int1_1252 = torch.constant.int 1
    %int4_1253 = torch.constant.int 4
    %957 = torch.prim.ListConstruct %int2_1249, %int0_1250, %int3_1251, %int1_1252, %int4_1253 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %958 = torch.aten.permute %956, %957 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // Select index 0 along dim 0 -> [1,24,4096,128].
    %int0_1254 = torch.constant.int 0
    %int0_1255 = torch.constant.int 0
    %959 = torch.aten.select.int %958, %int0_1254, %int0_1255 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Upcast to f32 and compute the mean of squares over the last dim (keepdim = true).
    %int6_1256 = torch.constant.int 6
    %960 = torch.prims.convert_element_type %959, %int6_1256 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_1257 = torch.constant.int 2
    %961 = torch.aten.pow.Tensor_Scalar %960, %int2_1257 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_1258 = torch.constant.int -1
    %962 = torch.prim.ListConstruct %int-1_1258 : (!torch.int) -> !torch.list<int>
    %true_1259 = torch.constant.bool true
    %none_1260 = torch.constant.none
    %963 = torch.aten.mean.dim %961, %962, %true_1259, %none_1260 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    // rsqrt(mean_sq + eps), eps ~= 1e-6, then scale the f32 input by it.
    %float9.999990e-07_1261 = torch.constant.float 9.9999999999999995E-7
    %int1_1262 = torch.constant.int 1
    %964 = torch.aten.add.Scalar %963, %float9.999990e-07_1261, %int1_1262 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %965 = torch.aten.rsqrt %964 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %966 = torch.aten.mul.Tensor %960, %965 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Cast back to f16 and apply the learned [128] scale (broadcast over leading dims).
    %int5_1263 = torch.constant.int 5
    %967 = torch.prims.convert_element_type %966, %int5_1263 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.2.img_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.2.img_attn.norm.query_norm.scale : tensor<128xf16>
    %968 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.img_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %969 = torch.aten.mul.Tensor %967, %968 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // Repeat the [1,4096,3,24,128] view and (2, 0, 3, 1, 4) permute of %954, take index 1
    // of the leading dimension, and normalise it over the last dimension:
    // x * rsqrt(mean(x^2) + eps), scaled by img_attn.norm.key_norm.scale.
    // NOTE(review): index 1 is presumably the key - the parameter name supports this,
    // but confirm against the model definition.
    %int1_1264 = torch.constant.int 1
    %int4096_1265 = torch.constant.int 4096
    %int3_1266 = torch.constant.int 3
    %int24_1267 = torch.constant.int 24
    %int128_1268 = torch.constant.int 128
    %970 = torch.prim.ListConstruct %int1_1264, %int4096_1265, %int3_1266, %int24_1267, %int128_1268 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %971 = torch.aten.view %954, %970 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    // Permutation (2, 0, 3, 1, 4) moves the size-3 axis to the front.
    %int2_1269 = torch.constant.int 2
    %int0_1270 = torch.constant.int 0
    %int3_1271 = torch.constant.int 3
    %int1_1272 = torch.constant.int 1
    %int4_1273 = torch.constant.int 4
    %972 = torch.prim.ListConstruct %int2_1269, %int0_1270, %int3_1271, %int1_1272, %int4_1273 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %973 = torch.aten.permute %971, %972 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // Select index 1 along dim 0 -> [1,24,4096,128].
    %int0_1274 = torch.constant.int 0
    %int1_1275 = torch.constant.int 1
    %974 = torch.aten.select.int %973, %int0_1274, %int1_1275 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Upcast to f32 and compute the mean of squares over the last dim (keepdim = true).
    %int6_1276 = torch.constant.int 6
    %975 = torch.prims.convert_element_type %974, %int6_1276 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_1277 = torch.constant.int 2
    %976 = torch.aten.pow.Tensor_Scalar %975, %int2_1277 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_1278 = torch.constant.int -1
    %977 = torch.prim.ListConstruct %int-1_1278 : (!torch.int) -> !torch.list<int>
    %true_1279 = torch.constant.bool true
    %none_1280 = torch.constant.none
    %978 = torch.aten.mean.dim %976, %977, %true_1279, %none_1280 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    // rsqrt(mean_sq + eps), eps ~= 1e-6, then scale the f32 input by it.
    %float9.999990e-07_1281 = torch.constant.float 9.9999999999999995E-7
    %int1_1282 = torch.constant.int 1
    %979 = torch.aten.add.Scalar %978, %float9.999990e-07_1281, %int1_1282 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %980 = torch.aten.rsqrt %979 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %981 = torch.aten.mul.Tensor %975, %980 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Cast back to f16 and apply the learned [128] scale (broadcast over leading dims).
    %int5_1283 = torch.constant.int 5
    %982 = torch.prims.convert_element_type %981, %int5_1283 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.2.img_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.2.img_attn.norm.key_norm.scale : tensor<128xf16>
    %983 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.img_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %984 = torch.aten.mul.Tensor %982, %983 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // Element-type conversions of %969 and %984 whose source and result types are both
    // f16, so the element type does not change.
    %int5_1284 = torch.constant.int 5
    %985 = torch.prims.convert_element_type %969, %int5_1284 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_1285 = torch.constant.int 5
    %986 = torch.prims.convert_element_type %984, %int5_1285 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Normalise %885 ([1,512,3072]) over its last dimension without learned scale or
    // bias, then modulate: (1 + %923) * normed + %922.
    // Statistics are computed in f32 on an upcast copy of %885.
    %int6_1286 = torch.constant.int 6
    %987 = torch.prims.convert_element_type %885, %int6_1286 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // Variance (%result0) and mean (%result1) over dim 2 with correction 0 and
    // keepdim = true, giving [1,512,1] results.
    %int2_1287 = torch.constant.int 2
    %988 = torch.prim.ListConstruct %int2_1287 : (!torch.int) -> !torch.list<int>
    %int0_1288 = torch.constant.int 0
    %true_1289 = torch.constant.bool true
    %result0_1290, %result1_1291 = torch.aten.var_mean.correction %987, %988, %int0_1288, %true_1289 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rstd = rsqrt(var + eps), with eps ~= 1e-6.
    %float9.999990e-07_1292 = torch.constant.float 9.9999999999999995E-7
    %int1_1293 = torch.constant.int 1
    %989 = torch.aten.add.Scalar %result0_1290, %float9.999990e-07_1292, %int1_1293 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %990 = torch.aten.rsqrt %989 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // (x - mean) * rstd. The subtraction takes the f16 %885 and the f32 mean and
    // produces an f32 result.
    %int1_1294 = torch.constant.int 1
    %991 = torch.aten.sub.Tensor %885, %result1_1291, %int1_1294 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %992 = torch.aten.mul.Tensor %991, %990 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    // Cast the normalised activations back to f16.
    %int5_1295 = torch.constant.int 5
    %993 = torch.prims.convert_element_type %992, %int5_1295 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Modulation in f16: multiply by (1 + %923), then add %922; both [1,1,3072]
    // operands broadcast over the 512 positions.
    %int1_1296 = torch.constant.int 1
    %int1_1297 = torch.constant.int 1
    %994 = torch.aten.add.Scalar %923, %int1_1296, %int1_1297 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %995 = torch.aten.mul.Tensor %994, %993 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_1298 = torch.constant.int 1
    %996 = torch.aten.add.Tensor %995, %922, %int1_1298 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Flatten to 2-D ([512,3072]) for the following matmul.
    %int512_1299 = torch.constant.int 512
    %int3072_1300 = torch.constant.int 3072
    %997 = torch.prim.ListConstruct %int512_1299, %int3072_1300 : (!torch.int, !torch.int) -> !torch.list<int>
    %998 = torch.aten.view %996, %997 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Linear projection with double_blocks.2.txt_attn.qkv: maps %998 ([512,3072], f16)
    // to [512,9216]. The matmul and bias add run in f32 and the result is cast back
    // to f16, reshaped to [1,512,9216] and passed through a trace point named "txt_qkv".
    // Load the [9216,3072] weight and transpose it to [3072,9216].
    %__auto.sampler.double_blocks.2.txt_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.2.txt_attn.qkv.weight : tensor<9216x3072xf16>
    %999 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.txt_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_1301 = torch.constant.int 0
    %int1_1302 = torch.constant.int 1
    %1000 = torch.aten.transpose.int %999, %int0_1301, %int1_1302 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    // Load the [9216] bias.
    %__auto.sampler.double_blocks.2.txt_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.2.txt_attn.qkv.bias : tensor<9216xf16>
    %1001 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.txt_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, input and transposed weight to f32.
    %int6_1303 = torch.constant.int 6
    %1002 = torch.prims.convert_element_type %1001, %int6_1303 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_1304 = torch.constant.int 6
    %1003 = torch.prims.convert_element_type %998, %int6_1304 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_1305 = torch.constant.int 6
    %1004 = torch.prims.convert_element_type %1000, %int6_1305 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %1005 = torch.aten.mm %1003, %1004 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[512,9216],f32>
    // Multiplications by the constant 1 leave the values unchanged; then bias add.
    %int1_1306 = torch.constant.int 1
    %1006 = torch.aten.mul.Scalar %1005, %int1_1306 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int1_1307 = torch.constant.int 1
    %1007 = torch.aten.mul.Scalar %1002, %int1_1307 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_1308 = torch.constant.int 1
    %1008 = torch.aten.add.Tensor %1006, %1007, %int1_1308 : !torch.vtensor<[512,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    // Cast back to f16 and restore the leading unit dimension: [512,9216] -> [1,512,9216].
    %int5_1309 = torch.constant.int 5
    %1009 = torch.prims.convert_element_type %1008, %int5_1309 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f16>
    %int1_1310 = torch.constant.int 1
    %int512_1311 = torch.constant.int 512
    %int9216_1312 = torch.constant.int 9216
    %1010 = torch.prim.ListConstruct %int1_1310, %int512_1311, %int9216_1312 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1011 = torch.aten.view %1009, %1010 : !torch.vtensor<[512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,9216],f16>
    // Trace point: leave the torch type system, erase the static shape, query the
    // three dimensions, and hand the tensor to flow.tensor.trace under the key "txt_qkv".
    %1012 = torch_c.to_builtin_tensor %1011 : !torch.vtensor<[1,512,9216],f16> -> tensor<1x512x9216xf16>
    %cast_1313 = tensor.cast %1012 : tensor<1x512x9216xf16> to tensor<?x?x?xf16>
    %c0_1314 = arith.constant 0 : index
    %dim_1315 = tensor.dim %cast_1313, %c0_1314 : tensor<?x?x?xf16>
    %c1_1316 = arith.constant 1 : index
    %dim_1317 = tensor.dim %cast_1313, %c1_1316 : tensor<?x?x?xf16>
    %c2_1318 = arith.constant 2 : index
    %dim_1319 = tensor.dim %cast_1313, %c2_1318 : tensor<?x?x?xf16>
    flow.tensor.trace "txt_qkv" = [%cast_1313 : tensor<?x?x?xf16>{%dim_1315, %dim_1317, %dim_1319}]
    // Restore the static shape and return to a torch value tensor; %1013 carries the
    // same data as %1011.
    %cast_1320 = tensor.cast %cast_1313 : tensor<?x?x?xf16> to tensor<1x512x9216xf16>
    %1013 = torch_c.from_builtin_tensor %cast_1320 : tensor<1x512x9216xf16> -> !torch.vtensor<[1,512,9216],f16>
    // Slice 0 of the traced "txt_qkv" tensor (%1013), RMS-normalised over the last axis
    // and multiplied by the txt_attn query_norm scale.
    // Unpack the 9216 channels as (3, 24, 128): [1,512,9216] -> [1,512,3,24,128].
    %int1_1321 = torch.constant.int 1
    %int512_1322 = torch.constant.int 512
    %int3_1323 = torch.constant.int 3
    %int24_1324 = torch.constant.int 24
    %int128_1325 = torch.constant.int 128
    %1014 = torch.prim.ListConstruct %int1_1321, %int512_1322, %int3_1323, %int24_1324, %int128_1325 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1015 = torch.aten.view %1013, %1014 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    // Permute with order (2,0,3,1,4): the size-3 axis moves to the front -> [3,1,24,512,128].
    %int2_1326 = torch.constant.int 2
    %int0_1327 = torch.constant.int 0
    %int3_1328 = torch.constant.int 3
    %int1_1329 = torch.constant.int 1
    %int4_1330 = torch.constant.int 4
    %1016 = torch.prim.ListConstruct %int2_1326, %int0_1327, %int3_1328, %int1_1329, %int4_1330 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1017 = torch.aten.permute %1015, %1016 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // Take index 0 along dim 0 -> [1,24,512,128].
    %int0_1331 = torch.constant.int 0
    %int0_1332 = torch.constant.int 0
    %1018 = torch.aten.select.int %1017, %int0_1331, %int0_1332 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Normalisation is carried out in f32: x * rsqrt(mean(x^2, dim=-1, keepdim) + eps).
    %int6_1333 = torch.constant.int 6
    %1019 = torch.prims.convert_element_type %1018, %int6_1333 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_1334 = torch.constant.int 2
    %1020 = torch.aten.pow.Tensor_Scalar %1019, %int2_1334 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    // Mean over the last axis with keepdim = true and no dtype override (none).
    %int-1_1335 = torch.constant.int -1
    %1021 = torch.prim.ListConstruct %int-1_1335 : (!torch.int) -> !torch.list<int>
    %true_1336 = torch.constant.bool true
    %none_1337 = torch.constant.none
    %1022 = torch.aten.mean.dim %1020, %1021, %true_1336, %none_1337 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    // Epsilon (about 1e-6) is added before the reciprocal square root.
    %float9.999990e-07_1338 = torch.constant.float 9.9999999999999995E-7
    %int1_1339 = torch.constant.int 1
    %1023 = torch.aten.add.Scalar %1022, %float9.999990e-07_1338, %int1_1339 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %1024 = torch.aten.rsqrt %1023 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %1025 = torch.aten.mul.Tensor %1019, %1024 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    // Back to f16, then scale by the learned [128] vector (broadcast over the last axis).
    %int5_1340 = torch.constant.int 5
    %1026 = torch.prims.convert_element_type %1025, %int5_1340 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.2.txt_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.2.txt_attn.norm.query_norm.scale : tensor<128xf16>
    %1027 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.txt_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %1028 = torch.aten.mul.Tensor %1026, %1027 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // Slice 1 of the traced "txt_qkv" tensor (%1013), RMS-normalised over the last axis
    // and multiplied by the txt_attn key_norm scale. The view and permute of %1013 are
    // recomputed here rather than reusing the earlier identical results.
    // [1,512,9216] -> [1,512,3,24,128].
    %int1_1341 = torch.constant.int 1
    %int512_1342 = torch.constant.int 512
    %int3_1343 = torch.constant.int 3
    %int24_1344 = torch.constant.int 24
    %int128_1345 = torch.constant.int 128
    %1029 = torch.prim.ListConstruct %int1_1341, %int512_1342, %int3_1343, %int24_1344, %int128_1345 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1030 = torch.aten.view %1013, %1029 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    // Permute with order (2,0,3,1,4) -> [3,1,24,512,128].
    %int2_1346 = torch.constant.int 2
    %int0_1347 = torch.constant.int 0
    %int3_1348 = torch.constant.int 3
    %int1_1349 = torch.constant.int 1
    %int4_1350 = torch.constant.int 4
    %1031 = torch.prim.ListConstruct %int2_1346, %int0_1347, %int3_1348, %int1_1349, %int4_1350 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1032 = torch.aten.permute %1030, %1031 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // Take index 1 along dim 0 -> [1,24,512,128].
    %int0_1351 = torch.constant.int 0
    %int1_1352 = torch.constant.int 1
    %1033 = torch.aten.select.int %1032, %int0_1351, %int1_1352 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Normalisation in f32: x * rsqrt(mean(x^2, dim=-1, keepdim) + eps).
    %int6_1353 = torch.constant.int 6
    %1034 = torch.prims.convert_element_type %1033, %int6_1353 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_1354 = torch.constant.int 2
    %1035 = torch.aten.pow.Tensor_Scalar %1034, %int2_1354 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    // Mean over the last axis with keepdim = true.
    %int-1_1355 = torch.constant.int -1
    %1036 = torch.prim.ListConstruct %int-1_1355 : (!torch.int) -> !torch.list<int>
    %true_1356 = torch.constant.bool true
    %none_1357 = torch.constant.none
    %1037 = torch.aten.mean.dim %1035, %1036, %true_1356, %none_1357 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    // Epsilon (about 1e-6) is added before the reciprocal square root.
    %float9.999990e-07_1358 = torch.constant.float 9.9999999999999995E-7
    %int1_1359 = torch.constant.int 1
    %1038 = torch.aten.add.Scalar %1037, %float9.999990e-07_1358, %int1_1359 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %1039 = torch.aten.rsqrt %1038 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %1040 = torch.aten.mul.Tensor %1034, %1039 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    // Back to f16, then scale by the learned [128] vector (broadcast over the last axis).
    %int5_1360 = torch.constant.int 5
    %1041 = torch.prims.convert_element_type %1040, %int5_1360 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.2.txt_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.2.txt_attn.norm.key_norm.scale : tensor<128xf16>
    %1042 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.txt_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %1043 = torch.aten.mul.Tensor %1041, %1042 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // Build the joint query and key tensors by concatenating along dim 2.
    // The two converts below are f16 -> f16, i.e. the element type does not change.
    %int5_1361 = torch.constant.int 5
    %1044 = torch.prims.convert_element_type %1028, %int5_1361 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int5_1362 = torch.constant.int 5
    %1045 = torch.prims.convert_element_type %1043, %int5_1362 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // %985 and %986 ([1,24,4096,128] f16) are defined earlier in this function -
    // presumably the image-stream query and key; TODO confirm at their definitions.
    // The 512-row tensor is listed first, so it occupies positions 0..511 of the 4608.
    %1046 = torch.prim.ListConstruct %1044, %985 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_1363 = torch.constant.int 2
    %1047 = torch.aten.cat %1046, %int2_1363 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %1048 = torch.prim.ListConstruct %1045, %986 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_1364 = torch.constant.int 2
    %1049 = torch.aten.cat %1048, %int2_1364 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Build the joint value tensor: slice 2 of %1013 (512 rows) followed by slice 2 of
    // %954 (4096 rows), concatenated along dim 2. No normalisation is applied to either.
    // [1,512,9216] -> [1,512,3,24,128].
    %int1_1365 = torch.constant.int 1
    %int512_1366 = torch.constant.int 512
    %int3_1367 = torch.constant.int 3
    %int24_1368 = torch.constant.int 24
    %int128_1369 = torch.constant.int 128
    %1050 = torch.prim.ListConstruct %int1_1365, %int512_1366, %int3_1367, %int24_1368, %int128_1369 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1051 = torch.aten.view %1013, %1050 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    // Permute with order (2,0,3,1,4) -> [3,1,24,512,128].
    %int2_1370 = torch.constant.int 2
    %int0_1371 = torch.constant.int 0
    %int3_1372 = torch.constant.int 3
    %int1_1373 = torch.constant.int 1
    %int4_1374 = torch.constant.int 4
    %1052 = torch.prim.ListConstruct %int2_1370, %int0_1371, %int3_1372, %int1_1373, %int4_1374 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1053 = torch.aten.permute %1051, %1052 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // Take index 2 along dim 0 -> [1,24,512,128].
    %int0_1375 = torch.constant.int 0
    %int2_1376 = torch.constant.int 2
    %1054 = torch.aten.select.int %1053, %int0_1375, %int2_1376 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Same unpacking for %954 ([1,4096,9216] f16), which is defined earlier in this
    // function - presumably the image-stream packed QKV; TODO confirm at its definition.
    %int1_1377 = torch.constant.int 1
    %int4096_1378 = torch.constant.int 4096
    %int3_1379 = torch.constant.int 3
    %int24_1380 = torch.constant.int 24
    %int128_1381 = torch.constant.int 128
    %1055 = torch.prim.ListConstruct %int1_1377, %int4096_1378, %int3_1379, %int24_1380, %int128_1381 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1056 = torch.aten.view %954, %1055 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    // Permute with order (2,0,3,1,4) -> [3,1,24,4096,128].
    %int2_1382 = torch.constant.int 2
    %int0_1383 = torch.constant.int 0
    %int3_1384 = torch.constant.int 3
    %int1_1385 = torch.constant.int 1
    %int4_1386 = torch.constant.int 4
    %1057 = torch.prim.ListConstruct %int2_1382, %int0_1383, %int3_1384, %int1_1385, %int4_1386 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1058 = torch.aten.permute %1056, %1057 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // Take index 2 along dim 0 -> [1,24,4096,128].
    %int0_1387 = torch.constant.int 0
    %int2_1388 = torch.constant.int 2
    %1059 = torch.aten.select.int %1058, %int0_1387, %int2_1388 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Concatenate along dim 2 with the 512-row tensor first: 512 + 4096 = 4608.
    %1060 = torch.prim.ListConstruct %1054, %1059 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_1389 = torch.constant.int 2
    %1061 = torch.aten.cat %1060, %int2_1389 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Trace points for the three concatenated [1,24,4608,128] f16 tensors, labelled
    // "q", "k" and "v". Each follows the same pattern: convert to a builtin tensor, cast to
    // a fully dynamic shape, read all four dims for the trace op, trace, cast back to the
    // static shape and convert back to a torch tensor. The contents are not modified.
    // "q" <- %1047, result %1063.
    %1062 = torch_c.to_builtin_tensor %1047 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_1390 = tensor.cast %1062 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_1391 = arith.constant 0 : index
    %dim_1392 = tensor.dim %cast_1390, %c0_1391 : tensor<?x?x?x?xf16>
    %c1_1393 = arith.constant 1 : index
    %dim_1394 = tensor.dim %cast_1390, %c1_1393 : tensor<?x?x?x?xf16>
    %c2_1395 = arith.constant 2 : index
    %dim_1396 = tensor.dim %cast_1390, %c2_1395 : tensor<?x?x?x?xf16>
    %c3_1397 = arith.constant 3 : index
    %dim_1398 = tensor.dim %cast_1390, %c3_1397 : tensor<?x?x?x?xf16>
    flow.tensor.trace "q" = [%cast_1390 : tensor<?x?x?x?xf16>{%dim_1392, %dim_1394, %dim_1396, %dim_1398}]
    %cast_1399 = tensor.cast %cast_1390 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %1063 = torch_c.from_builtin_tensor %cast_1399 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // "k" <- %1049, result %1065.
    %1064 = torch_c.to_builtin_tensor %1049 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_1400 = tensor.cast %1064 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_1401 = arith.constant 0 : index
    %dim_1402 = tensor.dim %cast_1400, %c0_1401 : tensor<?x?x?x?xf16>
    %c1_1403 = arith.constant 1 : index
    %dim_1404 = tensor.dim %cast_1400, %c1_1403 : tensor<?x?x?x?xf16>
    %c2_1405 = arith.constant 2 : index
    %dim_1406 = tensor.dim %cast_1400, %c2_1405 : tensor<?x?x?x?xf16>
    %c3_1407 = arith.constant 3 : index
    %dim_1408 = tensor.dim %cast_1400, %c3_1407 : tensor<?x?x?x?xf16>
    flow.tensor.trace "k" = [%cast_1400 : tensor<?x?x?x?xf16>{%dim_1402, %dim_1404, %dim_1406, %dim_1408}]
    %cast_1409 = tensor.cast %cast_1400 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %1065 = torch_c.from_builtin_tensor %cast_1409 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // "v" <- %1061, result %1067.
    %1066 = torch_c.to_builtin_tensor %1061 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_1410 = tensor.cast %1066 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_1411 = arith.constant 0 : index
    %dim_1412 = tensor.dim %cast_1410, %c0_1411 : tensor<?x?x?x?xf16>
    %c1_1413 = arith.constant 1 : index
    %dim_1414 = tensor.dim %cast_1410, %c1_1413 : tensor<?x?x?x?xf16>
    %c2_1415 = arith.constant 2 : index
    %dim_1416 = tensor.dim %cast_1410, %c2_1415 : tensor<?x?x?x?xf16>
    %c3_1417 = arith.constant 3 : index
    %dim_1418 = tensor.dim %cast_1410, %c3_1417 : tensor<?x?x?x?xf16>
    flow.tensor.trace "v" = [%cast_1410 : tensor<?x?x?x?xf16>{%dim_1412, %dim_1414, %dim_1416, %dim_1418}]
    %cast_1419 = tensor.cast %cast_1410 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %1067 = torch_c.from_builtin_tensor %cast_1419 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Apply the per-position 2x2 coefficients in %211 ([1,1,4608,64,2,2] f32) to q and k.
    // %211 is defined earlier in this function - presumably rotary position-embedding
    // coefficients; TODO confirm at its definition.
    // For each consecutive pair (x0, x1) of the last axis the code computes
    //   out[i] = %211[..., i, 0] * x0 + %211[..., i, 1] * x1   for i in {0, 1},
    // in f32, broadcasting %211 across the 24-sized dim 1.
    // q: widen to f32 and view the 128 channels as 64 pairs -> [1,24,4608,64,1,2]
    // (the -1 in the shape list resolves to 64).
    %int6_1420 = torch.constant.int 6
    %1068 = torch.prims.convert_element_type %1063, %int6_1420 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_1421 = torch.constant.int 1
    %int24_1422 = torch.constant.int 24
    %int4608_1423 = torch.constant.int 4608
    %int-1_1424 = torch.constant.int -1
    %int1_1425 = torch.constant.int 1
    %int2_1426 = torch.constant.int 2
    %1069 = torch.prim.ListConstruct %int1_1421, %int24_1422, %int4608_1423, %int-1_1424, %int1_1425, %int2_1426 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1070 = torch.aten.view %1068, %1069 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // k: same widening and pair view.
    %int6_1427 = torch.constant.int 6
    %1071 = torch.prims.convert_element_type %1065, %int6_1427 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_1428 = torch.constant.int 1
    %int24_1429 = torch.constant.int 24
    %int4608_1430 = torch.constant.int 4608
    %int-1_1431 = torch.constant.int -1
    %int1_1432 = torch.constant.int 1
    %int2_1433 = torch.constant.int 2
    %1072 = torch.prim.ListConstruct %int1_1428, %int24_1429, %int4608_1430, %int-1_1431, %int1_1432, %int2_1433 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1073 = torch.aten.view %1071, %1072 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // q, first term: column 0 of %211 times element 0 of each pair.
    %int5_1434 = torch.constant.int 5
    %int0_1435 = torch.constant.int 0
    %1074 = torch.aten.select.int %211, %int5_1434, %int0_1435 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_1436 = torch.constant.int 5
    %int0_1437 = torch.constant.int 0
    %1075 = torch.aten.select.int %1070, %int5_1436, %int0_1437 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %1076 = torch.aten.mul.Tensor %1074, %1075 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // q, second term: column 1 of %211 times element 1 of each pair.
    %int5_1438 = torch.constant.int 5
    %int1_1439 = torch.constant.int 1
    %1077 = torch.aten.select.int %211, %int5_1438, %int1_1439 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_1440 = torch.constant.int 5
    %int1_1441 = torch.constant.int 1
    %1078 = torch.aten.select.int %1070, %int5_1440, %int1_1441 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %1079 = torch.aten.mul.Tensor %1077, %1078 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // q: sum of the two terms (alpha = 1).
    %int1_1442 = torch.constant.int 1
    %1080 = torch.aten.add.Tensor %1076, %1079, %int1_1442 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k, first term: column 0 of %211 times element 0 of each pair.
    %int5_1443 = torch.constant.int 5
    %int0_1444 = torch.constant.int 0
    %1081 = torch.aten.select.int %211, %int5_1443, %int0_1444 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_1445 = torch.constant.int 5
    %int0_1446 = torch.constant.int 0
    %1082 = torch.aten.select.int %1073, %int5_1445, %int0_1446 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %1083 = torch.aten.mul.Tensor %1081, %1082 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k, second term: column 1 of %211 times element 1 of each pair.
    %int5_1447 = torch.constant.int 5
    %int1_1448 = torch.constant.int 1
    %1084 = torch.aten.select.int %211, %int5_1447, %int1_1448 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_1449 = torch.constant.int 5
    %int1_1450 = torch.constant.int 1
    %1085 = torch.aten.select.int %1073, %int5_1449, %int1_1450 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %1086 = torch.aten.mul.Tensor %1084, %1085 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k: sum of the two terms (alpha = 1).
    %int1_1451 = torch.constant.int 1
    %1087 = torch.aten.add.Tensor %1083, %1086, %int1_1451 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // q: flatten the 64 pairs back to 128 channels and narrow to f16.
    %int1_1452 = torch.constant.int 1
    %int24_1453 = torch.constant.int 24
    %int4608_1454 = torch.constant.int 4608
    %int128_1455 = torch.constant.int 128
    %1088 = torch.prim.ListConstruct %int1_1452, %int24_1453, %int4608_1454, %int128_1455 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1089 = torch.aten.view %1080, %1088 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_1456 = torch.constant.int 5
    %1090 = torch.prims.convert_element_type %1089, %int5_1456 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // k: flatten the 64 pairs back to 128 channels and narrow to f16.
    %int1_1457 = torch.constant.int 1
    %int24_1458 = torch.constant.int 24
    %int4608_1459 = torch.constant.int 4608
    %int128_1460 = torch.constant.int 128
    %1091 = torch.prim.ListConstruct %int1_1457, %int24_1458, %int4608_1459, %int128_1460 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1092 = torch.aten.view %1087, %1091 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_1461 = torch.constant.int 5
    %1093 = torch.prims.convert_element_type %1092, %int5_1461 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Joint attention over all 4608 positions, then merge the 24 x 128 axes into 3072.
    // Operands after (q, k, v) are: float 0.0, bool false, none, none. Per the PyTorch
    // schema of this op these are dropout_p, is_causal, attn_mask and scale - i.e. no
    // dropout, non-causal, no mask, default scaling (NOTE(review): confirm against the
    // PyTorch version this was exported with).
    // v (%1067) is passed without the 2x2 pair transform that q and k received.
    %float0.000000e00_1462 = torch.constant.float 0.000000e+00
    %false_1463 = torch.constant.bool false
    %none_1464 = torch.constant.none
    %none_1465 = torch.constant.none
    // Emitted as a generic torch.operator; result #0 is the [1,24,4608,128] f16 output and
    // result #1 is an auxiliary [1,24,4608] f32 tensor.
    %1094:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%1090, %1093, %1067, %float0.000000e00_1462, %false_1463, %none_1464, %none_1465) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2: [1,24,4608,128] -> [1,4608,24,128].
    %int0_1466 = torch.constant.int 0
    %int2_1467 = torch.constant.int 2
    %int1_1468 = torch.constant.int 1
    %int3_1469 = torch.constant.int 3
    %1095 = torch.prim.ListConstruct %int0_1466, %int2_1467, %int1_1468, %int3_1469 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1096 = torch.aten.permute %1094#0, %1095 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Collapse the last two axes (24 * 128 = 3072): -> [1,4608,3072].
    %int1_1470 = torch.constant.int 1
    %int4608_1471 = torch.constant.int 4608
    %int3072_1472 = torch.constant.int 3072
    %1097 = torch.prim.ListConstruct %int1_1470, %int4608_1471, %int3072_1472 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1098 = torch.aten.view %1096, %1097 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Split the [1,4608,3072] attention output along dim 1 into the first 512 rows (%1100)
    // and the remaining 4096 rows (%1102). This mirrors the earlier concatenation order,
    // in which the 512-row tensors were listed first.
    // Slice operands are (dim, start, end, step); an end of 9223372036854775807 (INT64_MAX)
    // means "through the last element".
    // Full-range slice on dim 0: the shape is unchanged.
    %int0_1473 = torch.constant.int 0
    %int0_1474 = torch.constant.int 0
    %int9223372036854775807_1475 = torch.constant.int 9223372036854775807
    %int1_1476 = torch.constant.int 1
    %1099 = torch.aten.slice.Tensor %1098, %int0_1473, %int0_1474, %int9223372036854775807_1475, %int1_1476 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Rows [0, 512) of dim 1 -> [1,512,3072].
    %int1_1477 = torch.constant.int 1
    %int0_1478 = torch.constant.int 0
    %int512_1479 = torch.constant.int 512
    %int1_1480 = torch.constant.int 1
    %1100 = torch.aten.slice.Tensor %1099, %int1_1477, %int0_1478, %int512_1479, %int1_1480 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Second full-range slice on dim 0 of the same source tensor.
    %int0_1481 = torch.constant.int 0
    %int0_1482 = torch.constant.int 0
    %int9223372036854775807_1483 = torch.constant.int 9223372036854775807
    %int1_1484 = torch.constant.int 1
    %1101 = torch.aten.slice.Tensor %1098, %int0_1481, %int0_1482, %int9223372036854775807_1483, %int1_1484 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Rows [512, end) of dim 1 -> [1,4096,3072].
    %int1_1485 = torch.constant.int 1
    %int512_1486 = torch.constant.int 512
    %int9223372036854775807_1487 = torch.constant.int 9223372036854775807
    %int1_1488 = torch.constant.int 1
    %1102 = torch.aten.slice.Tensor %1101, %int1_1485, %int512_1486, %int9223372036854775807_1487, %int1_1488 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Drop the leading unit dimension so the 4096-row part can feed a 2-D matmul.
    %int4096_1489 = torch.constant.int 4096
    %int3072_1490 = torch.constant.int 3072
    %1103 = torch.prim.ListConstruct %int4096_1489, %int3072_1490 : (!torch.int, !torch.int) -> !torch.list<int>
    %1104 = torch.aten.view %1102, %1103 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // img_attn.proj linear layer on the 4096-row attention output (%1104), followed by an
    // elementwise multiply with %903 and an add onto %825.
    // Load the [3072,3072] weight and transpose it so it can be the right-hand matmul operand.
    %__auto.sampler.double_blocks.2.img_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.2.img_attn.proj.weight : tensor<3072x3072xf16>
    %1105 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.img_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_1491 = torch.constant.int 0
    %int1_1492 = torch.constant.int 1
    %1106 = torch.aten.transpose.int %1105, %int0_1491, %int1_1492 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.2.img_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.2.img_attn.proj.bias : tensor<3072xf16>
    %1107 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.img_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Bias, activations and weight are all widened to f32 before the matmul.
    %int6_1493 = torch.constant.int 6
    %1108 = torch.prims.convert_element_type %1107, %int6_1493 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_1494 = torch.constant.int 6
    %1109 = torch.prims.convert_element_type %1104, %int6_1494 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_1495 = torch.constant.int 6
    %1110 = torch.prims.convert_element_type %1106, %int6_1495 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %1111 = torch.aten.mm %1109, %1110 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    %int1_1496 = torch.constant.int 1
    %1112 = torch.aten.mul.Scalar %1111, %int1_1496 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_1497 = torch.constant.int 1
    %1113 = torch.aten.mul.Scalar %1108, %int1_1497 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Add the bias to every row (alpha = 1), then narrow to f16.
    %int1_1498 = torch.constant.int 1
    %1114 = torch.aten.add.Tensor %1112, %1113, %int1_1498 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_1499 = torch.constant.int 5
    %1115 = torch.prims.convert_element_type %1114, %int5_1499 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    // Restore the leading unit dimension: -> [1,4096,3072].
    %int1_1500 = torch.constant.int 1
    %int4096_1501 = torch.constant.int 4096
    %int3072_1502 = torch.constant.int 3072
    %1116 = torch.prim.ListConstruct %int1_1500, %int4096_1501, %int3072_1502 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1117 = torch.aten.view %1115, %1116 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // %903 ([1,1,3072]) is broadcast over the 4096 rows - presumably a modulation gate;
    // %825 ([1,4096,3072]) is presumably the residual-stream input. Both are defined
    // earlier in this function; TODO confirm at their definitions.
    %1118 = torch.aten.mul.Tensor %903, %1117 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_1503 = torch.constant.int 1
    %1119 = torch.aten.add.Tensor %825, %1118, %int1_1503 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Normalise %1119 over its last axis (no learned affine parameters appear here), then
    // compute (1 + %905) * normalised + %904. %904 and %905 ([1,1,3072] f16) are defined
    // earlier in this function - presumably modulation shift and scale; TODO confirm.
    // Precompute 1 + %905 (add.Scalar with scalar 1 and alpha 1).
    %int1_1504 = torch.constant.int 1
    %int1_1505 = torch.constant.int 1
    %1120 = torch.aten.add.Scalar %905, %int1_1504, %int1_1505 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Statistics are computed in f32.
    %int6_1506 = torch.constant.int 6
    %1121 = torch.prims.convert_element_type %1119, %int6_1506 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // Variance and mean over dim 2 with correction = 0 (divide by N) and keepdim = true.
    // result0 is the variance, result1 is the mean.
    %int2_1507 = torch.constant.int 2
    %1122 = torch.prim.ListConstruct %int2_1507 : (!torch.int) -> !torch.list<int>
    %int0_1508 = torch.constant.int 0
    %true_1509 = torch.constant.bool true
    %result0_1510, %result1_1511 = torch.aten.var_mean.correction %1121, %1122, %int0_1508, %true_1509 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(var + eps), eps about 1e-6.
    %float9.999990e-07_1512 = torch.constant.float 9.9999999999999995E-7
    %int1_1513 = torch.constant.int 1
    %1123 = torch.aten.add.Scalar %result0_1510, %float9.999990e-07_1512, %int1_1513 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %1124 = torch.aten.rsqrt %1123 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // The subtraction takes the f16 tensor %1119 (not the f32 copy %1121) minus the f32
    // mean; the declared result type is f32.
    %int1_1514 = torch.constant.int 1
    %1125 = torch.aten.sub.Tensor %1119, %result1_1511, %int1_1514 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %1126 = torch.aten.mul.Tensor %1125, %1124 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Narrow to f16 before applying the scale and shift.
    %int5_1515 = torch.constant.int 5
    %1127 = torch.prims.convert_element_type %1126, %int5_1515 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %1128 = torch.aten.mul.Tensor %1120, %1127 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_1516 = torch.constant.int 1
    %1129 = torch.aten.add.Tensor %1128, %904, %int1_1516 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // img_mlp.0 linear layer (3072 -> 12288) followed by tanh-approximated GELU.
    // Drop the leading unit dimension for the 2-D matmul: [1,4096,3072] -> [4096,3072].
    %int4096_1517 = torch.constant.int 4096
    %int3072_1518 = torch.constant.int 3072
    %1130 = torch.prim.ListConstruct %int4096_1517, %int3072_1518 : (!torch.int, !torch.int) -> !torch.list<int>
    %1131 = torch.aten.view %1129, %1130 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Load the [12288,3072] weight and transpose it to [3072,12288].
    %__auto.sampler.double_blocks.2.img_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.2.img_mlp.0.weight : tensor<12288x3072xf16>
    %1132 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.img_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_1519 = torch.constant.int 0
    %int1_1520 = torch.constant.int 1
    %1133 = torch.aten.transpose.int %1132, %int0_1519, %int1_1520 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.2.img_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.2.img_mlp.0.bias : tensor<12288xf16>
    %1134 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.img_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Bias, activations and weight are all widened to f32 before the matmul.
    %int6_1521 = torch.constant.int 6
    %1135 = torch.prims.convert_element_type %1134, %int6_1521 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_1522 = torch.constant.int 6
    %1136 = torch.prims.convert_element_type %1131, %int6_1522 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_1523 = torch.constant.int 6
    %1137 = torch.prims.convert_element_type %1133, %int6_1523 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %1138 = torch.aten.mm %1136, %1137 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[4096,12288],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    %int1_1524 = torch.constant.int 1
    %1139 = torch.aten.mul.Scalar %1138, %int1_1524 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int1_1525 = torch.constant.int 1
    %1140 = torch.aten.mul.Scalar %1135, %int1_1525 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    // Add the bias to every row (alpha = 1), then narrow to f16.
    %int1_1526 = torch.constant.int 1
    %1141 = torch.aten.add.Tensor %1139, %1140, %int1_1526 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int5_1527 = torch.constant.int 5
    %1142 = torch.prims.convert_element_type %1141, %int5_1527 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f16>
    // Restore the leading unit dimension: -> [1,4096,12288].
    %int1_1528 = torch.constant.int 1
    %int4096_1529 = torch.constant.int 4096
    %int12288_1530 = torch.constant.int 12288
    %1143 = torch.prim.ListConstruct %int1_1528, %int4096_1529, %int12288_1530 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1144 = torch.aten.view %1142, %1143 : !torch.vtensor<[4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,12288],f16>
    // GELU with approximate = "tanh", evaluated on the f16 tensor.
    %str_1531 = torch.constant.str "tanh"
    %1145 = torch.aten.gelu %1144, %str_1531 : !torch.vtensor<[1,4096,12288],f16>, !torch.str -> !torch.vtensor<[1,4096,12288],f16>
    %int4096_1532 = torch.constant.int 4096
    %int12288_1533 = torch.constant.int 12288
    %1146 = torch.prim.ListConstruct %int4096_1532, %int12288_1533 : (!torch.int, !torch.int) -> !torch.list<int>
    %1147 = torch.aten.view %1145, %1146 : !torch.vtensor<[1,4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[4096,12288],f16>
    // double_blocks.2.img_mlp.2 linear layer: %1147 x transpose(weight) + bias,
    // accumulated in f32 and cast back to f16. The result is multiplied by %906
    // and added to %1119 (both defined before this chunk; presumably the MLP gate
    // and the residual stream -- TODO confirm), producing %1162.
    %__auto.sampler.double_blocks.2.img_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.2.img_mlp.2.weight : tensor<3072x12288xf16>
    %1148 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.img_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    // Weight is stored [out,in] = [3072,12288]; transpose to [12288,3072] for mm.
    %int0_1534 = torch.constant.int 0
    %int1_1535 = torch.constant.int 1
    %1149 = torch.aten.transpose.int %1148, %int0_1534, %int1_1535 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.2.img_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.2.img_mlp.2.bias : tensor<3072xf16>
    %1150 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.img_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // dtype code 6 yields f32 here: bias, activation and weight are all upcast
    // so the matmul and bias add run in f32.
    %int6_1536 = torch.constant.int 6
    %1151 = torch.prims.convert_element_type %1150, %int6_1536 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_1537 = torch.constant.int 6
    %1152 = torch.prims.convert_element_type %1147, %int6_1537 : !torch.vtensor<[4096,12288],f16>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int6_1538 = torch.constant.int 6
    %1153 = torch.prims.convert_element_type %1149, %int6_1538 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %1154 = torch.aten.mm %1152, %1153 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Both scalar multiplies use the constant 1, so they leave the values unchanged.
    %int1_1539 = torch.constant.int 1
    %1155 = torch.aten.mul.Scalar %1154, %int1_1539 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_1540 = torch.constant.int 1
    %1156 = torch.aten.mul.Scalar %1151, %int1_1540 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_1541 = torch.constant.int 1
    %1157 = torch.aten.add.Tensor %1155, %1156, %int1_1541 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    // Cast the f32 result back to f16 (dtype code 5).
    %int5_1542 = torch.constant.int 5
    %1158 = torch.prims.convert_element_type %1157, %int5_1542 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_1543 = torch.constant.int 1
    %int4096_1544 = torch.constant.int 4096
    %int3072_1545 = torch.constant.int 3072
    %1159 = torch.prim.ListConstruct %int1_1543, %int4096_1544, %int3072_1545 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1160 = torch.aten.view %1158, %1159 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // %906 is [1,1,3072] and broadcasts over the 4096 rows.
    %1161 = torch.aten.mul.Tensor %906, %1160 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_1546 = torch.constant.int 1
    %1162 = torch.aten.add.Tensor %1119, %1161, %int1_1546 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // double_blocks.2.txt_attn.proj linear layer applied to %1100 (defined before
    // this chunk; presumably the txt attention output -- TODO confirm). The f16
    // result is multiplied by %924 and added to %885, producing %1179.
    %int512_1547 = torch.constant.int 512
    %int3072_1548 = torch.constant.int 3072
    %1163 = torch.prim.ListConstruct %int512_1547, %int3072_1548 : (!torch.int, !torch.int) -> !torch.list<int>
    // Flatten [1,512,3072] -> [512,3072] for the 2-D matmul.
    %1164 = torch.aten.view %1100, %1163 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.2.txt_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.2.txt_attn.proj.weight : tensor<3072x3072xf16>
    %1165 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.txt_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_1549 = torch.constant.int 0
    %int1_1550 = torch.constant.int 1
    %1166 = torch.aten.transpose.int %1165, %int0_1549, %int1_1550 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.2.txt_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.2.txt_attn.proj.bias : tensor<3072xf16>
    %1167 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.txt_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, activation and transposed weight to f32 (dtype code 6) so the
    // matmul and bias add run in f32.
    %int6_1551 = torch.constant.int 6
    %1168 = torch.prims.convert_element_type %1167, %int6_1551 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_1552 = torch.constant.int 6
    %1169 = torch.prims.convert_element_type %1164, %int6_1552 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_1553 = torch.constant.int 6
    %1170 = torch.prims.convert_element_type %1166, %int6_1553 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %1171 = torch.aten.mm %1169, %1170 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Both scalar multiplies use the constant 1, so they leave the values unchanged.
    %int1_1554 = torch.constant.int 1
    %1172 = torch.aten.mul.Scalar %1171, %int1_1554 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_1555 = torch.constant.int 1
    %1173 = torch.aten.mul.Scalar %1168, %int1_1555 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_1556 = torch.constant.int 1
    %1174 = torch.aten.add.Tensor %1172, %1173, %int1_1556 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    // Cast back to f16 (dtype code 5) and restore the leading unit dimension.
    %int5_1557 = torch.constant.int 5
    %1175 = torch.prims.convert_element_type %1174, %int5_1557 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_1558 = torch.constant.int 1
    %int512_1559 = torch.constant.int 512
    %int3072_1560 = torch.constant.int 3072
    %1176 = torch.prim.ListConstruct %int1_1558, %int512_1559, %int3072_1560 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1177 = torch.aten.view %1175, %1176 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // %924 is [1,1,3072] and broadcasts over the 512 rows; %885 is the other
    // addend (presumably the txt residual stream -- TODO confirm).
    %1178 = torch.aten.mul.Tensor %924, %1177 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_1561 = torch.constant.int 1
    %1179 = torch.aten.add.Tensor %885, %1178, %int1_1561 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Normalises %1179 over its last dimension using variance and mean computed
    // in f32, then computes (1 + %926) * normalised + %925. %925 and %926 are
    // defined before this chunk (presumably modulation shift and scale -- TODO
    // confirm). No learned weight or bias is applied in this normalisation.
    %int1_1562 = torch.constant.int 1
    %int1_1563 = torch.constant.int 1
    %1180 = torch.aten.add.Scalar %926, %int1_1562, %int1_1563 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int6_1564 = torch.constant.int 6
    %1181 = torch.prims.convert_element_type %1179, %int6_1564 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // Reduce over dim 2 with correction 0 and keepdim=true; result0 is the
    // variance and result1 the mean, both [1,512,1].
    %int2_1565 = torch.constant.int 2
    %1182 = torch.prim.ListConstruct %int2_1565 : (!torch.int) -> !torch.list<int>
    %int0_1566 = torch.constant.int 0
    %true_1567 = torch.constant.bool true
    %result0_1568, %result1_1569 = torch.aten.var_mean.correction %1181, %1182, %int0_1566, %true_1567 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // Epsilon (about 1e-6) is added to the variance before rsqrt.
    %float9.999990e-07_1570 = torch.constant.float 9.9999999999999995E-7
    %int1_1571 = torch.constant.int 1
    %1183 = torch.aten.add.Scalar %result0_1568, %float9.999990e-07_1570, %int1_1571 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %1184 = torch.aten.rsqrt %1183 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // The subtraction takes the f16 tensor %1179 and the f32 mean; the result
    // type is f32.
    %int1_1572 = torch.constant.int 1
    %1185 = torch.aten.sub.Tensor %1179, %result1_1569, %int1_1572 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %1186 = torch.aten.mul.Tensor %1185, %1184 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_1573 = torch.constant.int 5
    %1187 = torch.prims.convert_element_type %1186, %int5_1573 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Modulation in f16: multiply by (1 + %926), then add %925.
    %1188 = torch.aten.mul.Tensor %1180, %1187 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_1574 = torch.constant.int 1
    %1189 = torch.aten.add.Tensor %1188, %925, %int1_1574 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // double_blocks.2.txt_mlp.0 linear layer (3072 -> 12288) applied to %1189,
    // computed in f32 and cast to f16, followed by tanh-approximated GELU.
    // Produces %1207 with shape [512,12288].
    %int512_1575 = torch.constant.int 512
    %int3072_1576 = torch.constant.int 3072
    %1190 = torch.prim.ListConstruct %int512_1575, %int3072_1576 : (!torch.int, !torch.int) -> !torch.list<int>
    // Flatten [1,512,3072] -> [512,3072] for the 2-D matmul.
    %1191 = torch.aten.view %1189, %1190 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.2.txt_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.2.txt_mlp.0.weight : tensor<12288x3072xf16>
    %1192 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.txt_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_1577 = torch.constant.int 0
    %int1_1578 = torch.constant.int 1
    %1193 = torch.aten.transpose.int %1192, %int0_1577, %int1_1578 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.2.txt_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.2.txt_mlp.0.bias : tensor<12288xf16>
    %1194 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.txt_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Upcast bias, activation and transposed weight to f32 (dtype code 6).
    %int6_1579 = torch.constant.int 6
    %1195 = torch.prims.convert_element_type %1194, %int6_1579 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_1580 = torch.constant.int 6
    %1196 = torch.prims.convert_element_type %1191, %int6_1580 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_1581 = torch.constant.int 6
    %1197 = torch.prims.convert_element_type %1193, %int6_1581 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %1198 = torch.aten.mm %1196, %1197 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[512,12288],f32>
    // Both scalar multiplies use the constant 1, so they leave the values unchanged.
    %int1_1582 = torch.constant.int 1
    %1199 = torch.aten.mul.Scalar %1198, %int1_1582 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int1_1583 = torch.constant.int 1
    %1200 = torch.aten.mul.Scalar %1195, %int1_1583 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_1584 = torch.constant.int 1
    %1201 = torch.aten.add.Tensor %1199, %1200, %int1_1584 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    // Cast back to f16 (dtype code 5) before the activation.
    %int5_1585 = torch.constant.int 5
    %1202 = torch.prims.convert_element_type %1201, %int5_1585 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f16>
    %int1_1586 = torch.constant.int 1
    %int512_1587 = torch.constant.int 512
    %int12288_1588 = torch.constant.int 12288
    %1203 = torch.prim.ListConstruct %int1_1586, %int512_1587, %int12288_1588 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1204 = torch.aten.view %1202, %1203 : !torch.vtensor<[512,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,512,12288],f16>
    // GELU with the "tanh" approximation mode, computed on f16.
    %str_1589 = torch.constant.str "tanh"
    %1205 = torch.aten.gelu %1204, %str_1589 : !torch.vtensor<[1,512,12288],f16>, !torch.str -> !torch.vtensor<[1,512,12288],f16>
    // Flatten back to [512,12288] for the next matmul.
    %int512_1590 = torch.constant.int 512
    %int12288_1591 = torch.constant.int 12288
    %1206 = torch.prim.ListConstruct %int512_1590, %int12288_1591 : (!torch.int, !torch.int) -> !torch.list<int>
    %1207 = torch.aten.view %1205, %1206 : !torch.vtensor<[1,512,12288],f16>, !torch.list<int> -> !torch.vtensor<[512,12288],f16>
    // double_blocks.2.txt_mlp.2 linear layer (12288 -> 3072) applied to %1207,
    // computed in f32 and cast to f16. The result is multiplied by %927 (defined
    // before this chunk; presumably the MLP gate -- TODO confirm) and added to
    // %1179, producing %1222.
    %__auto.sampler.double_blocks.2.txt_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.2.txt_mlp.2.weight : tensor<3072x12288xf16>
    %1208 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.txt_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_1592 = torch.constant.int 0
    %int1_1593 = torch.constant.int 1
    %1209 = torch.aten.transpose.int %1208, %int0_1592, %int1_1593 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.2.txt_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.2.txt_mlp.2.bias : tensor<3072xf16>
    %1210 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.txt_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, activation and transposed weight to f32 (dtype code 6).
    %int6_1594 = torch.constant.int 6
    %1211 = torch.prims.convert_element_type %1210, %int6_1594 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_1595 = torch.constant.int 6
    %1212 = torch.prims.convert_element_type %1207, %int6_1595 : !torch.vtensor<[512,12288],f16>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int6_1596 = torch.constant.int 6
    %1213 = torch.prims.convert_element_type %1209, %int6_1596 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %1214 = torch.aten.mm %1212, %1213 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Both scalar multiplies use the constant 1, so they leave the values unchanged.
    %int1_1597 = torch.constant.int 1
    %1215 = torch.aten.mul.Scalar %1214, %int1_1597 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_1598 = torch.constant.int 1
    %1216 = torch.aten.mul.Scalar %1211, %int1_1598 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_1599 = torch.constant.int 1
    %1217 = torch.aten.add.Tensor %1215, %1216, %int1_1599 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    // Cast back to f16 (dtype code 5) and restore the leading unit dimension.
    %int5_1600 = torch.constant.int 5
    %1218 = torch.prims.convert_element_type %1217, %int5_1600 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_1601 = torch.constant.int 1
    %int512_1602 = torch.constant.int 512
    %int3072_1603 = torch.constant.int 3072
    %1219 = torch.prim.ListConstruct %int1_1601, %int512_1602, %int3072_1603 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1220 = torch.aten.view %1218, %1219 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // %927 is [1,1,3072] and broadcasts over the 512 rows.
    %1221 = torch.aten.mul.Tensor %927, %1220 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_1604 = torch.constant.int 1
    %1222 = torch.aten.add.Tensor %1179, %1221, %int1_1604 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // double_blocks.3.img_mod: applies SiLU to %119 (defined before this chunk;
    // presumably the conditioning vector -- TODO confirm), runs the img_mod.lin
    // linear layer (3072 -> 18432) in f32, casts to f16, and splits the last
    // dimension into six contiguous 3072-wide slices %1238..%1243.
    %1223 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.double_blocks.3.img_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.3.img_mod.lin.weight : tensor<18432x3072xf16>
    %1224 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.img_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_1605 = torch.constant.int 0
    %int1_1606 = torch.constant.int 1
    %1225 = torch.aten.transpose.int %1224, %int0_1605, %int1_1606 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.3.img_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.3.img_mod.lin.bias : tensor<18432xf16>
    %1226 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.img_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Upcast bias, activation and transposed weight to f32 (dtype code 6).
    %int6_1607 = torch.constant.int 6
    %1227 = torch.prims.convert_element_type %1226, %int6_1607 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_1608 = torch.constant.int 6
    %1228 = torch.prims.convert_element_type %1223, %int6_1608 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_1609 = torch.constant.int 6
    %1229 = torch.prims.convert_element_type %1225, %int6_1609 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %1230 = torch.aten.mm %1228, %1229 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Both scalar multiplies use the constant 1, so they leave the values unchanged.
    %int1_1610 = torch.constant.int 1
    %1231 = torch.aten.mul.Scalar %1230, %int1_1610 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_1611 = torch.constant.int 1
    %1232 = torch.aten.mul.Scalar %1227, %int1_1611 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_1612 = torch.constant.int 1
    %1233 = torch.aten.add.Tensor %1231, %1232, %int1_1612 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_1613 = torch.constant.int 5
    %1234 = torch.prims.convert_element_type %1233, %int5_1613 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): the shape is
    // unchanged.
    %int0_1614 = torch.constant.int 0
    %int0_1615 = torch.constant.int 0
    %int9223372036854775807_1616 = torch.constant.int 9223372036854775807
    %int1_1617 = torch.constant.int 1
    %1235 = torch.aten.slice.Tensor %1234, %int0_1614, %int0_1615, %int9223372036854775807_1616, %int1_1617 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a unit dimension at index 1: [1,18432] -> [1,1,18432].
    %int1_1618 = torch.constant.int 1
    %1236 = torch.aten.unsqueeze %1235, %int1_1618 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Full-range slice on dim 2: the shape is again unchanged.
    %int2_1619 = torch.constant.int 2
    %int0_1620 = torch.constant.int 0
    %int9223372036854775807_1621 = torch.constant.int 9223372036854775807
    %int1_1622 = torch.constant.int 1
    %1237 = torch.aten.slice.Tensor %1236, %int2_1619, %int0_1620, %int9223372036854775807_1621, %int1_1622 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Slice [0, 3072) of the last dimension -> %1238.
    %int-1_1623 = torch.constant.int -1
    %int0_1624 = torch.constant.int 0
    %int3072_1625 = torch.constant.int 3072
    %int1_1626 = torch.constant.int 1
    %1238 = torch.aten.slice.Tensor %1237, %int-1_1623, %int0_1624, %int3072_1625, %int1_1626 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [3072, 6144) -> %1239.
    %int-1_1627 = torch.constant.int -1
    %int3072_1628 = torch.constant.int 3072
    %int6144_1629 = torch.constant.int 6144
    %int1_1630 = torch.constant.int 1
    %1239 = torch.aten.slice.Tensor %1237, %int-1_1627, %int3072_1628, %int6144_1629, %int1_1630 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [6144, 9216) -> %1240.
    %int-1_1631 = torch.constant.int -1
    %int6144_1632 = torch.constant.int 6144
    %int9216_1633 = torch.constant.int 9216
    %int1_1634 = torch.constant.int 1
    %1240 = torch.aten.slice.Tensor %1237, %int-1_1631, %int6144_1632, %int9216_1633, %int1_1634 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [9216, 12288) -> %1241.
    %int-1_1635 = torch.constant.int -1
    %int9216_1636 = torch.constant.int 9216
    %int12288_1637 = torch.constant.int 12288
    %int1_1638 = torch.constant.int 1
    %1241 = torch.aten.slice.Tensor %1237, %int-1_1635, %int9216_1636, %int12288_1637, %int1_1638 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [12288, 15360) -> %1242.
    %int-1_1639 = torch.constant.int -1
    %int12288_1640 = torch.constant.int 12288
    %int15360_1641 = torch.constant.int 15360
    %int1_1642 = torch.constant.int 1
    %1242 = torch.aten.slice.Tensor %1237, %int-1_1639, %int12288_1640, %int15360_1641, %int1_1642 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [15360, 18432) -> %1243.
    %int-1_1643 = torch.constant.int -1
    %int15360_1644 = torch.constant.int 15360
    %int18432_1645 = torch.constant.int 18432
    %int1_1646 = torch.constant.int 1
    %1243 = torch.aten.slice.Tensor %1237, %int-1_1643, %int15360_1644, %int18432_1645, %int1_1646 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // double_blocks.3.txt_mod: same structure as the img_mod computation, using
    // the txt_mod.lin parameters. Applies SiLU to %119 (the same input as %1223;
    // the op is emitted again rather than reused), runs the linear layer
    // (3072 -> 18432) in f32, casts to f16, and splits the last dimension into
    // six contiguous 3072-wide slices %1259..%1264.
    %1244 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.double_blocks.3.txt_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.3.txt_mod.lin.weight : tensor<18432x3072xf16>
    %1245 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.txt_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_1647 = torch.constant.int 0
    %int1_1648 = torch.constant.int 1
    %1246 = torch.aten.transpose.int %1245, %int0_1647, %int1_1648 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.3.txt_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.3.txt_mod.lin.bias : tensor<18432xf16>
    %1247 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.txt_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Upcast bias, activation and transposed weight to f32 (dtype code 6).
    %int6_1649 = torch.constant.int 6
    %1248 = torch.prims.convert_element_type %1247, %int6_1649 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_1650 = torch.constant.int 6
    %1249 = torch.prims.convert_element_type %1244, %int6_1650 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_1651 = torch.constant.int 6
    %1250 = torch.prims.convert_element_type %1246, %int6_1651 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %1251 = torch.aten.mm %1249, %1250 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Both scalar multiplies use the constant 1, so they leave the values unchanged.
    %int1_1652 = torch.constant.int 1
    %1252 = torch.aten.mul.Scalar %1251, %int1_1652 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_1653 = torch.constant.int 1
    %1253 = torch.aten.mul.Scalar %1248, %int1_1653 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_1654 = torch.constant.int 1
    %1254 = torch.aten.add.Tensor %1252, %1253, %int1_1654 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_1655 = torch.constant.int 5
    %1255 = torch.prims.convert_element_type %1254, %int5_1655 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): the shape is
    // unchanged.
    %int0_1656 = torch.constant.int 0
    %int0_1657 = torch.constant.int 0
    %int9223372036854775807_1658 = torch.constant.int 9223372036854775807
    %int1_1659 = torch.constant.int 1
    %1256 = torch.aten.slice.Tensor %1255, %int0_1656, %int0_1657, %int9223372036854775807_1658, %int1_1659 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a unit dimension at index 1: [1,18432] -> [1,1,18432].
    %int1_1660 = torch.constant.int 1
    %1257 = torch.aten.unsqueeze %1256, %int1_1660 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Full-range slice on dim 2: the shape is again unchanged.
    %int2_1661 = torch.constant.int 2
    %int0_1662 = torch.constant.int 0
    %int9223372036854775807_1663 = torch.constant.int 9223372036854775807
    %int1_1664 = torch.constant.int 1
    %1258 = torch.aten.slice.Tensor %1257, %int2_1661, %int0_1662, %int9223372036854775807_1663, %int1_1664 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Slice [0, 3072) of the last dimension -> %1259.
    %int-1_1665 = torch.constant.int -1
    %int0_1666 = torch.constant.int 0
    %int3072_1667 = torch.constant.int 3072
    %int1_1668 = torch.constant.int 1
    %1259 = torch.aten.slice.Tensor %1258, %int-1_1665, %int0_1666, %int3072_1667, %int1_1668 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [3072, 6144) -> %1260.
    %int-1_1669 = torch.constant.int -1
    %int3072_1670 = torch.constant.int 3072
    %int6144_1671 = torch.constant.int 6144
    %int1_1672 = torch.constant.int 1
    %1260 = torch.aten.slice.Tensor %1258, %int-1_1669, %int3072_1670, %int6144_1671, %int1_1672 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [6144, 9216) -> %1261.
    %int-1_1673 = torch.constant.int -1
    %int6144_1674 = torch.constant.int 6144
    %int9216_1675 = torch.constant.int 9216
    %int1_1676 = torch.constant.int 1
    %1261 = torch.aten.slice.Tensor %1258, %int-1_1673, %int6144_1674, %int9216_1675, %int1_1676 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [9216, 12288) -> %1262.
    %int-1_1677 = torch.constant.int -1
    %int9216_1678 = torch.constant.int 9216
    %int12288_1679 = torch.constant.int 12288
    %int1_1680 = torch.constant.int 1
    %1262 = torch.aten.slice.Tensor %1258, %int-1_1677, %int9216_1678, %int12288_1679, %int1_1680 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [12288, 15360) -> %1263.
    %int-1_1681 = torch.constant.int -1
    %int12288_1682 = torch.constant.int 12288
    %int15360_1683 = torch.constant.int 15360
    %int1_1684 = torch.constant.int 1
    %1263 = torch.aten.slice.Tensor %1258, %int-1_1681, %int12288_1682, %int15360_1683, %int1_1684 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [15360, 18432) -> %1264.
    %int-1_1685 = torch.constant.int -1
    %int15360_1686 = torch.constant.int 15360
    %int18432_1687 = torch.constant.int 18432
    %int1_1688 = torch.constant.int 1
    %1264 = torch.aten.slice.Tensor %1258, %int-1_1685, %int15360_1686, %int18432_1687, %int1_1688 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalises %1162 over its last dimension using variance and mean computed
    // in f32, then computes (1 + %1239) * normalised + %1238, where %1238 and
    // %1239 are the first two 3072-wide slices of the img_mod output. No learned
    // weight or bias is applied in this normalisation. Produces %1274.
    %int6_1689 = torch.constant.int 6
    %1265 = torch.prims.convert_element_type %1162, %int6_1689 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // Reduce over dim 2 with correction 0 and keepdim=true; result0 is the
    // variance and result1 the mean, both [1,4096,1].
    %int2_1690 = torch.constant.int 2
    %1266 = torch.prim.ListConstruct %int2_1690 : (!torch.int) -> !torch.list<int>
    %int0_1691 = torch.constant.int 0
    %true_1692 = torch.constant.bool true
    %result0_1693, %result1_1694 = torch.aten.var_mean.correction %1265, %1266, %int0_1691, %true_1692 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // Epsilon (about 1e-6) is added to the variance before rsqrt.
    %float9.999990e-07_1695 = torch.constant.float 9.9999999999999995E-7
    %int1_1696 = torch.constant.int 1
    %1267 = torch.aten.add.Scalar %result0_1693, %float9.999990e-07_1695, %int1_1696 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %1268 = torch.aten.rsqrt %1267 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // The subtraction takes the f16 tensor %1162 and the f32 mean; the result
    // type is f32.
    %int1_1697 = torch.constant.int 1
    %1269 = torch.aten.sub.Tensor %1162, %result1_1694, %int1_1697 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %1270 = torch.aten.mul.Tensor %1269, %1268 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    %int5_1698 = torch.constant.int 5
    %1271 = torch.prims.convert_element_type %1270, %int5_1698 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Modulation in f16: multiply by (1 + %1239), then add %1238.
    %int1_1699 = torch.constant.int 1
    %int1_1700 = torch.constant.int 1
    %1272 = torch.aten.add.Scalar %1239, %int1_1699, %int1_1700 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %1273 = torch.aten.mul.Tensor %1272, %1271 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_1701 = torch.constant.int 1
    %1274 = torch.aten.add.Tensor %1273, %1238, %int1_1701 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // double_blocks.3.img_attn.qkv linear layer (3072 -> 9216) applied to %1274,
    // computed in f32 and cast to f16. The [1,4096,9216] result is then emitted
    // through a flow.tensor.trace labelled "img_qkv" and re-imported as %1291.
    %int4096_1702 = torch.constant.int 4096
    %int3072_1703 = torch.constant.int 3072
    %1275 = torch.prim.ListConstruct %int4096_1702, %int3072_1703 : (!torch.int, !torch.int) -> !torch.list<int>
    // Flatten [1,4096,3072] -> [4096,3072] for the 2-D matmul.
    %1276 = torch.aten.view %1274, %1275 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.3.img_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.3.img_attn.qkv.weight : tensor<9216x3072xf16>
    %1277 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.img_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_1704 = torch.constant.int 0
    %int1_1705 = torch.constant.int 1
    %1278 = torch.aten.transpose.int %1277, %int0_1704, %int1_1705 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.3.img_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.3.img_attn.qkv.bias : tensor<9216xf16>
    %1279 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.img_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, activation and transposed weight to f32 (dtype code 6).
    %int6_1706 = torch.constant.int 6
    %1280 = torch.prims.convert_element_type %1279, %int6_1706 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_1707 = torch.constant.int 6
    %1281 = torch.prims.convert_element_type %1276, %int6_1707 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_1708 = torch.constant.int 6
    %1282 = torch.prims.convert_element_type %1278, %int6_1708 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %1283 = torch.aten.mm %1281, %1282 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[4096,9216],f32>
    // Both scalar multiplies use the constant 1, so they leave the values unchanged.
    %int1_1709 = torch.constant.int 1
    %1284 = torch.aten.mul.Scalar %1283, %int1_1709 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int1_1710 = torch.constant.int 1
    %1285 = torch.aten.mul.Scalar %1280, %int1_1710 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_1711 = torch.constant.int 1
    %1286 = torch.aten.add.Tensor %1284, %1285, %int1_1711 : !torch.vtensor<[4096,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    // Cast back to f16 (dtype code 5) and restore the leading unit dimension.
    %int5_1712 = torch.constant.int 5
    %1287 = torch.prims.convert_element_type %1286, %int5_1712 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f16>
    %int1_1713 = torch.constant.int 1
    %int4096_1714 = torch.constant.int 4096
    %int9216_1715 = torch.constant.int 9216
    %1288 = torch.prim.ListConstruct %int1_1713, %int4096_1714, %int9216_1715 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1289 = torch.aten.view %1287, %1288 : !torch.vtensor<[4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,9216],f16>
    // Trace plumbing: convert to a builtin tensor, cast to a fully dynamic shape,
    // query its three dims, emit the trace, then cast back to the static shape
    // and return to the torch tensor type.
    %1290 = torch_c.to_builtin_tensor %1289 : !torch.vtensor<[1,4096,9216],f16> -> tensor<1x4096x9216xf16>
    %cast_1716 = tensor.cast %1290 : tensor<1x4096x9216xf16> to tensor<?x?x?xf16>
    %c0_1717 = arith.constant 0 : index
    %dim_1718 = tensor.dim %cast_1716, %c0_1717 : tensor<?x?x?xf16>
    %c1_1719 = arith.constant 1 : index
    %dim_1720 = tensor.dim %cast_1716, %c1_1719 : tensor<?x?x?xf16>
    %c2_1721 = arith.constant 2 : index
    %dim_1722 = tensor.dim %cast_1716, %c2_1721 : tensor<?x?x?xf16>
    flow.tensor.trace "img_qkv" = [%cast_1716 : tensor<?x?x?xf16>{%dim_1718, %dim_1720, %dim_1722}]
    %cast_1723 = tensor.cast %cast_1716 : tensor<?x?x?xf16> to tensor<1x4096x9216xf16>
    %1291 = torch_c.from_builtin_tensor %cast_1723 : tensor<1x4096x9216xf16> -> !torch.vtensor<[1,4096,9216],f16>
    // Reshapes %1291 from [1,4096,9216] to [1,4096,3,24,128], permutes it to
    // [3,1,24,4096,128], and selects index 0 along the leading dimension to get
    // %1296 with shape [1,24,4096,128] (presumably the query tensor, with 24
    // heads of size 128 -- TODO confirm).
    %int1_1724 = torch.constant.int 1
    %int4096_1725 = torch.constant.int 4096
    %int3_1726 = torch.constant.int 3
    %int24_1727 = torch.constant.int 24
    %int128_1728 = torch.constant.int 128
    %1292 = torch.prim.ListConstruct %int1_1724, %int4096_1725, %int3_1726, %int24_1727, %int128_1728 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1293 = torch.aten.view %1291, %1292 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    // Permutation order is (2, 0, 3, 1, 4).
    %int2_1729 = torch.constant.int 2
    %int0_1730 = torch.constant.int 0
    %int3_1731 = torch.constant.int 3
    %int1_1732 = torch.constant.int 1
    %int4_1733 = torch.constant.int 4
    %1294 = torch.prim.ListConstruct %int2_1729, %int0_1730, %int3_1731, %int1_1732, %int4_1733 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1295 = torch.aten.permute %1293, %1294 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // select(dim=0, index=0) drops the leading size-3 dimension.
    %int0_1734 = torch.constant.int 0
    %int0_1735 = torch.constant.int 0
    %1296 = torch.aten.select.int %1295, %int0_1734, %int0_1735 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int6_1736 = torch.constant.int 6
    %1297 = torch.prims.convert_element_type %1296, %int6_1736 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_1737 = torch.constant.int 2
    %1298 = torch.aten.pow.Tensor_Scalar %1297, %int2_1737 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_1738 = torch.constant.int -1
    %1299 = torch.prim.ListConstruct %int-1_1738 : (!torch.int) -> !torch.list<int>
    %true_1739 = torch.constant.bool true
    %none_1740 = torch.constant.none
    %1300 = torch.aten.mean.dim %1298, %1299, %true_1739, %none_1740 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_1741 = torch.constant.float 9.9999999999999995E-7
    %int1_1742 = torch.constant.int 1
    %1301 = torch.aten.add.Scalar %1300, %float9.999990e-07_1741, %int1_1742 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %1302 = torch.aten.rsqrt %1301 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %1303 = torch.aten.mul.Tensor %1297, %1302 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    %int5_1743 = torch.constant.int 5
    %1304 = torch.prims.convert_element_type %1303, %int5_1743 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.3.img_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.3.img_attn.norm.query_norm.scale : tensor<128xf16>
    %1305 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.img_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %1306 = torch.aten.mul.Tensor %1304, %1305 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // img key: same view/permute of %1291 as used for the query, but slice 1 of the
    // leading size-3 axis is selected, normalized over the last axis in f32, and
    // multiplied by the img_attn key_norm scale.
    %int1_1744 = torch.constant.int 1
    %int4096_1745 = torch.constant.int 4096
    %int3_1746 = torch.constant.int 3
    %int24_1747 = torch.constant.int 24
    %int128_1748 = torch.constant.int 128
    %1307 = torch.prim.ListConstruct %int1_1744, %int4096_1745, %int3_1746, %int24_1747, %int128_1748 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1308 = torch.aten.view %1291, %1307 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    // Permutation (2,0,3,1,4) moves the size-3 axis to the front.
    %int2_1749 = torch.constant.int 2
    %int0_1750 = torch.constant.int 0
    %int3_1751 = torch.constant.int 3
    %int1_1752 = torch.constant.int 1
    %int4_1753 = torch.constant.int 4
    %1309 = torch.prim.ListConstruct %int2_1749, %int0_1750, %int3_1751, %int1_1752, %int4_1753 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1310 = torch.aten.permute %1308, %1309 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // select(dim=0, index=1): second of the three slices.
    %int0_1754 = torch.constant.int 0
    %int1_1755 = torch.constant.int 1
    %1311 = torch.aten.select.int %1310, %int0_1754, %int1_1755 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Upcast f16 -> f32 (dtype code 6) for the normalization arithmetic.
    %int6_1756 = torch.constant.int 6
    %1312 = torch.prims.convert_element_type %1311, %int6_1756 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    // x * rsqrt(mean(x^2, dim=-1, keepdim=true) + ~1e-6): RMS-style normalization.
    %int2_1757 = torch.constant.int 2
    %1313 = torch.aten.pow.Tensor_Scalar %1312, %int2_1757 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_1758 = torch.constant.int -1
    %1314 = torch.prim.ListConstruct %int-1_1758 : (!torch.int) -> !torch.list<int>
    %true_1759 = torch.constant.bool true
    %none_1760 = torch.constant.none
    %1315 = torch.aten.mean.dim %1313, %1314, %true_1759, %none_1760 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_1761 = torch.constant.float 9.9999999999999995E-7
    %int1_1762 = torch.constant.int 1
    %1316 = torch.aten.add.Scalar %1315, %float9.999990e-07_1761, %int1_1762 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %1317 = torch.aten.rsqrt %1316 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %1318 = torch.aten.mul.Tensor %1312, %1317 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Back to f16 (dtype code 5), then multiply by the [128] scale, which broadcasts
    // over the last axis.
    %int5_1763 = torch.constant.int 5
    %1319 = torch.prims.convert_element_type %1318, %int5_1763 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.3.img_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.3.img_attn.norm.key_norm.scale : tensor<128xf16>
    %1320 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.img_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %1321 = torch.aten.mul.Tensor %1319, %1320 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // Element-type conversions of the normalized img q (%1306) and k (%1321) with
    // dtype code 5. Operand and result types are both f16, so the element type does
    // not change; the results (%1322, %1323) are what the later concatenations use.
    %int5_1764 = torch.constant.int 5
    %1322 = torch.prims.convert_element_type %1306, %int5_1764 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_1765 = torch.constant.int 5
    %1323 = torch.prims.convert_element_type %1321, %int5_1765 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Normalize %1222 ([1,512,3072]) over its last axis using f32 statistics, then
    // apply (1 + %1260) * normalized + %1259.
    // NOTE(review): %1222 is presumably the txt stream and %1260 / %1259 the
    // modulation scale / shift; all three are defined above this region -- confirm.
    %int6_1766 = torch.constant.int 6
    %1324 = torch.prims.convert_element_type %1222, %int6_1766 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // var_mean over dim 2 with correction 0 and keepdim=true; result0 is used as the
    // variance and result1 as the mean below.
    %int2_1767 = torch.constant.int 2
    %1325 = torch.prim.ListConstruct %int2_1767 : (!torch.int) -> !torch.list<int>
    %int0_1768 = torch.constant.int 0
    %true_1769 = torch.constant.bool true
    %result0_1770, %result1_1771 = torch.aten.var_mean.correction %1324, %1325, %int0_1768, %true_1769 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(var + ~1e-6)
    %float9.999990e-07_1772 = torch.constant.float 9.9999999999999995E-7
    %int1_1773 = torch.constant.int 1
    %1326 = torch.aten.add.Scalar %result0_1770, %float9.999990e-07_1772, %int1_1773 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %1327 = torch.aten.rsqrt %1326 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // (x - mean) * rsqrt(...). The subtraction takes the f16 input %1222 directly and
    // an f32 mean; the result type is f32.
    %int1_1774 = torch.constant.int 1
    %1328 = torch.aten.sub.Tensor %1222, %result1_1771, %int1_1774 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %1329 = torch.aten.mul.Tensor %1328, %1327 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    // Back to f16 (dtype code 5).
    %int5_1775 = torch.constant.int 5
    %1330 = torch.prims.convert_element_type %1329, %int5_1775 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // (%1260 + 1) * normalized, with the [1,1,3072] operand broadcast over the 512 axis.
    %int1_1776 = torch.constant.int 1
    %int1_1777 = torch.constant.int 1
    %1331 = torch.aten.add.Scalar %1260, %int1_1776, %int1_1777 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %1332 = torch.aten.mul.Tensor %1331, %1330 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    // + %1259 (alpha = 1), also broadcast from [1,1,3072].
    %int1_1778 = torch.constant.int 1
    %1333 = torch.aten.add.Tensor %1332, %1259, %int1_1778 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // txt QKV projection: flatten %1333 to [512,3072], multiply by the transposed
    // txt_attn.qkv weight and add the bias, all in f32, then cast the result to f16.
    %int512_1779 = torch.constant.int 512
    %int3072_1780 = torch.constant.int 3072
    %1334 = torch.prim.ListConstruct %int512_1779, %int3072_1780 : (!torch.int, !torch.int) -> !torch.list<int>
    %1335 = torch.aten.view %1333, %1334 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // The weight is stored as [9216,3072]; transpose to [3072,9216] for the matmul.
    %__auto.sampler.double_blocks.3.txt_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.3.txt_attn.qkv.weight : tensor<9216x3072xf16>
    %1336 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.txt_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_1781 = torch.constant.int 0
    %int1_1782 = torch.constant.int 1
    %1337 = torch.aten.transpose.int %1336, %int0_1781, %int1_1782 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.3.txt_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.3.txt_attn.qkv.bias : tensor<9216xf16>
    %1338 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.txt_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, activations and weight to f32 (dtype code 6) before the matmul.
    %int6_1783 = torch.constant.int 6
    %1339 = torch.prims.convert_element_type %1338, %int6_1783 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_1784 = torch.constant.int 6
    %1340 = torch.prims.convert_element_type %1335, %int6_1784 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_1785 = torch.constant.int 6
    %1341 = torch.prims.convert_element_type %1337, %int6_1785 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %1342 = torch.aten.mm %1340, %1341 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[512,9216],f32>
    // Both the product and the bias are multiplied by the scalar 1 before being added.
    // NOTE(review): looks like a decomposed addmm with alpha = beta = 1 -- confirm.
    %int1_1786 = torch.constant.int 1
    %1343 = torch.aten.mul.Scalar %1342, %int1_1786 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int1_1787 = torch.constant.int 1
    %1344 = torch.aten.mul.Scalar %1339, %int1_1787 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_1788 = torch.constant.int 1
    %1345 = torch.aten.add.Tensor %1343, %1344, %int1_1788 : !torch.vtensor<[512,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    // Downcast the projection result to f16 (dtype code 5).
    %int5_1789 = torch.constant.int 5
    %1346 = torch.prims.convert_element_type %1345, %int5_1789 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f16>
    // Reshape the txt QKV projection result %1346 from [512,9216] to [1,512,9216],
    // then round-trip it through a builtin tensor so flow.tensor.trace can emit it
    // under the key "txt_qkv".
    %int1_1790 = torch.constant.int 1
    %int512_1791 = torch.constant.int 512
    %int9216_1792 = torch.constant.int 9216
    %1347 = torch.prim.ListConstruct %int1_1790, %int512_1791, %int9216_1792 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1348 = torch.aten.view %1346, %1347 : !torch.vtensor<[512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,9216],f16>
    %1349 = torch_c.to_builtin_tensor %1348 : !torch.vtensor<[1,512,9216],f16> -> tensor<1x512x9216xf16>
    // The trace op is given a fully dynamic tensor type plus its three dim values.
    %cast_1793 = tensor.cast %1349 : tensor<1x512x9216xf16> to tensor<?x?x?xf16>
    %c0_1794 = arith.constant 0 : index
    %dim_1795 = tensor.dim %cast_1793, %c0_1794 : tensor<?x?x?xf16>
    %c1_1796 = arith.constant 1 : index
    %dim_1797 = tensor.dim %cast_1793, %c1_1796 : tensor<?x?x?xf16>
    %c2_1798 = arith.constant 2 : index
    %dim_1799 = tensor.dim %cast_1793, %c2_1798 : tensor<?x?x?xf16>
    flow.tensor.trace "txt_qkv" = [%cast_1793 : tensor<?x?x?xf16>{%dim_1795, %dim_1797, %dim_1799}]
    // Cast back to the static shape and re-enter the torch dialect; %1350 is the value
    // the later txt views in this region read from.
    %cast_1800 = tensor.cast %cast_1793 : tensor<?x?x?xf16> to tensor<1x512x9216xf16>
    %1350 = torch_c.from_builtin_tensor %cast_1800 : tensor<1x512x9216xf16> -> !torch.vtensor<[1,512,9216],f16>
    // txt query: split %1350 into [1,512,3,24,128], permute to [3,1,24,512,128],
    // take slice 0 of the leading size-3 axis, normalize it over the last axis in f32,
    // and multiply by the txt_attn query_norm scale.
    %int1_1801 = torch.constant.int 1
    %int512_1802 = torch.constant.int 512
    %int3_1803 = torch.constant.int 3
    %int24_1804 = torch.constant.int 24
    %int128_1805 = torch.constant.int 128
    %1351 = torch.prim.ListConstruct %int1_1801, %int512_1802, %int3_1803, %int24_1804, %int128_1805 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1352 = torch.aten.view %1350, %1351 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    // Permutation (2,0,3,1,4) moves the size-3 axis to the front.
    %int2_1806 = torch.constant.int 2
    %int0_1807 = torch.constant.int 0
    %int3_1808 = torch.constant.int 3
    %int1_1809 = torch.constant.int 1
    %int4_1810 = torch.constant.int 4
    %1353 = torch.prim.ListConstruct %int2_1806, %int0_1807, %int3_1808, %int1_1809, %int4_1810 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1354 = torch.aten.permute %1352, %1353 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // select(dim=0, index=0): first of the three slices.
    %int0_1811 = torch.constant.int 0
    %int0_1812 = torch.constant.int 0
    %1355 = torch.aten.select.int %1354, %int0_1811, %int0_1812 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Upcast f16 -> f32 (dtype code 6) for the normalization arithmetic.
    %int6_1813 = torch.constant.int 6
    %1356 = torch.prims.convert_element_type %1355, %int6_1813 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    // x * rsqrt(mean(x^2, dim=-1, keepdim=true) + ~1e-6): RMS-style normalization.
    %int2_1814 = torch.constant.int 2
    %1357 = torch.aten.pow.Tensor_Scalar %1356, %int2_1814 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_1815 = torch.constant.int -1
    %1358 = torch.prim.ListConstruct %int-1_1815 : (!torch.int) -> !torch.list<int>
    %true_1816 = torch.constant.bool true
    %none_1817 = torch.constant.none
    %1359 = torch.aten.mean.dim %1357, %1358, %true_1816, %none_1817 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_1818 = torch.constant.float 9.9999999999999995E-7
    %int1_1819 = torch.constant.int 1
    %1360 = torch.aten.add.Scalar %1359, %float9.999990e-07_1818, %int1_1819 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %1361 = torch.aten.rsqrt %1360 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %1362 = torch.aten.mul.Tensor %1356, %1361 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    // Back to f16 (dtype code 5), then multiply by the [128] scale, which broadcasts
    // over the last axis.
    %int5_1820 = torch.constant.int 5
    %1363 = torch.prims.convert_element_type %1362, %int5_1820 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.3.txt_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.3.txt_attn.norm.query_norm.scale : tensor<128xf16>
    %1364 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.txt_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %1365 = torch.aten.mul.Tensor %1363, %1364 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // txt key: same view/permute of %1350 as used for the query, but slice 1 of the
    // leading size-3 axis is selected, normalized over the last axis in f32, and
    // multiplied by the txt_attn key_norm scale.
    %int1_1821 = torch.constant.int 1
    %int512_1822 = torch.constant.int 512
    %int3_1823 = torch.constant.int 3
    %int24_1824 = torch.constant.int 24
    %int128_1825 = torch.constant.int 128
    %1366 = torch.prim.ListConstruct %int1_1821, %int512_1822, %int3_1823, %int24_1824, %int128_1825 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1367 = torch.aten.view %1350, %1366 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    // Permutation (2,0,3,1,4) moves the size-3 axis to the front.
    %int2_1826 = torch.constant.int 2
    %int0_1827 = torch.constant.int 0
    %int3_1828 = torch.constant.int 3
    %int1_1829 = torch.constant.int 1
    %int4_1830 = torch.constant.int 4
    %1368 = torch.prim.ListConstruct %int2_1826, %int0_1827, %int3_1828, %int1_1829, %int4_1830 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1369 = torch.aten.permute %1367, %1368 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // select(dim=0, index=1): second of the three slices.
    %int0_1831 = torch.constant.int 0
    %int1_1832 = torch.constant.int 1
    %1370 = torch.aten.select.int %1369, %int0_1831, %int1_1832 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Upcast f16 -> f32 (dtype code 6) for the normalization arithmetic.
    %int6_1833 = torch.constant.int 6
    %1371 = torch.prims.convert_element_type %1370, %int6_1833 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    // x * rsqrt(mean(x^2, dim=-1, keepdim=true) + ~1e-6): RMS-style normalization.
    %int2_1834 = torch.constant.int 2
    %1372 = torch.aten.pow.Tensor_Scalar %1371, %int2_1834 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_1835 = torch.constant.int -1
    %1373 = torch.prim.ListConstruct %int-1_1835 : (!torch.int) -> !torch.list<int>
    %true_1836 = torch.constant.bool true
    %none_1837 = torch.constant.none
    %1374 = torch.aten.mean.dim %1372, %1373, %true_1836, %none_1837 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_1838 = torch.constant.float 9.9999999999999995E-7
    %int1_1839 = torch.constant.int 1
    %1375 = torch.aten.add.Scalar %1374, %float9.999990e-07_1838, %int1_1839 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %1376 = torch.aten.rsqrt %1375 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %1377 = torch.aten.mul.Tensor %1371, %1376 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    // Back to f16 (dtype code 5), then multiply by the [128] scale, which broadcasts
    // over the last axis.
    %int5_1840 = torch.constant.int 5
    %1378 = torch.prims.convert_element_type %1377, %int5_1840 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.3.txt_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.3.txt_attn.norm.key_norm.scale : tensor<128xf16>
    %1379 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.txt_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %1380 = torch.aten.mul.Tensor %1378, %1379 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // Element-type conversions of the normalized txt q (%1365) and k (%1380) with
    // dtype code 5; operand and result types are both f16.
    %int5_1841 = torch.constant.int 5
    %1381 = torch.prims.convert_element_type %1365, %int5_1841 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int5_1842 = torch.constant.int 5
    %1382 = torch.prims.convert_element_type %1380, %int5_1842 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Joint q: concatenate txt (512) followed by img (4096) along dim 2 -> 4608.
    %1383 = torch.prim.ListConstruct %1381, %1322 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_1843 = torch.constant.int 2
    %1384 = torch.aten.cat %1383, %int2_1843 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Joint k: same txt-then-img ordering along dim 2.
    %1385 = torch.prim.ListConstruct %1382, %1323 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_1844 = torch.constant.int 2
    %1386 = torch.aten.cat %1385, %int2_1844 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Values: take slice 2 of the leading size-3 axis from both the txt (%1350) and
    // img (%1291) tensors and concatenate them txt-first along dim 2. No
    // normalization is applied on this path, unlike the q and k slices.
    // txt: [1,512,9216] -> [1,512,3,24,128] -> [3,1,24,512,128] -> select index 2.
    %int1_1845 = torch.constant.int 1
    %int512_1846 = torch.constant.int 512
    %int3_1847 = torch.constant.int 3
    %int24_1848 = torch.constant.int 24
    %int128_1849 = torch.constant.int 128
    %1387 = torch.prim.ListConstruct %int1_1845, %int512_1846, %int3_1847, %int24_1848, %int128_1849 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1388 = torch.aten.view %1350, %1387 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_1850 = torch.constant.int 2
    %int0_1851 = torch.constant.int 0
    %int3_1852 = torch.constant.int 3
    %int1_1853 = torch.constant.int 1
    %int4_1854 = torch.constant.int 4
    %1389 = torch.prim.ListConstruct %int2_1850, %int0_1851, %int3_1852, %int1_1853, %int4_1854 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1390 = torch.aten.permute %1388, %1389 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    %int0_1855 = torch.constant.int 0
    %int2_1856 = torch.constant.int 2
    %1391 = torch.aten.select.int %1390, %int0_1855, %int2_1856 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // img: [1,4096,9216] -> [1,4096,3,24,128] -> [3,1,24,4096,128] -> select index 2.
    %int1_1857 = torch.constant.int 1
    %int4096_1858 = torch.constant.int 4096
    %int3_1859 = torch.constant.int 3
    %int24_1860 = torch.constant.int 24
    %int128_1861 = torch.constant.int 128
    %1392 = torch.prim.ListConstruct %int1_1857, %int4096_1858, %int3_1859, %int24_1860, %int128_1861 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1393 = torch.aten.view %1291, %1392 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_1862 = torch.constant.int 2
    %int0_1863 = torch.constant.int 0
    %int3_1864 = torch.constant.int 3
    %int1_1865 = torch.constant.int 1
    %int4_1866 = torch.constant.int 4
    %1394 = torch.prim.ListConstruct %int2_1862, %int0_1863, %int3_1864, %int1_1865, %int4_1866 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1395 = torch.aten.permute %1393, %1394 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    %int0_1867 = torch.constant.int 0
    %int2_1868 = torch.constant.int 2
    %1396 = torch.aten.select.int %1395, %int0_1867, %int2_1868 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Joint v: txt (512) followed by img (4096) along dim 2 -> 4608.
    %1397 = torch.prim.ListConstruct %1391, %1396 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_1869 = torch.constant.int 2
    %1398 = torch.aten.cat %1397, %int2_1869 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Emit the concatenated q (%1384), k (%1386) and v (%1398) tensors through
    // flow.tensor.trace under the keys "q", "k" and "v". Each one is converted to a
    // builtin tensor, cast to a fully dynamic type for the trace op (which also takes
    // the four dim values), then cast back to the static shape and returned to the
    // torch dialect as %1400, %1402 and %1404 respectively.
    // --- q ---
    %1399 = torch_c.to_builtin_tensor %1384 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_1870 = tensor.cast %1399 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_1871 = arith.constant 0 : index
    %dim_1872 = tensor.dim %cast_1870, %c0_1871 : tensor<?x?x?x?xf16>
    %c1_1873 = arith.constant 1 : index
    %dim_1874 = tensor.dim %cast_1870, %c1_1873 : tensor<?x?x?x?xf16>
    %c2_1875 = arith.constant 2 : index
    %dim_1876 = tensor.dim %cast_1870, %c2_1875 : tensor<?x?x?x?xf16>
    %c3_1877 = arith.constant 3 : index
    %dim_1878 = tensor.dim %cast_1870, %c3_1877 : tensor<?x?x?x?xf16>
    flow.tensor.trace "q" = [%cast_1870 : tensor<?x?x?x?xf16>{%dim_1872, %dim_1874, %dim_1876, %dim_1878}]
    %cast_1879 = tensor.cast %cast_1870 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %1400 = torch_c.from_builtin_tensor %cast_1879 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // --- k ---
    %1401 = torch_c.to_builtin_tensor %1386 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_1880 = tensor.cast %1401 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_1881 = arith.constant 0 : index
    %dim_1882 = tensor.dim %cast_1880, %c0_1881 : tensor<?x?x?x?xf16>
    %c1_1883 = arith.constant 1 : index
    %dim_1884 = tensor.dim %cast_1880, %c1_1883 : tensor<?x?x?x?xf16>
    %c2_1885 = arith.constant 2 : index
    %dim_1886 = tensor.dim %cast_1880, %c2_1885 : tensor<?x?x?x?xf16>
    %c3_1887 = arith.constant 3 : index
    %dim_1888 = tensor.dim %cast_1880, %c3_1887 : tensor<?x?x?x?xf16>
    flow.tensor.trace "k" = [%cast_1880 : tensor<?x?x?x?xf16>{%dim_1882, %dim_1884, %dim_1886, %dim_1888}]
    %cast_1889 = tensor.cast %cast_1880 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %1402 = torch_c.from_builtin_tensor %cast_1889 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // --- v ---
    %1403 = torch_c.to_builtin_tensor %1398 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_1890 = tensor.cast %1403 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_1891 = arith.constant 0 : index
    %dim_1892 = tensor.dim %cast_1890, %c0_1891 : tensor<?x?x?x?xf16>
    %c1_1893 = arith.constant 1 : index
    %dim_1894 = tensor.dim %cast_1890, %c1_1893 : tensor<?x?x?x?xf16>
    %c2_1895 = arith.constant 2 : index
    %dim_1896 = tensor.dim %cast_1890, %c2_1895 : tensor<?x?x?x?xf16>
    %c3_1897 = arith.constant 3 : index
    %dim_1898 = tensor.dim %cast_1890, %c3_1897 : tensor<?x?x?x?xf16>
    flow.tensor.trace "v" = [%cast_1890 : tensor<?x?x?x?xf16>{%dim_1892, %dim_1894, %dim_1896, %dim_1898}]
    %cast_1899 = tensor.cast %cast_1890 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %1404 = torch_c.from_builtin_tensor %cast_1899 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Upcast q (%1400) and k (%1402) to f32 (dtype code 6) and view each as
    // [1,24,4608,64,1,2]: the last axis of 128 is split into 64 groups of 2, with a
    // singleton axis inserted before the pair axis. The -1 in the view list resolves
    // to 64, as the result type shows.
    // --- q ---
    %int6_1900 = torch.constant.int 6
    %1405 = torch.prims.convert_element_type %1400, %int6_1900 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_1901 = torch.constant.int 1
    %int24_1902 = torch.constant.int 24
    %int4608_1903 = torch.constant.int 4608
    %int-1_1904 = torch.constant.int -1
    %int1_1905 = torch.constant.int 1
    %int2_1906 = torch.constant.int 2
    %1406 = torch.prim.ListConstruct %int1_1901, %int24_1902, %int4608_1903, %int-1_1904, %int1_1905, %int2_1906 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1407 = torch.aten.view %1405, %1406 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // --- k ---
    %int6_1907 = torch.constant.int 6
    %1408 = torch.prims.convert_element_type %1402, %int6_1907 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_1908 = torch.constant.int 1
    %int24_1909 = torch.constant.int 24
    %int4608_1910 = torch.constant.int 4608
    %int-1_1911 = torch.constant.int -1
    %int1_1912 = torch.constant.int 1
    %int2_1913 = torch.constant.int 2
    %1409 = torch.prim.ListConstruct %int1_1908, %int24_1909, %int4608_1910, %int-1_1911, %int1_1912, %int2_1913 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1410 = torch.aten.view %1408, %1409 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Combine %211 ([1,1,4608,64,2,2], f32) with the pair-split q (%1407) and k (%1410):
    //   out = %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]
    // where [..., i] is select along dim 5. The %211 slices are [1,1,4608,64,2] and the
    // x slices are [1,24,4608,64,1]; the multiply broadcasts them to [1,24,4608,64,2].
    // NOTE(review): %211 is presumably the rotary position-embedding table, making this
    // a per-pair 2x2 rotation; %211 is defined outside this region -- confirm.
    // --- q: %1417 ---
    %int5_1914 = torch.constant.int 5
    %int0_1915 = torch.constant.int 0
    %1411 = torch.aten.select.int %211, %int5_1914, %int0_1915 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_1916 = torch.constant.int 5
    %int0_1917 = torch.constant.int 0
    %1412 = torch.aten.select.int %1407, %int5_1916, %int0_1917 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %1413 = torch.aten.mul.Tensor %1411, %1412 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_1918 = torch.constant.int 5
    %int1_1919 = torch.constant.int 1
    %1414 = torch.aten.select.int %211, %int5_1918, %int1_1919 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_1920 = torch.constant.int 5
    %int1_1921 = torch.constant.int 1
    %1415 = torch.aten.select.int %1407, %int5_1920, %int1_1921 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %1416 = torch.aten.mul.Tensor %1414, %1415 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_1922 = torch.constant.int 1
    %1417 = torch.aten.add.Tensor %1413, %1416, %int1_1922 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // --- k: %1424 (same computation, reading %1410 instead of %1407) ---
    %int5_1923 = torch.constant.int 5
    %int0_1924 = torch.constant.int 0
    %1418 = torch.aten.select.int %211, %int5_1923, %int0_1924 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_1925 = torch.constant.int 5
    %int0_1926 = torch.constant.int 0
    %1419 = torch.aten.select.int %1410, %int5_1925, %int0_1926 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %1420 = torch.aten.mul.Tensor %1418, %1419 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_1927 = torch.constant.int 5
    %int1_1928 = torch.constant.int 1
    %1421 = torch.aten.select.int %211, %int5_1927, %int1_1928 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_1929 = torch.constant.int 5
    %int1_1930 = torch.constant.int 1
    %1422 = torch.aten.select.int %1410, %int5_1929, %int1_1930 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %1423 = torch.aten.mul.Tensor %1421, %1422 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_1931 = torch.constant.int 1
    %1424 = torch.aten.add.Tensor %1420, %1423, %int1_1931 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Collapse the [64,2] pair axes of %1417 (q) and %1424 (k) back into a single
    // axis of 128 and downcast f32 -> f16 (dtype code 5). The results %1427 and %1430
    // are the q and k operands of the attention op that follows.
    // --- q ---
    %int1_1932 = torch.constant.int 1
    %int24_1933 = torch.constant.int 24
    %int4608_1934 = torch.constant.int 4608
    %int128_1935 = torch.constant.int 128
    %1425 = torch.prim.ListConstruct %int1_1932, %int24_1933, %int4608_1934, %int128_1935 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1426 = torch.aten.view %1417, %1425 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_1936 = torch.constant.int 5
    %1427 = torch.prims.convert_element_type %1426, %int5_1936 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // --- k ---
    %int1_1937 = torch.constant.int 1
    %int24_1938 = torch.constant.int 24
    %int4608_1939 = torch.constant.int 4608
    %int128_1940 = torch.constant.int 128
    %1428 = torch.prim.ListConstruct %int1_1937, %int24_1938, %int4608_1939, %int128_1940 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1429 = torch.aten.view %1424, %1428 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_1941 = torch.constant.int 5
    %1430 = torch.prims.convert_element_type %1429, %int5_1941 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled dot-product attention over the joint sequence of 4608 positions.
    // Operands: q = %1427, k = %1430, v = %1404 (v comes straight from the traced
    // value; it does not go through the %211 combination applied to q and k).
    // The remaining operands are 0.0, false, none, none.
    // NOTE(review): per the aten schema these are presumably dropout_p, is_causal,
    // attn_mask and scale -- confirm against the PyTorch op definition.
    // Result #0 is the attention output [1,24,4608,128] f16; result #1 is an f32
    // [1,24,4608] tensor (presumably the logsumexp -- TODO confirm).
    %float0.000000e00_1942 = torch.constant.float 0.000000e+00
    %false_1943 = torch.constant.bool false
    %none_1944 = torch.constant.none
    %none_1945 = torch.constant.none
    %1431:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%1427, %1430, %1404, %float0.000000e00_1942, %false_1943, %none_1944, %none_1945) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Merge heads: permute the attention output from [1,24,4608,128] to
    // [1,4608,24,128] (permutation 0,2,1,3), then view it as [1,4608,3072]
    // (24 * 128 = 3072).
    %int0_1946 = torch.constant.int 0
    %int2_1947 = torch.constant.int 2
    %int1_1948 = torch.constant.int 1
    %int3_1949 = torch.constant.int 3
    %1432 = torch.prim.ListConstruct %int0_1946, %int2_1947, %int1_1948, %int3_1949 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1433 = torch.aten.permute %1431#0, %1432 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_1950 = torch.constant.int 1
    %int4608_1951 = torch.constant.int 4608
    %int3072_1952 = torch.constant.int 3072
    %1434 = torch.prim.ListConstruct %int1_1950, %int4608_1951, %int3072_1952 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1435 = torch.aten.view %1433, %1434 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    %int0_1953 = torch.constant.int 0
    %int0_1954 = torch.constant.int 0
    %int9223372036854775807_1955 = torch.constant.int 9223372036854775807
    %int1_1956 = torch.constant.int 1
    %1436 = torch.aten.slice.Tensor %1435, %int0_1953, %int0_1954, %int9223372036854775807_1955, %int1_1956 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_1957 = torch.constant.int 1
    %int0_1958 = torch.constant.int 0
    %int512_1959 = torch.constant.int 512
    %int1_1960 = torch.constant.int 1
    %1437 = torch.aten.slice.Tensor %1436, %int1_1957, %int0_1958, %int512_1959, %int1_1960 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %int0_1961 = torch.constant.int 0
    %int0_1962 = torch.constant.int 0
    %int9223372036854775807_1963 = torch.constant.int 9223372036854775807
    %int1_1964 = torch.constant.int 1
    %1438 = torch.aten.slice.Tensor %1435, %int0_1961, %int0_1962, %int9223372036854775807_1963, %int1_1964 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_1965 = torch.constant.int 1
    %int512_1966 = torch.constant.int 512
    %int9223372036854775807_1967 = torch.constant.int 9223372036854775807
    %int1_1968 = torch.constant.int 1
    %1439 = torch.aten.slice.Tensor %1438, %int1_1965, %int512_1966, %int9223372036854775807_1967, %int1_1968 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer using double_blocks.3.img_attn.proj.{weight,bias}, followed by a
    // scaled residual add.
    // The input is flattened to [4096,3072]; weight, bias and input are cast to f32
    // (dtype code 6), multiplied against the transposed weight, bias is added, and the
    // result is cast back to f16 (dtype code 5) and reshaped to [1,4096,3072].
    %int4096_1969 = torch.constant.int 4096
    %int3072_1970 = torch.constant.int 3072
    %1440 = torch.prim.ListConstruct %int4096_1969, %int3072_1970 : (!torch.int, !torch.int) -> !torch.list<int>
    %1441 = torch.aten.view %1439, %1440 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.3.img_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.3.img_attn.proj.weight : tensor<3072x3072xf16>
    %1442 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.img_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_1971 = torch.constant.int 0
    %int1_1972 = torch.constant.int 1
    %1443 = torch.aten.transpose.int %1442, %int0_1971, %int1_1972 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.3.img_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.3.img_attn.proj.bias : tensor<3072xf16>
    %1444 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.img_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_1973 = torch.constant.int 6
    %1445 = torch.prims.convert_element_type %1444, %int6_1973 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_1974 = torch.constant.int 6
    %1446 = torch.prims.convert_element_type %1441, %int6_1974 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_1975 = torch.constant.int 6
    %1447 = torch.prims.convert_element_type %1443, %int6_1975 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %1448 = torch.aten.mm %1446, %1447 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Both products below multiply by the integer constant 1.
    // NOTE(review): presumably alpha/beta left over from a decomposed addmm — confirm.
    %int1_1976 = torch.constant.int 1
    %1449 = torch.aten.mul.Scalar %1448, %int1_1976 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_1977 = torch.constant.int 1
    %1450 = torch.aten.mul.Scalar %1445, %int1_1977 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_1978 = torch.constant.int 1
    %1451 = torch.aten.add.Tensor %1449, %1450, %int1_1978 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_1979 = torch.constant.int 5
    %1452 = torch.prims.convert_element_type %1451, %int5_1979 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_1980 = torch.constant.int 1
    %int4096_1981 = torch.constant.int 4096
    %int3072_1982 = torch.constant.int 3072
    %1453 = torch.prim.ListConstruct %int1_1980, %int4096_1981, %int3072_1982 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1454 = torch.aten.view %1452, %1453 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // %1456 = %1162 + %1240 * proj_out, with %1240 ([1,1,3072]) broadcast over dim 1.
    // NOTE(review): %1240 and %1162 are defined before this excerpt; presumably a
    // modulation gate and the incoming image-stream activations — confirm upstream.
    %1455 = torch.aten.mul.Tensor %1240, %1454 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_1983 = torch.constant.int 1
    %1456 = torch.aten.add.Tensor %1162, %1455, %int1_1983 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Normalize %1456 over its last dim and apply an elementwise scale and offset:
    //   %1466 = (1 + %1242) * ((%1456 - mean) * rsqrt(var + eps)) + %1241
    // Statistics are computed in f32; the normalized tensor is cast back to f16 before
    // the scale/offset step. No learned weight or bias is applied in the normalization.
    // NOTE(review): %1241 and %1242 are defined before this excerpt; presumably the
    // shift and scale outputs of the block's img_mod — confirm upstream.
    %int1_1984 = torch.constant.int 1
    %int1_1985 = torch.constant.int 1
    %1457 = torch.aten.add.Scalar %1242, %int1_1984, %int1_1985 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int6_1986 = torch.constant.int 6
    %1458 = torch.prims.convert_element_type %1456, %int6_1986 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // var_mean over dim 2 with correction 0 and keepdim = true; result0 is the
    // variance and result1 the mean (see how each is used below).
    %int2_1987 = torch.constant.int 2
    %1459 = torch.prim.ListConstruct %int2_1987 : (!torch.int) -> !torch.list<int>
    %int0_1988 = torch.constant.int 0
    %true_1989 = torch.constant.bool true
    %result0_1990, %result1_1991 = torch.aten.var_mean.correction %1458, %1459, %int0_1988, %true_1989 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // Epsilon (about 1e-6) is added to the variance before rsqrt.
    %float9.999990e-07_1992 = torch.constant.float 9.9999999999999995E-7
    %int1_1993 = torch.constant.int 1
    %1460 = torch.aten.add.Scalar %result0_1990, %float9.999990e-07_1992, %int1_1993 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %1461 = torch.aten.rsqrt %1460 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // The f16 input minus the f32 mean yields an f32 result.
    %int1_1994 = torch.constant.int 1
    %1462 = torch.aten.sub.Tensor %1456, %result1_1991, %int1_1994 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %1463 = torch.aten.mul.Tensor %1462, %1461 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    %int5_1995 = torch.constant.int 5
    %1464 = torch.prims.convert_element_type %1463, %int5_1995 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %1465 = torch.aten.mul.Tensor %1457, %1464 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_1996 = torch.constant.int 1
    %1466 = torch.aten.add.Tensor %1465, %1241, %int1_1996 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Two-layer MLP using double_blocks.3.img_mlp.{0,2} parameters, followed by a
    // scaled residual add:
    //   linear 3072 -> 12288, GELU ("tanh" approximation), linear 12288 -> 3072.
    // Each linear casts input, transposed weight and bias to f32 (dtype code 6), does
    // mm + bias, and casts the result back to f16 (dtype code 5).
    %int4096_1997 = torch.constant.int 4096
    %int3072_1998 = torch.constant.int 3072
    %1467 = torch.prim.ListConstruct %int4096_1997, %int3072_1998 : (!torch.int, !torch.int) -> !torch.list<int>
    %1468 = torch.aten.view %1466, %1467 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // First linear: img_mlp.0, weight [12288,3072] transposed to [3072,12288].
    %__auto.sampler.double_blocks.3.img_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.3.img_mlp.0.weight : tensor<12288x3072xf16>
    %1469 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.img_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_1999 = torch.constant.int 0
    %int1_2000 = torch.constant.int 1
    %1470 = torch.aten.transpose.int %1469, %int0_1999, %int1_2000 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.3.img_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.3.img_mlp.0.bias : tensor<12288xf16>
    %1471 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.img_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    %int6_2001 = torch.constant.int 6
    %1472 = torch.prims.convert_element_type %1471, %int6_2001 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_2002 = torch.constant.int 6
    %1473 = torch.prims.convert_element_type %1468, %int6_2002 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_2003 = torch.constant.int 6
    %1474 = torch.prims.convert_element_type %1470, %int6_2003 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %1475 = torch.aten.mm %1473, %1474 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[4096,12288],f32>
    // Multiplications by the integer constant 1 precede the bias add.
    %int1_2004 = torch.constant.int 1
    %1476 = torch.aten.mul.Scalar %1475, %int1_2004 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int1_2005 = torch.constant.int 1
    %1477 = torch.aten.mul.Scalar %1472, %int1_2005 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_2006 = torch.constant.int 1
    %1478 = torch.aten.add.Tensor %1476, %1477, %int1_2006 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int5_2007 = torch.constant.int 5
    %1479 = torch.prims.convert_element_type %1478, %int5_2007 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f16>
    %int1_2008 = torch.constant.int 1
    %int4096_2009 = torch.constant.int 4096
    %int12288_2010 = torch.constant.int 12288
    %1480 = torch.prim.ListConstruct %int1_2008, %int4096_2009, %int12288_2010 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1481 = torch.aten.view %1479, %1480 : !torch.vtensor<[4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,12288],f16>
    // GELU with the "tanh" approximation, evaluated on the f16 tensor.
    %str_2011 = torch.constant.str "tanh"
    %1482 = torch.aten.gelu %1481, %str_2011 : !torch.vtensor<[1,4096,12288],f16>, !torch.str -> !torch.vtensor<[1,4096,12288],f16>
    %int4096_2012 = torch.constant.int 4096
    %int12288_2013 = torch.constant.int 12288
    %1483 = torch.prim.ListConstruct %int4096_2012, %int12288_2013 : (!torch.int, !torch.int) -> !torch.list<int>
    %1484 = torch.aten.view %1482, %1483 : !torch.vtensor<[1,4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[4096,12288],f16>
    // Second linear: img_mlp.2, weight [3072,12288] transposed to [12288,3072].
    %__auto.sampler.double_blocks.3.img_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.3.img_mlp.2.weight : tensor<3072x12288xf16>
    %1485 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.img_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_2014 = torch.constant.int 0
    %int1_2015 = torch.constant.int 1
    %1486 = torch.aten.transpose.int %1485, %int0_2014, %int1_2015 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.3.img_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.3.img_mlp.2.bias : tensor<3072xf16>
    %1487 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.img_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_2016 = torch.constant.int 6
    %1488 = torch.prims.convert_element_type %1487, %int6_2016 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_2017 = torch.constant.int 6
    %1489 = torch.prims.convert_element_type %1484, %int6_2017 : !torch.vtensor<[4096,12288],f16>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int6_2018 = torch.constant.int 6
    %1490 = torch.prims.convert_element_type %1486, %int6_2018 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %1491 = torch.aten.mm %1489, %1490 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    %int1_2019 = torch.constant.int 1
    %1492 = torch.aten.mul.Scalar %1491, %int1_2019 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_2020 = torch.constant.int 1
    %1493 = torch.aten.mul.Scalar %1488, %int1_2020 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_2021 = torch.constant.int 1
    %1494 = torch.aten.add.Tensor %1492, %1493, %int1_2021 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_2022 = torch.constant.int 5
    %1495 = torch.prims.convert_element_type %1494, %int5_2022 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_2023 = torch.constant.int 1
    %int4096_2024 = torch.constant.int 4096
    %int3072_2025 = torch.constant.int 3072
    %1496 = torch.prim.ListConstruct %int1_2023, %int4096_2024, %int3072_2025 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1497 = torch.aten.view %1495, %1496 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // %1499 = %1456 + %1243 * mlp_out, with %1243 ([1,1,3072]) broadcast over dim 1.
    // NOTE(review): %1243 is defined before this excerpt; presumably the MLP
    // modulation gate for the image stream — confirm upstream.
    %1498 = torch.aten.mul.Tensor %1243, %1497 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_2026 = torch.constant.int 1
    %1499 = torch.aten.add.Tensor %1456, %1498, %int1_2026 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer using double_blocks.3.txt_attn.proj.{weight,bias}, applied to the
    // 512-row slice %1437 of the attention output, followed by a scaled residual add.
    // The linear is computed in f32 (dtype code 6) and cast back to f16 (dtype code 5).
    %int512_2027 = torch.constant.int 512
    %int3072_2028 = torch.constant.int 3072
    %1500 = torch.prim.ListConstruct %int512_2027, %int3072_2028 : (!torch.int, !torch.int) -> !torch.list<int>
    %1501 = torch.aten.view %1437, %1500 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.3.txt_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.3.txt_attn.proj.weight : tensor<3072x3072xf16>
    %1502 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.txt_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_2029 = torch.constant.int 0
    %int1_2030 = torch.constant.int 1
    %1503 = torch.aten.transpose.int %1502, %int0_2029, %int1_2030 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.3.txt_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.3.txt_attn.proj.bias : tensor<3072xf16>
    %1504 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.txt_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_2031 = torch.constant.int 6
    %1505 = torch.prims.convert_element_type %1504, %int6_2031 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_2032 = torch.constant.int 6
    %1506 = torch.prims.convert_element_type %1501, %int6_2032 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_2033 = torch.constant.int 6
    %1507 = torch.prims.convert_element_type %1503, %int6_2033 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %1508 = torch.aten.mm %1506, %1507 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Multiplications by the integer constant 1 precede the bias add.
    %int1_2034 = torch.constant.int 1
    %1509 = torch.aten.mul.Scalar %1508, %int1_2034 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_2035 = torch.constant.int 1
    %1510 = torch.aten.mul.Scalar %1505, %int1_2035 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_2036 = torch.constant.int 1
    %1511 = torch.aten.add.Tensor %1509, %1510, %int1_2036 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_2037 = torch.constant.int 5
    %1512 = torch.prims.convert_element_type %1511, %int5_2037 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_2038 = torch.constant.int 1
    %int512_2039 = torch.constant.int 512
    %int3072_2040 = torch.constant.int 3072
    %1513 = torch.prim.ListConstruct %int1_2038, %int512_2039, %int3072_2040 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1514 = torch.aten.view %1512, %1513 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // %1516 = %1222 + %1261 * proj_out, with %1261 ([1,1,3072]) broadcast over dim 1.
    // NOTE(review): %1261 and %1222 are defined before this excerpt; presumably a
    // modulation gate and the incoming text-stream activations — confirm upstream.
    %1515 = torch.aten.mul.Tensor %1261, %1514 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_2041 = torch.constant.int 1
    %1516 = torch.aten.add.Tensor %1222, %1515, %int1_2041 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Normalize %1516 over its last dim and apply an elementwise scale and offset:
    //   %1526 = (1 + %1263) * ((%1516 - mean) * rsqrt(var + eps)) + %1262
    // Statistics are computed in f32; the normalized tensor is cast back to f16 before
    // the scale/offset step. No learned weight or bias is applied in the normalization.
    // NOTE(review): %1262 and %1263 are defined before this excerpt; presumably the
    // shift and scale outputs of the block's txt_mod — confirm upstream.
    %int1_2042 = torch.constant.int 1
    %int1_2043 = torch.constant.int 1
    %1517 = torch.aten.add.Scalar %1263, %int1_2042, %int1_2043 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int6_2044 = torch.constant.int 6
    %1518 = torch.prims.convert_element_type %1516, %int6_2044 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // var_mean over dim 2 with correction 0 and keepdim = true; result0 is the
    // variance and result1 the mean (see how each is used below).
    %int2_2045 = torch.constant.int 2
    %1519 = torch.prim.ListConstruct %int2_2045 : (!torch.int) -> !torch.list<int>
    %int0_2046 = torch.constant.int 0
    %true_2047 = torch.constant.bool true
    %result0_2048, %result1_2049 = torch.aten.var_mean.correction %1518, %1519, %int0_2046, %true_2047 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // Epsilon (about 1e-6) is added to the variance before rsqrt.
    %float9.999990e-07_2050 = torch.constant.float 9.9999999999999995E-7
    %int1_2051 = torch.constant.int 1
    %1520 = torch.aten.add.Scalar %result0_2048, %float9.999990e-07_2050, %int1_2051 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %1521 = torch.aten.rsqrt %1520 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // The f16 input minus the f32 mean yields an f32 result.
    %int1_2052 = torch.constant.int 1
    %1522 = torch.aten.sub.Tensor %1516, %result1_2049, %int1_2052 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %1523 = torch.aten.mul.Tensor %1522, %1521 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_2053 = torch.constant.int 5
    %1524 = torch.prims.convert_element_type %1523, %int5_2053 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %1525 = torch.aten.mul.Tensor %1517, %1524 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_2054 = torch.constant.int 1
    %1526 = torch.aten.add.Tensor %1525, %1262, %int1_2054 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Two-layer MLP using double_blocks.3.txt_mlp.{0,2} parameters, followed by a
    // scaled residual add:
    //   linear 3072 -> 12288, GELU ("tanh" approximation), linear 12288 -> 3072.
    // Each linear casts input, transposed weight and bias to f32 (dtype code 6), does
    // mm + bias, and casts the result back to f16 (dtype code 5).
    %int512_2055 = torch.constant.int 512
    %int3072_2056 = torch.constant.int 3072
    %1527 = torch.prim.ListConstruct %int512_2055, %int3072_2056 : (!torch.int, !torch.int) -> !torch.list<int>
    %1528 = torch.aten.view %1526, %1527 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // First linear: txt_mlp.0, weight [12288,3072] transposed to [3072,12288].
    %__auto.sampler.double_blocks.3.txt_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.3.txt_mlp.0.weight : tensor<12288x3072xf16>
    %1529 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.txt_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_2057 = torch.constant.int 0
    %int1_2058 = torch.constant.int 1
    %1530 = torch.aten.transpose.int %1529, %int0_2057, %int1_2058 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.3.txt_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.3.txt_mlp.0.bias : tensor<12288xf16>
    %1531 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.txt_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    %int6_2059 = torch.constant.int 6
    %1532 = torch.prims.convert_element_type %1531, %int6_2059 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_2060 = torch.constant.int 6
    %1533 = torch.prims.convert_element_type %1528, %int6_2060 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_2061 = torch.constant.int 6
    %1534 = torch.prims.convert_element_type %1530, %int6_2061 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %1535 = torch.aten.mm %1533, %1534 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[512,12288],f32>
    // Multiplications by the integer constant 1 precede the bias add.
    %int1_2062 = torch.constant.int 1
    %1536 = torch.aten.mul.Scalar %1535, %int1_2062 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int1_2063 = torch.constant.int 1
    %1537 = torch.aten.mul.Scalar %1532, %int1_2063 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_2064 = torch.constant.int 1
    %1538 = torch.aten.add.Tensor %1536, %1537, %int1_2064 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int5_2065 = torch.constant.int 5
    %1539 = torch.prims.convert_element_type %1538, %int5_2065 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f16>
    %int1_2066 = torch.constant.int 1
    %int512_2067 = torch.constant.int 512
    %int12288_2068 = torch.constant.int 12288
    %1540 = torch.prim.ListConstruct %int1_2066, %int512_2067, %int12288_2068 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1541 = torch.aten.view %1539, %1540 : !torch.vtensor<[512,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,512,12288],f16>
    // GELU with the "tanh" approximation, evaluated on the f16 tensor.
    %str_2069 = torch.constant.str "tanh"
    %1542 = torch.aten.gelu %1541, %str_2069 : !torch.vtensor<[1,512,12288],f16>, !torch.str -> !torch.vtensor<[1,512,12288],f16>
    %int512_2070 = torch.constant.int 512
    %int12288_2071 = torch.constant.int 12288
    %1543 = torch.prim.ListConstruct %int512_2070, %int12288_2071 : (!torch.int, !torch.int) -> !torch.list<int>
    %1544 = torch.aten.view %1542, %1543 : !torch.vtensor<[1,512,12288],f16>, !torch.list<int> -> !torch.vtensor<[512,12288],f16>
    // Second linear: txt_mlp.2, weight [3072,12288] transposed to [12288,3072].
    %__auto.sampler.double_blocks.3.txt_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.3.txt_mlp.2.weight : tensor<3072x12288xf16>
    %1545 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.txt_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_2072 = torch.constant.int 0
    %int1_2073 = torch.constant.int 1
    %1546 = torch.aten.transpose.int %1545, %int0_2072, %int1_2073 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.3.txt_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.3.txt_mlp.2.bias : tensor<3072xf16>
    %1547 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.txt_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_2074 = torch.constant.int 6
    %1548 = torch.prims.convert_element_type %1547, %int6_2074 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_2075 = torch.constant.int 6
    %1549 = torch.prims.convert_element_type %1544, %int6_2075 : !torch.vtensor<[512,12288],f16>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int6_2076 = torch.constant.int 6
    %1550 = torch.prims.convert_element_type %1546, %int6_2076 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %1551 = torch.aten.mm %1549, %1550 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[512,3072],f32>
    %int1_2077 = torch.constant.int 1
    %1552 = torch.aten.mul.Scalar %1551, %int1_2077 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_2078 = torch.constant.int 1
    %1553 = torch.aten.mul.Scalar %1548, %int1_2078 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_2079 = torch.constant.int 1
    %1554 = torch.aten.add.Tensor %1552, %1553, %int1_2079 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_2080 = torch.constant.int 5
    %1555 = torch.prims.convert_element_type %1554, %int5_2080 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_2081 = torch.constant.int 1
    %int512_2082 = torch.constant.int 512
    %int3072_2083 = torch.constant.int 3072
    %1556 = torch.prim.ListConstruct %int1_2081, %int512_2082, %int3072_2083 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1557 = torch.aten.view %1555, %1556 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // %1559 = %1516 + %1264 * mlp_out, with %1264 ([1,1,3072]) broadcast over dim 1.
    // NOTE(review): %1264 is defined before this excerpt; presumably the MLP
    // modulation gate for the text stream — confirm upstream.
    %1558 = torch.aten.mul.Tensor %1264, %1557 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_2084 = torch.constant.int 1
    %1559 = torch.aten.add.Tensor %1516, %1558, %int1_2084 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Computes six [1,1,3072] tensors from %119 using double_blocks.4.img_mod.lin:
    //   silu(%119) -> linear 3072 -> 18432 (f32 compute, f16 result) -> unsqueeze at
    //   dim 1 -> six contiguous 3072-wide slices along the last dim (%1575 .. %1580).
    // NOTE(review): %119 is defined before this excerpt; presumably the shared
    // conditioning vector — confirm upstream.
    %1560 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.double_blocks.4.img_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.4.img_mod.lin.weight : tensor<18432x3072xf16>
    %1561 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.img_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_2085 = torch.constant.int 0
    %int1_2086 = torch.constant.int 1
    %1562 = torch.aten.transpose.int %1561, %int0_2085, %int1_2086 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.4.img_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.4.img_mod.lin.bias : tensor<18432xf16>
    %1563 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.img_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    %int6_2087 = torch.constant.int 6
    %1564 = torch.prims.convert_element_type %1563, %int6_2087 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_2088 = torch.constant.int 6
    %1565 = torch.prims.convert_element_type %1560, %int6_2088 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_2089 = torch.constant.int 6
    %1566 = torch.prims.convert_element_type %1562, %int6_2089 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %1567 = torch.aten.mm %1565, %1566 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Multiplications by the integer constant 1 precede the bias add.
    %int1_2090 = torch.constant.int 1
    %1568 = torch.aten.mul.Scalar %1567, %int1_2090 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_2091 = torch.constant.int 1
    %1569 = torch.aten.mul.Scalar %1564, %int1_2091 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_2092 = torch.constant.int 1
    %1570 = torch.aten.add.Tensor %1568, %1569, %int1_2092 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_2093 = torch.constant.int 5
    %1571 = torch.prims.convert_element_type %1570, %int5_2093 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0, unsqueeze at dim 1, full-range slice on dim 2:
    // the net effect on shape is [1,18432] -> [1,1,18432].
    %int0_2094 = torch.constant.int 0
    %int0_2095 = torch.constant.int 0
    %int9223372036854775807_2096 = torch.constant.int 9223372036854775807
    %int1_2097 = torch.constant.int 1
    %1572 = torch.aten.slice.Tensor %1571, %int0_2094, %int0_2095, %int9223372036854775807_2096, %int1_2097 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    %int1_2098 = torch.constant.int 1
    %1573 = torch.aten.unsqueeze %1572, %int1_2098 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    %int2_2099 = torch.constant.int 2
    %int0_2100 = torch.constant.int 0
    %int9223372036854775807_2101 = torch.constant.int 9223372036854775807
    %int1_2102 = torch.constant.int 1
    %1574 = torch.aten.slice.Tensor %1573, %int2_2099, %int0_2100, %int9223372036854775807_2101, %int1_2102 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Chunk 0: last-dim range [0, 3072).
    %int-1_2103 = torch.constant.int -1
    %int0_2104 = torch.constant.int 0
    %int3072_2105 = torch.constant.int 3072
    %int1_2106 = torch.constant.int 1
    %1575 = torch.aten.slice.Tensor %1574, %int-1_2103, %int0_2104, %int3072_2105, %int1_2106 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1: last-dim range [3072, 6144).
    %int-1_2107 = torch.constant.int -1
    %int3072_2108 = torch.constant.int 3072
    %int6144_2109 = torch.constant.int 6144
    %int1_2110 = torch.constant.int 1
    %1576 = torch.aten.slice.Tensor %1574, %int-1_2107, %int3072_2108, %int6144_2109, %int1_2110 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2: last-dim range [6144, 9216).
    %int-1_2111 = torch.constant.int -1
    %int6144_2112 = torch.constant.int 6144
    %int9216_2113 = torch.constant.int 9216
    %int1_2114 = torch.constant.int 1
    %1577 = torch.aten.slice.Tensor %1574, %int-1_2111, %int6144_2112, %int9216_2113, %int1_2114 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 3: last-dim range [9216, 12288).
    %int-1_2115 = torch.constant.int -1
    %int9216_2116 = torch.constant.int 9216
    %int12288_2117 = torch.constant.int 12288
    %int1_2118 = torch.constant.int 1
    %1578 = torch.aten.slice.Tensor %1574, %int-1_2115, %int9216_2116, %int12288_2117, %int1_2118 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 4: last-dim range [12288, 15360).
    %int-1_2119 = torch.constant.int -1
    %int12288_2120 = torch.constant.int 12288
    %int15360_2121 = torch.constant.int 15360
    %int1_2122 = torch.constant.int 1
    %1579 = torch.aten.slice.Tensor %1574, %int-1_2119, %int12288_2120, %int15360_2121, %int1_2122 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 5: last-dim range [15360, 18432).
    %int-1_2123 = torch.constant.int -1
    %int15360_2124 = torch.constant.int 15360
    %int18432_2125 = torch.constant.int 18432
    %int1_2126 = torch.constant.int 1
    %1580 = torch.aten.slice.Tensor %1574, %int-1_2123, %int15360_2124, %int18432_2125, %int1_2126 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Same silu -> linear 3072 -> 18432 sequence as the img_mod computation above, but
    // using double_blocks.4.txt_mod.lin.{weight,bias}. silu(%119) is computed again here
    // rather than reusing %1560. The linear is evaluated in f32 (dtype code 6), cast back
    // to f16 (dtype code 5), and the [1,18432] result is unsqueezed to [1,1,18432].
    %1581 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.double_blocks.4.txt_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.4.txt_mod.lin.weight : tensor<18432x3072xf16>
    %1582 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.txt_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_2127 = torch.constant.int 0
    %int1_2128 = torch.constant.int 1
    %1583 = torch.aten.transpose.int %1582, %int0_2127, %int1_2128 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.4.txt_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.4.txt_mod.lin.bias : tensor<18432xf16>
    %1584 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.txt_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    %int6_2129 = torch.constant.int 6
    %1585 = torch.prims.convert_element_type %1584, %int6_2129 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_2130 = torch.constant.int 6
    %1586 = torch.prims.convert_element_type %1581, %int6_2130 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_2131 = torch.constant.int 6
    %1587 = torch.prims.convert_element_type %1583, %int6_2131 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %1588 = torch.aten.mm %1586, %1587 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Multiplications by the integer constant 1 precede the bias add.
    %int1_2132 = torch.constant.int 1
    %1589 = torch.aten.mul.Scalar %1588, %int1_2132 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_2133 = torch.constant.int 1
    %1590 = torch.aten.mul.Scalar %1585, %int1_2133 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_2134 = torch.constant.int 1
    %1591 = torch.aten.add.Tensor %1589, %1590, %int1_2134 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_2135 = torch.constant.int 5
    %1592 = torch.prims.convert_element_type %1591, %int5_2135 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (shape unchanged), then unsqueeze at dim 1.
    %int0_2136 = torch.constant.int 0
    %int0_2137 = torch.constant.int 0
    %int9223372036854775807_2138 = torch.constant.int 9223372036854775807
    %int1_2139 = torch.constant.int 1
    %1593 = torch.aten.slice.Tensor %1592, %int0_2136, %int0_2137, %int9223372036854775807_2138, %int1_2139 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    %int1_2140 = torch.constant.int 1
    %1594 = torch.aten.unsqueeze %1593, %int1_2140 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    %int2_2141 = torch.constant.int 2
    %int0_2142 = torch.constant.int 0
    %int9223372036854775807_2143 = torch.constant.int 9223372036854775807
    %int1_2144 = torch.constant.int 1
    %1595 = torch.aten.slice.Tensor %1594, %int2_2141, %int0_2142, %int9223372036854775807_2143, %int1_2144 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    %int-1_2145 = torch.constant.int -1
    %int0_2146 = torch.constant.int 0
    %int3072_2147 = torch.constant.int 3072
    %int1_2148 = torch.constant.int 1
    %1596 = torch.aten.slice.Tensor %1595, %int-1_2145, %int0_2146, %int3072_2147, %int1_2148 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_2149 = torch.constant.int -1
    %int3072_2150 = torch.constant.int 3072
    %int6144_2151 = torch.constant.int 6144
    %int1_2152 = torch.constant.int 1
    %1597 = torch.aten.slice.Tensor %1595, %int-1_2149, %int3072_2150, %int6144_2151, %int1_2152 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_2153 = torch.constant.int -1
    %int6144_2154 = torch.constant.int 6144
    %int9216_2155 = torch.constant.int 9216
    %int1_2156 = torch.constant.int 1
    %1598 = torch.aten.slice.Tensor %1595, %int-1_2153, %int6144_2154, %int9216_2155, %int1_2156 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_2157 = torch.constant.int -1
    %int9216_2158 = torch.constant.int 9216
    %int12288_2159 = torch.constant.int 12288
    %int1_2160 = torch.constant.int 1
    %1599 = torch.aten.slice.Tensor %1595, %int-1_2157, %int9216_2158, %int12288_2159, %int1_2160 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_2161 = torch.constant.int -1
    %int12288_2162 = torch.constant.int 12288
    %int15360_2163 = torch.constant.int 15360
    %int1_2164 = torch.constant.int 1
    %1600 = torch.aten.slice.Tensor %1595, %int-1_2161, %int12288_2162, %int15360_2163, %int1_2164 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_2165 = torch.constant.int -1
    %int15360_2166 = torch.constant.int 15360
    %int18432_2167 = torch.constant.int 18432
    %int1_2168 = torch.constant.int 1
    %1601 = torch.aten.slice.Tensor %1595, %int-1_2165, %int15360_2166, %int18432_2167, %int1_2168 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // --- img stream: normalization over the feature dim followed by modulation ---
    // %1499 (defined above this chunk) is the [1,4096,3072] f16 input; statistics
    // are computed on an f32 copy.
    %int6_2169 = torch.constant.int 6
    %1602 = torch.prims.convert_element_type %1499, %int6_2169 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // var_mean over dim 2 with correction = 0 and keepdim = true:
    // %result0_2173 is the variance, %result1_2174 the mean (see their uses below).
    %int2_2170 = torch.constant.int 2
    %1603 = torch.prim.ListConstruct %int2_2170 : (!torch.int) -> !torch.list<int>
    %int0_2171 = torch.constant.int 0
    %true_2172 = torch.constant.bool true
    %result0_2173, %result1_2174 = torch.aten.var_mean.correction %1602, %1603, %int0_2171, %true_2172 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(variance + eps), eps ~= 1e-6.
    %float9.999990e-07_2175 = torch.constant.float 9.9999999999999995E-7
    %int1_2176 = torch.constant.int 1
    %1604 = torch.aten.add.Scalar %result0_2173, %float9.999990e-07_2175, %int1_2176 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %1605 = torch.aten.rsqrt %1604 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // (x - mean) * rsqrt(var + eps). The subtraction takes the original f16 %1499
    // and the f32 mean and produces an f32 result. No learned scale/bias is
    // applied here.
    %int1_2177 = torch.constant.int 1
    %1606 = torch.aten.sub.Tensor %1499, %result1_2174, %int1_2177 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %1607 = torch.aten.mul.Tensor %1606, %1605 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Back to f16 for the modulation arithmetic.
    %int5_2178 = torch.constant.int 5
    %1608 = torch.prims.convert_element_type %1607, %int5_2178 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Modulate: (1 + %1576) * normalized + %1575, broadcasting [1,1,3072] over the
    // 4096 rows.
    // NOTE(review): %1576 and %1575 are defined above this chunk; presumably the
    // scale and shift slices of %1574 -- confirm.
    %int1_2179 = torch.constant.int 1
    %int1_2180 = torch.constant.int 1
    %1609 = torch.aten.add.Scalar %1576, %int1_2179, %int1_2180 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %1610 = torch.aten.mul.Tensor %1609, %1608 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_2181 = torch.constant.int 1
    %1611 = torch.aten.add.Tensor %1610, %1575, %int1_2181 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Drop the leading unit dim so the result can feed a 2-D matmul.
    %int4096_2182 = torch.constant.int 4096
    %int3072_2183 = torch.constant.int 3072
    %1612 = torch.prim.ListConstruct %int4096_2182, %int3072_2183 : (!torch.int, !torch.int) -> !torch.list<int>
    %1613 = torch.aten.view %1611, %1612 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // --- double_blocks.4.img_attn.qkv: linear projection 3072 -> 9216 on the img stream ---
    // Load the weight and transpose it to [in=3072, out=9216] for the matmul.
    %__auto.sampler.double_blocks.4.img_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.4.img_attn.qkv.weight : tensor<9216x3072xf16>
    %1614 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.img_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_2184 = torch.constant.int 0
    %int1_2185 = torch.constant.int 1
    %1615 = torch.aten.transpose.int %1614, %int0_2184, %int1_2185 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.4.img_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.4.img_attn.qkv.bias : tensor<9216xf16>
    %1616 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.img_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, activation and weight to f32 (dtype code 6); matmul and bias
    // add are carried out in f32.
    %int6_2186 = torch.constant.int 6
    %1617 = torch.prims.convert_element_type %1616, %int6_2186 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_2187 = torch.constant.int 6
    %1618 = torch.prims.convert_element_type %1613, %int6_2187 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_2188 = torch.constant.int 6
    %1619 = torch.prims.convert_element_type %1615, %int6_2188 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %1620 = torch.aten.mm %1618, %1619 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[4096,9216],f32>
    // Both operands are multiplied by the integer constant 1 before being summed;
    // the add also uses alpha = 1.
    %int1_2189 = torch.constant.int 1
    %1621 = torch.aten.mul.Scalar %1620, %int1_2189 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int1_2190 = torch.constant.int 1
    %1622 = torch.aten.mul.Scalar %1617, %int1_2190 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_2191 = torch.constant.int 1
    %1623 = torch.aten.add.Tensor %1621, %1622, %int1_2191 : !torch.vtensor<[4096,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    // Cast back to f16 (dtype code 5) and restore the leading unit dim.
    %int5_2192 = torch.constant.int 5
    %1624 = torch.prims.convert_element_type %1623, %int5_2192 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f16>
    %int1_2193 = torch.constant.int 1
    %int4096_2194 = torch.constant.int 4096
    %int9216_2195 = torch.constant.int 9216
    %1625 = torch.prim.ListConstruct %int1_2193, %int4096_2194, %int9216_2195 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1626 = torch.aten.view %1624, %1625 : !torch.vtensor<[4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,9216],f16>
    // Trace point "img_qkv": leave the torch type system, cast to a fully dynamic
    // shape, query the three dims for the trace op, then cast back to the static
    // shape and re-enter torch as %1628. The data itself is passed through
    // unchanged.
    %1627 = torch_c.to_builtin_tensor %1626 : !torch.vtensor<[1,4096,9216],f16> -> tensor<1x4096x9216xf16>
    %cast_2196 = tensor.cast %1627 : tensor<1x4096x9216xf16> to tensor<?x?x?xf16>
    %c0_2197 = arith.constant 0 : index
    %dim_2198 = tensor.dim %cast_2196, %c0_2197 : tensor<?x?x?xf16>
    %c1_2199 = arith.constant 1 : index
    %dim_2200 = tensor.dim %cast_2196, %c1_2199 : tensor<?x?x?xf16>
    %c2_2201 = arith.constant 2 : index
    %dim_2202 = tensor.dim %cast_2196, %c2_2201 : tensor<?x?x?xf16>
    flow.tensor.trace "img_qkv" = [%cast_2196 : tensor<?x?x?xf16>{%dim_2198, %dim_2200, %dim_2202}]
    %cast_2203 = tensor.cast %cast_2196 : tensor<?x?x?xf16> to tensor<1x4096x9216xf16>
    %1628 = torch_c.from_builtin_tensor %cast_2203 : tensor<1x4096x9216xf16> -> !torch.vtensor<[1,4096,9216],f16>
    // --- img stream: extract Q and K from the fused projection and normalize each ---
    // View %1628 as [1, 4096, 3, 24, 128] and permute with (2, 0, 3, 1, 4) to
    // [3, 1, 24, 4096, 128]; index 0 along the leading dim is then selected as
    // the query.
    %int1_2204 = torch.constant.int 1
    %int4096_2205 = torch.constant.int 4096
    %int3_2206 = torch.constant.int 3
    %int24_2207 = torch.constant.int 24
    %int128_2208 = torch.constant.int 128
    %1629 = torch.prim.ListConstruct %int1_2204, %int4096_2205, %int3_2206, %int24_2207, %int128_2208 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1630 = torch.aten.view %1628, %1629 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_2209 = torch.constant.int 2
    %int0_2210 = torch.constant.int 0
    %int3_2211 = torch.constant.int 3
    %int1_2212 = torch.constant.int 1
    %int4_2213 = torch.constant.int 4
    %1631 = torch.prim.ListConstruct %int2_2209, %int0_2210, %int3_2211, %int1_2212, %int4_2213 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1632 = torch.aten.permute %1630, %1631 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    %int0_2214 = torch.constant.int 0
    %int0_2215 = torch.constant.int 0
    %1633 = torch.aten.select.int %1632, %int0_2214, %int0_2215 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Query normalization in f32: x * rsqrt(mean(x^2 over last dim, keepdim) + eps),
    // eps ~= 1e-6, then cast to f16 and multiply by query_norm.scale ([128]).
    %int6_2216 = torch.constant.int 6
    %1634 = torch.prims.convert_element_type %1633, %int6_2216 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_2217 = torch.constant.int 2
    %1635 = torch.aten.pow.Tensor_Scalar %1634, %int2_2217 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_2218 = torch.constant.int -1
    %1636 = torch.prim.ListConstruct %int-1_2218 : (!torch.int) -> !torch.list<int>
    %true_2219 = torch.constant.bool true
    %none_2220 = torch.constant.none
    %1637 = torch.aten.mean.dim %1635, %1636, %true_2219, %none_2220 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_2221 = torch.constant.float 9.9999999999999995E-7
    %int1_2222 = torch.constant.int 1
    %1638 = torch.aten.add.Scalar %1637, %float9.999990e-07_2221, %int1_2222 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %1639 = torch.aten.rsqrt %1638 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %1640 = torch.aten.mul.Tensor %1634, %1639 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    %int5_2223 = torch.constant.int 5
    %1641 = torch.prims.convert_element_type %1640, %int5_2223 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.4.img_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.4.img_attn.norm.query_norm.scale : tensor<128xf16>
    %1642 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.img_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %1643 = torch.aten.mul.Tensor %1641, %1642 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // The same view + permute of %1628 is repeated; index 1 along the leading dim
    // is selected as the key.
    %int1_2224 = torch.constant.int 1
    %int4096_2225 = torch.constant.int 4096
    %int3_2226 = torch.constant.int 3
    %int24_2227 = torch.constant.int 24
    %int128_2228 = torch.constant.int 128
    %1644 = torch.prim.ListConstruct %int1_2224, %int4096_2225, %int3_2226, %int24_2227, %int128_2228 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1645 = torch.aten.view %1628, %1644 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_2229 = torch.constant.int 2
    %int0_2230 = torch.constant.int 0
    %int3_2231 = torch.constant.int 3
    %int1_2232 = torch.constant.int 1
    %int4_2233 = torch.constant.int 4
    %1646 = torch.prim.ListConstruct %int2_2229, %int0_2230, %int3_2231, %int1_2232, %int4_2233 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1647 = torch.aten.permute %1645, %1646 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    %int0_2234 = torch.constant.int 0
    %int1_2235 = torch.constant.int 1
    %1648 = torch.aten.select.int %1647, %int0_2234, %int1_2235 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Key normalization: same f32 computation as for the query, scaled by
    // key_norm.scale ([128]).
    %int6_2236 = torch.constant.int 6
    %1649 = torch.prims.convert_element_type %1648, %int6_2236 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_2237 = torch.constant.int 2
    %1650 = torch.aten.pow.Tensor_Scalar %1649, %int2_2237 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_2238 = torch.constant.int -1
    %1651 = torch.prim.ListConstruct %int-1_2238 : (!torch.int) -> !torch.list<int>
    %true_2239 = torch.constant.bool true
    %none_2240 = torch.constant.none
    %1652 = torch.aten.mean.dim %1650, %1651, %true_2239, %none_2240 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_2241 = torch.constant.float 9.9999999999999995E-7
    %int1_2242 = torch.constant.int 1
    %1653 = torch.aten.add.Scalar %1652, %float9.999990e-07_2241, %int1_2242 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %1654 = torch.aten.rsqrt %1653 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %1655 = torch.aten.mul.Tensor %1649, %1654 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    %int5_2243 = torch.constant.int 5
    %1656 = torch.prims.convert_element_type %1655, %int5_2243 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.4.img_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.4.img_attn.norm.key_norm.scale : tensor<128xf16>
    %1657 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.img_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %1658 = torch.aten.mul.Tensor %1656, %1657 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // Casts of the normalized query (%1643 -> %1659) and key (%1658 -> %1660) to
    // dtype code 5; input and result types are both f16.
    %int5_2244 = torch.constant.int 5
    %1659 = torch.prims.convert_element_type %1643, %int5_2244 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_2245 = torch.constant.int 5
    %1660 = torch.prims.convert_element_type %1658, %int5_2245 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // --- txt stream: normalization over the feature dim followed by modulation ---
    // %1559 (defined above this chunk) is the [1,512,3072] f16 input; statistics
    // are computed on an f32 copy.
    %int6_2246 = torch.constant.int 6
    %1661 = torch.prims.convert_element_type %1559, %int6_2246 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // var_mean over dim 2 with correction = 0 and keepdim = true:
    // %result0_2250 is the variance, %result1_2251 the mean (see their uses below).
    %int2_2247 = torch.constant.int 2
    %1662 = torch.prim.ListConstruct %int2_2247 : (!torch.int) -> !torch.list<int>
    %int0_2248 = torch.constant.int 0
    %true_2249 = torch.constant.bool true
    %result0_2250, %result1_2251 = torch.aten.var_mean.correction %1661, %1662, %int0_2248, %true_2249 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(variance + eps), eps ~= 1e-6.
    %float9.999990e-07_2252 = torch.constant.float 9.9999999999999995E-7
    %int1_2253 = torch.constant.int 1
    %1663 = torch.aten.add.Scalar %result0_2250, %float9.999990e-07_2252, %int1_2253 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %1664 = torch.aten.rsqrt %1663 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // (x - mean) * rsqrt(var + eps). The subtraction takes the original f16 %1559
    // and the f32 mean and produces an f32 result. No learned scale/bias is
    // applied here.
    %int1_2254 = torch.constant.int 1
    %1665 = torch.aten.sub.Tensor %1559, %result1_2251, %int1_2254 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %1666 = torch.aten.mul.Tensor %1665, %1664 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    // Back to f16 for the modulation arithmetic.
    %int5_2255 = torch.constant.int 5
    %1667 = torch.prims.convert_element_type %1666, %int5_2255 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Modulate: (1 + %1597) * normalized + %1596, where %1597 is the [3072,6144)
    // chunk and %1596 the [0,3072) chunk of the txt_mod output; both broadcast
    // from [1,1,3072] over the 512 rows.
    %int1_2256 = torch.constant.int 1
    %int1_2257 = torch.constant.int 1
    %1668 = torch.aten.add.Scalar %1597, %int1_2256, %int1_2257 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %1669 = torch.aten.mul.Tensor %1668, %1667 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_2258 = torch.constant.int 1
    %1670 = torch.aten.add.Tensor %1669, %1596, %int1_2258 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Drop the leading unit dim so the result can feed a 2-D matmul.
    %int512_2259 = torch.constant.int 512
    %int3072_2260 = torch.constant.int 3072
    %1671 = torch.prim.ListConstruct %int512_2259, %int3072_2260 : (!torch.int, !torch.int) -> !torch.list<int>
    %1672 = torch.aten.view %1670, %1671 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // --- double_blocks.4.txt_attn.qkv: linear projection 3072 -> 9216 on the txt stream ---
    // Load the weight and transpose it to [in=3072, out=9216] for the matmul.
    %__auto.sampler.double_blocks.4.txt_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.4.txt_attn.qkv.weight : tensor<9216x3072xf16>
    %1673 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.txt_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_2261 = torch.constant.int 0
    %int1_2262 = torch.constant.int 1
    %1674 = torch.aten.transpose.int %1673, %int0_2261, %int1_2262 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.4.txt_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.4.txt_attn.qkv.bias : tensor<9216xf16>
    %1675 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.txt_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, activation and weight to f32 (dtype code 6); matmul and bias
    // add are carried out in f32.
    %int6_2263 = torch.constant.int 6
    %1676 = torch.prims.convert_element_type %1675, %int6_2263 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_2264 = torch.constant.int 6
    %1677 = torch.prims.convert_element_type %1672, %int6_2264 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_2265 = torch.constant.int 6
    %1678 = torch.prims.convert_element_type %1674, %int6_2265 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %1679 = torch.aten.mm %1677, %1678 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[512,9216],f32>
    // Both operands are multiplied by the integer constant 1 before being summed;
    // the add also uses alpha = 1.
    %int1_2266 = torch.constant.int 1
    %1680 = torch.aten.mul.Scalar %1679, %int1_2266 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int1_2267 = torch.constant.int 1
    %1681 = torch.aten.mul.Scalar %1676, %int1_2267 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_2268 = torch.constant.int 1
    %1682 = torch.aten.add.Tensor %1680, %1681, %int1_2268 : !torch.vtensor<[512,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    // Cast back to f16 (dtype code 5) and restore the leading unit dim.
    %int5_2269 = torch.constant.int 5
    %1683 = torch.prims.convert_element_type %1682, %int5_2269 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f16>
    %int1_2270 = torch.constant.int 1
    %int512_2271 = torch.constant.int 512
    %int9216_2272 = torch.constant.int 9216
    %1684 = torch.prim.ListConstruct %int1_2270, %int512_2271, %int9216_2272 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1685 = torch.aten.view %1683, %1684 : !torch.vtensor<[512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,9216],f16>
    // Trace point "txt_qkv": leave the torch type system, cast to a fully dynamic
    // shape, query the three dims for the trace op, then cast back to the static
    // shape and re-enter torch as %1687. The data itself is passed through
    // unchanged.
    %1686 = torch_c.to_builtin_tensor %1685 : !torch.vtensor<[1,512,9216],f16> -> tensor<1x512x9216xf16>
    %cast_2273 = tensor.cast %1686 : tensor<1x512x9216xf16> to tensor<?x?x?xf16>
    %c0_2274 = arith.constant 0 : index
    %dim_2275 = tensor.dim %cast_2273, %c0_2274 : tensor<?x?x?xf16>
    %c1_2276 = arith.constant 1 : index
    %dim_2277 = tensor.dim %cast_2273, %c1_2276 : tensor<?x?x?xf16>
    %c2_2278 = arith.constant 2 : index
    %dim_2279 = tensor.dim %cast_2273, %c2_2278 : tensor<?x?x?xf16>
    flow.tensor.trace "txt_qkv" = [%cast_2273 : tensor<?x?x?xf16>{%dim_2275, %dim_2277, %dim_2279}]
    %cast_2280 = tensor.cast %cast_2273 : tensor<?x?x?xf16> to tensor<1x512x9216xf16>
    %1687 = torch_c.from_builtin_tensor %cast_2280 : tensor<1x512x9216xf16> -> !torch.vtensor<[1,512,9216],f16>
    // --- txt stream: extract Q and K from the fused projection and normalize each ---
    // View %1687 as [1, 512, 3, 24, 128] and permute with (2, 0, 3, 1, 4) to
    // [3, 1, 24, 512, 128]; index 0 along the leading dim is then selected as
    // the query.
    %int1_2281 = torch.constant.int 1
    %int512_2282 = torch.constant.int 512
    %int3_2283 = torch.constant.int 3
    %int24_2284 = torch.constant.int 24
    %int128_2285 = torch.constant.int 128
    %1688 = torch.prim.ListConstruct %int1_2281, %int512_2282, %int3_2283, %int24_2284, %int128_2285 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1689 = torch.aten.view %1687, %1688 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_2286 = torch.constant.int 2
    %int0_2287 = torch.constant.int 0
    %int3_2288 = torch.constant.int 3
    %int1_2289 = torch.constant.int 1
    %int4_2290 = torch.constant.int 4
    %1690 = torch.prim.ListConstruct %int2_2286, %int0_2287, %int3_2288, %int1_2289, %int4_2290 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1691 = torch.aten.permute %1689, %1690 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    %int0_2291 = torch.constant.int 0
    %int0_2292 = torch.constant.int 0
    %1692 = torch.aten.select.int %1691, %int0_2291, %int0_2292 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Query normalization in f32: x * rsqrt(mean(x^2 over last dim, keepdim) + eps),
    // eps ~= 1e-6, then cast to f16 and multiply by query_norm.scale ([128]).
    %int6_2293 = torch.constant.int 6
    %1693 = torch.prims.convert_element_type %1692, %int6_2293 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_2294 = torch.constant.int 2
    %1694 = torch.aten.pow.Tensor_Scalar %1693, %int2_2294 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_2295 = torch.constant.int -1
    %1695 = torch.prim.ListConstruct %int-1_2295 : (!torch.int) -> !torch.list<int>
    %true_2296 = torch.constant.bool true
    %none_2297 = torch.constant.none
    %1696 = torch.aten.mean.dim %1694, %1695, %true_2296, %none_2297 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_2298 = torch.constant.float 9.9999999999999995E-7
    %int1_2299 = torch.constant.int 1
    %1697 = torch.aten.add.Scalar %1696, %float9.999990e-07_2298, %int1_2299 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %1698 = torch.aten.rsqrt %1697 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %1699 = torch.aten.mul.Tensor %1693, %1698 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_2300 = torch.constant.int 5
    %1700 = torch.prims.convert_element_type %1699, %int5_2300 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.4.txt_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.4.txt_attn.norm.query_norm.scale : tensor<128xf16>
    %1701 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.txt_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %1702 = torch.aten.mul.Tensor %1700, %1701 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // The same view + permute of %1687 is repeated; index 1 along the leading dim
    // is selected as the key.
    %int1_2301 = torch.constant.int 1
    %int512_2302 = torch.constant.int 512
    %int3_2303 = torch.constant.int 3
    %int24_2304 = torch.constant.int 24
    %int128_2305 = torch.constant.int 128
    %1703 = torch.prim.ListConstruct %int1_2301, %int512_2302, %int3_2303, %int24_2304, %int128_2305 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1704 = torch.aten.view %1687, %1703 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_2306 = torch.constant.int 2
    %int0_2307 = torch.constant.int 0
    %int3_2308 = torch.constant.int 3
    %int1_2309 = torch.constant.int 1
    %int4_2310 = torch.constant.int 4
    %1705 = torch.prim.ListConstruct %int2_2306, %int0_2307, %int3_2308, %int1_2309, %int4_2310 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1706 = torch.aten.permute %1704, %1705 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    %int0_2311 = torch.constant.int 0
    %int1_2312 = torch.constant.int 1
    %1707 = torch.aten.select.int %1706, %int0_2311, %int1_2312 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Key normalization: same f32 computation as for the query, scaled by
    // key_norm.scale ([128]).
    %int6_2313 = torch.constant.int 6
    %1708 = torch.prims.convert_element_type %1707, %int6_2313 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_2314 = torch.constant.int 2
    %1709 = torch.aten.pow.Tensor_Scalar %1708, %int2_2314 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_2315 = torch.constant.int -1
    %1710 = torch.prim.ListConstruct %int-1_2315 : (!torch.int) -> !torch.list<int>
    %true_2316 = torch.constant.bool true
    %none_2317 = torch.constant.none
    %1711 = torch.aten.mean.dim %1709, %1710, %true_2316, %none_2317 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_2318 = torch.constant.float 9.9999999999999995E-7
    %int1_2319 = torch.constant.int 1
    %1712 = torch.aten.add.Scalar %1711, %float9.999990e-07_2318, %int1_2319 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %1713 = torch.aten.rsqrt %1712 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %1714 = torch.aten.mul.Tensor %1708, %1713 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_2320 = torch.constant.int 5
    %1715 = torch.prims.convert_element_type %1714, %int5_2320 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.4.txt_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.4.txt_attn.norm.key_norm.scale : tensor<128xf16>
    %1716 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.txt_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %1717 = torch.aten.mul.Tensor %1715, %1716 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // Casts of the normalized query (%1702 -> %1718) and key (%1717 -> %1719) to
    // dtype code 5; input and result types are both f16.
    %int5_2321 = torch.constant.int 5
    %1718 = torch.prims.convert_element_type %1702, %int5_2321 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int5_2322 = torch.constant.int 5
    %1719 = torch.prims.convert_element_type %1717, %int5_2322 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %1720 = torch.prim.ListConstruct %1718, %1659 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_2323 = torch.constant.int 2
    %1721 = torch.aten.cat %1720, %int2_2323 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %1722 = torch.prim.ListConstruct %1719, %1660 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_2324 = torch.constant.int 2
    %1723 = torch.aten.cat %1722, %int2_2324 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Extract index 2 of the packed size-3 axis from two 9216-wide tensors and concatenate the
    // results along the sequence dim. The result (%1735) is later traced as "v".
    // NOTE(review): %1687 / %1628 are defined above this chunk; presumably the fused QKV
    // projections of the 512-row (text) and 4096-row (image) streams - confirm against the producer.
    %int1_2325 = torch.constant.int 1
    %int512_2326 = torch.constant.int 512
    %int3_2327 = torch.constant.int 3
    %int24_2328 = torch.constant.int 24
    %int128_2329 = torch.constant.int 128
    %1724 = torch.prim.ListConstruct %int1_2325, %int512_2326, %int3_2327, %int24_2328, %int128_2329 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // Unpack 9216 = 3 x 24 x 128.
    %1725 = torch.aten.view %1687, %1724 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_2330 = torch.constant.int 2
    %int0_2331 = torch.constant.int 0
    %int3_2332 = torch.constant.int 3
    %int1_2333 = torch.constant.int 1
    %int4_2334 = torch.constant.int 4
    %1726 = torch.prim.ListConstruct %int2_2330, %int0_2331, %int3_2332, %int1_2333, %int4_2334 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // Permute (2,0,3,1,4): the size-3 axis moves to the front, the size-24 axis ahead of the rows.
    %1727 = torch.aten.permute %1725, %1726 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    %int0_2335 = torch.constant.int 0
    %int2_2336 = torch.constant.int 2
    // select.int(dim = 0, index = 2): keep the last of the three packed slices.
    %1728 = torch.aten.select.int %1727, %int0_2335, %int2_2336 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Same view / permute / select sequence for the 4096-row tensor.
    %int1_2337 = torch.constant.int 1
    %int4096_2338 = torch.constant.int 4096
    %int3_2339 = torch.constant.int 3
    %int24_2340 = torch.constant.int 24
    %int128_2341 = torch.constant.int 128
    %1729 = torch.prim.ListConstruct %int1_2337, %int4096_2338, %int3_2339, %int24_2340, %int128_2341 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1730 = torch.aten.view %1628, %1729 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_2342 = torch.constant.int 2
    %int0_2343 = torch.constant.int 0
    %int3_2344 = torch.constant.int 3
    %int1_2345 = torch.constant.int 1
    %int4_2346 = torch.constant.int 4
    %1731 = torch.prim.ListConstruct %int2_2342, %int0_2343, %int3_2344, %int1_2345, %int4_2346 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1732 = torch.aten.permute %1730, %1731 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    %int0_2347 = torch.constant.int 0
    %int2_2348 = torch.constant.int 2
    %1733 = torch.aten.select.int %1732, %int0_2347, %int2_2348 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Concatenate 512-row slice first, then 4096-row slice, along dim 2 -> 4608 rows.
    %1734 = torch.prim.ListConstruct %1728, %1733 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_2349 = torch.constant.int 2
    %1735 = torch.aten.cat %1734, %int2_2349 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Emit flow.tensor.trace events labelled "q", "k" and "v" for %1721, %1723 and %1735.
    // Each tensor follows the same round trip: torch vtensor -> builtin tensor -> cast to a fully
    // dynamic shape -> tensor.dim on all four axes (the trace op takes the dynamic sizes) ->
    // trace -> cast back to the static shape -> torch vtensor. Values are passed through unchanged;
    // downstream code consumes %1737 / %1739 / %1741.
    // --- "q" ---
    %1736 = torch_c.to_builtin_tensor %1721 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_2350 = tensor.cast %1736 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_2351 = arith.constant 0 : index
    %dim_2352 = tensor.dim %cast_2350, %c0_2351 : tensor<?x?x?x?xf16>
    %c1_2353 = arith.constant 1 : index
    %dim_2354 = tensor.dim %cast_2350, %c1_2353 : tensor<?x?x?x?xf16>
    %c2_2355 = arith.constant 2 : index
    %dim_2356 = tensor.dim %cast_2350, %c2_2355 : tensor<?x?x?x?xf16>
    %c3_2357 = arith.constant 3 : index
    %dim_2358 = tensor.dim %cast_2350, %c3_2357 : tensor<?x?x?x?xf16>
    flow.tensor.trace "q" = [%cast_2350 : tensor<?x?x?x?xf16>{%dim_2352, %dim_2354, %dim_2356, %dim_2358}]
    %cast_2359 = tensor.cast %cast_2350 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %1737 = torch_c.from_builtin_tensor %cast_2359 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // --- "k" ---
    %1738 = torch_c.to_builtin_tensor %1723 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_2360 = tensor.cast %1738 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_2361 = arith.constant 0 : index
    %dim_2362 = tensor.dim %cast_2360, %c0_2361 : tensor<?x?x?x?xf16>
    %c1_2363 = arith.constant 1 : index
    %dim_2364 = tensor.dim %cast_2360, %c1_2363 : tensor<?x?x?x?xf16>
    %c2_2365 = arith.constant 2 : index
    %dim_2366 = tensor.dim %cast_2360, %c2_2365 : tensor<?x?x?x?xf16>
    %c3_2367 = arith.constant 3 : index
    %dim_2368 = tensor.dim %cast_2360, %c3_2367 : tensor<?x?x?x?xf16>
    flow.tensor.trace "k" = [%cast_2360 : tensor<?x?x?x?xf16>{%dim_2362, %dim_2364, %dim_2366, %dim_2368}]
    %cast_2369 = tensor.cast %cast_2360 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %1739 = torch_c.from_builtin_tensor %cast_2369 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // --- "v" ---
    %1740 = torch_c.to_builtin_tensor %1735 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_2370 = tensor.cast %1740 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_2371 = arith.constant 0 : index
    %dim_2372 = tensor.dim %cast_2370, %c0_2371 : tensor<?x?x?x?xf16>
    %c1_2373 = arith.constant 1 : index
    %dim_2374 = tensor.dim %cast_2370, %c1_2373 : tensor<?x?x?x?xf16>
    %c2_2375 = arith.constant 2 : index
    %dim_2376 = tensor.dim %cast_2370, %c2_2375 : tensor<?x?x?x?xf16>
    %c3_2377 = arith.constant 3 : index
    %dim_2378 = tensor.dim %cast_2370, %c3_2377 : tensor<?x?x?x?xf16>
    flow.tensor.trace "v" = [%cast_2370 : tensor<?x?x?x?xf16>{%dim_2372, %dim_2374, %dim_2376, %dim_2378}]
    %cast_2379 = tensor.cast %cast_2370 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %1741 = torch_c.from_builtin_tensor %cast_2379 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Apply a per-position 2x2 table (%211) to the traced q (%1737) and k (%1739) tensors.
    // For each of q and k:  out = %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]
    // where x is the f32 input viewed as [1,24,4608,64,1,2]; the result is flattened back to
    // [1,24,4608,128] and cast to f16.
    // NOTE(review): %211 is defined above this chunk; its [1,1,4608,64,2,2] shape and this usage
    // look like a rotary position-embedding table - confirm against its producer.
    // q: promote to f32 (dtype code 6) and split the last axis 128 -> 64 x 1 x 2.
    %int6_2380 = torch.constant.int 6
    %1742 = torch.prims.convert_element_type %1737, %int6_2380 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_2381 = torch.constant.int 1
    %int24_2382 = torch.constant.int 24
    %int4608_2383 = torch.constant.int 4608
    %int-1_2384 = torch.constant.int -1
    %int1_2385 = torch.constant.int 1
    %int2_2386 = torch.constant.int 2
    %1743 = torch.prim.ListConstruct %int1_2381, %int24_2382, %int4608_2383, %int-1_2384, %int1_2385, %int2_2386 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1744 = torch.aten.view %1742, %1743 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // k: same promotion and view.
    %int6_2387 = torch.constant.int 6
    %1745 = torch.prims.convert_element_type %1739, %int6_2387 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_2388 = torch.constant.int 1
    %int24_2389 = torch.constant.int 24
    %int4608_2390 = torch.constant.int 4608
    %int-1_2391 = torch.constant.int -1
    %int1_2392 = torch.constant.int 1
    %int2_2393 = torch.constant.int 2
    %1746 = torch.prim.ListConstruct %int1_2388, %int24_2389, %int4608_2390, %int-1_2391, %int1_2392, %int2_2393 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1747 = torch.aten.view %1745, %1746 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // q, first term: table[..., 0] * q[..., 0]; the size-1 axes broadcast (24 on dim 1, 2 on dim 4).
    %int5_2394 = torch.constant.int 5
    %int0_2395 = torch.constant.int 0
    %1748 = torch.aten.select.int %211, %int5_2394, %int0_2395 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_2396 = torch.constant.int 5
    %int0_2397 = torch.constant.int 0
    %1749 = torch.aten.select.int %1744, %int5_2396, %int0_2397 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %1750 = torch.aten.mul.Tensor %1748, %1749 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // q, second term: table[..., 1] * q[..., 1].
    %int5_2398 = torch.constant.int 5
    %int1_2399 = torch.constant.int 1
    %1751 = torch.aten.select.int %211, %int5_2398, %int1_2399 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_2400 = torch.constant.int 5
    %int1_2401 = torch.constant.int 1
    %1752 = torch.aten.select.int %1744, %int5_2400, %int1_2401 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %1753 = torch.aten.mul.Tensor %1751, %1752 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // q: sum of the two terms (alpha = 1).
    %int1_2402 = torch.constant.int 1
    %1754 = torch.aten.add.Tensor %1750, %1753, %int1_2402 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k, first term: table[..., 0] * k[..., 0].
    %int5_2403 = torch.constant.int 5
    %int0_2404 = torch.constant.int 0
    %1755 = torch.aten.select.int %211, %int5_2403, %int0_2404 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_2405 = torch.constant.int 5
    %int0_2406 = torch.constant.int 0
    %1756 = torch.aten.select.int %1747, %int5_2405, %int0_2406 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %1757 = torch.aten.mul.Tensor %1755, %1756 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k, second term: table[..., 1] * k[..., 1].
    %int5_2407 = torch.constant.int 5
    %int1_2408 = torch.constant.int 1
    %1758 = torch.aten.select.int %211, %int5_2407, %int1_2408 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_2409 = torch.constant.int 5
    %int1_2410 = torch.constant.int 1
    %1759 = torch.aten.select.int %1747, %int5_2409, %int1_2410 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %1760 = torch.aten.mul.Tensor %1758, %1759 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k: sum of the two terms (alpha = 1).
    %int1_2411 = torch.constant.int 1
    %1761 = torch.aten.add.Tensor %1757, %1760, %int1_2411 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // q: merge the trailing 64 x 2 back into 128 and cast to f16 (dtype code 5).
    %int1_2412 = torch.constant.int 1
    %int24_2413 = torch.constant.int 24
    %int4608_2414 = torch.constant.int 4608
    %int128_2415 = torch.constant.int 128
    %1762 = torch.prim.ListConstruct %int1_2412, %int24_2413, %int4608_2414, %int128_2415 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1763 = torch.aten.view %1754, %1762 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_2416 = torch.constant.int 5
    %1764 = torch.prims.convert_element_type %1763, %int5_2416 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // k: same merge and cast.
    %int1_2417 = torch.constant.int 1
    %int24_2418 = torch.constant.int 24
    %int4608_2419 = torch.constant.int 4608
    %int128_2420 = torch.constant.int 128
    %1765 = torch.prim.ListConstruct %int1_2417, %int24_2418, %int4608_2419, %int128_2420 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1766 = torch.aten.view %1761, %1765 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_2421 = torch.constant.int 5
    %1767 = torch.prims.convert_element_type %1766, %int5_2421 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled-dot-product attention over the 4608-row joint sequence, then merge the 24 x 128
    // trailing axes into 3072.
    // Operands after (%1764, %1767, %1741) are: float 0.0, bool false, none, none.
    // NOTE(review): per the PyTorch op schema these are presumably dropout_p, is_causal,
    // attn_mask and scale - confirm against the aten signature.
    %float0.000000e00_2422 = torch.constant.float 0.000000e+00
    %false_2423 = torch.constant.bool false
    %none_2424 = torch.constant.none
    %none_2425 = torch.constant.none
    // Two results are produced; only result #0 ([1,24,4608,128] f16) is consumed within this chunk.
    %1768:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%1764, %1767, %1741, %float0.000000e00_2422, %false_2423, %none_2424, %none_2425) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Permute (0,2,1,3): [1,24,4608,128] -> [1,4608,24,128].
    %int0_2426 = torch.constant.int 0
    %int2_2427 = torch.constant.int 2
    %int1_2428 = torch.constant.int 1
    %int3_2429 = torch.constant.int 3
    %1769 = torch.prim.ListConstruct %int0_2426, %int2_2427, %int1_2428, %int3_2429 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1770 = torch.aten.permute %1768#0, %1769 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Flatten 24 x 128 -> 3072.
    %int1_2430 = torch.constant.int 1
    %int4608_2431 = torch.constant.int 4608
    %int3072_2432 = torch.constant.int 3072
    %1771 = torch.prim.ListConstruct %int1_2430, %int4608_2431, %int3072_2432 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1772 = torch.aten.view %1770, %1771 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Split the [1,4608,3072] attention output %1772 along dim 1 into rows [0,512) (%1774) and
    // rows [512,end) (%1776), then flatten the latter to 2-D for the following matmul.
    // The dim-0 slices with bounds 0..INT64_MAX and step 1 keep the full axis (shape is unchanged).
    %int0_2433 = torch.constant.int 0
    %int0_2434 = torch.constant.int 0
    %int9223372036854775807_2435 = torch.constant.int 9223372036854775807
    %int1_2436 = torch.constant.int 1
    %1773 = torch.aten.slice.Tensor %1772, %int0_2433, %int0_2434, %int9223372036854775807_2435, %int1_2436 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // First 512 rows.
    %int1_2437 = torch.constant.int 1
    %int0_2438 = torch.constant.int 0
    %int512_2439 = torch.constant.int 512
    %int1_2440 = torch.constant.int 1
    %1774 = torch.aten.slice.Tensor %1773, %int1_2437, %int0_2438, %int512_2439, %int1_2440 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %int0_2441 = torch.constant.int 0
    %int0_2442 = torch.constant.int 0
    %int9223372036854775807_2443 = torch.constant.int 9223372036854775807
    %int1_2444 = torch.constant.int 1
    %1775 = torch.aten.slice.Tensor %1772, %int0_2441, %int0_2442, %int9223372036854775807_2443, %int1_2444 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Remaining 4096 rows (start = 512, end = INT64_MAX).
    %int1_2445 = torch.constant.int 1
    %int512_2446 = torch.constant.int 512
    %int9223372036854775807_2447 = torch.constant.int 9223372036854775807
    %int1_2448 = torch.constant.int 1
    %1776 = torch.aten.slice.Tensor %1775, %int1_2445, %int512_2446, %int9223372036854775807_2447, %int1_2448 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Drop the leading unit dim: [1,4096,3072] -> [4096,3072].
    %int4096_2449 = torch.constant.int 4096
    %int3072_2450 = torch.constant.int 3072
    %1777 = torch.prim.ListConstruct %int4096_2449, %int3072_2450 : (!torch.int, !torch.int) -> !torch.list<int>
    %1778 = torch.aten.view %1776, %1777 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Linear layer with parameters double_blocks.4.img_attn.proj:  out = x @ W^T + b
    // applied to %1778 ([4096,3072] f16). Operands are promoted to f32 (dtype code 6) for the
    // matmul and bias add; the result is cast back to f16 (dtype code 5) and reshaped to [1,4096,3072].
    %__auto.sampler.double_blocks.4.img_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.4.img_attn.proj.weight : tensor<3072x3072xf16>
    %1779 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.img_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    // Transpose the weight so that mm computes x @ W^T.
    %int0_2451 = torch.constant.int 0
    %int1_2452 = torch.constant.int 1
    %1780 = torch.aten.transpose.int %1779, %int0_2451, %int1_2452 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.4.img_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.4.img_attn.proj.bias : tensor<3072xf16>
    %1781 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.img_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_2453 = torch.constant.int 6
    %1782 = torch.prims.convert_element_type %1781, %int6_2453 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_2454 = torch.constant.int 6
    %1783 = torch.prims.convert_element_type %1778, %int6_2454 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_2455 = torch.constant.int 6
    %1784 = torch.prims.convert_element_type %1780, %int6_2455 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %1785 = torch.aten.mm %1783, %1784 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Both scalar multiplies use the constant 1, so the product and the bias are scaled by 1.
    %int1_2456 = torch.constant.int 1
    %1786 = torch.aten.mul.Scalar %1785, %int1_2456 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_2457 = torch.constant.int 1
    %1787 = torch.aten.mul.Scalar %1782, %int1_2457 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Bias add; the [3072] bias broadcasts across the 4096 rows.
    %int1_2458 = torch.constant.int 1
    %1788 = torch.aten.add.Tensor %1786, %1787, %int1_2458 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_2459 = torch.constant.int 5
    %1789 = torch.prims.convert_element_type %1788, %int5_2459 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_2460 = torch.constant.int 1
    %int4096_2461 = torch.constant.int 4096
    %int3072_2462 = torch.constant.int 3072
    %1790 = torch.prim.ListConstruct %int1_2460, %int4096_2461, %int3072_2462 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1791 = torch.aten.view %1789, %1790 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Residual update of the 4096-row stream followed by a normalization over the last axis and
    // an elementwise scale/shift:
    //   %1793 = %1499 + %1577 * %1791
    //   %1803 = (%1579 + 1) * normalize(%1793) + %1578
    // NOTE(review): %1499, %1577, %1578, %1579 are defined above this chunk; presumably the
    // block input and per-channel modulation gate / shift / scale - confirm against their producers.
    %1792 = torch.aten.mul.Tensor %1577, %1791 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_2463 = torch.constant.int 1
    %1793 = torch.aten.add.Tensor %1499, %1792, %int1_2463 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Scale factor: %1579 + 1.
    %int1_2464 = torch.constant.int 1
    %int1_2465 = torch.constant.int 1
    %1794 = torch.aten.add.Scalar %1579, %int1_2464, %int1_2465 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Variance and mean over dim 2 in f32, with correction = 0 and keepdim = true.
    %int6_2466 = torch.constant.int 6
    %1795 = torch.prims.convert_element_type %1793, %int6_2466 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %int2_2467 = torch.constant.int 2
    %1796 = torch.prim.ListConstruct %int2_2467 : (!torch.int) -> !torch.list<int>
    %int0_2468 = torch.constant.int 0
    %true_2469 = torch.constant.bool true
    %result0_2470, %result1_2471 = torch.aten.var_mean.correction %1795, %1796, %int0_2468, %true_2469 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(result0 + eps) with eps = 9.9999999999999995E-7.
    %float9.999990e-07_2472 = torch.constant.float 9.9999999999999995E-7
    %int1_2473 = torch.constant.int 1
    %1797 = torch.aten.add.Scalar %result0_2470, %float9.999990e-07_2472, %int1_2473 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %1798 = torch.aten.rsqrt %1797 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // Centre with result1; the f16 input minus the f32 statistic yields an f32 result.
    %int1_2474 = torch.constant.int 1
    %1799 = torch.aten.sub.Tensor %1793, %result1_2471, %int1_2474 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %1800 = torch.aten.mul.Tensor %1799, %1798 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    %int5_2475 = torch.constant.int 5
    %1801 = torch.prims.convert_element_type %1800, %int5_2475 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Apply scale then shift in f16.
    %1802 = torch.aten.mul.Tensor %1794, %1801 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_2476 = torch.constant.int 1
    %1803 = torch.aten.add.Tensor %1802, %1578, %int1_2476 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Flatten to 2-D for the following matmul.
    %int4096_2477 = torch.constant.int 4096
    %int3072_2478 = torch.constant.int 3072
    %1804 = torch.prim.ListConstruct %int4096_2477, %int3072_2478 : (!torch.int, !torch.int) -> !torch.list<int>
    %1805 = torch.aten.view %1803, %1804 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Linear layer with parameters double_blocks.4.img_mlp.0 (3072 -> 12288) applied to %1805,
    // followed by GELU with approximate = "tanh". The matmul and bias add run in f32 (dtype code 6);
    // the result is cast to f16 (dtype code 5) before the activation.
    %__auto.sampler.double_blocks.4.img_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.4.img_mlp.0.weight : tensor<12288x3072xf16>
    %1806 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.img_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    // Transpose the weight: [12288,3072] -> [3072,12288].
    %int0_2479 = torch.constant.int 0
    %int1_2480 = torch.constant.int 1
    %1807 = torch.aten.transpose.int %1806, %int0_2479, %int1_2480 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.4.img_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.4.img_mlp.0.bias : tensor<12288xf16>
    %1808 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.img_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    %int6_2481 = torch.constant.int 6
    %1809 = torch.prims.convert_element_type %1808, %int6_2481 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_2482 = torch.constant.int 6
    %1810 = torch.prims.convert_element_type %1805, %int6_2482 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_2483 = torch.constant.int 6
    %1811 = torch.prims.convert_element_type %1807, %int6_2483 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %1812 = torch.aten.mm %1810, %1811 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[4096,12288],f32>
    // Both scalar multiplies use the constant 1.
    %int1_2484 = torch.constant.int 1
    %1813 = torch.aten.mul.Scalar %1812, %int1_2484 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int1_2485 = torch.constant.int 1
    %1814 = torch.aten.mul.Scalar %1809, %int1_2485 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    // Bias add, broadcast across rows.
    %int1_2486 = torch.constant.int 1
    %1815 = torch.aten.add.Tensor %1813, %1814, %int1_2486 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int5_2487 = torch.constant.int 5
    %1816 = torch.prims.convert_element_type %1815, %int5_2487 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f16>
    %int1_2488 = torch.constant.int 1
    %int4096_2489 = torch.constant.int 4096
    %int12288_2490 = torch.constant.int 12288
    %1817 = torch.prim.ListConstruct %int1_2488, %int4096_2489, %int12288_2490 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1818 = torch.aten.view %1816, %1817 : !torch.vtensor<[4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,12288],f16>
    // Activation in f16.
    %str_2491 = torch.constant.str "tanh"
    %1819 = torch.aten.gelu %1818, %str_2491 : !torch.vtensor<[1,4096,12288],f16>, !torch.str -> !torch.vtensor<[1,4096,12288],f16>
    // Flatten to 2-D for the following matmul.
    %int4096_2492 = torch.constant.int 4096
    %int12288_2493 = torch.constant.int 12288
    %1820 = torch.prim.ListConstruct %int4096_2492, %int12288_2493 : (!torch.int, !torch.int) -> !torch.list<int>
    %1821 = torch.aten.view %1819, %1820 : !torch.vtensor<[1,4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[4096,12288],f16>
    // Linear layer with parameters double_blocks.4.img_mlp.2 (12288 -> 3072) applied to %1821,
    // then a scaled residual add:  %1836 = %1793 + %1580 * linear(%1821).
    // The matmul and bias add run in f32 (dtype code 6); the result is cast to f16 (dtype code 5).
    // NOTE(review): %1580 is defined above this chunk; presumably a per-channel modulation gate - confirm.
    %__auto.sampler.double_blocks.4.img_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.4.img_mlp.2.weight : tensor<3072x12288xf16>
    %1822 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.img_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    // Transpose the weight: [3072,12288] -> [12288,3072].
    %int0_2494 = torch.constant.int 0
    %int1_2495 = torch.constant.int 1
    %1823 = torch.aten.transpose.int %1822, %int0_2494, %int1_2495 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.4.img_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.4.img_mlp.2.bias : tensor<3072xf16>
    %1824 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.img_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_2496 = torch.constant.int 6
    %1825 = torch.prims.convert_element_type %1824, %int6_2496 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_2497 = torch.constant.int 6
    %1826 = torch.prims.convert_element_type %1821, %int6_2497 : !torch.vtensor<[4096,12288],f16>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int6_2498 = torch.constant.int 6
    %1827 = torch.prims.convert_element_type %1823, %int6_2498 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %1828 = torch.aten.mm %1826, %1827 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Both scalar multiplies use the constant 1.
    %int1_2499 = torch.constant.int 1
    %1829 = torch.aten.mul.Scalar %1828, %int1_2499 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_2500 = torch.constant.int 1
    %1830 = torch.aten.mul.Scalar %1825, %int1_2500 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Bias add, broadcast across rows.
    %int1_2501 = torch.constant.int 1
    %1831 = torch.aten.add.Tensor %1829, %1830, %int1_2501 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_2502 = torch.constant.int 5
    %1832 = torch.prims.convert_element_type %1831, %int5_2502 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_2503 = torch.constant.int 1
    %int4096_2504 = torch.constant.int 4096
    %int3072_2505 = torch.constant.int 3072
    %1833 = torch.prim.ListConstruct %int1_2503, %int4096_2504, %int3072_2505 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1834 = torch.aten.view %1832, %1833 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Scale by %1580 and add onto %1793.
    %1835 = torch.aten.mul.Tensor %1580, %1834 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_2506 = torch.constant.int 1
    %1836 = torch.aten.add.Tensor %1793, %1835, %int1_2506 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer with parameters double_blocks.4.txt_attn.proj:  out = x @ W^T + b
    // applied to %1774, the first 512 rows of the attention output. The matmul and bias add run
    // in f32 (dtype code 6); the result is cast back to f16 (dtype code 5) and reshaped to [1,512,3072].
    // Drop the leading unit dim: [1,512,3072] -> [512,3072].
    %int512_2507 = torch.constant.int 512
    %int3072_2508 = torch.constant.int 3072
    %1837 = torch.prim.ListConstruct %int512_2507, %int3072_2508 : (!torch.int, !torch.int) -> !torch.list<int>
    %1838 = torch.aten.view %1774, %1837 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.4.txt_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.4.txt_attn.proj.weight : tensor<3072x3072xf16>
    %1839 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.txt_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    // Transpose the weight so that mm computes x @ W^T.
    %int0_2509 = torch.constant.int 0
    %int1_2510 = torch.constant.int 1
    %1840 = torch.aten.transpose.int %1839, %int0_2509, %int1_2510 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.4.txt_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.4.txt_attn.proj.bias : tensor<3072xf16>
    %1841 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.txt_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_2511 = torch.constant.int 6
    %1842 = torch.prims.convert_element_type %1841, %int6_2511 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_2512 = torch.constant.int 6
    %1843 = torch.prims.convert_element_type %1838, %int6_2512 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_2513 = torch.constant.int 6
    %1844 = torch.prims.convert_element_type %1840, %int6_2513 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %1845 = torch.aten.mm %1843, %1844 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Both scalar multiplies use the constant 1.
    %int1_2514 = torch.constant.int 1
    %1846 = torch.aten.mul.Scalar %1845, %int1_2514 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_2515 = torch.constant.int 1
    %1847 = torch.aten.mul.Scalar %1842, %int1_2515 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Bias add, broadcast across rows.
    %int1_2516 = torch.constant.int 1
    %1848 = torch.aten.add.Tensor %1846, %1847, %int1_2516 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_2517 = torch.constant.int 5
    %1849 = torch.prims.convert_element_type %1848, %int5_2517 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_2518 = torch.constant.int 1
    %int512_2519 = torch.constant.int 512
    %int3072_2520 = torch.constant.int 3072
    %1850 = torch.prim.ListConstruct %int1_2518, %int512_2519, %int3072_2520 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1851 = torch.aten.view %1849, %1850 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Residual update of the 512-row stream followed by a normalization over the last axis and
    // an elementwise scale/shift:
    //   %1853 = %1559 + %1598 * %1851
    //   %1863 = (%1600 + 1) * normalize(%1853) + %1599
    // NOTE(review): %1559, %1598, %1599, %1600 are defined above this chunk; presumably the
    // block input and per-channel modulation gate / shift / scale - confirm against their producers.
    %1852 = torch.aten.mul.Tensor %1598, %1851 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_2521 = torch.constant.int 1
    %1853 = torch.aten.add.Tensor %1559, %1852, %int1_2521 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Scale factor: %1600 + 1.
    %int1_2522 = torch.constant.int 1
    %int1_2523 = torch.constant.int 1
    %1854 = torch.aten.add.Scalar %1600, %int1_2522, %int1_2523 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Variance and mean over dim 2 in f32, with correction = 0 and keepdim = true.
    %int6_2524 = torch.constant.int 6
    %1855 = torch.prims.convert_element_type %1853, %int6_2524 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %int2_2525 = torch.constant.int 2
    %1856 = torch.prim.ListConstruct %int2_2525 : (!torch.int) -> !torch.list<int>
    %int0_2526 = torch.constant.int 0
    %true_2527 = torch.constant.bool true
    %result0_2528, %result1_2529 = torch.aten.var_mean.correction %1855, %1856, %int0_2526, %true_2527 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(result0 + eps) with eps = 9.9999999999999995E-7.
    %float9.999990e-07_2530 = torch.constant.float 9.9999999999999995E-7
    %int1_2531 = torch.constant.int 1
    %1857 = torch.aten.add.Scalar %result0_2528, %float9.999990e-07_2530, %int1_2531 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %1858 = torch.aten.rsqrt %1857 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // Centre with result1; the f16 input minus the f32 statistic yields an f32 result.
    %int1_2532 = torch.constant.int 1
    %1859 = torch.aten.sub.Tensor %1853, %result1_2529, %int1_2532 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %1860 = torch.aten.mul.Tensor %1859, %1858 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_2533 = torch.constant.int 5
    %1861 = torch.prims.convert_element_type %1860, %int5_2533 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Apply scale then shift in f16.
    %1862 = torch.aten.mul.Tensor %1854, %1861 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_2534 = torch.constant.int 1
    %1863 = torch.aten.add.Tensor %1862, %1599, %int1_2534 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Flatten to 2-D for the following matmul.
    %int512_2535 = torch.constant.int 512
    %int3072_2536 = torch.constant.int 3072
    %1864 = torch.prim.ListConstruct %int512_2535, %int3072_2536 : (!torch.int, !torch.int) -> !torch.list<int>
    %1865 = torch.aten.view %1863, %1864 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Linear layer double_blocks.4.txt_mlp.0: [512,3072] x [3072,12288] + bias.
    // The f16 weight is stored as [out, in] = [12288,3072] and transposed for the matmul.
    %__auto.sampler.double_blocks.4.txt_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.4.txt_mlp.0.weight : tensor<12288x3072xf16>
    %1866 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.txt_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_2537 = torch.constant.int 0
    %int1_2538 = torch.constant.int 1
    %1867 = torch.aten.transpose.int %1866, %int0_2537, %int1_2538 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.4.txt_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.4.txt_mlp.0.bias : tensor<12288xf16>
    %1868 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.txt_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Widen bias, activations and weight to f32 (dtype code 6) so the matmul and
    // bias add are carried out in f32.
    %int6_2539 = torch.constant.int 6
    %1869 = torch.prims.convert_element_type %1868, %int6_2539 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_2540 = torch.constant.int 6
    %1870 = torch.prims.convert_element_type %1865, %int6_2540 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_2541 = torch.constant.int 6
    %1871 = torch.prims.convert_element_type %1867, %int6_2541 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %1872 = torch.aten.mm %1870, %1871 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[512,12288],f32>
    // Both scalings below multiply by the constant 1 and leave the values unchanged
    // (presumably leftover alpha/beta factors from an addmm decomposition).
    %int1_2542 = torch.constant.int 1
    %1873 = torch.aten.mul.Scalar %1872, %int1_2542 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int1_2543 = torch.constant.int 1
    %1874 = torch.aten.mul.Scalar %1869, %int1_2543 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_2544 = torch.constant.int 1
    %1875 = torch.aten.add.Tensor %1873, %1874, %int1_2544 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    // Narrow back to f16 (dtype code 5) and restore the leading unit dimension.
    %int5_2545 = torch.constant.int 5
    %1876 = torch.prims.convert_element_type %1875, %int5_2545 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f16>
    %int1_2546 = torch.constant.int 1
    %int512_2547 = torch.constant.int 512
    %int12288_2548 = torch.constant.int 12288
    %1877 = torch.prim.ListConstruct %int1_2546, %int512_2547, %int12288_2548 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1878 = torch.aten.view %1876, %1877 : !torch.vtensor<[512,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,512,12288],f16>
    // MLP activation: GELU with the "tanh" approximation, evaluated on the f16
    // tensor, followed by a view back to 2-D for the second linear layer.
    %str_2549 = torch.constant.str "tanh"
    %1879 = torch.aten.gelu %1878, %str_2549 : !torch.vtensor<[1,512,12288],f16>, !torch.str -> !torch.vtensor<[1,512,12288],f16>
    %int512_2550 = torch.constant.int 512
    %int12288_2551 = torch.constant.int 12288
    %1880 = torch.prim.ListConstruct %int512_2550, %int12288_2551 : (!torch.int, !torch.int) -> !torch.list<int>
    %1881 = torch.aten.view %1879, %1880 : !torch.vtensor<[1,512,12288],f16>, !torch.list<int> -> !torch.vtensor<[512,12288],f16>
    // Linear layer double_blocks.4.txt_mlp.2: [512,12288] x [12288,3072] + bias.
    // The f16 weight is stored as [3072,12288] and transposed for the matmul.
    %__auto.sampler.double_blocks.4.txt_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.4.txt_mlp.2.weight : tensor<3072x12288xf16>
    %1882 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.txt_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_2552 = torch.constant.int 0
    %int1_2553 = torch.constant.int 1
    %1883 = torch.aten.transpose.int %1882, %int0_2552, %int1_2553 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.4.txt_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.4.txt_mlp.2.bias : tensor<3072xf16>
    %1884 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.txt_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Widen all three operands to f32 (dtype code 6) before the matmul and bias add.
    %int6_2554 = torch.constant.int 6
    %1885 = torch.prims.convert_element_type %1884, %int6_2554 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_2555 = torch.constant.int 6
    %1886 = torch.prims.convert_element_type %1881, %int6_2555 : !torch.vtensor<[512,12288],f16>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int6_2556 = torch.constant.int 6
    %1887 = torch.prims.convert_element_type %1883, %int6_2556 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %1888 = torch.aten.mm %1886, %1887 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Multiplications by the constant 1: values are unchanged.
    %int1_2557 = torch.constant.int 1
    %1889 = torch.aten.mul.Scalar %1888, %int1_2557 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_2558 = torch.constant.int 1
    %1890 = torch.aten.mul.Scalar %1885, %int1_2558 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_2559 = torch.constant.int 1
    %1891 = torch.aten.add.Tensor %1889, %1890, %int1_2559 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    // Narrow to f16 (dtype code 5) and reshape back to [1,512,3072].
    %int5_2560 = torch.constant.int 5
    %1892 = torch.prims.convert_element_type %1891, %int5_2560 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_2561 = torch.constant.int 1
    %int512_2562 = torch.constant.int 512
    %int3072_2563 = torch.constant.int 3072
    %1893 = torch.prim.ListConstruct %int1_2561, %int512_2562, %int3072_2563 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1894 = torch.aten.view %1892, %1893 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Residual update of the text stream: the MLP output is multiplied by the
    // [1,1,3072] factor %1601 (broadcast over the 512 positions) and added to
    // %1853 with alpha = 1. %1601 and %1853 are defined before this section;
    // %1601 is presumably the MLP gate from the block-4 text modulation -- TODO confirm.
    %1895 = torch.aten.mul.Tensor %1601, %1894 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_2564 = torch.constant.int 1
    %1896 = torch.aten.add.Tensor %1853, %1895, %int1_2564 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // double_blocks.5.img_mod.lin: SiLU of the [1,3072] vector %119 followed by a
    // linear layer 3072 -> 18432. %119 is defined before this section
    // (presumably the conditioning embedding -- TODO confirm).
    %1897 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.double_blocks.5.img_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.5.img_mod.lin.weight : tensor<18432x3072xf16>
    %1898 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.img_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    // Weight is stored as [18432,3072]; transpose to [3072,18432] for the matmul.
    %int0_2565 = torch.constant.int 0
    %int1_2566 = torch.constant.int 1
    %1899 = torch.aten.transpose.int %1898, %int0_2565, %int1_2566 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.5.img_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.5.img_mod.lin.bias : tensor<18432xf16>
    %1900 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.img_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Widen bias, input and weight to f32 (dtype code 6); matmul and bias add run in f32.
    %int6_2567 = torch.constant.int 6
    %1901 = torch.prims.convert_element_type %1900, %int6_2567 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_2568 = torch.constant.int 6
    %1902 = torch.prims.convert_element_type %1897, %int6_2568 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_2569 = torch.constant.int 6
    %1903 = torch.prims.convert_element_type %1899, %int6_2569 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %1904 = torch.aten.mm %1902, %1903 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Multiplications by the constant 1: values are unchanged.
    %int1_2570 = torch.constant.int 1
    %1905 = torch.aten.mul.Scalar %1904, %int1_2570 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_2571 = torch.constant.int 1
    %1906 = torch.aten.mul.Scalar %1901, %int1_2571 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_2572 = torch.constant.int 1
    %1907 = torch.aten.add.Tensor %1905, %1906, %int1_2572 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    // Narrow the result back to f16 (dtype code 5).
    %int5_2573 = torch.constant.int 5
    %1908 = torch.prims.convert_element_type %1907, %int5_2573 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Reshape the image modulation vector to [1,1,18432] and split its last axis
    // into six consecutive 3072-wide pieces (%1912 .. %1917).
    // The two slices with end = 9223372036854775807 (INT64_MAX) span the whole
    // axis; their result shapes equal their input shapes.
    %int0_2574 = torch.constant.int 0
    %int0_2575 = torch.constant.int 0
    %int9223372036854775807_2576 = torch.constant.int 9223372036854775807
    %int1_2577 = torch.constant.int 1
    %1909 = torch.aten.slice.Tensor %1908, %int0_2574, %int0_2575, %int9223372036854775807_2576, %int1_2577 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a unit axis at position 1 so the pieces broadcast against [1,N,3072].
    %int1_2578 = torch.constant.int 1
    %1910 = torch.aten.unsqueeze %1909, %int1_2578 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    %int2_2579 = torch.constant.int 2
    %int0_2580 = torch.constant.int 0
    %int9223372036854775807_2581 = torch.constant.int 9223372036854775807
    %int1_2582 = torch.constant.int 1
    %1911 = torch.aten.slice.Tensor %1910, %int2_2579, %int0_2580, %int9223372036854775807_2581, %int1_2582 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Piece 0, [0, 3072): added to the normalized image stream further down (shift).
    %int-1_2583 = torch.constant.int -1
    %int0_2584 = torch.constant.int 0
    %int3072_2585 = torch.constant.int 3072
    %int1_2586 = torch.constant.int 1
    %1912 = torch.aten.slice.Tensor %1911, %int-1_2583, %int0_2584, %int3072_2585, %int1_2586 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Piece 1, [3072, 6144): used further down as (1 + x) multiplier (scale).
    %int-1_2587 = torch.constant.int -1
    %int3072_2588 = torch.constant.int 3072
    %int6144_2589 = torch.constant.int 6144
    %int1_2590 = torch.constant.int 1
    %1913 = torch.aten.slice.Tensor %1911, %int-1_2587, %int3072_2588, %int6144_2589, %int1_2590 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Pieces 2..5 are consumed outside this section; presumably attention gate,
    // then MLP shift / scale / gate -- TODO confirm against their uses.
    // Piece 2, [6144, 9216).
    %int-1_2591 = torch.constant.int -1
    %int6144_2592 = torch.constant.int 6144
    %int9216_2593 = torch.constant.int 9216
    %int1_2594 = torch.constant.int 1
    %1914 = torch.aten.slice.Tensor %1911, %int-1_2591, %int6144_2592, %int9216_2593, %int1_2594 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Piece 3, [9216, 12288).
    %int-1_2595 = torch.constant.int -1
    %int9216_2596 = torch.constant.int 9216
    %int12288_2597 = torch.constant.int 12288
    %int1_2598 = torch.constant.int 1
    %1915 = torch.aten.slice.Tensor %1911, %int-1_2595, %int9216_2596, %int12288_2597, %int1_2598 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Piece 4, [12288, 15360).
    %int-1_2599 = torch.constant.int -1
    %int12288_2600 = torch.constant.int 12288
    %int15360_2601 = torch.constant.int 15360
    %int1_2602 = torch.constant.int 1
    %1916 = torch.aten.slice.Tensor %1911, %int-1_2599, %int12288_2600, %int15360_2601, %int1_2602 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Piece 5, [15360, 18432).
    %int-1_2603 = torch.constant.int -1
    %int15360_2604 = torch.constant.int 15360
    %int18432_2605 = torch.constant.int 18432
    %int1_2606 = torch.constant.int 1
    %1917 = torch.aten.slice.Tensor %1911, %int-1_2603, %int15360_2604, %int18432_2605, %int1_2606 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // double_blocks.5.txt_mod.lin: SiLU of the same [1,3072] vector %119 used by
    // the image modulation, followed by a linear layer 3072 -> 18432 with the
    // text-modulation weights.
    %1918 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.double_blocks.5.txt_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.5.txt_mod.lin.weight : tensor<18432x3072xf16>
    %1919 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.txt_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    // Weight is stored as [18432,3072]; transpose to [3072,18432] for the matmul.
    %int0_2607 = torch.constant.int 0
    %int1_2608 = torch.constant.int 1
    %1920 = torch.aten.transpose.int %1919, %int0_2607, %int1_2608 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.5.txt_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.5.txt_mod.lin.bias : tensor<18432xf16>
    %1921 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.txt_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Widen bias, input and weight to f32 (dtype code 6); matmul and bias add run in f32.
    %int6_2609 = torch.constant.int 6
    %1922 = torch.prims.convert_element_type %1921, %int6_2609 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_2610 = torch.constant.int 6
    %1923 = torch.prims.convert_element_type %1918, %int6_2610 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_2611 = torch.constant.int 6
    %1924 = torch.prims.convert_element_type %1920, %int6_2611 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %1925 = torch.aten.mm %1923, %1924 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Multiplications by the constant 1: values are unchanged.
    %int1_2612 = torch.constant.int 1
    %1926 = torch.aten.mul.Scalar %1925, %int1_2612 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_2613 = torch.constant.int 1
    %1927 = torch.aten.mul.Scalar %1922, %int1_2613 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_2614 = torch.constant.int 1
    %1928 = torch.aten.add.Tensor %1926, %1927, %int1_2614 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    // Narrow the result back to f16 (dtype code 5).
    %int5_2615 = torch.constant.int 5
    %1929 = torch.prims.convert_element_type %1928, %int5_2615 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Reshape the text modulation vector to [1,1,18432] and split its last axis
    // into six consecutive 3072-wide pieces (%1933 .. %1938).
    // The two slices with end = 9223372036854775807 (INT64_MAX) span the whole
    // axis; their result shapes equal their input shapes.
    %int0_2616 = torch.constant.int 0
    %int0_2617 = torch.constant.int 0
    %int9223372036854775807_2618 = torch.constant.int 9223372036854775807
    %int1_2619 = torch.constant.int 1
    %1930 = torch.aten.slice.Tensor %1929, %int0_2616, %int0_2617, %int9223372036854775807_2618, %int1_2619 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a unit axis at position 1 so the pieces broadcast against [1,N,3072].
    %int1_2620 = torch.constant.int 1
    %1931 = torch.aten.unsqueeze %1930, %int1_2620 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    %int2_2621 = torch.constant.int 2
    %int0_2622 = torch.constant.int 0
    %int9223372036854775807_2623 = torch.constant.int 9223372036854775807
    %int1_2624 = torch.constant.int 1
    %1932 = torch.aten.slice.Tensor %1931, %int2_2621, %int0_2622, %int9223372036854775807_2623, %int1_2624 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Piece 0, [0, 3072): added to the normalized text stream further down (shift).
    %int-1_2625 = torch.constant.int -1
    %int0_2626 = torch.constant.int 0
    %int3072_2627 = torch.constant.int 3072
    %int1_2628 = torch.constant.int 1
    %1933 = torch.aten.slice.Tensor %1932, %int-1_2625, %int0_2626, %int3072_2627, %int1_2628 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Piece 1, [3072, 6144): used further down as (1 + x) multiplier (scale).
    %int-1_2629 = torch.constant.int -1
    %int3072_2630 = torch.constant.int 3072
    %int6144_2631 = torch.constant.int 6144
    %int1_2632 = torch.constant.int 1
    %1934 = torch.aten.slice.Tensor %1932, %int-1_2629, %int3072_2630, %int6144_2631, %int1_2632 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Pieces 2..5 are consumed outside this section; presumably attention gate,
    // then MLP shift / scale / gate -- TODO confirm against their uses.
    // Piece 2, [6144, 9216).
    %int-1_2633 = torch.constant.int -1
    %int6144_2634 = torch.constant.int 6144
    %int9216_2635 = torch.constant.int 9216
    %int1_2636 = torch.constant.int 1
    %1935 = torch.aten.slice.Tensor %1932, %int-1_2633, %int6144_2634, %int9216_2635, %int1_2636 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Piece 3, [9216, 12288).
    %int-1_2637 = torch.constant.int -1
    %int9216_2638 = torch.constant.int 9216
    %int12288_2639 = torch.constant.int 12288
    %int1_2640 = torch.constant.int 1
    %1936 = torch.aten.slice.Tensor %1932, %int-1_2637, %int9216_2638, %int12288_2639, %int1_2640 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Piece 4, [12288, 15360).
    %int-1_2641 = torch.constant.int -1
    %int12288_2642 = torch.constant.int 12288
    %int15360_2643 = torch.constant.int 15360
    %int1_2644 = torch.constant.int 1
    %1937 = torch.aten.slice.Tensor %1932, %int-1_2641, %int12288_2642, %int15360_2643, %int1_2644 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Piece 5, [15360, 18432).
    %int-1_2645 = torch.constant.int -1
    %int15360_2646 = torch.constant.int 15360
    %int18432_2647 = torch.constant.int 18432
    %int1_2648 = torch.constant.int 1
    %1938 = torch.aten.slice.Tensor %1932, %int-1_2645, %int15360_2646, %int18432_2647, %int1_2648 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Layer normalization of the [1,4096,3072] tensor %1836 over its last axis,
    // with no learned weight or bias: (x - mean) * rsqrt(var + eps).
    // %1836 is defined before this section; its normalized form feeds the
    // double_blocks.5 image attention qkv projection below.
    // Statistics are computed on an f32 copy (dtype code 6).
    %int6_2649 = torch.constant.int 6
    %1939 = torch.prims.convert_element_type %1836, %int6_2649 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // Variance (%result0) and mean (%result1) over dim 2, correction = 0, keepdim = true.
    %int2_2650 = torch.constant.int 2
    %1940 = torch.prim.ListConstruct %int2_2650 : (!torch.int) -> !torch.list<int>
    %int0_2651 = torch.constant.int 0
    %true_2652 = torch.constant.bool true
    %result0_2653, %result1_2654 = torch.aten.var_mean.correction %1939, %1940, %int0_2651, %true_2652 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // eps ~= 1e-6 is added to the variance before the reciprocal square root.
    %float9.999990e-07_2655 = torch.constant.float 9.9999999999999995E-7
    %int1_2656 = torch.constant.int 1
    %1941 = torch.aten.add.Scalar %result0_2653, %float9.999990e-07_2655, %int1_2656 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %1942 = torch.aten.rsqrt %1941 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // The subtraction takes the original f16 tensor and the f32 mean; the result type is f32.
    %int1_2657 = torch.constant.int 1
    %1943 = torch.aten.sub.Tensor %1836, %result1_2654, %int1_2657 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %1944 = torch.aten.mul.Tensor %1943, %1942 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Narrow the normalized result to f16 (dtype code 5).
    %int5_2658 = torch.constant.int 5
    %1945 = torch.prims.convert_element_type %1944, %int5_2658 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Modulate the normalized image tensor: (1 + %1913) * %1945 + %1912, where
    // %1913 and %1912 are [1,1,3072] pieces of the image modulation vector and
    // broadcast over the 4096 positions.
    %int1_2659 = torch.constant.int 1
    %int1_2660 = torch.constant.int 1
    %1946 = torch.aten.add.Scalar %1913, %int1_2659, %int1_2660 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %1947 = torch.aten.mul.Tensor %1946, %1945 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_2661 = torch.constant.int 1
    %1948 = torch.aten.add.Tensor %1947, %1912, %int1_2661 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Drop the leading unit dimension so the result can feed a 2-D matmul.
    %int4096_2662 = torch.constant.int 4096
    %int3072_2663 = torch.constant.int 3072
    %1949 = torch.prim.ListConstruct %int4096_2662, %int3072_2663 : (!torch.int, !torch.int) -> !torch.list<int>
    %1950 = torch.aten.view %1948, %1949 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Linear layer double_blocks.5.img_attn.qkv: [4096,3072] x [3072,9216] + bias.
    // The f16 weight is stored as [9216,3072] and transposed for the matmul.
    %__auto.sampler.double_blocks.5.img_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.5.img_attn.qkv.weight : tensor<9216x3072xf16>
    %1951 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.img_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_2664 = torch.constant.int 0
    %int1_2665 = torch.constant.int 1
    %1952 = torch.aten.transpose.int %1951, %int0_2664, %int1_2665 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.5.img_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.5.img_attn.qkv.bias : tensor<9216xf16>
    %1953 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.img_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Widen bias, activations and weight to f32 (dtype code 6); matmul and bias add run in f32.
    %int6_2666 = torch.constant.int 6
    %1954 = torch.prims.convert_element_type %1953, %int6_2666 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_2667 = torch.constant.int 6
    %1955 = torch.prims.convert_element_type %1950, %int6_2667 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_2668 = torch.constant.int 6
    %1956 = torch.prims.convert_element_type %1952, %int6_2668 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %1957 = torch.aten.mm %1955, %1956 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[4096,9216],f32>
    // Multiplications by the constant 1: values are unchanged.
    %int1_2669 = torch.constant.int 1
    %1958 = torch.aten.mul.Scalar %1957, %int1_2669 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int1_2670 = torch.constant.int 1
    %1959 = torch.aten.mul.Scalar %1954, %int1_2670 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_2671 = torch.constant.int 1
    %1960 = torch.aten.add.Tensor %1958, %1959, %int1_2671 : !torch.vtensor<[4096,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    // Narrow to f16 (dtype code 5) and restore the leading unit dimension.
    %int5_2672 = torch.constant.int 5
    %1961 = torch.prims.convert_element_type %1960, %int5_2672 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f16>
    %int1_2673 = torch.constant.int 1
    %int4096_2674 = torch.constant.int 4096
    %int9216_2675 = torch.constant.int 9216
    %1962 = torch.prim.ListConstruct %int1_2673, %int4096_2674, %int9216_2675 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1963 = torch.aten.view %1961, %1962 : !torch.vtensor<[4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,9216],f16>
    // Trace point "img_qkv": the qkv projection is converted to a builtin tensor,
    // cast to a fully dynamic shape, passed to flow.tensor.trace together with
    // its three runtime dimensions, then cast back to the static shape and
    // re-wrapped as a torch tensor. Downstream ops consume %1965.
    %1964 = torch_c.to_builtin_tensor %1963 : !torch.vtensor<[1,4096,9216],f16> -> tensor<1x4096x9216xf16>
    %cast_2676 = tensor.cast %1964 : tensor<1x4096x9216xf16> to tensor<?x?x?xf16>
    // Query each dimension; the trace op takes them as dynamic-size operands.
    %c0_2677 = arith.constant 0 : index
    %dim_2678 = tensor.dim %cast_2676, %c0_2677 : tensor<?x?x?xf16>
    %c1_2679 = arith.constant 1 : index
    %dim_2680 = tensor.dim %cast_2676, %c1_2679 : tensor<?x?x?xf16>
    %c2_2681 = arith.constant 2 : index
    %dim_2682 = tensor.dim %cast_2676, %c2_2681 : tensor<?x?x?xf16>
    flow.tensor.trace "img_qkv" = [%cast_2676 : tensor<?x?x?xf16>{%dim_2678, %dim_2680, %dim_2682}]
    %cast_2683 = tensor.cast %cast_2676 : tensor<?x?x?xf16> to tensor<1x4096x9216xf16>
    %1965 = torch_c.from_builtin_tensor %cast_2683 : tensor<1x4096x9216xf16> -> !torch.vtensor<[1,4096,9216],f16>
    // Image query: split the 9216-wide projection as [3,24,128], move the size-3
    // axis to the front, take slice 0, RMS-normalize over the last axis and apply
    // the learned query_norm scale.
    // Presumably 3 = q/k/v, 24 = heads, 128 = head dim -- TODO confirm.
    %int1_2684 = torch.constant.int 1
    %int4096_2685 = torch.constant.int 4096
    %int3_2686 = torch.constant.int 3
    %int24_2687 = torch.constant.int 24
    %int128_2688 = torch.constant.int 128
    %1966 = torch.prim.ListConstruct %int1_2684, %int4096_2685, %int3_2686, %int24_2687, %int128_2688 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1967 = torch.aten.view %1965, %1966 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    // Permutation [2,0,3,1,4]: [1,4096,3,24,128] -> [3,1,24,4096,128].
    %int2_2689 = torch.constant.int 2
    %int0_2690 = torch.constant.int 0
    %int3_2691 = torch.constant.int 3
    %int1_2692 = torch.constant.int 1
    %int4_2693 = torch.constant.int 4
    %1968 = torch.prim.ListConstruct %int2_2689, %int0_2690, %int3_2691, %int1_2692, %int4_2693 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1969 = torch.aten.permute %1967, %1968 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // Index 0 along dim 0.
    %int0_2694 = torch.constant.int 0
    %int0_2695 = torch.constant.int 0
    %1970 = torch.aten.select.int %1969, %int0_2694, %int0_2695 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // RMS norm in f32: x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps), eps ~= 1e-6.
    %int6_2696 = torch.constant.int 6
    %1971 = torch.prims.convert_element_type %1970, %int6_2696 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_2697 = torch.constant.int 2
    %1972 = torch.aten.pow.Tensor_Scalar %1971, %int2_2697 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_2698 = torch.constant.int -1
    %1973 = torch.prim.ListConstruct %int-1_2698 : (!torch.int) -> !torch.list<int>
    %true_2699 = torch.constant.bool true
    %none_2700 = torch.constant.none
    %1974 = torch.aten.mean.dim %1972, %1973, %true_2699, %none_2700 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_2701 = torch.constant.float 9.9999999999999995E-7
    %int1_2702 = torch.constant.int 1
    %1975 = torch.aten.add.Scalar %1974, %float9.999990e-07_2701, %int1_2702 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %1976 = torch.aten.rsqrt %1975 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %1977 = torch.aten.mul.Tensor %1971, %1976 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Narrow to f16 (dtype code 5), then multiply by the [128] scale broadcast over the last axis.
    %int5_2703 = torch.constant.int 5
    %1978 = torch.prims.convert_element_type %1977, %int5_2703 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.5.img_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.5.img_attn.norm.query_norm.scale : tensor<128xf16>
    %1979 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.img_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %1980 = torch.aten.mul.Tensor %1978, %1979 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // Image key: the same view + permute of %1965 is rebuilt from scratch (it
    // duplicates the one used for the query), slice 1 along dim 0 is taken,
    // RMS-normalized over the last axis and multiplied by the learned key_norm scale.
    %int1_2704 = torch.constant.int 1
    %int4096_2705 = torch.constant.int 4096
    %int3_2706 = torch.constant.int 3
    %int24_2707 = torch.constant.int 24
    %int128_2708 = torch.constant.int 128
    %1981 = torch.prim.ListConstruct %int1_2704, %int4096_2705, %int3_2706, %int24_2707, %int128_2708 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1982 = torch.aten.view %1965, %1981 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    // Permutation [2,0,3,1,4]: [1,4096,3,24,128] -> [3,1,24,4096,128].
    %int2_2709 = torch.constant.int 2
    %int0_2710 = torch.constant.int 0
    %int3_2711 = torch.constant.int 3
    %int1_2712 = torch.constant.int 1
    %int4_2713 = torch.constant.int 4
    %1983 = torch.prim.ListConstruct %int2_2709, %int0_2710, %int3_2711, %int1_2712, %int4_2713 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1984 = torch.aten.permute %1982, %1983 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // Index 1 along dim 0.
    %int0_2714 = torch.constant.int 0
    %int1_2715 = torch.constant.int 1
    %1985 = torch.aten.select.int %1984, %int0_2714, %int1_2715 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // RMS norm in f32: x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps), eps ~= 1e-6.
    %int6_2716 = torch.constant.int 6
    %1986 = torch.prims.convert_element_type %1985, %int6_2716 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_2717 = torch.constant.int 2
    %1987 = torch.aten.pow.Tensor_Scalar %1986, %int2_2717 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_2718 = torch.constant.int -1
    %1988 = torch.prim.ListConstruct %int-1_2718 : (!torch.int) -> !torch.list<int>
    %true_2719 = torch.constant.bool true
    %none_2720 = torch.constant.none
    %1989 = torch.aten.mean.dim %1987, %1988, %true_2719, %none_2720 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_2721 = torch.constant.float 9.9999999999999995E-7
    %int1_2722 = torch.constant.int 1
    %1990 = torch.aten.add.Scalar %1989, %float9.999990e-07_2721, %int1_2722 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %1991 = torch.aten.rsqrt %1990 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %1992 = torch.aten.mul.Tensor %1986, %1991 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Narrow to f16 (dtype code 5), then multiply by the [128] scale broadcast over the last axis.
    %int5_2723 = torch.constant.int 5
    %1993 = torch.prims.convert_element_type %1992, %int5_2723 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.5.img_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.5.img_attn.norm.key_norm.scale : tensor<128xf16>
    %1994 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.img_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %1995 = torch.aten.mul.Tensor %1993, %1994 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // Casts of the normalized query (%1980) and key (%1995) to dtype code 5.
    // Source and result types are both f16, so the element type does not change;
    // presumably a "cast to the value tensor's dtype" step kept by the exporter.
    // %1996 / %1997 are consumed outside this section.
    %int5_2724 = torch.constant.int 5
    %1996 = torch.prims.convert_element_type %1980, %int5_2724 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_2725 = torch.constant.int 5
    %1997 = torch.prims.convert_element_type %1995, %int5_2725 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Layer normalization of the [1,512,3072] text tensor %1896 over its last
    // axis, with no learned weight or bias: (x - mean) * rsqrt(var + eps).
    // Statistics are computed on an f32 copy (dtype code 6).
    %int6_2726 = torch.constant.int 6
    %1998 = torch.prims.convert_element_type %1896, %int6_2726 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // Variance (%result0) and mean (%result1) over dim 2, correction = 0, keepdim = true.
    %int2_2727 = torch.constant.int 2
    %1999 = torch.prim.ListConstruct %int2_2727 : (!torch.int) -> !torch.list<int>
    %int0_2728 = torch.constant.int 0
    %true_2729 = torch.constant.bool true
    %result0_2730, %result1_2731 = torch.aten.var_mean.correction %1998, %1999, %int0_2728, %true_2729 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // eps ~= 1e-6 is added to the variance before the reciprocal square root.
    %float9.999990e-07_2732 = torch.constant.float 9.9999999999999995E-7
    %int1_2733 = torch.constant.int 1
    %2000 = torch.aten.add.Scalar %result0_2730, %float9.999990e-07_2732, %int1_2733 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %2001 = torch.aten.rsqrt %2000 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // The subtraction takes the original f16 tensor and the f32 mean; the result type is f32.
    %int1_2734 = torch.constant.int 1
    %2002 = torch.aten.sub.Tensor %1896, %result1_2731, %int1_2734 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %2003 = torch.aten.mul.Tensor %2002, %2001 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    // Narrow the normalized result to f16 (dtype code 5).
    %int5_2735 = torch.constant.int 5
    %2004 = torch.prims.convert_element_type %2003, %int5_2735 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Modulate the normalized text tensor: (1 + %1934) * %2004 + %1933, where
    // %1934 and %1933 are [1,1,3072] pieces of the text modulation vector and
    // broadcast over the 512 positions.
    %int1_2736 = torch.constant.int 1
    %int1_2737 = torch.constant.int 1
    %2005 = torch.aten.add.Scalar %1934, %int1_2736, %int1_2737 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %2006 = torch.aten.mul.Tensor %2005, %2004 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_2738 = torch.constant.int 1
    %2007 = torch.aten.add.Tensor %2006, %1933, %int1_2738 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Drop the leading unit dimension so the result can feed a 2-D matmul.
    %int512_2739 = torch.constant.int 512
    %int3072_2740 = torch.constant.int 3072
    %2008 = torch.prim.ListConstruct %int512_2739, %int3072_2740 : (!torch.int, !torch.int) -> !torch.list<int>
    %2009 = torch.aten.view %2007, %2008 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Linear layer double_blocks.5.txt_attn.qkv: [512,3072] x [3072,9216] + bias.
    // The f16 weight is stored as [9216,3072] and transposed for the matmul.
    %__auto.sampler.double_blocks.5.txt_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.5.txt_attn.qkv.weight : tensor<9216x3072xf16>
    %2010 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.txt_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_2741 = torch.constant.int 0
    %int1_2742 = torch.constant.int 1
    %2011 = torch.aten.transpose.int %2010, %int0_2741, %int1_2742 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.5.txt_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.5.txt_attn.qkv.bias : tensor<9216xf16>
    %2012 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.txt_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Widen bias, activations and weight to f32 (dtype code 6); matmul and bias add run in f32.
    %int6_2743 = torch.constant.int 6
    %2013 = torch.prims.convert_element_type %2012, %int6_2743 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_2744 = torch.constant.int 6
    %2014 = torch.prims.convert_element_type %2009, %int6_2744 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_2745 = torch.constant.int 6
    %2015 = torch.prims.convert_element_type %2011, %int6_2745 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %2016 = torch.aten.mm %2014, %2015 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[512,9216],f32>
    // Multiplications by the constant 1: values are unchanged.
    %int1_2746 = torch.constant.int 1
    %2017 = torch.aten.mul.Scalar %2016, %int1_2746 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int1_2747 = torch.constant.int 1
    %2018 = torch.aten.mul.Scalar %2013, %int1_2747 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_2748 = torch.constant.int 1
    %2019 = torch.aten.add.Tensor %2017, %2018, %int1_2748 : !torch.vtensor<[512,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    // Narrow to f16 (dtype code 5) and restore the leading unit dimension.
    %int5_2749 = torch.constant.int 5
    %2020 = torch.prims.convert_element_type %2019, %int5_2749 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f16>
    %int1_2750 = torch.constant.int 1
    %int512_2751 = torch.constant.int 512
    %int9216_2752 = torch.constant.int 9216
    %2021 = torch.prim.ListConstruct %int1_2750, %int512_2751, %int9216_2752 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2022 = torch.aten.view %2020, %2021 : !torch.vtensor<[512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,9216],f16>
    // Trace point "txt_qkv": the projection result is converted to a builtin tensor,
    // cast to a fully dynamic shape (with its three dims queried) for flow.tensor.trace,
    // then cast back to the static 1x512x9216 shape and re-wrapped as %2024.
    %2023 = torch_c.to_builtin_tensor %2022 : !torch.vtensor<[1,512,9216],f16> -> tensor<1x512x9216xf16>
    %cast_2753 = tensor.cast %2023 : tensor<1x512x9216xf16> to tensor<?x?x?xf16>
    %c0_2754 = arith.constant 0 : index
    %dim_2755 = tensor.dim %cast_2753, %c0_2754 : tensor<?x?x?xf16>
    %c1_2756 = arith.constant 1 : index
    %dim_2757 = tensor.dim %cast_2753, %c1_2756 : tensor<?x?x?xf16>
    %c2_2758 = arith.constant 2 : index
    %dim_2759 = tensor.dim %cast_2753, %c2_2758 : tensor<?x?x?xf16>
    flow.tensor.trace "txt_qkv" = [%cast_2753 : tensor<?x?x?xf16>{%dim_2755, %dim_2757, %dim_2759}]
    %cast_2760 = tensor.cast %cast_2753 : tensor<?x?x?xf16> to tensor<1x512x9216xf16>
    %2024 = torch_c.from_builtin_tensor %cast_2760 : tensor<1x512x9216xf16> -> !torch.vtensor<[1,512,9216],f16>
    // Query path for the 512-row qkv tensor %2024:
    //   1. view [1,512,9216] -> [1,512,3,24,128], permute (2,0,3,1,4) -> [3,1,24,512,128]
    //   2. select index 0 along dim 0 -> [1,24,512,128]
    //   3. in f32: x * rsqrt(mean(x^2, dim=-1, keepdim=true) + ~1e-6), cast back to f16
    //   4. multiply by the [128] vector txt_attn.norm.query_norm.scale
    %int1_2761 = torch.constant.int 1
    %int512_2762 = torch.constant.int 512
    %int3_2763 = torch.constant.int 3
    %int24_2764 = torch.constant.int 24
    %int128_2765 = torch.constant.int 128
    %2025 = torch.prim.ListConstruct %int1_2761, %int512_2762, %int3_2763, %int24_2764, %int128_2765 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2026 = torch.aten.view %2024, %2025 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_2766 = torch.constant.int 2
    %int0_2767 = torch.constant.int 0
    %int3_2768 = torch.constant.int 3
    %int1_2769 = torch.constant.int 1
    %int4_2770 = torch.constant.int 4
    %2027 = torch.prim.ListConstruct %int2_2766, %int0_2767, %int3_2768, %int1_2769, %int4_2770 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2028 = torch.aten.permute %2026, %2027 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // select.int operands are (tensor, dim, index): dim 0, index 0.
    %int0_2771 = torch.constant.int 0
    %int0_2772 = torch.constant.int 0
    %2029 = torch.aten.select.int %2028, %int0_2771, %int0_2772 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Normalization statistics are computed in f32.
    %int6_2773 = torch.constant.int 6
    %2030 = torch.prims.convert_element_type %2029, %int6_2773 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_2774 = torch.constant.int 2
    %2031 = torch.aten.pow.Tensor_Scalar %2030, %int2_2774 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_2775 = torch.constant.int -1
    %2032 = torch.prim.ListConstruct %int-1_2775 : (!torch.int) -> !torch.list<int>
    %true_2776 = torch.constant.bool true
    %none_2777 = torch.constant.none
    %2033 = torch.aten.mean.dim %2031, %2032, %true_2776, %none_2777 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_2778 = torch.constant.float 9.9999999999999995E-7
    %int1_2779 = torch.constant.int 1
    %2034 = torch.aten.add.Scalar %2033, %float9.999990e-07_2778, %int1_2779 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %2035 = torch.aten.rsqrt %2034 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %2036 = torch.aten.mul.Tensor %2030, %2035 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_2780 = torch.constant.int 5
    %2037 = torch.prims.convert_element_type %2036, %int5_2780 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.5.txt_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.5.txt_attn.norm.query_norm.scale : tensor<128xf16>
    %2038 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.txt_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %2039 = torch.aten.mul.Tensor %2037, %2038 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // Key path for the 512-row qkv tensor %2024. Same view/permute as the query path,
    // but selects index 1 along dim 0, then applies in f32
    //   x * rsqrt(mean(x^2, dim=-1, keepdim=true) + ~1e-6)
    // casts back to f16 and multiplies by txt_attn.norm.key_norm.scale ([128]).
    %int1_2781 = torch.constant.int 1
    %int512_2782 = torch.constant.int 512
    %int3_2783 = torch.constant.int 3
    %int24_2784 = torch.constant.int 24
    %int128_2785 = torch.constant.int 128
    %2040 = torch.prim.ListConstruct %int1_2781, %int512_2782, %int3_2783, %int24_2784, %int128_2785 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2041 = torch.aten.view %2024, %2040 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_2786 = torch.constant.int 2
    %int0_2787 = torch.constant.int 0
    %int3_2788 = torch.constant.int 3
    %int1_2789 = torch.constant.int 1
    %int4_2790 = torch.constant.int 4
    %2042 = torch.prim.ListConstruct %int2_2786, %int0_2787, %int3_2788, %int1_2789, %int4_2790 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2043 = torch.aten.permute %2041, %2042 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // select.int operands are (tensor, dim, index): dim 0, index 1.
    %int0_2791 = torch.constant.int 0
    %int1_2792 = torch.constant.int 1
    %2044 = torch.aten.select.int %2043, %int0_2791, %int1_2792 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int6_2793 = torch.constant.int 6
    %2045 = torch.prims.convert_element_type %2044, %int6_2793 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_2794 = torch.constant.int 2
    %2046 = torch.aten.pow.Tensor_Scalar %2045, %int2_2794 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_2795 = torch.constant.int -1
    %2047 = torch.prim.ListConstruct %int-1_2795 : (!torch.int) -> !torch.list<int>
    %true_2796 = torch.constant.bool true
    %none_2797 = torch.constant.none
    %2048 = torch.aten.mean.dim %2046, %2047, %true_2796, %none_2797 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_2798 = torch.constant.float 9.9999999999999995E-7
    %int1_2799 = torch.constant.int 1
    %2049 = torch.aten.add.Scalar %2048, %float9.999990e-07_2798, %int1_2799 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %2050 = torch.aten.rsqrt %2049 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %2051 = torch.aten.mul.Tensor %2045, %2050 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_2800 = torch.constant.int 5
    %2052 = torch.prims.convert_element_type %2051, %int5_2800 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.5.txt_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.5.txt_attn.norm.key_norm.scale : tensor<128xf16>
    %2053 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.txt_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %2054 = torch.aten.mul.Tensor %2052, %2053 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // Build the joint q and k tensors by concatenating along dim 2 (sequence):
    // the 512-row normalized q/k come first, followed by the 4096-row %1996 / %1997,
    // giving [1,24,4608,128].
    // The two convert_element_type ops below are f16 -> f16 (no dtype change per their types).
    // NOTE(review): %1996 / %1997 are defined before this section; presumably the
    // normalized q / k of the 4096-token stream — confirm.
    %int5_2801 = torch.constant.int 5
    %2055 = torch.prims.convert_element_type %2039, %int5_2801 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int5_2802 = torch.constant.int 5
    %2056 = torch.prims.convert_element_type %2054, %int5_2802 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %2057 = torch.prim.ListConstruct %2055, %1996 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_2803 = torch.constant.int 2
    %2058 = torch.aten.cat %2057, %int2_2803 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %2059 = torch.prim.ListConstruct %2056, %1997 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_2804 = torch.constant.int 2
    %2060 = torch.aten.cat %2059, %int2_2804 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Value path. Index 2 along dim 0 is selected from the permuted qkv of both
    // the 512-row tensor %2024 and the 4096-row tensor %1965; no normalization is
    // applied. The two slices are concatenated along dim 2 (512 rows first) into
    // [1,24,4608,128].
    // NOTE(review): %1965 is defined before this section; presumably the qkv
    // projection of the 4096-token stream — confirm.
    %int1_2805 = torch.constant.int 1
    %int512_2806 = torch.constant.int 512
    %int3_2807 = torch.constant.int 3
    %int24_2808 = torch.constant.int 24
    %int128_2809 = torch.constant.int 128
    %2061 = torch.prim.ListConstruct %int1_2805, %int512_2806, %int3_2807, %int24_2808, %int128_2809 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2062 = torch.aten.view %2024, %2061 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_2810 = torch.constant.int 2
    %int0_2811 = torch.constant.int 0
    %int3_2812 = torch.constant.int 3
    %int1_2813 = torch.constant.int 1
    %int4_2814 = torch.constant.int 4
    %2063 = torch.prim.ListConstruct %int2_2810, %int0_2811, %int3_2812, %int1_2813, %int4_2814 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2064 = torch.aten.permute %2062, %2063 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // dim 0, index 2.
    %int0_2815 = torch.constant.int 0
    %int2_2816 = torch.constant.int 2
    %2065 = torch.aten.select.int %2064, %int0_2815, %int2_2816 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Same reshape / permute / select on the 4096-row tensor.
    %int1_2817 = torch.constant.int 1
    %int4096_2818 = torch.constant.int 4096
    %int3_2819 = torch.constant.int 3
    %int24_2820 = torch.constant.int 24
    %int128_2821 = torch.constant.int 128
    %2066 = torch.prim.ListConstruct %int1_2817, %int4096_2818, %int3_2819, %int24_2820, %int128_2821 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2067 = torch.aten.view %1965, %2066 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_2822 = torch.constant.int 2
    %int0_2823 = torch.constant.int 0
    %int3_2824 = torch.constant.int 3
    %int1_2825 = torch.constant.int 1
    %int4_2826 = torch.constant.int 4
    %2068 = torch.prim.ListConstruct %int2_2822, %int0_2823, %int3_2824, %int1_2825, %int4_2826 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2069 = torch.aten.permute %2067, %2068 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    %int0_2827 = torch.constant.int 0
    %int2_2828 = torch.constant.int 2
    %2070 = torch.aten.select.int %2069, %int0_2827, %int2_2828 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %2071 = torch.prim.ListConstruct %2065, %2070 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_2829 = torch.constant.int 2
    %2072 = torch.aten.cat %2071, %int2_2829 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Trace points "q", "k" and "v". Each concatenated [1,24,4608,128] tensor is
    // converted to a builtin tensor, cast to a fully dynamic shape (dims queried via
    // tensor.dim) for flow.tensor.trace, then cast back to its static shape and
    // re-wrapped as a torch value (%2074 = q, %2076 = k, %2078 = v).
    %2073 = torch_c.to_builtin_tensor %2058 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_2830 = tensor.cast %2073 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_2831 = arith.constant 0 : index
    %dim_2832 = tensor.dim %cast_2830, %c0_2831 : tensor<?x?x?x?xf16>
    %c1_2833 = arith.constant 1 : index
    %dim_2834 = tensor.dim %cast_2830, %c1_2833 : tensor<?x?x?x?xf16>
    %c2_2835 = arith.constant 2 : index
    %dim_2836 = tensor.dim %cast_2830, %c2_2835 : tensor<?x?x?x?xf16>
    %c3_2837 = arith.constant 3 : index
    %dim_2838 = tensor.dim %cast_2830, %c3_2837 : tensor<?x?x?x?xf16>
    flow.tensor.trace "q" = [%cast_2830 : tensor<?x?x?x?xf16>{%dim_2832, %dim_2834, %dim_2836, %dim_2838}]
    %cast_2839 = tensor.cast %cast_2830 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %2074 = torch_c.from_builtin_tensor %cast_2839 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    %2075 = torch_c.to_builtin_tensor %2060 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_2840 = tensor.cast %2075 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_2841 = arith.constant 0 : index
    %dim_2842 = tensor.dim %cast_2840, %c0_2841 : tensor<?x?x?x?xf16>
    %c1_2843 = arith.constant 1 : index
    %dim_2844 = tensor.dim %cast_2840, %c1_2843 : tensor<?x?x?x?xf16>
    %c2_2845 = arith.constant 2 : index
    %dim_2846 = tensor.dim %cast_2840, %c2_2845 : tensor<?x?x?x?xf16>
    %c3_2847 = arith.constant 3 : index
    %dim_2848 = tensor.dim %cast_2840, %c3_2847 : tensor<?x?x?x?xf16>
    flow.tensor.trace "k" = [%cast_2840 : tensor<?x?x?x?xf16>{%dim_2842, %dim_2844, %dim_2846, %dim_2848}]
    %cast_2849 = tensor.cast %cast_2840 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %2076 = torch_c.from_builtin_tensor %cast_2849 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    %2077 = torch_c.to_builtin_tensor %2072 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_2850 = tensor.cast %2077 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_2851 = arith.constant 0 : index
    %dim_2852 = tensor.dim %cast_2850, %c0_2851 : tensor<?x?x?x?xf16>
    %c1_2853 = arith.constant 1 : index
    %dim_2854 = tensor.dim %cast_2850, %c1_2853 : tensor<?x?x?x?xf16>
    %c2_2855 = arith.constant 2 : index
    %dim_2856 = tensor.dim %cast_2850, %c2_2855 : tensor<?x?x?x?xf16>
    %c3_2857 = arith.constant 3 : index
    %dim_2858 = tensor.dim %cast_2850, %c3_2857 : tensor<?x?x?x?xf16>
    flow.tensor.trace "v" = [%cast_2850 : tensor<?x?x?x?xf16>{%dim_2852, %dim_2854, %dim_2856, %dim_2858}]
    %cast_2859 = tensor.cast %cast_2850 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %2078 = torch_c.from_builtin_tensor %cast_2859 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Apply the [1,1,4608,64,2,2] coefficient tensor %211 to q (%2074) and k (%2076).
    // Each input is promoted to f32 and viewed as [1,24,4608,64,1,2] (the -1 entry
    // resolves to 64 per the result type). For x in {q, k}:
    //   out = %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]     -> [1,24,4608,64,2]
    // The result is viewed back to [1,24,4608,128] and cast to f16
    // (%2101 for q, %2104 for k).
    // NOTE(review): %211 is defined before this section; presumably the rotary
    // positional-embedding 2x2 rotation table — confirm.
    %int6_2860 = torch.constant.int 6
    %2079 = torch.prims.convert_element_type %2074, %int6_2860 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_2861 = torch.constant.int 1
    %int24_2862 = torch.constant.int 24
    %int4608_2863 = torch.constant.int 4608
    %int-1_2864 = torch.constant.int -1
    %int1_2865 = torch.constant.int 1
    %int2_2866 = torch.constant.int 2
    %2080 = torch.prim.ListConstruct %int1_2861, %int24_2862, %int4608_2863, %int-1_2864, %int1_2865, %int2_2866 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2081 = torch.aten.view %2079, %2080 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    %int6_2867 = torch.constant.int 6
    %2082 = torch.prims.convert_element_type %2076, %int6_2867 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_2868 = torch.constant.int 1
    %int24_2869 = torch.constant.int 24
    %int4608_2870 = torch.constant.int 4608
    %int-1_2871 = torch.constant.int -1
    %int1_2872 = torch.constant.int 1
    %int2_2873 = torch.constant.int 2
    %2083 = torch.prim.ListConstruct %int1_2868, %int24_2869, %int4608_2870, %int-1_2871, %int1_2872, %int2_2873 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2084 = torch.aten.view %2082, %2083 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // q: term for index 0 of the last dim.
    %int5_2874 = torch.constant.int 5
    %int0_2875 = torch.constant.int 0
    %2085 = torch.aten.select.int %211, %int5_2874, %int0_2875 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_2876 = torch.constant.int 5
    %int0_2877 = torch.constant.int 0
    %2086 = torch.aten.select.int %2081, %int5_2876, %int0_2877 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %2087 = torch.aten.mul.Tensor %2085, %2086 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // q: term for index 1 of the last dim, then the sum of both terms.
    %int5_2878 = torch.constant.int 5
    %int1_2879 = torch.constant.int 1
    %2088 = torch.aten.select.int %211, %int5_2878, %int1_2879 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_2880 = torch.constant.int 5
    %int1_2881 = torch.constant.int 1
    %2089 = torch.aten.select.int %2081, %int5_2880, %int1_2881 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %2090 = torch.aten.mul.Tensor %2088, %2089 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_2882 = torch.constant.int 1
    %2091 = torch.aten.add.Tensor %2087, %2090, %int1_2882 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k: same two terms and sum.
    %int5_2883 = torch.constant.int 5
    %int0_2884 = torch.constant.int 0
    %2092 = torch.aten.select.int %211, %int5_2883, %int0_2884 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_2885 = torch.constant.int 5
    %int0_2886 = torch.constant.int 0
    %2093 = torch.aten.select.int %2084, %int5_2885, %int0_2886 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %2094 = torch.aten.mul.Tensor %2092, %2093 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_2887 = torch.constant.int 5
    %int1_2888 = torch.constant.int 1
    %2095 = torch.aten.select.int %211, %int5_2887, %int1_2888 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_2889 = torch.constant.int 5
    %int1_2890 = torch.constant.int 1
    %2096 = torch.aten.select.int %2084, %int5_2889, %int1_2890 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %2097 = torch.aten.mul.Tensor %2095, %2096 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_2891 = torch.constant.int 1
    %2098 = torch.aten.add.Tensor %2094, %2097, %int1_2891 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Collapse [64,2] back into 128 and return to f16.
    %int1_2892 = torch.constant.int 1
    %int24_2893 = torch.constant.int 24
    %int4608_2894 = torch.constant.int 4608
    %int128_2895 = torch.constant.int 128
    %2099 = torch.prim.ListConstruct %int1_2892, %int24_2893, %int4608_2894, %int128_2895 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2100 = torch.aten.view %2091, %2099 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_2896 = torch.constant.int 5
    %2101 = torch.prims.convert_element_type %2100, %int5_2896 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_2897 = torch.constant.int 1
    %int24_2898 = torch.constant.int 24
    %int4608_2899 = torch.constant.int 4608
    %int128_2900 = torch.constant.int 128
    %2102 = torch.prim.ListConstruct %int1_2897, %int24_2898, %int4608_2899, %int128_2900 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2103 = torch.aten.view %2098, %2102 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_2901 = torch.constant.int 5
    %2104 = torch.prims.convert_element_type %2103, %int5_2901 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled-dot-product attention over the joint 4608-token sequence, 24 heads of 128.
    // Operands after q/k/v are: 0.0, false, none, none.
    // NOTE(review): per the PyTorch op schema these are presumably dropout_p,
    // is_causal, attn_mask and scale — confirm.
    // Only result #0 is consumed in this section. It is permuted (0,2,1,3) to
    // [1,4608,24,128] and viewed as [1,4608,3072], merging heads into the feature dim.
    %float0.000000e00_2902 = torch.constant.float 0.000000e+00
    %false_2903 = torch.constant.bool false
    %none_2904 = torch.constant.none
    %none_2905 = torch.constant.none
    %2105:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%2101, %2104, %2078, %float0.000000e00_2902, %false_2903, %none_2904, %none_2905) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    %int0_2906 = torch.constant.int 0
    %int2_2907 = torch.constant.int 2
    %int1_2908 = torch.constant.int 1
    %int3_2909 = torch.constant.int 3
    %2106 = torch.prim.ListConstruct %int0_2906, %int2_2907, %int1_2908, %int3_2909 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2107 = torch.aten.permute %2105#0, %2106 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_2910 = torch.constant.int 1
    %int4608_2911 = torch.constant.int 4608
    %int3072_2912 = torch.constant.int 3072
    %2108 = torch.prim.ListConstruct %int1_2910, %int4608_2911, %int3072_2912 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2109 = torch.aten.view %2107, %2108 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Split the attention output %2109 along dim 1 back into its two parts.
    // slice.Tensor operands are (tensor, dim, start, end, step); an end of
    // 9223372036854775807 (INT64_MAX) means "to the end of the dimension".
    //   %2111 = rows [0, 512)     -> [1,512,3072]
    //   %2113 = rows [512, end)   -> [1,4096,3072]
    // The preceding dim-0 slices cover the full range and leave the shape unchanged.
    %int0_2913 = torch.constant.int 0
    %int0_2914 = torch.constant.int 0
    %int9223372036854775807_2915 = torch.constant.int 9223372036854775807
    %int1_2916 = torch.constant.int 1
    %2110 = torch.aten.slice.Tensor %2109, %int0_2913, %int0_2914, %int9223372036854775807_2915, %int1_2916 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_2917 = torch.constant.int 1
    %int0_2918 = torch.constant.int 0
    %int512_2919 = torch.constant.int 512
    %int1_2920 = torch.constant.int 1
    %2111 = torch.aten.slice.Tensor %2110, %int1_2917, %int0_2918, %int512_2919, %int1_2920 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %int0_2921 = torch.constant.int 0
    %int0_2922 = torch.constant.int 0
    %int9223372036854775807_2923 = torch.constant.int 9223372036854775807
    %int1_2924 = torch.constant.int 1
    %2112 = torch.aten.slice.Tensor %2109, %int0_2921, %int0_2922, %int9223372036854775807_2923, %int1_2924 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_2925 = torch.constant.int 1
    %int512_2926 = torch.constant.int 512
    %int9223372036854775807_2927 = torch.constant.int 9223372036854775807
    %int1_2928 = torch.constant.int 1
    %2113 = torch.aten.slice.Tensor %2112, %int1_2925, %int512_2926, %int9223372036854775807_2927, %int1_2928 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear projection double_blocks.5.img_attn.proj on the 4096-row attention
    // slice %2113: [1,4096,3072] -> [1,4096,3072].
    // The input is flattened to [4096,3072], the [3072,3072] weight is transposed, and
    // x @ W^T + b is evaluated in f32 (dtype code 6) before casting back to f16 (code 5).
    %int4096_2929 = torch.constant.int 4096
    %int3072_2930 = torch.constant.int 3072
    %2114 = torch.prim.ListConstruct %int4096_2929, %int3072_2930 : (!torch.int, !torch.int) -> !torch.list<int>
    %2115 = torch.aten.view %2113, %2114 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.5.img_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.5.img_attn.proj.weight : tensor<3072x3072xf16>
    %2116 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.img_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_2931 = torch.constant.int 0
    %int1_2932 = torch.constant.int 1
    %2117 = torch.aten.transpose.int %2116, %int0_2931, %int1_2932 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.5.img_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.5.img_attn.proj.bias : tensor<3072xf16>
    %2118 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.img_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Promote bias, activation and weight to f32 for the matmul.
    %int6_2933 = torch.constant.int 6
    %2119 = torch.prims.convert_element_type %2118, %int6_2933 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_2934 = torch.constant.int 6
    %2120 = torch.prims.convert_element_type %2115, %int6_2934 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_2935 = torch.constant.int 6
    %2121 = torch.prims.convert_element_type %2117, %int6_2935 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %2122 = torch.aten.mm %2120, %2121 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Both operands are multiplied by the scalar 1 before the add.
    // NOTE(review): presumably the alpha/beta factors of a decomposed addmm — confirm.
    %int1_2936 = torch.constant.int 1
    %2123 = torch.aten.mul.Scalar %2122, %int1_2936 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_2937 = torch.constant.int 1
    %2124 = torch.aten.mul.Scalar %2119, %int1_2937 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_2938 = torch.constant.int 1
    %2125 = torch.aten.add.Tensor %2123, %2124, %int1_2938 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_2939 = torch.constant.int 5
    %2126 = torch.prims.convert_element_type %2125, %int5_2939 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    // Restore the leading batch dimension.
    %int1_2940 = torch.constant.int 1
    %int4096_2941 = torch.constant.int 4096
    %int3072_2942 = torch.constant.int 3072
    %2127 = torch.prim.ListConstruct %int1_2940, %int4096_2941, %int3072_2942 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2128 = torch.aten.view %2126, %2127 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Gated residual followed by normalization and scale/shift on the 4096-row stream:
    //   %2130 = %1836 + %1914 * proj                        (proj = %2128)
    //   n     = f16( (%2130 - result1) * rsqrt(result0 + ~1e-6) )
    //           where (result0, result1) = var_mean(%2130 as f32, dim=2, correction=0, keepdim=true)
    //   %2140 = (1 + %1916) * n + %1915
    // NOTE(review): %1836 is presumably the block's incoming activation, and
    // %1914 / %1915 / %1916 the gate / shift / scale modulation vectors — all are
    // defined before this section; confirm.
    %2129 = torch.aten.mul.Tensor %1914, %2128 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_2943 = torch.constant.int 1
    %2130 = torch.aten.add.Tensor %1836, %2129, %int1_2943 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %int1_2944 = torch.constant.int 1
    %int1_2945 = torch.constant.int 1
    %2131 = torch.aten.add.Scalar %1916, %int1_2944, %int1_2945 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Statistics are computed on an f32 copy of the residual sum.
    %int6_2946 = torch.constant.int 6
    %2132 = torch.prims.convert_element_type %2130, %int6_2946 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %int2_2947 = torch.constant.int 2
    %2133 = torch.prim.ListConstruct %int2_2947 : (!torch.int) -> !torch.list<int>
    %int0_2948 = torch.constant.int 0
    %true_2949 = torch.constant.bool true
    %result0_2950, %result1_2951 = torch.aten.var_mean.correction %2132, %2133, %int0_2948, %true_2949 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    %float9.999990e-07_2952 = torch.constant.float 9.9999999999999995E-7
    %int1_2953 = torch.constant.int 1
    %2134 = torch.aten.add.Scalar %result0_2950, %float9.999990e-07_2952, %int1_2953 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %2135 = torch.aten.rsqrt %2134 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // The subtraction takes the f16 tensor %2130 and produces f32 (see result type).
    %int1_2954 = torch.constant.int 1
    %2136 = torch.aten.sub.Tensor %2130, %result1_2951, %int1_2954 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %2137 = torch.aten.mul.Tensor %2136, %2135 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    %int5_2955 = torch.constant.int 5
    %2138 = torch.prims.convert_element_type %2137, %int5_2955 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %2139 = torch.aten.mul.Tensor %2131, %2138 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_2956 = torch.constant.int 1
    %2140 = torch.aten.add.Tensor %2139, %1915, %int1_2956 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer double_blocks.5.img_mlp.0: [1,4096,3072] -> [1,4096,12288].
    // The input is flattened to [4096,3072], the [12288,3072] weight is transposed, and
    // x @ W^T + b is evaluated in f32 (dtype code 6) before casting back to f16 (code 5).
    %int4096_2957 = torch.constant.int 4096
    %int3072_2958 = torch.constant.int 3072
    %2141 = torch.prim.ListConstruct %int4096_2957, %int3072_2958 : (!torch.int, !torch.int) -> !torch.list<int>
    %2142 = torch.aten.view %2140, %2141 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.5.img_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.5.img_mlp.0.weight : tensor<12288x3072xf16>
    %2143 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.img_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_2959 = torch.constant.int 0
    %int1_2960 = torch.constant.int 1
    %2144 = torch.aten.transpose.int %2143, %int0_2959, %int1_2960 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.5.img_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.5.img_mlp.0.bias : tensor<12288xf16>
    %2145 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.img_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Promote bias, activation and weight to f32 for the matmul.
    %int6_2961 = torch.constant.int 6
    %2146 = torch.prims.convert_element_type %2145, %int6_2961 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_2962 = torch.constant.int 6
    %2147 = torch.prims.convert_element_type %2142, %int6_2962 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_2963 = torch.constant.int 6
    %2148 = torch.prims.convert_element_type %2144, %int6_2963 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %2149 = torch.aten.mm %2147, %2148 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[4096,12288],f32>
    // Both operands are multiplied by the scalar 1 before the add.
    // NOTE(review): presumably the alpha/beta factors of a decomposed addmm — confirm.
    %int1_2964 = torch.constant.int 1
    %2150 = torch.aten.mul.Scalar %2149, %int1_2964 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int1_2965 = torch.constant.int 1
    %2151 = torch.aten.mul.Scalar %2146, %int1_2965 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_2966 = torch.constant.int 1
    %2152 = torch.aten.add.Tensor %2150, %2151, %int1_2966 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int5_2967 = torch.constant.int 5
    %2153 = torch.prims.convert_element_type %2152, %int5_2967 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f16>
    // Restore the leading batch dimension.
    %int1_2968 = torch.constant.int 1
    %int4096_2969 = torch.constant.int 4096
    %int12288_2970 = torch.constant.int 12288
    %2154 = torch.prim.ListConstruct %int1_2968, %int4096_2969, %int12288_2970 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2155 = torch.aten.view %2153, %2154 : !torch.vtensor<[4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,12288],f16>
    // GELU with the "tanh" approximation string, applied in f16 on [1,4096,12288].
    %str_2971 = torch.constant.str "tanh"
    %2156 = torch.aten.gelu %2155, %str_2971 : !torch.vtensor<[1,4096,12288],f16>, !torch.str -> !torch.vtensor<[1,4096,12288],f16>
    // Flatten to [4096,12288] for the following 2-D matmul.
    %int4096_2972 = torch.constant.int 4096
    %int12288_2973 = torch.constant.int 12288
    %2157 = torch.prim.ListConstruct %int4096_2972, %int12288_2973 : (!torch.int, !torch.int) -> !torch.list<int>
    %2158 = torch.aten.view %2156, %2157 : !torch.vtensor<[1,4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[4096,12288],f16>
    // Linear layer double_blocks.5.img_mlp.2: [4096,12288] x W^T + bias -> [4096,3072].
    // The f16 weight is stored as [3072,12288] and transposed to [12288,3072].
    %__auto.sampler.double_blocks.5.img_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.5.img_mlp.2.weight : tensor<3072x12288xf16>
    %2159 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.img_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_2974 = torch.constant.int 0
    %int1_2975 = torch.constant.int 1
    %2160 = torch.aten.transpose.int %2159, %int0_2974, %int1_2975 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.5.img_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.5.img_mlp.2.bias : tensor<3072xf16>
    %2161 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.img_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Widen bias, activation and weight to f32 for the matmul and bias add.
    %int6_2976 = torch.constant.int 6
    %2162 = torch.prims.convert_element_type %2161, %int6_2976 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_2977 = torch.constant.int 6
    %2163 = torch.prims.convert_element_type %2158, %int6_2977 : !torch.vtensor<[4096,12288],f16>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int6_2978 = torch.constant.int 6
    %2164 = torch.prims.convert_element_type %2160, %int6_2978 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %2165 = torch.aten.mm %2163, %2164 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Multiplications by the integer constant 1 precede the bias add.
    %int1_2979 = torch.constant.int 1
    %2166 = torch.aten.mul.Scalar %2165, %int1_2979 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_2980 = torch.constant.int 1
    %2167 = torch.aten.mul.Scalar %2162, %int1_2980 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_2981 = torch.constant.int 1
    %2168 = torch.aten.add.Tensor %2166, %2167, %int1_2981 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    // Narrow back to f16 and restore the leading unit dimension.
    %int5_2982 = torch.constant.int 5
    %2169 = torch.prims.convert_element_type %2168, %int5_2982 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_2983 = torch.constant.int 1
    %int4096_2984 = torch.constant.int 4096
    %int3072_2985 = torch.constant.int 3072
    %2170 = torch.prim.ListConstruct %int1_2983, %int4096_2984, %int3072_2985 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2171 = torch.aten.view %2169, %2170 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Scale the [1,4096,3072] result by %1917 ([1,1,3072], broadcast over rows)
    // and add it onto %2130.
    // NOTE(review): %1917 and %2130 are defined outside this chunk; presumably
    // the block-5 img MLP gate and the img stream after the attention residual.
    %2172 = torch.aten.mul.Tensor %1917, %2171 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_2986 = torch.constant.int 1
    %2173 = torch.aten.add.Tensor %2130, %2172, %int1_2986 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer double_blocks.5.txt_attn.proj on the 512-row tensor %2111,
    // followed by a broadcast multiply and a residual add.
    // NOTE(review): %2111 is defined outside this chunk; presumably the txt
    // part of the block-5 attention output. Confirm upstream.
    // Flatten [1,512,3072] -> [512,3072] for the 2-D matmul.
    %int512_2987 = torch.constant.int 512
    %int3072_2988 = torch.constant.int 3072
    %2174 = torch.prim.ListConstruct %int512_2987, %int3072_2988 : (!torch.int, !torch.int) -> !torch.list<int>
    %2175 = torch.aten.view %2111, %2174 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.5.txt_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.5.txt_attn.proj.weight : tensor<3072x3072xf16>
    %2176 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.txt_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_2989 = torch.constant.int 0
    %int1_2990 = torch.constant.int 1
    %2177 = torch.aten.transpose.int %2176, %int0_2989, %int1_2990 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.5.txt_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.5.txt_attn.proj.bias : tensor<3072xf16>
    %2178 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.txt_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Widen bias, activation and weight to f32 for the matmul and bias add.
    %int6_2991 = torch.constant.int 6
    %2179 = torch.prims.convert_element_type %2178, %int6_2991 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_2992 = torch.constant.int 6
    %2180 = torch.prims.convert_element_type %2175, %int6_2992 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_2993 = torch.constant.int 6
    %2181 = torch.prims.convert_element_type %2177, %int6_2993 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %2182 = torch.aten.mm %2180, %2181 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Multiplications by the integer constant 1 precede the bias add.
    %int1_2994 = torch.constant.int 1
    %2183 = torch.aten.mul.Scalar %2182, %int1_2994 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_2995 = torch.constant.int 1
    %2184 = torch.aten.mul.Scalar %2179, %int1_2995 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_2996 = torch.constant.int 1
    %2185 = torch.aten.add.Tensor %2183, %2184, %int1_2996 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    // Narrow back to f16 and restore the leading unit dimension.
    %int5_2997 = torch.constant.int 5
    %2186 = torch.prims.convert_element_type %2185, %int5_2997 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_2998 = torch.constant.int 1
    %int512_2999 = torch.constant.int 512
    %int3072_3000 = torch.constant.int 3072
    %2187 = torch.prim.ListConstruct %int1_2998, %int512_2999, %int3072_3000 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2188 = torch.aten.view %2186, %2187 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Scale by %1935 ([1,1,3072], broadcast over the 512 rows) and add onto %1896.
    // NOTE(review): %1935 and %1896 are defined outside this chunk; presumably
    // the txt attention gate and the txt stream entering block 5.
    %2189 = torch.aten.mul.Tensor %1935, %2188 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_3001 = torch.constant.int 1
    %2190 = torch.aten.add.Tensor %1896, %2189, %int1_3001 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Normalize %2190 over its last dimension, then compute
    // (%1937 + 1) * normalized + %1936 and flatten to [512,3072].
    // NOTE(review): %1937 and %1936 are defined outside this chunk; presumably
    // scale and shift of the block-5 txt modulation. Confirm upstream.
    %int1_3002 = torch.constant.int 1
    %int1_3003 = torch.constant.int 1
    %2191 = torch.aten.add.Scalar %1937, %int1_3002, %int1_3003 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Variance and mean are taken in f32 over dim 2, with correction 0 and
    // keepdim = true, giving [1,512,1] statistics.
    %int6_3004 = torch.constant.int 6
    %2192 = torch.prims.convert_element_type %2190, %int6_3004 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %int2_3005 = torch.constant.int 2
    %2193 = torch.prim.ListConstruct %int2_3005 : (!torch.int) -> !torch.list<int>
    %int0_3006 = torch.constant.int 0
    %true_3007 = torch.constant.bool true
    %result0_3008, %result1_3009 = torch.aten.var_mean.correction %2192, %2193, %int0_3006, %true_3007 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(var + eps), with eps ~= 1e-6.
    %float9.999990e-07_3010 = torch.constant.float 9.9999999999999995E-7
    %int1_3011 = torch.constant.int 1
    %2194 = torch.aten.add.Scalar %result0_3008, %float9.999990e-07_3010, %int1_3011 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %2195 = torch.aten.rsqrt %2194 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // The subtraction takes the f16 tensor %2190 and the f32 mean and yields f32.
    %int1_3012 = torch.constant.int 1
    %2196 = torch.aten.sub.Tensor %2190, %result1_3009, %int1_3012 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %2197 = torch.aten.mul.Tensor %2196, %2195 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    // Narrow to f16 before applying the [1,1,3072] multiplier and offset.
    %int5_3013 = torch.constant.int 5
    %2198 = torch.prims.convert_element_type %2197, %int5_3013 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %2199 = torch.aten.mul.Tensor %2191, %2198 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_3014 = torch.constant.int 1
    %2200 = torch.aten.add.Tensor %2199, %1936, %int1_3014 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Flatten to [512,3072] for the following 2-D matmul.
    %int512_3015 = torch.constant.int 512
    %int3072_3016 = torch.constant.int 3072
    %2201 = torch.prim.ListConstruct %int512_3015, %int3072_3016 : (!torch.int, !torch.int) -> !torch.list<int>
    %2202 = torch.aten.view %2200, %2201 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Linear layer double_blocks.5.txt_mlp.0: [512,3072] x W^T + bias -> [512,12288].
    // The f16 weight is stored as [12288,3072] and transposed to [3072,12288].
    %__auto.sampler.double_blocks.5.txt_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.5.txt_mlp.0.weight : tensor<12288x3072xf16>
    %2203 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.txt_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_3017 = torch.constant.int 0
    %int1_3018 = torch.constant.int 1
    %2204 = torch.aten.transpose.int %2203, %int0_3017, %int1_3018 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.5.txt_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.5.txt_mlp.0.bias : tensor<12288xf16>
    %2205 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.txt_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Widen bias, activation and weight to f32 for the matmul and bias add.
    %int6_3019 = torch.constant.int 6
    %2206 = torch.prims.convert_element_type %2205, %int6_3019 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_3020 = torch.constant.int 6
    %2207 = torch.prims.convert_element_type %2202, %int6_3020 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_3021 = torch.constant.int 6
    %2208 = torch.prims.convert_element_type %2204, %int6_3021 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %2209 = torch.aten.mm %2207, %2208 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[512,12288],f32>
    // Multiplications by the integer constant 1 precede the bias add.
    %int1_3022 = torch.constant.int 1
    %2210 = torch.aten.mul.Scalar %2209, %int1_3022 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int1_3023 = torch.constant.int 1
    %2211 = torch.aten.mul.Scalar %2206, %int1_3023 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_3024 = torch.constant.int 1
    %2212 = torch.aten.add.Tensor %2210, %2211, %int1_3024 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    // Narrow back to f16 and restore the leading unit dimension.
    %int5_3025 = torch.constant.int 5
    %2213 = torch.prims.convert_element_type %2212, %int5_3025 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f16>
    %int1_3026 = torch.constant.int 1
    %int512_3027 = torch.constant.int 512
    %int12288_3028 = torch.constant.int 12288
    %2214 = torch.prim.ListConstruct %int1_3026, %int512_3027, %int12288_3028 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2215 = torch.aten.view %2213, %2214 : !torch.vtensor<[512,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,512,12288],f16>
    // GELU with the "tanh" approximation string, applied in f16 on [1,512,12288].
    %str_3029 = torch.constant.str "tanh"
    %2216 = torch.aten.gelu %2215, %str_3029 : !torch.vtensor<[1,512,12288],f16>, !torch.str -> !torch.vtensor<[1,512,12288],f16>
    // Flatten to [512,12288] for the following 2-D matmul.
    %int512_3030 = torch.constant.int 512
    %int12288_3031 = torch.constant.int 12288
    %2217 = torch.prim.ListConstruct %int512_3030, %int12288_3031 : (!torch.int, !torch.int) -> !torch.list<int>
    %2218 = torch.aten.view %2216, %2217 : !torch.vtensor<[1,512,12288],f16>, !torch.list<int> -> !torch.vtensor<[512,12288],f16>
    // Linear layer double_blocks.5.txt_mlp.2: [512,12288] x W^T + bias -> [512,3072],
    // followed by a broadcast multiply and a residual add onto %2190.
    // The f16 weight is stored as [3072,12288] and transposed to [12288,3072].
    %__auto.sampler.double_blocks.5.txt_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.5.txt_mlp.2.weight : tensor<3072x12288xf16>
    %2219 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.txt_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_3032 = torch.constant.int 0
    %int1_3033 = torch.constant.int 1
    %2220 = torch.aten.transpose.int %2219, %int0_3032, %int1_3033 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.5.txt_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.5.txt_mlp.2.bias : tensor<3072xf16>
    %2221 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.txt_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Widen bias, activation and weight to f32 for the matmul and bias add.
    %int6_3034 = torch.constant.int 6
    %2222 = torch.prims.convert_element_type %2221, %int6_3034 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_3035 = torch.constant.int 6
    %2223 = torch.prims.convert_element_type %2218, %int6_3035 : !torch.vtensor<[512,12288],f16>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int6_3036 = torch.constant.int 6
    %2224 = torch.prims.convert_element_type %2220, %int6_3036 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %2225 = torch.aten.mm %2223, %2224 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Multiplications by the integer constant 1 precede the bias add.
    %int1_3037 = torch.constant.int 1
    %2226 = torch.aten.mul.Scalar %2225, %int1_3037 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_3038 = torch.constant.int 1
    %2227 = torch.aten.mul.Scalar %2222, %int1_3038 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_3039 = torch.constant.int 1
    %2228 = torch.aten.add.Tensor %2226, %2227, %int1_3039 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    // Narrow back to f16 and restore the leading unit dimension.
    %int5_3040 = torch.constant.int 5
    %2229 = torch.prims.convert_element_type %2228, %int5_3040 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_3041 = torch.constant.int 1
    %int512_3042 = torch.constant.int 512
    %int3072_3043 = torch.constant.int 3072
    %2230 = torch.prim.ListConstruct %int1_3041, %int512_3042, %int3072_3043 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2231 = torch.aten.view %2229, %2230 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Scale by %1938 ([1,1,3072], broadcast over the 512 rows) and add onto %2190.
    // NOTE(review): %1938 is defined outside this chunk; presumably the block-5
    // txt MLP gate. Confirm upstream.
    %2232 = torch.aten.mul.Tensor %1938, %2231 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_3044 = torch.constant.int 1
    %2233 = torch.aten.add.Tensor %2190, %2232, %int1_3044 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // double_blocks.6.img_mod: SiLU of %119, then a linear layer 3072 -> 18432,
    // then the 18432-wide result is cut into six [1,1,3072] slices (%2249..%2254).
    // NOTE(review): %119 is defined outside this chunk; presumably the shared
    // conditioning vector. Confirm upstream.
    %2234 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.double_blocks.6.img_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.6.img_mod.lin.weight : tensor<18432x3072xf16>
    %2235 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.img_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_3045 = torch.constant.int 0
    %int1_3046 = torch.constant.int 1
    %2236 = torch.aten.transpose.int %2235, %int0_3045, %int1_3046 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.6.img_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.6.img_mod.lin.bias : tensor<18432xf16>
    %2237 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.img_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Widen bias, activation and weight to f32 for the matmul and bias add.
    %int6_3047 = torch.constant.int 6
    %2238 = torch.prims.convert_element_type %2237, %int6_3047 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_3048 = torch.constant.int 6
    %2239 = torch.prims.convert_element_type %2234, %int6_3048 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_3049 = torch.constant.int 6
    %2240 = torch.prims.convert_element_type %2236, %int6_3049 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %2241 = torch.aten.mm %2239, %2240 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Multiplications by the integer constant 1 precede the bias add.
    %int1_3050 = torch.constant.int 1
    %2242 = torch.aten.mul.Scalar %2241, %int1_3050 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_3051 = torch.constant.int 1
    %2243 = torch.aten.mul.Scalar %2238, %int1_3051 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_3052 = torch.constant.int 1
    %2244 = torch.aten.add.Tensor %2242, %2243, %int1_3052 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_3053 = torch.constant.int 5
    %2245 = torch.prims.convert_element_type %2244, %int5_3053 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Slice over dim 0 from 0 to INT64_MAX with step 1; the result type is
    // unchanged ([1,18432]).
    %int0_3054 = torch.constant.int 0
    %int0_3055 = torch.constant.int 0
    %int9223372036854775807_3056 = torch.constant.int 9223372036854775807
    %int1_3057 = torch.constant.int 1
    %2246 = torch.aten.slice.Tensor %2245, %int0_3054, %int0_3055, %int9223372036854775807_3056, %int1_3057 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a unit dimension at index 1 -> [1,1,18432], so the slices below
    // broadcast against [1,N,3072] activations.
    %int1_3058 = torch.constant.int 1
    %2247 = torch.aten.unsqueeze %2246, %int1_3058 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Slice over dim 2 from 0 to INT64_MAX with step 1; the shape is again unchanged.
    %int2_3059 = torch.constant.int 2
    %int0_3060 = torch.constant.int 0
    %int9223372036854775807_3061 = torch.constant.int 9223372036854775807
    %int1_3062 = torch.constant.int 1
    %2248 = torch.aten.slice.Tensor %2247, %int2_3059, %int0_3060, %int9223372036854775807_3061, %int1_3062 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Six consecutive 3072-wide slices along the last dimension (dim -1).
    // Later in this function %2249 is used as an additive offset and %2250
    // (after +1) as a multiplier; %2251..%2254 have no use within this chunk.
    // [0, 3072)
    %int-1_3063 = torch.constant.int -1
    %int0_3064 = torch.constant.int 0
    %int3072_3065 = torch.constant.int 3072
    %int1_3066 = torch.constant.int 1
    %2249 = torch.aten.slice.Tensor %2248, %int-1_3063, %int0_3064, %int3072_3065, %int1_3066 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // [3072, 6144)
    %int-1_3067 = torch.constant.int -1
    %int3072_3068 = torch.constant.int 3072
    %int6144_3069 = torch.constant.int 6144
    %int1_3070 = torch.constant.int 1
    %2250 = torch.aten.slice.Tensor %2248, %int-1_3067, %int3072_3068, %int6144_3069, %int1_3070 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // [6144, 9216)
    %int-1_3071 = torch.constant.int -1
    %int6144_3072 = torch.constant.int 6144
    %int9216_3073 = torch.constant.int 9216
    %int1_3074 = torch.constant.int 1
    %2251 = torch.aten.slice.Tensor %2248, %int-1_3071, %int6144_3072, %int9216_3073, %int1_3074 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // [9216, 12288)
    %int-1_3075 = torch.constant.int -1
    %int9216_3076 = torch.constant.int 9216
    %int12288_3077 = torch.constant.int 12288
    %int1_3078 = torch.constant.int 1
    %2252 = torch.aten.slice.Tensor %2248, %int-1_3075, %int9216_3076, %int12288_3077, %int1_3078 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // [12288, 15360)
    %int-1_3079 = torch.constant.int -1
    %int12288_3080 = torch.constant.int 12288
    %int15360_3081 = torch.constant.int 15360
    %int1_3082 = torch.constant.int 1
    %2253 = torch.aten.slice.Tensor %2248, %int-1_3079, %int12288_3080, %int15360_3081, %int1_3082 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // [15360, 18432)
    %int-1_3083 = torch.constant.int -1
    %int15360_3084 = torch.constant.int 15360
    %int18432_3085 = torch.constant.int 18432
    %int1_3086 = torch.constant.int 1
    %2254 = torch.aten.slice.Tensor %2248, %int-1_3083, %int15360_3084, %int18432_3085, %int1_3086 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // double_blocks.6.txt_mod: SiLU of %119, then a linear layer 3072 -> 18432,
    // then the 18432-wide result is cut into six [1,1,3072] slices (%2270..%2275).
    // The SiLU of %119 is recomputed here instead of reusing an earlier result.
    // NOTE(review): %119 is defined outside this chunk; presumably the shared
    // conditioning vector. Confirm upstream.
    %2255 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.double_blocks.6.txt_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.6.txt_mod.lin.weight : tensor<18432x3072xf16>
    %2256 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.txt_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_3087 = torch.constant.int 0
    %int1_3088 = torch.constant.int 1
    %2257 = torch.aten.transpose.int %2256, %int0_3087, %int1_3088 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.6.txt_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.6.txt_mod.lin.bias : tensor<18432xf16>
    %2258 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.txt_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Widen bias, activation and weight to f32 for the matmul and bias add.
    %int6_3089 = torch.constant.int 6
    %2259 = torch.prims.convert_element_type %2258, %int6_3089 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_3090 = torch.constant.int 6
    %2260 = torch.prims.convert_element_type %2255, %int6_3090 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_3091 = torch.constant.int 6
    %2261 = torch.prims.convert_element_type %2257, %int6_3091 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %2262 = torch.aten.mm %2260, %2261 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Multiplications by the integer constant 1 precede the bias add.
    %int1_3092 = torch.constant.int 1
    %2263 = torch.aten.mul.Scalar %2262, %int1_3092 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_3093 = torch.constant.int 1
    %2264 = torch.aten.mul.Scalar %2259, %int1_3093 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_3094 = torch.constant.int 1
    %2265 = torch.aten.add.Tensor %2263, %2264, %int1_3094 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_3095 = torch.constant.int 5
    %2266 = torch.prims.convert_element_type %2265, %int5_3095 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Slice over dim 0 from 0 to INT64_MAX with step 1; the result type is
    // unchanged ([1,18432]).
    %int0_3096 = torch.constant.int 0
    %int0_3097 = torch.constant.int 0
    %int9223372036854775807_3098 = torch.constant.int 9223372036854775807
    %int1_3099 = torch.constant.int 1
    %2267 = torch.aten.slice.Tensor %2266, %int0_3096, %int0_3097, %int9223372036854775807_3098, %int1_3099 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a unit dimension at index 1 -> [1,1,18432].
    %int1_3100 = torch.constant.int 1
    %2268 = torch.aten.unsqueeze %2267, %int1_3100 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Slice over dim 2 from 0 to INT64_MAX with step 1; the shape is again unchanged.
    %int2_3101 = torch.constant.int 2
    %int0_3102 = torch.constant.int 0
    %int9223372036854775807_3103 = torch.constant.int 9223372036854775807
    %int1_3104 = torch.constant.int 1
    %2269 = torch.aten.slice.Tensor %2268, %int2_3101, %int0_3102, %int9223372036854775807_3103, %int1_3104 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Six consecutive 3072-wide slices along the last dimension (dim -1).
    // None of %2270..%2275 is consumed within this chunk.
    // [0, 3072)
    %int-1_3105 = torch.constant.int -1
    %int0_3106 = torch.constant.int 0
    %int3072_3107 = torch.constant.int 3072
    %int1_3108 = torch.constant.int 1
    %2270 = torch.aten.slice.Tensor %2269, %int-1_3105, %int0_3106, %int3072_3107, %int1_3108 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // [3072, 6144)
    %int-1_3109 = torch.constant.int -1
    %int3072_3110 = torch.constant.int 3072
    %int6144_3111 = torch.constant.int 6144
    %int1_3112 = torch.constant.int 1
    %2271 = torch.aten.slice.Tensor %2269, %int-1_3109, %int3072_3110, %int6144_3111, %int1_3112 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // [6144, 9216)
    %int-1_3113 = torch.constant.int -1
    %int6144_3114 = torch.constant.int 6144
    %int9216_3115 = torch.constant.int 9216
    %int1_3116 = torch.constant.int 1
    %2272 = torch.aten.slice.Tensor %2269, %int-1_3113, %int6144_3114, %int9216_3115, %int1_3116 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // [9216, 12288)
    %int-1_3117 = torch.constant.int -1
    %int9216_3118 = torch.constant.int 9216
    %int12288_3119 = torch.constant.int 12288
    %int1_3120 = torch.constant.int 1
    %2273 = torch.aten.slice.Tensor %2269, %int-1_3117, %int9216_3118, %int12288_3119, %int1_3120 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // [12288, 15360)
    %int-1_3121 = torch.constant.int -1
    %int12288_3122 = torch.constant.int 12288
    %int15360_3123 = torch.constant.int 15360
    %int1_3124 = torch.constant.int 1
    %2274 = torch.aten.slice.Tensor %2269, %int-1_3121, %int12288_3122, %int15360_3123, %int1_3124 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // [15360, 18432)
    %int-1_3125 = torch.constant.int -1
    %int15360_3126 = torch.constant.int 15360
    %int18432_3127 = torch.constant.int 18432
    %int1_3128 = torch.constant.int 1
    %2275 = torch.aten.slice.Tensor %2269, %int-1_3125, %int15360_3126, %int18432_3127, %int1_3128 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize the [1,4096,3072] tensor %2173 over its last dimension, then
    // compute (%2250 + 1) * normalized + %2249 and flatten to [4096,3072].
    // %2249 and %2250 are the [0,3072) and [3072,6144) slices of the
    // double_blocks.6.img_mod.lin output.
    // Variance and mean are taken in f32 over dim 2, with correction 0 and
    // keepdim = true, giving [1,4096,1] statistics.
    %int6_3129 = torch.constant.int 6
    %2276 = torch.prims.convert_element_type %2173, %int6_3129 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %int2_3130 = torch.constant.int 2
    %2277 = torch.prim.ListConstruct %int2_3130 : (!torch.int) -> !torch.list<int>
    %int0_3131 = torch.constant.int 0
    %true_3132 = torch.constant.bool true
    %result0_3133, %result1_3134 = torch.aten.var_mean.correction %2276, %2277, %int0_3131, %true_3132 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(var + eps), with eps ~= 1e-6.
    %float9.999990e-07_3135 = torch.constant.float 9.9999999999999995E-7
    %int1_3136 = torch.constant.int 1
    %2278 = torch.aten.add.Scalar %result0_3133, %float9.999990e-07_3135, %int1_3136 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %2279 = torch.aten.rsqrt %2278 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // The subtraction takes the f16 tensor %2173 and the f32 mean and yields f32.
    %int1_3137 = torch.constant.int 1
    %2280 = torch.aten.sub.Tensor %2173, %result1_3134, %int1_3137 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %2281 = torch.aten.mul.Tensor %2280, %2279 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Narrow to f16 before applying the [1,1,3072] multiplier and offset.
    %int5_3138 = torch.constant.int 5
    %2282 = torch.prims.convert_element_type %2281, %int5_3138 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %int1_3139 = torch.constant.int 1
    %int1_3140 = torch.constant.int 1
    %2283 = torch.aten.add.Scalar %2250, %int1_3139, %int1_3140 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %2284 = torch.aten.mul.Tensor %2283, %2282 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_3141 = torch.constant.int 1
    %2285 = torch.aten.add.Tensor %2284, %2249, %int1_3141 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Flatten to [4096,3072] for the following 2-D matmul.
    %int4096_3142 = torch.constant.int 4096
    %int3072_3143 = torch.constant.int 3072
    %2286 = torch.prim.ListConstruct %int4096_3142, %int3072_3143 : (!torch.int, !torch.int) -> !torch.list<int>
    %2287 = torch.aten.view %2285, %2286 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.6.img_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.6.img_attn.qkv.weight : tensor<9216x3072xf16>
    %2288 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.img_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_3144 = torch.constant.int 0
    %int1_3145 = torch.constant.int 1
    %2289 = torch.aten.transpose.int %2288, %int0_3144, %int1_3145 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.6.img_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.6.img_attn.qkv.bias : tensor<9216xf16>
    %2290 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.img_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    %int6_3146 = torch.constant.int 6
    %2291 = torch.prims.convert_element_type %2290, %int6_3146 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_3147 = torch.constant.int 6
    %2292 = torch.prims.convert_element_type %2287, %int6_3147 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_3148 = torch.constant.int 6
    %2293 = torch.prims.convert_element_type %2289, %int6_3148 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %2294 = torch.aten.mm %2292, %2293 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[4096,9216],f32>
    %int1_3149 = torch.constant.int 1
    %2295 = torch.aten.mul.Scalar %2294, %int1_3149 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int1_3150 = torch.constant.int 1
    %2296 = torch.aten.mul.Scalar %2291, %int1_3150 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_3151 = torch.constant.int 1
    %2297 = torch.aten.add.Tensor %2295, %2296, %int1_3151 : !torch.vtensor<[4096,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int5_3152 = torch.constant.int 5
    %2298 = torch.prims.convert_element_type %2297, %int5_3152 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f16>
    %int1_3153 = torch.constant.int 1
    %int4096_3154 = torch.constant.int 4096
    %int9216_3155 = torch.constant.int 9216
    %2299 = torch.prim.ListConstruct %int1_3153, %int4096_3154, %int9216_3155 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2300 = torch.aten.view %2298, %2299 : !torch.vtensor<[4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,9216],f16>
    // ---- Trace point "img_qkv" ----
    // Converts the torch value to a builtin tensor, erases the static shape, queries
    // the three dims, and hands tensor + dims to flow.tensor.trace under the key
    // "img_qkv". The value is then cast back to its static type and re-wrapped as a
    // torch vtensor (%2302), which is what the ops below consume.
    %2301 = torch_c.to_builtin_tensor %2300 : !torch.vtensor<[1,4096,9216],f16> -> tensor<1x4096x9216xf16>
    %cast_3156 = tensor.cast %2301 : tensor<1x4096x9216xf16> to tensor<?x?x?xf16>
    %c0_3157 = arith.constant 0 : index
    %dim_3158 = tensor.dim %cast_3156, %c0_3157 : tensor<?x?x?xf16>
    %c1_3159 = arith.constant 1 : index
    %dim_3160 = tensor.dim %cast_3156, %c1_3159 : tensor<?x?x?xf16>
    %c2_3161 = arith.constant 2 : index
    %dim_3162 = tensor.dim %cast_3156, %c2_3161 : tensor<?x?x?xf16>
    flow.tensor.trace "img_qkv" = [%cast_3156 : tensor<?x?x?xf16>{%dim_3158, %dim_3160, %dim_3162}]
    %cast_3163 = tensor.cast %cast_3156 : tensor<?x?x?xf16> to tensor<1x4096x9216xf16>
    %2302 = torch_c.from_builtin_tensor %cast_3163 : tensor<1x4096x9216xf16> -> !torch.vtensor<[1,4096,9216],f16>
    // ---- Image-stream query: split from fused QKV, then RMS-normalize ----
    // [1,4096,9216] is viewed as [1,4096,3,24,128] and permuted with (2,0,3,1,4) to
    // [3,1,24,4096,128]; index 0 along the leading dim of size 3 is selected here.
    %int1_3164 = torch.constant.int 1
    %int4096_3165 = torch.constant.int 4096
    %int3_3166 = torch.constant.int 3
    %int24_3167 = torch.constant.int 24
    %int128_3168 = torch.constant.int 128
    %2303 = torch.prim.ListConstruct %int1_3164, %int4096_3165, %int3_3166, %int24_3167, %int128_3168 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2304 = torch.aten.view %2302, %2303 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_3169 = torch.constant.int 2
    %int0_3170 = torch.constant.int 0
    %int3_3171 = torch.constant.int 3
    %int1_3172 = torch.constant.int 1
    %int4_3173 = torch.constant.int 4
    %2305 = torch.prim.ListConstruct %int2_3169, %int0_3170, %int3_3171, %int1_3172, %int4_3173 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2306 = torch.aten.permute %2304, %2305 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    %int0_3174 = torch.constant.int 0
    %int0_3175 = torch.constant.int 0
    // select(dim=0, index=0): the first of the three fused slices.
    %2307 = torch.aten.select.int %2306, %int0_3174, %int0_3175 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Normalization runs in f32: y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps).
    %int6_3176 = torch.constant.int 6
    %2308 = torch.prims.convert_element_type %2307, %int6_3176 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_3177 = torch.constant.int 2
    %2309 = torch.aten.pow.Tensor_Scalar %2308, %int2_3177 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_3178 = torch.constant.int -1
    %2310 = torch.prim.ListConstruct %int-1_3178 : (!torch.int) -> !torch.list<int>
    %true_3179 = torch.constant.bool true
    %none_3180 = torch.constant.none
    %2311 = torch.aten.mean.dim %2309, %2310, %true_3179, %none_3180 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    // eps, approximately 1e-6, added before rsqrt.
    %float9.999990e-07_3181 = torch.constant.float 9.9999999999999995E-7
    %int1_3182 = torch.constant.int 1
    %2312 = torch.aten.add.Scalar %2311, %float9.999990e-07_3181, %int1_3182 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %2313 = torch.aten.rsqrt %2312 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %2314 = torch.aten.mul.Tensor %2308, %2313 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Back to f16 before applying the learned per-channel scale ([128], broadcast).
    %int5_3183 = torch.constant.int 5
    %2315 = torch.prims.convert_element_type %2314, %int5_3183 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.6.img_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.6.img_attn.norm.query_norm.scale : tensor<128xf16>
    %2316 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.img_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %2317 = torch.aten.mul.Tensor %2315, %2316 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // ---- Image-stream key: split from fused QKV, then RMS-normalize ----
    // Repeats the view/permute of %2302 used for the query, but selects index 1
    // along dim 0 and scales with key_norm.scale.
    %int1_3184 = torch.constant.int 1
    %int4096_3185 = torch.constant.int 4096
    %int3_3186 = torch.constant.int 3
    %int24_3187 = torch.constant.int 24
    %int128_3188 = torch.constant.int 128
    %2318 = torch.prim.ListConstruct %int1_3184, %int4096_3185, %int3_3186, %int24_3187, %int128_3188 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2319 = torch.aten.view %2302, %2318 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_3189 = torch.constant.int 2
    %int0_3190 = torch.constant.int 0
    %int3_3191 = torch.constant.int 3
    %int1_3192 = torch.constant.int 1
    %int4_3193 = torch.constant.int 4
    %2320 = torch.prim.ListConstruct %int2_3189, %int0_3190, %int3_3191, %int1_3192, %int4_3193 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2321 = torch.aten.permute %2319, %2320 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    %int0_3194 = torch.constant.int 0
    %int1_3195 = torch.constant.int 1
    // select(dim=0, index=1): the second of the three fused slices.
    %2322 = torch.aten.select.int %2321, %int0_3194, %int1_3195 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // f32 normalization: y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps).
    %int6_3196 = torch.constant.int 6
    %2323 = torch.prims.convert_element_type %2322, %int6_3196 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_3197 = torch.constant.int 2
    %2324 = torch.aten.pow.Tensor_Scalar %2323, %int2_3197 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_3198 = torch.constant.int -1
    %2325 = torch.prim.ListConstruct %int-1_3198 : (!torch.int) -> !torch.list<int>
    %true_3199 = torch.constant.bool true
    %none_3200 = torch.constant.none
    %2326 = torch.aten.mean.dim %2324, %2325, %true_3199, %none_3200 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    // eps, approximately 1e-6, added before rsqrt.
    %float9.999990e-07_3201 = torch.constant.float 9.9999999999999995E-7
    %int1_3202 = torch.constant.int 1
    %2327 = torch.aten.add.Scalar %2326, %float9.999990e-07_3201, %int1_3202 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %2328 = torch.aten.rsqrt %2327 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %2329 = torch.aten.mul.Tensor %2323, %2328 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Back to f16, then apply the learned per-channel scale ([128], broadcast).
    %int5_3203 = torch.constant.int 5
    %2330 = torch.prims.convert_element_type %2329, %int5_3203 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.6.img_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.6.img_attn.norm.key_norm.scale : tensor<128xf16>
    %2331 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.img_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %2332 = torch.aten.mul.Tensor %2330, %2331 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // Casts of the normalized image query (%2317) and key (%2332) whose source and
    // result types are both f16, so the element type does not change.
    // NOTE(review): presumably a `.to(v.dtype)`-style cast in the traced model - confirm.
    %int5_3204 = torch.constant.int 5
    %2333 = torch.prims.convert_element_type %2317, %int5_3204 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_3205 = torch.constant.int 5
    %2334 = torch.prims.convert_element_type %2332, %int5_3205 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // ---- Text-stream normalization and modulation ----
    // Normalizes %2233 ([1,512,3072], defined before this excerpt) over dim 2 with no
    // learned affine: (x - mean) * rsqrt(var + eps), using correction = 0 for the
    // variance. The result is then modulated as (1 + %2271) * normed + %2270.
    %int6_3206 = torch.constant.int 6
    %2335 = torch.prims.convert_element_type %2233, %int6_3206 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %int2_3207 = torch.constant.int 2
    %2336 = torch.prim.ListConstruct %int2_3207 : (!torch.int) -> !torch.list<int>
    %int0_3208 = torch.constant.int 0
    %true_3209 = torch.constant.bool true
    // result0 = variance, result1 = mean; keepdim=true so both are [1,512,1].
    %result0_3210, %result1_3211 = torch.aten.var_mean.correction %2335, %2336, %int0_3208, %true_3209 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    %float9.999990e-07_3212 = torch.constant.float 9.9999999999999995E-7
    %int1_3213 = torch.constant.int 1
    %2337 = torch.aten.add.Scalar %result0_3210, %float9.999990e-07_3212, %int1_3213 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %2338 = torch.aten.rsqrt %2337 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    %int1_3214 = torch.constant.int 1
    // Note: the subtraction takes the original f16 %2233 (not the f32 copy %2335)
    // together with the f32 mean; the declared result type is f32.
    %2339 = torch.aten.sub.Tensor %2233, %result1_3211, %int1_3214 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %2340 = torch.aten.mul.Tensor %2339, %2338 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_3215 = torch.constant.int 5
    %2341 = torch.prims.convert_element_type %2340, %int5_3215 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %int1_3216 = torch.constant.int 1
    %int1_3217 = torch.constant.int 1
    // (1 + %2271) * normed + %2270. %2271 and %2270 are [1,1,3072] values defined
    // before this excerpt.
    // NOTE(review): presumably the modulation scale and shift - confirm.
    %2342 = torch.aten.add.Scalar %2271, %int1_3216, %int1_3217 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %2343 = torch.aten.mul.Tensor %2342, %2341 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_3218 = torch.constant.int 1
    %2344 = torch.aten.add.Tensor %2343, %2270, %int1_3218 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // ---- Text-stream QKV projection (global: double_blocks.6.txt_attn.qkv) ----
    // Computes view(x, [512,3072]) @ W^T + b in f32, then casts back to f16 and
    // restores the leading batch dimension, giving [1,512,9216].
    %int512_3219 = torch.constant.int 512
    %int3072_3220 = torch.constant.int 3072
    %2345 = torch.prim.ListConstruct %int512_3219, %int3072_3220 : (!torch.int, !torch.int) -> !torch.list<int>
    // Drop the unit batch dim so the projection can be expressed as a 2-D mm.
    %2346 = torch.aten.view %2344, %2345 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.6.txt_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.6.txt_attn.qkv.weight : tensor<9216x3072xf16>
    %2347 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.txt_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_3221 = torch.constant.int 0
    %int1_3222 = torch.constant.int 1
    // Weight is stored [out=9216, in=3072]; transpose to [3072, 9216] for x @ W^T.
    %2348 = torch.aten.transpose.int %2347, %int0_3221, %int1_3222 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.6.txt_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.6.txt_attn.qkv.bias : tensor<9216xf16>
    %2349 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.txt_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, activations and weight to f32 (dtype code 6) for the matmul.
    %int6_3223 = torch.constant.int 6
    %2350 = torch.prims.convert_element_type %2349, %int6_3223 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_3224 = torch.constant.int 6
    %2351 = torch.prims.convert_element_type %2346, %int6_3224 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_3225 = torch.constant.int 6
    %2352 = torch.prims.convert_element_type %2348, %int6_3225 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %2353 = torch.aten.mm %2351, %2352 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[512,9216],f32>
    // The two multiplications by 1 leave values unchanged.
    // NOTE(review): presumably the alpha/beta factors of a decomposed addmm - confirm.
    %int1_3226 = torch.constant.int 1
    %2354 = torch.aten.mul.Scalar %2353, %int1_3226 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int1_3227 = torch.constant.int 1
    %2355 = torch.aten.mul.Scalar %2350, %int1_3227 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_3228 = torch.constant.int 1
    // Bias add; the [9216] bias broadcasts over the 512 rows.
    %2356 = torch.aten.add.Tensor %2354, %2355, %int1_3228 : !torch.vtensor<[512,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    // Cast the projection back to f16 (dtype code 5).
    %int5_3229 = torch.constant.int 5
    %2357 = torch.prims.convert_element_type %2356, %int5_3229 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f16>
    %int1_3230 = torch.constant.int 1
    %int512_3231 = torch.constant.int 512
    %int9216_3232 = torch.constant.int 9216
    %2358 = torch.prim.ListConstruct %int1_3230, %int512_3231, %int9216_3232 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // Restore the leading batch dim: [512,9216] -> [1,512,9216].
    %2359 = torch.aten.view %2357, %2358 : !torch.vtensor<[512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,9216],f16>
    // ---- Trace point "txt_qkv" ----
    // Converts the torch value to a builtin tensor, erases the static shape, queries
    // the three dims, and hands tensor + dims to flow.tensor.trace under the key
    // "txt_qkv". The value is then cast back to its static type and re-wrapped as a
    // torch vtensor (%2361), which is what the ops below consume.
    %2360 = torch_c.to_builtin_tensor %2359 : !torch.vtensor<[1,512,9216],f16> -> tensor<1x512x9216xf16>
    %cast_3233 = tensor.cast %2360 : tensor<1x512x9216xf16> to tensor<?x?x?xf16>
    %c0_3234 = arith.constant 0 : index
    %dim_3235 = tensor.dim %cast_3233, %c0_3234 : tensor<?x?x?xf16>
    %c1_3236 = arith.constant 1 : index
    %dim_3237 = tensor.dim %cast_3233, %c1_3236 : tensor<?x?x?xf16>
    %c2_3238 = arith.constant 2 : index
    %dim_3239 = tensor.dim %cast_3233, %c2_3238 : tensor<?x?x?xf16>
    flow.tensor.trace "txt_qkv" = [%cast_3233 : tensor<?x?x?xf16>{%dim_3235, %dim_3237, %dim_3239}]
    %cast_3240 = tensor.cast %cast_3233 : tensor<?x?x?xf16> to tensor<1x512x9216xf16>
    %2361 = torch_c.from_builtin_tensor %cast_3240 : tensor<1x512x9216xf16> -> !torch.vtensor<[1,512,9216],f16>
    // ---- Text-stream query: split from fused QKV, then RMS-normalize ----
    // [1,512,9216] is viewed as [1,512,3,24,128] and permuted with (2,0,3,1,4) to
    // [3,1,24,512,128]; index 0 along the leading dim of size 3 is selected here.
    %int1_3241 = torch.constant.int 1
    %int512_3242 = torch.constant.int 512
    %int3_3243 = torch.constant.int 3
    %int24_3244 = torch.constant.int 24
    %int128_3245 = torch.constant.int 128
    %2362 = torch.prim.ListConstruct %int1_3241, %int512_3242, %int3_3243, %int24_3244, %int128_3245 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2363 = torch.aten.view %2361, %2362 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_3246 = torch.constant.int 2
    %int0_3247 = torch.constant.int 0
    %int3_3248 = torch.constant.int 3
    %int1_3249 = torch.constant.int 1
    %int4_3250 = torch.constant.int 4
    %2364 = torch.prim.ListConstruct %int2_3246, %int0_3247, %int3_3248, %int1_3249, %int4_3250 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2365 = torch.aten.permute %2363, %2364 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    %int0_3251 = torch.constant.int 0
    %int0_3252 = torch.constant.int 0
    // select(dim=0, index=0): the first of the three fused slices.
    %2366 = torch.aten.select.int %2365, %int0_3251, %int0_3252 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // f32 normalization: y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps).
    %int6_3253 = torch.constant.int 6
    %2367 = torch.prims.convert_element_type %2366, %int6_3253 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_3254 = torch.constant.int 2
    %2368 = torch.aten.pow.Tensor_Scalar %2367, %int2_3254 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_3255 = torch.constant.int -1
    %2369 = torch.prim.ListConstruct %int-1_3255 : (!torch.int) -> !torch.list<int>
    %true_3256 = torch.constant.bool true
    %none_3257 = torch.constant.none
    %2370 = torch.aten.mean.dim %2368, %2369, %true_3256, %none_3257 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    // eps, approximately 1e-6, added before rsqrt.
    %float9.999990e-07_3258 = torch.constant.float 9.9999999999999995E-7
    %int1_3259 = torch.constant.int 1
    %2371 = torch.aten.add.Scalar %2370, %float9.999990e-07_3258, %int1_3259 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %2372 = torch.aten.rsqrt %2371 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %2373 = torch.aten.mul.Tensor %2367, %2372 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    // Back to f16, then apply the learned per-channel scale ([128], broadcast).
    %int5_3260 = torch.constant.int 5
    %2374 = torch.prims.convert_element_type %2373, %int5_3260 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.6.txt_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.6.txt_attn.norm.query_norm.scale : tensor<128xf16>
    %2375 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.txt_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %2376 = torch.aten.mul.Tensor %2374, %2375 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // ---- Text-stream key: split from fused QKV, then RMS-normalize ----
    // Repeats the view/permute of %2361 used for the query, but selects index 1
    // along dim 0 and scales with key_norm.scale.
    %int1_3261 = torch.constant.int 1
    %int512_3262 = torch.constant.int 512
    %int3_3263 = torch.constant.int 3
    %int24_3264 = torch.constant.int 24
    %int128_3265 = torch.constant.int 128
    %2377 = torch.prim.ListConstruct %int1_3261, %int512_3262, %int3_3263, %int24_3264, %int128_3265 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2378 = torch.aten.view %2361, %2377 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_3266 = torch.constant.int 2
    %int0_3267 = torch.constant.int 0
    %int3_3268 = torch.constant.int 3
    %int1_3269 = torch.constant.int 1
    %int4_3270 = torch.constant.int 4
    %2379 = torch.prim.ListConstruct %int2_3266, %int0_3267, %int3_3268, %int1_3269, %int4_3270 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2380 = torch.aten.permute %2378, %2379 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    %int0_3271 = torch.constant.int 0
    %int1_3272 = torch.constant.int 1
    // select(dim=0, index=1): the second of the three fused slices.
    %2381 = torch.aten.select.int %2380, %int0_3271, %int1_3272 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // f32 normalization: y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps).
    %int6_3273 = torch.constant.int 6
    %2382 = torch.prims.convert_element_type %2381, %int6_3273 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_3274 = torch.constant.int 2
    %2383 = torch.aten.pow.Tensor_Scalar %2382, %int2_3274 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_3275 = torch.constant.int -1
    %2384 = torch.prim.ListConstruct %int-1_3275 : (!torch.int) -> !torch.list<int>
    %true_3276 = torch.constant.bool true
    %none_3277 = torch.constant.none
    %2385 = torch.aten.mean.dim %2383, %2384, %true_3276, %none_3277 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    // eps, approximately 1e-6, added before rsqrt.
    %float9.999990e-07_3278 = torch.constant.float 9.9999999999999995E-7
    %int1_3279 = torch.constant.int 1
    %2386 = torch.aten.add.Scalar %2385, %float9.999990e-07_3278, %int1_3279 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %2387 = torch.aten.rsqrt %2386 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %2388 = torch.aten.mul.Tensor %2382, %2387 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    // Back to f16, then apply the learned per-channel scale ([128], broadcast).
    %int5_3280 = torch.constant.int 5
    %2389 = torch.prims.convert_element_type %2388, %int5_3280 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.6.txt_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.6.txt_attn.norm.key_norm.scale : tensor<128xf16>
    %2390 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.txt_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %2391 = torch.aten.mul.Tensor %2389, %2390 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // ---- Build joint query and key ----
    // The two casts below are f16 -> f16, so the element type does not change.
    %int5_3281 = torch.constant.int 5
    %2392 = torch.prims.convert_element_type %2376, %int5_3281 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int5_3282 = torch.constant.int 5
    %2393 = torch.prims.convert_element_type %2391, %int5_3282 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Concatenate along dim 2 with the text tensor first (512 rows), then the image
    // tensor (4096 rows), giving 4608 rows. Query first:
    %2394 = torch.prim.ListConstruct %2392, %2333 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_3283 = torch.constant.int 2
    %2395 = torch.aten.cat %2394, %int2_3283 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Key, same ordering (text, then image).
    %2396 = torch.prim.ListConstruct %2393, %2334 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_3284 = torch.constant.int 2
    %2397 = torch.aten.cat %2396, %int2_3284 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- Build joint value ----
    // Index 2 along the leading dim of the permuted fused tensor is taken for both
    // streams. No normalization is applied to these slices before concatenation.
    // Text stream: view/permute %2361 again and select(dim=0, index=2).
    %int1_3285 = torch.constant.int 1
    %int512_3286 = torch.constant.int 512
    %int3_3287 = torch.constant.int 3
    %int24_3288 = torch.constant.int 24
    %int128_3289 = torch.constant.int 128
    %2398 = torch.prim.ListConstruct %int1_3285, %int512_3286, %int3_3287, %int24_3288, %int128_3289 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2399 = torch.aten.view %2361, %2398 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_3290 = torch.constant.int 2
    %int0_3291 = torch.constant.int 0
    %int3_3292 = torch.constant.int 3
    %int1_3293 = torch.constant.int 1
    %int4_3294 = torch.constant.int 4
    %2400 = torch.prim.ListConstruct %int2_3290, %int0_3291, %int3_3292, %int1_3293, %int4_3294 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2401 = torch.aten.permute %2399, %2400 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    %int0_3295 = torch.constant.int 0
    %int2_3296 = torch.constant.int 2
    %2402 = torch.aten.select.int %2401, %int0_3295, %int2_3296 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Image stream: view/permute %2302 again and select(dim=0, index=2).
    %int1_3297 = torch.constant.int 1
    %int4096_3298 = torch.constant.int 4096
    %int3_3299 = torch.constant.int 3
    %int24_3300 = torch.constant.int 24
    %int128_3301 = torch.constant.int 128
    %2403 = torch.prim.ListConstruct %int1_3297, %int4096_3298, %int3_3299, %int24_3300, %int128_3301 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2404 = torch.aten.view %2302, %2403 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_3302 = torch.constant.int 2
    %int0_3303 = torch.constant.int 0
    %int3_3304 = torch.constant.int 3
    %int1_3305 = torch.constant.int 1
    %int4_3306 = torch.constant.int 4
    %2405 = torch.prim.ListConstruct %int2_3302, %int0_3303, %int3_3304, %int1_3305, %int4_3306 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2406 = torch.aten.permute %2404, %2405 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    %int0_3307 = torch.constant.int 0
    %int2_3308 = torch.constant.int 2
    %2407 = torch.aten.select.int %2406, %int0_3307, %int2_3308 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Concatenate along dim 2, text first then image, matching the q/k ordering.
    %2408 = torch.prim.ListConstruct %2402, %2407 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_3309 = torch.constant.int 2
    %2409 = torch.aten.cat %2408, %int2_3309 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- Trace points "q", "k", "v" ----
    // Each of the three joint tensors goes through the same sequence: convert to a
    // builtin tensor, erase the static shape, query the four dims, trace under the
    // given key, cast back to the static type, and re-wrap as a torch vtensor.
    // Trace "q" (%2395 -> %2411).
    %2410 = torch_c.to_builtin_tensor %2395 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_3310 = tensor.cast %2410 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_3311 = arith.constant 0 : index
    %dim_3312 = tensor.dim %cast_3310, %c0_3311 : tensor<?x?x?x?xf16>
    %c1_3313 = arith.constant 1 : index
    %dim_3314 = tensor.dim %cast_3310, %c1_3313 : tensor<?x?x?x?xf16>
    %c2_3315 = arith.constant 2 : index
    %dim_3316 = tensor.dim %cast_3310, %c2_3315 : tensor<?x?x?x?xf16>
    %c3_3317 = arith.constant 3 : index
    %dim_3318 = tensor.dim %cast_3310, %c3_3317 : tensor<?x?x?x?xf16>
    flow.tensor.trace "q" = [%cast_3310 : tensor<?x?x?x?xf16>{%dim_3312, %dim_3314, %dim_3316, %dim_3318}]
    %cast_3319 = tensor.cast %cast_3310 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %2411 = torch_c.from_builtin_tensor %cast_3319 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Trace "k" (%2397 -> %2413).
    %2412 = torch_c.to_builtin_tensor %2397 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_3320 = tensor.cast %2412 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_3321 = arith.constant 0 : index
    %dim_3322 = tensor.dim %cast_3320, %c0_3321 : tensor<?x?x?x?xf16>
    %c1_3323 = arith.constant 1 : index
    %dim_3324 = tensor.dim %cast_3320, %c1_3323 : tensor<?x?x?x?xf16>
    %c2_3325 = arith.constant 2 : index
    %dim_3326 = tensor.dim %cast_3320, %c2_3325 : tensor<?x?x?x?xf16>
    %c3_3327 = arith.constant 3 : index
    %dim_3328 = tensor.dim %cast_3320, %c3_3327 : tensor<?x?x?x?xf16>
    flow.tensor.trace "k" = [%cast_3320 : tensor<?x?x?x?xf16>{%dim_3322, %dim_3324, %dim_3326, %dim_3328}]
    %cast_3329 = tensor.cast %cast_3320 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %2413 = torch_c.from_builtin_tensor %cast_3329 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Trace "v" (%2409 -> %2415).
    %2414 = torch_c.to_builtin_tensor %2409 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_3330 = tensor.cast %2414 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_3331 = arith.constant 0 : index
    %dim_3332 = tensor.dim %cast_3330, %c0_3331 : tensor<?x?x?x?xf16>
    %c1_3333 = arith.constant 1 : index
    %dim_3334 = tensor.dim %cast_3330, %c1_3333 : tensor<?x?x?x?xf16>
    %c2_3335 = arith.constant 2 : index
    %dim_3336 = tensor.dim %cast_3330, %c2_3335 : tensor<?x?x?x?xf16>
    %c3_3337 = arith.constant 3 : index
    %dim_3338 = tensor.dim %cast_3330, %c3_3337 : tensor<?x?x?x?xf16>
    flow.tensor.trace "v" = [%cast_3330 : tensor<?x?x?x?xf16>{%dim_3332, %dim_3334, %dim_3336, %dim_3338}]
    %cast_3339 = tensor.cast %cast_3330 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %2415 = torch_c.from_builtin_tensor %cast_3339 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    %int6_3340 = torch.constant.int 6
    %2416 = torch.prims.convert_element_type %2411, %int6_3340 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_3341 = torch.constant.int 1
    %int24_3342 = torch.constant.int 24
    %int4608_3343 = torch.constant.int 4608
    %int-1_3344 = torch.constant.int -1
    %int1_3345 = torch.constant.int 1
    %int2_3346 = torch.constant.int 2
    %2417 = torch.prim.ListConstruct %int1_3341, %int24_3342, %int4608_3343, %int-1_3344, %int1_3345, %int2_3346 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2418 = torch.aten.view %2416, %2417 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    %int6_3347 = torch.constant.int 6
    %2419 = torch.prims.convert_element_type %2413, %int6_3347 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_3348 = torch.constant.int 1
    %int24_3349 = torch.constant.int 24
    %int4608_3350 = torch.constant.int 4608
    %int-1_3351 = torch.constant.int -1
    %int1_3352 = torch.constant.int 1
    %int2_3353 = torch.constant.int 2
    %2420 = torch.prim.ListConstruct %int1_3348, %int24_3349, %int4608_3350, %int-1_3351, %int1_3352, %int2_3353 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2421 = torch.aten.view %2419, %2420 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    %int5_3354 = torch.constant.int 5
    %int0_3355 = torch.constant.int 0
    %2422 = torch.aten.select.int %211, %int5_3354, %int0_3355 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_3356 = torch.constant.int 5
    %int0_3357 = torch.constant.int 0
    %2423 = torch.aten.select.int %2418, %int5_3356, %int0_3357 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %2424 = torch.aten.mul.Tensor %2422, %2423 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_3358 = torch.constant.int 5
    %int1_3359 = torch.constant.int 1
    // Second term of a two-term combination: take index 1 along dim 5 of %211 and of
    // %2418, multiply them (the result type shows broadcasting to [1,24,4608,64,2]),
    // and add the product to %2424 with alpha = 1.
    // NOTE(review): %211, %2418 and %2424 are defined before this chunk; this looks like
    // the tail of a rotary-position-embedding application — confirm against the producer.
    %2425 = torch.aten.select.int %211, %int5_3358, %int1_3359 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_3360 = torch.constant.int 5
    %int1_3361 = torch.constant.int 1
    %2426 = torch.aten.select.int %2418, %int5_3360, %int1_3361 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %2427 = torch.aten.mul.Tensor %2425, %2426 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_3362 = torch.constant.int 1
    %2428 = torch.aten.add.Tensor %2424, %2427, %int1_3362 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Same combination applied to %2421 (defined before this chunk), this time with both
    // terms visible: %211[..., 0] * %2421[..., 0] + %211[..., 1] * %2421[..., 1] -> %2435.
    %int5_3363 = torch.constant.int 5
    %int0_3364 = torch.constant.int 0
    %2429 = torch.aten.select.int %211, %int5_3363, %int0_3364 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_3365 = torch.constant.int 5
    %int0_3366 = torch.constant.int 0
    %2430 = torch.aten.select.int %2421, %int5_3365, %int0_3366 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %2431 = torch.aten.mul.Tensor %2429, %2430 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_3367 = torch.constant.int 5
    %int1_3368 = torch.constant.int 1
    %2432 = torch.aten.select.int %211, %int5_3367, %int1_3368 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_3369 = torch.constant.int 5
    %int1_3370 = torch.constant.int 1
    %2433 = torch.aten.select.int %2421, %int5_3369, %int1_3370 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %2434 = torch.aten.mul.Tensor %2432, %2433 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_3371 = torch.constant.int 1
    %2435 = torch.aten.add.Tensor %2431, %2434, %int1_3371 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Collapse the trailing [64,2] dims of %2428 into 128 and cast f32 -> f16
    // (dtype code 5; the result type is f16). %2438 is the first operand of the
    // attention op that follows.
    %int1_3372 = torch.constant.int 1
    %int24_3373 = torch.constant.int 24
    %int4608_3374 = torch.constant.int 4608
    %int128_3375 = torch.constant.int 128
    %2436 = torch.prim.ListConstruct %int1_3372, %int24_3373, %int4608_3374, %int128_3375 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2437 = torch.aten.view %2428, %2436 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_3376 = torch.constant.int 5
    %2438 = torch.prims.convert_element_type %2437, %int5_3376 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Same reshape and f16 cast for %2435; %2441 is the second operand of the attention op.
    %int1_3377 = torch.constant.int 1
    %int24_3378 = torch.constant.int 24
    %int4608_3379 = torch.constant.int 4608
    %int128_3380 = torch.constant.int 128
    %2439 = torch.prim.ListConstruct %int1_3377, %int24_3378, %int4608_3379, %int128_3380 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2440 = torch.aten.view %2435, %2439 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_3381 = torch.constant.int 5
    %2441 = torch.prims.convert_element_type %2440, %int5_3381 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled-dot-product attention over %2438, %2441 and %2415 (%2415 is defined before
    // this chunk). The trailing operands are 0.0, false, none, none.
    // NOTE(review): per the ATen schema these should be dropout_p, is_causal, attn_mask
    // and scale — confirm against the PyTorch op signature.
    // Only result #0 is consumed in this chunk; result #1 ([1,24,4608] f32) is not used here.
    %float0.000000e00_3382 = torch.constant.float 0.000000e+00
    %false_3383 = torch.constant.bool false
    %none_3384 = torch.constant.none
    %none_3385 = torch.constant.none
    %2442:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%2438, %2441, %2415, %float0.000000e00_3382, %false_3383, %none_3384, %none_3385) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2 ([1,24,4608,128] -> [1,4608,24,128]), then merge the last two
    // dims: 24 * 128 = 3072.
    %int0_3386 = torch.constant.int 0
    %int2_3387 = torch.constant.int 2
    %int1_3388 = torch.constant.int 1
    %int3_3389 = torch.constant.int 3
    %2443 = torch.prim.ListConstruct %int0_3386, %int2_3387, %int1_3388, %int3_3389 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2444 = torch.aten.permute %2442#0, %2443 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_3390 = torch.constant.int 1
    %int4608_3391 = torch.constant.int 4608
    %int3072_3392 = torch.constant.int 3072
    %2445 = torch.prim.ListConstruct %int1_3390, %int4608_3391, %int3072_3392 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2446 = torch.aten.view %2444, %2445 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Split the attention output %2446 along dim 1 into rows [0, 512) -> %2448 and
    // rows [512, end) -> %2450 (4096 rows). The end bound 9223372036854775807 is
    // INT64_MAX, i.e. "to the end". The full-range slices over dim 0 leave the shape
    // unchanged.
    // %2448 later feeds the txt_attn.proj linear and %2450 the img_attn.proj linear.
    %int0_3393 = torch.constant.int 0
    %int0_3394 = torch.constant.int 0
    %int9223372036854775807_3395 = torch.constant.int 9223372036854775807
    %int1_3396 = torch.constant.int 1
    %2447 = torch.aten.slice.Tensor %2446, %int0_3393, %int0_3394, %int9223372036854775807_3395, %int1_3396 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_3397 = torch.constant.int 1
    %int0_3398 = torch.constant.int 0
    %int512_3399 = torch.constant.int 512
    %int1_3400 = torch.constant.int 1
    %2448 = torch.aten.slice.Tensor %2447, %int1_3397, %int0_3398, %int512_3399, %int1_3400 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %int0_3401 = torch.constant.int 0
    %int0_3402 = torch.constant.int 0
    %int9223372036854775807_3403 = torch.constant.int 9223372036854775807
    %int1_3404 = torch.constant.int 1
    %2449 = torch.aten.slice.Tensor %2446, %int0_3401, %int0_3402, %int9223372036854775807_3403, %int1_3404 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_3405 = torch.constant.int 1
    %int512_3406 = torch.constant.int 512
    %int9223372036854775807_3407 = torch.constant.int 9223372036854775807
    %int1_3408 = torch.constant.int 1
    %2450 = torch.aten.slice.Tensor %2449, %int1_3405, %int512_3406, %int9223372036854775807_3407, %int1_3408 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer with double_blocks.6.img_attn.proj on the 4096-row slice %2450:
    // flatten to [4096,3072], compute x @ W^T + b in f32, cast back to f16 and restore
    // the leading batch dim.
    %int4096_3409 = torch.constant.int 4096
    %int3072_3410 = torch.constant.int 3072
    %2451 = torch.prim.ListConstruct %int4096_3409, %int3072_3410 : (!torch.int, !torch.int) -> !torch.list<int>
    %2452 = torch.aten.view %2450, %2451 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.6.img_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.6.img_attn.proj.weight : tensor<3072x3072xf16>
    %2453 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.img_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_3411 = torch.constant.int 0
    %int1_3412 = torch.constant.int 1
    %2454 = torch.aten.transpose.int %2453, %int0_3411, %int1_3412 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.6.img_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.6.img_attn.proj.bias : tensor<3072xf16>
    %2455 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.img_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Promote bias, activation and weight to f32 (dtype code 6) so the matmul runs in f32.
    %int6_3413 = torch.constant.int 6
    %2456 = torch.prims.convert_element_type %2455, %int6_3413 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_3414 = torch.constant.int 6
    %2457 = torch.prims.convert_element_type %2452, %int6_3414 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_3415 = torch.constant.int 6
    %2458 = torch.prims.convert_element_type %2454, %int6_3415 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %2459 = torch.aten.mm %2457, %2458 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Both mul.Scalar ops multiply by the integer 1 (an addmm-style decomposition with
    // unit scale factors, presumably), so they do not change the values.
    %int1_3416 = torch.constant.int 1
    %2460 = torch.aten.mul.Scalar %2459, %int1_3416 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_3417 = torch.constant.int 1
    %2461 = torch.aten.mul.Scalar %2456, %int1_3417 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_3418 = torch.constant.int 1
    %2462 = torch.aten.add.Tensor %2460, %2461, %int1_3418 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_3419 = torch.constant.int 5
    %2463 = torch.prims.convert_element_type %2462, %int5_3419 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_3420 = torch.constant.int 1
    %int4096_3421 = torch.constant.int 4096
    %int3072_3422 = torch.constant.int 3072
    %2464 = torch.prim.ListConstruct %int1_3420, %int4096_3421, %int3072_3422 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2465 = torch.aten.view %2463, %2464 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Scale the projection elementwise by %2251 ([1,1,3072], broadcast over the 4096
    // rows) and add it to %2173.
    // NOTE(review): %2251 and %2173 are defined before this chunk — presumably the
    // modulation gate and the residual input of this block; confirm.
    %2466 = torch.aten.mul.Tensor %2251, %2465 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_3423 = torch.constant.int 1
    %2467 = torch.aten.add.Tensor %2173, %2466, %int1_3423 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Normalize %2467 over its last dim and apply a scale/shift:
    //   %2477 = (1 + %2253) * ((%2467 - mean) * rsqrt(var + 1e-6)) + %2252
    // The variance uses correction = 0 with keepdim = true, and there are no learned
    // affine parameters.
    // NOTE(review): %2253 and %2252 are defined before this chunk — presumably the
    // modulation scale and shift; confirm.
    %int1_3424 = torch.constant.int 1
    %int1_3425 = torch.constant.int 1
    %2468 = torch.aten.add.Scalar %2253, %int1_3424, %int1_3425 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int6_3426 = torch.constant.int 6
    %2469 = torch.prims.convert_element_type %2467, %int6_3426 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %int2_3427 = torch.constant.int 2
    %2470 = torch.prim.ListConstruct %int2_3427 : (!torch.int) -> !torch.list<int>
    %int0_3428 = torch.constant.int 0
    %true_3429 = torch.constant.bool true
    // result0 is the variance and result1 the mean; both have shape [1,4096,1].
    %result0_3430, %result1_3431 = torch.aten.var_mean.correction %2469, %2470, %int0_3428, %true_3429 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    %float9.999990e-07_3432 = torch.constant.float 9.9999999999999995E-7
    %int1_3433 = torch.constant.int 1
    %2471 = torch.aten.add.Scalar %result0_3430, %float9.999990e-07_3432, %int1_3433 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %2472 = torch.aten.rsqrt %2471 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    %int1_3434 = torch.constant.int 1
    // The subtraction reads the f16 tensor %2467 (not the f32 copy %2469); the result
    // type is f32.
    %2473 = torch.aten.sub.Tensor %2467, %result1_3431, %int1_3434 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %2474 = torch.aten.mul.Tensor %2473, %2472 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    %int5_3435 = torch.constant.int 5
    %2475 = torch.prims.convert_element_type %2474, %int5_3435 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %2476 = torch.aten.mul.Tensor %2468, %2475 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_3436 = torch.constant.int 1
    %2477 = torch.aten.add.Tensor %2476, %2252, %int1_3436 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Two-layer MLP on %2477 using double_blocks.6.img_mlp.{0,2}:
    //   3072 -> 12288 linear, tanh-approximate GELU, 12288 -> 3072 linear.
    // Each linear flattens to 2-D, computes x @ W^T + b in f32, and casts back to f16.
    // The output is scaled by %2254 and added to %2467.
    %int4096_3437 = torch.constant.int 4096
    %int3072_3438 = torch.constant.int 3072
    %2478 = torch.prim.ListConstruct %int4096_3437, %int3072_3438 : (!torch.int, !torch.int) -> !torch.list<int>
    %2479 = torch.aten.view %2477, %2478 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // First linear: img_mlp.0, weight [12288,3072] transposed to [3072,12288].
    %__auto.sampler.double_blocks.6.img_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.6.img_mlp.0.weight : tensor<12288x3072xf16>
    %2480 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.img_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_3439 = torch.constant.int 0
    %int1_3440 = torch.constant.int 1
    %2481 = torch.aten.transpose.int %2480, %int0_3439, %int1_3440 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.6.img_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.6.img_mlp.0.bias : tensor<12288xf16>
    %2482 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.img_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    %int6_3441 = torch.constant.int 6
    %2483 = torch.prims.convert_element_type %2482, %int6_3441 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_3442 = torch.constant.int 6
    %2484 = torch.prims.convert_element_type %2479, %int6_3442 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_3443 = torch.constant.int 6
    %2485 = torch.prims.convert_element_type %2481, %int6_3443 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %2486 = torch.aten.mm %2484, %2485 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[4096,12288],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    %int1_3444 = torch.constant.int 1
    %2487 = torch.aten.mul.Scalar %2486, %int1_3444 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int1_3445 = torch.constant.int 1
    %2488 = torch.aten.mul.Scalar %2483, %int1_3445 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_3446 = torch.constant.int 1
    %2489 = torch.aten.add.Tensor %2487, %2488, %int1_3446 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int5_3447 = torch.constant.int 5
    %2490 = torch.prims.convert_element_type %2489, %int5_3447 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f16>
    %int1_3448 = torch.constant.int 1
    %int4096_3449 = torch.constant.int 4096
    %int12288_3450 = torch.constant.int 12288
    %2491 = torch.prim.ListConstruct %int1_3448, %int4096_3449, %int12288_3450 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2492 = torch.aten.view %2490, %2491 : !torch.vtensor<[4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,12288],f16>
    // GELU with the "tanh" approximation, computed on the f16 tensor.
    %str_3451 = torch.constant.str "tanh"
    %2493 = torch.aten.gelu %2492, %str_3451 : !torch.vtensor<[1,4096,12288],f16>, !torch.str -> !torch.vtensor<[1,4096,12288],f16>
    %int4096_3452 = torch.constant.int 4096
    %int12288_3453 = torch.constant.int 12288
    %2494 = torch.prim.ListConstruct %int4096_3452, %int12288_3453 : (!torch.int, !torch.int) -> !torch.list<int>
    %2495 = torch.aten.view %2493, %2494 : !torch.vtensor<[1,4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[4096,12288],f16>
    // Second linear: img_mlp.2, weight [3072,12288] transposed to [12288,3072].
    %__auto.sampler.double_blocks.6.img_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.6.img_mlp.2.weight : tensor<3072x12288xf16>
    %2496 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.img_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_3454 = torch.constant.int 0
    %int1_3455 = torch.constant.int 1
    %2497 = torch.aten.transpose.int %2496, %int0_3454, %int1_3455 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.6.img_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.6.img_mlp.2.bias : tensor<3072xf16>
    %2498 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.img_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_3456 = torch.constant.int 6
    %2499 = torch.prims.convert_element_type %2498, %int6_3456 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_3457 = torch.constant.int 6
    %2500 = torch.prims.convert_element_type %2495, %int6_3457 : !torch.vtensor<[4096,12288],f16>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int6_3458 = torch.constant.int 6
    %2501 = torch.prims.convert_element_type %2497, %int6_3458 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %2502 = torch.aten.mm %2500, %2501 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    %int1_3459 = torch.constant.int 1
    %2503 = torch.aten.mul.Scalar %2502, %int1_3459 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_3460 = torch.constant.int 1
    %2504 = torch.aten.mul.Scalar %2499, %int1_3460 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_3461 = torch.constant.int 1
    %2505 = torch.aten.add.Tensor %2503, %2504, %int1_3461 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_3462 = torch.constant.int 5
    %2506 = torch.prims.convert_element_type %2505, %int5_3462 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_3463 = torch.constant.int 1
    %int4096_3464 = torch.constant.int 4096
    %int3072_3465 = torch.constant.int 3072
    %2507 = torch.prim.ListConstruct %int1_3463, %int4096_3464, %int3072_3465 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2508 = torch.aten.view %2506, %2507 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Scale by %2254 and add to %2467. %2510 is not consumed within this chunk.
    // NOTE(review): %2254 is defined before this chunk — presumably the MLP modulation
    // gate; confirm.
    %2509 = torch.aten.mul.Tensor %2254, %2508 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_3466 = torch.constant.int 1
    %2510 = torch.aten.add.Tensor %2467, %2509, %int1_3466 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer with double_blocks.6.txt_attn.proj on the 512-row slice %2448:
    // flatten to [512,3072], compute x @ W^T + b in f32, cast back to f16 and restore
    // the leading batch dim.
    %int512_3467 = torch.constant.int 512
    %int3072_3468 = torch.constant.int 3072
    %2511 = torch.prim.ListConstruct %int512_3467, %int3072_3468 : (!torch.int, !torch.int) -> !torch.list<int>
    %2512 = torch.aten.view %2448, %2511 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.6.txt_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.6.txt_attn.proj.weight : tensor<3072x3072xf16>
    %2513 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.txt_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_3469 = torch.constant.int 0
    %int1_3470 = torch.constant.int 1
    %2514 = torch.aten.transpose.int %2513, %int0_3469, %int1_3470 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.6.txt_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.6.txt_attn.proj.bias : tensor<3072xf16>
    %2515 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.txt_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Promote bias, activation and weight to f32 (dtype code 6) so the matmul runs in f32.
    %int6_3471 = torch.constant.int 6
    %2516 = torch.prims.convert_element_type %2515, %int6_3471 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_3472 = torch.constant.int 6
    %2517 = torch.prims.convert_element_type %2512, %int6_3472 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_3473 = torch.constant.int 6
    %2518 = torch.prims.convert_element_type %2514, %int6_3473 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %2519 = torch.aten.mm %2517, %2518 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    %int1_3474 = torch.constant.int 1
    %2520 = torch.aten.mul.Scalar %2519, %int1_3474 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_3475 = torch.constant.int 1
    %2521 = torch.aten.mul.Scalar %2516, %int1_3475 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_3476 = torch.constant.int 1
    %2522 = torch.aten.add.Tensor %2520, %2521, %int1_3476 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_3477 = torch.constant.int 5
    %2523 = torch.prims.convert_element_type %2522, %int5_3477 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_3478 = torch.constant.int 1
    %int512_3479 = torch.constant.int 512
    %int3072_3480 = torch.constant.int 3072
    %2524 = torch.prim.ListConstruct %int1_3478, %int512_3479, %int3072_3480 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2525 = torch.aten.view %2523, %2524 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Scale the projection elementwise by %2272 ([1,1,3072], broadcast over the 512
    // rows) and add it to %2233.
    // NOTE(review): %2272 and %2233 are defined before this chunk — presumably the
    // modulation gate and the residual input of this block; confirm.
    %2526 = torch.aten.mul.Tensor %2272, %2525 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_3481 = torch.constant.int 1
    %2527 = torch.aten.add.Tensor %2233, %2526, %int1_3481 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Normalize %2527 over its last dim and apply a scale/shift:
    //   %2537 = (1 + %2274) * ((%2527 - mean) * rsqrt(var + 1e-6)) + %2273
    // The variance uses correction = 0 with keepdim = true, and there are no learned
    // affine parameters.
    // NOTE(review): %2274 and %2273 are defined before this chunk — presumably the
    // modulation scale and shift; confirm.
    %int1_3482 = torch.constant.int 1
    %int1_3483 = torch.constant.int 1
    %2528 = torch.aten.add.Scalar %2274, %int1_3482, %int1_3483 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int6_3484 = torch.constant.int 6
    %2529 = torch.prims.convert_element_type %2527, %int6_3484 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %int2_3485 = torch.constant.int 2
    %2530 = torch.prim.ListConstruct %int2_3485 : (!torch.int) -> !torch.list<int>
    %int0_3486 = torch.constant.int 0
    %true_3487 = torch.constant.bool true
    // result0 is the variance and result1 the mean; both have shape [1,512,1].
    %result0_3488, %result1_3489 = torch.aten.var_mean.correction %2529, %2530, %int0_3486, %true_3487 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    %float9.999990e-07_3490 = torch.constant.float 9.9999999999999995E-7
    %int1_3491 = torch.constant.int 1
    %2531 = torch.aten.add.Scalar %result0_3488, %float9.999990e-07_3490, %int1_3491 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %2532 = torch.aten.rsqrt %2531 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    %int1_3492 = torch.constant.int 1
    // The subtraction reads the f16 tensor %2527 (not the f32 copy %2529); the result
    // type is f32.
    %2533 = torch.aten.sub.Tensor %2527, %result1_3489, %int1_3492 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %2534 = torch.aten.mul.Tensor %2533, %2532 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_3493 = torch.constant.int 5
    %2535 = torch.prims.convert_element_type %2534, %int5_3493 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %2536 = torch.aten.mul.Tensor %2528, %2535 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_3494 = torch.constant.int 1
    %2537 = torch.aten.add.Tensor %2536, %2273, %int1_3494 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Two-layer MLP on %2537 using double_blocks.6.txt_mlp.{0,2}:
    //   3072 -> 12288 linear, tanh-approximate GELU, 12288 -> 3072 linear.
    // Each linear flattens to 2-D, computes x @ W^T + b in f32, and casts back to f16.
    // The output is scaled by %2275 and added to %2527.
    %int512_3495 = torch.constant.int 512
    %int3072_3496 = torch.constant.int 3072
    %2538 = torch.prim.ListConstruct %int512_3495, %int3072_3496 : (!torch.int, !torch.int) -> !torch.list<int>
    %2539 = torch.aten.view %2537, %2538 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // First linear: txt_mlp.0, weight [12288,3072] transposed to [3072,12288].
    %__auto.sampler.double_blocks.6.txt_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.6.txt_mlp.0.weight : tensor<12288x3072xf16>
    %2540 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.txt_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_3497 = torch.constant.int 0
    %int1_3498 = torch.constant.int 1
    %2541 = torch.aten.transpose.int %2540, %int0_3497, %int1_3498 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.6.txt_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.6.txt_mlp.0.bias : tensor<12288xf16>
    %2542 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.txt_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    %int6_3499 = torch.constant.int 6
    %2543 = torch.prims.convert_element_type %2542, %int6_3499 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_3500 = torch.constant.int 6
    %2544 = torch.prims.convert_element_type %2539, %int6_3500 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_3501 = torch.constant.int 6
    %2545 = torch.prims.convert_element_type %2541, %int6_3501 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %2546 = torch.aten.mm %2544, %2545 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[512,12288],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    %int1_3502 = torch.constant.int 1
    %2547 = torch.aten.mul.Scalar %2546, %int1_3502 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int1_3503 = torch.constant.int 1
    %2548 = torch.aten.mul.Scalar %2543, %int1_3503 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_3504 = torch.constant.int 1
    %2549 = torch.aten.add.Tensor %2547, %2548, %int1_3504 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int5_3505 = torch.constant.int 5
    %2550 = torch.prims.convert_element_type %2549, %int5_3505 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f16>
    %int1_3506 = torch.constant.int 1
    %int512_3507 = torch.constant.int 512
    %int12288_3508 = torch.constant.int 12288
    %2551 = torch.prim.ListConstruct %int1_3506, %int512_3507, %int12288_3508 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2552 = torch.aten.view %2550, %2551 : !torch.vtensor<[512,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,512,12288],f16>
    // GELU with the "tanh" approximation, computed on the f16 tensor.
    %str_3509 = torch.constant.str "tanh"
    %2553 = torch.aten.gelu %2552, %str_3509 : !torch.vtensor<[1,512,12288],f16>, !torch.str -> !torch.vtensor<[1,512,12288],f16>
    %int512_3510 = torch.constant.int 512
    %int12288_3511 = torch.constant.int 12288
    %2554 = torch.prim.ListConstruct %int512_3510, %int12288_3511 : (!torch.int, !torch.int) -> !torch.list<int>
    %2555 = torch.aten.view %2553, %2554 : !torch.vtensor<[1,512,12288],f16>, !torch.list<int> -> !torch.vtensor<[512,12288],f16>
    // Second linear: txt_mlp.2, weight [3072,12288] transposed to [12288,3072].
    %__auto.sampler.double_blocks.6.txt_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.6.txt_mlp.2.weight : tensor<3072x12288xf16>
    %2556 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.txt_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_3512 = torch.constant.int 0
    %int1_3513 = torch.constant.int 1
    %2557 = torch.aten.transpose.int %2556, %int0_3512, %int1_3513 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.6.txt_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.6.txt_mlp.2.bias : tensor<3072xf16>
    %2558 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.txt_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_3514 = torch.constant.int 6
    %2559 = torch.prims.convert_element_type %2558, %int6_3514 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_3515 = torch.constant.int 6
    %2560 = torch.prims.convert_element_type %2555, %int6_3515 : !torch.vtensor<[512,12288],f16>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int6_3516 = torch.constant.int 6
    %2561 = torch.prims.convert_element_type %2557, %int6_3516 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %2562 = torch.aten.mm %2560, %2561 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[512,3072],f32>
    %int1_3517 = torch.constant.int 1
    %2563 = torch.aten.mul.Scalar %2562, %int1_3517 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_3518 = torch.constant.int 1
    %2564 = torch.aten.mul.Scalar %2559, %int1_3518 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_3519 = torch.constant.int 1
    %2565 = torch.aten.add.Tensor %2563, %2564, %int1_3519 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_3520 = torch.constant.int 5
    %2566 = torch.prims.convert_element_type %2565, %int5_3520 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_3521 = torch.constant.int 1
    %int512_3522 = torch.constant.int 512
    %int3072_3523 = torch.constant.int 3072
    %2567 = torch.prim.ListConstruct %int1_3521, %int512_3522, %int3072_3523 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2568 = torch.aten.view %2566, %2567 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Scale by %2275 and add to %2527. %2570 is not consumed within this chunk.
    // NOTE(review): %2275 is defined before this chunk — presumably the MLP modulation
    // gate; confirm.
    %2569 = torch.aten.mul.Tensor %2275, %2568 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_3524 = torch.constant.int 1
    %2570 = torch.aten.add.Tensor %2527, %2569, %int1_3524 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %2571 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.double_blocks.7.img_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.7.img_mod.lin.weight : tensor<18432x3072xf16>
    %2572 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.img_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_3525 = torch.constant.int 0
    %int1_3526 = torch.constant.int 1
    %2573 = torch.aten.transpose.int %2572, %int0_3525, %int1_3526 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.7.img_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.7.img_mod.lin.bias : tensor<18432xf16>
    %2574 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.img_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    %int6_3527 = torch.constant.int 6
    %2575 = torch.prims.convert_element_type %2574, %int6_3527 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_3528 = torch.constant.int 6
    %2576 = torch.prims.convert_element_type %2571, %int6_3528 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_3529 = torch.constant.int 6
    %2577 = torch.prims.convert_element_type %2573, %int6_3529 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %2578 = torch.aten.mm %2576, %2577 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    %int1_3530 = torch.constant.int 1
    // Tail of a linear-layer style computation whose earlier ops sit above this
    // excerpt: the f32 values %2578 ([1,18432]) and %2575 ([18432]) are each
    // multiplied by the integer 1, added together (alpha = 1), and the sum is
    // converted to f16 (dtype code 5 -> f16 per the result type).
    // NOTE(review): %2578/%2575 are presumably the matmul product and bias of the
    // double_blocks.7 image-modulation projection; their producers are not
    // visible here -- confirm.
    %2579 = torch.aten.mul.Scalar %2578, %int1_3530 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_3531 = torch.constant.int 1
    %2580 = torch.aten.mul.Scalar %2575, %int1_3531 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_3532 = torch.constant.int 1
    %2581 = torch.aten.add.Tensor %2579, %2580, %int1_3532 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_3533 = torch.constant.int 5
    %2582 = torch.prims.convert_element_type %2581, %int5_3533 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (start 0, end 9223372036854775807, step 1); the
    // result type shows the shape is unchanged.
    %int0_3534 = torch.constant.int 0
    %int0_3535 = torch.constant.int 0
    %int9223372036854775807_3536 = torch.constant.int 9223372036854775807
    %int1_3537 = torch.constant.int 1
    %2583 = torch.aten.slice.Tensor %2582, %int0_3534, %int0_3535, %int9223372036854775807_3536, %int1_3537 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a unit axis at dim 1: [1,18432] -> [1,1,18432].
    %int1_3538 = torch.constant.int 1
    %2584 = torch.aten.unsqueeze %2583, %int1_3538 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Full-range slice on dim 2; shape again unchanged.
    %int2_3539 = torch.constant.int 2
    %int0_3540 = torch.constant.int 0
    %int9223372036854775807_3541 = torch.constant.int 9223372036854775807
    %int1_3542 = torch.constant.int 1
    %2585 = torch.aten.slice.Tensor %2584, %int2_3539, %int0_3540, %int9223372036854775807_3541, %int1_3542 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Split the last dimension (dim -1) of %2585 into six contiguous
    // [1,1,3072] chunks. Chunk 0: elements [0, 3072) -> %2586.
    %int-1_3543 = torch.constant.int -1
    %int0_3544 = torch.constant.int 0
    %int3072_3545 = torch.constant.int 3072
    %int1_3546 = torch.constant.int 1
    %2586 = torch.aten.slice.Tensor %2585, %int-1_3543, %int0_3544, %int3072_3545, %int1_3546 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1: elements [3072, 6144) -> %2587.
    %int-1_3547 = torch.constant.int -1
    %int3072_3548 = torch.constant.int 3072
    %int6144_3549 = torch.constant.int 6144
    %int1_3550 = torch.constant.int 1
    %2587 = torch.aten.slice.Tensor %2585, %int-1_3547, %int3072_3548, %int6144_3549, %int1_3550 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2: elements [6144, 9216) -> %2588.
    %int-1_3551 = torch.constant.int -1
    %int6144_3552 = torch.constant.int 6144
    %int9216_3553 = torch.constant.int 9216
    %int1_3554 = torch.constant.int 1
    %2588 = torch.aten.slice.Tensor %2585, %int-1_3551, %int6144_3552, %int9216_3553, %int1_3554 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 3: elements [9216, 12288) -> %2589.
    %int-1_3555 = torch.constant.int -1
    %int9216_3556 = torch.constant.int 9216
    %int12288_3557 = torch.constant.int 12288
    %int1_3558 = torch.constant.int 1
    %2589 = torch.aten.slice.Tensor %2585, %int-1_3555, %int9216_3556, %int12288_3557, %int1_3558 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 4: elements [12288, 15360) -> %2590.
    %int-1_3559 = torch.constant.int -1
    %int12288_3560 = torch.constant.int 12288
    %int15360_3561 = torch.constant.int 15360
    %int1_3562 = torch.constant.int 1
    %2590 = torch.aten.slice.Tensor %2585, %int-1_3559, %int12288_3560, %int15360_3561, %int1_3562 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 5: elements [15360, 18432) -> %2591.
    %int-1_3563 = torch.constant.int -1
    %int15360_3564 = torch.constant.int 15360
    %int18432_3565 = torch.constant.int 18432
    %int1_3566 = torch.constant.int 1
    %2591 = torch.aten.slice.Tensor %2585, %int-1_3563, %int15360_3564, %int18432_3565, %int1_3566 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Linear layer using the double_blocks.7.txt_mod.lin parameters, applied to
    // SiLU(%119), followed by a split of the 18432-wide output into six
    // [1,1,3072] chunks (%2607..%2612).
    %2592 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the [18432,3072] f16 weight and transpose it to [3072,18432] so it can
    // be the right-hand operand of the matmul below.
    %__auto.sampler.double_blocks.7.txt_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.7.txt_mod.lin.weight : tensor<18432x3072xf16>
    %2593 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.txt_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_3567 = torch.constant.int 0
    %int1_3568 = torch.constant.int 1
    %2594 = torch.aten.transpose.int %2593, %int0_3567, %int1_3568 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.7.txt_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.7.txt_mod.lin.bias : tensor<18432xf16>
    %2595 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.txt_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Convert bias, activation and transposed weight to f32 (dtype code 6 -> f32
    // per the result types) so the matmul and bias add run in f32.
    %int6_3569 = torch.constant.int 6
    %2596 = torch.prims.convert_element_type %2595, %int6_3569 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_3570 = torch.constant.int 6
    %2597 = torch.prims.convert_element_type %2592, %int6_3570 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_3571 = torch.constant.int 6
    %2598 = torch.prims.convert_element_type %2594, %int6_3571 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %2599 = torch.aten.mm %2597, %2598 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Product and bias are each multiplied by the integer 1, then summed
    // (alpha = 1) and converted back to f16 (dtype code 5).
    %int1_3572 = torch.constant.int 1
    %2600 = torch.aten.mul.Scalar %2599, %int1_3572 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_3573 = torch.constant.int 1
    %2601 = torch.aten.mul.Scalar %2596, %int1_3573 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_3574 = torch.constant.int 1
    %2602 = torch.aten.add.Tensor %2600, %2601, %int1_3574 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_3575 = torch.constant.int 5
    %2603 = torch.prims.convert_element_type %2602, %int5_3575 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0, unsqueeze at dim 1, full-range slice on dim 2:
    // net effect on the type is [1,18432] -> [1,1,18432].
    %int0_3576 = torch.constant.int 0
    %int0_3577 = torch.constant.int 0
    %int9223372036854775807_3578 = torch.constant.int 9223372036854775807
    %int1_3579 = torch.constant.int 1
    %2604 = torch.aten.slice.Tensor %2603, %int0_3576, %int0_3577, %int9223372036854775807_3578, %int1_3579 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    %int1_3580 = torch.constant.int 1
    %2605 = torch.aten.unsqueeze %2604, %int1_3580 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    %int2_3581 = torch.constant.int 2
    %int0_3582 = torch.constant.int 0
    %int9223372036854775807_3583 = torch.constant.int 9223372036854775807
    %int1_3584 = torch.constant.int 1
    %2606 = torch.aten.slice.Tensor %2605, %int2_3581, %int0_3582, %int9223372036854775807_3583, %int1_3584 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Six contiguous 3072-wide slices of dim -1.
    // Chunk 0: [0, 3072) -> %2607.
    %int-1_3585 = torch.constant.int -1
    %int0_3586 = torch.constant.int 0
    %int3072_3587 = torch.constant.int 3072
    %int1_3588 = torch.constant.int 1
    %2607 = torch.aten.slice.Tensor %2606, %int-1_3585, %int0_3586, %int3072_3587, %int1_3588 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1: [3072, 6144) -> %2608.
    %int-1_3589 = torch.constant.int -1
    %int3072_3590 = torch.constant.int 3072
    %int6144_3591 = torch.constant.int 6144
    %int1_3592 = torch.constant.int 1
    %2608 = torch.aten.slice.Tensor %2606, %int-1_3589, %int3072_3590, %int6144_3591, %int1_3592 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2: [6144, 9216) -> %2609.
    %int-1_3593 = torch.constant.int -1
    %int6144_3594 = torch.constant.int 6144
    %int9216_3595 = torch.constant.int 9216
    %int1_3596 = torch.constant.int 1
    %2609 = torch.aten.slice.Tensor %2606, %int-1_3593, %int6144_3594, %int9216_3595, %int1_3596 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 3: [9216, 12288) -> %2610.
    %int-1_3597 = torch.constant.int -1
    %int9216_3598 = torch.constant.int 9216
    %int12288_3599 = torch.constant.int 12288
    %int1_3600 = torch.constant.int 1
    %2610 = torch.aten.slice.Tensor %2606, %int-1_3597, %int9216_3598, %int12288_3599, %int1_3600 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 4: [12288, 15360) -> %2611.
    %int-1_3601 = torch.constant.int -1
    %int12288_3602 = torch.constant.int 12288
    %int15360_3603 = torch.constant.int 15360
    %int1_3604 = torch.constant.int 1
    %2611 = torch.aten.slice.Tensor %2606, %int-1_3601, %int12288_3602, %int15360_3603, %int1_3604 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 5: [15360, 18432) -> %2612.
    %int-1_3605 = torch.constant.int -1
    %int15360_3606 = torch.constant.int 15360
    %int18432_3607 = torch.constant.int 18432
    %int1_3608 = torch.constant.int 1
    %2612 = torch.aten.slice.Tensor %2606, %int-1_3605, %int15360_3606, %int18432_3607, %int1_3608 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalise %2510 ([1,4096,3072] f16) over its last dimension with no learned
    // scale/bias, then apply (1 + %2587) * normalised + %2586 and flatten to
    // [4096,3072].
    // Statistics are computed in f32: variance and mean over dim 2 with
    // correction 0 and keepdim = true, giving two [1,4096,1] tensors.
    %int6_3609 = torch.constant.int 6
    %2613 = torch.prims.convert_element_type %2510, %int6_3609 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %int2_3610 = torch.constant.int 2
    %2614 = torch.prim.ListConstruct %int2_3610 : (!torch.int) -> !torch.list<int>
    %int0_3611 = torch.constant.int 0
    %true_3612 = torch.constant.bool true
    %result0_3613, %result1_3614 = torch.aten.var_mean.correction %2613, %2614, %int0_3611, %true_3612 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(variance + 9.9999999999999995E-7).
    %float9.999990e-07_3615 = torch.constant.float 9.9999999999999995E-7
    %int1_3616 = torch.constant.int 1
    %2615 = torch.aten.add.Scalar %result0_3613, %float9.999990e-07_3615, %int1_3616 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %2616 = torch.aten.rsqrt %2615 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // The subtraction takes the original f16 %2510 (not the f32 copy %2613) and
    // the f32 mean; the result type is f32.
    %int1_3617 = torch.constant.int 1
    %2617 = torch.aten.sub.Tensor %2510, %result1_3614, %int1_3617 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %2618 = torch.aten.mul.Tensor %2617, %2616 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    %int5_3618 = torch.constant.int 5
    %2619 = torch.prims.convert_element_type %2618, %int5_3618 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Modulation in f16: (%2587 + 1) * %2619 + %2586, with the [1,1,3072]
    // operands broadcast against [1,4096,3072].
    %int1_3619 = torch.constant.int 1
    %int1_3620 = torch.constant.int 1
    %2620 = torch.aten.add.Scalar %2587, %int1_3619, %int1_3620 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %2621 = torch.aten.mul.Tensor %2620, %2619 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_3621 = torch.constant.int 1
    %2622 = torch.aten.add.Tensor %2621, %2586, %int1_3621 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Drop the leading unit dimension: [1,4096,3072] -> [4096,3072].
    %int4096_3622 = torch.constant.int 4096
    %int3072_3623 = torch.constant.int 3072
    %2623 = torch.prim.ListConstruct %int4096_3622, %int3072_3623 : (!torch.int, !torch.int) -> !torch.list<int>
    %2624 = torch.aten.view %2622, %2623 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Linear layer using the double_blocks.7.img_attn.qkv parameters:
    // [4096,3072] x [3072,9216] + bias, computed in f32 and stored as f16, then
    // reshaped to [1,4096,9216] and emitted through a trace point named "img_qkv".
    %__auto.sampler.double_blocks.7.img_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.7.img_attn.qkv.weight : tensor<9216x3072xf16>
    %2625 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.img_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    // Transpose the [9216,3072] weight to [3072,9216] for use as the matmul RHS.
    %int0_3624 = torch.constant.int 0
    %int1_3625 = torch.constant.int 1
    %2626 = torch.aten.transpose.int %2625, %int0_3624, %int1_3625 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.7.img_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.7.img_attn.qkv.bias : tensor<9216xf16>
    %2627 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.img_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Convert bias, input and weight to f32 (dtype code 6) before the matmul.
    %int6_3626 = torch.constant.int 6
    %2628 = torch.prims.convert_element_type %2627, %int6_3626 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_3627 = torch.constant.int 6
    %2629 = torch.prims.convert_element_type %2624, %int6_3627 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_3628 = torch.constant.int 6
    %2630 = torch.prims.convert_element_type %2626, %int6_3628 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %2631 = torch.aten.mm %2629, %2630 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[4096,9216],f32>
    // Product and bias are each multiplied by the integer 1, summed (alpha = 1),
    // and the result is converted to f16 (dtype code 5).
    %int1_3629 = torch.constant.int 1
    %2632 = torch.aten.mul.Scalar %2631, %int1_3629 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int1_3630 = torch.constant.int 1
    %2633 = torch.aten.mul.Scalar %2628, %int1_3630 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_3631 = torch.constant.int 1
    %2634 = torch.aten.add.Tensor %2632, %2633, %int1_3631 : !torch.vtensor<[4096,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int5_3632 = torch.constant.int 5
    %2635 = torch.prims.convert_element_type %2634, %int5_3632 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f16>
    // Restore the leading unit dimension: [4096,9216] -> [1,4096,9216].
    %int1_3633 = torch.constant.int 1
    %int4096_3634 = torch.constant.int 4096
    %int9216_3635 = torch.constant.int 9216
    %2636 = torch.prim.ListConstruct %int1_3633, %int4096_3634, %int9216_3635 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2637 = torch.aten.view %2635, %2636 : !torch.vtensor<[4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,9216],f16>
    // Trace point: convert to a builtin tensor, cast to a fully dynamic shape,
    // query the three dims, and pass the tensor to flow.tensor.trace under the
    // key "img_qkv". The tensor is then cast back to its static shape and
    // re-wrapped as a torch value (%2639), which is what later ops consume.
    %2638 = torch_c.to_builtin_tensor %2637 : !torch.vtensor<[1,4096,9216],f16> -> tensor<1x4096x9216xf16>
    %cast_3636 = tensor.cast %2638 : tensor<1x4096x9216xf16> to tensor<?x?x?xf16>
    %c0_3637 = arith.constant 0 : index
    %dim_3638 = tensor.dim %cast_3636, %c0_3637 : tensor<?x?x?xf16>
    %c1_3639 = arith.constant 1 : index
    %dim_3640 = tensor.dim %cast_3636, %c1_3639 : tensor<?x?x?xf16>
    %c2_3641 = arith.constant 2 : index
    %dim_3642 = tensor.dim %cast_3636, %c2_3641 : tensor<?x?x?xf16>
    flow.tensor.trace "img_qkv" = [%cast_3636 : tensor<?x?x?xf16>{%dim_3638, %dim_3640, %dim_3642}]
    %cast_3643 = tensor.cast %cast_3636 : tensor<?x?x?xf16> to tensor<1x4096x9216xf16>
    %2639 = torch_c.from_builtin_tensor %cast_3643 : tensor<1x4096x9216xf16> -> !torch.vtensor<[1,4096,9216],f16>
    // Take slot 0 of the packed projection %2639 and RMS-normalise it over the
    // last (128-wide) dimension, then scale by img_attn.norm.query_norm.scale.
    // NOTE(review): slot 0 is presumably the attention query, based on the
    // "query_norm" parameter name -- confirm against the model definition.
    // View [1,4096,9216] as [1,4096,3,24,128] (3 * 24 * 128 = 9216).
    %int1_3644 = torch.constant.int 1
    %int4096_3645 = torch.constant.int 4096
    %int3_3646 = torch.constant.int 3
    %int24_3647 = torch.constant.int 24
    %int128_3648 = torch.constant.int 128
    %2640 = torch.prim.ListConstruct %int1_3644, %int4096_3645, %int3_3646, %int24_3647, %int128_3648 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2641 = torch.aten.view %2639, %2640 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    // Permute with order (2, 0, 3, 1, 4): [1,4096,3,24,128] -> [3,1,24,4096,128].
    %int2_3649 = torch.constant.int 2
    %int0_3650 = torch.constant.int 0
    %int3_3651 = torch.constant.int 3
    %int1_3652 = torch.constant.int 1
    %int4_3653 = torch.constant.int 4
    %2642 = torch.prim.ListConstruct %int2_3649, %int0_3650, %int3_3651, %int1_3652, %int4_3653 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2643 = torch.aten.permute %2641, %2642 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // Select index 0 along dim 0 -> [1,24,4096,128].
    %int0_3654 = torch.constant.int 0
    %int0_3655 = torch.constant.int 0
    %2644 = torch.aten.select.int %2643, %int0_3654, %int0_3655 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // RMS normalisation in f32: x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps)
    // with eps = 9.9999999999999995E-7.
    %int6_3656 = torch.constant.int 6
    %2645 = torch.prims.convert_element_type %2644, %int6_3656 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_3657 = torch.constant.int 2
    %2646 = torch.aten.pow.Tensor_Scalar %2645, %int2_3657 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_3658 = torch.constant.int -1
    %2647 = torch.prim.ListConstruct %int-1_3658 : (!torch.int) -> !torch.list<int>
    %true_3659 = torch.constant.bool true
    %none_3660 = torch.constant.none
    %2648 = torch.aten.mean.dim %2646, %2647, %true_3659, %none_3660 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_3661 = torch.constant.float 9.9999999999999995E-7
    %int1_3662 = torch.constant.int 1
    %2649 = torch.aten.add.Scalar %2648, %float9.999990e-07_3661, %int1_3662 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %2650 = torch.aten.rsqrt %2649 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %2651 = torch.aten.mul.Tensor %2645, %2650 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Back to f16, then multiply by the [128] scale parameter (broadcast over the
    // leading dimensions).
    %int5_3663 = torch.constant.int 5
    %2652 = torch.prims.convert_element_type %2651, %int5_3663 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.7.img_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.7.img_attn.norm.query_norm.scale : tensor<128xf16>
    %2653 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.img_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %2654 = torch.aten.mul.Tensor %2652, %2653 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // Take slot 1 of the packed projection %2639 and RMS-normalise it over the
    // last (128-wide) dimension, then scale by img_attn.norm.key_norm.scale.
    // The view/permute of %2639 is repeated here rather than reusing %2643.
    // NOTE(review): slot 1 is presumably the attention key, based on the
    // "key_norm" parameter name -- confirm against the model definition.
    %int1_3664 = torch.constant.int 1
    %int4096_3665 = torch.constant.int 4096
    %int3_3666 = torch.constant.int 3
    %int24_3667 = torch.constant.int 24
    %int128_3668 = torch.constant.int 128
    %2655 = torch.prim.ListConstruct %int1_3664, %int4096_3665, %int3_3666, %int24_3667, %int128_3668 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2656 = torch.aten.view %2639, %2655 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    // Permute with order (2, 0, 3, 1, 4): [1,4096,3,24,128] -> [3,1,24,4096,128].
    %int2_3669 = torch.constant.int 2
    %int0_3670 = torch.constant.int 0
    %int3_3671 = torch.constant.int 3
    %int1_3672 = torch.constant.int 1
    %int4_3673 = torch.constant.int 4
    %2657 = torch.prim.ListConstruct %int2_3669, %int0_3670, %int3_3671, %int1_3672, %int4_3673 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2658 = torch.aten.permute %2656, %2657 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // Select index 1 along dim 0 -> [1,24,4096,128].
    %int0_3674 = torch.constant.int 0
    %int1_3675 = torch.constant.int 1
    %2659 = torch.aten.select.int %2658, %int0_3674, %int1_3675 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // RMS normalisation in f32: x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps)
    // with eps = 9.9999999999999995E-7.
    %int6_3676 = torch.constant.int 6
    %2660 = torch.prims.convert_element_type %2659, %int6_3676 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_3677 = torch.constant.int 2
    %2661 = torch.aten.pow.Tensor_Scalar %2660, %int2_3677 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_3678 = torch.constant.int -1
    %2662 = torch.prim.ListConstruct %int-1_3678 : (!torch.int) -> !torch.list<int>
    %true_3679 = torch.constant.bool true
    %none_3680 = torch.constant.none
    %2663 = torch.aten.mean.dim %2661, %2662, %true_3679, %none_3680 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_3681 = torch.constant.float 9.9999999999999995E-7
    %int1_3682 = torch.constant.int 1
    %2664 = torch.aten.add.Scalar %2663, %float9.999990e-07_3681, %int1_3682 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %2665 = torch.aten.rsqrt %2664 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %2666 = torch.aten.mul.Tensor %2660, %2665 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Back to f16, then multiply by the [128] scale parameter.
    %int5_3683 = torch.constant.int 5
    %2667 = torch.prims.convert_element_type %2666, %int5_3683 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.7.img_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.7.img_attn.norm.key_norm.scale : tensor<128xf16>
    %2668 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.img_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %2669 = torch.aten.mul.Tensor %2667, %2668 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // f16 -> f16 conversions of the two scaled results (%2654 from slot 0, %2669
    // from slot 1); input and output element types are identical. The outputs
    // %2670 / %2671 are the values consumed by the later concatenations.
    %int5_3684 = torch.constant.int 5
    %2670 = torch.prims.convert_element_type %2654, %int5_3684 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_3685 = torch.constant.int 5
    %2671 = torch.prims.convert_element_type %2669, %int5_3685 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Normalise %2570 ([1,512,3072] f16) over its last dimension with no learned
    // scale/bias, then apply (1 + %2608) * normalised + %2607 and flatten to
    // [512,3072].
    // Statistics are computed in f32: variance and mean over dim 2 with
    // correction 0 and keepdim = true, giving two [1,512,1] tensors.
    %int6_3686 = torch.constant.int 6
    %2672 = torch.prims.convert_element_type %2570, %int6_3686 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %int2_3687 = torch.constant.int 2
    %2673 = torch.prim.ListConstruct %int2_3687 : (!torch.int) -> !torch.list<int>
    %int0_3688 = torch.constant.int 0
    %true_3689 = torch.constant.bool true
    %result0_3690, %result1_3691 = torch.aten.var_mean.correction %2672, %2673, %int0_3688, %true_3689 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(variance + 9.9999999999999995E-7).
    %float9.999990e-07_3692 = torch.constant.float 9.9999999999999995E-7
    %int1_3693 = torch.constant.int 1
    %2674 = torch.aten.add.Scalar %result0_3690, %float9.999990e-07_3692, %int1_3693 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %2675 = torch.aten.rsqrt %2674 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // The subtraction takes the original f16 %2570 (not the f32 copy %2672) and
    // the f32 mean; the result type is f32.
    %int1_3694 = torch.constant.int 1
    %2676 = torch.aten.sub.Tensor %2570, %result1_3691, %int1_3694 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %2677 = torch.aten.mul.Tensor %2676, %2675 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_3695 = torch.constant.int 5
    %2678 = torch.prims.convert_element_type %2677, %int5_3695 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Modulation in f16: (%2608 + 1) * %2678 + %2607, with the [1,1,3072]
    // operands broadcast against [1,512,3072].
    %int1_3696 = torch.constant.int 1
    %int1_3697 = torch.constant.int 1
    %2679 = torch.aten.add.Scalar %2608, %int1_3696, %int1_3697 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %2680 = torch.aten.mul.Tensor %2679, %2678 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_3698 = torch.constant.int 1
    %2681 = torch.aten.add.Tensor %2680, %2607, %int1_3698 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Drop the leading unit dimension: [1,512,3072] -> [512,3072].
    %int512_3699 = torch.constant.int 512
    %int3072_3700 = torch.constant.int 3072
    %2682 = torch.prim.ListConstruct %int512_3699, %int3072_3700 : (!torch.int, !torch.int) -> !torch.list<int>
    %2683 = torch.aten.view %2681, %2682 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Linear layer using the double_blocks.7.txt_attn.qkv parameters:
    // [512,3072] x [3072,9216] + bias, computed in f32 and stored as f16, then
    // reshaped to [1,512,9216] and emitted through a trace point named "txt_qkv".
    %__auto.sampler.double_blocks.7.txt_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.7.txt_attn.qkv.weight : tensor<9216x3072xf16>
    %2684 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.txt_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    // Transpose the [9216,3072] weight to [3072,9216] for use as the matmul RHS.
    %int0_3701 = torch.constant.int 0
    %int1_3702 = torch.constant.int 1
    %2685 = torch.aten.transpose.int %2684, %int0_3701, %int1_3702 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.7.txt_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.7.txt_attn.qkv.bias : tensor<9216xf16>
    %2686 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.txt_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Convert bias, input and weight to f32 (dtype code 6) before the matmul.
    %int6_3703 = torch.constant.int 6
    %2687 = torch.prims.convert_element_type %2686, %int6_3703 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_3704 = torch.constant.int 6
    %2688 = torch.prims.convert_element_type %2683, %int6_3704 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_3705 = torch.constant.int 6
    %2689 = torch.prims.convert_element_type %2685, %int6_3705 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %2690 = torch.aten.mm %2688, %2689 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[512,9216],f32>
    // Product and bias are each multiplied by the integer 1, summed (alpha = 1),
    // and the result is converted to f16 (dtype code 5).
    %int1_3706 = torch.constant.int 1
    %2691 = torch.aten.mul.Scalar %2690, %int1_3706 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int1_3707 = torch.constant.int 1
    %2692 = torch.aten.mul.Scalar %2687, %int1_3707 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_3708 = torch.constant.int 1
    %2693 = torch.aten.add.Tensor %2691, %2692, %int1_3708 : !torch.vtensor<[512,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int5_3709 = torch.constant.int 5
    %2694 = torch.prims.convert_element_type %2693, %int5_3709 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f16>
    // Restore the leading unit dimension: [512,9216] -> [1,512,9216].
    %int1_3710 = torch.constant.int 1
    %int512_3711 = torch.constant.int 512
    %int9216_3712 = torch.constant.int 9216
    %2695 = torch.prim.ListConstruct %int1_3710, %int512_3711, %int9216_3712 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2696 = torch.aten.view %2694, %2695 : !torch.vtensor<[512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,9216],f16>
    // Trace point: convert to a builtin tensor, cast to a fully dynamic shape,
    // query the three dims, and pass the tensor to flow.tensor.trace under the
    // key "txt_qkv". The tensor is then cast back to its static shape and
    // re-wrapped as a torch value (%2698), which is what later ops consume.
    %2697 = torch_c.to_builtin_tensor %2696 : !torch.vtensor<[1,512,9216],f16> -> tensor<1x512x9216xf16>
    %cast_3713 = tensor.cast %2697 : tensor<1x512x9216xf16> to tensor<?x?x?xf16>
    %c0_3714 = arith.constant 0 : index
    %dim_3715 = tensor.dim %cast_3713, %c0_3714 : tensor<?x?x?xf16>
    %c1_3716 = arith.constant 1 : index
    %dim_3717 = tensor.dim %cast_3713, %c1_3716 : tensor<?x?x?xf16>
    %c2_3718 = arith.constant 2 : index
    %dim_3719 = tensor.dim %cast_3713, %c2_3718 : tensor<?x?x?xf16>
    flow.tensor.trace "txt_qkv" = [%cast_3713 : tensor<?x?x?xf16>{%dim_3715, %dim_3717, %dim_3719}]
    %cast_3720 = tensor.cast %cast_3713 : tensor<?x?x?xf16> to tensor<1x512x9216xf16>
    %2698 = torch_c.from_builtin_tensor %cast_3720 : tensor<1x512x9216xf16> -> !torch.vtensor<[1,512,9216],f16>
    // Take slot 0 of the packed projection %2698 and RMS-normalise it over the
    // last (128-wide) dimension, then scale by txt_attn.norm.query_norm.scale.
    // NOTE(review): slot 0 is presumably the attention query, based on the
    // "query_norm" parameter name -- confirm against the model definition.
    // View [1,512,9216] as [1,512,3,24,128] (3 * 24 * 128 = 9216).
    %int1_3721 = torch.constant.int 1
    %int512_3722 = torch.constant.int 512
    %int3_3723 = torch.constant.int 3
    %int24_3724 = torch.constant.int 24
    %int128_3725 = torch.constant.int 128
    %2699 = torch.prim.ListConstruct %int1_3721, %int512_3722, %int3_3723, %int24_3724, %int128_3725 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2700 = torch.aten.view %2698, %2699 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    // Permute with order (2, 0, 3, 1, 4): [1,512,3,24,128] -> [3,1,24,512,128].
    %int2_3726 = torch.constant.int 2
    %int0_3727 = torch.constant.int 0
    %int3_3728 = torch.constant.int 3
    %int1_3729 = torch.constant.int 1
    %int4_3730 = torch.constant.int 4
    %2701 = torch.prim.ListConstruct %int2_3726, %int0_3727, %int3_3728, %int1_3729, %int4_3730 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2702 = torch.aten.permute %2700, %2701 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // Select index 0 along dim 0 -> [1,24,512,128].
    %int0_3731 = torch.constant.int 0
    %int0_3732 = torch.constant.int 0
    %2703 = torch.aten.select.int %2702, %int0_3731, %int0_3732 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // RMS normalisation in f32: x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps)
    // with eps = 9.9999999999999995E-7.
    %int6_3733 = torch.constant.int 6
    %2704 = torch.prims.convert_element_type %2703, %int6_3733 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_3734 = torch.constant.int 2
    %2705 = torch.aten.pow.Tensor_Scalar %2704, %int2_3734 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_3735 = torch.constant.int -1
    %2706 = torch.prim.ListConstruct %int-1_3735 : (!torch.int) -> !torch.list<int>
    %true_3736 = torch.constant.bool true
    %none_3737 = torch.constant.none
    %2707 = torch.aten.mean.dim %2705, %2706, %true_3736, %none_3737 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_3738 = torch.constant.float 9.9999999999999995E-7
    %int1_3739 = torch.constant.int 1
    %2708 = torch.aten.add.Scalar %2707, %float9.999990e-07_3738, %int1_3739 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %2709 = torch.aten.rsqrt %2708 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %2710 = torch.aten.mul.Tensor %2704, %2709 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    // Back to f16, then multiply by the [128] scale parameter (broadcast over the
    // leading dimensions).
    %int5_3740 = torch.constant.int 5
    %2711 = torch.prims.convert_element_type %2710, %int5_3740 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.7.txt_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.7.txt_attn.norm.query_norm.scale : tensor<128xf16>
    %2712 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.txt_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %2713 = torch.aten.mul.Tensor %2711, %2712 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    %int1_3741 = torch.constant.int 1
    %int512_3742 = torch.constant.int 512
    %int3_3743 = torch.constant.int 3
    %int24_3744 = torch.constant.int 24
    %int128_3745 = torch.constant.int 128
    %2714 = torch.prim.ListConstruct %int1_3741, %int512_3742, %int3_3743, %int24_3744, %int128_3745 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2715 = torch.aten.view %2698, %2714 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_3746 = torch.constant.int 2
    %int0_3747 = torch.constant.int 0
    %int3_3748 = torch.constant.int 3
    %int1_3749 = torch.constant.int 1
    %int4_3750 = torch.constant.int 4
    %2716 = torch.prim.ListConstruct %int2_3746, %int0_3747, %int3_3748, %int1_3749, %int4_3750 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2717 = torch.aten.permute %2715, %2716 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // --- double_blocks.7 / txt_attn: RMS normalization of the text-stream slice at index 1 ---
    // Take index 1 along dim 0 of the permuted [3,1,24,512,128] tensor.
    // NOTE(review): presumably this is the K part of a packed q/k/v projection
    // (the key_norm scale is applied below) — confirm against the producer of %2715.
    %int0_3751 = torch.constant.int 0
    %int1_3752 = torch.constant.int 1
    %2718 = torch.aten.select.int %2717, %int0_3751, %int1_3752 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Upcast to f32 (dtype code 6) so the statistics below are computed in f32.
    %int6_3753 = torch.constant.int 6
    %2719 = torch.prims.convert_element_type %2718, %int6_3753 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    // mean(x^2) over the last dimension (size 128), keepdim = true.
    %int2_3754 = torch.constant.int 2
    %2720 = torch.aten.pow.Tensor_Scalar %2719, %int2_3754 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_3755 = torch.constant.int -1
    %2721 = torch.prim.ListConstruct %int-1_3755 : (!torch.int) -> !torch.list<int>
    %true_3756 = torch.constant.bool true
    %none_3757 = torch.constant.none
    %2722 = torch.aten.mean.dim %2720, %2721, %true_3756, %none_3757 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    // x * rsqrt(mean(x^2) + eps), with eps ~= 1e-6.
    %float9.999990e-07_3758 = torch.constant.float 9.9999999999999995E-7
    %int1_3759 = torch.constant.int 1
    %2723 = torch.aten.add.Scalar %2722, %float9.999990e-07_3758, %int1_3759 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %2724 = torch.aten.rsqrt %2723 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %2725 = torch.aten.mul.Tensor %2719, %2724 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    // Back to f16 (dtype code 5), then apply the learned per-channel scale [128],
    // broadcast over the leading dimensions.
    %int5_3760 = torch.constant.int 5
    %2726 = torch.prims.convert_element_type %2725, %int5_3760 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.7.txt_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.7.txt_attn.norm.key_norm.scale : tensor<128xf16>
    %2727 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.txt_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %2728 = torch.aten.mul.Tensor %2726, %2727 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // --- double_blocks.7: build the joint attention operands by concatenating along dim 2 ---
    // Both converts below are f16 -> f16 (dtype code 5); the types show no element-type change.
    // %2713 is defined outside this chunk — presumably the normalized text-stream query; confirm.
    %int5_3761 = torch.constant.int 5
    %2729 = torch.prims.convert_element_type %2713, %int5_3761 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int5_3762 = torch.constant.int 5
    %2730 = torch.prims.convert_element_type %2728, %int5_3762 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // 512-row tensor first, then the 4096-row tensor: 512 + 4096 = 4608 rows on dim 2.
    // %2670 / %2671 come from outside this chunk — presumably the image-stream q and k; confirm.
    %2731 = torch.prim.ListConstruct %2729, %2670 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_3763 = torch.constant.int 2
    %2732 = torch.aten.cat %2731, %int2_3763 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %2733 = torch.prim.ListConstruct %2730, %2671 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_3764 = torch.constant.int 2
    %2734 = torch.aten.cat %2733, %int2_3764 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // --- double_blocks.7: extract slice 2 from both 9216-wide tensors and concatenate ---
    // 512-row tensor: [1,512,9216] -> view [1,512,3,24,128] -> permute (2,0,3,1,4)
    // -> [3,1,24,512,128] -> select index 2 on dim 0.
    // NOTE(review): index 2 is presumably the V part of a packed q/k/v projection — confirm
    // against the producer of %2698.
    %int1_3765 = torch.constant.int 1
    %int512_3766 = torch.constant.int 512
    %int3_3767 = torch.constant.int 3
    %int24_3768 = torch.constant.int 24
    %int128_3769 = torch.constant.int 128
    %2735 = torch.prim.ListConstruct %int1_3765, %int512_3766, %int3_3767, %int24_3768, %int128_3769 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2736 = torch.aten.view %2698, %2735 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_3770 = torch.constant.int 2
    %int0_3771 = torch.constant.int 0
    %int3_3772 = torch.constant.int 3
    %int1_3773 = torch.constant.int 1
    %int4_3774 = torch.constant.int 4
    %2737 = torch.prim.ListConstruct %int2_3770, %int0_3771, %int3_3772, %int1_3773, %int4_3774 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2738 = torch.aten.permute %2736, %2737 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    %int0_3775 = torch.constant.int 0
    %int2_3776 = torch.constant.int 2
    %2739 = torch.aten.select.int %2738, %int0_3775, %int2_3776 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // 4096-row tensor: same view / permute / select sequence applied to %2639.
    %int1_3777 = torch.constant.int 1
    %int4096_3778 = torch.constant.int 4096
    %int3_3779 = torch.constant.int 3
    %int24_3780 = torch.constant.int 24
    %int128_3781 = torch.constant.int 128
    %2740 = torch.prim.ListConstruct %int1_3777, %int4096_3778, %int3_3779, %int24_3780, %int128_3781 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2741 = torch.aten.view %2639, %2740 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_3782 = torch.constant.int 2
    %int0_3783 = torch.constant.int 0
    %int3_3784 = torch.constant.int 3
    %int1_3785 = torch.constant.int 1
    %int4_3786 = torch.constant.int 4
    %2742 = torch.prim.ListConstruct %int2_3782, %int0_3783, %int3_3784, %int1_3785, %int4_3786 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2743 = torch.aten.permute %2741, %2742 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    %int0_3787 = torch.constant.int 0
    %int2_3788 = torch.constant.int 2
    %2744 = torch.aten.select.int %2743, %int0_3787, %int2_3788 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Concatenate along dim 2 in the order (512 rows, 4096 rows) -> [1,24,4608,128].
    %2745 = torch.prim.ListConstruct %2739, %2744 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_3789 = torch.constant.int 2
    %2746 = torch.aten.cat %2745, %int2_3789 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // --- Debug tracing of the three concatenated attention operands ---
    // Each operand goes through the same sequence: leave the torch type system
    // (torch_c.to_builtin_tensor), cast to a fully dynamic shape, query all four dims,
    // emit a flow.tensor.trace under a string key, cast back to the static shape, and
    // re-enter the torch type system. The values that continue downstream are the
    // re-wrapped ones (%2748, %2750, %2752).

    // Trace %2732 under key "q".
    %2747 = torch_c.to_builtin_tensor %2732 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_3790 = tensor.cast %2747 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_3791 = arith.constant 0 : index
    %dim_3792 = tensor.dim %cast_3790, %c0_3791 : tensor<?x?x?x?xf16>
    %c1_3793 = arith.constant 1 : index
    %dim_3794 = tensor.dim %cast_3790, %c1_3793 : tensor<?x?x?x?xf16>
    %c2_3795 = arith.constant 2 : index
    %dim_3796 = tensor.dim %cast_3790, %c2_3795 : tensor<?x?x?x?xf16>
    %c3_3797 = arith.constant 3 : index
    %dim_3798 = tensor.dim %cast_3790, %c3_3797 : tensor<?x?x?x?xf16>
    flow.tensor.trace "q" = [%cast_3790 : tensor<?x?x?x?xf16>{%dim_3792, %dim_3794, %dim_3796, %dim_3798}]
    %cast_3799 = tensor.cast %cast_3790 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %2748 = torch_c.from_builtin_tensor %cast_3799 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Trace %2734 under key "k".
    %2749 = torch_c.to_builtin_tensor %2734 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_3800 = tensor.cast %2749 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_3801 = arith.constant 0 : index
    %dim_3802 = tensor.dim %cast_3800, %c0_3801 : tensor<?x?x?x?xf16>
    %c1_3803 = arith.constant 1 : index
    %dim_3804 = tensor.dim %cast_3800, %c1_3803 : tensor<?x?x?x?xf16>
    %c2_3805 = arith.constant 2 : index
    %dim_3806 = tensor.dim %cast_3800, %c2_3805 : tensor<?x?x?x?xf16>
    %c3_3807 = arith.constant 3 : index
    %dim_3808 = tensor.dim %cast_3800, %c3_3807 : tensor<?x?x?x?xf16>
    flow.tensor.trace "k" = [%cast_3800 : tensor<?x?x?x?xf16>{%dim_3802, %dim_3804, %dim_3806, %dim_3808}]
    %cast_3809 = tensor.cast %cast_3800 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %2750 = torch_c.from_builtin_tensor %cast_3809 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Trace %2746 under key "v".
    %2751 = torch_c.to_builtin_tensor %2746 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_3810 = tensor.cast %2751 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_3811 = arith.constant 0 : index
    %dim_3812 = tensor.dim %cast_3810, %c0_3811 : tensor<?x?x?x?xf16>
    %c1_3813 = arith.constant 1 : index
    %dim_3814 = tensor.dim %cast_3810, %c1_3813 : tensor<?x?x?x?xf16>
    %c2_3815 = arith.constant 2 : index
    %dim_3816 = tensor.dim %cast_3810, %c2_3815 : tensor<?x?x?x?xf16>
    %c3_3817 = arith.constant 3 : index
    %dim_3818 = tensor.dim %cast_3810, %c3_3817 : tensor<?x?x?x?xf16>
    flow.tensor.trace "v" = [%cast_3810 : tensor<?x?x?x?xf16>{%dim_3812, %dim_3814, %dim_3816, %dim_3818}]
    %cast_3819 = tensor.cast %cast_3810 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %2752 = torch_c.from_builtin_tensor %cast_3819 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // --- Pairwise rotation of the traced q (%2748) and k (%2750) using %211 ---
    // For each operand x (viewed as [1,24,4608,64,1,2]) the result is
    //   %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]
    // where the selects are on the last dim (dim 5). The [..,64,1] slices of x broadcast
    // against the [1,1,4608,64,2] slices of %211, giving [1,24,4608,64,2].
    // NOTE(review): this matches a rotary-position-embedding application; %211 is defined
    // outside this chunk — presumably the precomputed rotation coefficients. Confirm.

    // Upcast q to f32 (dtype code 6) and split the last dim 128 into (64, 1, 2).
    %int6_3820 = torch.constant.int 6
    %2753 = torch.prims.convert_element_type %2748, %int6_3820 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_3821 = torch.constant.int 1
    %int24_3822 = torch.constant.int 24
    %int4608_3823 = torch.constant.int 4608
    %int-1_3824 = torch.constant.int -1
    %int1_3825 = torch.constant.int 1
    %int2_3826 = torch.constant.int 2
    %2754 = torch.prim.ListConstruct %int1_3821, %int24_3822, %int4608_3823, %int-1_3824, %int1_3825, %int2_3826 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2755 = torch.aten.view %2753, %2754 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Same upcast and view for k.
    %int6_3827 = torch.constant.int 6
    %2756 = torch.prims.convert_element_type %2750, %int6_3827 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_3828 = torch.constant.int 1
    %int24_3829 = torch.constant.int 24
    %int4608_3830 = torch.constant.int 4608
    %int-1_3831 = torch.constant.int -1
    %int1_3832 = torch.constant.int 1
    %int2_3833 = torch.constant.int 2
    %2757 = torch.prim.ListConstruct %int1_3828, %int24_3829, %int4608_3830, %int-1_3831, %int1_3832, %int2_3833 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2758 = torch.aten.view %2756, %2757 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // q: first term, %211[..., 0] * q[..., 0].
    %int5_3834 = torch.constant.int 5
    %int0_3835 = torch.constant.int 0
    %2759 = torch.aten.select.int %211, %int5_3834, %int0_3835 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_3836 = torch.constant.int 5
    %int0_3837 = torch.constant.int 0
    %2760 = torch.aten.select.int %2755, %int5_3836, %int0_3837 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %2761 = torch.aten.mul.Tensor %2759, %2760 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // q: second term, %211[..., 1] * q[..., 1], then the sum of both terms.
    %int5_3838 = torch.constant.int 5
    %int1_3839 = torch.constant.int 1
    %2762 = torch.aten.select.int %211, %int5_3838, %int1_3839 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_3840 = torch.constant.int 5
    %int1_3841 = torch.constant.int 1
    %2763 = torch.aten.select.int %2755, %int5_3840, %int1_3841 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %2764 = torch.aten.mul.Tensor %2762, %2763 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_3842 = torch.constant.int 1
    %2765 = torch.aten.add.Tensor %2761, %2764, %int1_3842 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k: first term, %211[..., 0] * k[..., 0].
    %int5_3843 = torch.constant.int 5
    %int0_3844 = torch.constant.int 0
    %2766 = torch.aten.select.int %211, %int5_3843, %int0_3844 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_3845 = torch.constant.int 5
    %int0_3846 = torch.constant.int 0
    %2767 = torch.aten.select.int %2758, %int5_3845, %int0_3846 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %2768 = torch.aten.mul.Tensor %2766, %2767 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k: second term, %211[..., 1] * k[..., 1], then the sum of both terms.
    %int5_3847 = torch.constant.int 5
    %int1_3848 = torch.constant.int 1
    %2769 = torch.aten.select.int %211, %int5_3847, %int1_3848 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_3849 = torch.constant.int 5
    %int1_3850 = torch.constant.int 1
    %2770 = torch.aten.select.int %2758, %int5_3849, %int1_3850 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %2771 = torch.aten.mul.Tensor %2769, %2770 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_3851 = torch.constant.int 1
    %2772 = torch.aten.add.Tensor %2768, %2771, %int1_3851 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Rotated q: merge (64, 2) back into 128 and downcast to f16 (dtype code 5).
    %int1_3852 = torch.constant.int 1
    %int24_3853 = torch.constant.int 24
    %int4608_3854 = torch.constant.int 4608
    %int128_3855 = torch.constant.int 128
    %2773 = torch.prim.ListConstruct %int1_3852, %int24_3853, %int4608_3854, %int128_3855 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2774 = torch.aten.view %2765, %2773 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_3856 = torch.constant.int 5
    %2775 = torch.prims.convert_element_type %2774, %int5_3856 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Rotated k: same merge and downcast.
    %int1_3857 = torch.constant.int 1
    %int24_3858 = torch.constant.int 24
    %int4608_3859 = torch.constant.int 4608
    %int128_3860 = torch.constant.int 128
    %2776 = torch.prim.ListConstruct %int1_3857, %int24_3858, %int4608_3859, %int128_3860 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2777 = torch.aten.view %2772, %2776 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_3861 = torch.constant.int 5
    %2778 = torch.prims.convert_element_type %2777, %int5_3861 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // --- Joint scaled dot-product attention over all 4608 rows, then head merge ---
    // Operands: rotated q (%2775), rotated k (%2778), traced v (%2752), followed by
    // 0.0, false, none, none.
    // NOTE(review): per the aten schema these trailing operands should be dropout_p,
    // is_causal, attn_mask and scale (none => default scaling) — confirm against the
    // PyTorch op schema.
    %float0.000000e00_3862 = torch.constant.float 0.000000e+00
    %false_3863 = torch.constant.bool false
    %none_3864 = torch.constant.none
    %none_3865 = torch.constant.none
    // Result #0 is the attention output; result #1 ([1,24,4608], f32) is not used in this chunk.
    %2779:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%2775, %2778, %2752, %float0.000000e00_3862, %false_3863, %none_3864, %none_3865) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2: [1,24,4608,128] -> [1,4608,24,128].
    %int0_3866 = torch.constant.int 0
    %int2_3867 = torch.constant.int 2
    %int1_3868 = torch.constant.int 1
    %int3_3869 = torch.constant.int 3
    %2780 = torch.prim.ListConstruct %int0_3866, %int2_3867, %int1_3868, %int3_3869 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2781 = torch.aten.permute %2779#0, %2780 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the last two dims: 24 * 128 = 3072.
    %int1_3870 = torch.constant.int 1
    %int4608_3871 = torch.constant.int 4608
    %int3072_3872 = torch.constant.int 3072
    %2782 = torch.prim.ListConstruct %int1_3870, %int4608_3871, %int3072_3872 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2783 = torch.aten.view %2781, %2782 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // --- Split the joint attention output along dim 1 into its 512-row and 4096-row parts ---
    // This mirrors the concatenation order used for q/k/v (512 rows first, then 4096 rows).
    // The dim-0 slices with bounds [0, INT64_MAX) step 1 keep the full extent; the types
    // show the shape is unchanged.

    // Rows [0, 512) -> %2785 ([1,512,3072]).
    %int0_3873 = torch.constant.int 0
    %int0_3874 = torch.constant.int 0
    %int9223372036854775807_3875 = torch.constant.int 9223372036854775807
    %int1_3876 = torch.constant.int 1
    %2784 = torch.aten.slice.Tensor %2783, %int0_3873, %int0_3874, %int9223372036854775807_3875, %int1_3876 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_3877 = torch.constant.int 1
    %int0_3878 = torch.constant.int 0
    %int512_3879 = torch.constant.int 512
    %int1_3880 = torch.constant.int 1
    %2785 = torch.aten.slice.Tensor %2784, %int1_3877, %int0_3878, %int512_3879, %int1_3880 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Rows [512, end) -> %2787 ([1,4096,3072]).
    %int0_3881 = torch.constant.int 0
    %int0_3882 = torch.constant.int 0
    %int9223372036854775807_3883 = torch.constant.int 9223372036854775807
    %int1_3884 = torch.constant.int 1
    %2786 = torch.aten.slice.Tensor %2783, %int0_3881, %int0_3882, %int9223372036854775807_3883, %int1_3884 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_3885 = torch.constant.int 1
    %int512_3886 = torch.constant.int 512
    %int9223372036854775807_3887 = torch.constant.int 9223372036854775807
    %int1_3888 = torch.constant.int 1
    %2787 = torch.aten.slice.Tensor %2786, %int1_3885, %int512_3886, %int9223372036854775807_3887, %int1_3888 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // --- double_blocks.7 / img_attn.proj: linear projection, gate, residual add ---
    // Flatten the 4096-row attention output to 2-D for the matmul.
    %int4096_3889 = torch.constant.int 4096
    %int3072_3890 = torch.constant.int 3072
    %2788 = torch.prim.ListConstruct %int4096_3889, %int3072_3890 : (!torch.int, !torch.int) -> !torch.list<int>
    %2789 = torch.aten.view %2787, %2788 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Load the weight [3072,3072] and transpose it so the product is x @ W^T.
    %__auto.sampler.double_blocks.7.img_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.7.img_attn.proj.weight : tensor<3072x3072xf16>
    %2790 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.img_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_3891 = torch.constant.int 0
    %int1_3892 = torch.constant.int 1
    %2791 = torch.aten.transpose.int %2790, %int0_3891, %int1_3892 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.7.img_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.7.img_attn.proj.bias : tensor<3072xf16>
    %2792 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.img_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Bias, input and weight are all upcast to f32 (dtype code 6); the matmul and bias
    // add run in f32.
    %int6_3893 = torch.constant.int 6
    %2793 = torch.prims.convert_element_type %2792, %int6_3893 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_3894 = torch.constant.int 6
    %2794 = torch.prims.convert_element_type %2789, %int6_3894 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_3895 = torch.constant.int 6
    %2795 = torch.prims.convert_element_type %2791, %int6_3895 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %2796 = torch.aten.mm %2794, %2795 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Both the product and the bias are multiplied by the integer constant 1 before the add.
    %int1_3896 = torch.constant.int 1
    %2797 = torch.aten.mul.Scalar %2796, %int1_3896 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_3897 = torch.constant.int 1
    %2798 = torch.aten.mul.Scalar %2793, %int1_3897 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_3898 = torch.constant.int 1
    %2799 = torch.aten.add.Tensor %2797, %2798, %int1_3898 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    // Downcast to f16 (dtype code 5) and restore the leading batch dim.
    %int5_3899 = torch.constant.int 5
    %2800 = torch.prims.convert_element_type %2799, %int5_3899 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_3900 = torch.constant.int 1
    %int4096_3901 = torch.constant.int 4096
    %int3072_3902 = torch.constant.int 3072
    %2801 = torch.prim.ListConstruct %int1_3900, %int4096_3901, %int3072_3902 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2802 = torch.aten.view %2800, %2801 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Multiply by the broadcast [1,1,3072] tensor %2588 and add onto %2510.
    // NOTE(review): %2588 and %2510 are defined outside this chunk — presumably the
    // modulation gate and the image-stream residual input; confirm.
    %2803 = torch.aten.mul.Tensor %2588, %2802 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_3903 = torch.constant.int 1
    %2804 = torch.aten.add.Tensor %2510, %2803, %int1_3903 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // --- Image stream: normalization over the last dim, then scale/shift ---
    // Computes (1 + %2590) * normalize(%2804) + %2589, where normalize subtracts the mean
    // and multiplies by rsqrt(var + eps) over dim 2, with no learned affine parameters.
    // NOTE(review): %2590 and %2589 are defined outside this chunk — presumably the
    // modulation scale and shift; confirm.
    %int1_3904 = torch.constant.int 1
    %int1_3905 = torch.constant.int 1
    %2805 = torch.aten.add.Scalar %2590, %int1_3904, %int1_3905 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Statistics are computed in f32 (dtype code 6).
    %int6_3906 = torch.constant.int 6
    %2806 = torch.prims.convert_element_type %2804, %int6_3906 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // var_mean over dim 2 with correction = 0 and keepdim = true;
    // result0 is the variance, result1 is the mean.
    %int2_3907 = torch.constant.int 2
    %2807 = torch.prim.ListConstruct %int2_3907 : (!torch.int) -> !torch.list<int>
    %int0_3908 = torch.constant.int 0
    %true_3909 = torch.constant.bool true
    %result0_3910, %result1_3911 = torch.aten.var_mean.correction %2806, %2807, %int0_3908, %true_3909 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(var + eps), with eps ~= 1e-6.
    %float9.999990e-07_3912 = torch.constant.float 9.9999999999999995E-7
    %int1_3913 = torch.constant.int 1
    %2808 = torch.aten.add.Scalar %result0_3910, %float9.999990e-07_3912, %int1_3913 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %2809 = torch.aten.rsqrt %2808 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // The subtraction takes the f16 tensor %2804 and the f32 mean; the result type is f32.
    %int1_3914 = torch.constant.int 1
    %2810 = torch.aten.sub.Tensor %2804, %result1_3911, %int1_3914 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %2811 = torch.aten.mul.Tensor %2810, %2809 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Downcast to f16 (dtype code 5), then apply the scale and the shift.
    %int5_3915 = torch.constant.int 5
    %2812 = torch.prims.convert_element_type %2811, %int5_3915 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %2813 = torch.aten.mul.Tensor %2805, %2812 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_3916 = torch.constant.int 1
    %2814 = torch.aten.add.Tensor %2813, %2589, %int1_3916 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // --- double_blocks.7 / img_mlp.0: linear 3072 -> 12288 followed by tanh-approximated GELU ---
    // Flatten to 2-D for the matmul.
    %int4096_3917 = torch.constant.int 4096
    %int3072_3918 = torch.constant.int 3072
    %2815 = torch.prim.ListConstruct %int4096_3917, %int3072_3918 : (!torch.int, !torch.int) -> !torch.list<int>
    %2816 = torch.aten.view %2814, %2815 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Load the weight [12288,3072] and transpose it to [3072,12288] so the product is x @ W^T.
    %__auto.sampler.double_blocks.7.img_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.7.img_mlp.0.weight : tensor<12288x3072xf16>
    %2817 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.img_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_3919 = torch.constant.int 0
    %int1_3920 = torch.constant.int 1
    %2818 = torch.aten.transpose.int %2817, %int0_3919, %int1_3920 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.7.img_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.7.img_mlp.0.bias : tensor<12288xf16>
    %2819 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.img_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Bias, input and weight are upcast to f32 (dtype code 6); the matmul and bias add run in f32.
    %int6_3921 = torch.constant.int 6
    %2820 = torch.prims.convert_element_type %2819, %int6_3921 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_3922 = torch.constant.int 6
    %2821 = torch.prims.convert_element_type %2816, %int6_3922 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_3923 = torch.constant.int 6
    %2822 = torch.prims.convert_element_type %2818, %int6_3923 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %2823 = torch.aten.mm %2821, %2822 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[4096,12288],f32>
    // Both the product and the bias are multiplied by the integer constant 1 before the add.
    %int1_3924 = torch.constant.int 1
    %2824 = torch.aten.mul.Scalar %2823, %int1_3924 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int1_3925 = torch.constant.int 1
    %2825 = torch.aten.mul.Scalar %2820, %int1_3925 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_3926 = torch.constant.int 1
    %2826 = torch.aten.add.Tensor %2824, %2825, %int1_3926 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    // Downcast to f16 (dtype code 5) and restore the leading batch dim.
    %int5_3927 = torch.constant.int 5
    %2827 = torch.prims.convert_element_type %2826, %int5_3927 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f16>
    %int1_3928 = torch.constant.int 1
    %int4096_3929 = torch.constant.int 4096
    %int12288_3930 = torch.constant.int 12288
    %2828 = torch.prim.ListConstruct %int1_3928, %int4096_3929, %int12288_3930 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2829 = torch.aten.view %2827, %2828 : !torch.vtensor<[4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,12288],f16>
    // GELU with approximate = "tanh", applied in f16.
    %str_3931 = torch.constant.str "tanh"
    %2830 = torch.aten.gelu %2829, %str_3931 : !torch.vtensor<[1,4096,12288],f16>, !torch.str -> !torch.vtensor<[1,4096,12288],f16>
    // --- double_blocks.7 / img_mlp.2: linear 12288 -> 3072, gate, residual add ---
    // Flatten the activation to 2-D for the matmul.
    %int4096_3932 = torch.constant.int 4096
    %int12288_3933 = torch.constant.int 12288
    %2831 = torch.prim.ListConstruct %int4096_3932, %int12288_3933 : (!torch.int, !torch.int) -> !torch.list<int>
    %2832 = torch.aten.view %2830, %2831 : !torch.vtensor<[1,4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[4096,12288],f16>
    // Load the weight [3072,12288] and transpose it to [12288,3072] so the product is x @ W^T.
    %__auto.sampler.double_blocks.7.img_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.7.img_mlp.2.weight : tensor<3072x12288xf16>
    %2833 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.img_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_3934 = torch.constant.int 0
    %int1_3935 = torch.constant.int 1
    %2834 = torch.aten.transpose.int %2833, %int0_3934, %int1_3935 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.7.img_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.7.img_mlp.2.bias : tensor<3072xf16>
    %2835 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.img_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Bias, input and weight are upcast to f32 (dtype code 6); the matmul and bias add run in f32.
    %int6_3936 = torch.constant.int 6
    %2836 = torch.prims.convert_element_type %2835, %int6_3936 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_3937 = torch.constant.int 6
    %2837 = torch.prims.convert_element_type %2832, %int6_3937 : !torch.vtensor<[4096,12288],f16>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int6_3938 = torch.constant.int 6
    %2838 = torch.prims.convert_element_type %2834, %int6_3938 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %2839 = torch.aten.mm %2837, %2838 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Both the product and the bias are multiplied by the integer constant 1 before the add.
    %int1_3939 = torch.constant.int 1
    %2840 = torch.aten.mul.Scalar %2839, %int1_3939 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_3940 = torch.constant.int 1
    %2841 = torch.aten.mul.Scalar %2836, %int1_3940 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_3941 = torch.constant.int 1
    %2842 = torch.aten.add.Tensor %2840, %2841, %int1_3941 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    // Downcast to f16 (dtype code 5) and restore the leading batch dim.
    %int5_3942 = torch.constant.int 5
    %2843 = torch.prims.convert_element_type %2842, %int5_3942 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_3943 = torch.constant.int 1
    %int4096_3944 = torch.constant.int 4096
    %int3072_3945 = torch.constant.int 3072
    %2844 = torch.prim.ListConstruct %int1_3943, %int4096_3944, %int3072_3945 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2845 = torch.aten.view %2843, %2844 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Multiply by the broadcast [1,1,3072] tensor %2591 and add onto %2804 (the tensor
    // that was normalized before the MLP), giving %2847.
    // NOTE(review): %2591 is defined outside this chunk — presumably the MLP modulation
    // gate; confirm.
    %2846 = torch.aten.mul.Tensor %2591, %2845 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_3946 = torch.constant.int 1
    %2847 = torch.aten.add.Tensor %2804, %2846, %int1_3946 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // --- double_blocks.7 / txt_attn.proj: linear projection, gate, residual add ---
    // Flatten the 512-row attention output (%2785) to 2-D for the matmul.
    %int512_3947 = torch.constant.int 512
    %int3072_3948 = torch.constant.int 3072
    %2848 = torch.prim.ListConstruct %int512_3947, %int3072_3948 : (!torch.int, !torch.int) -> !torch.list<int>
    %2849 = torch.aten.view %2785, %2848 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Load the weight [3072,3072] and transpose it so the product is x @ W^T.
    %__auto.sampler.double_blocks.7.txt_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.7.txt_attn.proj.weight : tensor<3072x3072xf16>
    %2850 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.txt_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_3949 = torch.constant.int 0
    %int1_3950 = torch.constant.int 1
    %2851 = torch.aten.transpose.int %2850, %int0_3949, %int1_3950 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.7.txt_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.7.txt_attn.proj.bias : tensor<3072xf16>
    %2852 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.txt_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Bias, input and weight are upcast to f32 (dtype code 6); the matmul and bias add run in f32.
    %int6_3951 = torch.constant.int 6
    %2853 = torch.prims.convert_element_type %2852, %int6_3951 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_3952 = torch.constant.int 6
    %2854 = torch.prims.convert_element_type %2849, %int6_3952 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_3953 = torch.constant.int 6
    %2855 = torch.prims.convert_element_type %2851, %int6_3953 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %2856 = torch.aten.mm %2854, %2855 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Both the product and the bias are multiplied by the integer constant 1 before the add.
    %int1_3954 = torch.constant.int 1
    %2857 = torch.aten.mul.Scalar %2856, %int1_3954 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_3955 = torch.constant.int 1
    %2858 = torch.aten.mul.Scalar %2853, %int1_3955 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_3956 = torch.constant.int 1
    %2859 = torch.aten.add.Tensor %2857, %2858, %int1_3956 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    // Downcast to f16 (dtype code 5) and restore the leading batch dim.
    %int5_3957 = torch.constant.int 5
    %2860 = torch.prims.convert_element_type %2859, %int5_3957 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_3958 = torch.constant.int 1
    %int512_3959 = torch.constant.int 512
    %int3072_3960 = torch.constant.int 3072
    %2861 = torch.prim.ListConstruct %int1_3958, %int512_3959, %int3072_3960 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2862 = torch.aten.view %2860, %2861 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Multiply by the broadcast [1,1,3072] tensor %2609 and add onto %2570.
    // NOTE(review): %2609 and %2570 are defined outside this chunk — presumably the
    // modulation gate and the text-stream residual input; confirm.
    %2863 = torch.aten.mul.Tensor %2609, %2862 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_3961 = torch.constant.int 1
    %2864 = torch.aten.add.Tensor %2570, %2863, %int1_3961 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // --- Text stream: normalization over the last dim, then scale/shift ---
    // Computes (1 + %2611) * normalize(%2864) + %2610, where normalize subtracts the mean
    // and multiplies by rsqrt(var + eps) over dim 2, with no learned affine parameters.
    // NOTE(review): %2611 and %2610 are defined outside this chunk — presumably the
    // modulation scale and shift; confirm.
    %int1_3962 = torch.constant.int 1
    %int1_3963 = torch.constant.int 1
    %2865 = torch.aten.add.Scalar %2611, %int1_3962, %int1_3963 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Statistics are computed in f32 (dtype code 6).
    %int6_3964 = torch.constant.int 6
    %2866 = torch.prims.convert_element_type %2864, %int6_3964 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // var_mean over dim 2 with correction = 0 and keepdim = true;
    // result0 is the variance, result1 is the mean.
    %int2_3965 = torch.constant.int 2
    %2867 = torch.prim.ListConstruct %int2_3965 : (!torch.int) -> !torch.list<int>
    %int0_3966 = torch.constant.int 0
    %true_3967 = torch.constant.bool true
    %result0_3968, %result1_3969 = torch.aten.var_mean.correction %2866, %2867, %int0_3966, %true_3967 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(var + eps), with eps ~= 1e-6.
    %float9.999990e-07_3970 = torch.constant.float 9.9999999999999995E-7
    %int1_3971 = torch.constant.int 1
    %2868 = torch.aten.add.Scalar %result0_3968, %float9.999990e-07_3970, %int1_3971 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %2869 = torch.aten.rsqrt %2868 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // The subtraction takes the f16 tensor %2864 and the f32 mean; the result type is f32.
    %int1_3972 = torch.constant.int 1
    %2870 = torch.aten.sub.Tensor %2864, %result1_3969, %int1_3972 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %2871 = torch.aten.mul.Tensor %2870, %2869 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    // Downcast to f16 (dtype code 5), then apply the scale and the shift.
    %int5_3973 = torch.constant.int 5
    %2872 = torch.prims.convert_element_type %2871, %int5_3973 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %2873 = torch.aten.mul.Tensor %2865, %2872 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_3974 = torch.constant.int 1
    %2874 = torch.aten.add.Tensor %2873, %2610, %int1_3974 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Flatten to [512,3072]; the op that consumes %2876 lies beyond this chunk.
    %int512_3975 = torch.constant.int 512
    %int3072_3976 = torch.constant.int 3072
    %2875 = torch.prim.ListConstruct %int512_3975, %int3072_3976 : (!torch.int, !torch.int) -> !torch.list<int>
    %2876 = torch.aten.view %2874, %2875 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Linear layer double_blocks.7.txt_mlp.0: [512,3072] x [3072,12288] + bias.
    // The stored weight is [12288,3072] and is transposed before the matmul.
    %__auto.sampler.double_blocks.7.txt_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.7.txt_mlp.0.weight : tensor<12288x3072xf16>
    %2877 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.txt_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_3977 = torch.constant.int 0
    %int1_3978 = torch.constant.int 1
    %2878 = torch.aten.transpose.int %2877, %int0_3977, %int1_3978 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.7.txt_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.7.txt_mlp.0.bias : tensor<12288xf16>
    %2879 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.txt_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Bias, activation and weight are all widened to f32 so the matmul and the
    // bias add run in f32.
    %int6_3979 = torch.constant.int 6
    %2880 = torch.prims.convert_element_type %2879, %int6_3979 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_3980 = torch.constant.int 6
    %2881 = torch.prims.convert_element_type %2876, %int6_3980 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_3981 = torch.constant.int 6
    %2882 = torch.prims.convert_element_type %2878, %int6_3981 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %2883 = torch.aten.mm %2881, %2882 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[512,12288],f32>
    // Both operands are multiplied by the integer scalar 1 before the add.
    // NOTE(review): these look like the unit alpha/beta factors of a decomposed
    // addmm — confirm against the exporter.
    %int1_3982 = torch.constant.int 1
    %2884 = torch.aten.mul.Scalar %2883, %int1_3982 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int1_3983 = torch.constant.int 1
    %2885 = torch.aten.mul.Scalar %2880, %int1_3983 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_3984 = torch.constant.int 1
    %2886 = torch.aten.add.Tensor %2884, %2885, %int1_3984 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    // Narrow back to f16 and restore the batch dimension: [1,512,12288].
    %int5_3985 = torch.constant.int 5
    %2887 = torch.prims.convert_element_type %2886, %int5_3985 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f16>
    %int1_3986 = torch.constant.int 1
    %int512_3987 = torch.constant.int 512
    %int12288_3988 = torch.constant.int 12288
    %2888 = torch.prim.ListConstruct %int1_3986, %int512_3987, %int12288_3988 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2889 = torch.aten.view %2887, %2888 : !torch.vtensor<[512,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,512,12288],f16>
    // GELU (approximate = "tanh") on the f16 MLP hidden activations, then
    // flatten to [512,12288] for the second linear layer.
    %str_3989 = torch.constant.str "tanh"
    %2890 = torch.aten.gelu %2889, %str_3989 : !torch.vtensor<[1,512,12288],f16>, !torch.str -> !torch.vtensor<[1,512,12288],f16>
    %int512_3990 = torch.constant.int 512
    %int12288_3991 = torch.constant.int 12288
    %2891 = torch.prim.ListConstruct %int512_3990, %int12288_3991 : (!torch.int, !torch.int) -> !torch.list<int>
    %2892 = torch.aten.view %2890, %2891 : !torch.vtensor<[1,512,12288],f16>, !torch.list<int> -> !torch.vtensor<[512,12288],f16>
    // Linear layer double_blocks.7.txt_mlp.2: [512,12288] x [12288,3072] + bias.
    // The stored weight is [3072,12288] and is transposed before the matmul.
    %__auto.sampler.double_blocks.7.txt_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.7.txt_mlp.2.weight : tensor<3072x12288xf16>
    %2893 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.txt_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_3992 = torch.constant.int 0
    %int1_3993 = torch.constant.int 1
    %2894 = torch.aten.transpose.int %2893, %int0_3992, %int1_3993 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.7.txt_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.7.txt_mlp.2.bias : tensor<3072xf16>
    %2895 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.txt_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Widen bias, activation and weight to f32 for the matmul and bias add.
    %int6_3994 = torch.constant.int 6
    %2896 = torch.prims.convert_element_type %2895, %int6_3994 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_3995 = torch.constant.int 6
    %2897 = torch.prims.convert_element_type %2892, %int6_3995 : !torch.vtensor<[512,12288],f16>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int6_3996 = torch.constant.int 6
    %2898 = torch.prims.convert_element_type %2894, %int6_3996 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %2899 = torch.aten.mm %2897, %2898 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Multiplications by the integer scalar 1 precede the bias add.
    %int1_3997 = torch.constant.int 1
    %2900 = torch.aten.mul.Scalar %2899, %int1_3997 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_3998 = torch.constant.int 1
    %2901 = torch.aten.mul.Scalar %2896, %int1_3998 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_3999 = torch.constant.int 1
    %2902 = torch.aten.add.Tensor %2900, %2901, %int1_3999 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    // Narrow to f16 and restore the batch dimension: [1,512,3072].
    %int5_4000 = torch.constant.int 5
    %2903 = torch.prims.convert_element_type %2902, %int5_4000 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_4001 = torch.constant.int 1
    %int512_4002 = torch.constant.int 512
    %int3072_4003 = torch.constant.int 3072
    %2904 = torch.prim.ListConstruct %int1_4001, %int512_4002, %int3072_4003 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2905 = torch.aten.view %2903, %2904 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Scale the MLP output by the per-channel [1,1,3072] tensor %2612 and add it
    // onto %2864 (the tensor that was normalized to feed this MLP).
    // NOTE(review): %2612 is defined outside this view; presumably the MLP gate
    // of the block-7 text modulation — confirm.
    %2906 = torch.aten.mul.Tensor %2612, %2905 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_4004 = torch.constant.int 1
    %2907 = torch.aten.add.Tensor %2864, %2906, %int1_4004 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // double_blocks.8.img_mod: SiLU(%119) -> linear 3072 -> 18432 -> split into
    // six [1,1,3072] chunks (%2923 .. %2928).
    // NOTE(review): %119 is defined outside this view; presumably the
    // conditioning vector shared by every block — confirm.
    %2908 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Stored weight is [18432,3072]; transposed to [3072,18432] for the matmul.
    %__auto.sampler.double_blocks.8.img_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.8.img_mod.lin.weight : tensor<18432x3072xf16>
    %2909 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.img_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_4005 = torch.constant.int 0
    %int1_4006 = torch.constant.int 1
    %2910 = torch.aten.transpose.int %2909, %int0_4005, %int1_4006 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.8.img_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.8.img_mod.lin.bias : tensor<18432xf16>
    %2911 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.img_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Widen to f32, matmul, add bias, narrow back to f16.
    %int6_4007 = torch.constant.int 6
    %2912 = torch.prims.convert_element_type %2911, %int6_4007 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_4008 = torch.constant.int 6
    %2913 = torch.prims.convert_element_type %2908, %int6_4008 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_4009 = torch.constant.int 6
    %2914 = torch.prims.convert_element_type %2910, %int6_4009 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %2915 = torch.aten.mm %2913, %2914 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    %int1_4010 = torch.constant.int 1
    %2916 = torch.aten.mul.Scalar %2915, %int1_4010 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_4011 = torch.constant.int 1
    %2917 = torch.aten.mul.Scalar %2912, %int1_4011 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_4012 = torch.constant.int 1
    %2918 = torch.aten.add.Tensor %2916, %2917, %int1_4012 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_4013 = torch.constant.int 5
    %2919 = torch.prims.convert_element_type %2918, %int5_4013 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Slice dim 0 over [0, INT64_MAX) with step 1 (shape is unchanged), insert a
    // size-1 dim at position 1, then slice dim 2 over the same full range:
    // [1,18432] -> [1,1,18432].
    %int0_4014 = torch.constant.int 0
    %int0_4015 = torch.constant.int 0
    %int9223372036854775807_4016 = torch.constant.int 9223372036854775807
    %int1_4017 = torch.constant.int 1
    %2920 = torch.aten.slice.Tensor %2919, %int0_4014, %int0_4015, %int9223372036854775807_4016, %int1_4017 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    %int1_4018 = torch.constant.int 1
    %2921 = torch.aten.unsqueeze %2920, %int1_4018 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    %int2_4019 = torch.constant.int 2
    %int0_4020 = torch.constant.int 0
    %int9223372036854775807_4021 = torch.constant.int 9223372036854775807
    %int1_4022 = torch.constant.int 1
    %2922 = torch.aten.slice.Tensor %2921, %int2_4019, %int0_4020, %int9223372036854775807_4021, %int1_4022 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Six consecutive 3072-wide slices along the last dim (dim = -1).
    // In the visible code %2923 is later used as an additive term and %2924 as a
    // (1 + x) multiplier on the normalized image stream; %2925 .. %2928 are not
    // used within this view.
    // Chunk 0: [0, 3072)
    %int-1_4023 = torch.constant.int -1
    %int0_4024 = torch.constant.int 0
    %int3072_4025 = torch.constant.int 3072
    %int1_4026 = torch.constant.int 1
    %2923 = torch.aten.slice.Tensor %2922, %int-1_4023, %int0_4024, %int3072_4025, %int1_4026 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1: [3072, 6144)
    %int-1_4027 = torch.constant.int -1
    %int3072_4028 = torch.constant.int 3072
    %int6144_4029 = torch.constant.int 6144
    %int1_4030 = torch.constant.int 1
    %2924 = torch.aten.slice.Tensor %2922, %int-1_4027, %int3072_4028, %int6144_4029, %int1_4030 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2: [6144, 9216)
    %int-1_4031 = torch.constant.int -1
    %int6144_4032 = torch.constant.int 6144
    %int9216_4033 = torch.constant.int 9216
    %int1_4034 = torch.constant.int 1
    %2925 = torch.aten.slice.Tensor %2922, %int-1_4031, %int6144_4032, %int9216_4033, %int1_4034 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 3: [9216, 12288)
    %int-1_4035 = torch.constant.int -1
    %int9216_4036 = torch.constant.int 9216
    %int12288_4037 = torch.constant.int 12288
    %int1_4038 = torch.constant.int 1
    %2926 = torch.aten.slice.Tensor %2922, %int-1_4035, %int9216_4036, %int12288_4037, %int1_4038 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 4: [12288, 15360)
    %int-1_4039 = torch.constant.int -1
    %int12288_4040 = torch.constant.int 12288
    %int15360_4041 = torch.constant.int 15360
    %int1_4042 = torch.constant.int 1
    %2927 = torch.aten.slice.Tensor %2922, %int-1_4039, %int12288_4040, %int15360_4041, %int1_4042 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 5: [15360, 18432)
    %int-1_4043 = torch.constant.int -1
    %int15360_4044 = torch.constant.int 15360
    %int18432_4045 = torch.constant.int 18432
    %int1_4046 = torch.constant.int 1
    %2928 = torch.aten.slice.Tensor %2922, %int-1_4043, %int15360_4044, %int18432_4045, %int1_4046 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // double_blocks.8.txt_mod: SiLU(%119) -> linear 3072 -> 18432 -> split into
    // six [1,1,3072] chunks (%2944 .. %2949).
    // The SiLU is applied to the same operand (%119) as the one feeding the
    // img_mod linear earlier in this function.
    %2929 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Stored weight is [18432,3072]; transposed to [3072,18432] for the matmul.
    %__auto.sampler.double_blocks.8.txt_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.8.txt_mod.lin.weight : tensor<18432x3072xf16>
    %2930 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.txt_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_4047 = torch.constant.int 0
    %int1_4048 = torch.constant.int 1
    %2931 = torch.aten.transpose.int %2930, %int0_4047, %int1_4048 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.8.txt_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.8.txt_mod.lin.bias : tensor<18432xf16>
    %2932 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.txt_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Widen to f32, matmul, add bias, narrow back to f16.
    %int6_4049 = torch.constant.int 6
    %2933 = torch.prims.convert_element_type %2932, %int6_4049 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_4050 = torch.constant.int 6
    %2934 = torch.prims.convert_element_type %2929, %int6_4050 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_4051 = torch.constant.int 6
    %2935 = torch.prims.convert_element_type %2931, %int6_4051 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %2936 = torch.aten.mm %2934, %2935 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    %int1_4052 = torch.constant.int 1
    %2937 = torch.aten.mul.Scalar %2936, %int1_4052 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_4053 = torch.constant.int 1
    %2938 = torch.aten.mul.Scalar %2933, %int1_4053 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_4054 = torch.constant.int 1
    %2939 = torch.aten.add.Tensor %2937, %2938, %int1_4054 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_4055 = torch.constant.int 5
    %2940 = torch.prims.convert_element_type %2939, %int5_4055 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0, unsqueeze at dim 1, full-range slice on dim 2:
    // [1,18432] -> [1,1,18432].
    %int0_4056 = torch.constant.int 0
    %int0_4057 = torch.constant.int 0
    %int9223372036854775807_4058 = torch.constant.int 9223372036854775807
    %int1_4059 = torch.constant.int 1
    %2941 = torch.aten.slice.Tensor %2940, %int0_4056, %int0_4057, %int9223372036854775807_4058, %int1_4059 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    %int1_4060 = torch.constant.int 1
    %2942 = torch.aten.unsqueeze %2941, %int1_4060 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    %int2_4061 = torch.constant.int 2
    %int0_4062 = torch.constant.int 0
    %int9223372036854775807_4063 = torch.constant.int 9223372036854775807
    %int1_4064 = torch.constant.int 1
    %2943 = torch.aten.slice.Tensor %2942, %int2_4061, %int0_4062, %int9223372036854775807_4063, %int1_4064 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Six consecutive 3072-wide slices along the last dim (dim = -1).
    // In the visible code %2944 is later used as an additive term and %2945 as a
    // (1 + x) multiplier on the normalized text stream; %2946 .. %2949 are not
    // used within this view.
    // Chunk 0: [0, 3072)
    %int-1_4065 = torch.constant.int -1
    %int0_4066 = torch.constant.int 0
    %int3072_4067 = torch.constant.int 3072
    %int1_4068 = torch.constant.int 1
    %2944 = torch.aten.slice.Tensor %2943, %int-1_4065, %int0_4066, %int3072_4067, %int1_4068 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1: [3072, 6144)
    %int-1_4069 = torch.constant.int -1
    %int3072_4070 = torch.constant.int 3072
    %int6144_4071 = torch.constant.int 6144
    %int1_4072 = torch.constant.int 1
    %2945 = torch.aten.slice.Tensor %2943, %int-1_4069, %int3072_4070, %int6144_4071, %int1_4072 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2: [6144, 9216)
    %int-1_4073 = torch.constant.int -1
    %int6144_4074 = torch.constant.int 6144
    %int9216_4075 = torch.constant.int 9216
    %int1_4076 = torch.constant.int 1
    %2946 = torch.aten.slice.Tensor %2943, %int-1_4073, %int6144_4074, %int9216_4075, %int1_4076 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 3: [9216, 12288)
    %int-1_4077 = torch.constant.int -1
    %int9216_4078 = torch.constant.int 9216
    %int12288_4079 = torch.constant.int 12288
    %int1_4080 = torch.constant.int 1
    %2947 = torch.aten.slice.Tensor %2943, %int-1_4077, %int9216_4078, %int12288_4079, %int1_4080 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 4: [12288, 15360)
    %int-1_4081 = torch.constant.int -1
    %int12288_4082 = torch.constant.int 12288
    %int15360_4083 = torch.constant.int 15360
    %int1_4084 = torch.constant.int 1
    %2948 = torch.aten.slice.Tensor %2943, %int-1_4081, %int12288_4082, %int15360_4083, %int1_4084 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 5: [15360, 18432)
    %int-1_4085 = torch.constant.int -1
    %int15360_4086 = torch.constant.int 15360
    %int18432_4087 = torch.constant.int 18432
    %int1_4088 = torch.constant.int 1
    %2949 = torch.aten.slice.Tensor %2943, %int-1_4085, %int15360_4086, %int18432_4087, %int1_4088 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %2847 ([1,4096,3072]) over its last dimension and modulate it:
    // out = (1 + %2924) * norm(%2847) + %2923, then flatten to [4096,3072].
    // %2923 / %2924 are chunks 0 and 1 of the double_blocks.8.img_mod output.
    // NOTE(review): %2847 is defined outside this view; presumably the image
    // residual stream leaving block 7 — confirm.
    %int6_4089 = torch.constant.int 6
    %2950 = torch.prims.convert_element_type %2847, %int6_4089 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // var_mean over dim 2 with correction = 0 and keepdim = true; result0 is
    // used below as the variance, result1 as the mean.
    %int2_4090 = torch.constant.int 2
    %2951 = torch.prim.ListConstruct %int2_4090 : (!torch.int) -> !torch.list<int>
    %int0_4091 = torch.constant.int 0
    %true_4092 = torch.constant.bool true
    %result0_4093, %result1_4094 = torch.aten.var_mean.correction %2950, %2951, %int0_4091, %true_4092 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(var + eps), with eps ~= 1e-6.
    %float9.999990e-07_4095 = torch.constant.float 9.9999999999999995E-7
    %int1_4096 = torch.constant.int 1
    %2952 = torch.aten.add.Scalar %result0_4093, %float9.999990e-07_4095, %int1_4096 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %2953 = torch.aten.rsqrt %2952 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // (x - mean) * rsqrt(var + eps), computed in f32, then narrowed to f16.
    %int1_4097 = torch.constant.int 1
    %2954 = torch.aten.sub.Tensor %2847, %result1_4094, %int1_4097 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %2955 = torch.aten.mul.Tensor %2954, %2953 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    %int5_4098 = torch.constant.int 5
    %2956 = torch.prims.convert_element_type %2955, %int5_4098 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Modulation: multiply by (1 + %2924), then add %2923.
    %int1_4099 = torch.constant.int 1
    %int1_4100 = torch.constant.int 1
    %2957 = torch.aten.add.Scalar %2924, %int1_4099, %int1_4100 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %2958 = torch.aten.mul.Tensor %2957, %2956 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_4101 = torch.constant.int 1
    %2959 = torch.aten.add.Tensor %2958, %2923, %int1_4101 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Drop the batch dimension so the result can feed a 2-D matmul.
    %int4096_4102 = torch.constant.int 4096
    %int3072_4103 = torch.constant.int 3072
    %2960 = torch.prim.ListConstruct %int4096_4102, %int3072_4103 : (!torch.int, !torch.int) -> !torch.list<int>
    %2961 = torch.aten.view %2959, %2960 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Linear layer double_blocks.8.img_attn.qkv: [4096,3072] x [3072,9216] + bias.
    // The stored weight is [9216,3072] and is transposed before the matmul.
    %__auto.sampler.double_blocks.8.img_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.8.img_attn.qkv.weight : tensor<9216x3072xf16>
    %2962 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.img_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_4104 = torch.constant.int 0
    %int1_4105 = torch.constant.int 1
    %2963 = torch.aten.transpose.int %2962, %int0_4104, %int1_4105 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.8.img_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.8.img_attn.qkv.bias : tensor<9216xf16>
    %2964 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.img_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Widen bias, activation and weight to f32 for the matmul and bias add.
    %int6_4106 = torch.constant.int 6
    %2965 = torch.prims.convert_element_type %2964, %int6_4106 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_4107 = torch.constant.int 6
    %2966 = torch.prims.convert_element_type %2961, %int6_4107 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_4108 = torch.constant.int 6
    %2967 = torch.prims.convert_element_type %2963, %int6_4108 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %2968 = torch.aten.mm %2966, %2967 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[4096,9216],f32>
    // Multiplications by the integer scalar 1 precede the bias add.
    %int1_4109 = torch.constant.int 1
    %2969 = torch.aten.mul.Scalar %2968, %int1_4109 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int1_4110 = torch.constant.int 1
    %2970 = torch.aten.mul.Scalar %2965, %int1_4110 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_4111 = torch.constant.int 1
    %2971 = torch.aten.add.Tensor %2969, %2970, %int1_4111 : !torch.vtensor<[4096,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    // Narrow to f16 and restore the batch dimension: [1,4096,9216].
    %int5_4112 = torch.constant.int 5
    %2972 = torch.prims.convert_element_type %2971, %int5_4112 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f16>
    %int1_4113 = torch.constant.int 1
    %int4096_4114 = torch.constant.int 4096
    %int9216_4115 = torch.constant.int 9216
    %2973 = torch.prim.ListConstruct %int1_4113, %int4096_4114, %int9216_4115 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2974 = torch.aten.view %2972, %2973 : !torch.vtensor<[4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,9216],f16>
    // Trace point "img_qkv": leave the torch type system, cast to a fully dynamic
    // tensor, query its three dims, emit flow.tensor.trace, then cast back to the
    // static shape and re-enter the torch type system. The value passed on
    // (%2976) is derived from %2974 only through these casts.
    %2975 = torch_c.to_builtin_tensor %2974 : !torch.vtensor<[1,4096,9216],f16> -> tensor<1x4096x9216xf16>
    %cast_4116 = tensor.cast %2975 : tensor<1x4096x9216xf16> to tensor<?x?x?xf16>
    %c0_4117 = arith.constant 0 : index
    %dim_4118 = tensor.dim %cast_4116, %c0_4117 : tensor<?x?x?xf16>
    %c1_4119 = arith.constant 1 : index
    %dim_4120 = tensor.dim %cast_4116, %c1_4119 : tensor<?x?x?xf16>
    %c2_4121 = arith.constant 2 : index
    %dim_4122 = tensor.dim %cast_4116, %c2_4121 : tensor<?x?x?xf16>
    flow.tensor.trace "img_qkv" = [%cast_4116 : tensor<?x?x?xf16>{%dim_4118, %dim_4120, %dim_4122}]
    %cast_4123 = tensor.cast %cast_4116 : tensor<?x?x?xf16> to tensor<1x4096x9216xf16>
    %2976 = torch_c.from_builtin_tensor %cast_4123 : tensor<1x4096x9216xf16> -> !torch.vtensor<[1,4096,9216],f16>
    // Split the fused projection %2976 into [1,4096,3,24,128], permute with
    // (2,0,3,1,4) to [3,1,24,4096,128], and take index 0 of the leading dim.
    // That slice is RMS-normalized over the last dim (size 128) and scaled by
    // img_attn.norm.query_norm.scale.
    // NOTE(review): 3 / 24 / 128 presumably mean (q,k,v) / heads / head_dim,
    // with index 0 being the query — consistent with the weight name; confirm.
    %int1_4124 = torch.constant.int 1
    %int4096_4125 = torch.constant.int 4096
    %int3_4126 = torch.constant.int 3
    %int24_4127 = torch.constant.int 24
    %int128_4128 = torch.constant.int 128
    %2977 = torch.prim.ListConstruct %int1_4124, %int4096_4125, %int3_4126, %int24_4127, %int128_4128 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2978 = torch.aten.view %2976, %2977 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_4129 = torch.constant.int 2
    %int0_4130 = torch.constant.int 0
    %int3_4131 = torch.constant.int 3
    %int1_4132 = torch.constant.int 1
    %int4_4133 = torch.constant.int 4
    %2979 = torch.prim.ListConstruct %int2_4129, %int0_4130, %int3_4131, %int1_4132, %int4_4133 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2980 = torch.aten.permute %2978, %2979 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // select(dim = 0, index = 0)
    %int0_4134 = torch.constant.int 0
    %int0_4135 = torch.constant.int 0
    %2981 = torch.aten.select.int %2980, %int0_4134, %int0_4135 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // RMS normalization in f32: x * rsqrt(mean(x^2, dim = -1, keepdim) + eps),
    // with eps ~= 1e-6.
    %int6_4136 = torch.constant.int 6
    %2982 = torch.prims.convert_element_type %2981, %int6_4136 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_4137 = torch.constant.int 2
    %2983 = torch.aten.pow.Tensor_Scalar %2982, %int2_4137 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_4138 = torch.constant.int -1
    %2984 = torch.prim.ListConstruct %int-1_4138 : (!torch.int) -> !torch.list<int>
    %true_4139 = torch.constant.bool true
    %none_4140 = torch.constant.none
    %2985 = torch.aten.mean.dim %2983, %2984, %true_4139, %none_4140 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_4141 = torch.constant.float 9.9999999999999995E-7
    %int1_4142 = torch.constant.int 1
    %2986 = torch.aten.add.Scalar %2985, %float9.999990e-07_4141, %int1_4142 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %2987 = torch.aten.rsqrt %2986 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %2988 = torch.aten.mul.Tensor %2982, %2987 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Narrow to f16 and apply the learned [128] scale (broadcast over the
    // leading dims).
    %int5_4143 = torch.constant.int 5
    %2989 = torch.prims.convert_element_type %2988, %int5_4143 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.8.img_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.8.img_attn.norm.query_norm.scale : tensor<128xf16>
    %2990 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.img_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %2991 = torch.aten.mul.Tensor %2989, %2990 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // Repeat the view + permute of %2976 ([1,4096,9216] -> [1,4096,3,24,128] ->
    // [3,1,24,4096,128]) and take index 1 of the leading dim. That slice is
    // RMS-normalized over the last dim and scaled by
    // img_attn.norm.key_norm.scale.
    // NOTE(review): index 1 is presumably the key — consistent with the weight
    // name; confirm.
    %int1_4144 = torch.constant.int 1
    %int4096_4145 = torch.constant.int 4096
    %int3_4146 = torch.constant.int 3
    %int24_4147 = torch.constant.int 24
    %int128_4148 = torch.constant.int 128
    %2992 = torch.prim.ListConstruct %int1_4144, %int4096_4145, %int3_4146, %int24_4147, %int128_4148 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2993 = torch.aten.view %2976, %2992 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_4149 = torch.constant.int 2
    %int0_4150 = torch.constant.int 0
    %int3_4151 = torch.constant.int 3
    %int1_4152 = torch.constant.int 1
    %int4_4153 = torch.constant.int 4
    %2994 = torch.prim.ListConstruct %int2_4149, %int0_4150, %int3_4151, %int1_4152, %int4_4153 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2995 = torch.aten.permute %2993, %2994 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // select(dim = 0, index = 1)
    %int0_4154 = torch.constant.int 0
    %int1_4155 = torch.constant.int 1
    %2996 = torch.aten.select.int %2995, %int0_4154, %int1_4155 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // RMS normalization in f32: x * rsqrt(mean(x^2, dim = -1, keepdim) + eps),
    // with eps ~= 1e-6.
    %int6_4156 = torch.constant.int 6
    %2997 = torch.prims.convert_element_type %2996, %int6_4156 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_4157 = torch.constant.int 2
    %2998 = torch.aten.pow.Tensor_Scalar %2997, %int2_4157 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_4158 = torch.constant.int -1
    %2999 = torch.prim.ListConstruct %int-1_4158 : (!torch.int) -> !torch.list<int>
    %true_4159 = torch.constant.bool true
    %none_4160 = torch.constant.none
    %3000 = torch.aten.mean.dim %2998, %2999, %true_4159, %none_4160 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_4161 = torch.constant.float 9.9999999999999995E-7
    %int1_4162 = torch.constant.int 1
    %3001 = torch.aten.add.Scalar %3000, %float9.999990e-07_4161, %int1_4162 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %3002 = torch.aten.rsqrt %3001 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %3003 = torch.aten.mul.Tensor %2997, %3002 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Narrow to f16 and apply the learned [128] scale.
    %int5_4163 = torch.constant.int 5
    %3004 = torch.prims.convert_element_type %3003, %int5_4163 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.8.img_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.8.img_attn.norm.key_norm.scale : tensor<128xf16>
    %3005 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.img_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %3006 = torch.aten.mul.Tensor %3004, %3005 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // Convert both scaled tensors (%2991 and %3006) to dtype code 5; the operand
    // and result types are both f16, so the element type is unchanged.
    %int5_4164 = torch.constant.int 5
    %3007 = torch.prims.convert_element_type %2991, %int5_4164 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_4165 = torch.constant.int 5
    %3008 = torch.prims.convert_element_type %3006, %int5_4165 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // --- double_blocks.8 / txt: normalization over the feature dim + scale/shift ---
    // Upcast %2907 to f32 (dtype code 6 yields f32 here) to compute the statistics.
    %int6_4166 = torch.constant.int 6
    %3009 = torch.prims.convert_element_type %2907, %int6_4166 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // Variance (result0) and mean (result1) over dim 2, correction = 0, keepdim = true.
    %int2_4167 = torch.constant.int 2
    %3010 = torch.prim.ListConstruct %int2_4167 : (!torch.int) -> !torch.list<int>
    %int0_4168 = torch.constant.int 0
    %true_4169 = torch.constant.bool true
    %result0_4170, %result1_4171 = torch.aten.var_mean.correction %3009, %3010, %int0_4168, %true_4169 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(var + ~1e-6).
    %float9.999990e-07_4172 = torch.constant.float 9.9999999999999995E-7
    %int1_4173 = torch.constant.int 1
    %3011 = torch.aten.add.Scalar %result0_4170, %float9.999990e-07_4172, %int1_4173 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %3012 = torch.aten.rsqrt %3011 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // (x - mean) * rsqrt(...). The subtraction takes the f16 input %2907 and the f32
    // mean and produces an f32 result; the product is then cast to f16.
    %int1_4174 = torch.constant.int 1
    %3013 = torch.aten.sub.Tensor %2907, %result1_4171, %int1_4174 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %3014 = torch.aten.mul.Tensor %3013, %3012 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_4175 = torch.constant.int 5
    %3015 = torch.prims.convert_element_type %3014, %int5_4175 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // (1 + %2945) * normalized + %2944, broadcast from [1,1,3072] to [1,512,3072].
    // NOTE(review): %2945 / %2944 are presumably the text modulation scale / shift
    // computed earlier in the function - confirm.
    %int1_4176 = torch.constant.int 1
    %int1_4177 = torch.constant.int 1
    %3016 = torch.aten.add.Scalar %2945, %int1_4176, %int1_4177 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %3017 = torch.aten.mul.Tensor %3016, %3015 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_4178 = torch.constant.int 1
    %3018 = torch.aten.add.Tensor %3017, %2944, %int1_4178 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // --- double_blocks.8 / txt_attn.qkv: linear projection 3072 -> 9216 ------------
    // Drop the unit batch dimension: [1,512,3072] -> [512,3072].
    %int512_4179 = torch.constant.int 512
    %int3072_4180 = torch.constant.int 3072
    %3019 = torch.prim.ListConstruct %int512_4179, %int3072_4180 : (!torch.int, !torch.int) -> !torch.list<int>
    %3020 = torch.aten.view %3018, %3019 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Load the [9216,3072] weight and transpose it to [3072,9216]; load the [9216] bias.
    %__auto.sampler.double_blocks.8.txt_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.8.txt_attn.qkv.weight : tensor<9216x3072xf16>
    %3021 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.txt_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_4181 = torch.constant.int 0
    %int1_4182 = torch.constant.int 1
    %3022 = torch.aten.transpose.int %3021, %int0_4181, %int1_4182 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.8.txt_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.8.txt_attn.qkv.bias : tensor<9216xf16>
    %3023 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.txt_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Bias, activation and weight are all upcast to f32 before the matmul.
    %int6_4183 = torch.constant.int 6
    %3024 = torch.prims.convert_element_type %3023, %int6_4183 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_4184 = torch.constant.int 6
    %3025 = torch.prims.convert_element_type %3020, %int6_4184 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_4185 = torch.constant.int 6
    %3026 = torch.prims.convert_element_type %3022, %int6_4185 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %3027 = torch.aten.mm %3025, %3026 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[512,9216],f32>
    // Both operands are multiplied by the integer constant 1 before the bias add.
    // NOTE(review): these look like the alpha/beta factors of a decomposed addmm - confirm.
    %int1_4186 = torch.constant.int 1
    %3028 = torch.aten.mul.Scalar %3027, %int1_4186 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int1_4187 = torch.constant.int 1
    %3029 = torch.aten.mul.Scalar %3024, %int1_4187 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_4188 = torch.constant.int 1
    %3030 = torch.aten.add.Tensor %3028, %3029, %int1_4188 : !torch.vtensor<[512,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    // Downcast to f16 and restore the batch dimension: [512,9216] -> [1,512,9216].
    %int5_4189 = torch.constant.int 5
    %3031 = torch.prims.convert_element_type %3030, %int5_4189 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f16>
    %int1_4190 = torch.constant.int 1
    %int512_4191 = torch.constant.int 512
    %int9216_4192 = torch.constant.int 9216
    %3032 = torch.prim.ListConstruct %int1_4190, %int512_4191, %int9216_4192 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3033 = torch.aten.view %3031, %3032 : !torch.vtensor<[512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,9216],f16>
    // Trace point "txt_qkv": leave the torch type system, cast to a fully dynamic
    // shape, query each dimension for the trace op, then cast back to the static shape.
    %3034 = torch_c.to_builtin_tensor %3033 : !torch.vtensor<[1,512,9216],f16> -> tensor<1x512x9216xf16>
    %cast_4193 = tensor.cast %3034 : tensor<1x512x9216xf16> to tensor<?x?x?xf16>
    %c0_4194 = arith.constant 0 : index
    %dim_4195 = tensor.dim %cast_4193, %c0_4194 : tensor<?x?x?xf16>
    %c1_4196 = arith.constant 1 : index
    %dim_4197 = tensor.dim %cast_4193, %c1_4196 : tensor<?x?x?xf16>
    %c2_4198 = arith.constant 2 : index
    %dim_4199 = tensor.dim %cast_4193, %c2_4198 : tensor<?x?x?xf16>
    flow.tensor.trace "txt_qkv" = [%cast_4193 : tensor<?x?x?xf16>{%dim_4195, %dim_4197, %dim_4199}]
    %cast_4200 = tensor.cast %cast_4193 : tensor<?x?x?xf16> to tensor<1x512x9216xf16>
    %3035 = torch_c.from_builtin_tensor %cast_4200 : tensor<1x512x9216xf16> -> !torch.vtensor<[1,512,9216],f16>
    // --- double_blocks.8 / txt_attn: extract and normalize the query ---------------
    // Split the fused 9216-wide projection into (3, 24, 128):
    // [1,512,9216] -> [1,512,3,24,128].
    %int1_4201 = torch.constant.int 1
    %int512_4202 = torch.constant.int 512
    %int3_4203 = torch.constant.int 3
    %int24_4204 = torch.constant.int 24
    %int128_4205 = torch.constant.int 128
    %3036 = torch.prim.ListConstruct %int1_4201, %int512_4202, %int3_4203, %int24_4204, %int128_4205 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3037 = torch.aten.view %3035, %3036 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    // Permute with order (2,0,3,1,4): -> [3,1,24,512,128].
    %int2_4206 = torch.constant.int 2
    %int0_4207 = torch.constant.int 0
    %int3_4208 = torch.constant.int 3
    %int1_4209 = torch.constant.int 1
    %int4_4210 = torch.constant.int 4
    %3038 = torch.prim.ListConstruct %int2_4206, %int0_4207, %int3_4208, %int1_4209, %int4_4210 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3039 = torch.aten.permute %3037, %3038 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // Index 0 along dim 0 is the slice normalized with query_norm below.
    %int0_4211 = torch.constant.int 0
    %int0_4212 = torch.constant.int 0
    %3040 = torch.aten.select.int %3039, %int0_4211, %int0_4212 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // RMS-style normalization in f32: x * rsqrt(mean(x^2, dim=-1, keepdim) + ~1e-6).
    %int6_4213 = torch.constant.int 6
    %3041 = torch.prims.convert_element_type %3040, %int6_4213 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_4214 = torch.constant.int 2
    %3042 = torch.aten.pow.Tensor_Scalar %3041, %int2_4214 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_4215 = torch.constant.int -1
    %3043 = torch.prim.ListConstruct %int-1_4215 : (!torch.int) -> !torch.list<int>
    %true_4216 = torch.constant.bool true
    %none_4217 = torch.constant.none
    %3044 = torch.aten.mean.dim %3042, %3043, %true_4216, %none_4217 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_4218 = torch.constant.float 9.9999999999999995E-7
    %int1_4219 = torch.constant.int 1
    %3045 = torch.aten.add.Scalar %3044, %float9.999990e-07_4218, %int1_4219 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %3046 = torch.aten.rsqrt %3045 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %3047 = torch.aten.mul.Tensor %3041, %3046 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    // Back to f16, then multiply by the learned [128] query_norm scale.
    %int5_4220 = torch.constant.int 5
    %3048 = torch.prims.convert_element_type %3047, %int5_4220 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.8.txt_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.8.txt_attn.norm.query_norm.scale : tensor<128xf16>
    %3049 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.txt_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %3050 = torch.aten.mul.Tensor %3048, %3049 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // --- double_blocks.8 / txt_attn: extract and normalize the key -----------------
    // Same view + permute of %3035 as for the query: [1,512,9216] -> [3,1,24,512,128].
    %int1_4221 = torch.constant.int 1
    %int512_4222 = torch.constant.int 512
    %int3_4223 = torch.constant.int 3
    %int24_4224 = torch.constant.int 24
    %int128_4225 = torch.constant.int 128
    %3051 = torch.prim.ListConstruct %int1_4221, %int512_4222, %int3_4223, %int24_4224, %int128_4225 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3052 = torch.aten.view %3035, %3051 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_4226 = torch.constant.int 2
    %int0_4227 = torch.constant.int 0
    %int3_4228 = torch.constant.int 3
    %int1_4229 = torch.constant.int 1
    %int4_4230 = torch.constant.int 4
    %3053 = torch.prim.ListConstruct %int2_4226, %int0_4227, %int3_4228, %int1_4229, %int4_4230 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3054 = torch.aten.permute %3052, %3053 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // Index 1 along dim 0 is the slice normalized with key_norm below.
    %int0_4231 = torch.constant.int 0
    %int1_4232 = torch.constant.int 1
    %3055 = torch.aten.select.int %3054, %int0_4231, %int1_4232 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // RMS-style normalization in f32: x * rsqrt(mean(x^2, dim=-1, keepdim) + ~1e-6).
    %int6_4233 = torch.constant.int 6
    %3056 = torch.prims.convert_element_type %3055, %int6_4233 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_4234 = torch.constant.int 2
    %3057 = torch.aten.pow.Tensor_Scalar %3056, %int2_4234 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_4235 = torch.constant.int -1
    %3058 = torch.prim.ListConstruct %int-1_4235 : (!torch.int) -> !torch.list<int>
    %true_4236 = torch.constant.bool true
    %none_4237 = torch.constant.none
    %3059 = torch.aten.mean.dim %3057, %3058, %true_4236, %none_4237 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_4238 = torch.constant.float 9.9999999999999995E-7
    %int1_4239 = torch.constant.int 1
    %3060 = torch.aten.add.Scalar %3059, %float9.999990e-07_4238, %int1_4239 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %3061 = torch.aten.rsqrt %3060 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %3062 = torch.aten.mul.Tensor %3056, %3061 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    // Back to f16, then multiply by the learned [128] key_norm scale.
    %int5_4240 = torch.constant.int 5
    %3063 = torch.prims.convert_element_type %3062, %int5_4240 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.8.txt_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.8.txt_attn.norm.key_norm.scale : tensor<128xf16>
    %3064 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.txt_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %3065 = torch.aten.mul.Tensor %3063, %3064 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // f16 -> f16 conversions of the normalized text query (%3050) and key (%3065);
    // the element type is unchanged.
    %int5_4241 = torch.constant.int 5
    %3066 = torch.prims.convert_element_type %3050, %int5_4241 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int5_4242 = torch.constant.int 5
    %3067 = torch.prims.convert_element_type %3065, %int5_4242 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // --- double_blocks.8: concatenate the 512-row and 4096-row tensors -------------
    // Each cat is along dim 2 with the 512-row (txt_attn) tensor first:
    // [1,24,512,128] ++ [1,24,4096,128] -> [1,24,4608,128].
    // First cat: %3066 (txt query) with %3007.
    %3068 = torch.prim.ListConstruct %3066, %3007 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_4243 = torch.constant.int 2
    %3069 = torch.aten.cat %3068, %int2_4243 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Second cat: %3067 (txt key) with %3008 (img key).
    %3070 = torch.prim.ListConstruct %3067, %3008 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_4244 = torch.constant.int 2
    %3071 = torch.aten.cat %3070, %int2_4244 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Slice at index 2 of the text projection %3035, using the same view + permute
    // as above. No normalization is applied to this slice before the cat below.
    %int1_4245 = torch.constant.int 1
    %int512_4246 = torch.constant.int 512
    %int3_4247 = torch.constant.int 3
    %int24_4248 = torch.constant.int 24
    %int128_4249 = torch.constant.int 128
    %3072 = torch.prim.ListConstruct %int1_4245, %int512_4246, %int3_4247, %int24_4248, %int128_4249 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3073 = torch.aten.view %3035, %3072 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_4250 = torch.constant.int 2
    %int0_4251 = torch.constant.int 0
    %int3_4252 = torch.constant.int 3
    %int1_4253 = torch.constant.int 1
    %int4_4254 = torch.constant.int 4
    %3074 = torch.prim.ListConstruct %int2_4250, %int0_4251, %int3_4252, %int1_4253, %int4_4254 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3075 = torch.aten.permute %3073, %3074 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    %int0_4255 = torch.constant.int 0
    %int2_4256 = torch.constant.int 2
    %3076 = torch.aten.select.int %3075, %int0_4255, %int2_4256 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Slice at index 2 of %2976 ([1,4096,9216]), same view + permute + select.
    // NOTE(review): %2976 is presumably the img_attn.qkv projection output - confirm.
    %int1_4257 = torch.constant.int 1
    %int4096_4258 = torch.constant.int 4096
    %int3_4259 = torch.constant.int 3
    %int24_4260 = torch.constant.int 24
    %int128_4261 = torch.constant.int 128
    %3077 = torch.prim.ListConstruct %int1_4257, %int4096_4258, %int3_4259, %int24_4260, %int128_4261 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3078 = torch.aten.view %2976, %3077 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_4262 = torch.constant.int 2
    %int0_4263 = torch.constant.int 0
    %int3_4264 = torch.constant.int 3
    %int1_4265 = torch.constant.int 1
    %int4_4266 = torch.constant.int 4
    %3079 = torch.prim.ListConstruct %int2_4262, %int0_4263, %int3_4264, %int1_4265, %int4_4266 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3080 = torch.aten.permute %3078, %3079 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    %int0_4267 = torch.constant.int 0
    %int2_4268 = torch.constant.int 2
    %3081 = torch.aten.select.int %3080, %int0_4267, %int2_4268 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Third cat: the two index-2 slices, 512-row tensor first; traced as "v" below.
    %3082 = torch.prim.ListConstruct %3076, %3081 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_4269 = torch.constant.int 2
    %3083 = torch.aten.cat %3082, %int2_4269 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // --- double_blocks.8: trace points "q", "k", "v" -------------------------------
    // Each trace follows the same pattern: convert to a builtin tensor, cast to a
    // fully dynamic shape, query all four dimensions, emit flow.tensor.trace, then
    // cast back to the static shape and re-enter the torch type system.
    // Trace "q" (%3069 -> %3085).
    %3084 = torch_c.to_builtin_tensor %3069 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_4270 = tensor.cast %3084 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_4271 = arith.constant 0 : index
    %dim_4272 = tensor.dim %cast_4270, %c0_4271 : tensor<?x?x?x?xf16>
    %c1_4273 = arith.constant 1 : index
    %dim_4274 = tensor.dim %cast_4270, %c1_4273 : tensor<?x?x?x?xf16>
    %c2_4275 = arith.constant 2 : index
    %dim_4276 = tensor.dim %cast_4270, %c2_4275 : tensor<?x?x?x?xf16>
    %c3_4277 = arith.constant 3 : index
    %dim_4278 = tensor.dim %cast_4270, %c3_4277 : tensor<?x?x?x?xf16>
    flow.tensor.trace "q" = [%cast_4270 : tensor<?x?x?x?xf16>{%dim_4272, %dim_4274, %dim_4276, %dim_4278}]
    %cast_4279 = tensor.cast %cast_4270 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %3085 = torch_c.from_builtin_tensor %cast_4279 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Trace "k" (%3071 -> %3087).
    %3086 = torch_c.to_builtin_tensor %3071 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_4280 = tensor.cast %3086 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_4281 = arith.constant 0 : index
    %dim_4282 = tensor.dim %cast_4280, %c0_4281 : tensor<?x?x?x?xf16>
    %c1_4283 = arith.constant 1 : index
    %dim_4284 = tensor.dim %cast_4280, %c1_4283 : tensor<?x?x?x?xf16>
    %c2_4285 = arith.constant 2 : index
    %dim_4286 = tensor.dim %cast_4280, %c2_4285 : tensor<?x?x?x?xf16>
    %c3_4287 = arith.constant 3 : index
    %dim_4288 = tensor.dim %cast_4280, %c3_4287 : tensor<?x?x?x?xf16>
    flow.tensor.trace "k" = [%cast_4280 : tensor<?x?x?x?xf16>{%dim_4282, %dim_4284, %dim_4286, %dim_4288}]
    %cast_4289 = tensor.cast %cast_4280 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %3087 = torch_c.from_builtin_tensor %cast_4289 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Trace "v" (%3083 -> %3089).
    %3088 = torch_c.to_builtin_tensor %3083 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_4290 = tensor.cast %3088 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_4291 = arith.constant 0 : index
    %dim_4292 = tensor.dim %cast_4290, %c0_4291 : tensor<?x?x?x?xf16>
    %c1_4293 = arith.constant 1 : index
    %dim_4294 = tensor.dim %cast_4290, %c1_4293 : tensor<?x?x?x?xf16>
    %c2_4295 = arith.constant 2 : index
    %dim_4296 = tensor.dim %cast_4290, %c2_4295 : tensor<?x?x?x?xf16>
    %c3_4297 = arith.constant 3 : index
    %dim_4298 = tensor.dim %cast_4290, %c3_4297 : tensor<?x?x?x?xf16>
    flow.tensor.trace "v" = [%cast_4290 : tensor<?x?x?x?xf16>{%dim_4292, %dim_4294, %dim_4296, %dim_4298}]
    %cast_4299 = tensor.cast %cast_4290 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %3089 = torch_c.from_builtin_tensor %cast_4299 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // --- double_blocks.8: combine q and k with the [1,1,4608,64,2,2] table %211 ----
    // NOTE(review): %211 is defined earlier in the function; presumably the rotary
    // position-embedding table shared by all blocks - confirm.
    // Upcast q to f32 and view the 128 channels as 64 pairs:
    // [1,24,4608,128] -> [1,24,4608,64,1,2] (the -1 entry resolves to 64).
    %int6_4300 = torch.constant.int 6
    %3090 = torch.prims.convert_element_type %3085, %int6_4300 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_4301 = torch.constant.int 1
    %int24_4302 = torch.constant.int 24
    %int4608_4303 = torch.constant.int 4608
    %int-1_4304 = torch.constant.int -1
    %int1_4305 = torch.constant.int 1
    %int2_4306 = torch.constant.int 2
    %3091 = torch.prim.ListConstruct %int1_4301, %int24_4302, %int4608_4303, %int-1_4304, %int1_4305, %int2_4306 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3092 = torch.aten.view %3090, %3091 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Same upcast + view for k.
    %int6_4307 = torch.constant.int 6
    %3093 = torch.prims.convert_element_type %3087, %int6_4307 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_4308 = torch.constant.int 1
    %int24_4309 = torch.constant.int 24
    %int4608_4310 = torch.constant.int 4608
    %int-1_4311 = torch.constant.int -1
    %int1_4312 = torch.constant.int 1
    %int2_4313 = torch.constant.int 2
    %3094 = torch.prim.ListConstruct %int1_4308, %int24_4309, %int4608_4310, %int-1_4311, %int1_4312, %int2_4313 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3095 = torch.aten.view %3093, %3094 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // q: %3102 = %211[..., 0] * q[..., 0] + %211[..., 1] * q[..., 1]
    // (selects along dim 5; the products broadcast to [1,24,4608,64,2]).
    %int5_4314 = torch.constant.int 5
    %int0_4315 = torch.constant.int 0
    %3096 = torch.aten.select.int %211, %int5_4314, %int0_4315 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_4316 = torch.constant.int 5
    %int0_4317 = torch.constant.int 0
    %3097 = torch.aten.select.int %3092, %int5_4316, %int0_4317 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %3098 = torch.aten.mul.Tensor %3096, %3097 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_4318 = torch.constant.int 5
    %int1_4319 = torch.constant.int 1
    %3099 = torch.aten.select.int %211, %int5_4318, %int1_4319 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_4320 = torch.constant.int 5
    %int1_4321 = torch.constant.int 1
    %3100 = torch.aten.select.int %3092, %int5_4320, %int1_4321 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %3101 = torch.aten.mul.Tensor %3099, %3100 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_4322 = torch.constant.int 1
    %3102 = torch.aten.add.Tensor %3098, %3101, %int1_4322 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k: %3109 is the same combination applied to %3095.
    %int5_4323 = torch.constant.int 5
    %int0_4324 = torch.constant.int 0
    %3103 = torch.aten.select.int %211, %int5_4323, %int0_4324 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_4325 = torch.constant.int 5
    %int0_4326 = torch.constant.int 0
    %3104 = torch.aten.select.int %3095, %int5_4325, %int0_4326 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %3105 = torch.aten.mul.Tensor %3103, %3104 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_4327 = torch.constant.int 5
    %int1_4328 = torch.constant.int 1
    %3106 = torch.aten.select.int %211, %int5_4327, %int1_4328 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_4329 = torch.constant.int 5
    %int1_4330 = torch.constant.int 1
    %3107 = torch.aten.select.int %3095, %int5_4329, %int1_4330 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %3108 = torch.aten.mul.Tensor %3106, %3107 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_4331 = torch.constant.int 1
    %3109 = torch.aten.add.Tensor %3105, %3108, %int1_4331 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Flatten the (64,2) pairs back to 128 channels and downcast q to f16.
    %int1_4332 = torch.constant.int 1
    %int24_4333 = torch.constant.int 24
    %int4608_4334 = torch.constant.int 4608
    %int128_4335 = torch.constant.int 128
    %3110 = torch.prim.ListConstruct %int1_4332, %int24_4333, %int4608_4334, %int128_4335 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3111 = torch.aten.view %3102, %3110 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_4336 = torch.constant.int 5
    %3112 = torch.prims.convert_element_type %3111, %int5_4336 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Same flatten + downcast for k.
    %int1_4337 = torch.constant.int 1
    %int24_4338 = torch.constant.int 24
    %int4608_4339 = torch.constant.int 4608
    %int128_4340 = torch.constant.int 128
    %3113 = torch.prim.ListConstruct %int1_4337, %int24_4338, %int4608_4339, %int128_4340 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3114 = torch.aten.view %3109, %3113 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_4341 = torch.constant.int 5
    %3115 = torch.prims.convert_element_type %3114, %int5_4341 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // --- double_blocks.8: attention over the 4608 rows, then split into 512 / 4096 -
    // Scaled-dot-product attention on (q = %3112, k = %3115, v = %3089). The scalar
    // operands are 0.0, false, none, none.
    // NOTE(review): by the aten signature these are presumably dropout_p, is_causal,
    // attn_mask and scale - confirm.
    // Result #0 is the attention output; result #1 ([1,24,4608], f32) is not used
    // in the statements that follow here.
    %float0.000000e00_4342 = torch.constant.float 0.000000e+00
    %false_4343 = torch.constant.bool false
    %none_4344 = torch.constant.none
    %none_4345 = torch.constant.none
    %3116:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%3112, %3115, %3089, %float0.000000e00_4342, %false_4343, %none_4344, %none_4345) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Permute with order (0,2,1,3): [1,24,4608,128] -> [1,4608,24,128], then merge
    // the last two dimensions: -> [1,4608,3072].
    %int0_4346 = torch.constant.int 0
    %int2_4347 = torch.constant.int 2
    %int1_4348 = torch.constant.int 1
    %int3_4349 = torch.constant.int 3
    %3117 = torch.prim.ListConstruct %int0_4346, %int2_4347, %int1_4348, %int3_4349 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3118 = torch.aten.permute %3116#0, %3117 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_4350 = torch.constant.int 1
    %int4608_4351 = torch.constant.int 4608
    %int3072_4352 = torch.constant.int 3072
    %3119 = torch.prim.ListConstruct %int1_4350, %int4608_4351, %int3072_4352 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3120 = torch.aten.view %3118, %3119 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Rows [0, 512) along dim 1 -> %3122. These are the rows contributed by the
    // txt_attn tensors, which were placed first in the concatenations above.
    // The preceding dim-0 slice [0, INT64_MAX) keeps the whole tensor.
    %int0_4353 = torch.constant.int 0
    %int0_4354 = torch.constant.int 0
    %int9223372036854775807_4355 = torch.constant.int 9223372036854775807
    %int1_4356 = torch.constant.int 1
    %3121 = torch.aten.slice.Tensor %3120, %int0_4353, %int0_4354, %int9223372036854775807_4355, %int1_4356 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_4357 = torch.constant.int 1
    %int0_4358 = torch.constant.int 0
    %int512_4359 = torch.constant.int 512
    %int1_4360 = torch.constant.int 1
    %3122 = torch.aten.slice.Tensor %3121, %int1_4357, %int0_4358, %int512_4359, %int1_4360 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Rows [512, end) along dim 1 -> %3124 ([1,4096,3072]); fed to img_attn.proj below.
    %int0_4361 = torch.constant.int 0
    %int0_4362 = torch.constant.int 0
    %int9223372036854775807_4363 = torch.constant.int 9223372036854775807
    %int1_4364 = torch.constant.int 1
    %3123 = torch.aten.slice.Tensor %3120, %int0_4361, %int0_4362, %int9223372036854775807_4363, %int1_4364 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_4365 = torch.constant.int 1
    %int512_4366 = torch.constant.int 512
    %int9223372036854775807_4367 = torch.constant.int 9223372036854775807
    %int1_4368 = torch.constant.int 1
    %3124 = torch.aten.slice.Tensor %3123, %int1_4365, %int512_4366, %int9223372036854775807_4367, %int1_4368 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // --- double_blocks.8 / img_attn.proj: linear 3072 -> 3072 + gated residual -----
    // Drop the unit batch dimension: [1,4096,3072] -> [4096,3072].
    %int4096_4369 = torch.constant.int 4096
    %int3072_4370 = torch.constant.int 3072
    %3125 = torch.prim.ListConstruct %int4096_4369, %int3072_4370 : (!torch.int, !torch.int) -> !torch.list<int>
    %3126 = torch.aten.view %3124, %3125 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Load the [3072,3072] weight and swap its two dimensions; load the [3072] bias.
    %__auto.sampler.double_blocks.8.img_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.8.img_attn.proj.weight : tensor<3072x3072xf16>
    %3127 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.img_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_4371 = torch.constant.int 0
    %int1_4372 = torch.constant.int 1
    %3128 = torch.aten.transpose.int %3127, %int0_4371, %int1_4372 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.8.img_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.8.img_attn.proj.bias : tensor<3072xf16>
    %3129 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.img_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, activation and weight to f32; matmul; add bias; downcast to f16.
    %int6_4373 = torch.constant.int 6
    %3130 = torch.prims.convert_element_type %3129, %int6_4373 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_4374 = torch.constant.int 6
    %3131 = torch.prims.convert_element_type %3126, %int6_4374 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_4375 = torch.constant.int 6
    %3132 = torch.prims.convert_element_type %3128, %int6_4375 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %3133 = torch.aten.mm %3131, %3132 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Both operands are multiplied by the integer constant 1 before the bias add.
    %int1_4376 = torch.constant.int 1
    %3134 = torch.aten.mul.Scalar %3133, %int1_4376 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_4377 = torch.constant.int 1
    %3135 = torch.aten.mul.Scalar %3130, %int1_4377 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_4378 = torch.constant.int 1
    %3136 = torch.aten.add.Tensor %3134, %3135, %int1_4378 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_4379 = torch.constant.int 5
    %3137 = torch.prims.convert_element_type %3136, %int5_4379 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    // Restore the batch dimension: [4096,3072] -> [1,4096,3072].
    %int1_4380 = torch.constant.int 1
    %int4096_4381 = torch.constant.int 4096
    %int3072_4382 = torch.constant.int 3072
    %3138 = torch.prim.ListConstruct %int1_4380, %int4096_4381, %int3072_4382 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3139 = torch.aten.view %3137, %3138 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // %3141 = %2847 + %2925 * proj_out, with %2925 broadcast from [1,1,3072].
    // NOTE(review): %2847 is presumably the image-stream input of this block and
    // %2925 the attention gate from the image modulation - confirm.
    %3140 = torch.aten.mul.Tensor %2925, %3139 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_4383 = torch.constant.int 1
    %3141 = torch.aten.add.Tensor %2847, %3140, %int1_4383 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Normalize %3141 over its last dimension and apply a per-channel
    // scale (1 + %2927) and shift (%2926). %2926/%2927 are defined earlier.
    // Scale factor: %2927 + 1.
    %int1_4384 = torch.constant.int 1
    %int1_4385 = torch.constant.int 1
    %3142 = torch.aten.add.Scalar %2927, %int1_4384, %int1_4385 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Statistics are computed in f32 (dtype code 6).
    %int6_4386 = torch.constant.int 6
    %3143 = torch.prims.convert_element_type %3141, %int6_4386 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // Variance (result0) and mean (result1) over dim 2, correction = 0,
    // keepdim = true.
    %int2_4387 = torch.constant.int 2
    %3144 = torch.prim.ListConstruct %int2_4387 : (!torch.int) -> !torch.list<int>
    %int0_4388 = torch.constant.int 0
    %true_4389 = torch.constant.bool true
    %result0_4390, %result1_4391 = torch.aten.var_mean.correction %3143, %3144, %int0_4388, %true_4389 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(var + eps) with eps ~= 1e-6.
    %float9.999990e-07_4392 = torch.constant.float 9.9999999999999995E-7
    %int1_4393 = torch.constant.int 1
    %3145 = torch.aten.add.Scalar %result0_4390, %float9.999990e-07_4392, %int1_4393 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %3146 = torch.aten.rsqrt %3145 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // (x - mean) * rsqrt(var + eps). Note the subtraction takes the f16 tensor
    // %3141 (not the f32 copy %3143) and produces an f32 result.
    %int1_4394 = torch.constant.int 1
    %3147 = torch.aten.sub.Tensor %3141, %result1_4391, %int1_4394 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %3148 = torch.aten.mul.Tensor %3147, %3146 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Back to f16 (dtype code 5), then scale and shift.
    %int5_4395 = torch.constant.int 5
    %3149 = torch.prims.convert_element_type %3148, %int5_4395 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %3150 = torch.aten.mul.Tensor %3142, %3149 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_4396 = torch.constant.int 1
    %3151 = torch.aten.add.Tensor %3150, %2926, %int1_4396 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // double_blocks.8 img_mlp.0: linear 3072 -> 12288 followed by tanh-GELU.
    // Flatten [1,4096,3072] -> [4096,3072] for the 2-D matmul.
    %int4096_4397 = torch.constant.int 4096
    %int3072_4398 = torch.constant.int 3072
    %3152 = torch.prim.ListConstruct %int4096_4397, %int3072_4398 : (!torch.int, !torch.int) -> !torch.list<int>
    %3153 = torch.aten.view %3151, %3152 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Load the [12288,3072] weight and transpose it to [3072,12288].
    %__auto.sampler.double_blocks.8.img_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.8.img_mlp.0.weight : tensor<12288x3072xf16>
    %3154 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.img_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_4399 = torch.constant.int 0
    %int1_4400 = torch.constant.int 1
    %3155 = torch.aten.transpose.int %3154, %int0_4399, %int1_4400 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    // Load the bias; bias, activation and weight are all upcast to f32
    // (dtype code 6) so the matmul and bias add run in f32.
    %__auto.sampler.double_blocks.8.img_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.8.img_mlp.0.bias : tensor<12288xf16>
    %3156 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.img_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    %int6_4401 = torch.constant.int 6
    %3157 = torch.prims.convert_element_type %3156, %int6_4401 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_4402 = torch.constant.int 6
    %3158 = torch.prims.convert_element_type %3153, %int6_4402 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_4403 = torch.constant.int 6
    %3159 = torch.prims.convert_element_type %3155, %int6_4403 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %3160 = torch.aten.mm %3158, %3159 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[4096,12288],f32>
    // Multiplications by the constant 1 leave the values unchanged
    // (presumably alpha/beta factors left over from an addmm decomposition).
    %int1_4404 = torch.constant.int 1
    %3161 = torch.aten.mul.Scalar %3160, %int1_4404 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int1_4405 = torch.constant.int 1
    %3162 = torch.aten.mul.Scalar %3157, %int1_4405 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    // matmul + bias, then back to f16 (dtype code 5).
    %int1_4406 = torch.constant.int 1
    %3163 = torch.aten.add.Tensor %3161, %3162, %int1_4406 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int5_4407 = torch.constant.int 5
    %3164 = torch.prims.convert_element_type %3163, %int5_4407 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f16>
    // Reshape to [1,4096,12288] and apply GELU with the "tanh" approximation.
    %int1_4408 = torch.constant.int 1
    %int4096_4409 = torch.constant.int 4096
    %int12288_4410 = torch.constant.int 12288
    %3165 = torch.prim.ListConstruct %int1_4408, %int4096_4409, %int12288_4410 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3166 = torch.aten.view %3164, %3165 : !torch.vtensor<[4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,12288],f16>
    %str_4411 = torch.constant.str "tanh"
    %3167 = torch.aten.gelu %3166, %str_4411 : !torch.vtensor<[1,4096,12288],f16>, !torch.str -> !torch.vtensor<[1,4096,12288],f16>
    // Flatten again to [4096,12288] for the next matmul.
    %int4096_4412 = torch.constant.int 4096
    %int12288_4413 = torch.constant.int 12288
    %3168 = torch.prim.ListConstruct %int4096_4412, %int12288_4413 : (!torch.int, !torch.int) -> !torch.list<int>
    %3169 = torch.aten.view %3167, %3168 : !torch.vtensor<[1,4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[4096,12288],f16>
    // double_blocks.8 img_mlp.2: linear 12288 -> 3072, then scaled residual add.
    // Load the [3072,12288] weight and transpose it to [12288,3072].
    %__auto.sampler.double_blocks.8.img_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.8.img_mlp.2.weight : tensor<3072x12288xf16>
    %3170 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.img_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_4414 = torch.constant.int 0
    %int1_4415 = torch.constant.int 1
    %3171 = torch.aten.transpose.int %3170, %int0_4414, %int1_4415 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    // Bias, activation and weight are upcast to f32 (dtype code 6) for the matmul.
    %__auto.sampler.double_blocks.8.img_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.8.img_mlp.2.bias : tensor<3072xf16>
    %3172 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.img_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_4416 = torch.constant.int 6
    %3173 = torch.prims.convert_element_type %3172, %int6_4416 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_4417 = torch.constant.int 6
    %3174 = torch.prims.convert_element_type %3169, %int6_4417 : !torch.vtensor<[4096,12288],f16>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int6_4418 = torch.constant.int 6
    %3175 = torch.prims.convert_element_type %3171, %int6_4418 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %3176 = torch.aten.mm %3174, %3175 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Multiplications by the constant 1 leave the values unchanged.
    %int1_4419 = torch.constant.int 1
    %3177 = torch.aten.mul.Scalar %3176, %int1_4419 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_4420 = torch.constant.int 1
    %3178 = torch.aten.mul.Scalar %3173, %int1_4420 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // matmul + bias, then back to f16 (dtype code 5).
    %int1_4421 = torch.constant.int 1
    %3179 = torch.aten.add.Tensor %3177, %3178, %int1_4421 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_4422 = torch.constant.int 5
    %3180 = torch.prims.convert_element_type %3179, %int5_4422 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    // Reshape to [1,4096,3072].
    %int1_4423 = torch.constant.int 1
    %int4096_4424 = torch.constant.int 4096
    %int3072_4425 = torch.constant.int 3072
    %3181 = torch.prim.ListConstruct %int1_4423, %int4096_4424, %int3072_4425 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3182 = torch.aten.view %3180, %3181 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Scale by the broadcast [1,1,3072] tensor %2928 (defined earlier; presumably
    // the MLP gate -- TODO confirm) and add onto %3141.
    %3183 = torch.aten.mul.Tensor %2928, %3182 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_4426 = torch.constant.int 1
    %3184 = torch.aten.add.Tensor %3141, %3183, %int1_4426 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // double_blocks.8 txt_attn.proj: linear 3072 -> 3072 on the 512-row tensor
    // %3122 (defined earlier), then scaled residual add.
    // Flatten [1,512,3072] -> [512,3072].
    %int512_4427 = torch.constant.int 512
    %int3072_4428 = torch.constant.int 3072
    %3185 = torch.prim.ListConstruct %int512_4427, %int3072_4428 : (!torch.int, !torch.int) -> !torch.list<int>
    %3186 = torch.aten.view %3122, %3185 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Load and transpose the [3072,3072] weight.
    %__auto.sampler.double_blocks.8.txt_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.8.txt_attn.proj.weight : tensor<3072x3072xf16>
    %3187 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.txt_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_4429 = torch.constant.int 0
    %int1_4430 = torch.constant.int 1
    %3188 = torch.aten.transpose.int %3187, %int0_4429, %int1_4430 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    // Bias, activation and weight are upcast to f32 (dtype code 6) for the matmul.
    %__auto.sampler.double_blocks.8.txt_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.8.txt_attn.proj.bias : tensor<3072xf16>
    %3189 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.txt_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_4431 = torch.constant.int 6
    %3190 = torch.prims.convert_element_type %3189, %int6_4431 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_4432 = torch.constant.int 6
    %3191 = torch.prims.convert_element_type %3186, %int6_4432 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_4433 = torch.constant.int 6
    %3192 = torch.prims.convert_element_type %3188, %int6_4433 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %3193 = torch.aten.mm %3191, %3192 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Multiplications by the constant 1 leave the values unchanged.
    %int1_4434 = torch.constant.int 1
    %3194 = torch.aten.mul.Scalar %3193, %int1_4434 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_4435 = torch.constant.int 1
    %3195 = torch.aten.mul.Scalar %3190, %int1_4435 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // matmul + bias, then back to f16 (dtype code 5).
    %int1_4436 = torch.constant.int 1
    %3196 = torch.aten.add.Tensor %3194, %3195, %int1_4436 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_4437 = torch.constant.int 5
    %3197 = torch.prims.convert_element_type %3196, %int5_4437 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    // Reshape to [1,512,3072].
    %int1_4438 = torch.constant.int 1
    %int512_4439 = torch.constant.int 512
    %int3072_4440 = torch.constant.int 3072
    %3198 = torch.prim.ListConstruct %int1_4438, %int512_4439, %int3072_4440 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3199 = torch.aten.view %3197, %3198 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Scale by the broadcast [1,1,3072] tensor %2946 and add onto %2907 (both
    // defined earlier; presumably the attention gate and the residual text
    // stream -- TODO confirm).
    %3200 = torch.aten.mul.Tensor %2946, %3199 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_4441 = torch.constant.int 1
    %3201 = torch.aten.add.Tensor %2907, %3200, %int1_4441 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Normalize %3201 over its last dimension and apply a per-channel
    // scale (1 + %2948) and shift (%2947). %2947/%2948 are defined earlier.
    // Scale factor: %2948 + 1.
    %int1_4442 = torch.constant.int 1
    %int1_4443 = torch.constant.int 1
    %3202 = torch.aten.add.Scalar %2948, %int1_4442, %int1_4443 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Statistics are computed in f32 (dtype code 6).
    %int6_4444 = torch.constant.int 6
    %3203 = torch.prims.convert_element_type %3201, %int6_4444 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // Variance (result0) and mean (result1) over dim 2, correction = 0,
    // keepdim = true.
    %int2_4445 = torch.constant.int 2
    %3204 = torch.prim.ListConstruct %int2_4445 : (!torch.int) -> !torch.list<int>
    %int0_4446 = torch.constant.int 0
    %true_4447 = torch.constant.bool true
    %result0_4448, %result1_4449 = torch.aten.var_mean.correction %3203, %3204, %int0_4446, %true_4447 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(var + eps) with eps ~= 1e-6.
    %float9.999990e-07_4450 = torch.constant.float 9.9999999999999995E-7
    %int1_4451 = torch.constant.int 1
    %3205 = torch.aten.add.Scalar %result0_4448, %float9.999990e-07_4450, %int1_4451 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %3206 = torch.aten.rsqrt %3205 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // (x - mean) * rsqrt(var + eps). The subtraction takes the f16 tensor %3201
    // (not the f32 copy %3203) and produces an f32 result.
    %int1_4452 = torch.constant.int 1
    %3207 = torch.aten.sub.Tensor %3201, %result1_4449, %int1_4452 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %3208 = torch.aten.mul.Tensor %3207, %3206 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    // Back to f16 (dtype code 5), then scale and shift.
    %int5_4453 = torch.constant.int 5
    %3209 = torch.prims.convert_element_type %3208, %int5_4453 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %3210 = torch.aten.mul.Tensor %3202, %3209 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_4454 = torch.constant.int 1
    %3211 = torch.aten.add.Tensor %3210, %2947, %int1_4454 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // double_blocks.8 txt_mlp.0: linear 3072 -> 12288 followed by tanh-GELU.
    // Flatten [1,512,3072] -> [512,3072] for the 2-D matmul.
    %int512_4455 = torch.constant.int 512
    %int3072_4456 = torch.constant.int 3072
    %3212 = torch.prim.ListConstruct %int512_4455, %int3072_4456 : (!torch.int, !torch.int) -> !torch.list<int>
    %3213 = torch.aten.view %3211, %3212 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Load the [12288,3072] weight and transpose it to [3072,12288].
    %__auto.sampler.double_blocks.8.txt_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.8.txt_mlp.0.weight : tensor<12288x3072xf16>
    %3214 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.txt_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_4457 = torch.constant.int 0
    %int1_4458 = torch.constant.int 1
    %3215 = torch.aten.transpose.int %3214, %int0_4457, %int1_4458 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    // Bias, activation and weight are upcast to f32 (dtype code 6) for the matmul.
    %__auto.sampler.double_blocks.8.txt_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.8.txt_mlp.0.bias : tensor<12288xf16>
    %3216 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.txt_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    %int6_4459 = torch.constant.int 6
    %3217 = torch.prims.convert_element_type %3216, %int6_4459 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_4460 = torch.constant.int 6
    %3218 = torch.prims.convert_element_type %3213, %int6_4460 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_4461 = torch.constant.int 6
    %3219 = torch.prims.convert_element_type %3215, %int6_4461 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %3220 = torch.aten.mm %3218, %3219 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[512,12288],f32>
    // Multiplications by the constant 1 leave the values unchanged.
    %int1_4462 = torch.constant.int 1
    %3221 = torch.aten.mul.Scalar %3220, %int1_4462 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int1_4463 = torch.constant.int 1
    %3222 = torch.aten.mul.Scalar %3217, %int1_4463 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    // matmul + bias, then back to f16 (dtype code 5).
    %int1_4464 = torch.constant.int 1
    %3223 = torch.aten.add.Tensor %3221, %3222, %int1_4464 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int5_4465 = torch.constant.int 5
    %3224 = torch.prims.convert_element_type %3223, %int5_4465 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f16>
    // Reshape to [1,512,12288] and apply GELU with the "tanh" approximation.
    %int1_4466 = torch.constant.int 1
    %int512_4467 = torch.constant.int 512
    %int12288_4468 = torch.constant.int 12288
    %3225 = torch.prim.ListConstruct %int1_4466, %int512_4467, %int12288_4468 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3226 = torch.aten.view %3224, %3225 : !torch.vtensor<[512,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,512,12288],f16>
    %str_4469 = torch.constant.str "tanh"
    %3227 = torch.aten.gelu %3226, %str_4469 : !torch.vtensor<[1,512,12288],f16>, !torch.str -> !torch.vtensor<[1,512,12288],f16>
    // Flatten again to [512,12288] for the next matmul.
    %int512_4470 = torch.constant.int 512
    %int12288_4471 = torch.constant.int 12288
    %3228 = torch.prim.ListConstruct %int512_4470, %int12288_4471 : (!torch.int, !torch.int) -> !torch.list<int>
    %3229 = torch.aten.view %3227, %3228 : !torch.vtensor<[1,512,12288],f16>, !torch.list<int> -> !torch.vtensor<[512,12288],f16>
    // double_blocks.8 txt_mlp.2: linear 12288 -> 3072, then scaled residual add.
    // Load the [3072,12288] weight and transpose it to [12288,3072].
    %__auto.sampler.double_blocks.8.txt_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.8.txt_mlp.2.weight : tensor<3072x12288xf16>
    %3230 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.txt_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_4472 = torch.constant.int 0
    %int1_4473 = torch.constant.int 1
    %3231 = torch.aten.transpose.int %3230, %int0_4472, %int1_4473 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    // Bias, activation and weight are upcast to f32 (dtype code 6) for the matmul.
    %__auto.sampler.double_blocks.8.txt_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.8.txt_mlp.2.bias : tensor<3072xf16>
    %3232 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.txt_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_4474 = torch.constant.int 6
    %3233 = torch.prims.convert_element_type %3232, %int6_4474 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_4475 = torch.constant.int 6
    %3234 = torch.prims.convert_element_type %3229, %int6_4475 : !torch.vtensor<[512,12288],f16>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int6_4476 = torch.constant.int 6
    %3235 = torch.prims.convert_element_type %3231, %int6_4476 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %3236 = torch.aten.mm %3234, %3235 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Multiplications by the constant 1 leave the values unchanged.
    %int1_4477 = torch.constant.int 1
    %3237 = torch.aten.mul.Scalar %3236, %int1_4477 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_4478 = torch.constant.int 1
    %3238 = torch.aten.mul.Scalar %3233, %int1_4478 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // matmul + bias, then back to f16 (dtype code 5).
    %int1_4479 = torch.constant.int 1
    %3239 = torch.aten.add.Tensor %3237, %3238, %int1_4479 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_4480 = torch.constant.int 5
    %3240 = torch.prims.convert_element_type %3239, %int5_4480 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    // Reshape to [1,512,3072].
    %int1_4481 = torch.constant.int 1
    %int512_4482 = torch.constant.int 512
    %int3072_4483 = torch.constant.int 3072
    %3241 = torch.prim.ListConstruct %int1_4481, %int512_4482, %int3072_4483 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3242 = torch.aten.view %3240, %3241 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Scale by the broadcast [1,1,3072] tensor %2949 (defined earlier; presumably
    // the MLP gate -- TODO confirm) and add onto %3201.
    %3243 = torch.aten.mul.Tensor %2949, %3242 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_4484 = torch.constant.int 1
    %3244 = torch.aten.add.Tensor %3201, %3243, %int1_4484 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // double_blocks.9 img_mod.lin: SiLU of %119 (defined earlier), a linear
    // 3072 -> 18432, then a split of the result into six [1,1,3072] slices
    // %3260..%3265.
    %3245 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the [18432,3072] weight and transpose it to [3072,18432].
    %__auto.sampler.double_blocks.9.img_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.9.img_mod.lin.weight : tensor<18432x3072xf16>
    %3246 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.img_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_4485 = torch.constant.int 0
    %int1_4486 = torch.constant.int 1
    %3247 = torch.aten.transpose.int %3246, %int0_4485, %int1_4486 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    // Bias, activation and weight are upcast to f32 (dtype code 6) for the matmul.
    %__auto.sampler.double_blocks.9.img_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.9.img_mod.lin.bias : tensor<18432xf16>
    %3248 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.img_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    %int6_4487 = torch.constant.int 6
    %3249 = torch.prims.convert_element_type %3248, %int6_4487 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_4488 = torch.constant.int 6
    %3250 = torch.prims.convert_element_type %3245, %int6_4488 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_4489 = torch.constant.int 6
    %3251 = torch.prims.convert_element_type %3247, %int6_4489 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %3252 = torch.aten.mm %3250, %3251 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Multiplications by the constant 1 leave the values unchanged.
    %int1_4490 = torch.constant.int 1
    %3253 = torch.aten.mul.Scalar %3252, %int1_4490 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_4491 = torch.constant.int 1
    %3254 = torch.aten.mul.Scalar %3249, %int1_4491 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    // matmul + bias, then back to f16 (dtype code 5).
    %int1_4492 = torch.constant.int 1
    %3255 = torch.aten.add.Tensor %3253, %3254, %int1_4492 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_4493 = torch.constant.int 5
    %3256 = torch.prims.convert_element_type %3255, %int5_4493 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): shape unchanged.
    %int0_4494 = torch.constant.int 0
    %int0_4495 = torch.constant.int 0
    %int9223372036854775807_4496 = torch.constant.int 9223372036854775807
    %int1_4497 = torch.constant.int 1
    %3257 = torch.aten.slice.Tensor %3256, %int0_4494, %int0_4495, %int9223372036854775807_4496, %int1_4497 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a unit dimension at index 1: [1,18432] -> [1,1,18432].
    %int1_4498 = torch.constant.int 1
    %3258 = torch.aten.unsqueeze %3257, %int1_4498 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Full-range slice on dim 2: shape unchanged.
    %int2_4499 = torch.constant.int 2
    %int0_4500 = torch.constant.int 0
    %int9223372036854775807_4501 = torch.constant.int 9223372036854775807
    %int1_4502 = torch.constant.int 1
    %3259 = torch.aten.slice.Tensor %3258, %int2_4499, %int0_4500, %int9223372036854775807_4501, %int1_4502 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Slice [0, 3072) of the last dim -> %3260.
    %int-1_4503 = torch.constant.int -1
    %int0_4504 = torch.constant.int 0
    %int3072_4505 = torch.constant.int 3072
    %int1_4506 = torch.constant.int 1
    %3260 = torch.aten.slice.Tensor %3259, %int-1_4503, %int0_4504, %int3072_4505, %int1_4506 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [3072, 6144) -> %3261.
    %int-1_4507 = torch.constant.int -1
    %int3072_4508 = torch.constant.int 3072
    %int6144_4509 = torch.constant.int 6144
    %int1_4510 = torch.constant.int 1
    %3261 = torch.aten.slice.Tensor %3259, %int-1_4507, %int3072_4508, %int6144_4509, %int1_4510 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [6144, 9216) -> %3262.
    %int-1_4511 = torch.constant.int -1
    %int6144_4512 = torch.constant.int 6144
    %int9216_4513 = torch.constant.int 9216
    %int1_4514 = torch.constant.int 1
    %3262 = torch.aten.slice.Tensor %3259, %int-1_4511, %int6144_4512, %int9216_4513, %int1_4514 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [9216, 12288) -> %3263.
    %int-1_4515 = torch.constant.int -1
    %int9216_4516 = torch.constant.int 9216
    %int12288_4517 = torch.constant.int 12288
    %int1_4518 = torch.constant.int 1
    %3263 = torch.aten.slice.Tensor %3259, %int-1_4515, %int9216_4516, %int12288_4517, %int1_4518 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [12288, 15360) -> %3264.
    %int-1_4519 = torch.constant.int -1
    %int12288_4520 = torch.constant.int 12288
    %int15360_4521 = torch.constant.int 15360
    %int1_4522 = torch.constant.int 1
    %3264 = torch.aten.slice.Tensor %3259, %int-1_4519, %int12288_4520, %int15360_4521, %int1_4522 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [15360, 18432) -> %3265.
    %int-1_4523 = torch.constant.int -1
    %int15360_4524 = torch.constant.int 15360
    %int18432_4525 = torch.constant.int 18432
    %int1_4526 = torch.constant.int 1
    %3265 = torch.aten.slice.Tensor %3259, %int-1_4523, %int15360_4524, %int18432_4525, %int1_4526 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // double_blocks.9 txt_mod.lin: SiLU of %119 (defined earlier), a linear
    // 3072 -> 18432, then a split of the result into six [1,1,3072] slices
    // %3281..%3286. NOTE(review): the slices presumably follow the same
    // shift/scale/gate layout as the img_mod split -- confirm at their uses.
    %3266 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the [18432,3072] weight and transpose it to [3072,18432].
    %__auto.sampler.double_blocks.9.txt_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.9.txt_mod.lin.weight : tensor<18432x3072xf16>
    %3267 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.txt_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_4527 = torch.constant.int 0
    %int1_4528 = torch.constant.int 1
    %3268 = torch.aten.transpose.int %3267, %int0_4527, %int1_4528 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    // Bias, activation and weight are upcast to f32 (dtype code 6) for the matmul.
    %__auto.sampler.double_blocks.9.txt_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.9.txt_mod.lin.bias : tensor<18432xf16>
    %3269 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.txt_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    %int6_4529 = torch.constant.int 6
    %3270 = torch.prims.convert_element_type %3269, %int6_4529 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_4530 = torch.constant.int 6
    %3271 = torch.prims.convert_element_type %3266, %int6_4530 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_4531 = torch.constant.int 6
    %3272 = torch.prims.convert_element_type %3268, %int6_4531 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %3273 = torch.aten.mm %3271, %3272 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Multiplications by the constant 1 leave the values unchanged.
    %int1_4532 = torch.constant.int 1
    %3274 = torch.aten.mul.Scalar %3273, %int1_4532 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_4533 = torch.constant.int 1
    %3275 = torch.aten.mul.Scalar %3270, %int1_4533 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    // matmul + bias, then back to f16 (dtype code 5).
    %int1_4534 = torch.constant.int 1
    %3276 = torch.aten.add.Tensor %3274, %3275, %int1_4534 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_4535 = torch.constant.int 5
    %3277 = torch.prims.convert_element_type %3276, %int5_4535 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): shape unchanged.
    %int0_4536 = torch.constant.int 0
    %int0_4537 = torch.constant.int 0
    %int9223372036854775807_4538 = torch.constant.int 9223372036854775807
    %int1_4539 = torch.constant.int 1
    %3278 = torch.aten.slice.Tensor %3277, %int0_4536, %int0_4537, %int9223372036854775807_4538, %int1_4539 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a unit dimension at index 1: [1,18432] -> [1,1,18432].
    %int1_4540 = torch.constant.int 1
    %3279 = torch.aten.unsqueeze %3278, %int1_4540 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Full-range slice on dim 2: shape unchanged.
    %int2_4541 = torch.constant.int 2
    %int0_4542 = torch.constant.int 0
    %int9223372036854775807_4543 = torch.constant.int 9223372036854775807
    %int1_4544 = torch.constant.int 1
    %3280 = torch.aten.slice.Tensor %3279, %int2_4541, %int0_4542, %int9223372036854775807_4543, %int1_4544 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Slice [0, 3072) of the last dim -> %3281.
    %int-1_4545 = torch.constant.int -1
    %int0_4546 = torch.constant.int 0
    %int3072_4547 = torch.constant.int 3072
    %int1_4548 = torch.constant.int 1
    %3281 = torch.aten.slice.Tensor %3280, %int-1_4545, %int0_4546, %int3072_4547, %int1_4548 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [3072, 6144) -> %3282.
    %int-1_4549 = torch.constant.int -1
    %int3072_4550 = torch.constant.int 3072
    %int6144_4551 = torch.constant.int 6144
    %int1_4552 = torch.constant.int 1
    %3282 = torch.aten.slice.Tensor %3280, %int-1_4549, %int3072_4550, %int6144_4551, %int1_4552 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [6144, 9216) -> %3283.
    %int-1_4553 = torch.constant.int -1
    %int6144_4554 = torch.constant.int 6144
    %int9216_4555 = torch.constant.int 9216
    %int1_4556 = torch.constant.int 1
    %3283 = torch.aten.slice.Tensor %3280, %int-1_4553, %int6144_4554, %int9216_4555, %int1_4556 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [9216, 12288) -> %3284.
    %int-1_4557 = torch.constant.int -1
    %int9216_4558 = torch.constant.int 9216
    %int12288_4559 = torch.constant.int 12288
    %int1_4560 = torch.constant.int 1
    %3284 = torch.aten.slice.Tensor %3280, %int-1_4557, %int9216_4558, %int12288_4559, %int1_4560 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [12288, 15360) -> %3285.
    %int-1_4561 = torch.constant.int -1
    %int12288_4562 = torch.constant.int 12288
    %int15360_4563 = torch.constant.int 15360
    %int1_4564 = torch.constant.int 1
    %3285 = torch.aten.slice.Tensor %3280, %int-1_4561, %int12288_4562, %int15360_4563, %int1_4564 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [15360, 18432) -> %3286.
    %int-1_4565 = torch.constant.int -1
    %int15360_4566 = torch.constant.int 15360
    %int18432_4567 = torch.constant.int 18432
    %int1_4568 = torch.constant.int 1
    %3286 = torch.aten.slice.Tensor %3280, %int-1_4565, %int15360_4566, %int18432_4567, %int1_4568 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %3184 ([1,4096,3072] f16) over its last dim without learned
    // affine parameters, then apply a broadcast scale and shift:
    //   %3296 = (1 + %3261) * normalize(%3184) + %3260
    // NOTE(review): %3184, %3260 and %3261 are defined outside this excerpt;
    // presumably the img hidden states and the img modulation shift/scale.
    // Upcast to f32 (dtype code 6) so the statistics are computed in f32.
    %int6_4569 = torch.constant.int 6
    %3287 = torch.prims.convert_element_type %3184, %int6_4569 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // Variance (result0) and mean (result1) over dim 2, correction = 0,
    // keepdim = true.
    %int2_4570 = torch.constant.int 2
    %3288 = torch.prim.ListConstruct %int2_4570 : (!torch.int) -> !torch.list<int>
    %int0_4571 = torch.constant.int 0
    %true_4572 = torch.constant.bool true
    %result0_4573, %result1_4574 = torch.aten.var_mean.correction %3287, %3288, %int0_4571, %true_4572 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(var + eps), with eps ~= 1e-6.
    %float9.999990e-07_4575 = torch.constant.float 9.9999999999999995E-7
    %int1_4576 = torch.constant.int 1
    %3289 = torch.aten.add.Scalar %result0_4573, %float9.999990e-07_4575, %int1_4576 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %3290 = torch.aten.rsqrt %3289 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // (x - mean) * rsqrt(var + eps); the f16 input minus the f32 mean yields f32.
    %int1_4577 = torch.constant.int 1
    %3291 = torch.aten.sub.Tensor %3184, %result1_4574, %int1_4577 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %3292 = torch.aten.mul.Tensor %3291, %3290 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Back to f16 (dtype code 5).
    %int5_4578 = torch.constant.int 5
    %3293 = torch.prims.convert_element_type %3292, %int5_4578 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // (1 + %3261) * normalized, with %3261 broadcast over the 4096 rows.
    %int1_4579 = torch.constant.int 1
    %int1_4580 = torch.constant.int 1
    %3294 = torch.aten.add.Scalar %3261, %int1_4579, %int1_4580 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %3295 = torch.aten.mul.Tensor %3294, %3293 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    // + %3260 (broadcast), alpha = 1.
    %int1_4581 = torch.constant.int 1
    %3296 = torch.aten.add.Tensor %3295, %3260, %int1_4581 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear projection with double_blocks.9.img_attn.qkv:
    //   %3311 = view(f16(f32(x) @ f32(W^T) + f32(b)), [1,4096,9216])
    // The result is then emitted through a "img_qkv" trace point.
    // Collapse the leading batch dim: [1,4096,3072] -> [4096,3072].
    %int4096_4582 = torch.constant.int 4096
    %int3072_4583 = torch.constant.int 3072
    %3297 = torch.prim.ListConstruct %int4096_4582, %int3072_4583 : (!torch.int, !torch.int) -> !torch.list<int>
    %3298 = torch.aten.view %3296, %3297 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Load the [9216,3072] weight and transpose it to [3072,9216].
    %__auto.sampler.double_blocks.9.img_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.9.img_attn.qkv.weight : tensor<9216x3072xf16>
    %3299 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.img_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_4584 = torch.constant.int 0
    %int1_4585 = torch.constant.int 1
    %3300 = torch.aten.transpose.int %3299, %int0_4584, %int1_4585 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    // Load the [9216] bias.
    %__auto.sampler.double_blocks.9.img_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.9.img_attn.qkv.bias : tensor<9216xf16>
    %3301 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.img_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, activations and weight to f32 (dtype code 6) so the matmul
    // and bias add run in f32.
    %int6_4586 = torch.constant.int 6
    %3302 = torch.prims.convert_element_type %3301, %int6_4586 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_4587 = torch.constant.int 6
    %3303 = torch.prims.convert_element_type %3298, %int6_4587 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_4588 = torch.constant.int 6
    %3304 = torch.prims.convert_element_type %3300, %int6_4588 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %3305 = torch.aten.mm %3303, %3304 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[4096,9216],f32>
    // Both operands are multiplied by the scalar 1 before the add.
    // NOTE(review): looks like the residue of an addmm decomposition with
    // alpha = beta = 1 — confirm against the exporter.
    %int1_4589 = torch.constant.int 1
    %3306 = torch.aten.mul.Scalar %3305, %int1_4589 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int1_4590 = torch.constant.int 1
    %3307 = torch.aten.mul.Scalar %3302, %int1_4590 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_4591 = torch.constant.int 1
    %3308 = torch.aten.add.Tensor %3306, %3307, %int1_4591 : !torch.vtensor<[4096,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    // Downcast to f16 (dtype code 5) and restore the batch dim.
    %int5_4592 = torch.constant.int 5
    %3309 = torch.prims.convert_element_type %3308, %int5_4592 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f16>
    %int1_4593 = torch.constant.int 1
    %int4096_4594 = torch.constant.int 4096
    %int9216_4595 = torch.constant.int 9216
    %3310 = torch.prim.ListConstruct %int1_4593, %int4096_4594, %int9216_4595 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3311 = torch.aten.view %3309, %3310 : !torch.vtensor<[4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,9216],f16>
    // Trace point "img_qkv": convert to a builtin tensor, cast to a fully
    // dynamic shape, query each dim for the trace op, then cast back to the
    // static shape and return to the torch type. The value is unchanged.
    %3312 = torch_c.to_builtin_tensor %3311 : !torch.vtensor<[1,4096,9216],f16> -> tensor<1x4096x9216xf16>
    %cast_4596 = tensor.cast %3312 : tensor<1x4096x9216xf16> to tensor<?x?x?xf16>
    %c0_4597 = arith.constant 0 : index
    %dim_4598 = tensor.dim %cast_4596, %c0_4597 : tensor<?x?x?xf16>
    %c1_4599 = arith.constant 1 : index
    %dim_4600 = tensor.dim %cast_4596, %c1_4599 : tensor<?x?x?xf16>
    %c2_4601 = arith.constant 2 : index
    %dim_4602 = tensor.dim %cast_4596, %c2_4601 : tensor<?x?x?xf16>
    flow.tensor.trace "img_qkv" = [%cast_4596 : tensor<?x?x?xf16>{%dim_4598, %dim_4600, %dim_4602}]
    %cast_4603 = tensor.cast %cast_4596 : tensor<?x?x?xf16> to tensor<1x4096x9216xf16>
    %3313 = torch_c.from_builtin_tensor %cast_4603 : tensor<1x4096x9216xf16> -> !torch.vtensor<[1,4096,9216],f16>
    // Extract the img query heads from the packed qkv tensor %3313 and apply
    // RMS normalization over the last (128-wide) dim, scaled by
    // img_attn.norm.query_norm.scale:
    //   %3328 = f16(q * rsqrt(mean(q^2, -1) + eps)) * scale
    // Split the 9216-wide dim into (3, 24, 128).
    %int1_4604 = torch.constant.int 1
    %int4096_4605 = torch.constant.int 4096
    %int3_4606 = torch.constant.int 3
    %int24_4607 = torch.constant.int 24
    %int128_4608 = torch.constant.int 128
    %3314 = torch.prim.ListConstruct %int1_4604, %int4096_4605, %int3_4606, %int24_4607, %int128_4608 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3315 = torch.aten.view %3313, %3314 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    // Permute (2,0,3,1,4): [1,4096,3,24,128] -> [3,1,24,4096,128].
    %int2_4609 = torch.constant.int 2
    %int0_4610 = torch.constant.int 0
    %int3_4611 = torch.constant.int 3
    %int1_4612 = torch.constant.int 1
    %int4_4613 = torch.constant.int 4
    %3316 = torch.prim.ListConstruct %int2_4609, %int0_4610, %int3_4611, %int1_4612, %int4_4613 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3317 = torch.aten.permute %3315, %3316 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // Index 0 along dim 0 is the slice that receives the query norm below.
    %int0_4614 = torch.constant.int 0
    %int0_4615 = torch.constant.int 0
    %3318 = torch.aten.select.int %3317, %int0_4614, %int0_4615 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Upcast to f32 (dtype code 6) for the normalization arithmetic.
    %int6_4616 = torch.constant.int 6
    %3319 = torch.prims.convert_element_type %3318, %int6_4616 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    // mean(x^2) over the last dim, keepdim = true.
    %int2_4617 = torch.constant.int 2
    %3320 = torch.aten.pow.Tensor_Scalar %3319, %int2_4617 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_4618 = torch.constant.int -1
    %3321 = torch.prim.ListConstruct %int-1_4618 : (!torch.int) -> !torch.list<int>
    %true_4619 = torch.constant.bool true
    %none_4620 = torch.constant.none
    %3322 = torch.aten.mean.dim %3320, %3321, %true_4619, %none_4620 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    // x * rsqrt(mean + eps), with eps ~= 1e-6.
    %float9.999990e-07_4621 = torch.constant.float 9.9999999999999995E-7
    %int1_4622 = torch.constant.int 1
    %3323 = torch.aten.add.Scalar %3322, %float9.999990e-07_4621, %int1_4622 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %3324 = torch.aten.rsqrt %3323 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %3325 = torch.aten.mul.Tensor %3319, %3324 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Downcast to f16 (dtype code 5) before applying the [128] scale.
    %int5_4623 = torch.constant.int 5
    %3326 = torch.prims.convert_element_type %3325, %int5_4623 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.9.img_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.9.img_attn.norm.query_norm.scale : tensor<128xf16>
    %3327 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.img_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %3328 = torch.aten.mul.Tensor %3326, %3327 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // Extract the img key heads from the packed qkv tensor %3313 and apply
    // RMS normalization over the last (128-wide) dim, scaled by
    // img_attn.norm.key_norm.scale. The view/permute repeats the one used for
    // the query path; only the selected index (1) differs.
    %int1_4624 = torch.constant.int 1
    %int4096_4625 = torch.constant.int 4096
    %int3_4626 = torch.constant.int 3
    %int24_4627 = torch.constant.int 24
    %int128_4628 = torch.constant.int 128
    %3329 = torch.prim.ListConstruct %int1_4624, %int4096_4625, %int3_4626, %int24_4627, %int128_4628 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3330 = torch.aten.view %3313, %3329 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    // Permute (2,0,3,1,4): [1,4096,3,24,128] -> [3,1,24,4096,128].
    %int2_4629 = torch.constant.int 2
    %int0_4630 = torch.constant.int 0
    %int3_4631 = torch.constant.int 3
    %int1_4632 = torch.constant.int 1
    %int4_4633 = torch.constant.int 4
    %3331 = torch.prim.ListConstruct %int2_4629, %int0_4630, %int3_4631, %int1_4632, %int4_4633 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3332 = torch.aten.permute %3330, %3331 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // Index 1 along dim 0 is the slice that receives the key norm below.
    %int0_4634 = torch.constant.int 0
    %int1_4635 = torch.constant.int 1
    %3333 = torch.aten.select.int %3332, %int0_4634, %int1_4635 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Upcast to f32 (dtype code 6) for the normalization arithmetic.
    %int6_4636 = torch.constant.int 6
    %3334 = torch.prims.convert_element_type %3333, %int6_4636 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    // mean(x^2) over the last dim, keepdim = true.
    %int2_4637 = torch.constant.int 2
    %3335 = torch.aten.pow.Tensor_Scalar %3334, %int2_4637 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_4638 = torch.constant.int -1
    %3336 = torch.prim.ListConstruct %int-1_4638 : (!torch.int) -> !torch.list<int>
    %true_4639 = torch.constant.bool true
    %none_4640 = torch.constant.none
    %3337 = torch.aten.mean.dim %3335, %3336, %true_4639, %none_4640 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    // x * rsqrt(mean + eps), with eps ~= 1e-6.
    %float9.999990e-07_4641 = torch.constant.float 9.9999999999999995E-7
    %int1_4642 = torch.constant.int 1
    %3338 = torch.aten.add.Scalar %3337, %float9.999990e-07_4641, %int1_4642 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %3339 = torch.aten.rsqrt %3338 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %3340 = torch.aten.mul.Tensor %3334, %3339 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Downcast to f16 (dtype code 5) before applying the [128] scale.
    %int5_4643 = torch.constant.int 5
    %3341 = torch.prims.convert_element_type %3340, %int5_4643 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.9.img_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.9.img_attn.norm.key_norm.scale : tensor<128xf16>
    %3342 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.img_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %3343 = torch.aten.mul.Tensor %3341, %3342 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // f16 -> f16 conversions of the normalized img Q (%3328) and K (%3343);
    // source and result types are identical, so these do not change the data.
    %int5_4644 = torch.constant.int 5
    %3344 = torch.prims.convert_element_type %3328, %int5_4644 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_4645 = torch.constant.int 5
    %3345 = torch.prims.convert_element_type %3343, %int5_4645 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Normalize %3244 ([1,512,3072] f16) over its last dim without learned
    // affine parameters, then apply a broadcast scale and shift:
    //   %3355 = (1 + %3282) * normalize(%3244) + %3281
    // NOTE(review): %3244, %3281 and %3282 are defined outside this excerpt;
    // presumably the txt hidden states and the txt modulation shift/scale.
    // Upcast to f32 (dtype code 6) so the statistics are computed in f32.
    %int6_4646 = torch.constant.int 6
    %3346 = torch.prims.convert_element_type %3244, %int6_4646 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // Variance (result0) and mean (result1) over dim 2, correction = 0,
    // keepdim = true.
    %int2_4647 = torch.constant.int 2
    %3347 = torch.prim.ListConstruct %int2_4647 : (!torch.int) -> !torch.list<int>
    %int0_4648 = torch.constant.int 0
    %true_4649 = torch.constant.bool true
    %result0_4650, %result1_4651 = torch.aten.var_mean.correction %3346, %3347, %int0_4648, %true_4649 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(var + eps), with eps ~= 1e-6.
    %float9.999990e-07_4652 = torch.constant.float 9.9999999999999995E-7
    %int1_4653 = torch.constant.int 1
    %3348 = torch.aten.add.Scalar %result0_4650, %float9.999990e-07_4652, %int1_4653 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %3349 = torch.aten.rsqrt %3348 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // (x - mean) * rsqrt(var + eps); the f16 input minus the f32 mean yields f32.
    %int1_4654 = torch.constant.int 1
    %3350 = torch.aten.sub.Tensor %3244, %result1_4651, %int1_4654 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %3351 = torch.aten.mul.Tensor %3350, %3349 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    // Back to f16 (dtype code 5).
    %int5_4655 = torch.constant.int 5
    %3352 = torch.prims.convert_element_type %3351, %int5_4655 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // (1 + %3282) * normalized, with %3282 broadcast over the 512 rows.
    %int1_4656 = torch.constant.int 1
    %int1_4657 = torch.constant.int 1
    %3353 = torch.aten.add.Scalar %3282, %int1_4656, %int1_4657 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %3354 = torch.aten.mul.Tensor %3353, %3352 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    // + %3281 (broadcast), alpha = 1.
    %int1_4658 = torch.constant.int 1
    %3355 = torch.aten.add.Tensor %3354, %3281, %int1_4658 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Linear projection with double_blocks.9.txt_attn.qkv:
    //   %3370 = view(f16(f32(x) @ f32(W^T) + f32(b)), [1,512,9216])
    // The result is then emitted through a "txt_qkv" trace point.
    // Collapse the leading batch dim: [1,512,3072] -> [512,3072].
    %int512_4659 = torch.constant.int 512
    %int3072_4660 = torch.constant.int 3072
    %3356 = torch.prim.ListConstruct %int512_4659, %int3072_4660 : (!torch.int, !torch.int) -> !torch.list<int>
    %3357 = torch.aten.view %3355, %3356 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Load the [9216,3072] weight and transpose it to [3072,9216].
    %__auto.sampler.double_blocks.9.txt_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.9.txt_attn.qkv.weight : tensor<9216x3072xf16>
    %3358 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.txt_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_4661 = torch.constant.int 0
    %int1_4662 = torch.constant.int 1
    %3359 = torch.aten.transpose.int %3358, %int0_4661, %int1_4662 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    // Load the [9216] bias.
    %__auto.sampler.double_blocks.9.txt_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.9.txt_attn.qkv.bias : tensor<9216xf16>
    %3360 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.txt_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, activations and weight to f32 (dtype code 6) so the matmul
    // and bias add run in f32.
    %int6_4663 = torch.constant.int 6
    %3361 = torch.prims.convert_element_type %3360, %int6_4663 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_4664 = torch.constant.int 6
    %3362 = torch.prims.convert_element_type %3357, %int6_4664 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_4665 = torch.constant.int 6
    %3363 = torch.prims.convert_element_type %3359, %int6_4665 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %3364 = torch.aten.mm %3362, %3363 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[512,9216],f32>
    // Both operands are multiplied by the scalar 1 before the add.
    // NOTE(review): looks like the residue of an addmm decomposition with
    // alpha = beta = 1 — confirm against the exporter.
    %int1_4666 = torch.constant.int 1
    %3365 = torch.aten.mul.Scalar %3364, %int1_4666 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int1_4667 = torch.constant.int 1
    %3366 = torch.aten.mul.Scalar %3361, %int1_4667 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_4668 = torch.constant.int 1
    %3367 = torch.aten.add.Tensor %3365, %3366, %int1_4668 : !torch.vtensor<[512,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    // Downcast to f16 (dtype code 5) and restore the batch dim.
    %int5_4669 = torch.constant.int 5
    %3368 = torch.prims.convert_element_type %3367, %int5_4669 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f16>
    %int1_4670 = torch.constant.int 1
    %int512_4671 = torch.constant.int 512
    %int9216_4672 = torch.constant.int 9216
    %3369 = torch.prim.ListConstruct %int1_4670, %int512_4671, %int9216_4672 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3370 = torch.aten.view %3368, %3369 : !torch.vtensor<[512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,9216],f16>
    // Trace point "txt_qkv": convert to a builtin tensor, cast to a fully
    // dynamic shape, query each dim for the trace op, then cast back to the
    // static shape and return to the torch type. The value is unchanged.
    %3371 = torch_c.to_builtin_tensor %3370 : !torch.vtensor<[1,512,9216],f16> -> tensor<1x512x9216xf16>
    %cast_4673 = tensor.cast %3371 : tensor<1x512x9216xf16> to tensor<?x?x?xf16>
    %c0_4674 = arith.constant 0 : index
    %dim_4675 = tensor.dim %cast_4673, %c0_4674 : tensor<?x?x?xf16>
    %c1_4676 = arith.constant 1 : index
    %dim_4677 = tensor.dim %cast_4673, %c1_4676 : tensor<?x?x?xf16>
    %c2_4678 = arith.constant 2 : index
    %dim_4679 = tensor.dim %cast_4673, %c2_4678 : tensor<?x?x?xf16>
    flow.tensor.trace "txt_qkv" = [%cast_4673 : tensor<?x?x?xf16>{%dim_4675, %dim_4677, %dim_4679}]
    %cast_4680 = tensor.cast %cast_4673 : tensor<?x?x?xf16> to tensor<1x512x9216xf16>
    %3372 = torch_c.from_builtin_tensor %cast_4680 : tensor<1x512x9216xf16> -> !torch.vtensor<[1,512,9216],f16>
    // Extract the txt query heads from the packed qkv tensor %3372 and apply
    // RMS normalization over the last (128-wide) dim, scaled by
    // txt_attn.norm.query_norm.scale:
    //   %3387 = f16(q * rsqrt(mean(q^2, -1) + eps)) * scale
    // Split the 9216-wide dim into (3, 24, 128).
    %int1_4681 = torch.constant.int 1
    %int512_4682 = torch.constant.int 512
    %int3_4683 = torch.constant.int 3
    %int24_4684 = torch.constant.int 24
    %int128_4685 = torch.constant.int 128
    %3373 = torch.prim.ListConstruct %int1_4681, %int512_4682, %int3_4683, %int24_4684, %int128_4685 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3374 = torch.aten.view %3372, %3373 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    // Permute (2,0,3,1,4): [1,512,3,24,128] -> [3,1,24,512,128].
    %int2_4686 = torch.constant.int 2
    %int0_4687 = torch.constant.int 0
    %int3_4688 = torch.constant.int 3
    %int1_4689 = torch.constant.int 1
    %int4_4690 = torch.constant.int 4
    %3375 = torch.prim.ListConstruct %int2_4686, %int0_4687, %int3_4688, %int1_4689, %int4_4690 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3376 = torch.aten.permute %3374, %3375 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // Index 0 along dim 0 is the slice that receives the query norm below.
    %int0_4691 = torch.constant.int 0
    %int0_4692 = torch.constant.int 0
    %3377 = torch.aten.select.int %3376, %int0_4691, %int0_4692 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Upcast to f32 (dtype code 6) for the normalization arithmetic.
    %int6_4693 = torch.constant.int 6
    %3378 = torch.prims.convert_element_type %3377, %int6_4693 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    // mean(x^2) over the last dim, keepdim = true.
    %int2_4694 = torch.constant.int 2
    %3379 = torch.aten.pow.Tensor_Scalar %3378, %int2_4694 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_4695 = torch.constant.int -1
    %3380 = torch.prim.ListConstruct %int-1_4695 : (!torch.int) -> !torch.list<int>
    %true_4696 = torch.constant.bool true
    %none_4697 = torch.constant.none
    %3381 = torch.aten.mean.dim %3379, %3380, %true_4696, %none_4697 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    // x * rsqrt(mean + eps), with eps ~= 1e-6.
    %float9.999990e-07_4698 = torch.constant.float 9.9999999999999995E-7
    %int1_4699 = torch.constant.int 1
    %3382 = torch.aten.add.Scalar %3381, %float9.999990e-07_4698, %int1_4699 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %3383 = torch.aten.rsqrt %3382 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %3384 = torch.aten.mul.Tensor %3378, %3383 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    // Downcast to f16 (dtype code 5) before applying the [128] scale.
    %int5_4700 = torch.constant.int 5
    %3385 = torch.prims.convert_element_type %3384, %int5_4700 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.9.txt_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.9.txt_attn.norm.query_norm.scale : tensor<128xf16>
    %3386 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.txt_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %3387 = torch.aten.mul.Tensor %3385, %3386 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // Extract the txt key heads from the packed qkv tensor %3372 and apply
    // RMS normalization over the last (128-wide) dim, scaled by
    // txt_attn.norm.key_norm.scale. The view/permute repeats the one used for
    // the query path; only the selected index (1) differs.
    %int1_4701 = torch.constant.int 1
    %int512_4702 = torch.constant.int 512
    %int3_4703 = torch.constant.int 3
    %int24_4704 = torch.constant.int 24
    %int128_4705 = torch.constant.int 128
    %3388 = torch.prim.ListConstruct %int1_4701, %int512_4702, %int3_4703, %int24_4704, %int128_4705 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3389 = torch.aten.view %3372, %3388 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    // Permute (2,0,3,1,4): [1,512,3,24,128] -> [3,1,24,512,128].
    %int2_4706 = torch.constant.int 2
    %int0_4707 = torch.constant.int 0
    %int3_4708 = torch.constant.int 3
    %int1_4709 = torch.constant.int 1
    %int4_4710 = torch.constant.int 4
    %3390 = torch.prim.ListConstruct %int2_4706, %int0_4707, %int3_4708, %int1_4709, %int4_4710 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3391 = torch.aten.permute %3389, %3390 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // Index 1 along dim 0 is the slice that receives the key norm below.
    %int0_4711 = torch.constant.int 0
    %int1_4712 = torch.constant.int 1
    %3392 = torch.aten.select.int %3391, %int0_4711, %int1_4712 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Upcast to f32 (dtype code 6) for the normalization arithmetic.
    %int6_4713 = torch.constant.int 6
    %3393 = torch.prims.convert_element_type %3392, %int6_4713 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    // mean(x^2) over the last dim, keepdim = true.
    %int2_4714 = torch.constant.int 2
    %3394 = torch.aten.pow.Tensor_Scalar %3393, %int2_4714 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_4715 = torch.constant.int -1
    %3395 = torch.prim.ListConstruct %int-1_4715 : (!torch.int) -> !torch.list<int>
    %true_4716 = torch.constant.bool true
    %none_4717 = torch.constant.none
    %3396 = torch.aten.mean.dim %3394, %3395, %true_4716, %none_4717 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    // x * rsqrt(mean + eps), with eps ~= 1e-6.
    %float9.999990e-07_4718 = torch.constant.float 9.9999999999999995E-7
    %int1_4719 = torch.constant.int 1
    %3397 = torch.aten.add.Scalar %3396, %float9.999990e-07_4718, %int1_4719 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %3398 = torch.aten.rsqrt %3397 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %3399 = torch.aten.mul.Tensor %3393, %3398 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    // Downcast to f16 (dtype code 5) before applying the [128] scale.
    %int5_4720 = torch.constant.int 5
    %3400 = torch.prims.convert_element_type %3399, %int5_4720 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.9.txt_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.9.txt_attn.norm.key_norm.scale : tensor<128xf16>
    %3401 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.txt_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %3402 = torch.aten.mul.Tensor %3400, %3401 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // f16 -> f16 conversions of the normalized txt Q (%3387) and K (%3402);
    // source and result types are identical, so these do not change the data.
    %int5_4721 = torch.constant.int 5
    %3403 = torch.prims.convert_element_type %3387, %int5_4721 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int5_4722 = torch.constant.int 5
    %3404 = torch.prims.convert_element_type %3402, %int5_4722 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Joint Q: concatenate txt (512) then img (4096) along dim 2 -> 4608.
    %3405 = torch.prim.ListConstruct %3403, %3344 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_4723 = torch.constant.int 2
    %3406 = torch.aten.cat %3405, %int2_4723 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Joint K: same txt-then-img ordering along dim 2.
    %3407 = torch.prim.ListConstruct %3404, %3345 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_4724 = torch.constant.int 2
    %3408 = torch.aten.cat %3407, %int2_4724 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Build the joint V tensor: take index 2 of the (3, ...) split of both the
    // txt (%3372) and img (%3313) packed qkv tensors and concatenate them,
    // txt first, along dim 2. No normalization is applied on this path.
    // txt: [1,512,9216] -> [1,512,3,24,128] -> [3,1,24,512,128] -> select 2.
    %int1_4725 = torch.constant.int 1
    %int512_4726 = torch.constant.int 512
    %int3_4727 = torch.constant.int 3
    %int24_4728 = torch.constant.int 24
    %int128_4729 = torch.constant.int 128
    %3409 = torch.prim.ListConstruct %int1_4725, %int512_4726, %int3_4727, %int24_4728, %int128_4729 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3410 = torch.aten.view %3372, %3409 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_4730 = torch.constant.int 2
    %int0_4731 = torch.constant.int 0
    %int3_4732 = torch.constant.int 3
    %int1_4733 = torch.constant.int 1
    %int4_4734 = torch.constant.int 4
    %3411 = torch.prim.ListConstruct %int2_4730, %int0_4731, %int3_4732, %int1_4733, %int4_4734 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3412 = torch.aten.permute %3410, %3411 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    %int0_4735 = torch.constant.int 0
    %int2_4736 = torch.constant.int 2
    %3413 = torch.aten.select.int %3412, %int0_4735, %int2_4736 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // img: [1,4096,9216] -> [1,4096,3,24,128] -> [3,1,24,4096,128] -> select 2.
    %int1_4737 = torch.constant.int 1
    %int4096_4738 = torch.constant.int 4096
    %int3_4739 = torch.constant.int 3
    %int24_4740 = torch.constant.int 24
    %int128_4741 = torch.constant.int 128
    %3414 = torch.prim.ListConstruct %int1_4737, %int4096_4738, %int3_4739, %int24_4740, %int128_4741 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3415 = torch.aten.view %3313, %3414 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_4742 = torch.constant.int 2
    %int0_4743 = torch.constant.int 0
    %int3_4744 = torch.constant.int 3
    %int1_4745 = torch.constant.int 1
    %int4_4746 = torch.constant.int 4
    %3416 = torch.prim.ListConstruct %int2_4742, %int0_4743, %int3_4744, %int1_4745, %int4_4746 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3417 = torch.aten.permute %3415, %3416 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    %int0_4747 = torch.constant.int 0
    %int2_4748 = torch.constant.int 2
    %3418 = torch.aten.select.int %3417, %int0_4747, %int2_4748 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Joint V: txt (512) then img (4096) along dim 2 -> 4608.
    %3419 = torch.prim.ListConstruct %3413, %3418 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_4749 = torch.constant.int 2
    %3420 = torch.aten.cat %3419, %int2_4749 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Trace points for the joint Q, K and V tensors. Each one converts the
    // torch value to a builtin tensor, casts it to a fully dynamic shape,
    // queries the four dims for the trace op, then casts back to the static
    // [1,24,4608,128] shape and returns to the torch type. Values are unchanged.
    // Trace "q" (%3406 -> %3422).
    %3421 = torch_c.to_builtin_tensor %3406 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_4750 = tensor.cast %3421 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_4751 = arith.constant 0 : index
    %dim_4752 = tensor.dim %cast_4750, %c0_4751 : tensor<?x?x?x?xf16>
    %c1_4753 = arith.constant 1 : index
    %dim_4754 = tensor.dim %cast_4750, %c1_4753 : tensor<?x?x?x?xf16>
    %c2_4755 = arith.constant 2 : index
    %dim_4756 = tensor.dim %cast_4750, %c2_4755 : tensor<?x?x?x?xf16>
    %c3_4757 = arith.constant 3 : index
    %dim_4758 = tensor.dim %cast_4750, %c3_4757 : tensor<?x?x?x?xf16>
    flow.tensor.trace "q" = [%cast_4750 : tensor<?x?x?x?xf16>{%dim_4752, %dim_4754, %dim_4756, %dim_4758}]
    %cast_4759 = tensor.cast %cast_4750 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %3422 = torch_c.from_builtin_tensor %cast_4759 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Trace "k" (%3408 -> %3424).
    %3423 = torch_c.to_builtin_tensor %3408 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_4760 = tensor.cast %3423 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_4761 = arith.constant 0 : index
    %dim_4762 = tensor.dim %cast_4760, %c0_4761 : tensor<?x?x?x?xf16>
    %c1_4763 = arith.constant 1 : index
    %dim_4764 = tensor.dim %cast_4760, %c1_4763 : tensor<?x?x?x?xf16>
    %c2_4765 = arith.constant 2 : index
    %dim_4766 = tensor.dim %cast_4760, %c2_4765 : tensor<?x?x?x?xf16>
    %c3_4767 = arith.constant 3 : index
    %dim_4768 = tensor.dim %cast_4760, %c3_4767 : tensor<?x?x?x?xf16>
    flow.tensor.trace "k" = [%cast_4760 : tensor<?x?x?x?xf16>{%dim_4762, %dim_4764, %dim_4766, %dim_4768}]
    %cast_4769 = tensor.cast %cast_4760 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %3424 = torch_c.from_builtin_tensor %cast_4769 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Trace "v" (%3420 -> %3426).
    %3425 = torch_c.to_builtin_tensor %3420 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_4770 = tensor.cast %3425 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_4771 = arith.constant 0 : index
    %dim_4772 = tensor.dim %cast_4770, %c0_4771 : tensor<?x?x?x?xf16>
    %c1_4773 = arith.constant 1 : index
    %dim_4774 = tensor.dim %cast_4770, %c1_4773 : tensor<?x?x?x?xf16>
    %c2_4775 = arith.constant 2 : index
    %dim_4776 = tensor.dim %cast_4770, %c2_4775 : tensor<?x?x?x?xf16>
    %c3_4777 = arith.constant 3 : index
    %dim_4778 = tensor.dim %cast_4770, %c3_4777 : tensor<?x?x?x?xf16>
    flow.tensor.trace "v" = [%cast_4770 : tensor<?x?x?x?xf16>{%dim_4772, %dim_4774, %dim_4776, %dim_4778}]
    %cast_4779 = tensor.cast %cast_4770 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %3426 = torch_c.from_builtin_tensor %cast_4779 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Upcast the joint Q (%3422) and K (%3424) to f32 (dtype code 6) and view
    // the 128-wide last dim as (64, 1, 2): 64 pairs of adjacent elements, with
    // a singleton dim inserted before the pair. The -1 entry in the view shape
    // is inferred as 64, per the result type.
    // Q: [1,24,4608,128] -> [1,24,4608,64,1,2].
    %int6_4780 = torch.constant.int 6
    %3427 = torch.prims.convert_element_type %3422, %int6_4780 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_4781 = torch.constant.int 1
    %int24_4782 = torch.constant.int 24
    %int4608_4783 = torch.constant.int 4608
    %int-1_4784 = torch.constant.int -1
    %int1_4785 = torch.constant.int 1
    %int2_4786 = torch.constant.int 2
    %3428 = torch.prim.ListConstruct %int1_4781, %int24_4782, %int4608_4783, %int-1_4784, %int1_4785, %int2_4786 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3429 = torch.aten.view %3427, %3428 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // K: [1,24,4608,128] -> [1,24,4608,64,1,2].
    %int6_4787 = torch.constant.int 6
    %3430 = torch.prims.convert_element_type %3424, %int6_4787 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_4788 = torch.constant.int 1
    %int24_4789 = torch.constant.int 24
    %int4608_4790 = torch.constant.int 4608
    %int-1_4791 = torch.constant.int -1
    %int1_4792 = torch.constant.int 1
    %int2_4793 = torch.constant.int 2
    %3431 = torch.prim.ListConstruct %int1_4788, %int24_4789, %int4608_4790, %int-1_4791, %int1_4792, %int2_4793 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3432 = torch.aten.view %3430, %3431 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    %int5_4794 = torch.constant.int 5
    %int0_4795 = torch.constant.int 0
    %3433 = torch.aten.select.int %211, %int5_4794, %int0_4795 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_4796 = torch.constant.int 5
    %int0_4797 = torch.constant.int 0
    %3434 = torch.aten.select.int %3429, %int5_4796, %int0_4797 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %3435 = torch.aten.mul.Tensor %3433, %3434 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_4798 = torch.constant.int 5
    %int1_4799 = torch.constant.int 1
    %3436 = torch.aten.select.int %211, %int5_4798, %int1_4799 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_4800 = torch.constant.int 5
    %int1_4801 = torch.constant.int 1
    %3437 = torch.aten.select.int %3429, %int5_4800, %int1_4801 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %3438 = torch.aten.mul.Tensor %3436, %3437 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_4802 = torch.constant.int 1
    %3439 = torch.aten.add.Tensor %3435, %3438, %int1_4802 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_4803 = torch.constant.int 5
    %int0_4804 = torch.constant.int 0
    %3440 = torch.aten.select.int %211, %int5_4803, %int0_4804 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_4805 = torch.constant.int 5
    %int0_4806 = torch.constant.int 0
    %3441 = torch.aten.select.int %3432, %int5_4805, %int0_4806 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %3442 = torch.aten.mul.Tensor %3440, %3441 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_4807 = torch.constant.int 5
    %int1_4808 = torch.constant.int 1
    %3443 = torch.aten.select.int %211, %int5_4807, %int1_4808 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_4809 = torch.constant.int 5
    %int1_4810 = torch.constant.int 1
    %3444 = torch.aten.select.int %3432, %int5_4809, %int1_4810 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %3445 = torch.aten.mul.Tensor %3443, %3444 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_4811 = torch.constant.int 1
    %3446 = torch.aten.add.Tensor %3442, %3445, %int1_4811 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_4812 = torch.constant.int 1
    %int24_4813 = torch.constant.int 24
    %int4608_4814 = torch.constant.int 4608
    %int128_4815 = torch.constant.int 128
    %3447 = torch.prim.ListConstruct %int1_4812, %int24_4813, %int4608_4814, %int128_4815 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3448 = torch.aten.view %3439, %3447 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_4816 = torch.constant.int 5
    %3449 = torch.prims.convert_element_type %3448, %int5_4816 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_4817 = torch.constant.int 1
    %int24_4818 = torch.constant.int 24
    %int4608_4819 = torch.constant.int 4608
    %int128_4820 = torch.constant.int 128
    %3450 = torch.prim.ListConstruct %int1_4817, %int24_4818, %int4608_4819, %int128_4820 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3451 = torch.aten.view %3446, %3450 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_4821 = torch.constant.int 5
    %3452 = torch.prims.convert_element_type %3451, %int5_4821 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled dot-product attention over 24 heads x 4608 positions x 128 dims.
    // Tensor operands in order: %3449, %3452, %3426 (query, key, value in the aten
    // signature). The remaining operands are 0.0, false, none, none
    // (dropout_p, is_causal, attn_mask, scale in that signature).
    // Only result #0 is consumed in the lines visible here.
    %float0.000000e00_4822 = torch.constant.float 0.000000e+00
    %false_4823 = torch.constant.bool false
    %none_4824 = torch.constant.none
    %none_4825 = torch.constant.none
    %3453:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%3449, %3452, %3426, %float0.000000e00_4822, %false_4823, %none_4824, %none_4825) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2: [1,24,4608,128] -> [1,4608,24,128].
    %int0_4826 = torch.constant.int 0
    %int2_4827 = torch.constant.int 2
    %int1_4828 = torch.constant.int 1
    %int3_4829 = torch.constant.int 3
    %3454 = torch.prim.ListConstruct %int0_4826, %int2_4827, %int1_4828, %int3_4829 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3455 = torch.aten.permute %3453#0, %3454 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the trailing 24 x 128 into a single dim of 3072.
    %int1_4830 = torch.constant.int 1
    %int4608_4831 = torch.constant.int 4608
    %int3072_4832 = torch.constant.int 3072
    %3456 = torch.prim.ListConstruct %int1_4830, %int4608_4831, %int3072_4832 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3457 = torch.aten.view %3455, %3456 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Split the attention output %3457 along dim 1 into two pieces:
    //   %3459 = positions [0, 512)     -> input to txt_attn.proj further down
    //   %3461 = positions [512, 4608)  -> input to img_attn.proj further down
    // Each split is preceded by a full-range slice on dim 0 (start 0, end INT64_MAX,
    // step 1), which leaves the shape unchanged.
    %int0_4833 = torch.constant.int 0
    %int0_4834 = torch.constant.int 0
    %int9223372036854775807_4835 = torch.constant.int 9223372036854775807
    %int1_4836 = torch.constant.int 1
    %3458 = torch.aten.slice.Tensor %3457, %int0_4833, %int0_4834, %int9223372036854775807_4835, %int1_4836 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_4837 = torch.constant.int 1
    %int0_4838 = torch.constant.int 0
    %int512_4839 = torch.constant.int 512
    %int1_4840 = torch.constant.int 1
    %3459 = torch.aten.slice.Tensor %3458, %int1_4837, %int0_4838, %int512_4839, %int1_4840 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %int0_4841 = torch.constant.int 0
    %int0_4842 = torch.constant.int 0
    %int9223372036854775807_4843 = torch.constant.int 9223372036854775807
    %int1_4844 = torch.constant.int 1
    %3460 = torch.aten.slice.Tensor %3457, %int0_4841, %int0_4842, %int9223372036854775807_4843, %int1_4844 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_4845 = torch.constant.int 1
    %int512_4846 = torch.constant.int 512
    %int9223372036854775807_4847 = torch.constant.int 9223372036854775807
    %int1_4848 = torch.constant.int 1
    %3461 = torch.aten.slice.Tensor %3460, %int1_4845, %int512_4846, %int9223372036854775807_4847, %int1_4848 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Drop the leading unit dim so the 4096-position piece can feed a 2-D matmul.
    %int4096_4849 = torch.constant.int 4096
    %int3072_4850 = torch.constant.int 3072
    %3462 = torch.prim.ListConstruct %int4096_4849, %int3072_4850 : (!torch.int, !torch.int) -> !torch.list<int>
    %3463 = torch.aten.view %3461, %3462 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Linear layer with parameters double_blocks.9.img_attn.proj applied to %3463:
    // load weight [3072,3072] and bias [3072], transpose the weight, upcast the
    // input, weight and bias to f32, matmul, add the bias, then cast back to f16.
    %__auto.sampler.double_blocks.9.img_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.9.img_attn.proj.weight : tensor<3072x3072xf16>
    %3464 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.img_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_4851 = torch.constant.int 0
    %int1_4852 = torch.constant.int 1
    %3465 = torch.aten.transpose.int %3464, %int0_4851, %int1_4852 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.9.img_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.9.img_attn.proj.bias : tensor<3072xf16>
    %3466 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.img_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_4853 = torch.constant.int 6
    %3467 = torch.prims.convert_element_type %3466, %int6_4853 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_4854 = torch.constant.int 6
    %3468 = torch.prims.convert_element_type %3463, %int6_4854 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_4855 = torch.constant.int 6
    %3469 = torch.prims.convert_element_type %3465, %int6_4855 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %3470 = torch.aten.mm %3468, %3469 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Both scalar multiplies use the constant 1, so they leave the values unchanged
    // (presumably alpha/beta left over from an addmm decomposition - not confirmed here).
    %int1_4856 = torch.constant.int 1
    %3471 = torch.aten.mul.Scalar %3470, %int1_4856 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_4857 = torch.constant.int 1
    %3472 = torch.aten.mul.Scalar %3467, %int1_4857 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_4858 = torch.constant.int 1
    %3473 = torch.aten.add.Tensor %3471, %3472, %int1_4858 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_4859 = torch.constant.int 5
    %3474 = torch.prims.convert_element_type %3473, %int5_4859 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    // Restore the leading unit dim, then %3478 = %3184 + %3262 * projection
    // (%3262 is [1,1,3072] and broadcasts over the 4096 positions).
    %int1_4860 = torch.constant.int 1
    %int4096_4861 = torch.constant.int 4096
    %int3072_4862 = torch.constant.int 3072
    %3475 = torch.prim.ListConstruct %int1_4860, %int4096_4861, %int3072_4862 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3476 = torch.aten.view %3474, %3475 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    %3477 = torch.aten.mul.Tensor %3262, %3476 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_4863 = torch.constant.int 1
    %3478 = torch.aten.add.Tensor %3184, %3477, %int1_4863 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Normalize %3478 over its last dim and apply a scale/shift:
    //   %3488 = (%3264 + 1) * normalize(%3478) + %3263
    // Multiplier for the normalized tensor: %3479 = %3264 + 1.
    %int1_4864 = torch.constant.int 1
    %int1_4865 = torch.constant.int 1
    %3479 = torch.aten.add.Scalar %3264, %int1_4864, %int1_4865 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Variance (result0) and mean (result1) are computed in f32 over dim 2 with
    // correction 0 and keepdim = true.
    %int6_4866 = torch.constant.int 6
    %3480 = torch.prims.convert_element_type %3478, %int6_4866 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %int2_4867 = torch.constant.int 2
    %3481 = torch.prim.ListConstruct %int2_4867 : (!torch.int) -> !torch.list<int>
    %int0_4868 = torch.constant.int 0
    %true_4869 = torch.constant.bool true
    %result0_4870, %result1_4871 = torch.aten.var_mean.correction %3480, %3481, %int0_4868, %true_4869 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(variance + epsilon), with epsilon approximately 1e-6.
    %float9.999990e-07_4872 = torch.constant.float 9.9999999999999995E-7
    %int1_4873 = torch.constant.int 1
    %3482 = torch.aten.add.Scalar %result0_4870, %float9.999990e-07_4872, %int1_4873 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %3483 = torch.aten.rsqrt %3482 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // The subtraction takes the f16 tensor %3478 (not the f32 copy %3480) and an
    // f32 mean; the declared result type is f32.
    %int1_4874 = torch.constant.int 1
    %3484 = torch.aten.sub.Tensor %3478, %result1_4871, %int1_4874 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %3485 = torch.aten.mul.Tensor %3484, %3483 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Back to f16, then apply the multiplier %3479 and the additive term %3263.
    %int5_4875 = torch.constant.int 5
    %3486 = torch.prims.convert_element_type %3485, %int5_4875 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %3487 = torch.aten.mul.Tensor %3479, %3486 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_4876 = torch.constant.int 1
    %3488 = torch.aten.add.Tensor %3487, %3263, %int1_4876 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Two-layer MLP with parameters double_blocks.9.img_mlp.{0,2} applied to %3488:
    //   linear 3072 -> 12288, GELU ("tanh" approximation), linear 12288 -> 3072,
    // followed by %3521 = %3478 + %3265 * mlp_output.
    // Each linear is computed in f32 (weight transposed, bias added) and cast back
    // to f16. The mul.Scalar ops multiply by the constant 1 and change no values.
    //
    // Flatten to 2-D for the first matmul.
    %int4096_4877 = torch.constant.int 4096
    %int3072_4878 = torch.constant.int 3072
    %3489 = torch.prim.ListConstruct %int4096_4877, %int3072_4878 : (!torch.int, !torch.int) -> !torch.list<int>
    %3490 = torch.aten.view %3488, %3489 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // First linear: img_mlp.0 (weight [12288,3072], bias [12288]).
    %__auto.sampler.double_blocks.9.img_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.9.img_mlp.0.weight : tensor<12288x3072xf16>
    %3491 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.img_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_4879 = torch.constant.int 0
    %int1_4880 = torch.constant.int 1
    %3492 = torch.aten.transpose.int %3491, %int0_4879, %int1_4880 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.9.img_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.9.img_mlp.0.bias : tensor<12288xf16>
    %3493 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.img_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    %int6_4881 = torch.constant.int 6
    %3494 = torch.prims.convert_element_type %3493, %int6_4881 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_4882 = torch.constant.int 6
    %3495 = torch.prims.convert_element_type %3490, %int6_4882 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_4883 = torch.constant.int 6
    %3496 = torch.prims.convert_element_type %3492, %int6_4883 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %3497 = torch.aten.mm %3495, %3496 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[4096,12288],f32>
    %int1_4884 = torch.constant.int 1
    %3498 = torch.aten.mul.Scalar %3497, %int1_4884 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int1_4885 = torch.constant.int 1
    %3499 = torch.aten.mul.Scalar %3494, %int1_4885 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_4886 = torch.constant.int 1
    %3500 = torch.aten.add.Tensor %3498, %3499, %int1_4886 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int5_4887 = torch.constant.int 5
    %3501 = torch.prims.convert_element_type %3500, %int5_4887 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f16>
    // Activation is applied on the 3-D f16 view, then flattened again.
    %int1_4888 = torch.constant.int 1
    %int4096_4889 = torch.constant.int 4096
    %int12288_4890 = torch.constant.int 12288
    %3502 = torch.prim.ListConstruct %int1_4888, %int4096_4889, %int12288_4890 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3503 = torch.aten.view %3501, %3502 : !torch.vtensor<[4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,12288],f16>
    %str_4891 = torch.constant.str "tanh"
    %3504 = torch.aten.gelu %3503, %str_4891 : !torch.vtensor<[1,4096,12288],f16>, !torch.str -> !torch.vtensor<[1,4096,12288],f16>
    %int4096_4892 = torch.constant.int 4096
    %int12288_4893 = torch.constant.int 12288
    %3505 = torch.prim.ListConstruct %int4096_4892, %int12288_4893 : (!torch.int, !torch.int) -> !torch.list<int>
    %3506 = torch.aten.view %3504, %3505 : !torch.vtensor<[1,4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[4096,12288],f16>
    // Second linear: img_mlp.2 (weight [3072,12288], bias [3072]).
    %__auto.sampler.double_blocks.9.img_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.9.img_mlp.2.weight : tensor<3072x12288xf16>
    %3507 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.img_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_4894 = torch.constant.int 0
    %int1_4895 = torch.constant.int 1
    %3508 = torch.aten.transpose.int %3507, %int0_4894, %int1_4895 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.9.img_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.9.img_mlp.2.bias : tensor<3072xf16>
    %3509 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.img_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_4896 = torch.constant.int 6
    %3510 = torch.prims.convert_element_type %3509, %int6_4896 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_4897 = torch.constant.int 6
    %3511 = torch.prims.convert_element_type %3506, %int6_4897 : !torch.vtensor<[4096,12288],f16>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int6_4898 = torch.constant.int 6
    %3512 = torch.prims.convert_element_type %3508, %int6_4898 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %3513 = torch.aten.mm %3511, %3512 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    %int1_4899 = torch.constant.int 1
    %3514 = torch.aten.mul.Scalar %3513, %int1_4899 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_4900 = torch.constant.int 1
    %3515 = torch.aten.mul.Scalar %3510, %int1_4900 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_4901 = torch.constant.int 1
    %3516 = torch.aten.add.Tensor %3514, %3515, %int1_4901 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_4902 = torch.constant.int 5
    %3517 = torch.prims.convert_element_type %3516, %int5_4902 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    // Restore the leading unit dim, scale by %3265 ([1,1,3072]) and add to %3478.
    %int1_4903 = torch.constant.int 1
    %int4096_4904 = torch.constant.int 4096
    %int3072_4905 = torch.constant.int 3072
    %3518 = torch.prim.ListConstruct %int1_4903, %int4096_4904, %int3072_4905 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3519 = torch.aten.view %3517, %3518 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    %3520 = torch.aten.mul.Tensor %3265, %3519 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_4906 = torch.constant.int 1
    %3521 = torch.aten.add.Tensor %3478, %3520, %int1_4906 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer with parameters double_blocks.9.txt_attn.proj applied to the
    // 512-position attention slice %3459, followed by
    //   %3538 = %3244 + %3283 * projection.
    // Flatten to 2-D for the matmul.
    %int512_4907 = torch.constant.int 512
    %int3072_4908 = torch.constant.int 3072
    %3522 = torch.prim.ListConstruct %int512_4907, %int3072_4908 : (!torch.int, !torch.int) -> !torch.list<int>
    %3523 = torch.aten.view %3459, %3522 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Load weight [3072,3072] and bias [3072]; transpose the weight.
    %__auto.sampler.double_blocks.9.txt_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.9.txt_attn.proj.weight : tensor<3072x3072xf16>
    %3524 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.txt_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_4909 = torch.constant.int 0
    %int1_4910 = torch.constant.int 1
    %3525 = torch.aten.transpose.int %3524, %int0_4909, %int1_4910 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.9.txt_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.9.txt_attn.proj.bias : tensor<3072xf16>
    %3526 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.txt_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast to f32, matmul, add bias (the mul.Scalar ops multiply by 1), downcast to f16.
    %int6_4911 = torch.constant.int 6
    %3527 = torch.prims.convert_element_type %3526, %int6_4911 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_4912 = torch.constant.int 6
    %3528 = torch.prims.convert_element_type %3523, %int6_4912 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_4913 = torch.constant.int 6
    %3529 = torch.prims.convert_element_type %3525, %int6_4913 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %3530 = torch.aten.mm %3528, %3529 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[512,3072],f32>
    %int1_4914 = torch.constant.int 1
    %3531 = torch.aten.mul.Scalar %3530, %int1_4914 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_4915 = torch.constant.int 1
    %3532 = torch.aten.mul.Scalar %3527, %int1_4915 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_4916 = torch.constant.int 1
    %3533 = torch.aten.add.Tensor %3531, %3532, %int1_4916 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_4917 = torch.constant.int 5
    %3534 = torch.prims.convert_element_type %3533, %int5_4917 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    // Restore the leading unit dim, scale by %3283 ([1,1,3072]) and add to %3244.
    %int1_4918 = torch.constant.int 1
    %int512_4919 = torch.constant.int 512
    %int3072_4920 = torch.constant.int 3072
    %3535 = torch.prim.ListConstruct %int1_4918, %int512_4919, %int3072_4920 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3536 = torch.aten.view %3534, %3535 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    %3537 = torch.aten.mul.Tensor %3283, %3536 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_4921 = torch.constant.int 1
    %3538 = torch.aten.add.Tensor %3244, %3537, %int1_4921 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Normalize %3538 over its last dim and apply a scale/shift:
    //   %3548 = (%3285 + 1) * normalize(%3538) + %3284
    // Multiplier for the normalized tensor: %3539 = %3285 + 1.
    %int1_4922 = torch.constant.int 1
    %int1_4923 = torch.constant.int 1
    %3539 = torch.aten.add.Scalar %3285, %int1_4922, %int1_4923 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Variance (result0) and mean (result1) are computed in f32 over dim 2 with
    // correction 0 and keepdim = true.
    %int6_4924 = torch.constant.int 6
    %3540 = torch.prims.convert_element_type %3538, %int6_4924 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %int2_4925 = torch.constant.int 2
    %3541 = torch.prim.ListConstruct %int2_4925 : (!torch.int) -> !torch.list<int>
    %int0_4926 = torch.constant.int 0
    %true_4927 = torch.constant.bool true
    %result0_4928, %result1_4929 = torch.aten.var_mean.correction %3540, %3541, %int0_4926, %true_4927 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(variance + epsilon), with epsilon approximately 1e-6.
    %float9.999990e-07_4930 = torch.constant.float 9.9999999999999995E-7
    %int1_4931 = torch.constant.int 1
    %3542 = torch.aten.add.Scalar %result0_4928, %float9.999990e-07_4930, %int1_4931 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %3543 = torch.aten.rsqrt %3542 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // The subtraction takes the f16 tensor %3538 (not the f32 copy %3540) and an
    // f32 mean; the declared result type is f32.
    %int1_4932 = torch.constant.int 1
    %3544 = torch.aten.sub.Tensor %3538, %result1_4929, %int1_4932 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %3545 = torch.aten.mul.Tensor %3544, %3543 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    // Back to f16, then apply the multiplier %3539 and the additive term %3284.
    %int5_4933 = torch.constant.int 5
    %3546 = torch.prims.convert_element_type %3545, %int5_4933 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %3547 = torch.aten.mul.Tensor %3539, %3546 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_4934 = torch.constant.int 1
    %3548 = torch.aten.add.Tensor %3547, %3284, %int1_4934 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Two-layer MLP with parameters double_blocks.9.txt_mlp.{0,2} applied to %3548:
    //   linear 3072 -> 12288, GELU ("tanh" approximation), linear 12288 -> 3072,
    // followed by %3581 = %3538 + %3286 * mlp_output.
    // Each linear is computed in f32 (weight transposed, bias added) and cast back
    // to f16. The mul.Scalar ops multiply by the constant 1 and change no values.
    //
    // Flatten to 2-D for the first matmul.
    %int512_4935 = torch.constant.int 512
    %int3072_4936 = torch.constant.int 3072
    %3549 = torch.prim.ListConstruct %int512_4935, %int3072_4936 : (!torch.int, !torch.int) -> !torch.list<int>
    %3550 = torch.aten.view %3548, %3549 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // First linear: txt_mlp.0 (weight [12288,3072], bias [12288]).
    %__auto.sampler.double_blocks.9.txt_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.9.txt_mlp.0.weight : tensor<12288x3072xf16>
    %3551 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.txt_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_4937 = torch.constant.int 0
    %int1_4938 = torch.constant.int 1
    %3552 = torch.aten.transpose.int %3551, %int0_4937, %int1_4938 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.9.txt_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.9.txt_mlp.0.bias : tensor<12288xf16>
    %3553 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.txt_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    %int6_4939 = torch.constant.int 6
    %3554 = torch.prims.convert_element_type %3553, %int6_4939 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_4940 = torch.constant.int 6
    %3555 = torch.prims.convert_element_type %3550, %int6_4940 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_4941 = torch.constant.int 6
    %3556 = torch.prims.convert_element_type %3552, %int6_4941 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %3557 = torch.aten.mm %3555, %3556 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[512,12288],f32>
    %int1_4942 = torch.constant.int 1
    %3558 = torch.aten.mul.Scalar %3557, %int1_4942 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int1_4943 = torch.constant.int 1
    %3559 = torch.aten.mul.Scalar %3554, %int1_4943 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_4944 = torch.constant.int 1
    %3560 = torch.aten.add.Tensor %3558, %3559, %int1_4944 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int5_4945 = torch.constant.int 5
    %3561 = torch.prims.convert_element_type %3560, %int5_4945 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f16>
    // Activation is applied on the 3-D f16 view, then flattened again.
    %int1_4946 = torch.constant.int 1
    %int512_4947 = torch.constant.int 512
    %int12288_4948 = torch.constant.int 12288
    %3562 = torch.prim.ListConstruct %int1_4946, %int512_4947, %int12288_4948 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3563 = torch.aten.view %3561, %3562 : !torch.vtensor<[512,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,512,12288],f16>
    %str_4949 = torch.constant.str "tanh"
    %3564 = torch.aten.gelu %3563, %str_4949 : !torch.vtensor<[1,512,12288],f16>, !torch.str -> !torch.vtensor<[1,512,12288],f16>
    %int512_4950 = torch.constant.int 512
    %int12288_4951 = torch.constant.int 12288
    %3565 = torch.prim.ListConstruct %int512_4950, %int12288_4951 : (!torch.int, !torch.int) -> !torch.list<int>
    %3566 = torch.aten.view %3564, %3565 : !torch.vtensor<[1,512,12288],f16>, !torch.list<int> -> !torch.vtensor<[512,12288],f16>
    // Second linear: txt_mlp.2 (weight [3072,12288], bias [3072]).
    %__auto.sampler.double_blocks.9.txt_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.9.txt_mlp.2.weight : tensor<3072x12288xf16>
    %3567 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.txt_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_4952 = torch.constant.int 0
    %int1_4953 = torch.constant.int 1
    %3568 = torch.aten.transpose.int %3567, %int0_4952, %int1_4953 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.9.txt_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.9.txt_mlp.2.bias : tensor<3072xf16>
    %3569 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.txt_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_4954 = torch.constant.int 6
    %3570 = torch.prims.convert_element_type %3569, %int6_4954 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_4955 = torch.constant.int 6
    %3571 = torch.prims.convert_element_type %3566, %int6_4955 : !torch.vtensor<[512,12288],f16>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int6_4956 = torch.constant.int 6
    %3572 = torch.prims.convert_element_type %3568, %int6_4956 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %3573 = torch.aten.mm %3571, %3572 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[512,3072],f32>
    %int1_4957 = torch.constant.int 1
    %3574 = torch.aten.mul.Scalar %3573, %int1_4957 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_4958 = torch.constant.int 1
    %3575 = torch.aten.mul.Scalar %3570, %int1_4958 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_4959 = torch.constant.int 1
    %3576 = torch.aten.add.Tensor %3574, %3575, %int1_4959 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_4960 = torch.constant.int 5
    %3577 = torch.prims.convert_element_type %3576, %int5_4960 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    // Restore the leading unit dim, scale by %3286 ([1,1,3072]) and add to %3538.
    %int1_4961 = torch.constant.int 1
    %int512_4962 = torch.constant.int 512
    %int3072_4963 = torch.constant.int 3072
    %3578 = torch.prim.ListConstruct %int1_4961, %int512_4962, %int3072_4963 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3579 = torch.aten.view %3577, %3578 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    %3580 = torch.aten.mul.Tensor %3286, %3579 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_4964 = torch.constant.int 1
    %3581 = torch.aten.add.Tensor %3538, %3580, %int1_4964 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // First visible ops that use double_blocks.10 parameters: SiLU of %119
    // ([1,3072]) followed by the linear layer img_mod.lin (weight [18432,3072],
    // bias [18432]), giving a [1,18432] f16 result in %3593.
    %3582 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.double_blocks.10.img_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.10.img_mod.lin.weight : tensor<18432x3072xf16>
    %3583 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.img_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_4965 = torch.constant.int 0
    %int1_4966 = torch.constant.int 1
    %3584 = torch.aten.transpose.int %3583, %int0_4965, %int1_4966 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.10.img_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.10.img_mod.lin.bias : tensor<18432xf16>
    %3585 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.img_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Upcast to f32, matmul, add bias (the mul.Scalar ops multiply by 1), downcast to f16.
    %int6_4967 = torch.constant.int 6
    %3586 = torch.prims.convert_element_type %3585, %int6_4967 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_4968 = torch.constant.int 6
    %3587 = torch.prims.convert_element_type %3582, %int6_4968 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_4969 = torch.constant.int 6
    %3588 = torch.prims.convert_element_type %3584, %int6_4969 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %3589 = torch.aten.mm %3587, %3588 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    %int1_4970 = torch.constant.int 1
    %3590 = torch.aten.mul.Scalar %3589, %int1_4970 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_4971 = torch.constant.int 1
    %3591 = torch.aten.mul.Scalar %3586, %int1_4971 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_4972 = torch.constant.int 1
    %3592 = torch.aten.add.Tensor %3590, %3591, %int1_4972 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_4973 = torch.constant.int 5
    %3593 = torch.prims.convert_element_type %3592, %int5_4973 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (shape unchanged), then insert a unit dim at
    // index 1: [1,18432] -> [1,1,18432].
    %int0_4974 = torch.constant.int 0
    %int0_4975 = torch.constant.int 0
    %int9223372036854775807_4976 = torch.constant.int 9223372036854775807
    %int1_4977 = torch.constant.int 1
    %3594 = torch.aten.slice.Tensor %3593, %int0_4974, %int0_4975, %int9223372036854775807_4976, %int1_4977 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    %int1_4978 = torch.constant.int 1
    %3595 = torch.aten.unsqueeze %3594, %int1_4978 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    %int2_4979 = torch.constant.int 2
    %int0_4980 = torch.constant.int 0
    %int9223372036854775807_4981 = torch.constant.int 9223372036854775807
    %int1_4982 = torch.constant.int 1
    %3596 = torch.aten.slice.Tensor %3595, %int2_4979, %int0_4980, %int9223372036854775807_4981, %int1_4982 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    %int-1_4983 = torch.constant.int -1
    %int0_4984 = torch.constant.int 0
    %int3072_4985 = torch.constant.int 3072
    %int1_4986 = torch.constant.int 1
    %3597 = torch.aten.slice.Tensor %3596, %int-1_4983, %int0_4984, %int3072_4985, %int1_4986 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_4987 = torch.constant.int -1
    %int3072_4988 = torch.constant.int 3072
    %int6144_4989 = torch.constant.int 6144
    %int1_4990 = torch.constant.int 1
    %3598 = torch.aten.slice.Tensor %3596, %int-1_4987, %int3072_4988, %int6144_4989, %int1_4990 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_4991 = torch.constant.int -1
    %int6144_4992 = torch.constant.int 6144
    %int9216_4993 = torch.constant.int 9216
    %int1_4994 = torch.constant.int 1
    %3599 = torch.aten.slice.Tensor %3596, %int-1_4991, %int6144_4992, %int9216_4993, %int1_4994 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_4995 = torch.constant.int -1
    %int9216_4996 = torch.constant.int 9216
    %int12288_4997 = torch.constant.int 12288
    %int1_4998 = torch.constant.int 1
    %3600 = torch.aten.slice.Tensor %3596, %int-1_4995, %int9216_4996, %int12288_4997, %int1_4998 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_4999 = torch.constant.int -1
    %int12288_5000 = torch.constant.int 12288
    %int15360_5001 = torch.constant.int 15360
    %int1_5002 = torch.constant.int 1
    %3601 = torch.aten.slice.Tensor %3596, %int-1_4999, %int12288_5000, %int15360_5001, %int1_5002 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_5003 = torch.constant.int -1
    %int15360_5004 = torch.constant.int 15360
    %int18432_5005 = torch.constant.int 18432
    %int1_5006 = torch.constant.int 1
    %3602 = torch.aten.slice.Tensor %3596, %int-1_5003, %int15360_5004, %int18432_5005, %int1_5006 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // double_blocks.10 txt_mod: silu(%119) -> linear with the [18432,3072] weight and [18432] bias
    // -> six consecutive 3072-wide slices of the last dimension (18432 = 6 x 3072).
    // The silu is applied to the same operand (%119) as the earlier silu that produced %3582.
    // NOTE(review): %119 is defined outside this chunk; presumably the shared conditioning vector — confirm.
    %3603 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.double_blocks.10.txt_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.10.txt_mod.lin.weight : tensor<18432x3072xf16>
    %3604 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.txt_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_5007 = torch.constant.int 0
    %int1_5008 = torch.constant.int 1
    // Transpose the weight to [3072,18432] so it can be the right-hand operand of the matmul.
    %3605 = torch.aten.transpose.int %3604, %int0_5007, %int1_5008 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.10.txt_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.10.txt_mod.lin.bias : tensor<18432xf16>
    %3606 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.txt_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Bias, activation and weight are all cast to f32 (dtype code 6) before the matmul.
    %int6_5009 = torch.constant.int 6
    %3607 = torch.prims.convert_element_type %3606, %int6_5009 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_5010 = torch.constant.int 6
    %3608 = torch.prims.convert_element_type %3603, %int6_5010 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_5011 = torch.constant.int 6
    %3609 = torch.prims.convert_element_type %3605, %int6_5011 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %3610 = torch.aten.mm %3608, %3609 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Both operands are multiplied by the integer 1 before the add, which leaves their values unchanged.
    // NOTE(review): looks like alpha/beta factors left over from an addmm decomposition — confirm.
    %int1_5012 = torch.constant.int 1
    %3611 = torch.aten.mul.Scalar %3610, %int1_5012 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_5013 = torch.constant.int 1
    %3612 = torch.aten.mul.Scalar %3607, %int1_5013 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_5014 = torch.constant.int 1
    %3613 = torch.aten.add.Tensor %3611, %3612, %int1_5014 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    // Cast the linear output back to f16 (dtype code 5).
    %int5_5015 = torch.constant.int 5
    %3614 = torch.prims.convert_element_type %3613, %int5_5015 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1), then insert a unit dim at position 1,
    // then a full-range slice on dim 2: the net effect on the type is [1,18432] -> [1,1,18432].
    %int0_5016 = torch.constant.int 0
    %int0_5017 = torch.constant.int 0
    %int9223372036854775807_5018 = torch.constant.int 9223372036854775807
    %int1_5019 = torch.constant.int 1
    %3615 = torch.aten.slice.Tensor %3614, %int0_5016, %int0_5017, %int9223372036854775807_5018, %int1_5019 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    %int1_5020 = torch.constant.int 1
    %3616 = torch.aten.unsqueeze %3615, %int1_5020 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    %int2_5021 = torch.constant.int 2
    %int0_5022 = torch.constant.int 0
    %int9223372036854775807_5023 = torch.constant.int 9223372036854775807
    %int1_5024 = torch.constant.int 1
    %3617 = torch.aten.slice.Tensor %3616, %int2_5021, %int0_5022, %int9223372036854775807_5023, %int1_5024 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Slice [0, 3072) of the last dim; used further down as the additive term of the text-stream modulation.
    %int-1_5025 = torch.constant.int -1
    %int0_5026 = torch.constant.int 0
    %int3072_5027 = torch.constant.int 3072
    %int1_5028 = torch.constant.int 1
    %3618 = torch.aten.slice.Tensor %3617, %int-1_5025, %int0_5026, %int3072_5027, %int1_5028 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [3072, 6144); used further down as the multiplicative term (after adding 1).
    %int-1_5029 = torch.constant.int -1
    %int3072_5030 = torch.constant.int 3072
    %int6144_5031 = torch.constant.int 6144
    %int1_5032 = torch.constant.int 1
    %3619 = torch.aten.slice.Tensor %3617, %int-1_5029, %int3072_5030, %int6144_5031, %int1_5032 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slices [6144, 9216), [9216, 12288), [12288, 15360) and [15360, 18432) follow.
    // Their consumers are not visible in this part of the function.
    %int-1_5033 = torch.constant.int -1
    %int6144_5034 = torch.constant.int 6144
    %int9216_5035 = torch.constant.int 9216
    %int1_5036 = torch.constant.int 1
    %3620 = torch.aten.slice.Tensor %3617, %int-1_5033, %int6144_5034, %int9216_5035, %int1_5036 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_5037 = torch.constant.int -1
    %int9216_5038 = torch.constant.int 9216
    %int12288_5039 = torch.constant.int 12288
    %int1_5040 = torch.constant.int 1
    %3621 = torch.aten.slice.Tensor %3617, %int-1_5037, %int9216_5038, %int12288_5039, %int1_5040 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_5041 = torch.constant.int -1
    %int12288_5042 = torch.constant.int 12288
    %int15360_5043 = torch.constant.int 15360
    %int1_5044 = torch.constant.int 1
    %3622 = torch.aten.slice.Tensor %3617, %int-1_5041, %int12288_5042, %int15360_5043, %int1_5044 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_5045 = torch.constant.int -1
    %int15360_5046 = torch.constant.int 15360
    %int18432_5047 = torch.constant.int 18432
    %int1_5048 = torch.constant.int 1
    %3623 = torch.aten.slice.Tensor %3617, %int-1_5045, %int15360_5046, %int18432_5047, %int1_5048 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Image stream: normalize %3521 over its last dimension, then modulate it as
    // (1 + %3598) * normalized + %3597, using two of the img_mod slices.
    // NOTE(review): %3521 is defined outside this chunk; presumably the image-token hidden state — confirm.
    %int6_5049 = torch.constant.int 6
    %3624 = torch.prims.convert_element_type %3521, %int6_5049 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %int2_5050 = torch.constant.int 2
    %3625 = torch.prim.ListConstruct %int2_5050 : (!torch.int) -> !torch.list<int>
    %int0_5051 = torch.constant.int 0
    %true_5052 = torch.constant.bool true
    // Variance and mean over dim 2 in f32, with correction 0 and keepdim = true.
    // Below, result0 is used as the variance and result1 as the mean.
    %result0_5053, %result1_5054 = torch.aten.var_mean.correction %3624, %3625, %int0_5051, %true_5052 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(variance + ~1e-6)
    %float9.999990e-07_5055 = torch.constant.float 9.9999999999999995E-7
    %int1_5056 = torch.constant.int 1
    %3626 = torch.aten.add.Scalar %result0_5053, %float9.999990e-07_5055, %int1_5056 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %3627 = torch.aten.rsqrt %3626 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    %int1_5057 = torch.constant.int 1
    // The subtraction takes the original f16 tensor and the f32 mean; the declared result type is f32.
    %3628 = torch.aten.sub.Tensor %3521, %result1_5054, %int1_5057 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %3629 = torch.aten.mul.Tensor %3628, %3627 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Cast the normalized tensor back to f16 (dtype code 5) before modulation.
    %int5_5058 = torch.constant.int 5
    %3630 = torch.prims.convert_element_type %3629, %int5_5058 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %int1_5059 = torch.constant.int 1
    %int1_5060 = torch.constant.int 1
    // (%3598 + 1) is broadcast from [1,1,3072] across the 4096 rows and multiplied in; %3597 is then added.
    %3631 = torch.aten.add.Scalar %3598, %int1_5059, %int1_5060 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %3632 = torch.aten.mul.Tensor %3631, %3630 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_5061 = torch.constant.int 1
    %3633 = torch.aten.add.Tensor %3632, %3597, %int1_5061 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // double_blocks.10 img_attn.qkv: flatten the modulated image tensor %3633 to [4096,3072], apply the
    // linear layer with the [9216,3072] weight and [9216] bias in f32, cast to f16, reshape to
    // [1,4096,9216], and pass the result through flow.tensor.trace under the key "img_qkv".
    %int4096_5062 = torch.constant.int 4096
    %int3072_5063 = torch.constant.int 3072
    %3634 = torch.prim.ListConstruct %int4096_5062, %int3072_5063 : (!torch.int, !torch.int) -> !torch.list<int>
    %3635 = torch.aten.view %3633, %3634 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.10.img_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.10.img_attn.qkv.weight : tensor<9216x3072xf16>
    %3636 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.img_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_5064 = torch.constant.int 0
    %int1_5065 = torch.constant.int 1
    // Transpose the weight to [3072,9216] so it can be the right-hand operand of the matmul.
    %3637 = torch.aten.transpose.int %3636, %int0_5064, %int1_5065 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.10.img_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.10.img_attn.qkv.bias : tensor<9216xf16>
    %3638 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.img_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Bias, activation and weight are all cast to f32 (dtype code 6) before the matmul.
    %int6_5066 = torch.constant.int 6
    %3639 = torch.prims.convert_element_type %3638, %int6_5066 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_5067 = torch.constant.int 6
    %3640 = torch.prims.convert_element_type %3635, %int6_5067 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_5068 = torch.constant.int 6
    %3641 = torch.prims.convert_element_type %3637, %int6_5068 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %3642 = torch.aten.mm %3640, %3641 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[4096,9216],f32>
    // Both operands are multiplied by the integer 1 before the add, which leaves their values unchanged.
    // NOTE(review): looks like alpha/beta factors left over from an addmm decomposition — confirm.
    %int1_5069 = torch.constant.int 1
    %3643 = torch.aten.mul.Scalar %3642, %int1_5069 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int1_5070 = torch.constant.int 1
    %3644 = torch.aten.mul.Scalar %3639, %int1_5070 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_5071 = torch.constant.int 1
    %3645 = torch.aten.add.Tensor %3643, %3644, %int1_5071 : !torch.vtensor<[4096,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    // Cast the linear output back to f16 (dtype code 5) and restore the leading unit dimension.
    %int5_5072 = torch.constant.int 5
    %3646 = torch.prims.convert_element_type %3645, %int5_5072 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f16>
    %int1_5073 = torch.constant.int 1
    %int4096_5074 = torch.constant.int 4096
    %int9216_5075 = torch.constant.int 9216
    %3647 = torch.prim.ListConstruct %int1_5073, %int4096_5074, %int9216_5075 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3648 = torch.aten.view %3646, %3647 : !torch.vtensor<[4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,9216],f16>
    // Leave the torch dialect: convert to a builtin tensor, cast to a fully dynamic shape, query the
    // three dims, trace the tensor, then cast back to the static shape and re-enter the torch dialect.
    %3649 = torch_c.to_builtin_tensor %3648 : !torch.vtensor<[1,4096,9216],f16> -> tensor<1x4096x9216xf16>
    %cast_5076 = tensor.cast %3649 : tensor<1x4096x9216xf16> to tensor<?x?x?xf16>
    %c0_5077 = arith.constant 0 : index
    %dim_5078 = tensor.dim %cast_5076, %c0_5077 : tensor<?x?x?xf16>
    %c1_5079 = arith.constant 1 : index
    %dim_5080 = tensor.dim %cast_5076, %c1_5079 : tensor<?x?x?xf16>
    %c2_5081 = arith.constant 2 : index
    %dim_5082 = tensor.dim %cast_5076, %c2_5081 : tensor<?x?x?xf16>
    flow.tensor.trace "img_qkv" = [%cast_5076 : tensor<?x?x?xf16>{%dim_5078, %dim_5080, %dim_5082}]
    %cast_5083 = tensor.cast %cast_5076 : tensor<?x?x?xf16> to tensor<1x4096x9216xf16>
    %3650 = torch_c.from_builtin_tensor %cast_5083 : tensor<1x4096x9216xf16> -> !torch.vtensor<[1,4096,9216],f16>
    // Image query path: reshape %3650 from [1,4096,9216] to [1,4096,3,24,128], permute to
    // [3,1,24,4096,128], take index 0 of the leading dim, then compute
    // x * rsqrt(mean(x^2, last dim) + ~1e-6) in f32, cast to f16, and multiply by query_norm.scale.
    %int1_5084 = torch.constant.int 1
    %int4096_5085 = torch.constant.int 4096
    %int3_5086 = torch.constant.int 3
    %int24_5087 = torch.constant.int 24
    %int128_5088 = torch.constant.int 128
    %3651 = torch.prim.ListConstruct %int1_5084, %int4096_5085, %int3_5086, %int24_5087, %int128_5088 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3652 = torch.aten.view %3650, %3651 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_5089 = torch.constant.int 2
    %int0_5090 = torch.constant.int 0
    %int3_5091 = torch.constant.int 3
    %int1_5092 = torch.constant.int 1
    %int4_5093 = torch.constant.int 4
    // Permutation (2, 0, 3, 1, 4) moves the size-3 dimension to the front.
    %3653 = torch.prim.ListConstruct %int2_5089, %int0_5090, %int3_5091, %int1_5092, %int4_5093 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3654 = torch.aten.permute %3652, %3653 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    %int0_5094 = torch.constant.int 0
    %int0_5095 = torch.constant.int 0
    // Select index 0 along dim 0 (the tensor that is subsequently scaled by query_norm.scale).
    %3655 = torch.aten.select.int %3654, %int0_5094, %int0_5095 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // The normalization statistics are computed in f32 (dtype code 6).
    %int6_5096 = torch.constant.int 6
    %3656 = torch.prims.convert_element_type %3655, %int6_5096 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_5097 = torch.constant.int 2
    %3657 = torch.aten.pow.Tensor_Scalar %3656, %int2_5097 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_5098 = torch.constant.int -1
    %3658 = torch.prim.ListConstruct %int-1_5098 : (!torch.int) -> !torch.list<int>
    %true_5099 = torch.constant.bool true
    %none_5100 = torch.constant.none
    // Mean of squares over the last dim with keepdim = true.
    %3659 = torch.aten.mean.dim %3657, %3658, %true_5099, %none_5100 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_5101 = torch.constant.float 9.9999999999999995E-7
    %int1_5102 = torch.constant.int 1
    %3660 = torch.aten.add.Scalar %3659, %float9.999990e-07_5101, %int1_5102 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %3661 = torch.aten.rsqrt %3660 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %3662 = torch.aten.mul.Tensor %3656, %3661 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Cast back to f16 (dtype code 5) before applying the per-channel [128] scale.
    %int5_5103 = torch.constant.int 5
    %3663 = torch.prims.convert_element_type %3662, %int5_5103 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.10.img_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.10.img_attn.norm.query_norm.scale : tensor<128xf16>
    %3664 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.img_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %3665 = torch.aten.mul.Tensor %3663, %3664 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // Image key path: the same reshape and permute of %3650 is emitted again, index 1 of the leading
    // dim is selected, and x * rsqrt(mean(x^2, last dim) + ~1e-6) is computed in f32, cast to f16,
    // and multiplied by key_norm.scale.
    %int1_5104 = torch.constant.int 1
    %int4096_5105 = torch.constant.int 4096
    %int3_5106 = torch.constant.int 3
    %int24_5107 = torch.constant.int 24
    %int128_5108 = torch.constant.int 128
    %3666 = torch.prim.ListConstruct %int1_5104, %int4096_5105, %int3_5106, %int24_5107, %int128_5108 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3667 = torch.aten.view %3650, %3666 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_5109 = torch.constant.int 2
    %int0_5110 = torch.constant.int 0
    %int3_5111 = torch.constant.int 3
    %int1_5112 = torch.constant.int 1
    %int4_5113 = torch.constant.int 4
    // Permutation (2, 0, 3, 1, 4) moves the size-3 dimension to the front.
    %3668 = torch.prim.ListConstruct %int2_5109, %int0_5110, %int3_5111, %int1_5112, %int4_5113 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3669 = torch.aten.permute %3667, %3668 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    %int0_5114 = torch.constant.int 0
    %int1_5115 = torch.constant.int 1
    // Select index 1 along dim 0 (the tensor that is subsequently scaled by key_norm.scale).
    %3670 = torch.aten.select.int %3669, %int0_5114, %int1_5115 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // The normalization statistics are computed in f32 (dtype code 6).
    %int6_5116 = torch.constant.int 6
    %3671 = torch.prims.convert_element_type %3670, %int6_5116 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_5117 = torch.constant.int 2
    %3672 = torch.aten.pow.Tensor_Scalar %3671, %int2_5117 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_5118 = torch.constant.int -1
    %3673 = torch.prim.ListConstruct %int-1_5118 : (!torch.int) -> !torch.list<int>
    %true_5119 = torch.constant.bool true
    %none_5120 = torch.constant.none
    // Mean of squares over the last dim with keepdim = true.
    %3674 = torch.aten.mean.dim %3672, %3673, %true_5119, %none_5120 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_5121 = torch.constant.float 9.9999999999999995E-7
    %int1_5122 = torch.constant.int 1
    %3675 = torch.aten.add.Scalar %3674, %float9.999990e-07_5121, %int1_5122 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %3676 = torch.aten.rsqrt %3675 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %3677 = torch.aten.mul.Tensor %3671, %3676 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Cast back to f16 (dtype code 5) before applying the per-channel [128] scale.
    %int5_5123 = torch.constant.int 5
    %3678 = torch.prims.convert_element_type %3677, %int5_5123 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.10.img_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.10.img_attn.norm.key_norm.scale : tensor<128xf16>
    %3679 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.img_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %3680 = torch.aten.mul.Tensor %3678, %3679 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // Casts of the scaled image query (%3665) and key (%3680) to dtype code 5. Source and result types
    // are both f16, so the element type is unchanged; the consumers of %3681/%3682 are not in this chunk.
    %int5_5124 = torch.constant.int 5
    %3681 = torch.prims.convert_element_type %3665, %int5_5124 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_5125 = torch.constant.int 5
    %3682 = torch.prims.convert_element_type %3680, %int5_5125 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Text stream: normalize %3581 ([1,512,3072]) over its last dimension, then modulate it as
    // (1 + %3619) * normalized + %3618, using two of the txt_mod slices.
    %int6_5126 = torch.constant.int 6
    %3683 = torch.prims.convert_element_type %3581, %int6_5126 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %int2_5127 = torch.constant.int 2
    %3684 = torch.prim.ListConstruct %int2_5127 : (!torch.int) -> !torch.list<int>
    %int0_5128 = torch.constant.int 0
    %true_5129 = torch.constant.bool true
    // Variance and mean over dim 2 in f32, with correction 0 and keepdim = true.
    // Below, result0 is used as the variance and result1 as the mean.
    %result0_5130, %result1_5131 = torch.aten.var_mean.correction %3683, %3684, %int0_5128, %true_5129 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(variance + ~1e-6)
    %float9.999990e-07_5132 = torch.constant.float 9.9999999999999995E-7
    %int1_5133 = torch.constant.int 1
    %3685 = torch.aten.add.Scalar %result0_5130, %float9.999990e-07_5132, %int1_5133 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %3686 = torch.aten.rsqrt %3685 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    %int1_5134 = torch.constant.int 1
    // The subtraction takes the original f16 tensor and the f32 mean; the declared result type is f32.
    %3687 = torch.aten.sub.Tensor %3581, %result1_5131, %int1_5134 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %3688 = torch.aten.mul.Tensor %3687, %3686 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    // Cast the normalized tensor back to f16 (dtype code 5) before modulation.
    %int5_5135 = torch.constant.int 5
    %3689 = torch.prims.convert_element_type %3688, %int5_5135 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %int1_5136 = torch.constant.int 1
    %int1_5137 = torch.constant.int 1
    // (%3619 + 1) is broadcast from [1,1,3072] across the 512 rows and multiplied in; %3618 is then added.
    %3690 = torch.aten.add.Scalar %3619, %int1_5136, %int1_5137 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %3691 = torch.aten.mul.Tensor %3690, %3689 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_5138 = torch.constant.int 1
    %3692 = torch.aten.add.Tensor %3691, %3618, %int1_5138 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // double_blocks.10 txt_attn.qkv: flatten the modulated text tensor %3692 to [512,3072], apply the
    // linear layer with the [9216,3072] weight and [9216] bias in f32, cast to f16, reshape to
    // [1,512,9216], and pass the result through flow.tensor.trace under the key "txt_qkv".
    %int512_5139 = torch.constant.int 512
    %int3072_5140 = torch.constant.int 3072
    %3693 = torch.prim.ListConstruct %int512_5139, %int3072_5140 : (!torch.int, !torch.int) -> !torch.list<int>
    %3694 = torch.aten.view %3692, %3693 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.10.txt_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.10.txt_attn.qkv.weight : tensor<9216x3072xf16>
    %3695 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.txt_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_5141 = torch.constant.int 0
    %int1_5142 = torch.constant.int 1
    // Transpose the weight to [3072,9216] so it can be the right-hand operand of the matmul.
    %3696 = torch.aten.transpose.int %3695, %int0_5141, %int1_5142 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.10.txt_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.10.txt_attn.qkv.bias : tensor<9216xf16>
    %3697 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.txt_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Bias, activation and weight are all cast to f32 (dtype code 6) before the matmul.
    %int6_5143 = torch.constant.int 6
    %3698 = torch.prims.convert_element_type %3697, %int6_5143 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_5144 = torch.constant.int 6
    %3699 = torch.prims.convert_element_type %3694, %int6_5144 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_5145 = torch.constant.int 6
    %3700 = torch.prims.convert_element_type %3696, %int6_5145 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %3701 = torch.aten.mm %3699, %3700 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[512,9216],f32>
    // Both operands are multiplied by the integer 1 before the add, which leaves their values unchanged.
    // NOTE(review): looks like alpha/beta factors left over from an addmm decomposition — confirm.
    %int1_5146 = torch.constant.int 1
    %3702 = torch.aten.mul.Scalar %3701, %int1_5146 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int1_5147 = torch.constant.int 1
    %3703 = torch.aten.mul.Scalar %3698, %int1_5147 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_5148 = torch.constant.int 1
    %3704 = torch.aten.add.Tensor %3702, %3703, %int1_5148 : !torch.vtensor<[512,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    // Cast the linear output back to f16 (dtype code 5) and restore the leading unit dimension.
    %int5_5149 = torch.constant.int 5
    %3705 = torch.prims.convert_element_type %3704, %int5_5149 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f16>
    %int1_5150 = torch.constant.int 1
    %int512_5151 = torch.constant.int 512
    %int9216_5152 = torch.constant.int 9216
    %3706 = torch.prim.ListConstruct %int1_5150, %int512_5151, %int9216_5152 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3707 = torch.aten.view %3705, %3706 : !torch.vtensor<[512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,9216],f16>
    // Leave the torch dialect: convert to a builtin tensor, cast to a fully dynamic shape, query the
    // three dims, trace the tensor, then cast back to the static shape and re-enter the torch dialect.
    %3708 = torch_c.to_builtin_tensor %3707 : !torch.vtensor<[1,512,9216],f16> -> tensor<1x512x9216xf16>
    %cast_5153 = tensor.cast %3708 : tensor<1x512x9216xf16> to tensor<?x?x?xf16>
    %c0_5154 = arith.constant 0 : index
    %dim_5155 = tensor.dim %cast_5153, %c0_5154 : tensor<?x?x?xf16>
    %c1_5156 = arith.constant 1 : index
    %dim_5157 = tensor.dim %cast_5153, %c1_5156 : tensor<?x?x?xf16>
    %c2_5158 = arith.constant 2 : index
    %dim_5159 = tensor.dim %cast_5153, %c2_5158 : tensor<?x?x?xf16>
    flow.tensor.trace "txt_qkv" = [%cast_5153 : tensor<?x?x?xf16>{%dim_5155, %dim_5157, %dim_5159}]
    %cast_5160 = tensor.cast %cast_5153 : tensor<?x?x?xf16> to tensor<1x512x9216xf16>
    %3709 = torch_c.from_builtin_tensor %cast_5160 : tensor<1x512x9216xf16> -> !torch.vtensor<[1,512,9216],f16>
    // Text query path: reshape %3709 from [1,512,9216] to [1,512,3,24,128], permute to
    // [3,1,24,512,128], take index 0 of the leading dim, then compute
    // x * rsqrt(mean(x^2, last dim) + ~1e-6) in f32, cast to f16, and multiply by query_norm.scale.
    %int1_5161 = torch.constant.int 1
    %int512_5162 = torch.constant.int 512
    %int3_5163 = torch.constant.int 3
    %int24_5164 = torch.constant.int 24
    %int128_5165 = torch.constant.int 128
    %3710 = torch.prim.ListConstruct %int1_5161, %int512_5162, %int3_5163, %int24_5164, %int128_5165 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3711 = torch.aten.view %3709, %3710 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_5166 = torch.constant.int 2
    %int0_5167 = torch.constant.int 0
    %int3_5168 = torch.constant.int 3
    %int1_5169 = torch.constant.int 1
    %int4_5170 = torch.constant.int 4
    // Permutation (2, 0, 3, 1, 4) moves the size-3 dimension to the front.
    %3712 = torch.prim.ListConstruct %int2_5166, %int0_5167, %int3_5168, %int1_5169, %int4_5170 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3713 = torch.aten.permute %3711, %3712 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    %int0_5171 = torch.constant.int 0
    %int0_5172 = torch.constant.int 0
    // Select index 0 along dim 0 (the tensor that is subsequently scaled by query_norm.scale).
    %3714 = torch.aten.select.int %3713, %int0_5171, %int0_5172 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // The normalization statistics are computed in f32 (dtype code 6).
    %int6_5173 = torch.constant.int 6
    %3715 = torch.prims.convert_element_type %3714, %int6_5173 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_5174 = torch.constant.int 2
    %3716 = torch.aten.pow.Tensor_Scalar %3715, %int2_5174 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_5175 = torch.constant.int -1
    %3717 = torch.prim.ListConstruct %int-1_5175 : (!torch.int) -> !torch.list<int>
    %true_5176 = torch.constant.bool true
    %none_5177 = torch.constant.none
    // Mean of squares over the last dim with keepdim = true.
    %3718 = torch.aten.mean.dim %3716, %3717, %true_5176, %none_5177 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_5178 = torch.constant.float 9.9999999999999995E-7
    %int1_5179 = torch.constant.int 1
    %3719 = torch.aten.add.Scalar %3718, %float9.999990e-07_5178, %int1_5179 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %3720 = torch.aten.rsqrt %3719 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %3721 = torch.aten.mul.Tensor %3715, %3720 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    // Cast back to f16 (dtype code 5) before applying the per-channel [128] scale.
    %int5_5180 = torch.constant.int 5
    %3722 = torch.prims.convert_element_type %3721, %int5_5180 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.10.txt_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.10.txt_attn.norm.query_norm.scale : tensor<128xf16>
    %3723 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.txt_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %3724 = torch.aten.mul.Tensor %3722, %3723 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // Reinterpret %3709 ([1,512,9216]) as [1,512,3,24,128] and permute with
    // order (2,0,3,1,4) to [3,1,24,512,128]. %3709 is defined above this excerpt.
    %int1_5181 = torch.constant.int 1
    %int512_5182 = torch.constant.int 512
    %int3_5183 = torch.constant.int 3
    %int24_5184 = torch.constant.int 24
    %int128_5185 = torch.constant.int 128
    %3725 = torch.prim.ListConstruct %int1_5181, %int512_5182, %int3_5183, %int24_5184, %int128_5185 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3726 = torch.aten.view %3709, %3725 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_5186 = torch.constant.int 2
    %int0_5187 = torch.constant.int 0
    %int3_5188 = torch.constant.int 3
    %int1_5189 = torch.constant.int 1
    %int4_5190 = torch.constant.int 4
    %3727 = torch.prim.ListConstruct %int2_5186, %int0_5187, %int3_5188, %int1_5189, %int4_5190 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3728 = torch.aten.permute %3726, %3727 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // Select index 1 along dim 0 -> [1,24,512,128].
    // NOTE(review): presumably the key slice, given the key_norm scale applied
    // below -- confirm against the model source.
    %int0_5191 = torch.constant.int 0
    %int1_5192 = torch.constant.int 1
    %3729 = torch.aten.select.int %3728, %int0_5191, %int1_5192 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Upcast to f32 (dtype code 6) and apply the RMS-style normalisation:
    //   x * rsqrt(mean(x^2, dim=-1, keepdim=true) + ~1e-6)
    %int6_5193 = torch.constant.int 6
    %3730 = torch.prims.convert_element_type %3729, %int6_5193 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_5194 = torch.constant.int 2
    %3731 = torch.aten.pow.Tensor_Scalar %3730, %int2_5194 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_5195 = torch.constant.int -1
    %3732 = torch.prim.ListConstruct %int-1_5195 : (!torch.int) -> !torch.list<int>
    %true_5196 = torch.constant.bool true
    %none_5197 = torch.constant.none
    %3733 = torch.aten.mean.dim %3731, %3732, %true_5196, %none_5197 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_5198 = torch.constant.float 9.9999999999999995E-7
    %int1_5199 = torch.constant.int 1
    %3734 = torch.aten.add.Scalar %3733, %float9.999990e-07_5198, %int1_5199 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %3735 = torch.aten.rsqrt %3734 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %3736 = torch.aten.mul.Tensor %3730, %3735 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    // Back to f16 (dtype code 5), then multiply by the [128] key_norm.scale
    // parameter, broadcast over the leading dimensions.
    %int5_5200 = torch.constant.int 5
    %3737 = torch.prims.convert_element_type %3736, %int5_5200 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.10.txt_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.10.txt_attn.norm.key_norm.scale : tensor<128xf16>
    %3738 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.txt_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %3739 = torch.aten.mul.Tensor %3737, %3738 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // f16 -> f16 conversions of the two normalised tensors: input and result
    // element types are identical, so these do not change the data type.
    %int5_5201 = torch.constant.int 5
    %3740 = torch.prims.convert_element_type %3724, %int5_5201 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int5_5202 = torch.constant.int 5
    %3741 = torch.prims.convert_element_type %3739, %int5_5202 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Concatenate along dim 2: 512 rows followed by 4096 rows -> 4608.
    // %3681 and %3682 are defined above this excerpt.
    // NOTE(review): presumably the image-stream q and k respectively -- confirm.
    %3742 = torch.prim.ListConstruct %3740, %3681 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_5203 = torch.constant.int 2
    %3743 = torch.aten.cat %3742, %int2_5203 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %3744 = torch.prim.ListConstruct %3741, %3682 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_5204 = torch.constant.int 2
    %3745 = torch.aten.cat %3744, %int2_5204 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // 512-row part: view %3709 ([1,512,9216]) as [1,512,3,24,128], permute with
    // order (2,0,3,1,4) to [3,1,24,512,128], and select index 2 along dim 0.
    // No normalisation is applied to this slice.
    // NOTE(review): presumably the text-stream value tensor -- confirm.
    %int1_5205 = torch.constant.int 1
    %int512_5206 = torch.constant.int 512
    %int3_5207 = torch.constant.int 3
    %int24_5208 = torch.constant.int 24
    %int128_5209 = torch.constant.int 128
    %3746 = torch.prim.ListConstruct %int1_5205, %int512_5206, %int3_5207, %int24_5208, %int128_5209 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3747 = torch.aten.view %3709, %3746 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_5210 = torch.constant.int 2
    %int0_5211 = torch.constant.int 0
    %int3_5212 = torch.constant.int 3
    %int1_5213 = torch.constant.int 1
    %int4_5214 = torch.constant.int 4
    %3748 = torch.prim.ListConstruct %int2_5210, %int0_5211, %int3_5212, %int1_5213, %int4_5214 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3749 = torch.aten.permute %3747, %3748 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    %int0_5215 = torch.constant.int 0
    %int2_5216 = torch.constant.int 2
    %3750 = torch.aten.select.int %3749, %int0_5215, %int2_5216 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // 4096-row part: the same view/permute/select(index 2) applied to %3650
    // ([1,4096,9216]), which is defined above this excerpt.
    // NOTE(review): presumably the image-stream value tensor -- confirm.
    %int1_5217 = torch.constant.int 1
    %int4096_5218 = torch.constant.int 4096
    %int3_5219 = torch.constant.int 3
    %int24_5220 = torch.constant.int 24
    %int128_5221 = torch.constant.int 128
    %3751 = torch.prim.ListConstruct %int1_5217, %int4096_5218, %int3_5219, %int24_5220, %int128_5221 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3752 = torch.aten.view %3650, %3751 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_5222 = torch.constant.int 2
    %int0_5223 = torch.constant.int 0
    %int3_5224 = torch.constant.int 3
    %int1_5225 = torch.constant.int 1
    %int4_5226 = torch.constant.int 4
    %3753 = torch.prim.ListConstruct %int2_5222, %int0_5223, %int3_5224, %int1_5225, %int4_5226 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3754 = torch.aten.permute %3752, %3753 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    %int0_5227 = torch.constant.int 0
    %int2_5228 = torch.constant.int 2
    %3755 = torch.aten.select.int %3754, %int0_5227, %int2_5228 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Concatenate the two parts along dim 2 (512 + 4096 = 4608 rows).
    %3756 = torch.prim.ListConstruct %3750, %3755 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_5229 = torch.constant.int 2
    %3757 = torch.aten.cat %3756, %int2_5229 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Emit flow.tensor.trace events labelled "q", "k" and "v" for the three
    // concatenated [1,24,4608,128] tensors. The same pattern is repeated for each:
    //   1. convert the torch vtensor to a builtin tensor,
    //   2. cast it to a fully dynamic shape and query its four dimensions,
    //   3. pass the tensor plus dims to flow.tensor.trace,
    //   4. cast back to the static shape and re-wrap it as a torch vtensor.
    // Downstream ops consume the re-wrapped values (%3759, %3761, %3763).

    // Trace "q" (from %3743).
    %3758 = torch_c.to_builtin_tensor %3743 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_5230 = tensor.cast %3758 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_5231 = arith.constant 0 : index
    %dim_5232 = tensor.dim %cast_5230, %c0_5231 : tensor<?x?x?x?xf16>
    %c1_5233 = arith.constant 1 : index
    %dim_5234 = tensor.dim %cast_5230, %c1_5233 : tensor<?x?x?x?xf16>
    %c2_5235 = arith.constant 2 : index
    %dim_5236 = tensor.dim %cast_5230, %c2_5235 : tensor<?x?x?x?xf16>
    %c3_5237 = arith.constant 3 : index
    %dim_5238 = tensor.dim %cast_5230, %c3_5237 : tensor<?x?x?x?xf16>
    flow.tensor.trace "q" = [%cast_5230 : tensor<?x?x?x?xf16>{%dim_5232, %dim_5234, %dim_5236, %dim_5238}]
    %cast_5239 = tensor.cast %cast_5230 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %3759 = torch_c.from_builtin_tensor %cast_5239 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Trace "k" (from %3745).
    %3760 = torch_c.to_builtin_tensor %3745 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_5240 = tensor.cast %3760 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_5241 = arith.constant 0 : index
    %dim_5242 = tensor.dim %cast_5240, %c0_5241 : tensor<?x?x?x?xf16>
    %c1_5243 = arith.constant 1 : index
    %dim_5244 = tensor.dim %cast_5240, %c1_5243 : tensor<?x?x?x?xf16>
    %c2_5245 = arith.constant 2 : index
    %dim_5246 = tensor.dim %cast_5240, %c2_5245 : tensor<?x?x?x?xf16>
    %c3_5247 = arith.constant 3 : index
    %dim_5248 = tensor.dim %cast_5240, %c3_5247 : tensor<?x?x?x?xf16>
    flow.tensor.trace "k" = [%cast_5240 : tensor<?x?x?x?xf16>{%dim_5242, %dim_5244, %dim_5246, %dim_5248}]
    %cast_5249 = tensor.cast %cast_5240 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %3761 = torch_c.from_builtin_tensor %cast_5249 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Trace "v" (from %3757).
    %3762 = torch_c.to_builtin_tensor %3757 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_5250 = tensor.cast %3762 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_5251 = arith.constant 0 : index
    %dim_5252 = tensor.dim %cast_5250, %c0_5251 : tensor<?x?x?x?xf16>
    %c1_5253 = arith.constant 1 : index
    %dim_5254 = tensor.dim %cast_5250, %c1_5253 : tensor<?x?x?x?xf16>
    %c2_5255 = arith.constant 2 : index
    %dim_5256 = tensor.dim %cast_5250, %c2_5255 : tensor<?x?x?x?xf16>
    %c3_5257 = arith.constant 3 : index
    %dim_5258 = tensor.dim %cast_5250, %c3_5257 : tensor<?x?x?x?xf16>
    flow.tensor.trace "v" = [%cast_5250 : tensor<?x?x?x?xf16>{%dim_5252, %dim_5254, %dim_5256, %dim_5258}]
    %cast_5259 = tensor.cast %cast_5250 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %3763 = torch_c.from_builtin_tensor %cast_5259 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Combine q (%3759) and k (%3761) with the table %211
    // ([1,1,4608,64,2,2], f32, defined above this excerpt):
    //   out[..., i, :] = %211[..., i, :, 0] * x[..., i, 0] + %211[..., i, :, 1] * x[..., i, 1]
    // where x is the input viewed as 64 pairs along the last dimension.
    // NOTE(review): this matches the usual rotary position-embedding formulation
    // with %211 as the precomputed rotation table -- confirm where %211 is built.

    // Upcast q to f32 and view it as [1,24,4608,64,1,2] (the -1 resolves to 64).
    %int6_5260 = torch.constant.int 6
    %3764 = torch.prims.convert_element_type %3759, %int6_5260 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_5261 = torch.constant.int 1
    %int24_5262 = torch.constant.int 24
    %int4608_5263 = torch.constant.int 4608
    %int-1_5264 = torch.constant.int -1
    %int1_5265 = torch.constant.int 1
    %int2_5266 = torch.constant.int 2
    %3765 = torch.prim.ListConstruct %int1_5261, %int24_5262, %int4608_5263, %int-1_5264, %int1_5265, %int2_5266 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3766 = torch.aten.view %3764, %3765 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Same upcast and view for k.
    %int6_5267 = torch.constant.int 6
    %3767 = torch.prims.convert_element_type %3761, %int6_5267 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_5268 = torch.constant.int 1
    %int24_5269 = torch.constant.int 24
    %int4608_5270 = torch.constant.int 4608
    %int-1_5271 = torch.constant.int -1
    %int1_5272 = torch.constant.int 1
    %int2_5273 = torch.constant.int 2
    %3768 = torch.prim.ListConstruct %int1_5268, %int24_5269, %int4608_5270, %int-1_5271, %int1_5272, %int2_5273 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3769 = torch.aten.view %3767, %3768 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // q, first term: %211[..., 0] * q[..., 0] (broadcast over the 24 heads and
    // over the size-1 / size-2 trailing dimension).
    %int5_5274 = torch.constant.int 5
    %int0_5275 = torch.constant.int 0
    %3770 = torch.aten.select.int %211, %int5_5274, %int0_5275 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_5276 = torch.constant.int 5
    %int0_5277 = torch.constant.int 0
    %3771 = torch.aten.select.int %3766, %int5_5276, %int0_5277 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %3772 = torch.aten.mul.Tensor %3770, %3771 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // q, second term: %211[..., 1] * q[..., 1].
    %int5_5278 = torch.constant.int 5
    %int1_5279 = torch.constant.int 1
    %3773 = torch.aten.select.int %211, %int5_5278, %int1_5279 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_5280 = torch.constant.int 5
    %int1_5281 = torch.constant.int 1
    %3774 = torch.aten.select.int %3766, %int5_5280, %int1_5281 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %3775 = torch.aten.mul.Tensor %3773, %3774 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // q: sum of the two terms.
    %int1_5282 = torch.constant.int 1
    %3776 = torch.aten.add.Tensor %3772, %3775, %int1_5282 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k, first term: %211[..., 0] * k[..., 0].
    %int5_5283 = torch.constant.int 5
    %int0_5284 = torch.constant.int 0
    %3777 = torch.aten.select.int %211, %int5_5283, %int0_5284 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_5285 = torch.constant.int 5
    %int0_5286 = torch.constant.int 0
    %3778 = torch.aten.select.int %3769, %int5_5285, %int0_5286 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %3779 = torch.aten.mul.Tensor %3777, %3778 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k, second term: %211[..., 1] * k[..., 1].
    %int5_5287 = torch.constant.int 5
    %int1_5288 = torch.constant.int 1
    %3780 = torch.aten.select.int %211, %int5_5287, %int1_5288 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_5289 = torch.constant.int 5
    %int1_5290 = torch.constant.int 1
    %3781 = torch.aten.select.int %3769, %int5_5289, %int1_5290 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %3782 = torch.aten.mul.Tensor %3780, %3781 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k: sum of the two terms.
    %int1_5291 = torch.constant.int 1
    %3783 = torch.aten.add.Tensor %3779, %3782, %int1_5291 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Flatten the (64,2) pairs of the q result back to 128 and cast to f16.
    %int1_5292 = torch.constant.int 1
    %int24_5293 = torch.constant.int 24
    %int4608_5294 = torch.constant.int 4608
    %int128_5295 = torch.constant.int 128
    %3784 = torch.prim.ListConstruct %int1_5292, %int24_5293, %int4608_5294, %int128_5295 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3785 = torch.aten.view %3776, %3784 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_5296 = torch.constant.int 5
    %3786 = torch.prims.convert_element_type %3785, %int5_5296 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Same flatten and f16 cast for the k result.
    %int1_5297 = torch.constant.int 1
    %int24_5298 = torch.constant.int 24
    %int4608_5299 = torch.constant.int 4608
    %int128_5300 = torch.constant.int 128
    %3787 = torch.prim.ListConstruct %int1_5297, %int24_5298, %int4608_5299, %int128_5300 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3788 = torch.aten.view %3783, %3787 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_5301 = torch.constant.int 5
    %3789 = torch.prims.convert_element_type %3788, %int5_5301 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled dot-product attention over the joint 4608-row sequence with
    // q = %3786, k = %3789, v = %3763. The remaining operands are 0.0, false,
    // none, none; per the aten schema for this op these are dropout_p, is_causal,
    // attn_mask and scale. Only result #0 (the attention output) is used in this
    // excerpt; result #1 is a [1,24,4608] f32 tensor.
    %float0.000000e00_5302 = torch.constant.float 0.000000e+00
    %false_5303 = torch.constant.bool false
    %none_5304 = torch.constant.none
    %none_5305 = torch.constant.none
    %3790:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%3786, %3789, %3763, %float0.000000e00_5302, %false_5303, %none_5304, %none_5305) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2 ([1,24,4608,128] -> [1,4608,24,128]) and merge the last
    // two dimensions (24 * 128 = 3072) -> [1,4608,3072].
    %int0_5306 = torch.constant.int 0
    %int2_5307 = torch.constant.int 2
    %int1_5308 = torch.constant.int 1
    %int3_5309 = torch.constant.int 3
    %3791 = torch.prim.ListConstruct %int0_5306, %int2_5307, %int1_5308, %int3_5309 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3792 = torch.aten.permute %3790#0, %3791 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_5310 = torch.constant.int 1
    %int4608_5311 = torch.constant.int 4608
    %int3072_5312 = torch.constant.int 3072
    %3793 = torch.prim.ListConstruct %int1_5310, %int4608_5311, %int3072_5312 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3794 = torch.aten.view %3792, %3793 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Split the attention output %3794 ([1,4608,3072]) along dim 1 into rows
    // [0, 512) (%3796) and rows [512, end) (%3798, 4096 rows). This mirrors the
    // 512 + 4096 concatenation order used to build q/k/v.
    // The dim-0 slices with start 0, end INT64_MAX, step 1 keep the full
    // dimension (the result shape is unchanged).
    %int0_5313 = torch.constant.int 0
    %int0_5314 = torch.constant.int 0
    %int9223372036854775807_5315 = torch.constant.int 9223372036854775807
    %int1_5316 = torch.constant.int 1
    %3795 = torch.aten.slice.Tensor %3794, %int0_5313, %int0_5314, %int9223372036854775807_5315, %int1_5316 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Rows 0..512 along dim 1.
    %int1_5317 = torch.constant.int 1
    %int0_5318 = torch.constant.int 0
    %int512_5319 = torch.constant.int 512
    %int1_5320 = torch.constant.int 1
    %3796 = torch.aten.slice.Tensor %3795, %int1_5317, %int0_5318, %int512_5319, %int1_5320 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %int0_5321 = torch.constant.int 0
    %int0_5322 = torch.constant.int 0
    %int9223372036854775807_5323 = torch.constant.int 9223372036854775807
    %int1_5324 = torch.constant.int 1
    %3797 = torch.aten.slice.Tensor %3794, %int0_5321, %int0_5322, %int9223372036854775807_5323, %int1_5324 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Rows 512..end along dim 1.
    %int1_5325 = torch.constant.int 1
    %int512_5326 = torch.constant.int 512
    %int9223372036854775807_5327 = torch.constant.int 9223372036854775807
    %int1_5328 = torch.constant.int 1
    %3798 = torch.aten.slice.Tensor %3797, %int1_5325, %int512_5326, %int9223372036854775807_5327, %int1_5328 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // img_attn.proj: linear projection of the 4096-row attention output,
    // y = x @ W^T + b, followed by a gated residual add.
    // Flatten [1,4096,3072] -> [4096,3072] for the 2-D matmul.
    %int4096_5329 = torch.constant.int 4096
    %int3072_5330 = torch.constant.int 3072
    %3799 = torch.prim.ListConstruct %int4096_5329, %int3072_5330 : (!torch.int, !torch.int) -> !torch.list<int>
    %3800 = torch.aten.view %3798, %3799 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Load the [3072,3072] weight and transpose it; load the [3072] bias.
    %__auto.sampler.double_blocks.10.img_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.10.img_attn.proj.weight : tensor<3072x3072xf16>
    %3801 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.img_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_5331 = torch.constant.int 0
    %int1_5332 = torch.constant.int 1
    %3802 = torch.aten.transpose.int %3801, %int0_5331, %int1_5332 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.10.img_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.10.img_attn.proj.bias : tensor<3072xf16>
    %3803 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.img_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, input and weight to f32 (dtype code 6); the matmul and bias
    // add are performed in f32.
    %int6_5333 = torch.constant.int 6
    %3804 = torch.prims.convert_element_type %3803, %int6_5333 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_5334 = torch.constant.int 6
    %3805 = torch.prims.convert_element_type %3800, %int6_5334 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_5335 = torch.constant.int 6
    %3806 = torch.prims.convert_element_type %3802, %int6_5335 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %3807 = torch.aten.mm %3805, %3806 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Both operands are multiplied by the integer 1 before the add.
    // NOTE(review): looks like the alpha/beta factors of a decomposed addmm -- confirm.
    %int1_5336 = torch.constant.int 1
    %3808 = torch.aten.mul.Scalar %3807, %int1_5336 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_5337 = torch.constant.int 1
    %3809 = torch.aten.mul.Scalar %3804, %int1_5337 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_5338 = torch.constant.int 1
    %3810 = torch.aten.add.Tensor %3808, %3809, %int1_5338 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    // Cast the result back to f16 and restore the leading batch dimension.
    %int5_5339 = torch.constant.int 5
    %3811 = torch.prims.convert_element_type %3810, %int5_5339 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_5340 = torch.constant.int 1
    %int4096_5341 = torch.constant.int 4096
    %int3072_5342 = torch.constant.int 3072
    %3812 = torch.prim.ListConstruct %int1_5340, %int4096_5341, %int3072_5342 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3813 = torch.aten.view %3811, %3812 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Gated residual: %3815 = %3521 + %3599 * proj_out. %3599 ([1,1,3072]) and
    // %3521 ([1,4096,3072]) are defined above this excerpt.
    // NOTE(review): presumably %3599 is a modulation gate and %3521 the incoming
    // image hidden state -- confirm.
    %3814 = torch.aten.mul.Tensor %3599, %3813 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_5343 = torch.constant.int 1
    %3815 = torch.aten.add.Tensor %3521, %3814, %int1_5343 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Normalise %3815 over its last dimension and apply a scale/shift:
    //   %3825 = (1 + %3601) * normalize(%3815) + %3600
    // %3601 and %3600 ([1,1,3072]) are defined above this excerpt.
    // NOTE(review): presumably the modulation scale and shift -- confirm.
    // No elementwise weight or bias parameter is loaded for this normalisation.

    // Scale factor 1 + %3601.
    %int1_5344 = torch.constant.int 1
    %int1_5345 = torch.constant.int 1
    %3816 = torch.aten.add.Scalar %3601, %int1_5344, %int1_5345 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Variance and mean over dim 2 in f32, with correction 0 and keepdim=true.
    // Per the aten var_mean schema, result0 is the variance and result1 the mean.
    %int6_5346 = torch.constant.int 6
    %3817 = torch.prims.convert_element_type %3815, %int6_5346 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %int2_5347 = torch.constant.int 2
    %3818 = torch.prim.ListConstruct %int2_5347 : (!torch.int) -> !torch.list<int>
    %int0_5348 = torch.constant.int 0
    %true_5349 = torch.constant.bool true
    %result0_5350, %result1_5351 = torch.aten.var_mean.correction %3817, %3818, %int0_5348, %true_5349 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(var + ~1e-6)
    %float9.999990e-07_5352 = torch.constant.float 9.9999999999999995E-7
    %int1_5353 = torch.constant.int 1
    %3819 = torch.aten.add.Scalar %result0_5350, %float9.999990e-07_5352, %int1_5353 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %3820 = torch.aten.rsqrt %3819 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // (x - mean) * rsqrt(var + eps). The subtraction takes the f16 tensor %3815
    // and the f32 mean and produces an f32 result.
    %int1_5354 = torch.constant.int 1
    %3821 = torch.aten.sub.Tensor %3815, %result1_5351, %int1_5354 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %3822 = torch.aten.mul.Tensor %3821, %3820 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Cast back to f16, then apply the scale and the shift.
    %int5_5355 = torch.constant.int 5
    %3823 = torch.prims.convert_element_type %3822, %int5_5355 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %3824 = torch.aten.mul.Tensor %3816, %3823 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_5356 = torch.constant.int 1
    %3825 = torch.aten.add.Tensor %3824, %3600, %int1_5356 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // img_mlp: linear (3072 -> 12288), GELU with the "tanh" approximation,
    // linear (12288 -> 3072), then a gated residual add onto %3815.

    // --- img_mlp.0: y = x @ W0^T + b0, computed in f32 ---
    // Flatten [1,4096,3072] -> [4096,3072] for the 2-D matmul.
    %int4096_5357 = torch.constant.int 4096
    %int3072_5358 = torch.constant.int 3072
    %3826 = torch.prim.ListConstruct %int4096_5357, %int3072_5358 : (!torch.int, !torch.int) -> !torch.list<int>
    %3827 = torch.aten.view %3825, %3826 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.10.img_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.10.img_mlp.0.weight : tensor<12288x3072xf16>
    %3828 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.img_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_5359 = torch.constant.int 0
    %int1_5360 = torch.constant.int 1
    %3829 = torch.aten.transpose.int %3828, %int0_5359, %int1_5360 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.10.img_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.10.img_mlp.0.bias : tensor<12288xf16>
    %3830 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.img_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Upcast bias, input and transposed weight to f32 (dtype code 6).
    %int6_5361 = torch.constant.int 6
    %3831 = torch.prims.convert_element_type %3830, %int6_5361 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_5362 = torch.constant.int 6
    %3832 = torch.prims.convert_element_type %3827, %int6_5362 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_5363 = torch.constant.int 6
    %3833 = torch.prims.convert_element_type %3829, %int6_5363 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %3834 = torch.aten.mm %3832, %3833 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[4096,12288],f32>
    // Both operands are multiplied by the integer 1 before the bias add.
    // NOTE(review): looks like the alpha/beta factors of a decomposed addmm -- confirm.
    %int1_5364 = torch.constant.int 1
    %3835 = torch.aten.mul.Scalar %3834, %int1_5364 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int1_5365 = torch.constant.int 1
    %3836 = torch.aten.mul.Scalar %3831, %int1_5365 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_5366 = torch.constant.int 1
    %3837 = torch.aten.add.Tensor %3835, %3836, %int1_5366 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    // Cast back to f16 and restore the leading batch dimension.
    %int5_5367 = torch.constant.int 5
    %3838 = torch.prims.convert_element_type %3837, %int5_5367 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f16>
    %int1_5368 = torch.constant.int 1
    %int4096_5369 = torch.constant.int 4096
    %int12288_5370 = torch.constant.int 12288
    %3839 = torch.prim.ListConstruct %int1_5368, %int4096_5369, %int12288_5370 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3840 = torch.aten.view %3838, %3839 : !torch.vtensor<[4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,12288],f16>
    // --- activation: GELU with approximate = "tanh", evaluated on f16 ---
    %str_5371 = torch.constant.str "tanh"
    %3841 = torch.aten.gelu %3840, %str_5371 : !torch.vtensor<[1,4096,12288],f16>, !torch.str -> !torch.vtensor<[1,4096,12288],f16>
    // --- img_mlp.2: y = x @ W2^T + b2, computed in f32 ---
    // Flatten [1,4096,12288] -> [4096,12288] for the 2-D matmul.
    %int4096_5372 = torch.constant.int 4096
    %int12288_5373 = torch.constant.int 12288
    %3842 = torch.prim.ListConstruct %int4096_5372, %int12288_5373 : (!torch.int, !torch.int) -> !torch.list<int>
    %3843 = torch.aten.view %3841, %3842 : !torch.vtensor<[1,4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[4096,12288],f16>
    %__auto.sampler.double_blocks.10.img_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.10.img_mlp.2.weight : tensor<3072x12288xf16>
    %3844 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.img_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_5374 = torch.constant.int 0
    %int1_5375 = torch.constant.int 1
    %3845 = torch.aten.transpose.int %3844, %int0_5374, %int1_5375 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.10.img_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.10.img_mlp.2.bias : tensor<3072xf16>
    %3846 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.img_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, input and transposed weight to f32 (dtype code 6).
    %int6_5376 = torch.constant.int 6
    %3847 = torch.prims.convert_element_type %3846, %int6_5376 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_5377 = torch.constant.int 6
    %3848 = torch.prims.convert_element_type %3843, %int6_5377 : !torch.vtensor<[4096,12288],f16>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int6_5378 = torch.constant.int 6
    %3849 = torch.prims.convert_element_type %3845, %int6_5378 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %3850 = torch.aten.mm %3848, %3849 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Multiply-by-1 on both operands, then the bias add (same pattern as img_mlp.0).
    %int1_5379 = torch.constant.int 1
    %3851 = torch.aten.mul.Scalar %3850, %int1_5379 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_5380 = torch.constant.int 1
    %3852 = torch.aten.mul.Scalar %3847, %int1_5380 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_5381 = torch.constant.int 1
    %3853 = torch.aten.add.Tensor %3851, %3852, %int1_5381 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    // Cast back to f16 and restore the leading batch dimension.
    %int5_5382 = torch.constant.int 5
    %3854 = torch.prims.convert_element_type %3853, %int5_5382 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_5383 = torch.constant.int 1
    %int4096_5384 = torch.constant.int 4096
    %int3072_5385 = torch.constant.int 3072
    %3855 = torch.prim.ListConstruct %int1_5383, %int4096_5384, %int3072_5385 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3856 = torch.aten.view %3854, %3855 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Gated residual: %3858 = %3815 + %3602 * mlp_out. %3602 ([1,1,3072]) is
    // defined above this excerpt.
    // NOTE(review): presumably the MLP modulation gate -- confirm.
    %3857 = torch.aten.mul.Tensor %3602, %3856 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_5386 = torch.constant.int 1
    %3858 = torch.aten.add.Tensor %3815, %3857, %int1_5386 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // double_blocks.10 txt_attn.proj: linear layer applied to %3796.
    //   1. flatten %3796 from [1,512,3072] to [512,3072]
    //   2. compute x @ transpose(W) + b with all operands cast to f32
    //   3. cast the result to f16 and reshape back to [1,512,3072]
    // The projected tensor is then scaled by %3620 and added to %3581 (see the end of this segment).
    %int512_5387 = torch.constant.int 512
    %int3072_5388 = torch.constant.int 3072
    %3859 = torch.prim.ListConstruct %int512_5387, %int3072_5388 : (!torch.int, !torch.int) -> !torch.list<int>
    %3860 = torch.aten.view %3796, %3859 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Load the f16 weight parameter and swap its two dimensions so it can be the right-hand mm operand.
    %__auto.sampler.double_blocks.10.txt_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.10.txt_attn.proj.weight : tensor<3072x3072xf16>
    %3861 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.txt_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_5389 = torch.constant.int 0
    %int1_5390 = torch.constant.int 1
    %3862 = torch.aten.transpose.int %3861, %int0_5389, %int1_5390 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.10.txt_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.10.txt_attn.proj.bias : tensor<3072xf16>
    %3863 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.txt_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // dtype code 6 converts to f32 here (see result types): bias, activation and weight are all widened before the mm.
    %int6_5391 = torch.constant.int 6
    %3864 = torch.prims.convert_element_type %3863, %int6_5391 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_5392 = torch.constant.int 6
    %3865 = torch.prims.convert_element_type %3860, %int6_5392 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_5393 = torch.constant.int 6
    %3866 = torch.prims.convert_element_type %3862, %int6_5393 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %3867 = torch.aten.mm %3865, %3866 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Both the mm result and the bias are multiplied by the integer constant 1, which leaves the values unchanged.
    // NOTE(review): presumably the alpha/beta factors of a decomposed addmm -- confirm with the exporter.
    %int1_5394 = torch.constant.int 1
    %3868 = torch.aten.mul.Scalar %3867, %int1_5394 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_5395 = torch.constant.int 1
    %3869 = torch.aten.mul.Scalar %3864, %int1_5395 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_5396 = torch.constant.int 1
    %3870 = torch.aten.add.Tensor %3868, %3869, %int1_5396 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    // dtype code 5 converts back to f16 here (see result type).
    %int5_5397 = torch.constant.int 5
    %3871 = torch.prims.convert_element_type %3870, %int5_5397 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_5398 = torch.constant.int 1
    %int512_5399 = torch.constant.int 512
    %int3072_5400 = torch.constant.int 3072
    %3872 = torch.prim.ListConstruct %int1_5398, %int512_5399, %int3072_5400 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3873 = torch.aten.view %3871, %3872 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Scale by %3620 ([1,1,3072], broadcast over the 512 rows) and add to %3581 with alpha = 1.
    // NOTE(review): %3620 and %3581 are defined outside this excerpt; presumably the attention gate
    // and the incoming text-stream residual -- confirm.
    %3874 = torch.aten.mul.Tensor %3620, %3873 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_5401 = torch.constant.int 1
    %3875 = torch.aten.add.Tensor %3581, %3874, %int1_5401 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Normalize %3875 over its last dimension (size 3072) and apply a scale/shift modulation:
    //   %3885 = (1 + %3622) * f16((%3875 - mean) * rsqrt(var + eps)) + %3621
    // Statistics are computed in f32; no learned weight/bias is applied in this segment.
    // %3621 and %3622 are [1,1,3072] tensors defined outside this excerpt.
    %int1_5402 = torch.constant.int 1
    %int1_5403 = torch.constant.int 1
    %3876 = torch.aten.add.Scalar %3622, %int1_5402, %int1_5403 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int6_5404 = torch.constant.int 6
    %3877 = torch.prims.convert_element_type %3875, %int6_5404 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // Variance (result0) and mean (result1) over dim 2, correction = 0, keepdim = true -> [1,512,1].
    %int2_5405 = torch.constant.int 2
    %3878 = torch.prim.ListConstruct %int2_5405 : (!torch.int) -> !torch.list<int>
    %int0_5406 = torch.constant.int 0
    %true_5407 = torch.constant.bool true
    %result0_5408, %result1_5409 = torch.aten.var_mean.correction %3877, %3878, %int0_5406, %true_5407 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // eps (approximately 1e-6) is added to the variance before the reciprocal square root.
    %float9.999990e-07_5410 = torch.constant.float 9.9999999999999995E-7
    %int1_5411 = torch.constant.int 1
    %3879 = torch.aten.add.Scalar %result0_5408, %float9.999990e-07_5410, %int1_5411 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %3880 = torch.aten.rsqrt %3879 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // The subtraction takes the f16 tensor %3875 directly (not the f32 copy %3877); the result type is f32.
    %int1_5412 = torch.constant.int 1
    %3881 = torch.aten.sub.Tensor %3875, %result1_5409, %int1_5412 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %3882 = torch.aten.mul.Tensor %3881, %3880 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_5413 = torch.constant.int 5
    %3883 = torch.prims.convert_element_type %3882, %int5_5413 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Modulation in f16: multiply by (1 + %3622), then add %3621 (alpha = 1).
    %3884 = torch.aten.mul.Tensor %3876, %3883 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_5414 = torch.constant.int 1
    %3885 = torch.aten.add.Tensor %3884, %3621, %int1_5414 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // double_blocks.10 txt_mlp.0: linear layer 3072 -> 12288 applied to %3885, followed by GELU.
    // The activation is flattened to [512,3072], the product x @ transpose(W) + b is formed in f32,
    // and the result is cast to f16 and reshaped to [1,512,12288] before the activation.
    %int512_5415 = torch.constant.int 512
    %int3072_5416 = torch.constant.int 3072
    %3886 = torch.prim.ListConstruct %int512_5415, %int3072_5416 : (!torch.int, !torch.int) -> !torch.list<int>
    %3887 = torch.aten.view %3885, %3886 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Weight parameter is [12288,3072]; transposing dims 0 and 1 gives the [3072,12288] right-hand mm operand.
    %__auto.sampler.double_blocks.10.txt_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.10.txt_mlp.0.weight : tensor<12288x3072xf16>
    %3888 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.txt_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_5417 = torch.constant.int 0
    %int1_5418 = torch.constant.int 1
    %3889 = torch.aten.transpose.int %3888, %int0_5417, %int1_5418 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.10.txt_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.10.txt_mlp.0.bias : tensor<12288xf16>
    %3890 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.txt_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Widen bias, activation and weight to f32 (dtype code 6) for the matmul.
    %int6_5419 = torch.constant.int 6
    %3891 = torch.prims.convert_element_type %3890, %int6_5419 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_5420 = torch.constant.int 6
    %3892 = torch.prims.convert_element_type %3887, %int6_5420 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_5421 = torch.constant.int 6
    %3893 = torch.prims.convert_element_type %3889, %int6_5421 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %3894 = torch.aten.mm %3892, %3893 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[512,12288],f32>
    // Multiplications by the integer constant 1 leave the values unchanged.
    // NOTE(review): presumably the alpha/beta factors of a decomposed addmm -- confirm with the exporter.
    %int1_5422 = torch.constant.int 1
    %3895 = torch.aten.mul.Scalar %3894, %int1_5422 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int1_5423 = torch.constant.int 1
    %3896 = torch.aten.mul.Scalar %3891, %int1_5423 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_5424 = torch.constant.int 1
    %3897 = torch.aten.add.Tensor %3895, %3896, %int1_5424 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    // Narrow back to f16 (dtype code 5) and restore the leading batch dimension.
    %int5_5425 = torch.constant.int 5
    %3898 = torch.prims.convert_element_type %3897, %int5_5425 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f16>
    %int1_5426 = torch.constant.int 1
    %int512_5427 = torch.constant.int 512
    %int12288_5428 = torch.constant.int 12288
    %3899 = torch.prim.ListConstruct %int1_5426, %int512_5427, %int12288_5428 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3900 = torch.aten.view %3898, %3899 : !torch.vtensor<[512,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,512,12288],f16>
    // GELU with the "tanh" approximation, evaluated on the f16 tensor.
    %str_5429 = torch.constant.str "tanh"
    %3901 = torch.aten.gelu %3900, %str_5429 : !torch.vtensor<[1,512,12288],f16>, !torch.str -> !torch.vtensor<[1,512,12288],f16>
    // double_blocks.10 txt_mlp.2: linear layer 12288 -> 3072 applied to the GELU output %3901,
    // then scaled by %3623 and added to %3875 to produce %3918.
    // Same pattern as the other linears in this file: flatten, f32 matmul + bias, cast to f16, reshape.
    %int512_5430 = torch.constant.int 512
    %int12288_5431 = torch.constant.int 12288
    %3902 = torch.prim.ListConstruct %int512_5430, %int12288_5431 : (!torch.int, !torch.int) -> !torch.list<int>
    %3903 = torch.aten.view %3901, %3902 : !torch.vtensor<[1,512,12288],f16>, !torch.list<int> -> !torch.vtensor<[512,12288],f16>
    // Weight parameter is [3072,12288]; transposing dims 0 and 1 gives the [12288,3072] right-hand mm operand.
    %__auto.sampler.double_blocks.10.txt_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.10.txt_mlp.2.weight : tensor<3072x12288xf16>
    %3904 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.txt_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_5432 = torch.constant.int 0
    %int1_5433 = torch.constant.int 1
    %3905 = torch.aten.transpose.int %3904, %int0_5432, %int1_5433 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.10.txt_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.10.txt_mlp.2.bias : tensor<3072xf16>
    %3906 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.txt_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Widen bias, activation and weight to f32 (dtype code 6) for the matmul.
    %int6_5434 = torch.constant.int 6
    %3907 = torch.prims.convert_element_type %3906, %int6_5434 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_5435 = torch.constant.int 6
    %3908 = torch.prims.convert_element_type %3903, %int6_5435 : !torch.vtensor<[512,12288],f16>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int6_5436 = torch.constant.int 6
    %3909 = torch.prims.convert_element_type %3905, %int6_5436 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %3910 = torch.aten.mm %3908, %3909 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Multiplications by the integer constant 1 leave the values unchanged.
    // NOTE(review): presumably the alpha/beta factors of a decomposed addmm -- confirm with the exporter.
    %int1_5437 = torch.constant.int 1
    %3911 = torch.aten.mul.Scalar %3910, %int1_5437 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_5438 = torch.constant.int 1
    %3912 = torch.aten.mul.Scalar %3907, %int1_5438 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_5439 = torch.constant.int 1
    %3913 = torch.aten.add.Tensor %3911, %3912, %int1_5439 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    // Narrow back to f16 (dtype code 5) and restore the leading batch dimension.
    %int5_5440 = torch.constant.int 5
    %3914 = torch.prims.convert_element_type %3913, %int5_5440 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_5441 = torch.constant.int 1
    %int512_5442 = torch.constant.int 512
    %int3072_5443 = torch.constant.int 3072
    %3915 = torch.prim.ListConstruct %int1_5441, %int512_5442, %int3072_5443 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3916 = torch.aten.view %3914, %3915 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Scale by %3623 ([1,1,3072], broadcast over the 512 rows) and add to %3875 with alpha = 1.
    // NOTE(review): %3623 is defined outside this excerpt; presumably the MLP gate of the text modulation -- confirm.
    %3917 = torch.aten.mul.Tensor %3623, %3916 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_5444 = torch.constant.int 1
    %3918 = torch.aten.add.Tensor %3875, %3917, %int1_5444 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // double_blocks.11 img_mod: SiLU(%119) followed by a linear layer 3072 -> 18432, whose output is
    // split along the last dimension into six contiguous [1,1,3072] chunks (%3934 .. %3939).
    // %119 is a [1,3072] f16 tensor defined outside this excerpt.
    // NOTE(review): the six chunks are presumably (shift, scale, gate) for two sub-layers; within this
    // excerpt only %3934 (added as a shift) and %3935 (used as 1 + scale) are consumed -- confirm the rest downstream.
    %3919 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Weight parameter is [18432,3072]; transposing dims 0 and 1 gives the [3072,18432] right-hand mm operand.
    %__auto.sampler.double_blocks.11.img_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.11.img_mod.lin.weight : tensor<18432x3072xf16>
    %3920 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.img_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_5445 = torch.constant.int 0
    %int1_5446 = torch.constant.int 1
    %3921 = torch.aten.transpose.int %3920, %int0_5445, %int1_5446 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.11.img_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.11.img_mod.lin.bias : tensor<18432xf16>
    %3922 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.img_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Widen bias, activation and weight to f32 (dtype code 6) for the matmul.
    %int6_5447 = torch.constant.int 6
    %3923 = torch.prims.convert_element_type %3922, %int6_5447 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_5448 = torch.constant.int 6
    %3924 = torch.prims.convert_element_type %3919, %int6_5448 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_5449 = torch.constant.int 6
    %3925 = torch.prims.convert_element_type %3921, %int6_5449 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %3926 = torch.aten.mm %3924, %3925 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Multiplications by the integer constant 1 leave the values unchanged.
    // NOTE(review): presumably the alpha/beta factors of a decomposed addmm -- confirm with the exporter.
    %int1_5450 = torch.constant.int 1
    %3927 = torch.aten.mul.Scalar %3926, %int1_5450 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_5451 = torch.constant.int 1
    %3928 = torch.aten.mul.Scalar %3923, %int1_5451 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_5452 = torch.constant.int 1
    %3929 = torch.aten.add.Tensor %3927, %3928, %int1_5452 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_5453 = torch.constant.int 5
    %3930 = torch.prims.convert_element_type %3929, %int5_5453 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): shape is unchanged.
    %int0_5454 = torch.constant.int 0
    %int0_5455 = torch.constant.int 0
    %int9223372036854775807_5456 = torch.constant.int 9223372036854775807
    %int1_5457 = torch.constant.int 1
    %3931 = torch.aten.slice.Tensor %3930, %int0_5454, %int0_5455, %int9223372036854775807_5456, %int1_5457 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a singleton dimension at position 1: [1,18432] -> [1,1,18432].
    %int1_5458 = torch.constant.int 1
    %3932 = torch.aten.unsqueeze %3931, %int1_5458 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Full-range slice on dim 2: shape is unchanged.
    %int2_5459 = torch.constant.int 2
    %int0_5460 = torch.constant.int 0
    %int9223372036854775807_5461 = torch.constant.int 9223372036854775807
    %int1_5462 = torch.constant.int 1
    %3933 = torch.aten.slice.Tensor %3932, %int2_5459, %int0_5460, %int9223372036854775807_5461, %int1_5462 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Chunk 0: elements [0, 3072) of the last dimension.
    %int-1_5463 = torch.constant.int -1
    %int0_5464 = torch.constant.int 0
    %int3072_5465 = torch.constant.int 3072
    %int1_5466 = torch.constant.int 1
    %3934 = torch.aten.slice.Tensor %3933, %int-1_5463, %int0_5464, %int3072_5465, %int1_5466 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1: elements [3072, 6144).
    %int-1_5467 = torch.constant.int -1
    %int3072_5468 = torch.constant.int 3072
    %int6144_5469 = torch.constant.int 6144
    %int1_5470 = torch.constant.int 1
    %3935 = torch.aten.slice.Tensor %3933, %int-1_5467, %int3072_5468, %int6144_5469, %int1_5470 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2: elements [6144, 9216).
    %int-1_5471 = torch.constant.int -1
    %int6144_5472 = torch.constant.int 6144
    %int9216_5473 = torch.constant.int 9216
    %int1_5474 = torch.constant.int 1
    %3936 = torch.aten.slice.Tensor %3933, %int-1_5471, %int6144_5472, %int9216_5473, %int1_5474 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 3: elements [9216, 12288).
    %int-1_5475 = torch.constant.int -1
    %int9216_5476 = torch.constant.int 9216
    %int12288_5477 = torch.constant.int 12288
    %int1_5478 = torch.constant.int 1
    %3937 = torch.aten.slice.Tensor %3933, %int-1_5475, %int9216_5476, %int12288_5477, %int1_5478 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 4: elements [12288, 15360).
    %int-1_5479 = torch.constant.int -1
    %int12288_5480 = torch.constant.int 12288
    %int15360_5481 = torch.constant.int 15360
    %int1_5482 = torch.constant.int 1
    %3938 = torch.aten.slice.Tensor %3933, %int-1_5479, %int12288_5480, %int15360_5481, %int1_5482 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 5: elements [15360, 18432).
    %int-1_5483 = torch.constant.int -1
    %int15360_5484 = torch.constant.int 15360
    %int18432_5485 = torch.constant.int 18432
    %int1_5486 = torch.constant.int 1
    %3939 = torch.aten.slice.Tensor %3933, %int-1_5483, %int15360_5484, %int18432_5485, %int1_5486 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // double_blocks.11 txt_mod: SiLU(%119) followed by a linear layer 3072 -> 18432, whose output is
    // split along the last dimension into six contiguous [1,1,3072] chunks (%3955 .. %3960).
    // %119 is a [1,3072] f16 tensor defined outside this excerpt; the SiLU is applied to the same
    // operand as in the img_mod computation of this block and is simply recomputed here.
    // NOTE(review): the six chunks are presumably (shift, scale, gate) for two sub-layers; their
    // consumers lie beyond this excerpt -- confirm downstream.
    %3940 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Weight parameter is [18432,3072]; transposing dims 0 and 1 gives the [3072,18432] right-hand mm operand.
    %__auto.sampler.double_blocks.11.txt_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.11.txt_mod.lin.weight : tensor<18432x3072xf16>
    %3941 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.txt_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_5487 = torch.constant.int 0
    %int1_5488 = torch.constant.int 1
    %3942 = torch.aten.transpose.int %3941, %int0_5487, %int1_5488 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.11.txt_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.11.txt_mod.lin.bias : tensor<18432xf16>
    %3943 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.txt_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Widen bias, activation and weight to f32 (dtype code 6) for the matmul.
    %int6_5489 = torch.constant.int 6
    %3944 = torch.prims.convert_element_type %3943, %int6_5489 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_5490 = torch.constant.int 6
    %3945 = torch.prims.convert_element_type %3940, %int6_5490 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_5491 = torch.constant.int 6
    %3946 = torch.prims.convert_element_type %3942, %int6_5491 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %3947 = torch.aten.mm %3945, %3946 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Multiplications by the integer constant 1 leave the values unchanged.
    // NOTE(review): presumably the alpha/beta factors of a decomposed addmm -- confirm with the exporter.
    %int1_5492 = torch.constant.int 1
    %3948 = torch.aten.mul.Scalar %3947, %int1_5492 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_5493 = torch.constant.int 1
    %3949 = torch.aten.mul.Scalar %3944, %int1_5493 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_5494 = torch.constant.int 1
    %3950 = torch.aten.add.Tensor %3948, %3949, %int1_5494 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_5495 = torch.constant.int 5
    %3951 = torch.prims.convert_element_type %3950, %int5_5495 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): shape is unchanged.
    %int0_5496 = torch.constant.int 0
    %int0_5497 = torch.constant.int 0
    %int9223372036854775807_5498 = torch.constant.int 9223372036854775807
    %int1_5499 = torch.constant.int 1
    %3952 = torch.aten.slice.Tensor %3951, %int0_5496, %int0_5497, %int9223372036854775807_5498, %int1_5499 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a singleton dimension at position 1: [1,18432] -> [1,1,18432].
    %int1_5500 = torch.constant.int 1
    %3953 = torch.aten.unsqueeze %3952, %int1_5500 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Full-range slice on dim 2: shape is unchanged.
    %int2_5501 = torch.constant.int 2
    %int0_5502 = torch.constant.int 0
    %int9223372036854775807_5503 = torch.constant.int 9223372036854775807
    %int1_5504 = torch.constant.int 1
    %3954 = torch.aten.slice.Tensor %3953, %int2_5501, %int0_5502, %int9223372036854775807_5503, %int1_5504 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Chunk 0: elements [0, 3072) of the last dimension.
    %int-1_5505 = torch.constant.int -1
    %int0_5506 = torch.constant.int 0
    %int3072_5507 = torch.constant.int 3072
    %int1_5508 = torch.constant.int 1
    %3955 = torch.aten.slice.Tensor %3954, %int-1_5505, %int0_5506, %int3072_5507, %int1_5508 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1: elements [3072, 6144).
    %int-1_5509 = torch.constant.int -1
    %int3072_5510 = torch.constant.int 3072
    %int6144_5511 = torch.constant.int 6144
    %int1_5512 = torch.constant.int 1
    %3956 = torch.aten.slice.Tensor %3954, %int-1_5509, %int3072_5510, %int6144_5511, %int1_5512 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2: elements [6144, 9216).
    %int-1_5513 = torch.constant.int -1
    %int6144_5514 = torch.constant.int 6144
    %int9216_5515 = torch.constant.int 9216
    %int1_5516 = torch.constant.int 1
    %3957 = torch.aten.slice.Tensor %3954, %int-1_5513, %int6144_5514, %int9216_5515, %int1_5516 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 3: elements [9216, 12288).
    %int-1_5517 = torch.constant.int -1
    %int9216_5518 = torch.constant.int 9216
    %int12288_5519 = torch.constant.int 12288
    %int1_5520 = torch.constant.int 1
    %3958 = torch.aten.slice.Tensor %3954, %int-1_5517, %int9216_5518, %int12288_5519, %int1_5520 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 4: elements [12288, 15360).
    %int-1_5521 = torch.constant.int -1
    %int12288_5522 = torch.constant.int 12288
    %int15360_5523 = torch.constant.int 15360
    %int1_5524 = torch.constant.int 1
    %3959 = torch.aten.slice.Tensor %3954, %int-1_5521, %int12288_5522, %int15360_5523, %int1_5524 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 5: elements [15360, 18432).
    %int-1_5525 = torch.constant.int -1
    %int15360_5526 = torch.constant.int 15360
    %int18432_5527 = torch.constant.int 18432
    %int1_5528 = torch.constant.int 1
    %3960 = torch.aten.slice.Tensor %3954, %int-1_5525, %int15360_5526, %int18432_5527, %int1_5528 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %3858 over its last dimension (size 3072) and apply a scale/shift modulation using
    // two chunks of the double_blocks.11 img_mod output:
    //   %3970 = (1 + %3935) * f16((%3858 - mean) * rsqrt(var + eps)) + %3934
    // Statistics are computed in f32; no learned weight/bias is applied in this segment.
    %int6_5529 = torch.constant.int 6
    %3961 = torch.prims.convert_element_type %3858, %int6_5529 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // Variance (result0) and mean (result1) over dim 2, correction = 0, keepdim = true -> [1,4096,1].
    %int2_5530 = torch.constant.int 2
    %3962 = torch.prim.ListConstruct %int2_5530 : (!torch.int) -> !torch.list<int>
    %int0_5531 = torch.constant.int 0
    %true_5532 = torch.constant.bool true
    %result0_5533, %result1_5534 = torch.aten.var_mean.correction %3961, %3962, %int0_5531, %true_5532 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // eps (approximately 1e-6) is added to the variance before the reciprocal square root.
    %float9.999990e-07_5535 = torch.constant.float 9.9999999999999995E-7
    %int1_5536 = torch.constant.int 1
    %3963 = torch.aten.add.Scalar %result0_5533, %float9.999990e-07_5535, %int1_5536 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %3964 = torch.aten.rsqrt %3963 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // The subtraction takes the f16 tensor %3858 directly (not the f32 copy %3961); the result type is f32.
    %int1_5537 = torch.constant.int 1
    %3965 = torch.aten.sub.Tensor %3858, %result1_5534, %int1_5537 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %3966 = torch.aten.mul.Tensor %3965, %3964 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    %int5_5538 = torch.constant.int 5
    %3967 = torch.prims.convert_element_type %3966, %int5_5538 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Modulation in f16: multiply by (1 + %3935), then add %3934 (alpha = 1).
    %int1_5539 = torch.constant.int 1
    %int1_5540 = torch.constant.int 1
    %3968 = torch.aten.add.Scalar %3935, %int1_5539, %int1_5540 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %3969 = torch.aten.mul.Tensor %3968, %3967 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_5541 = torch.constant.int 1
    %3970 = torch.aten.add.Tensor %3969, %3934, %int1_5541 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // double_blocks.11 img_attn.qkv: linear layer 3072 -> 9216 applied to the modulated tensor %3970.
    // The activation is flattened to [4096,3072], the product x @ transpose(W) + b is formed in f32,
    // and the result is cast to f16 and reshaped to [1,4096,9216]. The result is then passed through
    // a trace point labelled "img_qkv" before being handed back to the torch dialect as %3987.
    %int4096_5542 = torch.constant.int 4096
    %int3072_5543 = torch.constant.int 3072
    %3971 = torch.prim.ListConstruct %int4096_5542, %int3072_5543 : (!torch.int, !torch.int) -> !torch.list<int>
    %3972 = torch.aten.view %3970, %3971 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Weight parameter is [9216,3072]; transposing dims 0 and 1 gives the [3072,9216] right-hand mm operand.
    %__auto.sampler.double_blocks.11.img_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.11.img_attn.qkv.weight : tensor<9216x3072xf16>
    %3973 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.img_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_5544 = torch.constant.int 0
    %int1_5545 = torch.constant.int 1
    %3974 = torch.aten.transpose.int %3973, %int0_5544, %int1_5545 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.11.img_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.11.img_attn.qkv.bias : tensor<9216xf16>
    %3975 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.img_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Widen bias, activation and weight to f32 (dtype code 6) for the matmul.
    %int6_5546 = torch.constant.int 6
    %3976 = torch.prims.convert_element_type %3975, %int6_5546 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_5547 = torch.constant.int 6
    %3977 = torch.prims.convert_element_type %3972, %int6_5547 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_5548 = torch.constant.int 6
    %3978 = torch.prims.convert_element_type %3974, %int6_5548 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %3979 = torch.aten.mm %3977, %3978 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[4096,9216],f32>
    // Multiplications by the integer constant 1 leave the values unchanged.
    // NOTE(review): presumably the alpha/beta factors of a decomposed addmm -- confirm with the exporter.
    %int1_5549 = torch.constant.int 1
    %3980 = torch.aten.mul.Scalar %3979, %int1_5549 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int1_5550 = torch.constant.int 1
    %3981 = torch.aten.mul.Scalar %3976, %int1_5550 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_5551 = torch.constant.int 1
    %3982 = torch.aten.add.Tensor %3980, %3981, %int1_5551 : !torch.vtensor<[4096,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    // Narrow back to f16 (dtype code 5) and restore the leading batch dimension.
    %int5_5552 = torch.constant.int 5
    %3983 = torch.prims.convert_element_type %3982, %int5_5552 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f16>
    %int1_5553 = torch.constant.int 1
    %int4096_5554 = torch.constant.int 4096
    %int9216_5555 = torch.constant.int 9216
    %3984 = torch.prim.ListConstruct %int1_5553, %int4096_5554, %int9216_5555 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3985 = torch.aten.view %3983, %3984 : !torch.vtensor<[4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,9216],f16>
    // Trace point: convert to a builtin tensor, cast to a fully dynamic shape, query the three
    // dimensions, and emit them with the tensor under the key "img_qkv".
    %3986 = torch_c.to_builtin_tensor %3985 : !torch.vtensor<[1,4096,9216],f16> -> tensor<1x4096x9216xf16>
    %cast_5556 = tensor.cast %3986 : tensor<1x4096x9216xf16> to tensor<?x?x?xf16>
    %c0_5557 = arith.constant 0 : index
    %dim_5558 = tensor.dim %cast_5556, %c0_5557 : tensor<?x?x?xf16>
    %c1_5559 = arith.constant 1 : index
    %dim_5560 = tensor.dim %cast_5556, %c1_5559 : tensor<?x?x?xf16>
    %c2_5561 = arith.constant 2 : index
    %dim_5562 = tensor.dim %cast_5556, %c2_5561 : tensor<?x?x?xf16>
    flow.tensor.trace "img_qkv" = [%cast_5556 : tensor<?x?x?xf16>{%dim_5558, %dim_5560, %dim_5562}]
    // Cast back to the static shape and re-enter the torch dialect; downstream ops use %3987, not %3985.
    %cast_5563 = tensor.cast %cast_5556 : tensor<?x?x?xf16> to tensor<1x4096x9216xf16>
    %3987 = torch_c.from_builtin_tensor %cast_5563 : tensor<1x4096x9216xf16> -> !torch.vtensor<[1,4096,9216],f16>
    // Extract slice 0 of the fused projection %3987 and RMS-normalize it over the last dimension:
    //   1. view [1,4096,9216] as [1,4096,3,24,128]
    //   2. permute with order (2,0,3,1,4) -> [3,1,24,4096,128]
    //   3. select index 0 along dim 0 -> [1,24,4096,128]
    //   4. in f32: x * rsqrt(mean(x^2, dim=-1, keepdim) + eps), eps approximately 1e-6
    //   5. cast to f16 and multiply by the [128] parameter img_attn.norm.query_norm.scale
    // Slice 0 is the query, as indicated by the query_norm.scale parameter applied to it.
    %int1_5564 = torch.constant.int 1
    %int4096_5565 = torch.constant.int 4096
    %int3_5566 = torch.constant.int 3
    %int24_5567 = torch.constant.int 24
    %int128_5568 = torch.constant.int 128
    %3988 = torch.prim.ListConstruct %int1_5564, %int4096_5565, %int3_5566, %int24_5567, %int128_5568 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3989 = torch.aten.view %3987, %3988 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_5569 = torch.constant.int 2
    %int0_5570 = torch.constant.int 0
    %int3_5571 = torch.constant.int 3
    %int1_5572 = torch.constant.int 1
    %int4_5573 = torch.constant.int 4
    %3990 = torch.prim.ListConstruct %int2_5569, %int0_5570, %int3_5571, %int1_5572, %int4_5573 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3991 = torch.aten.permute %3989, %3990 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // select.int operands are (tensor, dim = 0, index = 0).
    %int0_5574 = torch.constant.int 0
    %int0_5575 = torch.constant.int 0
    %3992 = torch.aten.select.int %3991, %int0_5574, %int0_5575 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // RMS normalization in f32 (no mean subtraction): square, mean over the last dim, add eps, rsqrt, scale.
    %int6_5576 = torch.constant.int 6
    %3993 = torch.prims.convert_element_type %3992, %int6_5576 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_5577 = torch.constant.int 2
    %3994 = torch.aten.pow.Tensor_Scalar %3993, %int2_5577 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_5578 = torch.constant.int -1
    %3995 = torch.prim.ListConstruct %int-1_5578 : (!torch.int) -> !torch.list<int>
    %true_5579 = torch.constant.bool true
    %none_5580 = torch.constant.none
    %3996 = torch.aten.mean.dim %3994, %3995, %true_5579, %none_5580 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_5581 = torch.constant.float 9.9999999999999995E-7
    %int1_5582 = torch.constant.int 1
    %3997 = torch.aten.add.Scalar %3996, %float9.999990e-07_5581, %int1_5582 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %3998 = torch.aten.rsqrt %3997 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %3999 = torch.aten.mul.Tensor %3993, %3998 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    %int5_5583 = torch.constant.int 5
    %4000 = torch.prims.convert_element_type %3999, %int5_5583 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Learned per-channel scale ([128], broadcast over the leading dims), applied in f16.
    %__auto.sampler.double_blocks.11.img_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.11.img_attn.norm.query_norm.scale : tensor<128xf16>
    %4001 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.img_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %4002 = torch.aten.mul.Tensor %4000, %4001 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // img-stream key: re-view the fused projection %3987 ([1,4096,9216], defined
    // above this chunk) as [1,4096,3,24,128], permute with (2,0,3,1,4) to
    // [3,1,24,4096,128], and take index 1 along dim 0. That slice is then RMS
    // normalized in f32 and scaled by img_attn.norm.key_norm.scale.
    %int1_5584 = torch.constant.int 1
    %int4096_5585 = torch.constant.int 4096
    %int3_5586 = torch.constant.int 3
    %int24_5587 = torch.constant.int 24
    %int128_5588 = torch.constant.int 128
    %4003 = torch.prim.ListConstruct %int1_5584, %int4096_5585, %int3_5586, %int24_5587, %int128_5588 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4004 = torch.aten.view %3987, %4003 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_5589 = torch.constant.int 2
    %int0_5590 = torch.constant.int 0
    %int3_5591 = torch.constant.int 3
    %int1_5592 = torch.constant.int 1
    %int4_5593 = torch.constant.int 4
    %4005 = torch.prim.ListConstruct %int2_5589, %int0_5590, %int3_5591, %int1_5592, %int4_5593 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4006 = torch.aten.permute %4004, %4005 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // select(dim = 0, index = 1): the second of the three fused slices.
    %int0_5594 = torch.constant.int 0
    %int1_5595 = torch.constant.int 1
    %4007 = torch.aten.select.int %4006, %int0_5594, %int1_5595 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Upcast to f32 (dtype code 6) so the reduction below runs in f32.
    %int6_5596 = torch.constant.int 6
    %4008 = torch.prims.convert_element_type %4007, %int6_5596 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    // mean(x^2) over the last dim with keepdim = true.
    %int2_5597 = torch.constant.int 2
    %4009 = torch.aten.pow.Tensor_Scalar %4008, %int2_5597 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_5598 = torch.constant.int -1
    %4010 = torch.prim.ListConstruct %int-1_5598 : (!torch.int) -> !torch.list<int>
    %true_5599 = torch.constant.bool true
    %none_5600 = torch.constant.none
    %4011 = torch.aten.mean.dim %4009, %4010, %true_5599, %none_5600 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    // x * rsqrt(mean(x^2) + eps), with eps ~= 1e-6 and alpha = 1.
    %float9.999990e-07_5601 = torch.constant.float 9.9999999999999995E-7
    %int1_5602 = torch.constant.int 1
    %4012 = torch.aten.add.Scalar %4011, %float9.999990e-07_5601, %int1_5602 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %4013 = torch.aten.rsqrt %4012 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %4014 = torch.aten.mul.Tensor %4008, %4013 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Back to f16 (dtype code 5), then apply the per-channel scale of length 128.
    %int5_5603 = torch.constant.int 5
    %4015 = torch.prims.convert_element_type %4014, %int5_5603 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.11.img_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.11.img_attn.norm.key_norm.scale : tensor<128xf16>
    %4016 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.img_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %4017 = torch.aten.mul.Tensor %4015, %4016 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // Element-type conversions of the normalized img query (%4002) and key
    // (%4017) to dtype code 5. Both operands are already f16 and both results are
    // f16, so the element type is unchanged; the results %4018 / %4019 are what
    // the later concatenations consume.
    %int5_5604 = torch.constant.int 5
    %4018 = torch.prims.convert_element_type %4002, %int5_5604 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_5605 = torch.constant.int 5
    %4019 = torch.prims.convert_element_type %4017, %int5_5605 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // txt-stream pre-attention normalization of %3918 ([1,512,3072] f16, defined
    // above this chunk): mean/variance normalization over dim 2 computed in f32,
    // followed by an elementwise scale-and-shift with %3956 and %3955.
    // NOTE(review): %3956 / %3955 ([1,1,3072]) look like the txt modulation scale
    // and shift; they are defined above this chunk — confirm.
    %int6_5606 = torch.constant.int 6
    %4020 = torch.prims.convert_element_type %3918, %int6_5606 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // var_mean over dim 2 with correction = 0 and keepdim = true.
    // result0 is used as the variance below, result1 as the mean.
    %int2_5607 = torch.constant.int 2
    %4021 = torch.prim.ListConstruct %int2_5607 : (!torch.int) -> !torch.list<int>
    %int0_5608 = torch.constant.int 0
    %true_5609 = torch.constant.bool true
    %result0_5610, %result1_5611 = torch.aten.var_mean.correction %4020, %4021, %int0_5608, %true_5609 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(var + eps) with eps ~= 1e-6.
    %float9.999990e-07_5612 = torch.constant.float 9.9999999999999995E-7
    %int1_5613 = torch.constant.int 1
    %4022 = torch.aten.add.Scalar %result0_5610, %float9.999990e-07_5612, %int1_5613 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %4023 = torch.aten.rsqrt %4022 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // (x - mean) * rsqrt(var + eps). The subtraction takes the original f16 %3918
    // and the f32 mean and yields an f32 result.
    %int1_5614 = torch.constant.int 1
    %4024 = torch.aten.sub.Tensor %3918, %result1_5611, %int1_5614 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %4025 = torch.aten.mul.Tensor %4024, %4023 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_5615 = torch.constant.int 5
    %4026 = torch.prims.convert_element_type %4025, %int5_5615 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // (%3956 + 1) * normalized + %3955, all in f16 with broadcasting over dim 1.
    %int1_5616 = torch.constant.int 1
    %int1_5617 = torch.constant.int 1
    %4027 = torch.aten.add.Scalar %3956, %int1_5616, %int1_5617 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %4028 = torch.aten.mul.Tensor %4027, %4026 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_5618 = torch.constant.int 1
    %4029 = torch.aten.add.Tensor %4028, %3955, %int1_5618 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // txt-stream fused qkv projection: flatten %4029 to [512,3072], multiply by
    // the transposed txt_attn.qkv.weight ([9216,3072] -> [3072,9216]) in f32, add
    // txt_attn.qkv.bias, cast the result to f16 and restore the leading batch dim
    // to get [1,512,9216].
    %int512_5619 = torch.constant.int 512
    %int3072_5620 = torch.constant.int 3072
    %4030 = torch.prim.ListConstruct %int512_5619, %int3072_5620 : (!torch.int, !torch.int) -> !torch.list<int>
    %4031 = torch.aten.view %4029, %4030 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.11.txt_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.11.txt_attn.qkv.weight : tensor<9216x3072xf16>
    %4032 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.txt_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_5621 = torch.constant.int 0
    %int1_5622 = torch.constant.int 1
    %4033 = torch.aten.transpose.int %4032, %int0_5621, %int1_5622 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.11.txt_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.11.txt_attn.qkv.bias : tensor<9216xf16>
    %4034 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.txt_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Bias, activations and weight are all upcast to f32 (dtype code 6) before
    // the matmul.
    %int6_5623 = torch.constant.int 6
    %4035 = torch.prims.convert_element_type %4034, %int6_5623 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_5624 = torch.constant.int 6
    %4036 = torch.prims.convert_element_type %4031, %int6_5624 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_5625 = torch.constant.int 6
    %4037 = torch.prims.convert_element_type %4033, %int6_5625 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %4038 = torch.aten.mm %4036, %4037 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[512,9216],f32>
    // Both operands are multiplied by the integer constant 1 before the add.
    // NOTE(review): presumably the alpha/beta factors left by an addmm
    // decomposition — confirm with the exporter.
    %int1_5626 = torch.constant.int 1
    %4039 = torch.aten.mul.Scalar %4038, %int1_5626 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int1_5627 = torch.constant.int 1
    %4040 = torch.aten.mul.Scalar %4035, %int1_5627 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_5628 = torch.constant.int 1
    %4041 = torch.aten.add.Tensor %4039, %4040, %int1_5628 : !torch.vtensor<[512,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int5_5629 = torch.constant.int 5
    %4042 = torch.prims.convert_element_type %4041, %int5_5629 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f16>
    %int1_5630 = torch.constant.int 1
    %int512_5631 = torch.constant.int 512
    %int9216_5632 = torch.constant.int 9216
    %4043 = torch.prim.ListConstruct %int1_5630, %int512_5631, %int9216_5632 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4044 = torch.aten.view %4042, %4043 : !torch.vtensor<[512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,9216],f16>
    // Trace point "txt_qkv": convert %4044 to a builtin tensor, cast it to a
    // fully dynamic shape, query its three dims, and hand tensor plus dims to
    // flow.tensor.trace. The value is then cast back to the static
    // 1x512x9216 shape and re-wrapped as %4046, which the q/k/v splits use.
    %4045 = torch_c.to_builtin_tensor %4044 : !torch.vtensor<[1,512,9216],f16> -> tensor<1x512x9216xf16>
    %cast_5633 = tensor.cast %4045 : tensor<1x512x9216xf16> to tensor<?x?x?xf16>
    %c0_5634 = arith.constant 0 : index
    %dim_5635 = tensor.dim %cast_5633, %c0_5634 : tensor<?x?x?xf16>
    %c1_5636 = arith.constant 1 : index
    %dim_5637 = tensor.dim %cast_5633, %c1_5636 : tensor<?x?x?xf16>
    %c2_5638 = arith.constant 2 : index
    %dim_5639 = tensor.dim %cast_5633, %c2_5638 : tensor<?x?x?xf16>
    flow.tensor.trace "txt_qkv" = [%cast_5633 : tensor<?x?x?xf16>{%dim_5635, %dim_5637, %dim_5639}]
    %cast_5640 = tensor.cast %cast_5633 : tensor<?x?x?xf16> to tensor<1x512x9216xf16>
    %4046 = torch_c.from_builtin_tensor %cast_5640 : tensor<1x512x9216xf16> -> !torch.vtensor<[1,512,9216],f16>
    // txt-stream query: view %4046 as [1,512,3,24,128], permute with (2,0,3,1,4)
    // to [3,1,24,512,128], take index 0 along dim 0, then RMS normalize in f32
    // and scale by txt_attn.norm.query_norm.scale.
    %int1_5641 = torch.constant.int 1
    %int512_5642 = torch.constant.int 512
    %int3_5643 = torch.constant.int 3
    %int24_5644 = torch.constant.int 24
    %int128_5645 = torch.constant.int 128
    %4047 = torch.prim.ListConstruct %int1_5641, %int512_5642, %int3_5643, %int24_5644, %int128_5645 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4048 = torch.aten.view %4046, %4047 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_5646 = torch.constant.int 2
    %int0_5647 = torch.constant.int 0
    %int3_5648 = torch.constant.int 3
    %int1_5649 = torch.constant.int 1
    %int4_5650 = torch.constant.int 4
    %4049 = torch.prim.ListConstruct %int2_5646, %int0_5647, %int3_5648, %int1_5649, %int4_5650 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4050 = torch.aten.permute %4048, %4049 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // select(dim = 0, index = 0): the first of the three fused slices.
    %int0_5651 = torch.constant.int 0
    %int0_5652 = torch.constant.int 0
    %4051 = torch.aten.select.int %4050, %int0_5651, %int0_5652 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int6_5653 = torch.constant.int 6
    %4052 = torch.prims.convert_element_type %4051, %int6_5653 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    // mean(x^2) over the last dim with keepdim = true.
    %int2_5654 = torch.constant.int 2
    %4053 = torch.aten.pow.Tensor_Scalar %4052, %int2_5654 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_5655 = torch.constant.int -1
    %4054 = torch.prim.ListConstruct %int-1_5655 : (!torch.int) -> !torch.list<int>
    %true_5656 = torch.constant.bool true
    %none_5657 = torch.constant.none
    %4055 = torch.aten.mean.dim %4053, %4054, %true_5656, %none_5657 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    // x * rsqrt(mean(x^2) + eps), with eps ~= 1e-6.
    %float9.999990e-07_5658 = torch.constant.float 9.9999999999999995E-7
    %int1_5659 = torch.constant.int 1
    %4056 = torch.aten.add.Scalar %4055, %float9.999990e-07_5658, %int1_5659 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %4057 = torch.aten.rsqrt %4056 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %4058 = torch.aten.mul.Tensor %4052, %4057 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    // Back to f16, then apply the per-channel scale of length 128.
    %int5_5660 = torch.constant.int 5
    %4059 = torch.prims.convert_element_type %4058, %int5_5660 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.11.txt_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.11.txt_attn.norm.query_norm.scale : tensor<128xf16>
    %4060 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.txt_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %4061 = torch.aten.mul.Tensor %4059, %4060 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // txt-stream key: the same view + permute of %4046 as the query path, but
    // taking index 1 along dim 0, followed by RMS normalization in f32 and
    // scaling by txt_attn.norm.key_norm.scale.
    %int1_5661 = torch.constant.int 1
    %int512_5662 = torch.constant.int 512
    %int3_5663 = torch.constant.int 3
    %int24_5664 = torch.constant.int 24
    %int128_5665 = torch.constant.int 128
    %4062 = torch.prim.ListConstruct %int1_5661, %int512_5662, %int3_5663, %int24_5664, %int128_5665 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4063 = torch.aten.view %4046, %4062 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_5666 = torch.constant.int 2
    %int0_5667 = torch.constant.int 0
    %int3_5668 = torch.constant.int 3
    %int1_5669 = torch.constant.int 1
    %int4_5670 = torch.constant.int 4
    %4064 = torch.prim.ListConstruct %int2_5666, %int0_5667, %int3_5668, %int1_5669, %int4_5670 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4065 = torch.aten.permute %4063, %4064 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // select(dim = 0, index = 1): the second of the three fused slices.
    %int0_5671 = torch.constant.int 0
    %int1_5672 = torch.constant.int 1
    %4066 = torch.aten.select.int %4065, %int0_5671, %int1_5672 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int6_5673 = torch.constant.int 6
    %4067 = torch.prims.convert_element_type %4066, %int6_5673 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    // mean(x^2) over the last dim with keepdim = true.
    %int2_5674 = torch.constant.int 2
    %4068 = torch.aten.pow.Tensor_Scalar %4067, %int2_5674 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_5675 = torch.constant.int -1
    %4069 = torch.prim.ListConstruct %int-1_5675 : (!torch.int) -> !torch.list<int>
    %true_5676 = torch.constant.bool true
    %none_5677 = torch.constant.none
    %4070 = torch.aten.mean.dim %4068, %4069, %true_5676, %none_5677 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    // x * rsqrt(mean(x^2) + eps), with eps ~= 1e-6.
    %float9.999990e-07_5678 = torch.constant.float 9.9999999999999995E-7
    %int1_5679 = torch.constant.int 1
    %4071 = torch.aten.add.Scalar %4070, %float9.999990e-07_5678, %int1_5679 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %4072 = torch.aten.rsqrt %4071 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %4073 = torch.aten.mul.Tensor %4067, %4072 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    // Back to f16, then apply the per-channel scale of length 128.
    %int5_5680 = torch.constant.int 5
    %4074 = torch.prims.convert_element_type %4073, %int5_5680 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.11.txt_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.11.txt_attn.norm.key_norm.scale : tensor<128xf16>
    %4075 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.txt_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %4076 = torch.aten.mul.Tensor %4074, %4075 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // Join the txt and img streams for attention. The normalized txt query and
    // key go through f16 -> f16 element-type conversions (element type unchanged),
    // then each is concatenated with its img counterpart along dim 2. The txt
    // tensor is listed first, so rows [0,512) of the 4608-long axis come from txt
    // and rows [512,4608) from img.
    %int5_5681 = torch.constant.int 5
    %4077 = torch.prims.convert_element_type %4061, %int5_5681 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int5_5682 = torch.constant.int 5
    %4078 = torch.prims.convert_element_type %4076, %int5_5682 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Query: cat([txt_q, img_q], dim = 2) -> [1,24,4608,128].
    %4079 = torch.prim.ListConstruct %4077, %4018 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_5683 = torch.constant.int 2
    %4080 = torch.aten.cat %4079, %int2_5683 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Key: cat([txt_k, img_k], dim = 2) -> [1,24,4608,128].
    %4081 = torch.prim.ListConstruct %4078, %4019 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_5684 = torch.constant.int 2
    %4082 = torch.aten.cat %4081, %int2_5684 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Value tensors: take index 2 along dim 0 of the permuted fused projection
    // for both streams (txt from %4046, img from %3987) and concatenate them
    // along dim 2, txt first. No normalization is applied on this path.
    %int1_5685 = torch.constant.int 1
    %int512_5686 = torch.constant.int 512
    %int3_5687 = torch.constant.int 3
    %int24_5688 = torch.constant.int 24
    %int128_5689 = torch.constant.int 128
    %4083 = torch.prim.ListConstruct %int1_5685, %int512_5686, %int3_5687, %int24_5688, %int128_5689 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4084 = torch.aten.view %4046, %4083 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_5690 = torch.constant.int 2
    %int0_5691 = torch.constant.int 0
    %int3_5692 = torch.constant.int 3
    %int1_5693 = torch.constant.int 1
    %int4_5694 = torch.constant.int 4
    %4085 = torch.prim.ListConstruct %int2_5690, %int0_5691, %int3_5692, %int1_5693, %int4_5694 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4086 = torch.aten.permute %4084, %4085 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // txt value: select(dim = 0, index = 2).
    %int0_5695 = torch.constant.int 0
    %int2_5696 = torch.constant.int 2
    %4087 = torch.aten.select.int %4086, %int0_5695, %int2_5696 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int1_5697 = torch.constant.int 1
    %int4096_5698 = torch.constant.int 4096
    %int3_5699 = torch.constant.int 3
    %int24_5700 = torch.constant.int 24
    %int128_5701 = torch.constant.int 128
    %4088 = torch.prim.ListConstruct %int1_5697, %int4096_5698, %int3_5699, %int24_5700, %int128_5701 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4089 = torch.aten.view %3987, %4088 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_5702 = torch.constant.int 2
    %int0_5703 = torch.constant.int 0
    %int3_5704 = torch.constant.int 3
    %int1_5705 = torch.constant.int 1
    %int4_5706 = torch.constant.int 4
    %4090 = torch.prim.ListConstruct %int2_5702, %int0_5703, %int3_5704, %int1_5705, %int4_5706 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4091 = torch.aten.permute %4089, %4090 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // img value: select(dim = 0, index = 2).
    %int0_5707 = torch.constant.int 0
    %int2_5708 = torch.constant.int 2
    %4092 = torch.aten.select.int %4091, %int0_5707, %int2_5708 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // cat([txt_v, img_v], dim = 2) -> [1,24,4608,128].
    %4093 = torch.prim.ListConstruct %4087, %4092 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_5709 = torch.constant.int 2
    %4094 = torch.aten.cat %4093, %int2_5709 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Trace points "q", "k" and "v" for the concatenated attention inputs. Each
    // follows the same pattern: convert to a builtin tensor, cast to a fully
    // dynamic shape, read the four dims, emit flow.tensor.trace, cast back to the
    // static 1x24x4608x128 shape and re-wrap as a torch value.
    // Results: %4096 (q), %4098 (k), %4100 (v).
    %4095 = torch_c.to_builtin_tensor %4080 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_5710 = tensor.cast %4095 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_5711 = arith.constant 0 : index
    %dim_5712 = tensor.dim %cast_5710, %c0_5711 : tensor<?x?x?x?xf16>
    %c1_5713 = arith.constant 1 : index
    %dim_5714 = tensor.dim %cast_5710, %c1_5713 : tensor<?x?x?x?xf16>
    %c2_5715 = arith.constant 2 : index
    %dim_5716 = tensor.dim %cast_5710, %c2_5715 : tensor<?x?x?x?xf16>
    %c3_5717 = arith.constant 3 : index
    %dim_5718 = tensor.dim %cast_5710, %c3_5717 : tensor<?x?x?x?xf16>
    flow.tensor.trace "q" = [%cast_5710 : tensor<?x?x?x?xf16>{%dim_5712, %dim_5714, %dim_5716, %dim_5718}]
    %cast_5719 = tensor.cast %cast_5710 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %4096 = torch_c.from_builtin_tensor %cast_5719 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Same pattern for the concatenated key %4082.
    %4097 = torch_c.to_builtin_tensor %4082 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_5720 = tensor.cast %4097 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_5721 = arith.constant 0 : index
    %dim_5722 = tensor.dim %cast_5720, %c0_5721 : tensor<?x?x?x?xf16>
    %c1_5723 = arith.constant 1 : index
    %dim_5724 = tensor.dim %cast_5720, %c1_5723 : tensor<?x?x?x?xf16>
    %c2_5725 = arith.constant 2 : index
    %dim_5726 = tensor.dim %cast_5720, %c2_5725 : tensor<?x?x?x?xf16>
    %c3_5727 = arith.constant 3 : index
    %dim_5728 = tensor.dim %cast_5720, %c3_5727 : tensor<?x?x?x?xf16>
    flow.tensor.trace "k" = [%cast_5720 : tensor<?x?x?x?xf16>{%dim_5722, %dim_5724, %dim_5726, %dim_5728}]
    %cast_5729 = tensor.cast %cast_5720 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %4098 = torch_c.from_builtin_tensor %cast_5729 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Same pattern for the concatenated value %4094.
    %4099 = torch_c.to_builtin_tensor %4094 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_5730 = tensor.cast %4099 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_5731 = arith.constant 0 : index
    %dim_5732 = tensor.dim %cast_5730, %c0_5731 : tensor<?x?x?x?xf16>
    %c1_5733 = arith.constant 1 : index
    %dim_5734 = tensor.dim %cast_5730, %c1_5733 : tensor<?x?x?x?xf16>
    %c2_5735 = arith.constant 2 : index
    %dim_5736 = tensor.dim %cast_5730, %c2_5735 : tensor<?x?x?x?xf16>
    %c3_5737 = arith.constant 3 : index
    %dim_5738 = tensor.dim %cast_5730, %c3_5737 : tensor<?x?x?x?xf16>
    flow.tensor.trace "v" = [%cast_5730 : tensor<?x?x?x?xf16>{%dim_5732, %dim_5734, %dim_5736, %dim_5738}]
    %cast_5739 = tensor.cast %cast_5730 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %4100 = torch_c.from_builtin_tensor %cast_5739 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Prepare q (%4096) and k (%4098) for the pairwise transform that follows:
    // upcast to f32 and view the last dim of 128 as [64,1,2]. The view list uses
    // -1 for the 64-sized dim, so it is inferred; the result type fixes it at 64.
    %int6_5740 = torch.constant.int 6
    %4101 = torch.prims.convert_element_type %4096, %int6_5740 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_5741 = torch.constant.int 1
    %int24_5742 = torch.constant.int 24
    %int4608_5743 = torch.constant.int 4608
    %int-1_5744 = torch.constant.int -1
    %int1_5745 = torch.constant.int 1
    %int2_5746 = torch.constant.int 2
    %4102 = torch.prim.ListConstruct %int1_5741, %int24_5742, %int4608_5743, %int-1_5744, %int1_5745, %int2_5746 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4103 = torch.aten.view %4101, %4102 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Same upcast + view for the key.
    %int6_5747 = torch.constant.int 6
    %4104 = torch.prims.convert_element_type %4098, %int6_5747 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_5748 = torch.constant.int 1
    %int24_5749 = torch.constant.int 24
    %int4608_5750 = torch.constant.int 4608
    %int-1_5751 = torch.constant.int -1
    %int1_5752 = torch.constant.int 1
    %int2_5753 = torch.constant.int 2
    %4105 = torch.prim.ListConstruct %int1_5748, %int24_5749, %int4608_5750, %int-1_5751, %int1_5752, %int2_5753 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4106 = torch.aten.view %4104, %4105 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Apply the per-position 2x2 table %211 ([1,1,4608,64,2,2] f32, defined far
    // above this chunk) to each (even, odd) pair of q and k:
    //   out = %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]
    // The size-1 dims broadcast %211 over the 24 heads and each x component over
    // the trailing dim of 2.
    // NOTE(review): %211 is presumably the rotary position-embedding table —
    // confirm at its definition.
    %int5_5754 = torch.constant.int 5
    %int0_5755 = torch.constant.int 0
    %4107 = torch.aten.select.int %211, %int5_5754, %int0_5755 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_5756 = torch.constant.int 5
    %int0_5757 = torch.constant.int 0
    %4108 = torch.aten.select.int %4103, %int5_5756, %int0_5757 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %4109 = torch.aten.mul.Tensor %4107, %4108 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_5758 = torch.constant.int 5
    %int1_5759 = torch.constant.int 1
    %4110 = torch.aten.select.int %211, %int5_5758, %int1_5759 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_5760 = torch.constant.int 5
    %int1_5761 = torch.constant.int 1
    %4111 = torch.aten.select.int %4103, %int5_5760, %int1_5761 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %4112 = torch.aten.mul.Tensor %4110, %4111 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Transformed query: sum of the two products.
    %int1_5762 = torch.constant.int 1
    %4113 = torch.aten.add.Tensor %4109, %4112, %int1_5762 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Same computation for the key view %4106.
    %int5_5763 = torch.constant.int 5
    %int0_5764 = torch.constant.int 0
    %4114 = torch.aten.select.int %211, %int5_5763, %int0_5764 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_5765 = torch.constant.int 5
    %int0_5766 = torch.constant.int 0
    %4115 = torch.aten.select.int %4106, %int5_5765, %int0_5766 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %4116 = torch.aten.mul.Tensor %4114, %4115 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_5767 = torch.constant.int 5
    %int1_5768 = torch.constant.int 1
    %4117 = torch.aten.select.int %211, %int5_5767, %int1_5768 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_5769 = torch.constant.int 5
    %int1_5770 = torch.constant.int 1
    %4118 = torch.aten.select.int %4106, %int5_5769, %int1_5770 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %4119 = torch.aten.mul.Tensor %4117, %4118 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Transformed key: sum of the two products.
    %int1_5771 = torch.constant.int 1
    %4120 = torch.aten.add.Tensor %4116, %4119, %int1_5771 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Collapse the trailing [64,2] pair layout of the transformed query (%4113)
    // and key (%4120) back to a last dim of 128, then cast f32 -> f16 (dtype
    // code 5) for the attention operator.
    %int1_5772 = torch.constant.int 1
    %int24_5773 = torch.constant.int 24
    %int4608_5774 = torch.constant.int 4608
    %int128_5775 = torch.constant.int 128
    %4121 = torch.prim.ListConstruct %int1_5772, %int24_5773, %int4608_5774, %int128_5775 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4122 = torch.aten.view %4113, %4121 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_5776 = torch.constant.int 5
    %4123 = torch.prims.convert_element_type %4122, %int5_5776 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Same reshape + cast for the key.
    %int1_5777 = torch.constant.int 1
    %int24_5778 = torch.constant.int 24
    %int4608_5779 = torch.constant.int 4608
    %int128_5780 = torch.constant.int 128
    %4124 = torch.prim.ListConstruct %int1_5777, %int24_5778, %int4608_5779, %int128_5780 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4125 = torch.aten.view %4120, %4124 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_5781 = torch.constant.int 5
    %4126 = torch.prims.convert_element_type %4125, %int5_5781 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Joint attention over the 4608 concatenated txt + img positions, using the
    // transformed q (%4123), transformed k (%4126) and the traced v (%4100). The
    // op is emitted as a generic torch.operator and returns two results; only
    // result #0 ([1,24,4608,128] f16) is used in this chunk.
    // NOTE(review): the trailing operands (0.0, false, none, none) presumably map
    // to dropout_p, is_causal, attn_mask and scale in the ATen schema — confirm.
    %float0.000000e00_5782 = torch.constant.float 0.000000e+00
    %false_5783 = torch.constant.bool false
    %none_5784 = torch.constant.none
    %none_5785 = torch.constant.none
    %4127:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%4123, %4126, %4100, %float0.000000e00_5782, %false_5783, %none_5784, %none_5785) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2 via permute(0,2,1,3) to get [1,4608,24,128], then merge
    // the trailing [24,128] dims into 3072.
    %int0_5786 = torch.constant.int 0
    %int2_5787 = torch.constant.int 2
    %int1_5788 = torch.constant.int 1
    %int3_5789 = torch.constant.int 3
    %4128 = torch.prim.ListConstruct %int0_5786, %int2_5787, %int1_5788, %int3_5789 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4129 = torch.aten.permute %4127#0, %4128 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_5790 = torch.constant.int 1
    %int4608_5791 = torch.constant.int 4608
    %int3072_5792 = torch.constant.int 3072
    %4130 = torch.prim.ListConstruct %int1_5790, %int4608_5791, %int3072_5792 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4131 = torch.aten.view %4129, %4130 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Split the attention output %4131 back into its two streams along dim 1.
    // The dim-0 slices use start 0 and end INT64_MAX with step 1, and their
    // result shapes equal the input shape (whole dim kept).
    //   %4133 = rows [0, 512)    -> [1,512,3072]  (the txt rows, which were
    //                                               concatenated first)
    //   %4135 = rows [512, end)  -> [1,4096,3072] (the img rows)
    // %4133 is not consumed within this chunk.
    %int0_5793 = torch.constant.int 0
    %int0_5794 = torch.constant.int 0
    %int9223372036854775807_5795 = torch.constant.int 9223372036854775807
    %int1_5796 = torch.constant.int 1
    %4132 = torch.aten.slice.Tensor %4131, %int0_5793, %int0_5794, %int9223372036854775807_5795, %int1_5796 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_5797 = torch.constant.int 1
    %int0_5798 = torch.constant.int 0
    %int512_5799 = torch.constant.int 512
    %int1_5800 = torch.constant.int 1
    %4133 = torch.aten.slice.Tensor %4132, %int1_5797, %int0_5798, %int512_5799, %int1_5800 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %int0_5801 = torch.constant.int 0
    %int0_5802 = torch.constant.int 0
    %int9223372036854775807_5803 = torch.constant.int 9223372036854775807
    %int1_5804 = torch.constant.int 1
    %4134 = torch.aten.slice.Tensor %4131, %int0_5801, %int0_5802, %int9223372036854775807_5803, %int1_5804 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_5805 = torch.constant.int 1
    %int512_5806 = torch.constant.int 512
    %int9223372036854775807_5807 = torch.constant.int 9223372036854775807
    %int1_5808 = torch.constant.int 1
    %4135 = torch.aten.slice.Tensor %4134, %int1_5805, %int512_5806, %int9223372036854775807_5807, %int1_5808 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // img-stream attention output projection: flatten the img rows %4135 to
    // [4096,3072], multiply by the transposed img_attn.proj.weight in f32, add
    // img_attn.proj.bias, cast the result to f16 and restore the batch dim to
    // get [1,4096,3072].
    %int4096_5809 = torch.constant.int 4096
    %int3072_5810 = torch.constant.int 3072
    %4136 = torch.prim.ListConstruct %int4096_5809, %int3072_5810 : (!torch.int, !torch.int) -> !torch.list<int>
    %4137 = torch.aten.view %4135, %4136 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.11.img_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.11.img_attn.proj.weight : tensor<3072x3072xf16>
    %4138 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.img_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_5811 = torch.constant.int 0
    %int1_5812 = torch.constant.int 1
    %4139 = torch.aten.transpose.int %4138, %int0_5811, %int1_5812 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.11.img_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.11.img_attn.proj.bias : tensor<3072xf16>
    %4140 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.img_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Bias, activations and weight are all upcast to f32 (dtype code 6) before
    // the matmul.
    %int6_5813 = torch.constant.int 6
    %4141 = torch.prims.convert_element_type %4140, %int6_5813 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_5814 = torch.constant.int 6
    %4142 = torch.prims.convert_element_type %4137, %int6_5814 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_5815 = torch.constant.int 6
    %4143 = torch.prims.convert_element_type %4139, %int6_5815 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %4144 = torch.aten.mm %4142, %4143 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Both operands are multiplied by the integer constant 1 before the add.
    // NOTE(review): presumably the alpha/beta factors left by an addmm
    // decomposition — confirm with the exporter.
    %int1_5816 = torch.constant.int 1
    %4145 = torch.aten.mul.Scalar %4144, %int1_5816 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_5817 = torch.constant.int 1
    %4146 = torch.aten.mul.Scalar %4141, %int1_5817 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_5818 = torch.constant.int 1
    %4147 = torch.aten.add.Tensor %4145, %4146, %int1_5818 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_5819 = torch.constant.int 5
    %4148 = torch.prims.convert_element_type %4147, %int5_5819 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_5820 = torch.constant.int 1
    %int4096_5821 = torch.constant.int 4096
    %int3072_5822 = torch.constant.int 3072
    %4149 = torch.prim.ListConstruct %int1_5820, %int4096_5821, %int3072_5822 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4150 = torch.aten.view %4148, %4149 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    %4151 = torch.aten.mul.Tensor %3936, %4150 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_5823 = torch.constant.int 1
    %4152 = torch.aten.add.Tensor %3858, %4151, %int1_5823 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Normalize %4152 over its last dim, then apply (%3938 + 1) * normalized + %3937.
    // NOTE(review): %3938 / %3937 are defined above this point; presumably the
    // modulation scale and shift - confirm against their producers.
    // %4153 = %3938 + 1 (scalar 1 with alpha 1).
    %int1_5824 = torch.constant.int 1
    %int1_5825 = torch.constant.int 1
    %4153 = torch.aten.add.Scalar %3938, %int1_5824, %int1_5825 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Statistics are computed on an f32 copy of %4152.
    %int6_5826 = torch.constant.int 6
    %4154 = torch.prims.convert_element_type %4152, %int6_5826 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // Variance (result0) and mean (result1) over dim 2, correction 0, keepdim true -> [1,4096,1] each.
    %int2_5827 = torch.constant.int 2
    %4155 = torch.prim.ListConstruct %int2_5827 : (!torch.int) -> !torch.list<int>
    %int0_5828 = torch.constant.int 0
    %true_5829 = torch.constant.bool true
    %result0_5830, %result1_5831 = torch.aten.var_mean.correction %4154, %4155, %int0_5828, %true_5829 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(variance + eps), with eps being the float constant below (approximately 1e-6).
    %float9.999990e-07_5832 = torch.constant.float 9.9999999999999995E-7
    %int1_5833 = torch.constant.int 1
    %4156 = torch.aten.add.Scalar %result0_5830, %float9.999990e-07_5832, %int1_5833 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %4157 = torch.aten.rsqrt %4156 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // Subtract the f32 mean from the f16 tensor %4152 (result is f32), then scale by the rsqrt term.
    %int1_5834 = torch.constant.int 1
    %4158 = torch.aten.sub.Tensor %4152, %result1_5831, %int1_5834 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %4159 = torch.aten.mul.Tensor %4158, %4157 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Back to f16 before the elementwise scale and offset.
    %int5_5835 = torch.constant.int 5
    %4160 = torch.prims.convert_element_type %4159, %int5_5835 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %4161 = torch.aten.mul.Tensor %4153, %4160 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_5836 = torch.constant.int 1
    %4162 = torch.aten.add.Tensor %4161, %3937, %int1_5836 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer with the double_blocks.11.img_mlp.0 parameters (3072 -> 12288 features)
    // applied to %4162, followed by a tanh-approximated GELU.
    // Flatten [1,4096,3072] -> [4096,3072] for the 2-D matmul.
    %int4096_5837 = torch.constant.int 4096
    %int3072_5838 = torch.constant.int 3072
    %4163 = torch.prim.ListConstruct %int4096_5837, %int3072_5838 : (!torch.int, !torch.int) -> !torch.list<int>
    %4164 = torch.aten.view %4162, %4163 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.11.img_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.11.img_mlp.0.weight : tensor<12288x3072xf16>
    %4165 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.img_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    // Transpose the weight: [12288,3072] -> [3072,12288].
    %int0_5839 = torch.constant.int 0
    %int1_5840 = torch.constant.int 1
    %4166 = torch.aten.transpose.int %4165, %int0_5839, %int1_5840 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.11.img_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.11.img_mlp.0.bias : tensor<12288xf16>
    %4167 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.img_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Upcast bias, input and transposed weight to f32; the matmul and bias add run in f32.
    %int6_5841 = torch.constant.int 6
    %4168 = torch.prims.convert_element_type %4167, %int6_5841 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_5842 = torch.constant.int 6
    %4169 = torch.prims.convert_element_type %4164, %int6_5842 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_5843 = torch.constant.int 6
    %4170 = torch.prims.convert_element_type %4166, %int6_5843 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %4171 = torch.aten.mm %4169, %4170 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[4096,12288],f32>
    // Multiplications by the constant 1 (values unchanged), then the bias add with alpha = 1.
    %int1_5844 = torch.constant.int 1
    %4172 = torch.aten.mul.Scalar %4171, %int1_5844 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int1_5845 = torch.constant.int 1
    %4173 = torch.aten.mul.Scalar %4168, %int1_5845 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_5846 = torch.constant.int 1
    %4174 = torch.aten.add.Tensor %4172, %4173, %int1_5846 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    // Downcast to f16 and restore the unit leading dim: [4096,12288] -> [1,4096,12288].
    %int5_5847 = torch.constant.int 5
    %4175 = torch.prims.convert_element_type %4174, %int5_5847 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f16>
    %int1_5848 = torch.constant.int 1
    %int4096_5849 = torch.constant.int 4096
    %int12288_5850 = torch.constant.int 12288
    %4176 = torch.prim.ListConstruct %int1_5848, %int4096_5849, %int12288_5850 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4177 = torch.aten.view %4175, %4176 : !torch.vtensor<[4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,12288],f16>
    // GELU with the "tanh" approximate mode, evaluated on the f16 tensor.
    %str_5851 = torch.constant.str "tanh"
    %4178 = torch.aten.gelu %4177, %str_5851 : !torch.vtensor<[1,4096,12288],f16>, !torch.str -> !torch.vtensor<[1,4096,12288],f16>
    // Linear layer with the double_blocks.11.img_mlp.2 parameters (12288 -> 3072 features)
    // applied to the GELU output %4178, then multiplied by %3939 and added to %4152.
    // Flatten [1,4096,12288] -> [4096,12288] for the 2-D matmul.
    %int4096_5852 = torch.constant.int 4096
    %int12288_5853 = torch.constant.int 12288
    %4179 = torch.prim.ListConstruct %int4096_5852, %int12288_5853 : (!torch.int, !torch.int) -> !torch.list<int>
    %4180 = torch.aten.view %4178, %4179 : !torch.vtensor<[1,4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[4096,12288],f16>
    %__auto.sampler.double_blocks.11.img_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.11.img_mlp.2.weight : tensor<3072x12288xf16>
    %4181 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.img_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    // Transpose the weight: [3072,12288] -> [12288,3072].
    %int0_5854 = torch.constant.int 0
    %int1_5855 = torch.constant.int 1
    %4182 = torch.aten.transpose.int %4181, %int0_5854, %int1_5855 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.11.img_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.11.img_mlp.2.bias : tensor<3072xf16>
    %4183 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.img_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, input and transposed weight to f32; the matmul and bias add run in f32.
    %int6_5856 = torch.constant.int 6
    %4184 = torch.prims.convert_element_type %4183, %int6_5856 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_5857 = torch.constant.int 6
    %4185 = torch.prims.convert_element_type %4180, %int6_5857 : !torch.vtensor<[4096,12288],f16>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int6_5858 = torch.constant.int 6
    %4186 = torch.prims.convert_element_type %4182, %int6_5858 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %4187 = torch.aten.mm %4185, %4186 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Multiplications by the constant 1 (values unchanged), then the bias add with alpha = 1.
    %int1_5859 = torch.constant.int 1
    %4188 = torch.aten.mul.Scalar %4187, %int1_5859 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_5860 = torch.constant.int 1
    %4189 = torch.aten.mul.Scalar %4184, %int1_5860 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_5861 = torch.constant.int 1
    %4190 = torch.aten.add.Tensor %4188, %4189, %int1_5861 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    // Downcast to f16 and restore the unit leading dim: [4096,3072] -> [1,4096,3072].
    %int5_5862 = torch.constant.int 5
    %4191 = torch.prims.convert_element_type %4190, %int5_5862 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_5863 = torch.constant.int 1
    %int4096_5864 = torch.constant.int 4096
    %int3072_5865 = torch.constant.int 3072
    %4192 = torch.prim.ListConstruct %int1_5863, %int4096_5864, %int3072_5865 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4193 = torch.aten.view %4191, %4192 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Broadcast-multiply by the [1,1,3072] tensor %3939, then add the product to %4152 (alpha = 1).
    // NOTE(review): %3939 is defined above this point; presumably the MLP modulation gate - confirm.
    %4194 = torch.aten.mul.Tensor %3939, %4193 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_5866 = torch.constant.int 1
    %4195 = torch.aten.add.Tensor %4152, %4194, %int1_5866 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer with the double_blocks.11.txt_attn.proj parameters applied to %4133,
    // then multiplied by %3957 and added to %3918.
    // NOTE(review): %4133 ([1,512,3072]) is defined above this point; presumably the
    // text-token part of the attention output - confirm against its producer.
    // Flatten [1,512,3072] -> [512,3072] for the 2-D matmul.
    %int512_5867 = torch.constant.int 512
    %int3072_5868 = torch.constant.int 3072
    %4196 = torch.prim.ListConstruct %int512_5867, %int3072_5868 : (!torch.int, !torch.int) -> !torch.list<int>
    %4197 = torch.aten.view %4133, %4196 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.11.txt_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.11.txt_attn.proj.weight : tensor<3072x3072xf16>
    %4198 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.txt_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    // Swap dims 0 and 1 of the weight before the matmul.
    %int0_5869 = torch.constant.int 0
    %int1_5870 = torch.constant.int 1
    %4199 = torch.aten.transpose.int %4198, %int0_5869, %int1_5870 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.11.txt_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.11.txt_attn.proj.bias : tensor<3072xf16>
    %4200 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.txt_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, input and transposed weight to f32; the matmul and bias add run in f32.
    %int6_5871 = torch.constant.int 6
    %4201 = torch.prims.convert_element_type %4200, %int6_5871 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_5872 = torch.constant.int 6
    %4202 = torch.prims.convert_element_type %4197, %int6_5872 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_5873 = torch.constant.int 6
    %4203 = torch.prims.convert_element_type %4199, %int6_5873 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %4204 = torch.aten.mm %4202, %4203 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Multiplications by the constant 1 (values unchanged), then the bias add with alpha = 1.
    %int1_5874 = torch.constant.int 1
    %4205 = torch.aten.mul.Scalar %4204, %int1_5874 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_5875 = torch.constant.int 1
    %4206 = torch.aten.mul.Scalar %4201, %int1_5875 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_5876 = torch.constant.int 1
    %4207 = torch.aten.add.Tensor %4205, %4206, %int1_5876 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    // Downcast to f16 and restore the unit leading dim: [512,3072] -> [1,512,3072].
    %int5_5877 = torch.constant.int 5
    %4208 = torch.prims.convert_element_type %4207, %int5_5877 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_5878 = torch.constant.int 1
    %int512_5879 = torch.constant.int 512
    %int3072_5880 = torch.constant.int 3072
    %4209 = torch.prim.ListConstruct %int1_5878, %int512_5879, %int3072_5880 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4210 = torch.aten.view %4208, %4209 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Broadcast-multiply by the [1,1,3072] tensor %3957, then add the product to %3918 (alpha = 1).
    // NOTE(review): %3957 and %3918 are defined above this point; presumably a modulation
    // gate and the residual text stream respectively - confirm against their producers.
    %4211 = torch.aten.mul.Tensor %3957, %4210 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_5881 = torch.constant.int 1
    %4212 = torch.aten.add.Tensor %3918, %4211, %int1_5881 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Normalize %4212 over its last dim, then apply (%3959 + 1) * normalized + %3958.
    // NOTE(review): %3959 / %3958 are defined above this point; presumably the
    // modulation scale and shift - confirm against their producers.
    // %4213 = %3959 + 1 (scalar 1 with alpha 1).
    %int1_5882 = torch.constant.int 1
    %int1_5883 = torch.constant.int 1
    %4213 = torch.aten.add.Scalar %3959, %int1_5882, %int1_5883 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Statistics are computed on an f32 copy of %4212.
    %int6_5884 = torch.constant.int 6
    %4214 = torch.prims.convert_element_type %4212, %int6_5884 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // Variance (result0) and mean (result1) over dim 2, correction 0, keepdim true -> [1,512,1] each.
    %int2_5885 = torch.constant.int 2
    %4215 = torch.prim.ListConstruct %int2_5885 : (!torch.int) -> !torch.list<int>
    %int0_5886 = torch.constant.int 0
    %true_5887 = torch.constant.bool true
    %result0_5888, %result1_5889 = torch.aten.var_mean.correction %4214, %4215, %int0_5886, %true_5887 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(variance + eps), with eps being the float constant below (approximately 1e-6).
    %float9.999990e-07_5890 = torch.constant.float 9.9999999999999995E-7
    %int1_5891 = torch.constant.int 1
    %4216 = torch.aten.add.Scalar %result0_5888, %float9.999990e-07_5890, %int1_5891 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %4217 = torch.aten.rsqrt %4216 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // Subtract the f32 mean from the f16 tensor %4212 (result is f32), then scale by the rsqrt term.
    %int1_5892 = torch.constant.int 1
    %4218 = torch.aten.sub.Tensor %4212, %result1_5889, %int1_5892 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %4219 = torch.aten.mul.Tensor %4218, %4217 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    // Back to f16 before the elementwise scale and offset.
    %int5_5893 = torch.constant.int 5
    %4220 = torch.prims.convert_element_type %4219, %int5_5893 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %4221 = torch.aten.mul.Tensor %4213, %4220 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_5894 = torch.constant.int 1
    %4222 = torch.aten.add.Tensor %4221, %3958, %int1_5894 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Linear layer with the double_blocks.11.txt_mlp.0 parameters (3072 -> 12288 features)
    // applied to %4222, followed by a tanh-approximated GELU.
    // Flatten [1,512,3072] -> [512,3072] for the 2-D matmul.
    %int512_5895 = torch.constant.int 512
    %int3072_5896 = torch.constant.int 3072
    %4223 = torch.prim.ListConstruct %int512_5895, %int3072_5896 : (!torch.int, !torch.int) -> !torch.list<int>
    %4224 = torch.aten.view %4222, %4223 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.11.txt_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.11.txt_mlp.0.weight : tensor<12288x3072xf16>
    %4225 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.txt_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    // Transpose the weight: [12288,3072] -> [3072,12288].
    %int0_5897 = torch.constant.int 0
    %int1_5898 = torch.constant.int 1
    %4226 = torch.aten.transpose.int %4225, %int0_5897, %int1_5898 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.11.txt_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.11.txt_mlp.0.bias : tensor<12288xf16>
    %4227 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.txt_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Upcast bias, input and transposed weight to f32; the matmul and bias add run in f32.
    %int6_5899 = torch.constant.int 6
    %4228 = torch.prims.convert_element_type %4227, %int6_5899 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_5900 = torch.constant.int 6
    %4229 = torch.prims.convert_element_type %4224, %int6_5900 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_5901 = torch.constant.int 6
    %4230 = torch.prims.convert_element_type %4226, %int6_5901 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %4231 = torch.aten.mm %4229, %4230 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[512,12288],f32>
    // Multiplications by the constant 1 (values unchanged), then the bias add with alpha = 1.
    %int1_5902 = torch.constant.int 1
    %4232 = torch.aten.mul.Scalar %4231, %int1_5902 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int1_5903 = torch.constant.int 1
    %4233 = torch.aten.mul.Scalar %4228, %int1_5903 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_5904 = torch.constant.int 1
    %4234 = torch.aten.add.Tensor %4232, %4233, %int1_5904 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    // Downcast to f16 and restore the unit leading dim: [512,12288] -> [1,512,12288].
    %int5_5905 = torch.constant.int 5
    %4235 = torch.prims.convert_element_type %4234, %int5_5905 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f16>
    %int1_5906 = torch.constant.int 1
    %int512_5907 = torch.constant.int 512
    %int12288_5908 = torch.constant.int 12288
    %4236 = torch.prim.ListConstruct %int1_5906, %int512_5907, %int12288_5908 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4237 = torch.aten.view %4235, %4236 : !torch.vtensor<[512,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,512,12288],f16>
    // GELU with the "tanh" approximate mode, evaluated on the f16 tensor.
    %str_5909 = torch.constant.str "tanh"
    %4238 = torch.aten.gelu %4237, %str_5909 : !torch.vtensor<[1,512,12288],f16>, !torch.str -> !torch.vtensor<[1,512,12288],f16>
    // Linear layer with the double_blocks.11.txt_mlp.2 parameters (12288 -> 3072 features)
    // applied to the GELU output %4238, then multiplied by %3960 and added to %4212.
    // Flatten [1,512,12288] -> [512,12288] for the 2-D matmul.
    %int512_5910 = torch.constant.int 512
    %int12288_5911 = torch.constant.int 12288
    %4239 = torch.prim.ListConstruct %int512_5910, %int12288_5911 : (!torch.int, !torch.int) -> !torch.list<int>
    %4240 = torch.aten.view %4238, %4239 : !torch.vtensor<[1,512,12288],f16>, !torch.list<int> -> !torch.vtensor<[512,12288],f16>
    %__auto.sampler.double_blocks.11.txt_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.11.txt_mlp.2.weight : tensor<3072x12288xf16>
    %4241 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.txt_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    // Transpose the weight: [3072,12288] -> [12288,3072].
    %int0_5912 = torch.constant.int 0
    %int1_5913 = torch.constant.int 1
    %4242 = torch.aten.transpose.int %4241, %int0_5912, %int1_5913 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.11.txt_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.11.txt_mlp.2.bias : tensor<3072xf16>
    %4243 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.txt_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, input and transposed weight to f32; the matmul and bias add run in f32.
    %int6_5914 = torch.constant.int 6
    %4244 = torch.prims.convert_element_type %4243, %int6_5914 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_5915 = torch.constant.int 6
    %4245 = torch.prims.convert_element_type %4240, %int6_5915 : !torch.vtensor<[512,12288],f16>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int6_5916 = torch.constant.int 6
    %4246 = torch.prims.convert_element_type %4242, %int6_5916 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %4247 = torch.aten.mm %4245, %4246 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Multiplications by the constant 1 (values unchanged), then the bias add with alpha = 1.
    %int1_5917 = torch.constant.int 1
    %4248 = torch.aten.mul.Scalar %4247, %int1_5917 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_5918 = torch.constant.int 1
    %4249 = torch.aten.mul.Scalar %4244, %int1_5918 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_5919 = torch.constant.int 1
    %4250 = torch.aten.add.Tensor %4248, %4249, %int1_5919 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    // Downcast to f16 and restore the unit leading dim: [512,3072] -> [1,512,3072].
    %int5_5920 = torch.constant.int 5
    %4251 = torch.prims.convert_element_type %4250, %int5_5920 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_5921 = torch.constant.int 1
    %int512_5922 = torch.constant.int 512
    %int3072_5923 = torch.constant.int 3072
    %4252 = torch.prim.ListConstruct %int1_5921, %int512_5922, %int3072_5923 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4253 = torch.aten.view %4251, %4252 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Broadcast-multiply by the [1,1,3072] tensor %3960, then add the product to %4212 (alpha = 1).
    // NOTE(review): %3960 is defined above this point; presumably the MLP modulation gate - confirm.
    %4254 = torch.aten.mul.Tensor %3960, %4253 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_5924 = torch.constant.int 1
    %4255 = torch.aten.add.Tensor %4212, %4254, %int1_5924 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Apply SiLU to %119, run it through the double_blocks.12.img_mod.lin linear layer
    // (3072 -> 18432 features), and split the result into six [1,1,3072] chunks %4271..%4276.
    // NOTE(review): %119 is defined far above this point; presumably the conditioning
    // vector shared by all blocks - confirm against its producer.
    %4256 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.double_blocks.12.img_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.12.img_mod.lin.weight : tensor<18432x3072xf16>
    %4257 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.img_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    // Transpose the weight: [18432,3072] -> [3072,18432].
    %int0_5925 = torch.constant.int 0
    %int1_5926 = torch.constant.int 1
    %4258 = torch.aten.transpose.int %4257, %int0_5925, %int1_5926 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.12.img_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.12.img_mod.lin.bias : tensor<18432xf16>
    %4259 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.img_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Upcast bias, input and transposed weight to f32; the matmul and bias add run in f32.
    %int6_5927 = torch.constant.int 6
    %4260 = torch.prims.convert_element_type %4259, %int6_5927 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_5928 = torch.constant.int 6
    %4261 = torch.prims.convert_element_type %4256, %int6_5928 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_5929 = torch.constant.int 6
    %4262 = torch.prims.convert_element_type %4258, %int6_5929 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %4263 = torch.aten.mm %4261, %4262 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Multiplications by the constant 1 (values unchanged), then the bias add with alpha = 1.
    %int1_5930 = torch.constant.int 1
    %4264 = torch.aten.mul.Scalar %4263, %int1_5930 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_5931 = torch.constant.int 1
    %4265 = torch.aten.mul.Scalar %4260, %int1_5931 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_5932 = torch.constant.int 1
    %4266 = torch.aten.add.Tensor %4264, %4265, %int1_5932 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    // Downcast the linear output to f16.
    %int5_5933 = torch.constant.int 5
    %4267 = torch.prims.convert_element_type %4266, %int5_5933 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice over dim 0 (shape unchanged), insert a unit dim at position 1,
    // then a full-range slice over dim 2: [1,18432] -> [1,1,18432].
    %int0_5934 = torch.constant.int 0
    %int0_5935 = torch.constant.int 0
    %int9223372036854775807_5936 = torch.constant.int 9223372036854775807
    %int1_5937 = torch.constant.int 1
    %4268 = torch.aten.slice.Tensor %4267, %int0_5934, %int0_5935, %int9223372036854775807_5936, %int1_5937 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    %int1_5938 = torch.constant.int 1
    %4269 = torch.aten.unsqueeze %4268, %int1_5938 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    %int2_5939 = torch.constant.int 2
    %int0_5940 = torch.constant.int 0
    %int9223372036854775807_5941 = torch.constant.int 9223372036854775807
    %int1_5942 = torch.constant.int 1
    %4270 = torch.aten.slice.Tensor %4269, %int2_5939, %int0_5940, %int9223372036854775807_5941, %int1_5942 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Six consecutive, non-overlapping 3072-wide slices along the last dim (dim -1):
    // [0,3072), [3072,6144), [6144,9216), [9216,12288), [12288,15360), [15360,18432).
    // NOTE(review): presumably shift/scale/gate triples for two sub-layers - confirm
    // against the consumers of %4271..%4276 later in the function.
    %int-1_5943 = torch.constant.int -1
    %int0_5944 = torch.constant.int 0
    %int3072_5945 = torch.constant.int 3072
    %int1_5946 = torch.constant.int 1
    %4271 = torch.aten.slice.Tensor %4270, %int-1_5943, %int0_5944, %int3072_5945, %int1_5946 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_5947 = torch.constant.int -1
    %int3072_5948 = torch.constant.int 3072
    %int6144_5949 = torch.constant.int 6144
    %int1_5950 = torch.constant.int 1
    %4272 = torch.aten.slice.Tensor %4270, %int-1_5947, %int3072_5948, %int6144_5949, %int1_5950 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_5951 = torch.constant.int -1
    %int6144_5952 = torch.constant.int 6144
    %int9216_5953 = torch.constant.int 9216
    %int1_5954 = torch.constant.int 1
    %4273 = torch.aten.slice.Tensor %4270, %int-1_5951, %int6144_5952, %int9216_5953, %int1_5954 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_5955 = torch.constant.int -1
    %int9216_5956 = torch.constant.int 9216
    %int12288_5957 = torch.constant.int 12288
    %int1_5958 = torch.constant.int 1
    %4274 = torch.aten.slice.Tensor %4270, %int-1_5955, %int9216_5956, %int12288_5957, %int1_5958 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_5959 = torch.constant.int -1
    %int12288_5960 = torch.constant.int 12288
    %int15360_5961 = torch.constant.int 15360
    %int1_5962 = torch.constant.int 1
    %4275 = torch.aten.slice.Tensor %4270, %int-1_5959, %int12288_5960, %int15360_5961, %int1_5962 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_5963 = torch.constant.int -1
    %int15360_5964 = torch.constant.int 15360
    %int18432_5965 = torch.constant.int 18432
    %int1_5966 = torch.constant.int 1
    %4276 = torch.aten.slice.Tensor %4270, %int-1_5963, %int15360_5964, %int18432_5965, %int1_5966 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Apply SiLU to %119, run it through the double_blocks.12.txt_mod.lin linear layer
    // (3072 -> 18432 features), and split the result into six [1,1,3072] chunks %4292..%4297.
    // NOTE(review): %119 is defined far above this point; presumably the conditioning
    // vector shared by all blocks - confirm against its producer.
    %4277 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.double_blocks.12.txt_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.12.txt_mod.lin.weight : tensor<18432x3072xf16>
    %4278 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.txt_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    // Transpose the weight: [18432,3072] -> [3072,18432].
    %int0_5967 = torch.constant.int 0
    %int1_5968 = torch.constant.int 1
    %4279 = torch.aten.transpose.int %4278, %int0_5967, %int1_5968 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.12.txt_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.12.txt_mod.lin.bias : tensor<18432xf16>
    %4280 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.txt_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Upcast bias, input and transposed weight to f32; the matmul and bias add run in f32.
    %int6_5969 = torch.constant.int 6
    %4281 = torch.prims.convert_element_type %4280, %int6_5969 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_5970 = torch.constant.int 6
    %4282 = torch.prims.convert_element_type %4277, %int6_5970 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_5971 = torch.constant.int 6
    %4283 = torch.prims.convert_element_type %4279, %int6_5971 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %4284 = torch.aten.mm %4282, %4283 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Multiplications by the constant 1 (values unchanged), then the bias add with alpha = 1.
    %int1_5972 = torch.constant.int 1
    %4285 = torch.aten.mul.Scalar %4284, %int1_5972 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_5973 = torch.constant.int 1
    %4286 = torch.aten.mul.Scalar %4281, %int1_5973 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_5974 = torch.constant.int 1
    %4287 = torch.aten.add.Tensor %4285, %4286, %int1_5974 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    // Downcast the linear output to f16.
    %int5_5975 = torch.constant.int 5
    %4288 = torch.prims.convert_element_type %4287, %int5_5975 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice over dim 0 (shape unchanged), insert a unit dim at position 1,
    // then a full-range slice over dim 2: [1,18432] -> [1,1,18432].
    %int0_5976 = torch.constant.int 0
    %int0_5977 = torch.constant.int 0
    %int9223372036854775807_5978 = torch.constant.int 9223372036854775807
    %int1_5979 = torch.constant.int 1
    %4289 = torch.aten.slice.Tensor %4288, %int0_5976, %int0_5977, %int9223372036854775807_5978, %int1_5979 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    %int1_5980 = torch.constant.int 1
    %4290 = torch.aten.unsqueeze %4289, %int1_5980 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    %int2_5981 = torch.constant.int 2
    %int0_5982 = torch.constant.int 0
    %int9223372036854775807_5983 = torch.constant.int 9223372036854775807
    %int1_5984 = torch.constant.int 1
    %4291 = torch.aten.slice.Tensor %4290, %int2_5981, %int0_5982, %int9223372036854775807_5983, %int1_5984 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Six consecutive, non-overlapping 3072-wide slices along the last dim (dim -1):
    // [0,3072), [3072,6144), [6144,9216), [9216,12288), [12288,15360), [15360,18432).
    // NOTE(review): presumably shift/scale/gate triples for two sub-layers - confirm
    // against the consumers of %4292..%4297 later in the function.
    %int-1_5985 = torch.constant.int -1
    %int0_5986 = torch.constant.int 0
    %int3072_5987 = torch.constant.int 3072
    %int1_5988 = torch.constant.int 1
    %4292 = torch.aten.slice.Tensor %4291, %int-1_5985, %int0_5986, %int3072_5987, %int1_5988 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_5989 = torch.constant.int -1
    %int3072_5990 = torch.constant.int 3072
    %int6144_5991 = torch.constant.int 6144
    %int1_5992 = torch.constant.int 1
    %4293 = torch.aten.slice.Tensor %4291, %int-1_5989, %int3072_5990, %int6144_5991, %int1_5992 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_5993 = torch.constant.int -1
    %int6144_5994 = torch.constant.int 6144
    %int9216_5995 = torch.constant.int 9216
    %int1_5996 = torch.constant.int 1
    %4294 = torch.aten.slice.Tensor %4291, %int-1_5993, %int6144_5994, %int9216_5995, %int1_5996 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_5997 = torch.constant.int -1
    %int9216_5998 = torch.constant.int 9216
    %int12288_5999 = torch.constant.int 12288
    %int1_6000 = torch.constant.int 1
    %4295 = torch.aten.slice.Tensor %4291, %int-1_5997, %int9216_5998, %int12288_5999, %int1_6000 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_6001 = torch.constant.int -1
    %int12288_6002 = torch.constant.int 12288
    %int15360_6003 = torch.constant.int 15360
    %int1_6004 = torch.constant.int 1
    %4296 = torch.aten.slice.Tensor %4291, %int-1_6001, %int12288_6002, %int15360_6003, %int1_6004 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_6005 = torch.constant.int -1
    %int15360_6006 = torch.constant.int 15360
    %int18432_6007 = torch.constant.int 18432
    %int1_6008 = torch.constant.int 1
    %4297 = torch.aten.slice.Tensor %4291, %int-1_6005, %int15360_6006, %int18432_6007, %int1_6008 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %4195 ([1,4096,3072] f16) over dim 2 and modulate it:
    //   out = (1 + %4272) * ((%4195 - mean) * rsqrt(var + eps)) + %4271
    // NOTE(review): %4195, %4271 and %4272 are defined before this chunk;
    // presumably the image-stream activations and their shift/scale modulation
    // chunks -- confirm against the producing ops.
    // Dtype code 6 upcasts to f32 for the statistics.
    %int6_6009 = torch.constant.int 6
    %4298 = torch.prims.convert_element_type %4195, %int6_6009 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // Variance (%result0) and mean (%result1) over dim 2, correction = 0, keepdim = true.
    %int2_6010 = torch.constant.int 2
    %4299 = torch.prim.ListConstruct %int2_6010 : (!torch.int) -> !torch.list<int>
    %int0_6011 = torch.constant.int 0
    %true_6012 = torch.constant.bool true
    %result0_6013, %result1_6014 = torch.aten.var_mean.correction %4298, %4299, %int0_6011, %true_6012 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(var + eps), with eps given by the float constant below.
    %float9.999990e-07_6015 = torch.constant.float 9.9999999999999995E-7
    %int1_6016 = torch.constant.int 1
    %4300 = torch.aten.add.Scalar %result0_6013, %float9.999990e-07_6015, %int1_6016 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %4301 = torch.aten.rsqrt %4300 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // The subtraction takes the f16 input %4195 directly; the result is f32.
    %int1_6017 = torch.constant.int 1
    %4302 = torch.aten.sub.Tensor %4195, %result1_6014, %int1_6017 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %4303 = torch.aten.mul.Tensor %4302, %4301 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Back to f16 (dtype code 5) before modulation.
    %int5_6018 = torch.constant.int 5
    %4304 = torch.prims.convert_element_type %4303, %int5_6018 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // (1 + %4272) * normalized, broadcasting [1,1,3072] over the 4096 rows.
    %int1_6019 = torch.constant.int 1
    %int1_6020 = torch.constant.int 1
    %4305 = torch.aten.add.Scalar %4272, %int1_6019, %int1_6020 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %4306 = torch.aten.mul.Tensor %4305, %4304 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    // ... + %4271.
    %int1_6021 = torch.constant.int 1
    %4307 = torch.aten.add.Tensor %4306, %4271, %int1_6021 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // double_blocks.12 img_attn.qkv projection of %4307:
    //   view to [4096,3072] -> f32 matmul with weight^T -> + bias -> f16 -> view to [1,4096,9216],
    // followed by a "img_qkv" trace of the result.
    %int4096_6022 = torch.constant.int 4096
    %int3072_6023 = torch.constant.int 3072
    %4308 = torch.prim.ListConstruct %int4096_6022, %int3072_6023 : (!torch.int, !torch.int) -> !torch.list<int>
    %4309 = torch.aten.view %4307, %4308 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Weight is stored as [9216,3072]; transpose dims 0 and 1 to get [3072,9216].
    %__auto.sampler.double_blocks.12.img_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.12.img_attn.qkv.weight : tensor<9216x3072xf16>
    %4310 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.img_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_6024 = torch.constant.int 0
    %int1_6025 = torch.constant.int 1
    %4311 = torch.aten.transpose.int %4310, %int0_6024, %int1_6025 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.12.img_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.12.img_attn.qkv.bias : tensor<9216xf16>
    %4312 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.img_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Bias, activations and weight are all upcast to f32 (dtype code 6) for the matmul.
    %int6_6026 = torch.constant.int 6
    %4313 = torch.prims.convert_element_type %4312, %int6_6026 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_6027 = torch.constant.int 6
    %4314 = torch.prims.convert_element_type %4309, %int6_6027 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_6028 = torch.constant.int 6
    %4315 = torch.prims.convert_element_type %4311, %int6_6028 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %4316 = torch.aten.mm %4314, %4315 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[4096,9216],f32>
    // Both the product and the bias are multiplied by the scalar 1 before being added.
    %int1_6029 = torch.constant.int 1
    %4317 = torch.aten.mul.Scalar %4316, %int1_6029 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int1_6030 = torch.constant.int 1
    %4318 = torch.aten.mul.Scalar %4313, %int1_6030 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_6031 = torch.constant.int 1
    %4319 = torch.aten.add.Tensor %4317, %4318, %int1_6031 : !torch.vtensor<[4096,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    // Downcast to f16 (dtype code 5) and restore the leading size-1 dim.
    %int5_6032 = torch.constant.int 5
    %4320 = torch.prims.convert_element_type %4319, %int5_6032 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f16>
    %int1_6033 = torch.constant.int 1
    %int4096_6034 = torch.constant.int 4096
    %int9216_6035 = torch.constant.int 9216
    %4321 = torch.prim.ListConstruct %int1_6033, %int4096_6034, %int9216_6035 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4322 = torch.aten.view %4320, %4321 : !torch.vtensor<[4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,9216],f16>
    // Trace the tensor under the key "img_qkv": cast to a fully dynamic shape,
    // read each dim with tensor.dim, trace, then cast back to the static shape.
    // Downstream ops use %4324 (the value after the trace round-trip).
    %4323 = torch_c.to_builtin_tensor %4322 : !torch.vtensor<[1,4096,9216],f16> -> tensor<1x4096x9216xf16>
    %cast_6036 = tensor.cast %4323 : tensor<1x4096x9216xf16> to tensor<?x?x?xf16>
    %c0_6037 = arith.constant 0 : index
    %dim_6038 = tensor.dim %cast_6036, %c0_6037 : tensor<?x?x?xf16>
    %c1_6039 = arith.constant 1 : index
    %dim_6040 = tensor.dim %cast_6036, %c1_6039 : tensor<?x?x?xf16>
    %c2_6041 = arith.constant 2 : index
    %dim_6042 = tensor.dim %cast_6036, %c2_6041 : tensor<?x?x?xf16>
    flow.tensor.trace "img_qkv" = [%cast_6036 : tensor<?x?x?xf16>{%dim_6038, %dim_6040, %dim_6042}]
    %cast_6043 = tensor.cast %cast_6036 : tensor<?x?x?xf16> to tensor<1x4096x9216xf16>
    %4324 = torch_c.from_builtin_tensor %cast_6043 : tensor<1x4096x9216xf16> -> !torch.vtensor<[1,4096,9216],f16>
    // Image-stream query: take index 0 of the packed qkv tensor %4324,
    // RMS-normalize it over the last dim in f32, and multiply by
    // double_blocks.12 img_attn.norm.query_norm.scale.
    // View [1,4096,9216] as [1,4096,3,24,128].
    %int1_6044 = torch.constant.int 1
    %int4096_6045 = torch.constant.int 4096
    %int3_6046 = torch.constant.int 3
    %int24_6047 = torch.constant.int 24
    %int128_6048 = torch.constant.int 128
    %4325 = torch.prim.ListConstruct %int1_6044, %int4096_6045, %int3_6046, %int24_6047, %int128_6048 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4326 = torch.aten.view %4324, %4325 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    // Permute with order (2,0,3,1,4): [1,4096,3,24,128] -> [3,1,24,4096,128].
    %int2_6049 = torch.constant.int 2
    %int0_6050 = torch.constant.int 0
    %int3_6051 = torch.constant.int 3
    %int1_6052 = torch.constant.int 1
    %int4_6053 = torch.constant.int 4
    %4327 = torch.prim.ListConstruct %int2_6049, %int0_6050, %int3_6051, %int1_6052, %int4_6053 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4328 = torch.aten.permute %4326, %4327 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // select(dim = 0, index = 0) -> [1,24,4096,128].
    %int0_6054 = torch.constant.int 0
    %int0_6055 = torch.constant.int 0
    %4329 = torch.aten.select.int %4328, %int0_6054, %int0_6055 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // x * rsqrt(mean(x^2, dim = -1, keepdim = true) + eps), computed in f32.
    %int6_6056 = torch.constant.int 6
    %4330 = torch.prims.convert_element_type %4329, %int6_6056 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_6057 = torch.constant.int 2
    %4331 = torch.aten.pow.Tensor_Scalar %4330, %int2_6057 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_6058 = torch.constant.int -1
    %4332 = torch.prim.ListConstruct %int-1_6058 : (!torch.int) -> !torch.list<int>
    %true_6059 = torch.constant.bool true
    %none_6060 = torch.constant.none
    %4333 = torch.aten.mean.dim %4331, %4332, %true_6059, %none_6060 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_6061 = torch.constant.float 9.9999999999999995E-7
    %int1_6062 = torch.constant.int 1
    %4334 = torch.aten.add.Scalar %4333, %float9.999990e-07_6061, %int1_6062 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %4335 = torch.aten.rsqrt %4334 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %4336 = torch.aten.mul.Tensor %4330, %4335 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Downcast to f16, then apply the [128] scale, broadcast over the last dim.
    %int5_6063 = torch.constant.int 5
    %4337 = torch.prims.convert_element_type %4336, %int5_6063 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.12.img_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.12.img_attn.norm.query_norm.scale : tensor<128xf16>
    %4338 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.img_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %4339 = torch.aten.mul.Tensor %4337, %4338 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // Image-stream key: take index 1 of the packed qkv tensor %4324,
    // RMS-normalize it over the last dim in f32, and multiply by
    // double_blocks.12 img_attn.norm.key_norm.scale.
    // The view/permute below repeats, on the same input %4324, the one already
    // done for the query.
    %int1_6064 = torch.constant.int 1
    %int4096_6065 = torch.constant.int 4096
    %int3_6066 = torch.constant.int 3
    %int24_6067 = torch.constant.int 24
    %int128_6068 = torch.constant.int 128
    %4340 = torch.prim.ListConstruct %int1_6064, %int4096_6065, %int3_6066, %int24_6067, %int128_6068 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4341 = torch.aten.view %4324, %4340 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    // Permute with order (2,0,3,1,4): [1,4096,3,24,128] -> [3,1,24,4096,128].
    %int2_6069 = torch.constant.int 2
    %int0_6070 = torch.constant.int 0
    %int3_6071 = torch.constant.int 3
    %int1_6072 = torch.constant.int 1
    %int4_6073 = torch.constant.int 4
    %4342 = torch.prim.ListConstruct %int2_6069, %int0_6070, %int3_6071, %int1_6072, %int4_6073 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4343 = torch.aten.permute %4341, %4342 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // select(dim = 0, index = 1) -> [1,24,4096,128].
    %int0_6074 = torch.constant.int 0
    %int1_6075 = torch.constant.int 1
    %4344 = torch.aten.select.int %4343, %int0_6074, %int1_6075 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // x * rsqrt(mean(x^2, dim = -1, keepdim = true) + eps), computed in f32.
    %int6_6076 = torch.constant.int 6
    %4345 = torch.prims.convert_element_type %4344, %int6_6076 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_6077 = torch.constant.int 2
    %4346 = torch.aten.pow.Tensor_Scalar %4345, %int2_6077 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_6078 = torch.constant.int -1
    %4347 = torch.prim.ListConstruct %int-1_6078 : (!torch.int) -> !torch.list<int>
    %true_6079 = torch.constant.bool true
    %none_6080 = torch.constant.none
    %4348 = torch.aten.mean.dim %4346, %4347, %true_6079, %none_6080 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_6081 = torch.constant.float 9.9999999999999995E-7
    %int1_6082 = torch.constant.int 1
    %4349 = torch.aten.add.Scalar %4348, %float9.999990e-07_6081, %int1_6082 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %4350 = torch.aten.rsqrt %4349 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %4351 = torch.aten.mul.Tensor %4345, %4350 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Downcast to f16, then apply the [128] scale, broadcast over the last dim.
    %int5_6083 = torch.constant.int 5
    %4352 = torch.prims.convert_element_type %4351, %int5_6083 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.12.img_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.12.img_attn.norm.key_norm.scale : tensor<128xf16>
    %4353 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.img_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %4354 = torch.aten.mul.Tensor %4352, %4353 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // f16 -> f16 conversions of the normalized query (%4339) and key (%4354);
    // input and result types are identical.
    %int5_6084 = torch.constant.int 5
    %4355 = torch.prims.convert_element_type %4339, %int5_6084 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_6085 = torch.constant.int 5
    %4356 = torch.prims.convert_element_type %4354, %int5_6085 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Normalize %4255 ([1,512,3072] f16) over dim 2 and modulate it:
    //   out = (1 + %4293) * ((%4255 - mean) * rsqrt(var + eps)) + %4292
    // %4292 / %4293 are the first two 3072-wide slices of %4291.
    // NOTE(review): %4255 is defined before this chunk; presumably the
    // text-stream activations -- confirm against the producing ops.
    // Dtype code 6 upcasts to f32 for the statistics.
    %int6_6086 = torch.constant.int 6
    %4357 = torch.prims.convert_element_type %4255, %int6_6086 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // Variance (%result0) and mean (%result1) over dim 2, correction = 0, keepdim = true.
    %int2_6087 = torch.constant.int 2
    %4358 = torch.prim.ListConstruct %int2_6087 : (!torch.int) -> !torch.list<int>
    %int0_6088 = torch.constant.int 0
    %true_6089 = torch.constant.bool true
    %result0_6090, %result1_6091 = torch.aten.var_mean.correction %4357, %4358, %int0_6088, %true_6089 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(var + eps), with eps given by the float constant below.
    %float9.999990e-07_6092 = torch.constant.float 9.9999999999999995E-7
    %int1_6093 = torch.constant.int 1
    %4359 = torch.aten.add.Scalar %result0_6090, %float9.999990e-07_6092, %int1_6093 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %4360 = torch.aten.rsqrt %4359 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // The subtraction takes the f16 input %4255 directly; the result is f32.
    %int1_6094 = torch.constant.int 1
    %4361 = torch.aten.sub.Tensor %4255, %result1_6091, %int1_6094 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %4362 = torch.aten.mul.Tensor %4361, %4360 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    // Back to f16 (dtype code 5) before modulation.
    %int5_6095 = torch.constant.int 5
    %4363 = torch.prims.convert_element_type %4362, %int5_6095 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // (1 + %4293) * normalized, broadcasting [1,1,3072] over the 512 rows.
    %int1_6096 = torch.constant.int 1
    %int1_6097 = torch.constant.int 1
    %4364 = torch.aten.add.Scalar %4293, %int1_6096, %int1_6097 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %4365 = torch.aten.mul.Tensor %4364, %4363 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    // ... + %4292.
    %int1_6098 = torch.constant.int 1
    %4366 = torch.aten.add.Tensor %4365, %4292, %int1_6098 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // double_blocks.12 txt_attn.qkv projection of %4366:
    //   view to [512,3072] -> f32 matmul with weight^T -> + bias -> f16 -> view to [1,512,9216],
    // followed by a "txt_qkv" trace of the result.
    %int512_6099 = torch.constant.int 512
    %int3072_6100 = torch.constant.int 3072
    %4367 = torch.prim.ListConstruct %int512_6099, %int3072_6100 : (!torch.int, !torch.int) -> !torch.list<int>
    %4368 = torch.aten.view %4366, %4367 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Weight is stored as [9216,3072]; transpose dims 0 and 1 to get [3072,9216].
    %__auto.sampler.double_blocks.12.txt_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.12.txt_attn.qkv.weight : tensor<9216x3072xf16>
    %4369 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.txt_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_6101 = torch.constant.int 0
    %int1_6102 = torch.constant.int 1
    %4370 = torch.aten.transpose.int %4369, %int0_6101, %int1_6102 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.12.txt_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.12.txt_attn.qkv.bias : tensor<9216xf16>
    %4371 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.txt_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Bias, activations and weight are all upcast to f32 (dtype code 6) for the matmul.
    %int6_6103 = torch.constant.int 6
    %4372 = torch.prims.convert_element_type %4371, %int6_6103 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_6104 = torch.constant.int 6
    %4373 = torch.prims.convert_element_type %4368, %int6_6104 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_6105 = torch.constant.int 6
    %4374 = torch.prims.convert_element_type %4370, %int6_6105 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %4375 = torch.aten.mm %4373, %4374 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[512,9216],f32>
    // Both the product and the bias are multiplied by the scalar 1 before being added.
    %int1_6106 = torch.constant.int 1
    %4376 = torch.aten.mul.Scalar %4375, %int1_6106 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int1_6107 = torch.constant.int 1
    %4377 = torch.aten.mul.Scalar %4372, %int1_6107 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_6108 = torch.constant.int 1
    %4378 = torch.aten.add.Tensor %4376, %4377, %int1_6108 : !torch.vtensor<[512,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    // Downcast to f16 (dtype code 5) and restore the leading size-1 dim.
    %int5_6109 = torch.constant.int 5
    %4379 = torch.prims.convert_element_type %4378, %int5_6109 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f16>
    %int1_6110 = torch.constant.int 1
    %int512_6111 = torch.constant.int 512
    %int9216_6112 = torch.constant.int 9216
    %4380 = torch.prim.ListConstruct %int1_6110, %int512_6111, %int9216_6112 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4381 = torch.aten.view %4379, %4380 : !torch.vtensor<[512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,9216],f16>
    // Trace the tensor under the key "txt_qkv": cast to a fully dynamic shape,
    // read each dim with tensor.dim, trace, then cast back to the static shape.
    // Downstream ops use %4383 (the value after the trace round-trip).
    %4382 = torch_c.to_builtin_tensor %4381 : !torch.vtensor<[1,512,9216],f16> -> tensor<1x512x9216xf16>
    %cast_6113 = tensor.cast %4382 : tensor<1x512x9216xf16> to tensor<?x?x?xf16>
    %c0_6114 = arith.constant 0 : index
    %dim_6115 = tensor.dim %cast_6113, %c0_6114 : tensor<?x?x?xf16>
    %c1_6116 = arith.constant 1 : index
    %dim_6117 = tensor.dim %cast_6113, %c1_6116 : tensor<?x?x?xf16>
    %c2_6118 = arith.constant 2 : index
    %dim_6119 = tensor.dim %cast_6113, %c2_6118 : tensor<?x?x?xf16>
    flow.tensor.trace "txt_qkv" = [%cast_6113 : tensor<?x?x?xf16>{%dim_6115, %dim_6117, %dim_6119}]
    %cast_6120 = tensor.cast %cast_6113 : tensor<?x?x?xf16> to tensor<1x512x9216xf16>
    %4383 = torch_c.from_builtin_tensor %cast_6120 : tensor<1x512x9216xf16> -> !torch.vtensor<[1,512,9216],f16>
    // Text-stream query: take index 0 of the packed qkv tensor %4383,
    // RMS-normalize it over the last dim in f32, and multiply by
    // double_blocks.12 txt_attn.norm.query_norm.scale.
    // View [1,512,9216] as [1,512,3,24,128].
    %int1_6121 = torch.constant.int 1
    %int512_6122 = torch.constant.int 512
    %int3_6123 = torch.constant.int 3
    %int24_6124 = torch.constant.int 24
    %int128_6125 = torch.constant.int 128
    %4384 = torch.prim.ListConstruct %int1_6121, %int512_6122, %int3_6123, %int24_6124, %int128_6125 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4385 = torch.aten.view %4383, %4384 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    // Permute with order (2,0,3,1,4): [1,512,3,24,128] -> [3,1,24,512,128].
    %int2_6126 = torch.constant.int 2
    %int0_6127 = torch.constant.int 0
    %int3_6128 = torch.constant.int 3
    %int1_6129 = torch.constant.int 1
    %int4_6130 = torch.constant.int 4
    %4386 = torch.prim.ListConstruct %int2_6126, %int0_6127, %int3_6128, %int1_6129, %int4_6130 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4387 = torch.aten.permute %4385, %4386 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // select(dim = 0, index = 0) -> [1,24,512,128].
    %int0_6131 = torch.constant.int 0
    %int0_6132 = torch.constant.int 0
    %4388 = torch.aten.select.int %4387, %int0_6131, %int0_6132 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // x * rsqrt(mean(x^2, dim = -1, keepdim = true) + eps), computed in f32.
    %int6_6133 = torch.constant.int 6
    %4389 = torch.prims.convert_element_type %4388, %int6_6133 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_6134 = torch.constant.int 2
    %4390 = torch.aten.pow.Tensor_Scalar %4389, %int2_6134 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_6135 = torch.constant.int -1
    %4391 = torch.prim.ListConstruct %int-1_6135 : (!torch.int) -> !torch.list<int>
    %true_6136 = torch.constant.bool true
    %none_6137 = torch.constant.none
    %4392 = torch.aten.mean.dim %4390, %4391, %true_6136, %none_6137 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_6138 = torch.constant.float 9.9999999999999995E-7
    %int1_6139 = torch.constant.int 1
    %4393 = torch.aten.add.Scalar %4392, %float9.999990e-07_6138, %int1_6139 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %4394 = torch.aten.rsqrt %4393 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %4395 = torch.aten.mul.Tensor %4389, %4394 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    // Downcast to f16, then apply the [128] scale, broadcast over the last dim.
    %int5_6140 = torch.constant.int 5
    %4396 = torch.prims.convert_element_type %4395, %int5_6140 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.12.txt_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.12.txt_attn.norm.query_norm.scale : tensor<128xf16>
    %4397 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.txt_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %4398 = torch.aten.mul.Tensor %4396, %4397 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // Text-stream key: take index 1 of the packed qkv tensor %4383,
    // RMS-normalize it over the last dim in f32, and multiply by
    // double_blocks.12 txt_attn.norm.key_norm.scale.
    // The view/permute below repeats, on the same input %4383, the one already
    // done for the query.
    %int1_6141 = torch.constant.int 1
    %int512_6142 = torch.constant.int 512
    %int3_6143 = torch.constant.int 3
    %int24_6144 = torch.constant.int 24
    %int128_6145 = torch.constant.int 128
    %4399 = torch.prim.ListConstruct %int1_6141, %int512_6142, %int3_6143, %int24_6144, %int128_6145 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4400 = torch.aten.view %4383, %4399 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    // Permute with order (2,0,3,1,4): [1,512,3,24,128] -> [3,1,24,512,128].
    %int2_6146 = torch.constant.int 2
    %int0_6147 = torch.constant.int 0
    %int3_6148 = torch.constant.int 3
    %int1_6149 = torch.constant.int 1
    %int4_6150 = torch.constant.int 4
    %4401 = torch.prim.ListConstruct %int2_6146, %int0_6147, %int3_6148, %int1_6149, %int4_6150 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4402 = torch.aten.permute %4400, %4401 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // select(dim = 0, index = 1) -> [1,24,512,128].
    %int0_6151 = torch.constant.int 0
    %int1_6152 = torch.constant.int 1
    %4403 = torch.aten.select.int %4402, %int0_6151, %int1_6152 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // x * rsqrt(mean(x^2, dim = -1, keepdim = true) + eps), computed in f32.
    %int6_6153 = torch.constant.int 6
    %4404 = torch.prims.convert_element_type %4403, %int6_6153 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_6154 = torch.constant.int 2
    %4405 = torch.aten.pow.Tensor_Scalar %4404, %int2_6154 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_6155 = torch.constant.int -1
    %4406 = torch.prim.ListConstruct %int-1_6155 : (!torch.int) -> !torch.list<int>
    %true_6156 = torch.constant.bool true
    %none_6157 = torch.constant.none
    %4407 = torch.aten.mean.dim %4405, %4406, %true_6156, %none_6157 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_6158 = torch.constant.float 9.9999999999999995E-7
    %int1_6159 = torch.constant.int 1
    %4408 = torch.aten.add.Scalar %4407, %float9.999990e-07_6158, %int1_6159 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %4409 = torch.aten.rsqrt %4408 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %4410 = torch.aten.mul.Tensor %4404, %4409 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    // Downcast to f16, then apply the [128] scale, broadcast over the last dim.
    %int5_6160 = torch.constant.int 5
    %4411 = torch.prims.convert_element_type %4410, %int5_6160 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.12.txt_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.12.txt_attn.norm.key_norm.scale : tensor<128xf16>
    %4412 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.txt_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %4413 = torch.aten.mul.Tensor %4411, %4412 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // f16 -> f16 conversions of the normalized query (%4398) and key (%4413);
    // input and result types are identical.
    %int5_6161 = torch.constant.int 5
    %4414 = torch.prims.convert_element_type %4398, %int5_6161 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int5_6162 = torch.constant.int 5
    %4415 = torch.prims.convert_element_type %4413, %int5_6162 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Joint query: concatenate the text query (%4414, 512 rows) followed by the
    // image query (%4355, 4096 rows) along dim 2 -> [1,24,4608,128].
    %4416 = torch.prim.ListConstruct %4414, %4355 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_6163 = torch.constant.int 2
    %4417 = torch.aten.cat %4416, %int2_6163 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Joint key: same order (text %4415, then image %4356), same dim.
    %4418 = torch.prim.ListConstruct %4415, %4356 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_6164 = torch.constant.int 2
    %4419 = torch.aten.cat %4418, %int2_6164 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Joint value: take index 2 of each packed qkv tensor (text %4383, image %4324)
    // and concatenate text then image along dim 2. No normalization or scale is
    // applied to the value slices here.
    // Text: view [1,512,9216] as [1,512,3,24,128], permute (2,0,3,1,4), select(dim 0, index 2).
    %int1_6165 = torch.constant.int 1
    %int512_6166 = torch.constant.int 512
    %int3_6167 = torch.constant.int 3
    %int24_6168 = torch.constant.int 24
    %int128_6169 = torch.constant.int 128
    %4420 = torch.prim.ListConstruct %int1_6165, %int512_6166, %int3_6167, %int24_6168, %int128_6169 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4421 = torch.aten.view %4383, %4420 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_6170 = torch.constant.int 2
    %int0_6171 = torch.constant.int 0
    %int3_6172 = torch.constant.int 3
    %int1_6173 = torch.constant.int 1
    %int4_6174 = torch.constant.int 4
    %4422 = torch.prim.ListConstruct %int2_6170, %int0_6171, %int3_6172, %int1_6173, %int4_6174 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4423 = torch.aten.permute %4421, %4422 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    %int0_6175 = torch.constant.int 0
    %int2_6176 = torch.constant.int 2
    %4424 = torch.aten.select.int %4423, %int0_6175, %int2_6176 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Image: view [1,4096,9216] as [1,4096,3,24,128], permute (2,0,3,1,4), select(dim 0, index 2).
    %int1_6177 = torch.constant.int 1
    %int4096_6178 = torch.constant.int 4096
    %int3_6179 = torch.constant.int 3
    %int24_6180 = torch.constant.int 24
    %int128_6181 = torch.constant.int 128
    %4425 = torch.prim.ListConstruct %int1_6177, %int4096_6178, %int3_6179, %int24_6180, %int128_6181 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4426 = torch.aten.view %4324, %4425 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_6182 = torch.constant.int 2
    %int0_6183 = torch.constant.int 0
    %int3_6184 = torch.constant.int 3
    %int1_6185 = torch.constant.int 1
    %int4_6186 = torch.constant.int 4
    %4427 = torch.prim.ListConstruct %int2_6182, %int0_6183, %int3_6184, %int1_6185, %int4_6186 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4428 = torch.aten.permute %4426, %4427 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    %int0_6187 = torch.constant.int 0
    %int2_6188 = torch.constant.int 2
    %4429 = torch.aten.select.int %4428, %int0_6187, %int2_6188 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Concatenate along dim 2: 512 + 4096 = 4608 rows.
    %4430 = torch.prim.ListConstruct %4424, %4429 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_6189 = torch.constant.int 2
    %4431 = torch.aten.cat %4430, %int2_6189 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Trace the joint q, k and v tensors under the keys "q", "k" and "v".
    // Each follows the same pattern: convert to a builtin tensor, cast to a fully
    // dynamic shape, read the four dims, trace, cast back to the static shape and
    // convert back to a torch tensor (%4433, %4435, %4437 respectively).
    // Trace "q" (from %4417).
    %4432 = torch_c.to_builtin_tensor %4417 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_6190 = tensor.cast %4432 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_6191 = arith.constant 0 : index
    %dim_6192 = tensor.dim %cast_6190, %c0_6191 : tensor<?x?x?x?xf16>
    %c1_6193 = arith.constant 1 : index
    %dim_6194 = tensor.dim %cast_6190, %c1_6193 : tensor<?x?x?x?xf16>
    %c2_6195 = arith.constant 2 : index
    %dim_6196 = tensor.dim %cast_6190, %c2_6195 : tensor<?x?x?x?xf16>
    %c3_6197 = arith.constant 3 : index
    %dim_6198 = tensor.dim %cast_6190, %c3_6197 : tensor<?x?x?x?xf16>
    flow.tensor.trace "q" = [%cast_6190 : tensor<?x?x?x?xf16>{%dim_6192, %dim_6194, %dim_6196, %dim_6198}]
    %cast_6199 = tensor.cast %cast_6190 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %4433 = torch_c.from_builtin_tensor %cast_6199 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Trace "k" (from %4419).
    %4434 = torch_c.to_builtin_tensor %4419 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_6200 = tensor.cast %4434 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_6201 = arith.constant 0 : index
    %dim_6202 = tensor.dim %cast_6200, %c0_6201 : tensor<?x?x?x?xf16>
    %c1_6203 = arith.constant 1 : index
    %dim_6204 = tensor.dim %cast_6200, %c1_6203 : tensor<?x?x?x?xf16>
    %c2_6205 = arith.constant 2 : index
    %dim_6206 = tensor.dim %cast_6200, %c2_6205 : tensor<?x?x?x?xf16>
    %c3_6207 = arith.constant 3 : index
    %dim_6208 = tensor.dim %cast_6200, %c3_6207 : tensor<?x?x?x?xf16>
    flow.tensor.trace "k" = [%cast_6200 : tensor<?x?x?x?xf16>{%dim_6202, %dim_6204, %dim_6206, %dim_6208}]
    %cast_6209 = tensor.cast %cast_6200 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %4435 = torch_c.from_builtin_tensor %cast_6209 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Trace "v" (from %4431).
    %4436 = torch_c.to_builtin_tensor %4431 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_6210 = tensor.cast %4436 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_6211 = arith.constant 0 : index
    %dim_6212 = tensor.dim %cast_6210, %c0_6211 : tensor<?x?x?x?xf16>
    %c1_6213 = arith.constant 1 : index
    %dim_6214 = tensor.dim %cast_6210, %c1_6213 : tensor<?x?x?x?xf16>
    %c2_6215 = arith.constant 2 : index
    %dim_6216 = tensor.dim %cast_6210, %c2_6215 : tensor<?x?x?x?xf16>
    %c3_6217 = arith.constant 3 : index
    %dim_6218 = tensor.dim %cast_6210, %c3_6217 : tensor<?x?x?x?xf16>
    flow.tensor.trace "v" = [%cast_6210 : tensor<?x?x?x?xf16>{%dim_6212, %dim_6214, %dim_6216, %dim_6218}]
    %cast_6219 = tensor.cast %cast_6210 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %4437 = torch_c.from_builtin_tensor %cast_6219 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Combine the traced q (%4433) and k (%4435) with the table %211
    // ([1,1,4608,64,2,2] f32, defined above this excerpt). For each input x:
    //   out = %211[..., 0] * x_pairs[..., 0] + %211[..., 1] * x_pairs[..., 1]
    // where x_pairs is x viewed as [1,24,4608,64,1,2]. The math runs in f32 and
    // the result is cast back to f16.
    // NOTE(review): this has the shape of a rotary position embedding applied
    // to q and k; the origin of %211 is not visible here, so confirm upstream.

    // Upcast q to f32 and view the last dim (128) as 64 x 1 x 2.
    %int6_6220 = torch.constant.int 6
    %4438 = torch.prims.convert_element_type %4433, %int6_6220 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_6221 = torch.constant.int 1
    %int24_6222 = torch.constant.int 24
    %int4608_6223 = torch.constant.int 4608
    %int-1_6224 = torch.constant.int -1
    %int1_6225 = torch.constant.int 1
    %int2_6226 = torch.constant.int 2
    %4439 = torch.prim.ListConstruct %int1_6221, %int24_6222, %int4608_6223, %int-1_6224, %int1_6225, %int2_6226 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4440 = torch.aten.view %4438, %4439 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Same upcast + view for k.
    %int6_6227 = torch.constant.int 6
    %4441 = torch.prims.convert_element_type %4435, %int6_6227 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_6228 = torch.constant.int 1
    %int24_6229 = torch.constant.int 24
    %int4608_6230 = torch.constant.int 4608
    %int-1_6231 = torch.constant.int -1
    %int1_6232 = torch.constant.int 1
    %int2_6233 = torch.constant.int 2
    %4442 = torch.prim.ListConstruct %int1_6228, %int24_6229, %int4608_6230, %int-1_6231, %int1_6232, %int2_6233 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4443 = torch.aten.view %4441, %4442 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // q, first term: %211[..., 0] * q_pairs[..., 0]. The size-1 dims (dim 1 of
    // the %211 slice, dim 4 of the q slice) broadcast to [1,24,4608,64,2].
    %int5_6234 = torch.constant.int 5
    %int0_6235 = torch.constant.int 0
    %4444 = torch.aten.select.int %211, %int5_6234, %int0_6235 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_6236 = torch.constant.int 5
    %int0_6237 = torch.constant.int 0
    %4445 = torch.aten.select.int %4440, %int5_6236, %int0_6237 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %4446 = torch.aten.mul.Tensor %4444, %4445 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // q, second term: %211[..., 1] * q_pairs[..., 1].
    %int5_6238 = torch.constant.int 5
    %int1_6239 = torch.constant.int 1
    %4447 = torch.aten.select.int %211, %int5_6238, %int1_6239 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_6240 = torch.constant.int 5
    %int1_6241 = torch.constant.int 1
    %4448 = torch.aten.select.int %4440, %int5_6240, %int1_6241 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %4449 = torch.aten.mul.Tensor %4447, %4448 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two q terms (the trailing int 1 is the add's alpha multiplier).
    %int1_6242 = torch.constant.int 1
    %4450 = torch.aten.add.Tensor %4446, %4449, %int1_6242 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k, first term. %4451 repeats the %211 select already computed as %4444.
    %int5_6243 = torch.constant.int 5
    %int0_6244 = torch.constant.int 0
    %4451 = torch.aten.select.int %211, %int5_6243, %int0_6244 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_6245 = torch.constant.int 5
    %int0_6246 = torch.constant.int 0
    %4452 = torch.aten.select.int %4443, %int5_6245, %int0_6246 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %4453 = torch.aten.mul.Tensor %4451, %4452 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k, second term. %4454 repeats the %211 select already computed as %4447.
    %int5_6247 = torch.constant.int 5
    %int1_6248 = torch.constant.int 1
    %4454 = torch.aten.select.int %211, %int5_6247, %int1_6248 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_6249 = torch.constant.int 5
    %int1_6250 = torch.constant.int 1
    %4455 = torch.aten.select.int %4443, %int5_6249, %int1_6250 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %4456 = torch.aten.mul.Tensor %4454, %4455 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two k terms.
    %int1_6251 = torch.constant.int 1
    %4457 = torch.aten.add.Tensor %4453, %4456, %int1_6251 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Fold the (64, 2) pair layout of the q result back to 128 and cast to f16 -> %4460.
    %int1_6252 = torch.constant.int 1
    %int24_6253 = torch.constant.int 24
    %int4608_6254 = torch.constant.int 4608
    %int128_6255 = torch.constant.int 128
    %4458 = torch.prim.ListConstruct %int1_6252, %int24_6253, %int4608_6254, %int128_6255 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4459 = torch.aten.view %4450, %4458 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_6256 = torch.constant.int 5
    %4460 = torch.prims.convert_element_type %4459, %int5_6256 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Same fold + f16 cast for the k result -> %4463.
    %int1_6257 = torch.constant.int 1
    %int24_6258 = torch.constant.int 24
    %int4608_6259 = torch.constant.int 4608
    %int128_6260 = torch.constant.int 128
    %4461 = torch.prim.ListConstruct %int1_6257, %int24_6258, %int4608_6259, %int128_6260 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4462 = torch.aten.view %4457, %4461 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_6261 = torch.constant.int 5
    %4463 = torch.prims.convert_element_type %4462, %int5_6261 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Attention over the full 4608-row sequence, followed by a head merge and a
    // split of the sequence into rows [0,512) and [512,4608).
    // Operands of the attention op: %4460 (processed q), %4463 (processed k),
    // %4437 (traced v), then 0.0, false, none, none.
    // NOTE(review): by the ATen signature those trailing operands are presumably
    // dropout_p, is_causal, attn_mask and scale - confirm against the op schema.
    // Only result #0 is used in this excerpt; result #1 ([1,24,4608] f32) is not.
    %float0.000000e00_6262 = torch.constant.float 0.000000e+00
    %false_6263 = torch.constant.bool false
    %none_6264 = torch.constant.none
    %none_6265 = torch.constant.none
    %4464:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%4460, %4463, %4437, %float0.000000e00_6262, %false_6263, %none_6264, %none_6265) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2: [1,24,4608,128] -> [1,4608,24,128].
    %int0_6266 = torch.constant.int 0
    %int2_6267 = torch.constant.int 2
    %int1_6268 = torch.constant.int 1
    %int3_6269 = torch.constant.int 3
    %4465 = torch.prim.ListConstruct %int0_6266, %int2_6267, %int1_6268, %int3_6269 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4466 = torch.aten.permute %4464#0, %4465 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the last two dims: 24 * 128 = 3072.
    %int1_6270 = torch.constant.int 1
    %int4608_6271 = torch.constant.int 4608
    %int3072_6272 = torch.constant.int 3072
    %4467 = torch.prim.ListConstruct %int1_6270, %int4608_6271, %int3072_6272 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4468 = torch.aten.view %4466, %4467 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1); the shape is unchanged.
    %int0_6273 = torch.constant.int 0
    %int0_6274 = torch.constant.int 0
    %int9223372036854775807_6275 = torch.constant.int 9223372036854775807
    %int1_6276 = torch.constant.int 1
    %4469 = torch.aten.slice.Tensor %4468, %int0_6273, %int0_6274, %int9223372036854775807_6275, %int1_6276 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Rows [0,512) along dim 1 -> %4470; this is later fed to txt_attn.proj.
    %int1_6277 = torch.constant.int 1
    %int0_6278 = torch.constant.int 0
    %int512_6279 = torch.constant.int 512
    %int1_6280 = torch.constant.int 1
    %4470 = torch.aten.slice.Tensor %4469, %int1_6277, %int0_6278, %int512_6279, %int1_6280 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Second full-range slice on dim 0 of the same %4468.
    %int0_6281 = torch.constant.int 0
    %int0_6282 = torch.constant.int 0
    %int9223372036854775807_6283 = torch.constant.int 9223372036854775807
    %int1_6284 = torch.constant.int 1
    %4471 = torch.aten.slice.Tensor %4468, %int0_6281, %int0_6282, %int9223372036854775807_6283, %int1_6284 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Rows [512, end) along dim 1 -> %4472 (4096 rows); this is later fed to img_attn.proj.
    %int1_6285 = torch.constant.int 1
    %int512_6286 = torch.constant.int 512
    %int9223372036854775807_6287 = torch.constant.int 9223372036854775807
    %int1_6288 = torch.constant.int 1
    %4472 = torch.aten.slice.Tensor %4471, %int1_6285, %int512_6286, %int9223372036854775807_6287, %int1_6288 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear projection of the 4096-row attention slice (%4472) with the
    // double_blocks.12.img_attn.proj parameters, followed by a scaled residual
    // add:  %4489 = %4195 + %4273 * proj(%4472).
    // %4195 and %4273 are defined above this excerpt.
    // NOTE(review): %4273 looks like a per-channel gate and %4195 like the
    // incoming image-stream activations - confirm upstream.

    // Drop the leading size-1 dim so the input is a 2-D matmul operand.
    %int4096_6289 = torch.constant.int 4096
    %int3072_6290 = torch.constant.int 3072
    %4473 = torch.prim.ListConstruct %int4096_6289, %int3072_6290 : (!torch.int, !torch.int) -> !torch.list<int>
    %4474 = torch.aten.view %4472, %4473 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Load the weight and transpose it so the matmul computes x @ W^T.
    %__auto.sampler.double_blocks.12.img_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.12.img_attn.proj.weight : tensor<3072x3072xf16>
    %4475 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.img_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_6291 = torch.constant.int 0
    %int1_6292 = torch.constant.int 1
    %4476 = torch.aten.transpose.int %4475, %int0_6291, %int1_6292 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.12.img_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.12.img_attn.proj.bias : tensor<3072xf16>
    %4477 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.img_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, input and weight to f32; the matmul and bias add run in f32.
    %int6_6293 = torch.constant.int 6
    %4478 = torch.prims.convert_element_type %4477, %int6_6293 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_6294 = torch.constant.int 6
    %4479 = torch.prims.convert_element_type %4474, %int6_6294 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_6295 = torch.constant.int 6
    %4480 = torch.prims.convert_element_type %4476, %int6_6295 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %4481 = torch.aten.mm %4479, %4480 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Both the product and the bias are multiplied by the scalar 1 before being added.
    %int1_6296 = torch.constant.int 1
    %4482 = torch.aten.mul.Scalar %4481, %int1_6296 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_6297 = torch.constant.int 1
    %4483 = torch.aten.mul.Scalar %4478, %int1_6297 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_6298 = torch.constant.int 1
    %4484 = torch.aten.add.Tensor %4482, %4483, %int1_6298 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    // Back to f16 and restore the leading size-1 dim.
    %int5_6299 = torch.constant.int 5
    %4485 = torch.prims.convert_element_type %4484, %int5_6299 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_6300 = torch.constant.int 1
    %int4096_6301 = torch.constant.int 4096
    %int3072_6302 = torch.constant.int 3072
    %4486 = torch.prim.ListConstruct %int1_6300, %int4096_6301, %int3072_6302 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4487 = torch.aten.view %4485, %4486 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Scale by %4273 ([1,1,3072], broadcast over the 4096 rows) and add to %4195.
    %4488 = torch.aten.mul.Tensor %4273, %4487 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_6303 = torch.constant.int 1
    %4489 = torch.aten.add.Tensor %4195, %4488, %int1_6303 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Normalize %4489 over its last dim and apply a scale and shift:
    //   %4499 = (1 + %4275) * f16((%4489 - mean) * rsqrt(var + eps)) + %4274
    // No learned weight or bias is loaded for the normalization itself.
    // %4274 and %4275 are defined above this excerpt.
    // NOTE(review): presumably per-sample modulation shift/scale vectors - confirm upstream.

    // Scale factor: %4275 + 1.
    %int1_6304 = torch.constant.int 1
    %int1_6305 = torch.constant.int 1
    %4490 = torch.aten.add.Scalar %4275, %int1_6304, %int1_6305 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Statistics are computed in f32 over dim 2 with correction 0 and keepdim = true.
    // result0 is the variance and result1 the mean.
    %int6_6306 = torch.constant.int 6
    %4491 = torch.prims.convert_element_type %4489, %int6_6306 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %int2_6307 = torch.constant.int 2
    %4492 = torch.prim.ListConstruct %int2_6307 : (!torch.int) -> !torch.list<int>
    %int0_6308 = torch.constant.int 0
    %true_6309 = torch.constant.bool true
    %result0_6310, %result1_6311 = torch.aten.var_mean.correction %4491, %4492, %int0_6308, %true_6309 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(var + eps), with eps ~= 1e-6.
    %float9.999990e-07_6312 = torch.constant.float 9.9999999999999995E-7
    %int1_6313 = torch.constant.int 1
    %4493 = torch.aten.add.Scalar %result0_6310, %float9.999990e-07_6312, %int1_6313 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %4494 = torch.aten.rsqrt %4493 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // The subtraction takes the f16 %4489 and the f32 mean and yields f32.
    %int1_6314 = torch.constant.int 1
    %4495 = torch.aten.sub.Tensor %4489, %result1_6311, %int1_6314 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %4496 = torch.aten.mul.Tensor %4495, %4494 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Cast the normalized values to f16, then scale by %4490 and shift by %4274.
    %int5_6315 = torch.constant.int 5
    %4497 = torch.prims.convert_element_type %4496, %int5_6315 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %4498 = torch.aten.mul.Tensor %4490, %4497 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_6316 = torch.constant.int 1
    %4499 = torch.aten.add.Tensor %4498, %4274, %int1_6316 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Two-layer MLP on %4499 using the double_blocks.12.img_mlp.{0,2}
    // parameters, with a tanh-approximated GELU in between, followed by a
    // scaled residual add:  %4532 = %4489 + %4276 * mlp(%4499).
    // %4276 is defined above this excerpt.
    // NOTE(review): %4276 is presumably a per-channel gate - confirm upstream.

    // Flatten to 2-D for the first matmul.
    %int4096_6317 = torch.constant.int 4096
    %int3072_6318 = torch.constant.int 3072
    %4500 = torch.prim.ListConstruct %int4096_6317, %int3072_6318 : (!torch.int, !torch.int) -> !torch.list<int>
    %4501 = torch.aten.view %4499, %4500 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // First linear layer, 3072 -> 12288: weight is transposed, math runs in f32.
    %__auto.sampler.double_blocks.12.img_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.12.img_mlp.0.weight : tensor<12288x3072xf16>
    %4502 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.img_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_6319 = torch.constant.int 0
    %int1_6320 = torch.constant.int 1
    %4503 = torch.aten.transpose.int %4502, %int0_6319, %int1_6320 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.12.img_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.12.img_mlp.0.bias : tensor<12288xf16>
    %4504 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.img_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    %int6_6321 = torch.constant.int 6
    %4505 = torch.prims.convert_element_type %4504, %int6_6321 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_6322 = torch.constant.int 6
    %4506 = torch.prims.convert_element_type %4501, %int6_6322 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_6323 = torch.constant.int 6
    %4507 = torch.prims.convert_element_type %4503, %int6_6323 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %4508 = torch.aten.mm %4506, %4507 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[4096,12288],f32>
    // Product and bias are each multiplied by the scalar 1, then added.
    %int1_6324 = torch.constant.int 1
    %4509 = torch.aten.mul.Scalar %4508, %int1_6324 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int1_6325 = torch.constant.int 1
    %4510 = torch.aten.mul.Scalar %4505, %int1_6325 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_6326 = torch.constant.int 1
    %4511 = torch.aten.add.Tensor %4509, %4510, %int1_6326 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    // Back to f16 and to 3-D for the activation.
    %int5_6327 = torch.constant.int 5
    %4512 = torch.prims.convert_element_type %4511, %int5_6327 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f16>
    %int1_6328 = torch.constant.int 1
    %int4096_6329 = torch.constant.int 4096
    %int12288_6330 = torch.constant.int 12288
    %4513 = torch.prim.ListConstruct %int1_6328, %int4096_6329, %int12288_6330 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4514 = torch.aten.view %4512, %4513 : !torch.vtensor<[4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,12288],f16>
    // GELU with the "tanh" approximation, evaluated on f16.
    %str_6331 = torch.constant.str "tanh"
    %4515 = torch.aten.gelu %4514, %str_6331 : !torch.vtensor<[1,4096,12288],f16>, !torch.str -> !torch.vtensor<[1,4096,12288],f16>
    // Flatten again for the second matmul.
    %int4096_6332 = torch.constant.int 4096
    %int12288_6333 = torch.constant.int 12288
    %4516 = torch.prim.ListConstruct %int4096_6332, %int12288_6333 : (!torch.int, !torch.int) -> !torch.list<int>
    %4517 = torch.aten.view %4515, %4516 : !torch.vtensor<[1,4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[4096,12288],f16>
    // Second linear layer, 12288 -> 3072, same transpose / f32 pattern.
    %__auto.sampler.double_blocks.12.img_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.12.img_mlp.2.weight : tensor<3072x12288xf16>
    %4518 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.img_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_6334 = torch.constant.int 0
    %int1_6335 = torch.constant.int 1
    %4519 = torch.aten.transpose.int %4518, %int0_6334, %int1_6335 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.12.img_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.12.img_mlp.2.bias : tensor<3072xf16>
    %4520 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.img_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_6336 = torch.constant.int 6
    %4521 = torch.prims.convert_element_type %4520, %int6_6336 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_6337 = torch.constant.int 6
    %4522 = torch.prims.convert_element_type %4517, %int6_6337 : !torch.vtensor<[4096,12288],f16>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int6_6338 = torch.constant.int 6
    %4523 = torch.prims.convert_element_type %4519, %int6_6338 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %4524 = torch.aten.mm %4522, %4523 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    %int1_6339 = torch.constant.int 1
    %4525 = torch.aten.mul.Scalar %4524, %int1_6339 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_6340 = torch.constant.int 1
    %4526 = torch.aten.mul.Scalar %4521, %int1_6340 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_6341 = torch.constant.int 1
    %4527 = torch.aten.add.Tensor %4525, %4526, %int1_6341 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    // Back to f16 and to [1,4096,3072].
    %int5_6342 = torch.constant.int 5
    %4528 = torch.prims.convert_element_type %4527, %int5_6342 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_6343 = torch.constant.int 1
    %int4096_6344 = torch.constant.int 4096
    %int3072_6345 = torch.constant.int 3072
    %4529 = torch.prim.ListConstruct %int1_6343, %int4096_6344, %int3072_6345 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4530 = torch.aten.view %4528, %4529 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Scale by %4276 ([1,1,3072], broadcast) and add to %4489, the input of the preceding normalization.
    %4531 = torch.aten.mul.Tensor %4276, %4530 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_6346 = torch.constant.int 1
    %4532 = torch.aten.add.Tensor %4489, %4531, %int1_6346 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear projection of the 512-row attention slice (%4470) with the
    // double_blocks.12.txt_attn.proj parameters, followed by a scaled residual
    // add:  %4549 = %4255 + %4294 * proj(%4470).
    // %4255 and %4294 are defined above this excerpt.
    // NOTE(review): %4294 looks like a per-channel gate and %4255 like the
    // incoming text-stream activations - confirm upstream.

    // Drop the leading size-1 dim so the input is a 2-D matmul operand.
    %int512_6347 = torch.constant.int 512
    %int3072_6348 = torch.constant.int 3072
    %4533 = torch.prim.ListConstruct %int512_6347, %int3072_6348 : (!torch.int, !torch.int) -> !torch.list<int>
    %4534 = torch.aten.view %4470, %4533 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Load the weight and transpose it so the matmul computes x @ W^T.
    %__auto.sampler.double_blocks.12.txt_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.12.txt_attn.proj.weight : tensor<3072x3072xf16>
    %4535 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.txt_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_6349 = torch.constant.int 0
    %int1_6350 = torch.constant.int 1
    %4536 = torch.aten.transpose.int %4535, %int0_6349, %int1_6350 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.12.txt_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.12.txt_attn.proj.bias : tensor<3072xf16>
    %4537 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.txt_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, input and weight to f32; the matmul and bias add run in f32.
    %int6_6351 = torch.constant.int 6
    %4538 = torch.prims.convert_element_type %4537, %int6_6351 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_6352 = torch.constant.int 6
    %4539 = torch.prims.convert_element_type %4534, %int6_6352 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_6353 = torch.constant.int 6
    %4540 = torch.prims.convert_element_type %4536, %int6_6353 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %4541 = torch.aten.mm %4539, %4540 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Both the product and the bias are multiplied by the scalar 1 before being added.
    %int1_6354 = torch.constant.int 1
    %4542 = torch.aten.mul.Scalar %4541, %int1_6354 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_6355 = torch.constant.int 1
    %4543 = torch.aten.mul.Scalar %4538, %int1_6355 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_6356 = torch.constant.int 1
    %4544 = torch.aten.add.Tensor %4542, %4543, %int1_6356 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    // Back to f16 and restore the leading size-1 dim.
    %int5_6357 = torch.constant.int 5
    %4545 = torch.prims.convert_element_type %4544, %int5_6357 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_6358 = torch.constant.int 1
    %int512_6359 = torch.constant.int 512
    %int3072_6360 = torch.constant.int 3072
    %4546 = torch.prim.ListConstruct %int1_6358, %int512_6359, %int3072_6360 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4547 = torch.aten.view %4545, %4546 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Scale by %4294 ([1,1,3072], broadcast over the 512 rows) and add to %4255.
    %4548 = torch.aten.mul.Tensor %4294, %4547 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_6361 = torch.constant.int 1
    %4549 = torch.aten.add.Tensor %4255, %4548, %int1_6361 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Normalize %4549 over its last dim and apply a scale and shift:
    //   %4559 = (1 + %4296) * f16((%4549 - mean) * rsqrt(var + eps)) + %4295
    // No learned weight or bias is loaded for the normalization itself.
    // %4295 and %4296 are defined above this excerpt.
    // NOTE(review): presumably per-sample modulation shift/scale vectors - confirm upstream.

    // Scale factor: %4296 + 1.
    %int1_6362 = torch.constant.int 1
    %int1_6363 = torch.constant.int 1
    %4550 = torch.aten.add.Scalar %4296, %int1_6362, %int1_6363 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Statistics are computed in f32 over dim 2 with correction 0 and keepdim = true.
    // result0 is the variance and result1 the mean.
    %int6_6364 = torch.constant.int 6
    %4551 = torch.prims.convert_element_type %4549, %int6_6364 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %int2_6365 = torch.constant.int 2
    %4552 = torch.prim.ListConstruct %int2_6365 : (!torch.int) -> !torch.list<int>
    %int0_6366 = torch.constant.int 0
    %true_6367 = torch.constant.bool true
    %result0_6368, %result1_6369 = torch.aten.var_mean.correction %4551, %4552, %int0_6366, %true_6367 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(var + eps), with eps ~= 1e-6.
    %float9.999990e-07_6370 = torch.constant.float 9.9999999999999995E-7
    %int1_6371 = torch.constant.int 1
    %4553 = torch.aten.add.Scalar %result0_6368, %float9.999990e-07_6370, %int1_6371 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %4554 = torch.aten.rsqrt %4553 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // The subtraction takes the f16 %4549 and the f32 mean and yields f32.
    %int1_6372 = torch.constant.int 1
    %4555 = torch.aten.sub.Tensor %4549, %result1_6369, %int1_6372 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %4556 = torch.aten.mul.Tensor %4555, %4554 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    // Cast the normalized values to f16, then scale by %4550 and shift by %4295.
    %int5_6373 = torch.constant.int 5
    %4557 = torch.prims.convert_element_type %4556, %int5_6373 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %4558 = torch.aten.mul.Tensor %4550, %4557 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_6374 = torch.constant.int 1
    %4559 = torch.aten.add.Tensor %4558, %4295, %int1_6374 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %int512_6375 = torch.constant.int 512
    %int3072_6376 = torch.constant.int 3072
    %4560 = torch.prim.ListConstruct %int512_6375, %int3072_6376 : (!torch.int, !torch.int) -> !torch.list<int>
    %4561 = torch.aten.view %4559, %4560 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.12.txt_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.12.txt_mlp.0.weight : tensor<12288x3072xf16>
    %4562 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.txt_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_6377 = torch.constant.int 0
    %int1_6378 = torch.constant.int 1
    %4563 = torch.aten.transpose.int %4562, %int0_6377, %int1_6378 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.12.txt_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.12.txt_mlp.0.bias : tensor<12288xf16>
    %4564 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.txt_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    %int6_6379 = torch.constant.int 6
    %4565 = torch.prims.convert_element_type %4564, %int6_6379 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_6380 = torch.constant.int 6
    %4566 = torch.prims.convert_element_type %4561, %int6_6380 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_6381 = torch.constant.int 6
    %4567 = torch.prims.convert_element_type %4563, %int6_6381 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %4568 = torch.aten.mm %4566, %4567 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[512,12288],f32>
    %int1_6382 = torch.constant.int 1
    %4569 = torch.aten.mul.Scalar %4568, %int1_6382 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int1_6383 = torch.constant.int 1
    %4570 = torch.aten.mul.Scalar %4565, %int1_6383 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_6384 = torch.constant.int 1
    %4571 = torch.aten.add.Tensor %4569, %4570, %int1_6384 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int5_6385 = torch.constant.int 5
    %4572 = torch.prims.convert_element_type %4571, %int5_6385 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f16>
    %int1_6386 = torch.constant.int 1
    %int512_6387 = torch.constant.int 512
    %int12288_6388 = torch.constant.int 12288
    %4573 = torch.prim.ListConstruct %int1_6386, %int512_6387, %int12288_6388 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4574 = torch.aten.view %4572, %4573 : !torch.vtensor<[512,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,512,12288],f16>
    %str_6389 = torch.constant.str "tanh"
    %4575 = torch.aten.gelu %4574, %str_6389 : !torch.vtensor<[1,512,12288],f16>, !torch.str -> !torch.vtensor<[1,512,12288],f16>
    %int512_6390 = torch.constant.int 512
    %int12288_6391 = torch.constant.int 12288
    %4576 = torch.prim.ListConstruct %int512_6390, %int12288_6391 : (!torch.int, !torch.int) -> !torch.list<int>
    %4577 = torch.aten.view %4575, %4576 : !torch.vtensor<[1,512,12288],f16>, !torch.list<int> -> !torch.vtensor<[512,12288],f16>
    // Linear layer double_blocks.12.txt_mlp.2: [512,12288] x [12288,3072] + bias.
    // The weight is stored as [3072,12288] and transposed before the matmul.
    %__auto.sampler.double_blocks.12.txt_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.12.txt_mlp.2.weight : tensor<3072x12288xf16>
    %4578 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.txt_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_6392 = torch.constant.int 0
    %int1_6393 = torch.constant.int 1
    %4579 = torch.aten.transpose.int %4578, %int0_6392, %int1_6393 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.12.txt_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.12.txt_mlp.2.bias : tensor<3072xf16>
    %4580 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.txt_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Promote bias, activation and weight to f32 (dtype code 6 -> f32, per the
    // result types) so the matmul and bias add are carried out in f32.
    %int6_6394 = torch.constant.int 6
    %4581 = torch.prims.convert_element_type %4580, %int6_6394 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_6395 = torch.constant.int 6
    %4582 = torch.prims.convert_element_type %4577, %int6_6395 : !torch.vtensor<[512,12288],f16>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int6_6396 = torch.constant.int 6
    %4583 = torch.prims.convert_element_type %4579, %int6_6396 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %4584 = torch.aten.mm %4582, %4583 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Both operands are scaled by the integer constant 1 before the add, so
    // the scaling is an identity here.
    // NOTE(review): looks like a decomposed addmm with alpha = beta = 1 — confirm.
    %int1_6397 = torch.constant.int 1
    %4585 = torch.aten.mul.Scalar %4584, %int1_6397 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_6398 = torch.constant.int 1
    %4586 = torch.aten.mul.Scalar %4581, %int1_6398 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_6399 = torch.constant.int 1
    %4587 = torch.aten.add.Tensor %4585, %4586, %int1_6399 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    // Cast the linear output back to f16 and restore the batch dimension.
    %int5_6400 = torch.constant.int 5
    %4588 = torch.prims.convert_element_type %4587, %int5_6400 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_6401 = torch.constant.int 1
    %int512_6402 = torch.constant.int 512
    %int3072_6403 = torch.constant.int 3072
    %4589 = torch.prim.ListConstruct %int1_6401, %int512_6402, %int3072_6403 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4590 = torch.aten.view %4588, %4589 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Multiply the MLP output by the broadcast [1,1,3072] tensor %4297 and add
    // the product onto %4549 (alpha = 1). %4592 feeds the text-stream
    // normalization later in this chunk.
    // NOTE(review): %4297 is presumably the txt MLP gate of block 12 and %4549
    // the text residual stream — both are defined outside this view; confirm.
    %4591 = torch.aten.mul.Tensor %4297, %4590 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_6404 = torch.constant.int 1
    %4592 = torch.aten.add.Tensor %4549, %4591, %int1_6404 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Image modulation for double block 13: SiLU(%119) followed by the linear
    // layer double_blocks.13.img_mod.lin, producing a [1,18432] tensor that is
    // split below into six 3072-wide chunks.
    // NOTE(review): %119 is presumably the shared conditioning vector — it is
    // defined outside this view; confirm.
    %4593 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.double_blocks.13.img_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.13.img_mod.lin.weight : tensor<18432x3072xf16>
    %4594 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.img_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    // Weight is stored [18432,3072]; transpose to [3072,18432] for x @ W^T.
    %int0_6405 = torch.constant.int 0
    %int1_6406 = torch.constant.int 1
    %4595 = torch.aten.transpose.int %4594, %int0_6405, %int1_6406 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.13.img_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.13.img_mod.lin.bias : tensor<18432xf16>
    %4596 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.img_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Promote bias, input and weight to f32 for the matmul and bias add.
    %int6_6407 = torch.constant.int 6
    %4597 = torch.prims.convert_element_type %4596, %int6_6407 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_6408 = torch.constant.int 6
    %4598 = torch.prims.convert_element_type %4593, %int6_6408 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_6409 = torch.constant.int 6
    %4599 = torch.prims.convert_element_type %4595, %int6_6409 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %4600 = torch.aten.mm %4598, %4599 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Scaling both operands by the constant 1 is an identity; then add the bias.
    %int1_6410 = torch.constant.int 1
    %4601 = torch.aten.mul.Scalar %4600, %int1_6410 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_6411 = torch.constant.int 1
    %4602 = torch.aten.mul.Scalar %4597, %int1_6411 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_6412 = torch.constant.int 1
    %4603 = torch.aten.add.Tensor %4601, %4602, %int1_6412 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_6413 = torch.constant.int 5
    %4604 = torch.prims.convert_element_type %4603, %int5_6413 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): the shape is
    // unchanged. Then insert a unit dimension at position 1 -> [1,1,18432].
    %int0_6414 = torch.constant.int 0
    %int0_6415 = torch.constant.int 0
    %int9223372036854775807_6416 = torch.constant.int 9223372036854775807
    %int1_6417 = torch.constant.int 1
    %4605 = torch.aten.slice.Tensor %4604, %int0_6414, %int0_6415, %int9223372036854775807_6416, %int1_6417 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    %int1_6418 = torch.constant.int 1
    %4606 = torch.aten.unsqueeze %4605, %int1_6418 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Another full-range slice, this time on dim 2; the shape is again unchanged.
    %int2_6419 = torch.constant.int 2
    %int0_6420 = torch.constant.int 0
    %int9223372036854775807_6421 = torch.constant.int 9223372036854775807
    %int1_6422 = torch.constant.int 1
    %4607 = torch.aten.slice.Tensor %4606, %int2_6419, %int0_6420, %int9223372036854775807_6421, %int1_6422 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Chunk 0, columns [0, 3072): used below as the additive shift applied
    // after the image-stream normalization.
    %int-1_6423 = torch.constant.int -1
    %int0_6424 = torch.constant.int 0
    %int3072_6425 = torch.constant.int 3072
    %int1_6426 = torch.constant.int 1
    %4608 = torch.aten.slice.Tensor %4607, %int-1_6423, %int0_6424, %int3072_6425, %int1_6426 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1, columns [3072, 6144): used below as the scale, applied as
    // (1 + scale) to the normalized image stream.
    %int-1_6427 = torch.constant.int -1
    %int3072_6428 = torch.constant.int 3072
    %int6144_6429 = torch.constant.int 6144
    %int1_6430 = torch.constant.int 1
    %4609 = torch.aten.slice.Tensor %4607, %int-1_6427, %int3072_6428, %int6144_6429, %int1_6430 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunks 2-5, columns [6144, 18432) in 3072-wide steps. They are not
    // consumed inside this chunk.
    // NOTE(review): presumably attention gate, MLP shift, MLP scale and MLP
    // gate, in that order — confirm against the uses further down the file.
    %int-1_6431 = torch.constant.int -1
    %int6144_6432 = torch.constant.int 6144
    %int9216_6433 = torch.constant.int 9216
    %int1_6434 = torch.constant.int 1
    %4610 = torch.aten.slice.Tensor %4607, %int-1_6431, %int6144_6432, %int9216_6433, %int1_6434 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_6435 = torch.constant.int -1
    %int9216_6436 = torch.constant.int 9216
    %int12288_6437 = torch.constant.int 12288
    %int1_6438 = torch.constant.int 1
    %4611 = torch.aten.slice.Tensor %4607, %int-1_6435, %int9216_6436, %int12288_6437, %int1_6438 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_6439 = torch.constant.int -1
    %int12288_6440 = torch.constant.int 12288
    %int15360_6441 = torch.constant.int 15360
    %int1_6442 = torch.constant.int 1
    %4612 = torch.aten.slice.Tensor %4607, %int-1_6439, %int12288_6440, %int15360_6441, %int1_6442 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_6443 = torch.constant.int -1
    %int15360_6444 = torch.constant.int 15360
    %int18432_6445 = torch.constant.int 18432
    %int1_6446 = torch.constant.int 1
    %4613 = torch.aten.slice.Tensor %4607, %int-1_6443, %int15360_6444, %int18432_6445, %int1_6446 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Text modulation for double block 13: SiLU(%119) followed by the linear
    // layer double_blocks.13.txt_mod.lin, producing a [1,18432] tensor that is
    // split below into six 3072-wide chunks. The SiLU of %119 is recomputed
    // here rather than reusing the earlier identical result.
    // NOTE(review): %119 is presumably the shared conditioning vector — it is
    // defined outside this view; confirm.
    %4614 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.double_blocks.13.txt_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.13.txt_mod.lin.weight : tensor<18432x3072xf16>
    %4615 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.txt_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    // Weight is stored [18432,3072]; transpose to [3072,18432] for x @ W^T.
    %int0_6447 = torch.constant.int 0
    %int1_6448 = torch.constant.int 1
    %4616 = torch.aten.transpose.int %4615, %int0_6447, %int1_6448 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.13.txt_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.13.txt_mod.lin.bias : tensor<18432xf16>
    %4617 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.txt_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Promote bias, input and weight to f32 for the matmul and bias add.
    %int6_6449 = torch.constant.int 6
    %4618 = torch.prims.convert_element_type %4617, %int6_6449 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_6450 = torch.constant.int 6
    %4619 = torch.prims.convert_element_type %4614, %int6_6450 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_6451 = torch.constant.int 6
    %4620 = torch.prims.convert_element_type %4616, %int6_6451 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %4621 = torch.aten.mm %4619, %4620 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Scaling both operands by the constant 1 is an identity; then add the bias.
    %int1_6452 = torch.constant.int 1
    %4622 = torch.aten.mul.Scalar %4621, %int1_6452 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_6453 = torch.constant.int 1
    %4623 = torch.aten.mul.Scalar %4618, %int1_6453 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_6454 = torch.constant.int 1
    %4624 = torch.aten.add.Tensor %4622, %4623, %int1_6454 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_6455 = torch.constant.int 5
    %4625 = torch.prims.convert_element_type %4624, %int5_6455 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (shape unchanged), then insert a unit
    // dimension at position 1 -> [1,1,18432].
    %int0_6456 = torch.constant.int 0
    %int0_6457 = torch.constant.int 0
    %int9223372036854775807_6458 = torch.constant.int 9223372036854775807
    %int1_6459 = torch.constant.int 1
    %4626 = torch.aten.slice.Tensor %4625, %int0_6456, %int0_6457, %int9223372036854775807_6458, %int1_6459 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    %int1_6460 = torch.constant.int 1
    %4627 = torch.aten.unsqueeze %4626, %int1_6460 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Another full-range slice, on dim 2; the shape is again unchanged.
    %int2_6461 = torch.constant.int 2
    %int0_6462 = torch.constant.int 0
    %int9223372036854775807_6463 = torch.constant.int 9223372036854775807
    %int1_6464 = torch.constant.int 1
    %4628 = torch.aten.slice.Tensor %4627, %int2_6461, %int0_6462, %int9223372036854775807_6463, %int1_6464 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Chunk 0, columns [0, 3072): used below as the additive shift applied
    // after the text-stream normalization.
    %int-1_6465 = torch.constant.int -1
    %int0_6466 = torch.constant.int 0
    %int3072_6467 = torch.constant.int 3072
    %int1_6468 = torch.constant.int 1
    %4629 = torch.aten.slice.Tensor %4628, %int-1_6465, %int0_6466, %int3072_6467, %int1_6468 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1, columns [3072, 6144): used below as the scale, applied as
    // (1 + scale) to the normalized text stream.
    %int-1_6469 = torch.constant.int -1
    %int3072_6470 = torch.constant.int 3072
    %int6144_6471 = torch.constant.int 6144
    %int1_6472 = torch.constant.int 1
    %4630 = torch.aten.slice.Tensor %4628, %int-1_6469, %int3072_6470, %int6144_6471, %int1_6472 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunks 2-5, columns [6144, 18432) in 3072-wide steps. They are not
    // consumed inside this chunk.
    // NOTE(review): presumably attention gate, MLP shift, MLP scale and MLP
    // gate, in that order — confirm against the uses further down the file.
    %int-1_6473 = torch.constant.int -1
    %int6144_6474 = torch.constant.int 6144
    %int9216_6475 = torch.constant.int 9216
    %int1_6476 = torch.constant.int 1
    %4631 = torch.aten.slice.Tensor %4628, %int-1_6473, %int6144_6474, %int9216_6475, %int1_6476 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_6477 = torch.constant.int -1
    %int9216_6478 = torch.constant.int 9216
    %int12288_6479 = torch.constant.int 12288
    %int1_6480 = torch.constant.int 1
    %4632 = torch.aten.slice.Tensor %4628, %int-1_6477, %int9216_6478, %int12288_6479, %int1_6480 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_6481 = torch.constant.int -1
    %int12288_6482 = torch.constant.int 12288
    %int15360_6483 = torch.constant.int 15360
    %int1_6484 = torch.constant.int 1
    %4633 = torch.aten.slice.Tensor %4628, %int-1_6481, %int12288_6482, %int15360_6483, %int1_6484 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_6485 = torch.constant.int -1
    %int15360_6486 = torch.constant.int 15360
    %int18432_6487 = torch.constant.int 18432
    %int1_6488 = torch.constant.int 1
    %4634 = torch.aten.slice.Tensor %4628, %int-1_6485, %int15360_6486, %int18432_6487, %int1_6488 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize the image stream %4532 over its last dimension with no learned
    // affine parameters, then apply the image modulation computed above.
    // NOTE(review): %4532 is presumably the image residual stream leaving
    // double block 12 — it is defined outside this view; confirm.
    %int6_6489 = torch.constant.int 6
    %4635 = torch.prims.convert_element_type %4532, %int6_6489 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // Variance (result0) and mean (result1) over dim 2 in f32, with
    // correction = 0 and keepdim = true, giving [1,4096,1] statistics.
    %int2_6490 = torch.constant.int 2
    %4636 = torch.prim.ListConstruct %int2_6490 : (!torch.int) -> !torch.list<int>
    %int0_6491 = torch.constant.int 0
    %true_6492 = torch.constant.bool true
    %result0_6493, %result1_6494 = torch.aten.var_mean.correction %4635, %4636, %int0_6491, %true_6492 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(var + eps), with eps ~= 1e-6.
    %float9.999990e-07_6495 = torch.constant.float 9.9999999999999995E-7
    %int1_6496 = torch.constant.int 1
    %4637 = torch.aten.add.Scalar %result0_6493, %float9.999990e-07_6495, %int1_6496 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %4638 = torch.aten.rsqrt %4637 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // (x - mean) * rsqrt(var + eps). The subtraction takes the original f16
    // tensor %4532 (not the f32 copy %4635) as its left operand; the result
    // type is f32.
    %int1_6497 = torch.constant.int 1
    %4639 = torch.aten.sub.Tensor %4532, %result1_6494, %int1_6497 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %4640 = torch.aten.mul.Tensor %4639, %4638 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    %int5_6498 = torch.constant.int 5
    %4641 = torch.prims.convert_element_type %4640, %int5_6498 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Modulation in f16: (1 + %4609) * normalized + %4608, with the
    // [1,1,3072] modulation tensors broadcast over the 4096 positions.
    %int1_6499 = torch.constant.int 1
    %int1_6500 = torch.constant.int 1
    %4642 = torch.aten.add.Scalar %4609, %int1_6499, %int1_6500 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %4643 = torch.aten.mul.Tensor %4642, %4641 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_6501 = torch.constant.int 1
    %4644 = torch.aten.add.Tensor %4643, %4608, %int1_6501 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Flatten to [4096,3072] so the QKV projection can use a 2-D aten.mm.
    %int4096_6502 = torch.constant.int 4096
    %int3072_6503 = torch.constant.int 3072
    %4645 = torch.prim.ListConstruct %int4096_6502, %int3072_6503 : (!torch.int, !torch.int) -> !torch.list<int>
    %4646 = torch.aten.view %4644, %4645 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Linear layer double_blocks.13.img_attn.qkv:
    // [4096,3072] x [3072,9216] + bias, computed in f32 and cast back to f16.
    %__auto.sampler.double_blocks.13.img_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.13.img_attn.qkv.weight : tensor<9216x3072xf16>
    %4647 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.img_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    // Weight is stored [9216,3072]; transpose to [3072,9216] for x @ W^T.
    %int0_6504 = torch.constant.int 0
    %int1_6505 = torch.constant.int 1
    %4648 = torch.aten.transpose.int %4647, %int0_6504, %int1_6505 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.13.img_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.13.img_attn.qkv.bias : tensor<9216xf16>
    %4649 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.img_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Promote bias, input and weight to f32 for the matmul and bias add.
    %int6_6506 = torch.constant.int 6
    %4650 = torch.prims.convert_element_type %4649, %int6_6506 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_6507 = torch.constant.int 6
    %4651 = torch.prims.convert_element_type %4646, %int6_6507 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_6508 = torch.constant.int 6
    %4652 = torch.prims.convert_element_type %4648, %int6_6508 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %4653 = torch.aten.mm %4651, %4652 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[4096,9216],f32>
    // Scaling both operands by the constant 1 is an identity; then add the bias.
    %int1_6509 = torch.constant.int 1
    %4654 = torch.aten.mul.Scalar %4653, %int1_6509 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int1_6510 = torch.constant.int 1
    %4655 = torch.aten.mul.Scalar %4650, %int1_6510 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_6511 = torch.constant.int 1
    %4656 = torch.aten.add.Tensor %4654, %4655, %int1_6511 : !torch.vtensor<[4096,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int5_6512 = torch.constant.int 5
    %4657 = torch.prims.convert_element_type %4656, %int5_6512 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f16>
    // Restore the batch dimension: [4096,9216] -> [1,4096,9216].
    %int1_6513 = torch.constant.int 1
    %int4096_6514 = torch.constant.int 4096
    %int9216_6515 = torch.constant.int 9216
    %4658 = torch.prim.ListConstruct %int1_6513, %int4096_6514, %int9216_6515 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4659 = torch.aten.view %4657, %4658 : !torch.vtensor<[1,4096,9216],f16>
    // Debug trace point "img_qkv": leave the torch type system, cast to a
    // fully dynamic shape, pass the tensor and its three dims to
    // flow.tensor.trace, then cast back to the static shape and re-enter torch.
    %4660 = torch_c.to_builtin_tensor %4659 : !torch.vtensor<[1,4096,9216],f16> -> tensor<1x4096x9216xf16>
    %cast_6516 = tensor.cast %4660 : tensor<1x4096x9216xf16> to tensor<?x?x?xf16>
    %c0_6517 = arith.constant 0 : index
    %dim_6518 = tensor.dim %cast_6516, %c0_6517 : tensor<?x?x?xf16>
    %c1_6519 = arith.constant 1 : index
    %dim_6520 = tensor.dim %cast_6516, %c1_6519 : tensor<?x?x?xf16>
    %c2_6521 = arith.constant 2 : index
    %dim_6522 = tensor.dim %cast_6516, %c2_6521 : tensor<?x?x?xf16>
    flow.tensor.trace "img_qkv" = [%cast_6516 : tensor<?x?x?xf16>{%dim_6518, %dim_6520, %dim_6522}]
    %cast_6523 = tensor.cast %cast_6516 : tensor<?x?x?xf16> to tensor<1x4096x9216xf16>
    %4661 = torch_c.from_builtin_tensor %cast_6523 : tensor<1x4096x9216xf16> -> !torch.vtensor<[1,4096,9216],f16>
    // Split the fused image QKV tensor: view [1,4096,9216] as
    // [1,4096,3,24,128], permute with order (2,0,3,1,4) to [3,1,24,4096,128],
    // and take index 0 of the leading dimension. That slice is normalized and
    // scaled by the query_norm.scale parameter below.
    // NOTE(review): 24 and 128 are presumably head count and head dim — confirm.
    %int1_6524 = torch.constant.int 1
    %int4096_6525 = torch.constant.int 4096
    %int3_6526 = torch.constant.int 3
    %int24_6527 = torch.constant.int 24
    %int128_6528 = torch.constant.int 128
    %4662 = torch.prim.ListConstruct %int1_6524, %int4096_6525, %int3_6526, %int24_6527, %int128_6528 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4663 = torch.aten.view %4661, %4662 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_6529 = torch.constant.int 2
    %int0_6530 = torch.constant.int 0
    %int3_6531 = torch.constant.int 3
    %int1_6532 = torch.constant.int 1
    %int4_6533 = torch.constant.int 4
    %4664 = torch.prim.ListConstruct %int2_6529, %int0_6530, %int3_6531, %int1_6532, %int4_6533 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4665 = torch.aten.permute %4663, %4664 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    %int0_6534 = torch.constant.int 0
    %int0_6535 = torch.constant.int 0
    %4666 = torch.aten.select.int %4665, %int0_6534, %int0_6535 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // RMS normalization over the last dimension, computed in f32:
    // x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps), with eps ~= 1e-6.
    %int6_6536 = torch.constant.int 6
    %4667 = torch.prims.convert_element_type %4666, %int6_6536 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_6537 = torch.constant.int 2
    %4668 = torch.aten.pow.Tensor_Scalar %4667, %int2_6537 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_6538 = torch.constant.int -1
    %4669 = torch.prim.ListConstruct %int-1_6538 : (!torch.int) -> !torch.list<int>
    %true_6539 = torch.constant.bool true
    %none_6540 = torch.constant.none
    %4670 = torch.aten.mean.dim %4668, %4669, %true_6539, %none_6540 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_6541 = torch.constant.float 9.9999999999999995E-7
    %int1_6542 = torch.constant.int 1
    %4671 = torch.aten.add.Scalar %4670, %float9.999990e-07_6541, %int1_6542 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %4672 = torch.aten.rsqrt %4671 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %4673 = torch.aten.mul.Tensor %4667, %4672 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Cast back to f16, then multiply by the learned [128] scale (broadcast
    // over the leading dimensions).
    %int5_6543 = torch.constant.int 5
    %4674 = torch.prims.convert_element_type %4673, %int5_6543 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.13.img_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.13.img_attn.norm.query_norm.scale : tensor<128xf16>
    %4675 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.img_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %4676 = torch.aten.mul.Tensor %4674, %4675 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // Repeat the same view + permute of the fused image QKV tensor %4661 and
    // take index 1 of the leading dimension. That slice is normalized and
    // scaled by the key_norm.scale parameter below.
    %int1_6544 = torch.constant.int 1
    %int4096_6545 = torch.constant.int 4096
    %int3_6546 = torch.constant.int 3
    %int24_6547 = torch.constant.int 24
    %int128_6548 = torch.constant.int 128
    %4677 = torch.prim.ListConstruct %int1_6544, %int4096_6545, %int3_6546, %int24_6547, %int128_6548 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4678 = torch.aten.view %4661, %4677 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_6549 = torch.constant.int 2
    %int0_6550 = torch.constant.int 0
    %int3_6551 = torch.constant.int 3
    %int1_6552 = torch.constant.int 1
    %int4_6553 = torch.constant.int 4
    %4679 = torch.prim.ListConstruct %int2_6549, %int0_6550, %int3_6551, %int1_6552, %int4_6553 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4680 = torch.aten.permute %4678, %4679 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    %int0_6554 = torch.constant.int 0
    %int1_6555 = torch.constant.int 1
    %4681 = torch.aten.select.int %4680, %int0_6554, %int1_6555 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // RMS normalization over the last dimension, computed in f32:
    // x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps), with eps ~= 1e-6.
    %int6_6556 = torch.constant.int 6
    %4682 = torch.prims.convert_element_type %4681, %int6_6556 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_6557 = torch.constant.int 2
    %4683 = torch.aten.pow.Tensor_Scalar %4682, %int2_6557 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_6558 = torch.constant.int -1
    %4684 = torch.prim.ListConstruct %int-1_6558 : (!torch.int) -> !torch.list<int>
    %true_6559 = torch.constant.bool true
    %none_6560 = torch.constant.none
    %4685 = torch.aten.mean.dim %4683, %4684, %true_6559, %none_6560 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_6561 = torch.constant.float 9.9999999999999995E-7
    %int1_6562 = torch.constant.int 1
    %4686 = torch.aten.add.Scalar %4685, %float9.999990e-07_6561, %int1_6562 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %4687 = torch.aten.rsqrt %4686 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %4688 = torch.aten.mul.Tensor %4682, %4687 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Cast back to f16, then multiply by the learned [128] scale (broadcast
    // over the leading dimensions).
    %int5_6563 = torch.constant.int 5
    %4689 = torch.prims.convert_element_type %4688, %int5_6563 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.13.img_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.13.img_attn.norm.key_norm.scale : tensor<128xf16>
    %4690 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.img_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %4691 = torch.aten.mul.Tensor %4689, %4690 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // f16 -> f16 casts of the two normalized tensors; per the operand and
    // result types these do not change the dtype.
    %int5_6564 = torch.constant.int 5
    %4692 = torch.prims.convert_element_type %4676, %int5_6564 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_6565 = torch.constant.int 5
    %4693 = torch.prims.convert_element_type %4691, %int5_6565 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Normalize the text stream %4592 (the block-12 txt_mlp residual output
    // computed earlier in this chunk) over its last dimension with no learned
    // affine parameters, then apply the text modulation computed above.
    %int6_6566 = torch.constant.int 6
    %4694 = torch.prims.convert_element_type %4592, %int6_6566 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // Variance (result0) and mean (result1) over dim 2 in f32, with
    // correction = 0 and keepdim = true, giving [1,512,1] statistics.
    %int2_6567 = torch.constant.int 2
    %4695 = torch.prim.ListConstruct %int2_6567 : (!torch.int) -> !torch.list<int>
    %int0_6568 = torch.constant.int 0
    %true_6569 = torch.constant.bool true
    %result0_6570, %result1_6571 = torch.aten.var_mean.correction %4694, %4695, %int0_6568, %true_6569 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(var + eps), with eps ~= 1e-6.
    %float9.999990e-07_6572 = torch.constant.float 9.9999999999999995E-7
    %int1_6573 = torch.constant.int 1
    %4696 = torch.aten.add.Scalar %result0_6570, %float9.999990e-07_6572, %int1_6573 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %4697 = torch.aten.rsqrt %4696 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // (x - mean) * rsqrt(var + eps). The subtraction takes the original f16
    // tensor %4592 (not the f32 copy %4694) as its left operand; the result
    // type is f32.
    %int1_6574 = torch.constant.int 1
    %4698 = torch.aten.sub.Tensor %4592, %result1_6571, %int1_6574 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %4699 = torch.aten.mul.Tensor %4698, %4697 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_6575 = torch.constant.int 5
    %4700 = torch.prims.convert_element_type %4699, %int5_6575 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Modulation in f16: (1 + %4630) * normalized + %4629, with the
    // [1,1,3072] modulation tensors broadcast over the 512 positions.
    %int1_6576 = torch.constant.int 1
    %int1_6577 = torch.constant.int 1
    %4701 = torch.aten.add.Scalar %4630, %int1_6576, %int1_6577 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %4702 = torch.aten.mul.Tensor %4701, %4700 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_6578 = torch.constant.int 1
    %4703 = torch.aten.add.Tensor %4702, %4629, %int1_6578 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Flatten to [512,3072] so the QKV projection can use a 2-D aten.mm.
    %int512_6579 = torch.constant.int 512
    %int3072_6580 = torch.constant.int 3072
    %4704 = torch.prim.ListConstruct %int512_6579, %int3072_6580 : (!torch.int, !torch.int) -> !torch.list<int>
    %4705 = torch.aten.view %4703, %4704 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Linear layer double_blocks.13.txt_attn.qkv:
    // [512,3072] x [3072,9216] + bias, computed in f32 and cast back to f16.
    %__auto.sampler.double_blocks.13.txt_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.13.txt_attn.qkv.weight : tensor<9216x3072xf16>
    %4706 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.txt_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    // Weight is stored [9216,3072]; transpose to [3072,9216] for x @ W^T.
    %int0_6581 = torch.constant.int 0
    %int1_6582 = torch.constant.int 1
    %4707 = torch.aten.transpose.int %4706, %int0_6581, %int1_6582 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.13.txt_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.13.txt_attn.qkv.bias : tensor<9216xf16>
    %4708 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.txt_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Promote bias, input and weight to f32 for the matmul and bias add.
    %int6_6583 = torch.constant.int 6
    %4709 = torch.prims.convert_element_type %4708, %int6_6583 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_6584 = torch.constant.int 6
    %4710 = torch.prims.convert_element_type %4705, %int6_6584 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_6585 = torch.constant.int 6
    %4711 = torch.prims.convert_element_type %4707, %int6_6585 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %4712 = torch.aten.mm %4710, %4711 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[512,9216],f32>
    // Scaling both operands by the constant 1 is an identity; then add the bias.
    %int1_6586 = torch.constant.int 1
    %4713 = torch.aten.mul.Scalar %4712, %int1_6586 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int1_6587 = torch.constant.int 1
    %4714 = torch.aten.mul.Scalar %4709, %int1_6587 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_6588 = torch.constant.int 1
    %4715 = torch.aten.add.Tensor %4713, %4714, %int1_6588 : !torch.vtensor<[512,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int5_6589 = torch.constant.int 5
    %4716 = torch.prims.convert_element_type %4715, %int5_6589 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f16>
    // Restore the batch dimension: [512,9216] -> [1,512,9216].
    %int1_6590 = torch.constant.int 1
    %int512_6591 = torch.constant.int 512
    %int9216_6592 = torch.constant.int 9216
    %4717 = torch.prim.ListConstruct %int1_6590, %int512_6591, %int9216_6592 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4718 = torch.aten.view %4716, %4717 : !torch.vtensor<[512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,9216],f16>
    // Debug trace point "txt_qkv": leave the torch type system, cast to a
    // fully dynamic shape, pass the tensor and its three dims to
    // flow.tensor.trace, then cast back to the static shape and re-enter torch.
    %4719 = torch_c.to_builtin_tensor %4718 : !torch.vtensor<[1,512,9216],f16> -> tensor<1x512x9216xf16>
    %cast_6593 = tensor.cast %4719 : tensor<1x512x9216xf16> to tensor<?x?x?xf16>
    %c0_6594 = arith.constant 0 : index
    %dim_6595 = tensor.dim %cast_6593, %c0_6594 : tensor<?x?x?xf16>
    %c1_6596 = arith.constant 1 : index
    %dim_6597 = tensor.dim %cast_6593, %c1_6596 : tensor<?x?x?xf16>
    %c2_6598 = arith.constant 2 : index
    %dim_6599 = tensor.dim %cast_6593, %c2_6598 : tensor<?x?x?xf16>
    flow.tensor.trace "txt_qkv" = [%cast_6593 : tensor<?x?x?xf16>{%dim_6595, %dim_6597, %dim_6599}]
    %cast_6600 = tensor.cast %cast_6593 : tensor<?x?x?xf16> to tensor<1x512x9216xf16>
    %4720 = torch_c.from_builtin_tensor %cast_6600 : tensor<1x512x9216xf16> -> !torch.vtensor<[1,512,9216],f16>
    // Query slice of the traced qkv tensor, followed by RMS normalization.
    // Step 1: view %4720 as [1,512,3,24,128], permute with (2,0,3,1,4) to
    //         [3,1,24,512,128], and select index 0 along dim 0.
    %int1_6601 = torch.constant.int 1
    %int512_6602 = torch.constant.int 512
    %int3_6603 = torch.constant.int 3
    %int24_6604 = torch.constant.int 24
    %int128_6605 = torch.constant.int 128
    %4721 = torch.prim.ListConstruct %int1_6601, %int512_6602, %int3_6603, %int24_6604, %int128_6605 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4722 = torch.aten.view %4720, %4721 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_6606 = torch.constant.int 2
    %int0_6607 = torch.constant.int 0
    %int3_6608 = torch.constant.int 3
    %int1_6609 = torch.constant.int 1
    %int4_6610 = torch.constant.int 4
    %4723 = torch.prim.ListConstruct %int2_6606, %int0_6607, %int3_6608, %int1_6609, %int4_6610 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4724 = torch.aten.permute %4722, %4723 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    %int0_6611 = torch.constant.int 0
    %int0_6612 = torch.constant.int 0
    %4725 = torch.aten.select.int %4724, %int0_6611, %int0_6612 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Step 2: x * rsqrt(mean(x^2, dim=-1, keepdim=true) + 1e-6), computed in
    //         f32 (dtype code 6) and cast back to f16 (dtype code 5).
    %int6_6613 = torch.constant.int 6
    %4726 = torch.prims.convert_element_type %4725, %int6_6613 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_6614 = torch.constant.int 2
    %4727 = torch.aten.pow.Tensor_Scalar %4726, %int2_6614 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_6615 = torch.constant.int -1
    %4728 = torch.prim.ListConstruct %int-1_6615 : (!torch.int) -> !torch.list<int>
    %true_6616 = torch.constant.bool true
    %none_6617 = torch.constant.none
    %4729 = torch.aten.mean.dim %4727, %4728, %true_6616, %none_6617 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_6618 = torch.constant.float 9.9999999999999995E-7
    %int1_6619 = torch.constant.int 1
    %4730 = torch.aten.add.Scalar %4729, %float9.999990e-07_6618, %int1_6619 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %4731 = torch.aten.rsqrt %4730 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %4732 = torch.aten.mul.Tensor %4726, %4731 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_6620 = torch.constant.int 5
    %4733 = torch.prims.convert_element_type %4732, %int5_6620 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Step 3: scale by the learned 128-element query_norm.scale parameter
    //         (broadcast over the leading dims).
    %__auto.sampler.double_blocks.13.txt_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.13.txt_attn.norm.query_norm.scale : tensor<128xf16>
    %4734 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.txt_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %4735 = torch.aten.mul.Tensor %4733, %4734 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // Key slice of the traced qkv tensor, followed by RMS normalization.
    // Same view/permute as the query path, but index 1 is selected along
    // dim 0 and the result is scaled by key_norm.scale.
    %int1_6621 = torch.constant.int 1
    %int512_6622 = torch.constant.int 512
    %int3_6623 = torch.constant.int 3
    %int24_6624 = torch.constant.int 24
    %int128_6625 = torch.constant.int 128
    %4736 = torch.prim.ListConstruct %int1_6621, %int512_6622, %int3_6623, %int24_6624, %int128_6625 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4737 = torch.aten.view %4720, %4736 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_6626 = torch.constant.int 2
    %int0_6627 = torch.constant.int 0
    %int3_6628 = torch.constant.int 3
    %int1_6629 = torch.constant.int 1
    %int4_6630 = torch.constant.int 4
    %4738 = torch.prim.ListConstruct %int2_6626, %int0_6627, %int3_6628, %int1_6629, %int4_6630 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4739 = torch.aten.permute %4737, %4738 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    %int0_6631 = torch.constant.int 0
    %int1_6632 = torch.constant.int 1
    %4740 = torch.aten.select.int %4739, %int0_6631, %int1_6632 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // x * rsqrt(mean(x^2, dim=-1, keepdim=true) + 1e-6), computed in f32 and
    // cast back to f16.
    %int6_6633 = torch.constant.int 6
    %4741 = torch.prims.convert_element_type %4740, %int6_6633 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_6634 = torch.constant.int 2
    %4742 = torch.aten.pow.Tensor_Scalar %4741, %int2_6634 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_6635 = torch.constant.int -1
    %4743 = torch.prim.ListConstruct %int-1_6635 : (!torch.int) -> !torch.list<int>
    %true_6636 = torch.constant.bool true
    %none_6637 = torch.constant.none
    %4744 = torch.aten.mean.dim %4742, %4743, %true_6636, %none_6637 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_6638 = torch.constant.float 9.9999999999999995E-7
    %int1_6639 = torch.constant.int 1
    %4745 = torch.aten.add.Scalar %4744, %float9.999990e-07_6638, %int1_6639 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %4746 = torch.aten.rsqrt %4745 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %4747 = torch.aten.mul.Tensor %4741, %4746 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_6640 = torch.constant.int 5
    %4748 = torch.prims.convert_element_type %4747, %int5_6640 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Scale by the learned 128-element key_norm.scale parameter.
    %__auto.sampler.double_blocks.13.txt_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.13.txt_attn.norm.key_norm.scale : tensor<128xf16>
    %4749 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.txt_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %4750 = torch.aten.mul.Tensor %4748, %4749 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // Build the joint query and key tensors by concatenating along dim 2:
    // the 512-row normalized txt tensors first, then the 4096-row tensors
    // %4692 / %4693, giving 4608 rows each.
    // NOTE(review): %4692 and %4693 are defined before this chunk; presumably
    // they are the normalized img-stream query and key -- confirm upstream.
    // The two convert_element_type ops below map f16 to f16 (see their type
    // signatures), so they do not change the element type.
    %int5_6641 = torch.constant.int 5
    %4751 = torch.prims.convert_element_type %4735, %int5_6641 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int5_6642 = torch.constant.int 5
    %4752 = torch.prims.convert_element_type %4750, %int5_6642 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %4753 = torch.prim.ListConstruct %4751, %4692 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_6643 = torch.constant.int 2
    %4754 = torch.aten.cat %4753, %int2_6643 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %4755 = torch.prim.ListConstruct %4752, %4693 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_6644 = torch.constant.int 2
    %4756 = torch.aten.cat %4755, %int2_6644 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Build the joint value tensor.
    // Index 2 along the leading (size-3) dim is selected from two sources
    // after the same view + permute(2,0,3,1,4) used for query/key:
    //   - %4720 (the tensor traced as "txt_qkv")  -> [1,24,512,128]
    //   - %4661 ([1,4096,9216], defined before this chunk; presumably the
    //     img-stream qkv projection -- confirm upstream) -> [1,24,4096,128]
    // Unlike the query/key paths, no normalization is applied here.
    // The two slices are concatenated along dim 2 (txt first) into %4768.
    %int1_6645 = torch.constant.int 1
    %int512_6646 = torch.constant.int 512
    %int3_6647 = torch.constant.int 3
    %int24_6648 = torch.constant.int 24
    %int128_6649 = torch.constant.int 128
    %4757 = torch.prim.ListConstruct %int1_6645, %int512_6646, %int3_6647, %int24_6648, %int128_6649 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4758 = torch.aten.view %4720, %4757 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_6650 = torch.constant.int 2
    %int0_6651 = torch.constant.int 0
    %int3_6652 = torch.constant.int 3
    %int1_6653 = torch.constant.int 1
    %int4_6654 = torch.constant.int 4
    %4759 = torch.prim.ListConstruct %int2_6650, %int0_6651, %int3_6652, %int1_6653, %int4_6654 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4760 = torch.aten.permute %4758, %4759 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    %int0_6655 = torch.constant.int 0
    %int2_6656 = torch.constant.int 2
    %4761 = torch.aten.select.int %4760, %int0_6655, %int2_6656 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int1_6657 = torch.constant.int 1
    %int4096_6658 = torch.constant.int 4096
    %int3_6659 = torch.constant.int 3
    %int24_6660 = torch.constant.int 24
    %int128_6661 = torch.constant.int 128
    %4762 = torch.prim.ListConstruct %int1_6657, %int4096_6658, %int3_6659, %int24_6660, %int128_6661 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4763 = torch.aten.view %4661, %4762 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_6662 = torch.constant.int 2
    %int0_6663 = torch.constant.int 0
    %int3_6664 = torch.constant.int 3
    %int1_6665 = torch.constant.int 1
    %int4_6666 = torch.constant.int 4
    %4764 = torch.prim.ListConstruct %int2_6662, %int0_6663, %int3_6664, %int1_6665, %int4_6666 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4765 = torch.aten.permute %4763, %4764 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    %int0_6667 = torch.constant.int 0
    %int2_6668 = torch.constant.int 2
    %4766 = torch.aten.select.int %4765, %int0_6667, %int2_6668 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %4767 = torch.prim.ListConstruct %4761, %4766 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_6669 = torch.constant.int 2
    %4768 = torch.aten.cat %4767, %int2_6669 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Debug traces of the concatenated [1,24,4608,128] tensors, labelled
    // "q" (%4754), "k" (%4756) and "v" (%4768).
    // Each trace follows the same pattern: unwrap to a builtin tensor, cast
    // to a fully dynamic shape, query the four dims, emit flow.tensor.trace,
    // cast back to the static shape and re-wrap. The re-wrapped results
    // %4770 / %4772 / %4774 carry the same data as their inputs.

    // Trace "q".
    %4769 = torch_c.to_builtin_tensor %4754 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_6670 = tensor.cast %4769 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_6671 = arith.constant 0 : index
    %dim_6672 = tensor.dim %cast_6670, %c0_6671 : tensor<?x?x?x?xf16>
    %c1_6673 = arith.constant 1 : index
    %dim_6674 = tensor.dim %cast_6670, %c1_6673 : tensor<?x?x?x?xf16>
    %c2_6675 = arith.constant 2 : index
    %dim_6676 = tensor.dim %cast_6670, %c2_6675 : tensor<?x?x?x?xf16>
    %c3_6677 = arith.constant 3 : index
    %dim_6678 = tensor.dim %cast_6670, %c3_6677 : tensor<?x?x?x?xf16>
    flow.tensor.trace "q" = [%cast_6670 : tensor<?x?x?x?xf16>{%dim_6672, %dim_6674, %dim_6676, %dim_6678}]
    %cast_6679 = tensor.cast %cast_6670 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %4770 = torch_c.from_builtin_tensor %cast_6679 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Trace "k".
    %4771 = torch_c.to_builtin_tensor %4756 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_6680 = tensor.cast %4771 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_6681 = arith.constant 0 : index
    %dim_6682 = tensor.dim %cast_6680, %c0_6681 : tensor<?x?x?x?xf16>
    %c1_6683 = arith.constant 1 : index
    %dim_6684 = tensor.dim %cast_6680, %c1_6683 : tensor<?x?x?x?xf16>
    %c2_6685 = arith.constant 2 : index
    %dim_6686 = tensor.dim %cast_6680, %c2_6685 : tensor<?x?x?x?xf16>
    %c3_6687 = arith.constant 3 : index
    %dim_6688 = tensor.dim %cast_6680, %c3_6687 : tensor<?x?x?x?xf16>
    flow.tensor.trace "k" = [%cast_6680 : tensor<?x?x?x?xf16>{%dim_6682, %dim_6684, %dim_6686, %dim_6688}]
    %cast_6689 = tensor.cast %cast_6680 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %4772 = torch_c.from_builtin_tensor %cast_6689 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Trace "v".
    %4773 = torch_c.to_builtin_tensor %4768 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_6690 = tensor.cast %4773 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_6691 = arith.constant 0 : index
    %dim_6692 = tensor.dim %cast_6690, %c0_6691 : tensor<?x?x?x?xf16>
    %c1_6693 = arith.constant 1 : index
    %dim_6694 = tensor.dim %cast_6690, %c1_6693 : tensor<?x?x?x?xf16>
    %c2_6695 = arith.constant 2 : index
    %dim_6696 = tensor.dim %cast_6690, %c2_6695 : tensor<?x?x?x?xf16>
    %c3_6697 = arith.constant 3 : index
    %dim_6698 = tensor.dim %cast_6690, %c3_6697 : tensor<?x?x?x?xf16>
    flow.tensor.trace "v" = [%cast_6690 : tensor<?x?x?x?xf16>{%dim_6692, %dim_6694, %dim_6696, %dim_6698}]
    %cast_6699 = tensor.cast %cast_6690 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %4774 = torch_c.from_builtin_tensor %cast_6699 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Apply a per-position 2x2 linear map from %211 to the traced q (%4770)
    // and k (%4772); v is left untouched.
    // NOTE(review): %211 ([1,1,4608,64,2,2] f32) is defined before this
    // chunk; presumably it holds the rotary position-embedding matrices --
    // confirm upstream.
    //
    // For x in {q, k}: x is cast to f32 and viewed as [1,24,4608,64,1,2],
    // i.e. the 128-wide last dim is split into 64 pairs. Then
    //   out[..., j, r] = pe[..., j, r, 0] * x[..., j, 0]
    //                  + pe[..., j, r, 1] * x[..., j, 1]
    // with pe = %211 broadcast over the 24-sized dim. The result is viewed
    // back to [1,24,4608,128] and cast to f16.

    // Pair-wise f32 views of q (%4777) and k (%4780).
    %int6_6700 = torch.constant.int 6
    %4775 = torch.prims.convert_element_type %4770, %int6_6700 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_6701 = torch.constant.int 1
    %int24_6702 = torch.constant.int 24
    %int4608_6703 = torch.constant.int 4608
    %int-1_6704 = torch.constant.int -1
    %int1_6705 = torch.constant.int 1
    %int2_6706 = torch.constant.int 2
    %4776 = torch.prim.ListConstruct %int1_6701, %int24_6702, %int4608_6703, %int-1_6704, %int1_6705, %int2_6706 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4777 = torch.aten.view %4775, %4776 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    %int6_6707 = torch.constant.int 6
    %4778 = torch.prims.convert_element_type %4772, %int6_6707 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_6708 = torch.constant.int 1
    %int24_6709 = torch.constant.int 24
    %int4608_6710 = torch.constant.int 4608
    %int-1_6711 = torch.constant.int -1
    %int1_6712 = torch.constant.int 1
    %int2_6713 = torch.constant.int 2
    %4779 = torch.prim.ListConstruct %int1_6708, %int24_6709, %int4608_6710, %int-1_6711, %int1_6712, %int2_6713 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4780 = torch.aten.view %4778, %4779 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // q: pe[..., 0] * q[..., 0] + pe[..., 1] * q[..., 1]  -> %4787
    %int5_6714 = torch.constant.int 5
    %int0_6715 = torch.constant.int 0
    %4781 = torch.aten.select.int %211, %int5_6714, %int0_6715 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_6716 = torch.constant.int 5
    %int0_6717 = torch.constant.int 0
    %4782 = torch.aten.select.int %4777, %int5_6716, %int0_6717 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %4783 = torch.aten.mul.Tensor %4781, %4782 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_6718 = torch.constant.int 5
    %int1_6719 = torch.constant.int 1
    %4784 = torch.aten.select.int %211, %int5_6718, %int1_6719 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_6720 = torch.constant.int 5
    %int1_6721 = torch.constant.int 1
    %4785 = torch.aten.select.int %4777, %int5_6720, %int1_6721 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %4786 = torch.aten.mul.Tensor %4784, %4785 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_6722 = torch.constant.int 1
    %4787 = torch.aten.add.Tensor %4783, %4786, %int1_6722 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k: pe[..., 0] * k[..., 0] + pe[..., 1] * k[..., 1]  -> %4794
    %int5_6723 = torch.constant.int 5
    %int0_6724 = torch.constant.int 0
    %4788 = torch.aten.select.int %211, %int5_6723, %int0_6724 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_6725 = torch.constant.int 5
    %int0_6726 = torch.constant.int 0
    %4789 = torch.aten.select.int %4780, %int5_6725, %int0_6726 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %4790 = torch.aten.mul.Tensor %4788, %4789 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_6727 = torch.constant.int 5
    %int1_6728 = torch.constant.int 1
    %4791 = torch.aten.select.int %211, %int5_6727, %int1_6728 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_6729 = torch.constant.int 5
    %int1_6730 = torch.constant.int 1
    %4792 = torch.aten.select.int %4780, %int5_6729, %int1_6730 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %4793 = torch.aten.mul.Tensor %4791, %4792 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_6731 = torch.constant.int 1
    %4794 = torch.aten.add.Tensor %4790, %4793, %int1_6731 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Flatten the (64,2) pairs back to 128 and return to f16:
    // q -> %4797, k -> %4800.
    %int1_6732 = torch.constant.int 1
    %int24_6733 = torch.constant.int 24
    %int4608_6734 = torch.constant.int 4608
    %int128_6735 = torch.constant.int 128
    %4795 = torch.prim.ListConstruct %int1_6732, %int24_6733, %int4608_6734, %int128_6735 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4796 = torch.aten.view %4787, %4795 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_6736 = torch.constant.int 5
    %4797 = torch.prims.convert_element_type %4796, %int5_6736 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_6737 = torch.constant.int 1
    %int24_6738 = torch.constant.int 24
    %int4608_6739 = torch.constant.int 4608
    %int128_6740 = torch.constant.int 128
    %4798 = torch.prim.ListConstruct %int1_6737, %int24_6738, %int4608_6739, %int128_6740 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4799 = torch.aten.view %4794, %4798 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_6741 = torch.constant.int 5
    %4800 = torch.prims.convert_element_type %4799, %int5_6741 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled dot-product attention over the joint 4608-row sequence.
    // Operands: q = %4797, k = %4800, v = %4774 (the traced "v"), followed by
    // a float 0.0, a bool false and two `none` values.
    // NOTE(review): per the PyTorch signature of
    // _scaled_dot_product_flash_attention_for_cpu these trailing operands are
    // presumably dropout_p, is_causal, attn_mask and scale -- confirm against
    // the op schema.
    // Only result #0 is used in this chunk; it is permuted with (0,2,1,3) to
    // [1,4608,24,128] and the last two dims (24 x 128) are flattened into
    // 3072.
    %float0.000000e00_6742 = torch.constant.float 0.000000e+00
    %false_6743 = torch.constant.bool false
    %none_6744 = torch.constant.none
    %none_6745 = torch.constant.none
    %4801:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%4797, %4800, %4774, %float0.000000e00_6742, %false_6743, %none_6744, %none_6745) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    %int0_6746 = torch.constant.int 0
    %int2_6747 = torch.constant.int 2
    %int1_6748 = torch.constant.int 1
    %int3_6749 = torch.constant.int 3
    %4802 = torch.prim.ListConstruct %int0_6746, %int2_6747, %int1_6748, %int3_6749 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4803 = torch.aten.permute %4801#0, %4802 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_6750 = torch.constant.int 1
    %int4608_6751 = torch.constant.int 4608
    %int3072_6752 = torch.constant.int 3072
    %4804 = torch.prim.ListConstruct %int1_6750, %int4608_6751, %int3072_6752 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4805 = torch.aten.view %4803, %4804 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Split the attention output %4805 along dim 1 into the two row ranges
    // that were concatenated earlier (txt rows first, then the rest):
    //   %4807 = rows [0, 512)    -> [1,512,3072]
    //   %4809 = rows [512, end)  -> [1,4096,3072]
    // The preceding dim-0 slices (%4806, %4808) use start 0, end INT64_MAX and
    // step 1, so they keep the whole tensor (result type equals input type).
    // %4807 is not consumed within this chunk.
    %int0_6753 = torch.constant.int 0
    %int0_6754 = torch.constant.int 0
    %int9223372036854775807_6755 = torch.constant.int 9223372036854775807
    %int1_6756 = torch.constant.int 1
    %4806 = torch.aten.slice.Tensor %4805, %int0_6753, %int0_6754, %int9223372036854775807_6755, %int1_6756 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_6757 = torch.constant.int 1
    %int0_6758 = torch.constant.int 0
    %int512_6759 = torch.constant.int 512
    %int1_6760 = torch.constant.int 1
    %4807 = torch.aten.slice.Tensor %4806, %int1_6757, %int0_6758, %int512_6759, %int1_6760 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %int0_6761 = torch.constant.int 0
    %int0_6762 = torch.constant.int 0
    %int9223372036854775807_6763 = torch.constant.int 9223372036854775807
    %int1_6764 = torch.constant.int 1
    %4808 = torch.aten.slice.Tensor %4805, %int0_6761, %int0_6762, %int9223372036854775807_6763, %int1_6764 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_6765 = torch.constant.int 1
    %int512_6766 = torch.constant.int 512
    %int9223372036854775807_6767 = torch.constant.int 9223372036854775807
    %int1_6768 = torch.constant.int 1
    %4809 = torch.aten.slice.Tensor %4808, %int1_6765, %int512_6766, %int9223372036854775807_6767, %int1_6768 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear projection of the 4096-row attention slice with the
    // double_blocks.13 img_attn.proj parameters, followed by a scaled
    // residual add.
    // Linear: flatten %4809 to [4096,3072], compute x @ W^T + b in f32
    // (inputs, transposed weight and bias are all cast f16 -> f32), cast the
    // result to f16 and restore the [1,4096,3072] shape. The two mul.Scalar
    // ops multiply by the integer constant 1.
    %int4096_6769 = torch.constant.int 4096
    %int3072_6770 = torch.constant.int 3072
    %4810 = torch.prim.ListConstruct %int4096_6769, %int3072_6770 : (!torch.int, !torch.int) -> !torch.list<int>
    %4811 = torch.aten.view %4809, %4810 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.13.img_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.13.img_attn.proj.weight : tensor<3072x3072xf16>
    %4812 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.img_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_6771 = torch.constant.int 0
    %int1_6772 = torch.constant.int 1
    %4813 = torch.aten.transpose.int %4812, %int0_6771, %int1_6772 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.13.img_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.13.img_attn.proj.bias : tensor<3072xf16>
    %4814 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.img_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_6773 = torch.constant.int 6
    %4815 = torch.prims.convert_element_type %4814, %int6_6773 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_6774 = torch.constant.int 6
    %4816 = torch.prims.convert_element_type %4811, %int6_6774 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_6775 = torch.constant.int 6
    %4817 = torch.prims.convert_element_type %4813, %int6_6775 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %4818 = torch.aten.mm %4816, %4817 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    %int1_6776 = torch.constant.int 1
    %4819 = torch.aten.mul.Scalar %4818, %int1_6776 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_6777 = torch.constant.int 1
    %4820 = torch.aten.mul.Scalar %4815, %int1_6777 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_6778 = torch.constant.int 1
    %4821 = torch.aten.add.Tensor %4819, %4820, %int1_6778 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_6779 = torch.constant.int 5
    %4822 = torch.prims.convert_element_type %4821, %int5_6779 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_6780 = torch.constant.int 1
    %int4096_6781 = torch.constant.int 4096
    %int3072_6782 = torch.constant.int 3072
    %4823 = torch.prim.ListConstruct %int1_6780, %int4096_6781, %int3072_6782 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4824 = torch.aten.view %4822, %4823 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Residual: %4826 = %4532 + %4610 * projection.
    // NOTE(review): %4532 and %4610 are defined before this chunk; presumably
    // the img residual stream and a [1,1,3072] modulation gate -- confirm
    // upstream.
    %4825 = torch.aten.mul.Tensor %4610, %4824 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_6783 = torch.constant.int 1
    %4826 = torch.aten.add.Tensor %4532, %4825, %int1_6783 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Normalize %4826 over its last dim and apply a scale/shift:
    //   %4836 = (1 + %4612) * normalize(%4826) + %4611
    // where normalize(x) = (x - mean) * rsqrt(var + 1e-6), with mean and
    // variance taken over dim 2 (correction 0, keepdim true) in f32. No
    // learned weight or bias is applied in the normalization itself.
    // NOTE(review): %4611 and %4612 ([1,1,3072] f16) are defined before this
    // chunk; presumably modulation shift and scale -- confirm upstream.
    %int1_6784 = torch.constant.int 1
    %int1_6785 = torch.constant.int 1
    %4827 = torch.aten.add.Scalar %4612, %int1_6784, %int1_6785 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int6_6786 = torch.constant.int 6
    %4828 = torch.prims.convert_element_type %4826, %int6_6786 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %int2_6787 = torch.constant.int 2
    %4829 = torch.prim.ListConstruct %int2_6787 : (!torch.int) -> !torch.list<int>
    %int0_6788 = torch.constant.int 0
    %true_6789 = torch.constant.bool true
    // result0 = variance, result1 = mean (both [1,4096,1] f32).
    %result0_6790, %result1_6791 = torch.aten.var_mean.correction %4828, %4829, %int0_6788, %true_6789 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    %float9.999990e-07_6792 = torch.constant.float 9.9999999999999995E-7
    %int1_6793 = torch.constant.int 1
    %4830 = torch.aten.add.Scalar %result0_6790, %float9.999990e-07_6792, %int1_6793 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %4831 = torch.aten.rsqrt %4830 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    %int1_6794 = torch.constant.int 1
    // The subtraction takes the f16 tensor %4826 (not the f32 copy %4828) and
    // the f32 mean; the result type is f32.
    %4832 = torch.aten.sub.Tensor %4826, %result1_6791, %int1_6794 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %4833 = torch.aten.mul.Tensor %4832, %4831 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    %int5_6795 = torch.constant.int 5
    %4834 = torch.prims.convert_element_type %4833, %int5_6795 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %4835 = torch.aten.mul.Tensor %4827, %4834 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_6796 = torch.constant.int 1
    %4836 = torch.aten.add.Tensor %4835, %4611, %int1_6796 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Two-layer MLP on %4836 using the double_blocks.13 img_mlp parameters,
    // followed by a scaled residual add onto %4826:
    //   h     = gelu(x @ W0^T + b0, approximate="tanh")   (3072 -> 12288)
    //   y     = h @ W2^T + b2                             (12288 -> 3072)
    //   %4869 = %4826 + %4613 * y
    // Each linear layer flattens the input to 2-D, casts input, transposed
    // weight and bias to f32, runs mm + bias add, casts back to f16 and
    // restores the leading batch dim. The mul.Scalar ops multiply by the
    // integer constant 1.
    // NOTE(review): %4613 ([1,1,3072] f16) is defined before this chunk;
    // presumably a modulation gate -- confirm upstream.

    // img_mlp.0: [4096,3072] -> [4096,12288].
    %int4096_6797 = torch.constant.int 4096
    %int3072_6798 = torch.constant.int 3072
    %4837 = torch.prim.ListConstruct %int4096_6797, %int3072_6798 : (!torch.int, !torch.int) -> !torch.list<int>
    %4838 = torch.aten.view %4836, %4837 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.13.img_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.13.img_mlp.0.weight : tensor<12288x3072xf16>
    %4839 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.img_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_6799 = torch.constant.int 0
    %int1_6800 = torch.constant.int 1
    %4840 = torch.aten.transpose.int %4839, %int0_6799, %int1_6800 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.13.img_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.13.img_mlp.0.bias : tensor<12288xf16>
    %4841 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.img_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    %int6_6801 = torch.constant.int 6
    %4842 = torch.prims.convert_element_type %4841, %int6_6801 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_6802 = torch.constant.int 6
    %4843 = torch.prims.convert_element_type %4838, %int6_6802 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_6803 = torch.constant.int 6
    %4844 = torch.prims.convert_element_type %4840, %int6_6803 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %4845 = torch.aten.mm %4843, %4844 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[4096,12288],f32>
    %int1_6804 = torch.constant.int 1
    %4846 = torch.aten.mul.Scalar %4845, %int1_6804 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int1_6805 = torch.constant.int 1
    %4847 = torch.aten.mul.Scalar %4842, %int1_6805 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_6806 = torch.constant.int 1
    %4848 = torch.aten.add.Tensor %4846, %4847, %int1_6806 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int5_6807 = torch.constant.int 5
    %4849 = torch.prims.convert_element_type %4848, %int5_6807 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f16>
    %int1_6808 = torch.constant.int 1
    %int4096_6809 = torch.constant.int 4096
    %int12288_6810 = torch.constant.int 12288
    %4850 = torch.prim.ListConstruct %int1_6808, %int4096_6809, %int12288_6810 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4851 = torch.aten.view %4849, %4850 : !torch.vtensor<[4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,12288],f16>
    // GELU with the "tanh" approximation, evaluated on the f16 tensor.
    %str_6811 = torch.constant.str "tanh"
    %4852 = torch.aten.gelu %4851, %str_6811 : !torch.vtensor<[1,4096,12288],f16>, !torch.str -> !torch.vtensor<[1,4096,12288],f16>
    // img_mlp.2: [4096,12288] -> [4096,3072].
    %int4096_6812 = torch.constant.int 4096
    %int12288_6813 = torch.constant.int 12288
    %4853 = torch.prim.ListConstruct %int4096_6812, %int12288_6813 : (!torch.int, !torch.int) -> !torch.list<int>
    %4854 = torch.aten.view %4852, %4853 : !torch.vtensor<[1,4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[4096,12288],f16>
    %__auto.sampler.double_blocks.13.img_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.13.img_mlp.2.weight : tensor<3072x12288xf16>
    %4855 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.img_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_6814 = torch.constant.int 0
    %int1_6815 = torch.constant.int 1
    %4856 = torch.aten.transpose.int %4855, %int0_6814, %int1_6815 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.13.img_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.13.img_mlp.2.bias : tensor<3072xf16>
    %4857 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.img_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_6816 = torch.constant.int 6
    %4858 = torch.prims.convert_element_type %4857, %int6_6816 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_6817 = torch.constant.int 6
    %4859 = torch.prims.convert_element_type %4854, %int6_6817 : !torch.vtensor<[4096,12288],f16>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int6_6818 = torch.constant.int 6
    %4860 = torch.prims.convert_element_type %4856, %int6_6818 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %4861 = torch.aten.mm %4859, %4860 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    %int1_6819 = torch.constant.int 1
    %4862 = torch.aten.mul.Scalar %4861, %int1_6819 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_6820 = torch.constant.int 1
    %4863 = torch.aten.mul.Scalar %4858, %int1_6820 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_6821 = torch.constant.int 1
    %4864 = torch.aten.add.Tensor %4862, %4863, %int1_6821 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_6822 = torch.constant.int 5
    %4865 = torch.prims.convert_element_type %4864, %int5_6822 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_6823 = torch.constant.int 1
    %int4096_6824 = torch.constant.int 4096
    %int3072_6825 = torch.constant.int 3072
    %4866 = torch.prim.ListConstruct %int1_6823, %int4096_6824, %int3072_6825 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4867 = torch.aten.view %4865, %4866 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Residual add of the MLP output scaled by %4613.
    %4868 = torch.aten.mul.Tensor %4613, %4867 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_6826 = torch.constant.int 1
    %4869 = torch.aten.add.Tensor %4826, %4868, %int1_6826 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer using the double_blocks.13.txt_attn.proj parameters (3072 -> 3072).
    // Input %4807 ([1,512,3072], f16) is defined before this excerpt (presumably
    // the txt part of the attention output -- confirm upstream). It is flattened
    // to [512,3072], then the same pattern as the other linears is applied:
    // transpose weight, upcast to f32, mm, add bias, downcast to f16, view as 3-D.
    %int512_6827 = torch.constant.int 512
    %int3072_6828 = torch.constant.int 3072
    %4870 = torch.prim.ListConstruct %int512_6827, %int3072_6828 : (!torch.int, !torch.int) -> !torch.list<int>
    %4871 = torch.aten.view %4807, %4870 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.13.txt_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.13.txt_attn.proj.weight : tensor<3072x3072xf16>
    %4872 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.txt_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_6829 = torch.constant.int 0
    %int1_6830 = torch.constant.int 1
    %4873 = torch.aten.transpose.int %4872, %int0_6829, %int1_6830 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.13.txt_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.13.txt_attn.proj.bias : tensor<3072xf16>
    %4874 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.txt_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_6831 = torch.constant.int 6
    %4875 = torch.prims.convert_element_type %4874, %int6_6831 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_6832 = torch.constant.int 6
    %4876 = torch.prims.convert_element_type %4871, %int6_6832 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_6833 = torch.constant.int 6
    %4877 = torch.prims.convert_element_type %4873, %int6_6833 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %4878 = torch.aten.mm %4876, %4877 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[512,3072],f32>
    %int1_6834 = torch.constant.int 1
    %4879 = torch.aten.mul.Scalar %4878, %int1_6834 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_6835 = torch.constant.int 1
    %4880 = torch.aten.mul.Scalar %4875, %int1_6835 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_6836 = torch.constant.int 1
    %4881 = torch.aten.add.Tensor %4879, %4880, %int1_6836 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_6837 = torch.constant.int 5
    %4882 = torch.prims.convert_element_type %4881, %int5_6837 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_6838 = torch.constant.int 1
    %int512_6839 = torch.constant.int 512
    %int3072_6840 = torch.constant.int 3072
    %4883 = torch.prim.ListConstruct %int1_6838, %int512_6839, %int3072_6840 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4884 = torch.aten.view %4882, %4883 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Scale by the broadcast [1,1,3072] tensor %4631 and add to %4592; both are
    // defined before this excerpt (presumably the txt attention gate and the txt
    // residual stream -- confirm upstream). %4886 is reused below both as the
    // normalization input and as the addend of the later txt_mlp residual add.
    %4885 = torch.aten.mul.Tensor %4631, %4884 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_6841 = torch.constant.int 1
    %4886 = torch.aten.add.Tensor %4592, %4885, %int1_6841 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Normalize %4886 over its last dimension and apply a scale/shift modulation.
    // %4633 and %4632 ([1,1,3072], f16) are defined before this excerpt
    // (presumably the txt modulation scale and shift -- confirm upstream).
    // Computed here: %4896 = (1 + %4633) * normalize(%4886) + %4632.
    %int1_6842 = torch.constant.int 1
    %int1_6843 = torch.constant.int 1
    %4887 = torch.aten.add.Scalar %4633, %int1_6842, %int1_6843 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Statistics are computed on an f32 copy: variance (correction = 0) and mean
    // over dim 2 with keepdim = true, giving [1,512,1] tensors.
    %int6_6844 = torch.constant.int 6
    %4888 = torch.prims.convert_element_type %4886, %int6_6844 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %int2_6845 = torch.constant.int 2
    %4889 = torch.prim.ListConstruct %int2_6845 : (!torch.int) -> !torch.list<int>
    %int0_6846 = torch.constant.int 0
    %true_6847 = torch.constant.bool true
    %result0_6848, %result1_6849 = torch.aten.var_mean.correction %4888, %4889, %int0_6846, %true_6847 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(var + eps), with eps ~= 1e-6.
    %float9.999990e-07_6850 = torch.constant.float 9.9999999999999995E-7
    %int1_6851 = torch.constant.int 1
    %4890 = torch.aten.add.Scalar %result0_6848, %float9.999990e-07_6850, %int1_6851 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %4891 = torch.aten.rsqrt %4890 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // The subtraction takes the f16 tensor %4886 (not the f32 copy %4888) and the
    // f32 mean; the declared result type is f32.
    %int1_6852 = torch.constant.int 1
    %4892 = torch.aten.sub.Tensor %4886, %result1_6849, %int1_6852 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %4893 = torch.aten.mul.Tensor %4892, %4891 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_6853 = torch.constant.int 5
    %4894 = torch.prims.convert_element_type %4893, %int5_6853 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Modulation in f16: multiply by (1 + %4633), then add %4632.
    %4895 = torch.aten.mul.Tensor %4887, %4894 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_6854 = torch.constant.int 1
    %4896 = torch.aten.add.Tensor %4895, %4632, %int1_6854 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Linear layer using the double_blocks.13.txt_mlp.0 parameters (3072 -> 12288),
    // followed by GELU.
    // %4896 is flattened to [512,3072]; the weight [12288,3072] is transposed to
    // [3072,12288]; operands are upcast to f32 for mm + bias add, then the result
    // is cast back to f16. The mul.Scalar-by-1 ops are identity scalings.
    %int512_6855 = torch.constant.int 512
    %int3072_6856 = torch.constant.int 3072
    %4897 = torch.prim.ListConstruct %int512_6855, %int3072_6856 : (!torch.int, !torch.int) -> !torch.list<int>
    %4898 = torch.aten.view %4896, %4897 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.13.txt_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.13.txt_mlp.0.weight : tensor<12288x3072xf16>
    %4899 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.txt_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_6857 = torch.constant.int 0
    %int1_6858 = torch.constant.int 1
    %4900 = torch.aten.transpose.int %4899, %int0_6857, %int1_6858 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.13.txt_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.13.txt_mlp.0.bias : tensor<12288xf16>
    %4901 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.txt_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    %int6_6859 = torch.constant.int 6
    %4902 = torch.prims.convert_element_type %4901, %int6_6859 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_6860 = torch.constant.int 6
    %4903 = torch.prims.convert_element_type %4898, %int6_6860 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_6861 = torch.constant.int 6
    %4904 = torch.prims.convert_element_type %4900, %int6_6861 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %4905 = torch.aten.mm %4903, %4904 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[512,12288],f32>
    %int1_6862 = torch.constant.int 1
    %4906 = torch.aten.mul.Scalar %4905, %int1_6862 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int1_6863 = torch.constant.int 1
    %4907 = torch.aten.mul.Scalar %4902, %int1_6863 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_6864 = torch.constant.int 1
    %4908 = torch.aten.add.Tensor %4906, %4907, %int1_6864 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int5_6865 = torch.constant.int 5
    %4909 = torch.prims.convert_element_type %4908, %int5_6865 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f16>
    %int1_6866 = torch.constant.int 1
    %int512_6867 = torch.constant.int 512
    %int12288_6868 = torch.constant.int 12288
    %4910 = torch.prim.ListConstruct %int1_6866, %int512_6867, %int12288_6868 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4911 = torch.aten.view %4909, %4910 : !torch.vtensor<[512,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,512,12288],f16>
    // GELU with the "tanh" approximation in f16, then flatten back to 2-D for the
    // next matmul.
    %str_6869 = torch.constant.str "tanh"
    %4912 = torch.aten.gelu %4911, %str_6869 : !torch.vtensor<[1,512,12288],f16>, !torch.str -> !torch.vtensor<[1,512,12288],f16>
    %int512_6870 = torch.constant.int 512
    %int12288_6871 = torch.constant.int 12288
    %4913 = torch.prim.ListConstruct %int512_6870, %int12288_6871 : (!torch.int, !torch.int) -> !torch.list<int>
    %4914 = torch.aten.view %4912, %4913 : !torch.vtensor<[1,512,12288],f16>, !torch.list<int> -> !torch.vtensor<[512,12288],f16>
    // Linear layer using the double_blocks.13.txt_mlp.2 parameters (12288 -> 3072)
    // applied to the GELU output %4914: transpose weight, upcast to f32, mm,
    // add bias, downcast to f16, view as [1,512,3072].
    // The mul.Scalar-by-1 ops are identity scalings.
    %__auto.sampler.double_blocks.13.txt_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.13.txt_mlp.2.weight : tensor<3072x12288xf16>
    %4915 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.txt_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_6872 = torch.constant.int 0
    %int1_6873 = torch.constant.int 1
    %4916 = torch.aten.transpose.int %4915, %int0_6872, %int1_6873 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.13.txt_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.13.txt_mlp.2.bias : tensor<3072xf16>
    %4917 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.txt_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_6874 = torch.constant.int 6
    %4918 = torch.prims.convert_element_type %4917, %int6_6874 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_6875 = torch.constant.int 6
    %4919 = torch.prims.convert_element_type %4914, %int6_6875 : !torch.vtensor<[512,12288],f16>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int6_6876 = torch.constant.int 6
    %4920 = torch.prims.convert_element_type %4916, %int6_6876 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %4921 = torch.aten.mm %4919, %4920 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[512,3072],f32>
    %int1_6877 = torch.constant.int 1
    %4922 = torch.aten.mul.Scalar %4921, %int1_6877 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_6878 = torch.constant.int 1
    %4923 = torch.aten.mul.Scalar %4918, %int1_6878 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_6879 = torch.constant.int 1
    %4924 = torch.aten.add.Tensor %4922, %4923, %int1_6879 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_6880 = torch.constant.int 5
    %4925 = torch.prims.convert_element_type %4924, %int5_6880 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_6881 = torch.constant.int 1
    %int512_6882 = torch.constant.int 512
    %int3072_6883 = torch.constant.int 3072
    %4926 = torch.prim.ListConstruct %int1_6881, %int512_6882, %int3072_6883 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4927 = torch.aten.view %4925, %4926 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Scale by the broadcast [1,1,3072] tensor %4634 (defined before this excerpt;
    // presumably the txt MLP gate -- confirm upstream) and add to %4886, the
    // result of the earlier txt_attn.proj residual add.
    // %4929 is not consumed within this excerpt.
    %4928 = torch.aten.mul.Tensor %4634, %4927 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_6884 = torch.constant.int 1
    %4929 = torch.aten.add.Tensor %4886, %4928, %int1_6884 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Modulation vector for the img stream, using the double_blocks.14.img_mod.lin
    // parameters. SiLU is applied to %119 ([1,3072], f16, defined before this
    // excerpt; presumably the conditioning vector -- confirm upstream), followed
    // by a 3072 -> 18432 linear (f32 mm + bias, cast back to f16). The result is
    // split into six consecutive 3072-wide chunks along the last dimension.
    %4930 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.double_blocks.14.img_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.14.img_mod.lin.weight : tensor<18432x3072xf16>
    %4931 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.img_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_6885 = torch.constant.int 0
    %int1_6886 = torch.constant.int 1
    %4932 = torch.aten.transpose.int %4931, %int0_6885, %int1_6886 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.14.img_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.14.img_mod.lin.bias : tensor<18432xf16>
    %4933 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.img_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    %int6_6887 = torch.constant.int 6
    %4934 = torch.prims.convert_element_type %4933, %int6_6887 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_6888 = torch.constant.int 6
    %4935 = torch.prims.convert_element_type %4930, %int6_6888 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_6889 = torch.constant.int 6
    %4936 = torch.prims.convert_element_type %4932, %int6_6889 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %4937 = torch.aten.mm %4935, %4936 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    %int1_6890 = torch.constant.int 1
    %4938 = torch.aten.mul.Scalar %4937, %int1_6890 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_6891 = torch.constant.int 1
    %4939 = torch.aten.mul.Scalar %4934, %int1_6891 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_6892 = torch.constant.int 1
    %4940 = torch.aten.add.Tensor %4938, %4939, %int1_6892 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_6893 = torch.constant.int 5
    %4941 = torch.prims.convert_element_type %4940, %int5_6893 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): shape is
    // unchanged. Then insert a size-1 dim at position 1 -> [1,1,18432].
    %int0_6894 = torch.constant.int 0
    %int0_6895 = torch.constant.int 0
    %int9223372036854775807_6896 = torch.constant.int 9223372036854775807
    %int1_6897 = torch.constant.int 1
    %4942 = torch.aten.slice.Tensor %4941, %int0_6894, %int0_6895, %int9223372036854775807_6896, %int1_6897 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    %int1_6898 = torch.constant.int 1
    %4943 = torch.aten.unsqueeze %4942, %int1_6898 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Another full-range slice, this time on dim 2; shape is unchanged.
    %int2_6899 = torch.constant.int 2
    %int0_6900 = torch.constant.int 0
    %int9223372036854775807_6901 = torch.constant.int 9223372036854775807
    %int1_6902 = torch.constant.int 1
    %4944 = torch.aten.slice.Tensor %4943, %int2_6899, %int0_6900, %int9223372036854775807_6901, %int1_6902 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Chunk 0, [0, 3072): %4945 is used later in this function as the additive
    // term after normalizing %4869.
    %int-1_6903 = torch.constant.int -1
    %int0_6904 = torch.constant.int 0
    %int3072_6905 = torch.constant.int 3072
    %int1_6906 = torch.constant.int 1
    %4945 = torch.aten.slice.Tensor %4944, %int-1_6903, %int0_6904, %int3072_6905, %int1_6906 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1, [3072, 6144): %4946 is used later as the (1 + x) multiplier on the
    // normalized %4869.
    %int-1_6907 = torch.constant.int -1
    %int3072_6908 = torch.constant.int 3072
    %int6144_6909 = torch.constant.int 6144
    %int1_6910 = torch.constant.int 1
    %4946 = torch.aten.slice.Tensor %4944, %int-1_6907, %int3072_6908, %int6144_6909, %int1_6910 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunks 2..5: %4947..%4950 are not consumed within this excerpt (presumably
    // the attention gate and the second shift/scale/gate triple -- confirm).
    %int-1_6911 = torch.constant.int -1
    %int6144_6912 = torch.constant.int 6144
    %int9216_6913 = torch.constant.int 9216
    %int1_6914 = torch.constant.int 1
    %4947 = torch.aten.slice.Tensor %4944, %int-1_6911, %int6144_6912, %int9216_6913, %int1_6914 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_6915 = torch.constant.int -1
    %int9216_6916 = torch.constant.int 9216
    %int12288_6917 = torch.constant.int 12288
    %int1_6918 = torch.constant.int 1
    %4948 = torch.aten.slice.Tensor %4944, %int-1_6915, %int9216_6916, %int12288_6917, %int1_6918 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_6919 = torch.constant.int -1
    %int12288_6920 = torch.constant.int 12288
    %int15360_6921 = torch.constant.int 15360
    %int1_6922 = torch.constant.int 1
    %4949 = torch.aten.slice.Tensor %4944, %int-1_6919, %int12288_6920, %int15360_6921, %int1_6922 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_6923 = torch.constant.int -1
    %int15360_6924 = torch.constant.int 15360
    %int18432_6925 = torch.constant.int 18432
    %int1_6926 = torch.constant.int 1
    %4950 = torch.aten.slice.Tensor %4944, %int-1_6923, %int15360_6924, %int18432_6925, %int1_6926 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Modulation vector for the txt stream, using the double_blocks.14.txt_mod.lin
    // parameters. Same structure as the img_mod.lin sequence just above:
    // SiLU(%119) -> 3072 -> 18432 linear (f32 mm + bias, cast back to f16) ->
    // [1,1,18432] -> six consecutive 3072-wide chunks %4966..%4971.
    // SiLU of %119 is recomputed here rather than reusing %4930.
    // None of the six chunks is consumed within this excerpt.
    %4951 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.double_blocks.14.txt_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.14.txt_mod.lin.weight : tensor<18432x3072xf16>
    %4952 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.txt_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_6927 = torch.constant.int 0
    %int1_6928 = torch.constant.int 1
    %4953 = torch.aten.transpose.int %4952, %int0_6927, %int1_6928 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.14.txt_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.14.txt_mod.lin.bias : tensor<18432xf16>
    %4954 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.txt_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    %int6_6929 = torch.constant.int 6
    %4955 = torch.prims.convert_element_type %4954, %int6_6929 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_6930 = torch.constant.int 6
    %4956 = torch.prims.convert_element_type %4951, %int6_6930 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_6931 = torch.constant.int 6
    %4957 = torch.prims.convert_element_type %4953, %int6_6931 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %4958 = torch.aten.mm %4956, %4957 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    %int1_6932 = torch.constant.int 1
    %4959 = torch.aten.mul.Scalar %4958, %int1_6932 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_6933 = torch.constant.int 1
    %4960 = torch.aten.mul.Scalar %4955, %int1_6933 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_6934 = torch.constant.int 1
    %4961 = torch.aten.add.Tensor %4959, %4960, %int1_6934 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_6935 = torch.constant.int 5
    %4962 = torch.prims.convert_element_type %4961, %int5_6935 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (shape unchanged), unsqueeze at dim 1, then a
    // full-range slice on dim 2 (shape unchanged) -> [1,1,18432].
    %int0_6936 = torch.constant.int 0
    %int0_6937 = torch.constant.int 0
    %int9223372036854775807_6938 = torch.constant.int 9223372036854775807
    %int1_6939 = torch.constant.int 1
    %4963 = torch.aten.slice.Tensor %4962, %int0_6936, %int0_6937, %int9223372036854775807_6938, %int1_6939 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    %int1_6940 = torch.constant.int 1
    %4964 = torch.aten.unsqueeze %4963, %int1_6940 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    %int2_6941 = torch.constant.int 2
    %int0_6942 = torch.constant.int 0
    %int9223372036854775807_6943 = torch.constant.int 9223372036854775807
    %int1_6944 = torch.constant.int 1
    %4965 = torch.aten.slice.Tensor %4964, %int2_6941, %int0_6942, %int9223372036854775807_6943, %int1_6944 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Six slices along the last dim (-1), step 1:
    // [0,3072), [3072,6144), [6144,9216), [9216,12288), [12288,15360), [15360,18432).
    %int-1_6945 = torch.constant.int -1
    %int0_6946 = torch.constant.int 0
    %int3072_6947 = torch.constant.int 3072
    %int1_6948 = torch.constant.int 1
    %4966 = torch.aten.slice.Tensor %4965, %int-1_6945, %int0_6946, %int3072_6947, %int1_6948 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_6949 = torch.constant.int -1
    %int3072_6950 = torch.constant.int 3072
    %int6144_6951 = torch.constant.int 6144
    %int1_6952 = torch.constant.int 1
    %4967 = torch.aten.slice.Tensor %4965, %int-1_6949, %int3072_6950, %int6144_6951, %int1_6952 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_6953 = torch.constant.int -1
    %int6144_6954 = torch.constant.int 6144
    %int9216_6955 = torch.constant.int 9216
    %int1_6956 = torch.constant.int 1
    %4968 = torch.aten.slice.Tensor %4965, %int-1_6953, %int6144_6954, %int9216_6955, %int1_6956 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_6957 = torch.constant.int -1
    %int9216_6958 = torch.constant.int 9216
    %int12288_6959 = torch.constant.int 12288
    %int1_6960 = torch.constant.int 1
    %4969 = torch.aten.slice.Tensor %4965, %int-1_6957, %int9216_6958, %int12288_6959, %int1_6960 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_6961 = torch.constant.int -1
    %int12288_6962 = torch.constant.int 12288
    %int15360_6963 = torch.constant.int 15360
    %int1_6964 = torch.constant.int 1
    %4970 = torch.aten.slice.Tensor %4965, %int-1_6961, %int12288_6962, %int15360_6963, %int1_6964 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_6965 = torch.constant.int -1
    %int15360_6966 = torch.constant.int 15360
    %int18432_6967 = torch.constant.int 18432
    %int1_6968 = torch.constant.int 1
    %4971 = torch.aten.slice.Tensor %4965, %int-1_6965, %int15360_6966, %int18432_6967, %int1_6968 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize the img stream %4869 over its last dimension and modulate it with
    // the first two img_mod chunks:
    //   %4981 = (1 + %4946) * normalize(%4869) + %4945
    // Statistics use an f32 copy: variance (correction = 0) and mean over dim 2
    // with keepdim = true, giving [1,4096,1] tensors; eps ~= 1e-6.
    %int6_6969 = torch.constant.int 6
    %4972 = torch.prims.convert_element_type %4869, %int6_6969 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %int2_6970 = torch.constant.int 2
    %4973 = torch.prim.ListConstruct %int2_6970 : (!torch.int) -> !torch.list<int>
    %int0_6971 = torch.constant.int 0
    %true_6972 = torch.constant.bool true
    %result0_6973, %result1_6974 = torch.aten.var_mean.correction %4972, %4973, %int0_6971, %true_6972 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    %float9.999990e-07_6975 = torch.constant.float 9.9999999999999995E-7
    %int1_6976 = torch.constant.int 1
    %4974 = torch.aten.add.Scalar %result0_6973, %float9.999990e-07_6975, %int1_6976 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %4975 = torch.aten.rsqrt %4974 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // The subtraction takes the f16 tensor %4869 (not the f32 copy %4972) and the
    // f32 mean; the declared result type is f32.
    %int1_6977 = torch.constant.int 1
    %4976 = torch.aten.sub.Tensor %4869, %result1_6974, %int1_6977 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %4977 = torch.aten.mul.Tensor %4976, %4975 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    %int5_6978 = torch.constant.int 5
    %4978 = torch.prims.convert_element_type %4977, %int5_6978 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Modulation in f16: multiply by (1 + %4946), then add %4945.
    %int1_6979 = torch.constant.int 1
    %int1_6980 = torch.constant.int 1
    %4979 = torch.aten.add.Scalar %4946, %int1_6979, %int1_6980 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %4980 = torch.aten.mul.Tensor %4979, %4978 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_6981 = torch.constant.int 1
    %4981 = torch.aten.add.Tensor %4980, %4945, %int1_6981 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer using the double_blocks.14.img_attn.qkv parameters (3072 -> 9216)
    // applied to the modulated img tensor %4981: flatten to [4096,3072], transpose
    // weight, upcast to f32, mm, add bias, downcast to f16, view as [1,4096,9216].
    // The mul.Scalar-by-1 ops are identity scalings.
    %int4096_6982 = torch.constant.int 4096
    %int3072_6983 = torch.constant.int 3072
    %4982 = torch.prim.ListConstruct %int4096_6982, %int3072_6983 : (!torch.int, !torch.int) -> !torch.list<int>
    %4983 = torch.aten.view %4981, %4982 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.14.img_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.14.img_attn.qkv.weight : tensor<9216x3072xf16>
    %4984 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.img_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_6984 = torch.constant.int 0
    %int1_6985 = torch.constant.int 1
    %4985 = torch.aten.transpose.int %4984, %int0_6984, %int1_6985 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.14.img_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.14.img_attn.qkv.bias : tensor<9216xf16>
    %4986 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.img_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    %int6_6986 = torch.constant.int 6
    %4987 = torch.prims.convert_element_type %4986, %int6_6986 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_6987 = torch.constant.int 6
    %4988 = torch.prims.convert_element_type %4983, %int6_6987 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_6988 = torch.constant.int 6
    %4989 = torch.prims.convert_element_type %4985, %int6_6988 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %4990 = torch.aten.mm %4988, %4989 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[4096,9216],f32>
    %int1_6989 = torch.constant.int 1
    %4991 = torch.aten.mul.Scalar %4990, %int1_6989 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int1_6990 = torch.constant.int 1
    %4992 = torch.aten.mul.Scalar %4987, %int1_6990 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_6991 = torch.constant.int 1
    %4993 = torch.aten.add.Tensor %4991, %4992, %int1_6991 : !torch.vtensor<[4096,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int5_6992 = torch.constant.int 5
    %4994 = torch.prims.convert_element_type %4993, %int5_6992 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f16>
    %int1_6993 = torch.constant.int 1
    %int4096_6994 = torch.constant.int 4096
    %int9216_6995 = torch.constant.int 9216
    %4995 = torch.prim.ListConstruct %int1_6993, %int4096_6994, %int9216_6995 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4996 = torch.aten.view %4994, %4995 : !torch.vtensor<[4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,9216],f16>
    // Trace point "img_qkv". The torch tensor is converted to a builtin tensor and
    // cast to a fully dynamic shape; the three dims are queried to supply the
    // dynamic sizes that flow.tensor.trace takes alongside the tensor. The value
    // is then cast back to the static shape and re-wrapped as a torch tensor, so
    // %4998 carries the same type as %4996.
    %4997 = torch_c.to_builtin_tensor %4996 : !torch.vtensor<[1,4096,9216],f16> -> tensor<1x4096x9216xf16>
    %cast_6996 = tensor.cast %4997 : tensor<1x4096x9216xf16> to tensor<?x?x?xf16>
    %c0_6997 = arith.constant 0 : index
    %dim_6998 = tensor.dim %cast_6996, %c0_6997 : tensor<?x?x?xf16>
    %c1_6999 = arith.constant 1 : index
    %dim_7000 = tensor.dim %cast_6996, %c1_6999 : tensor<?x?x?xf16>
    %c2_7001 = arith.constant 2 : index
    %dim_7002 = tensor.dim %cast_6996, %c2_7001 : tensor<?x?x?xf16>
    flow.tensor.trace "img_qkv" = [%cast_6996 : tensor<?x?x?xf16>{%dim_6998, %dim_7000, %dim_7002}]
    %cast_7003 = tensor.cast %cast_6996 : tensor<?x?x?xf16> to tensor<1x4096x9216xf16>
    %4998 = torch_c.from_builtin_tensor %cast_7003 : tensor<1x4096x9216xf16> -> !torch.vtensor<[1,4096,9216],f16>
    // Split the fused projection: view [1,4096,9216] as [1,4096,3,24,128]
    // (9216 = 3 * 24 * 128), permute with order (2,0,3,1,4) to
    // [3,1,24,4096,128], then select index 0 along dim 0 -> [1,24,4096,128].
    // Presumably index 0 is the query (the query_norm scale is loaded shortly
    // after), with 24 heads of width 128 -- confirm against the model definition.
    %int1_7004 = torch.constant.int 1
    %int4096_7005 = torch.constant.int 4096
    %int3_7006 = torch.constant.int 3
    %int24_7007 = torch.constant.int 24
    %int128_7008 = torch.constant.int 128
    %4999 = torch.prim.ListConstruct %int1_7004, %int4096_7005, %int3_7006, %int24_7007, %int128_7008 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5000 = torch.aten.view %4998, %4999 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_7009 = torch.constant.int 2
    %int0_7010 = torch.constant.int 0
    %int3_7011 = torch.constant.int 3
    %int1_7012 = torch.constant.int 1
    %int4_7013 = torch.constant.int 4
    %5001 = torch.prim.ListConstruct %int2_7009, %int0_7010, %int3_7011, %int1_7012, %int4_7013 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5002 = torch.aten.permute %5000, %5001 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    %int0_7014 = torch.constant.int 0
    %int0_7015 = torch.constant.int 0
    %5003 = torch.aten.select.int %5002, %int0_7014, %int0_7015 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int6_7016 = torch.constant.int 6
    %5004 = torch.prims.convert_element_type %5003, %int6_7016 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_7017 = torch.constant.int 2
    %5005 = torch.aten.pow.Tensor_Scalar %5004, %int2_7017 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_7018 = torch.constant.int -1
    %5006 = torch.prim.ListConstruct %int-1_7018 : (!torch.int) -> !torch.list<int>
    %true_7019 = torch.constant.bool true
    %none_7020 = torch.constant.none
    %5007 = torch.aten.mean.dim %5005, %5006, %true_7019, %none_7020 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_7021 = torch.constant.float 9.9999999999999995E-7
    %int1_7022 = torch.constant.int 1
    %5008 = torch.aten.add.Scalar %5007, %float9.999990e-07_7021, %int1_7022 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %5009 = torch.aten.rsqrt %5008 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %5010 = torch.aten.mul.Tensor %5004, %5009 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    %int5_7023 = torch.constant.int 5
    %5011 = torch.prims.convert_element_type %5010, %int5_7023 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.14.img_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.14.img_attn.norm.query_norm.scale : tensor<128xf16>
    %5012 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.img_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %5013 = torch.aten.mul.Tensor %5011, %5012 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    %int1_7024 = torch.constant.int 1
    %int4096_7025 = torch.constant.int 4096
    %int3_7026 = torch.constant.int 3
    %int24_7027 = torch.constant.int 24
    %int128_7028 = torch.constant.int 128
    %5014 = torch.prim.ListConstruct %int1_7024, %int4096_7025, %int3_7026, %int24_7027, %int128_7028 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5015 = torch.aten.view %4998, %5014 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_7029 = torch.constant.int 2
    %int0_7030 = torch.constant.int 0
    %int3_7031 = torch.constant.int 3
    %int1_7032 = torch.constant.int 1
    %int4_7033 = torch.constant.int 4
    %5016 = torch.prim.ListConstruct %int2_7029, %int0_7030, %int3_7031, %int1_7032, %int4_7033 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5017 = torch.aten.permute %5015, %5016 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    %int0_7034 = torch.constant.int 0
    %int1_7035 = torch.constant.int 1
    %5018 = torch.aten.select.int %5017, %int0_7034, %int1_7035 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int6_7036 = torch.constant.int 6
    %5019 = torch.prims.convert_element_type %5018, %int6_7036 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_7037 = torch.constant.int 2
    %5020 = torch.aten.pow.Tensor_Scalar %5019, %int2_7037 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_7038 = torch.constant.int -1
    %5021 = torch.prim.ListConstruct %int-1_7038 : (!torch.int) -> !torch.list<int>
    %true_7039 = torch.constant.bool true
    %none_7040 = torch.constant.none
    %5022 = torch.aten.mean.dim %5020, %5021, %true_7039, %none_7040 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_7041 = torch.constant.float 9.9999999999999995E-7
    %int1_7042 = torch.constant.int 1
    %5023 = torch.aten.add.Scalar %5022, %float9.999990e-07_7041, %int1_7042 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %5024 = torch.aten.rsqrt %5023 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %5025 = torch.aten.mul.Tensor %5019, %5024 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    %int5_7043 = torch.constant.int 5
    %5026 = torch.prims.convert_element_type %5025, %int5_7043 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.14.img_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.14.img_attn.norm.key_norm.scale : tensor<128xf16>
    %5027 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.img_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %5028 = torch.aten.mul.Tensor %5026, %5027 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_7044 = torch.constant.int 5
    %5029 = torch.prims.convert_element_type %5013, %int5_7044 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_7045 = torch.constant.int 5
    %5030 = torch.prims.convert_element_type %5028, %int5_7045 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int6_7046 = torch.constant.int 6
    %5031 = torch.prims.convert_element_type %4929, %int6_7046 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %int2_7047 = torch.constant.int 2
    %5032 = torch.prim.ListConstruct %int2_7047 : (!torch.int) -> !torch.list<int>
    %int0_7048 = torch.constant.int 0
    %true_7049 = torch.constant.bool true
    %result0_7050, %result1_7051 = torch.aten.var_mean.correction %5031, %5032, %int0_7048, %true_7049 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    %float9.999990e-07_7052 = torch.constant.float 9.9999999999999995E-7
    %int1_7053 = torch.constant.int 1
    %5033 = torch.aten.add.Scalar %result0_7050, %float9.999990e-07_7052, %int1_7053 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %5034 = torch.aten.rsqrt %5033 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    %int1_7054 = torch.constant.int 1
    %5035 = torch.aten.sub.Tensor %4929, %result1_7051, %int1_7054 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %5036 = torch.aten.mul.Tensor %5035, %5034 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_7055 = torch.constant.int 5
    %5037 = torch.prims.convert_element_type %5036, %int5_7055 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %int1_7056 = torch.constant.int 1
    %int1_7057 = torch.constant.int 1
    %5038 = torch.aten.add.Scalar %4967, %int1_7056, %int1_7057 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %5039 = torch.aten.mul.Tensor %5038, %5037 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_7058 = torch.constant.int 1
    %5040 = torch.aten.add.Tensor %5039, %4966, %int1_7058 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %int512_7059 = torch.constant.int 512
    %int3072_7060 = torch.constant.int 3072
    %5041 = torch.prim.ListConstruct %int512_7059, %int3072_7060 : (!torch.int, !torch.int) -> !torch.list<int>
    %5042 = torch.aten.view %5040, %5041 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.14.txt_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.14.txt_attn.qkv.weight : tensor<9216x3072xf16>
    %5043 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.txt_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_7061 = torch.constant.int 0
    %int1_7062 = torch.constant.int 1
    %5044 = torch.aten.transpose.int %5043, %int0_7061, %int1_7062 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.14.txt_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.14.txt_attn.qkv.bias : tensor<9216xf16>
    %5045 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.txt_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    %int6_7063 = torch.constant.int 6
    %5046 = torch.prims.convert_element_type %5045, %int6_7063 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_7064 = torch.constant.int 6
    %5047 = torch.prims.convert_element_type %5042, %int6_7064 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_7065 = torch.constant.int 6
    %5048 = torch.prims.convert_element_type %5044, %int6_7065 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %5049 = torch.aten.mm %5047, %5048 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[512,9216],f32>
    %int1_7066 = torch.constant.int 1
    %5050 = torch.aten.mul.Scalar %5049, %int1_7066 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int1_7067 = torch.constant.int 1
    %5051 = torch.aten.mul.Scalar %5046, %int1_7067 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_7068 = torch.constant.int 1
    %5052 = torch.aten.add.Tensor %5050, %5051, %int1_7068 : !torch.vtensor<[512,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int5_7069 = torch.constant.int 5
    %5053 = torch.prims.convert_element_type %5052, %int5_7069 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f16>
    %int1_7070 = torch.constant.int 1
    %int512_7071 = torch.constant.int 512
    %int9216_7072 = torch.constant.int 9216
    %5054 = torch.prim.ListConstruct %int1_7070, %int512_7071, %int9216_7072 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5055 = torch.aten.view %5053, %5054 : !torch.vtensor<[512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,9216],f16>
    %5056 = torch_c.to_builtin_tensor %5055 : !torch.vtensor<[1,512,9216],f16> -> tensor<1x512x9216xf16>
    %cast_7073 = tensor.cast %5056 : tensor<1x512x9216xf16> to tensor<?x?x?xf16>
    %c0_7074 = arith.constant 0 : index
    %dim_7075 = tensor.dim %cast_7073, %c0_7074 : tensor<?x?x?xf16>
    %c1_7076 = arith.constant 1 : index
    %dim_7077 = tensor.dim %cast_7073, %c1_7076 : tensor<?x?x?xf16>
    %c2_7078 = arith.constant 2 : index
    %dim_7079 = tensor.dim %cast_7073, %c2_7078 : tensor<?x?x?xf16>
    flow.tensor.trace "txt_qkv" = [%cast_7073 : tensor<?x?x?xf16>{%dim_7075, %dim_7077, %dim_7079}]
    %cast_7080 = tensor.cast %cast_7073 : tensor<?x?x?xf16> to tensor<1x512x9216xf16>
    %5057 = torch_c.from_builtin_tensor %cast_7080 : tensor<1x512x9216xf16> -> !torch.vtensor<[1,512,9216],f16>
    %int1_7081 = torch.constant.int 1
    %int512_7082 = torch.constant.int 512
    %int3_7083 = torch.constant.int 3
    %int24_7084 = torch.constant.int 24
    %int128_7085 = torch.constant.int 128
    %5058 = torch.prim.ListConstruct %int1_7081, %int512_7082, %int3_7083, %int24_7084, %int128_7085 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5059 = torch.aten.view %5057, %5058 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_7086 = torch.constant.int 2
    %int0_7087 = torch.constant.int 0
    %int3_7088 = torch.constant.int 3
    %int1_7089 = torch.constant.int 1
    %int4_7090 = torch.constant.int 4
    %5060 = torch.prim.ListConstruct %int2_7086, %int0_7087, %int3_7088, %int1_7089, %int4_7090 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5061 = torch.aten.permute %5059, %5060 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    %int0_7091 = torch.constant.int 0
    %int0_7092 = torch.constant.int 0
    %5062 = torch.aten.select.int %5061, %int0_7091, %int0_7092 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int6_7093 = torch.constant.int 6
    %5063 = torch.prims.convert_element_type %5062, %int6_7093 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_7094 = torch.constant.int 2
    %5064 = torch.aten.pow.Tensor_Scalar %5063, %int2_7094 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_7095 = torch.constant.int -1
    %5065 = torch.prim.ListConstruct %int-1_7095 : (!torch.int) -> !torch.list<int>
    %true_7096 = torch.constant.bool true
    %none_7097 = torch.constant.none
    %5066 = torch.aten.mean.dim %5064, %5065, %true_7096, %none_7097 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_7098 = torch.constant.float 9.9999999999999995E-7
    %int1_7099 = torch.constant.int 1
    %5067 = torch.aten.add.Scalar %5066, %float9.999990e-07_7098, %int1_7099 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %5068 = torch.aten.rsqrt %5067 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %5069 = torch.aten.mul.Tensor %5063, %5068 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_7100 = torch.constant.int 5
    %5070 = torch.prims.convert_element_type %5069, %int5_7100 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.14.txt_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.14.txt_attn.norm.query_norm.scale : tensor<128xf16>
    %5071 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.txt_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %5072 = torch.aten.mul.Tensor %5070, %5071 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    %int1_7101 = torch.constant.int 1
    %int512_7102 = torch.constant.int 512
    %int3_7103 = torch.constant.int 3
    %int24_7104 = torch.constant.int 24
    %int128_7105 = torch.constant.int 128
    %5073 = torch.prim.ListConstruct %int1_7101, %int512_7102, %int3_7103, %int24_7104, %int128_7105 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5074 = torch.aten.view %5057, %5073 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_7106 = torch.constant.int 2
    %int0_7107 = torch.constant.int 0
    %int3_7108 = torch.constant.int 3
    %int1_7109 = torch.constant.int 1
    %int4_7110 = torch.constant.int 4
    %5075 = torch.prim.ListConstruct %int2_7106, %int0_7107, %int3_7108, %int1_7109, %int4_7110 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5076 = torch.aten.permute %5074, %5075 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    %int0_7111 = torch.constant.int 0
    %int1_7112 = torch.constant.int 1
    %5077 = torch.aten.select.int %5076, %int0_7111, %int1_7112 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int6_7113 = torch.constant.int 6
    %5078 = torch.prims.convert_element_type %5077, %int6_7113 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_7114 = torch.constant.int 2
    %5079 = torch.aten.pow.Tensor_Scalar %5078, %int2_7114 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_7115 = torch.constant.int -1
    %5080 = torch.prim.ListConstruct %int-1_7115 : (!torch.int) -> !torch.list<int>
    %true_7116 = torch.constant.bool true
    %none_7117 = torch.constant.none
    %5081 = torch.aten.mean.dim %5079, %5080, %true_7116, %none_7117 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_7118 = torch.constant.float 9.9999999999999995E-7
    %int1_7119 = torch.constant.int 1
    %5082 = torch.aten.add.Scalar %5081, %float9.999990e-07_7118, %int1_7119 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %5083 = torch.aten.rsqrt %5082 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %5084 = torch.aten.mul.Tensor %5078, %5083 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_7120 = torch.constant.int 5
    %5085 = torch.prims.convert_element_type %5084, %int5_7120 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.14.txt_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.14.txt_attn.norm.key_norm.scale : tensor<128xf16>
    %5086 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.txt_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %5087 = torch.aten.mul.Tensor %5085, %5086 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    %int5_7121 = torch.constant.int 5
    %5088 = torch.prims.convert_element_type %5072, %int5_7121 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int5_7122 = torch.constant.int 5
    %5089 = torch.prims.convert_element_type %5087, %int5_7122 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %5090 = torch.prim.ListConstruct %5088, %5029 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_7123 = torch.constant.int 2
    %5091 = torch.aten.cat %5090, %int2_7123 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %5092 = torch.prim.ListConstruct %5089, %5030 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_7124 = torch.constant.int 2
    %5093 = torch.aten.cat %5092, %int2_7124 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_7125 = torch.constant.int 1
    %int512_7126 = torch.constant.int 512
    %int3_7127 = torch.constant.int 3
    %int24_7128 = torch.constant.int 24
    %int128_7129 = torch.constant.int 128
    %5094 = torch.prim.ListConstruct %int1_7125, %int512_7126, %int3_7127, %int24_7128, %int128_7129 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5095 = torch.aten.view %5057, %5094 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_7130 = torch.constant.int 2
    %int0_7131 = torch.constant.int 0
    %int3_7132 = torch.constant.int 3
    %int1_7133 = torch.constant.int 1
    %int4_7134 = torch.constant.int 4
    %5096 = torch.prim.ListConstruct %int2_7130, %int0_7131, %int3_7132, %int1_7133, %int4_7134 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5097 = torch.aten.permute %5095, %5096 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    %int0_7135 = torch.constant.int 0
    %int2_7136 = torch.constant.int 2
    %5098 = torch.aten.select.int %5097, %int0_7135, %int2_7136 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int1_7137 = torch.constant.int 1
    %int4096_7138 = torch.constant.int 4096
    %int3_7139 = torch.constant.int 3
    %int24_7140 = torch.constant.int 24
    %int128_7141 = torch.constant.int 128
    %5099 = torch.prim.ListConstruct %int1_7137, %int4096_7138, %int3_7139, %int24_7140, %int128_7141 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5100 = torch.aten.view %4998, %5099 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_7142 = torch.constant.int 2
    %int0_7143 = torch.constant.int 0
    %int3_7144 = torch.constant.int 3
    %int1_7145 = torch.constant.int 1
    %int4_7146 = torch.constant.int 4
    %5101 = torch.prim.ListConstruct %int2_7142, %int0_7143, %int3_7144, %int1_7145, %int4_7146 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5102 = torch.aten.permute %5100, %5101 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    %int0_7147 = torch.constant.int 0
    %int2_7148 = torch.constant.int 2
    %5103 = torch.aten.select.int %5102, %int0_7147, %int2_7148 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %5104 = torch.prim.ListConstruct %5098, %5103 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_7149 = torch.constant.int 2
    %5105 = torch.aten.cat %5104, %int2_7149 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %5106 = torch_c.to_builtin_tensor %5091 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_7150 = tensor.cast %5106 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_7151 = arith.constant 0 : index
    %dim_7152 = tensor.dim %cast_7150, %c0_7151 : tensor<?x?x?x?xf16>
    %c1_7153 = arith.constant 1 : index
    %dim_7154 = tensor.dim %cast_7150, %c1_7153 : tensor<?x?x?x?xf16>
    %c2_7155 = arith.constant 2 : index
    %dim_7156 = tensor.dim %cast_7150, %c2_7155 : tensor<?x?x?x?xf16>
    %c3_7157 = arith.constant 3 : index
    %dim_7158 = tensor.dim %cast_7150, %c3_7157 : tensor<?x?x?x?xf16>
    flow.tensor.trace "q" = [%cast_7150 : tensor<?x?x?x?xf16>{%dim_7152, %dim_7154, %dim_7156, %dim_7158}]
    %cast_7159 = tensor.cast %cast_7150 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %5107 = torch_c.from_builtin_tensor %cast_7159 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    %5108 = torch_c.to_builtin_tensor %5093 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_7160 = tensor.cast %5108 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_7161 = arith.constant 0 : index
    %dim_7162 = tensor.dim %cast_7160, %c0_7161 : tensor<?x?x?x?xf16>
    %c1_7163 = arith.constant 1 : index
    %dim_7164 = tensor.dim %cast_7160, %c1_7163 : tensor<?x?x?x?xf16>
    %c2_7165 = arith.constant 2 : index
    %dim_7166 = tensor.dim %cast_7160, %c2_7165 : tensor<?x?x?x?xf16>
    %c3_7167 = arith.constant 3 : index
    %dim_7168 = tensor.dim %cast_7160, %c3_7167 : tensor<?x?x?x?xf16>
    flow.tensor.trace "k" = [%cast_7160 : tensor<?x?x?x?xf16>{%dim_7162, %dim_7164, %dim_7166, %dim_7168}]
    %cast_7169 = tensor.cast %cast_7160 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %5109 = torch_c.from_builtin_tensor %cast_7169 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    %5110 = torch_c.to_builtin_tensor %5105 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_7170 = tensor.cast %5110 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_7171 = arith.constant 0 : index
    %dim_7172 = tensor.dim %cast_7170, %c0_7171 : tensor<?x?x?x?xf16>
    %c1_7173 = arith.constant 1 : index
    %dim_7174 = tensor.dim %cast_7170, %c1_7173 : tensor<?x?x?x?xf16>
    %c2_7175 = arith.constant 2 : index
    %dim_7176 = tensor.dim %cast_7170, %c2_7175 : tensor<?x?x?x?xf16>
    %c3_7177 = arith.constant 3 : index
    %dim_7178 = tensor.dim %cast_7170, %c3_7177 : tensor<?x?x?x?xf16>
    flow.tensor.trace "v" = [%cast_7170 : tensor<?x?x?x?xf16>{%dim_7172, %dim_7174, %dim_7176, %dim_7178}]
    %cast_7179 = tensor.cast %cast_7170 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %5111 = torch_c.from_builtin_tensor %cast_7179 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    %int6_7180 = torch.constant.int 6
    %5112 = torch.prims.convert_element_type %5107, %int6_7180 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_7181 = torch.constant.int 1
    %int24_7182 = torch.constant.int 24
    %int4608_7183 = torch.constant.int 4608
    %int-1_7184 = torch.constant.int -1
    %int1_7185 = torch.constant.int 1
    %int2_7186 = torch.constant.int 2
    %5113 = torch.prim.ListConstruct %int1_7181, %int24_7182, %int4608_7183, %int-1_7184, %int1_7185, %int2_7186 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5114 = torch.aten.view %5112, %5113 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    %int6_7187 = torch.constant.int 6
    %5115 = torch.prims.convert_element_type %5109, %int6_7187 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_7188 = torch.constant.int 1
    %int24_7189 = torch.constant.int 24
    %int4608_7190 = torch.constant.int 4608
    %int-1_7191 = torch.constant.int -1
    %int1_7192 = torch.constant.int 1
    %int2_7193 = torch.constant.int 2
    %5116 = torch.prim.ListConstruct %int1_7188, %int24_7189, %int4608_7190, %int-1_7191, %int1_7192, %int2_7193 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5117 = torch.aten.view %5115, %5116 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    %int5_7194 = torch.constant.int 5
    %int0_7195 = torch.constant.int 0
    %5118 = torch.aten.select.int %211, %int5_7194, %int0_7195 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_7196 = torch.constant.int 5
    %int0_7197 = torch.constant.int 0
    %5119 = torch.aten.select.int %5114, %int5_7196, %int0_7197 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %5120 = torch.aten.mul.Tensor %5118, %5119 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_7198 = torch.constant.int 5
    %int1_7199 = torch.constant.int 1
    %5121 = torch.aten.select.int %211, %int5_7198, %int1_7199 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_7200 = torch.constant.int 5
    %int1_7201 = torch.constant.int 1
    %5122 = torch.aten.select.int %5114, %int5_7200, %int1_7201 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %5123 = torch.aten.mul.Tensor %5121, %5122 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_7202 = torch.constant.int 1
    %5124 = torch.aten.add.Tensor %5120, %5123, %int1_7202 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_7203 = torch.constant.int 5
    %int0_7204 = torch.constant.int 0
    %5125 = torch.aten.select.int %211, %int5_7203, %int0_7204 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_7205 = torch.constant.int 5
    %int0_7206 = torch.constant.int 0
    %5126 = torch.aten.select.int %5117, %int5_7205, %int0_7206 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %5127 = torch.aten.mul.Tensor %5125, %5126 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_7207 = torch.constant.int 5
    %int1_7208 = torch.constant.int 1
    %5128 = torch.aten.select.int %211, %int5_7207, %int1_7208 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_7209 = torch.constant.int 5
    %int1_7210 = torch.constant.int 1
    %5129 = torch.aten.select.int %5117, %int5_7209, %int1_7210 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %5130 = torch.aten.mul.Tensor %5128, %5129 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_7211 = torch.constant.int 1
    %5131 = torch.aten.add.Tensor %5127, %5130, %int1_7211 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_7212 = torch.constant.int 1
    %int24_7213 = torch.constant.int 24
    %int4608_7214 = torch.constant.int 4608
    %int128_7215 = torch.constant.int 128
    %5132 = torch.prim.ListConstruct %int1_7212, %int24_7213, %int4608_7214, %int128_7215 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5133 = torch.aten.view %5124, %5132 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_7216 = torch.constant.int 5
    %5134 = torch.prims.convert_element_type %5133, %int5_7216 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_7217 = torch.constant.int 1
    %int24_7218 = torch.constant.int 24
    %int4608_7219 = torch.constant.int 4608
    %int128_7220 = torch.constant.int 128
    %5135 = torch.prim.ListConstruct %int1_7217, %int24_7218, %int4608_7219, %int128_7220 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5136 = torch.aten.view %5131, %5135 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_7221 = torch.constant.int 5
    %5137 = torch.prims.convert_element_type %5136, %int5_7221 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %float0.000000e00_7222 = torch.constant.float 0.000000e+00
    %false_7223 = torch.constant.bool false
    %none_7224 = torch.constant.none
    %none_7225 = torch.constant.none
    %5138:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%5134, %5137, %5111, %float0.000000e00_7222, %false_7223, %none_7224, %none_7225) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    %int0_7226 = torch.constant.int 0
    %int2_7227 = torch.constant.int 2
    %int1_7228 = torch.constant.int 1
    %int3_7229 = torch.constant.int 3
    %5139 = torch.prim.ListConstruct %int0_7226, %int2_7227, %int1_7228, %int3_7229 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5140 = torch.aten.permute %5138#0, %5139 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_7230 = torch.constant.int 1
    %int4608_7231 = torch.constant.int 4608
    %int3072_7232 = torch.constant.int 3072
    %5141 = torch.prim.ListConstruct %int1_7230, %int4608_7231, %int3072_7232 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5142 = torch.aten.view %5140, %5141 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    %int0_7233 = torch.constant.int 0
    %int0_7234 = torch.constant.int 0
    %int9223372036854775807_7235 = torch.constant.int 9223372036854775807
    %int1_7236 = torch.constant.int 1
    %5143 = torch.aten.slice.Tensor %5142, %int0_7233, %int0_7234, %int9223372036854775807_7235, %int1_7236 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_7237 = torch.constant.int 1
    %int0_7238 = torch.constant.int 0
    %int512_7239 = torch.constant.int 512
    %int1_7240 = torch.constant.int 1
    %5144 = torch.aten.slice.Tensor %5143, %int1_7237, %int0_7238, %int512_7239, %int1_7240 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %int0_7241 = torch.constant.int 0
    %int0_7242 = torch.constant.int 0
    %int9223372036854775807_7243 = torch.constant.int 9223372036854775807
    %int1_7244 = torch.constant.int 1
    %5145 = torch.aten.slice.Tensor %5142, %int0_7241, %int0_7242, %int9223372036854775807_7243, %int1_7244 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_7245 = torch.constant.int 1
    %int512_7246 = torch.constant.int 512
    %int9223372036854775807_7247 = torch.constant.int 9223372036854775807
    %int1_7248 = torch.constant.int 1
    %5146 = torch.aten.slice.Tensor %5145, %int1_7245, %int512_7246, %int9223372036854775807_7247, %int1_7248 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %int4096_7249 = torch.constant.int 4096
    %int3072_7250 = torch.constant.int 3072
    %5147 = torch.prim.ListConstruct %int4096_7249, %int3072_7250 : (!torch.int, !torch.int) -> !torch.list<int>
    %5148 = torch.aten.view %5146, %5147 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.14.img_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.14.img_attn.proj.weight : tensor<3072x3072xf16>
    %5149 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.img_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_7251 = torch.constant.int 0
    %int1_7252 = torch.constant.int 1
    %5150 = torch.aten.transpose.int %5149, %int0_7251, %int1_7252 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.14.img_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.14.img_attn.proj.bias : tensor<3072xf16>
    %5151 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.img_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_7253 = torch.constant.int 6
    %5152 = torch.prims.convert_element_type %5151, %int6_7253 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_7254 = torch.constant.int 6
    %5153 = torch.prims.convert_element_type %5148, %int6_7254 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_7255 = torch.constant.int 6
    %5154 = torch.prims.convert_element_type %5150, %int6_7255 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %5155 = torch.aten.mm %5153, %5154 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    %int1_7256 = torch.constant.int 1
    %5156 = torch.aten.mul.Scalar %5155, %int1_7256 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_7257 = torch.constant.int 1
    %5157 = torch.aten.mul.Scalar %5152, %int1_7257 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_7258 = torch.constant.int 1
    %5158 = torch.aten.add.Tensor %5156, %5157, %int1_7258 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_7259 = torch.constant.int 5
    %5159 = torch.prims.convert_element_type %5158, %int5_7259 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_7260 = torch.constant.int 1
    %int4096_7261 = torch.constant.int 4096
    %int3072_7262 = torch.constant.int 3072
    %5160 = torch.prim.ListConstruct %int1_7260, %int4096_7261, %int3072_7262 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5161 = torch.aten.view %5159, %5160 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    %5162 = torch.aten.mul.Tensor %4947, %5161 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_7263 = torch.constant.int 1
    %5163 = torch.aten.add.Tensor %4869, %5162, %int1_7263 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %int1_7264 = torch.constant.int 1
    %int1_7265 = torch.constant.int 1
    %5164 = torch.aten.add.Scalar %4949, %int1_7264, %int1_7265 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int6_7266 = torch.constant.int 6
    %5165 = torch.prims.convert_element_type %5163, %int6_7266 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %int2_7267 = torch.constant.int 2
    %5166 = torch.prim.ListConstruct %int2_7267 : (!torch.int) -> !torch.list<int>
    %int0_7268 = torch.constant.int 0
    %true_7269 = torch.constant.bool true
    %result0_7270, %result1_7271 = torch.aten.var_mean.correction %5165, %5166, %int0_7268, %true_7269 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    %float9.999990e-07_7272 = torch.constant.float 9.9999999999999995E-7
    %int1_7273 = torch.constant.int 1
    %5167 = torch.aten.add.Scalar %result0_7270, %float9.999990e-07_7272, %int1_7273 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %5168 = torch.aten.rsqrt %5167 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    %int1_7274 = torch.constant.int 1
    %5169 = torch.aten.sub.Tensor %5163, %result1_7271, %int1_7274 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %5170 = torch.aten.mul.Tensor %5169, %5168 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    %int5_7275 = torch.constant.int 5
    %5171 = torch.prims.convert_element_type %5170, %int5_7275 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %5172 = torch.aten.mul.Tensor %5164, %5171 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_7276 = torch.constant.int 1
    %5173 = torch.aten.add.Tensor %5172, %4948, %int1_7276 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %int4096_7277 = torch.constant.int 4096
    %int3072_7278 = torch.constant.int 3072
    %5174 = torch.prim.ListConstruct %int4096_7277, %int3072_7278 : (!torch.int, !torch.int) -> !torch.list<int>
    %5175 = torch.aten.view %5173, %5174 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.14.img_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.14.img_mlp.0.weight : tensor<12288x3072xf16>
    %5176 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.img_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_7279 = torch.constant.int 0
    %int1_7280 = torch.constant.int 1
    %5177 = torch.aten.transpose.int %5176, %int0_7279, %int1_7280 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.14.img_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.14.img_mlp.0.bias : tensor<12288xf16>
    %5178 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.img_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    %int6_7281 = torch.constant.int 6
    %5179 = torch.prims.convert_element_type %5178, %int6_7281 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_7282 = torch.constant.int 6
    %5180 = torch.prims.convert_element_type %5175, %int6_7282 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_7283 = torch.constant.int 6
    %5181 = torch.prims.convert_element_type %5177, %int6_7283 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %5182 = torch.aten.mm %5180, %5181 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[4096,12288],f32>
    %int1_7284 = torch.constant.int 1
    %5183 = torch.aten.mul.Scalar %5182, %int1_7284 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int1_7285 = torch.constant.int 1
    %5184 = torch.aten.mul.Scalar %5179, %int1_7285 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_7286 = torch.constant.int 1
    %5185 = torch.aten.add.Tensor %5183, %5184, %int1_7286 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int5_7287 = torch.constant.int 5
    %5186 = torch.prims.convert_element_type %5185, %int5_7287 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f16>
    %int1_7288 = torch.constant.int 1
    %int4096_7289 = torch.constant.int 4096
    %int12288_7290 = torch.constant.int 12288
    %5187 = torch.prim.ListConstruct %int1_7288, %int4096_7289, %int12288_7290 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5188 = torch.aten.view %5186, %5187 : !torch.vtensor<[4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,12288],f16>
    %str_7291 = torch.constant.str "tanh"
    %5189 = torch.aten.gelu %5188, %str_7291 : !torch.vtensor<[1,4096,12288],f16>, !torch.str -> !torch.vtensor<[1,4096,12288],f16>
    %int4096_7292 = torch.constant.int 4096
    %int12288_7293 = torch.constant.int 12288
    %5190 = torch.prim.ListConstruct %int4096_7292, %int12288_7293 : (!torch.int, !torch.int) -> !torch.list<int>
    %5191 = torch.aten.view %5189, %5190 : !torch.vtensor<[1,4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[4096,12288],f16>
    %__auto.sampler.double_blocks.14.img_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.14.img_mlp.2.weight : tensor<3072x12288xf16>
    %5192 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.img_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_7294 = torch.constant.int 0
    %int1_7295 = torch.constant.int 1
    %5193 = torch.aten.transpose.int %5192, %int0_7294, %int1_7295 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.14.img_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.14.img_mlp.2.bias : tensor<3072xf16>
    %5194 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.img_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_7296 = torch.constant.int 6
    %5195 = torch.prims.convert_element_type %5194, %int6_7296 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_7297 = torch.constant.int 6
    %5196 = torch.prims.convert_element_type %5191, %int6_7297 : !torch.vtensor<[4096,12288],f16>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int6_7298 = torch.constant.int 6
    %5197 = torch.prims.convert_element_type %5193, %int6_7298 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %5198 = torch.aten.mm %5196, %5197 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    %int1_7299 = torch.constant.int 1
    %5199 = torch.aten.mul.Scalar %5198, %int1_7299 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_7300 = torch.constant.int 1
    %5200 = torch.aten.mul.Scalar %5195, %int1_7300 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_7301 = torch.constant.int 1
    %5201 = torch.aten.add.Tensor %5199, %5200, %int1_7301 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_7302 = torch.constant.int 5
    %5202 = torch.prims.convert_element_type %5201, %int5_7302 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_7303 = torch.constant.int 1
    %int4096_7304 = torch.constant.int 4096
    %int3072_7305 = torch.constant.int 3072
    %5203 = torch.prim.ListConstruct %int1_7303, %int4096_7304, %int3072_7305 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5204 = torch.aten.view %5202, %5203 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    %5205 = torch.aten.mul.Tensor %4950, %5204 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_7306 = torch.constant.int 1
    %5206 = torch.aten.add.Tensor %5163, %5205, %int1_7306 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %int512_7307 = torch.constant.int 512
    %int3072_7308 = torch.constant.int 3072
    %5207 = torch.prim.ListConstruct %int512_7307, %int3072_7308 : (!torch.int, !torch.int) -> !torch.list<int>
    %5208 = torch.aten.view %5144, %5207 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.14.txt_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.14.txt_attn.proj.weight : tensor<3072x3072xf16>
    %5209 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.txt_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_7309 = torch.constant.int 0
    %int1_7310 = torch.constant.int 1
    %5210 = torch.aten.transpose.int %5209, %int0_7309, %int1_7310 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.14.txt_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.14.txt_attn.proj.bias : tensor<3072xf16>
    %5211 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.txt_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_7311 = torch.constant.int 6
    %5212 = torch.prims.convert_element_type %5211, %int6_7311 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_7312 = torch.constant.int 6
    %5213 = torch.prims.convert_element_type %5208, %int6_7312 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_7313 = torch.constant.int 6
    %5214 = torch.prims.convert_element_type %5210, %int6_7313 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %5215 = torch.aten.mm %5213, %5214 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[512,3072],f32>
    %int1_7314 = torch.constant.int 1
    %5216 = torch.aten.mul.Scalar %5215, %int1_7314 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_7315 = torch.constant.int 1
    %5217 = torch.aten.mul.Scalar %5212, %int1_7315 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_7316 = torch.constant.int 1
    %5218 = torch.aten.add.Tensor %5216, %5217, %int1_7316 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_7317 = torch.constant.int 5
    %5219 = torch.prims.convert_element_type %5218, %int5_7317 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_7318 = torch.constant.int 1
    %int512_7319 = torch.constant.int 512
    %int3072_7320 = torch.constant.int 3072
    %5220 = torch.prim.ListConstruct %int1_7318, %int512_7319, %int3072_7320 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5221 = torch.aten.view %5219, %5220 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    %5222 = torch.aten.mul.Tensor %4968, %5221 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_7321 = torch.constant.int 1
    %5223 = torch.aten.add.Tensor %4929, %5222, %int1_7321 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %int1_7322 = torch.constant.int 1
    %int1_7323 = torch.constant.int 1
    %5224 = torch.aten.add.Scalar %4970, %int1_7322, %int1_7323 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int6_7324 = torch.constant.int 6
    %5225 = torch.prims.convert_element_type %5223, %int6_7324 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %int2_7325 = torch.constant.int 2
    %5226 = torch.prim.ListConstruct %int2_7325 : (!torch.int) -> !torch.list<int>
    %int0_7326 = torch.constant.int 0
    %true_7327 = torch.constant.bool true
    %result0_7328, %result1_7329 = torch.aten.var_mean.correction %5225, %5226, %int0_7326, %true_7327 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    %float9.999990e-07_7330 = torch.constant.float 9.9999999999999995E-7
    %int1_7331 = torch.constant.int 1
    %5227 = torch.aten.add.Scalar %result0_7328, %float9.999990e-07_7330, %int1_7331 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %5228 = torch.aten.rsqrt %5227 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    %int1_7332 = torch.constant.int 1
    %5229 = torch.aten.sub.Tensor %5223, %result1_7329, %int1_7332 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %5230 = torch.aten.mul.Tensor %5229, %5228 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_7333 = torch.constant.int 5
    %5231 = torch.prims.convert_element_type %5230, %int5_7333 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %5232 = torch.aten.mul.Tensor %5224, %5231 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_7334 = torch.constant.int 1
    %5233 = torch.aten.add.Tensor %5232, %4969, %int1_7334 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %int512_7335 = torch.constant.int 512
    %int3072_7336 = torch.constant.int 3072
    %5234 = torch.prim.ListConstruct %int512_7335, %int3072_7336 : (!torch.int, !torch.int) -> !torch.list<int>
    %5235 = torch.aten.view %5233, %5234 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.14.txt_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.14.txt_mlp.0.weight : tensor<12288x3072xf16>
    %5236 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.txt_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_7337 = torch.constant.int 0
    %int1_7338 = torch.constant.int 1
    %5237 = torch.aten.transpose.int %5236, %int0_7337, %int1_7338 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.14.txt_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.14.txt_mlp.0.bias : tensor<12288xf16>
    %5238 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.txt_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    %int6_7339 = torch.constant.int 6
    %5239 = torch.prims.convert_element_type %5238, %int6_7339 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_7340 = torch.constant.int 6
    %5240 = torch.prims.convert_element_type %5235, %int6_7340 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_7341 = torch.constant.int 6
    %5241 = torch.prims.convert_element_type %5237, %int6_7341 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %5242 = torch.aten.mm %5240, %5241 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[512,12288],f32>
    %int1_7342 = torch.constant.int 1
    %5243 = torch.aten.mul.Scalar %5242, %int1_7342 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int1_7343 = torch.constant.int 1
    %5244 = torch.aten.mul.Scalar %5239, %int1_7343 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_7344 = torch.constant.int 1
    %5245 = torch.aten.add.Tensor %5243, %5244, %int1_7344 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int5_7345 = torch.constant.int 5
    %5246 = torch.prims.convert_element_type %5245, %int5_7345 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f16>
    %int1_7346 = torch.constant.int 1
    %int512_7347 = torch.constant.int 512
    %int12288_7348 = torch.constant.int 12288
    %5247 = torch.prim.ListConstruct %int1_7346, %int512_7347, %int12288_7348 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5248 = torch.aten.view %5246, %5247 : !torch.vtensor<[512,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,512,12288],f16>
    %str_7349 = torch.constant.str "tanh"
    %5249 = torch.aten.gelu %5248, %str_7349 : !torch.vtensor<[1,512,12288],f16>, !torch.str -> !torch.vtensor<[1,512,12288],f16>
    %int512_7350 = torch.constant.int 512
    %int12288_7351 = torch.constant.int 12288
    %5250 = torch.prim.ListConstruct %int512_7350, %int12288_7351 : (!torch.int, !torch.int) -> !torch.list<int>
    %5251 = torch.aten.view %5249, %5250 : !torch.vtensor<[1,512,12288],f16>, !torch.list<int> -> !torch.vtensor<[512,12288],f16>
    %__auto.sampler.double_blocks.14.txt_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.14.txt_mlp.2.weight : tensor<3072x12288xf16>
    %5252 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.txt_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_7352 = torch.constant.int 0
    %int1_7353 = torch.constant.int 1
    %5253 = torch.aten.transpose.int %5252, %int0_7352, %int1_7353 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.14.txt_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.14.txt_mlp.2.bias : tensor<3072xf16>
    %5254 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.txt_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_7354 = torch.constant.int 6
    %5255 = torch.prims.convert_element_type %5254, %int6_7354 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_7355 = torch.constant.int 6
    %5256 = torch.prims.convert_element_type %5251, %int6_7355 : !torch.vtensor<[512,12288],f16>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int6_7356 = torch.constant.int 6
    %5257 = torch.prims.convert_element_type %5253, %int6_7356 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %5258 = torch.aten.mm %5256, %5257 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[512,3072],f32>
    %int1_7357 = torch.constant.int 1
    %5259 = torch.aten.mul.Scalar %5258, %int1_7357 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_7358 = torch.constant.int 1
    %5260 = torch.aten.mul.Scalar %5255, %int1_7358 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_7359 = torch.constant.int 1
    %5261 = torch.aten.add.Tensor %5259, %5260, %int1_7359 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_7360 = torch.constant.int 5
    %5262 = torch.prims.convert_element_type %5261, %int5_7360 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_7361 = torch.constant.int 1
    %int512_7362 = torch.constant.int 512
    %int3072_7363 = torch.constant.int 3072
    %5263 = torch.prim.ListConstruct %int1_7361, %int512_7362, %int3072_7363 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5264 = torch.aten.view %5262, %5263 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    %5265 = torch.aten.mul.Tensor %4971, %5264 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_7364 = torch.constant.int 1
    %5266 = torch.aten.add.Tensor %5223, %5265, %int1_7364 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %5267 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.double_blocks.15.img_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.15.img_mod.lin.weight : tensor<18432x3072xf16>
    %5268 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.img_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_7365 = torch.constant.int 0
    %int1_7366 = torch.constant.int 1
    %5269 = torch.aten.transpose.int %5268, %int0_7365, %int1_7366 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.15.img_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.15.img_mod.lin.bias : tensor<18432xf16>
    %5270 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.img_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    %int6_7367 = torch.constant.int 6
    %5271 = torch.prims.convert_element_type %5270, %int6_7367 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_7368 = torch.constant.int 6
    %5272 = torch.prims.convert_element_type %5267, %int6_7368 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_7369 = torch.constant.int 6
    %5273 = torch.prims.convert_element_type %5269, %int6_7369 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %5274 = torch.aten.mm %5272, %5273 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    %int1_7370 = torch.constant.int 1
    %5275 = torch.aten.mul.Scalar %5274, %int1_7370 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_7371 = torch.constant.int 1
    %5276 = torch.aten.mul.Scalar %5271, %int1_7371 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_7372 = torch.constant.int 1
    %5277 = torch.aten.add.Tensor %5275, %5276, %int1_7372 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_7373 = torch.constant.int 5
    %5278 = torch.prims.convert_element_type %5277, %int5_7373 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    %int0_7374 = torch.constant.int 0
    %int0_7375 = torch.constant.int 0
    %int9223372036854775807_7376 = torch.constant.int 9223372036854775807
    %int1_7377 = torch.constant.int 1
    %5279 = torch.aten.slice.Tensor %5278, %int0_7374, %int0_7375, %int9223372036854775807_7376, %int1_7377 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    %int1_7378 = torch.constant.int 1
    %5280 = torch.aten.unsqueeze %5279, %int1_7378 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    %int2_7379 = torch.constant.int 2
    %int0_7380 = torch.constant.int 0
    %int9223372036854775807_7381 = torch.constant.int 9223372036854775807
    %int1_7382 = torch.constant.int 1
    %5281 = torch.aten.slice.Tensor %5280, %int2_7379, %int0_7380, %int9223372036854775807_7381, %int1_7382 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    %int-1_7383 = torch.constant.int -1
    %int0_7384 = torch.constant.int 0
    %int3072_7385 = torch.constant.int 3072
    %int1_7386 = torch.constant.int 1
    %5282 = torch.aten.slice.Tensor %5281, %int-1_7383, %int0_7384, %int3072_7385, %int1_7386 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_7387 = torch.constant.int -1
    %int3072_7388 = torch.constant.int 3072
    %int6144_7389 = torch.constant.int 6144
    %int1_7390 = torch.constant.int 1
    %5283 = torch.aten.slice.Tensor %5281, %int-1_7387, %int3072_7388, %int6144_7389, %int1_7390 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_7391 = torch.constant.int -1
    %int6144_7392 = torch.constant.int 6144
    %int9216_7393 = torch.constant.int 9216
    %int1_7394 = torch.constant.int 1
    %5284 = torch.aten.slice.Tensor %5281, %int-1_7391, %int6144_7392, %int9216_7393, %int1_7394 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_7395 = torch.constant.int -1
    %int9216_7396 = torch.constant.int 9216
    %int12288_7397 = torch.constant.int 12288
    %int1_7398 = torch.constant.int 1
    %5285 = torch.aten.slice.Tensor %5281, %int-1_7395, %int9216_7396, %int12288_7397, %int1_7398 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_7399 = torch.constant.int -1
    %int12288_7400 = torch.constant.int 12288
    %int15360_7401 = torch.constant.int 15360
    %int1_7402 = torch.constant.int 1
    %5286 = torch.aten.slice.Tensor %5281, %int-1_7399, %int12288_7400, %int15360_7401, %int1_7402 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_7403 = torch.constant.int -1
    %int15360_7404 = torch.constant.int 15360
    %int18432_7405 = torch.constant.int 18432
    %int1_7406 = torch.constant.int 1
    %5287 = torch.aten.slice.Tensor %5281, %int-1_7403, %int15360_7404, %int18432_7405, %int1_7406 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // double_blocks.15.txt_mod.lin: SiLU(%119) followed by x @ W^T + b,
    // mapping [1,3072] -> [1,18432].
    // Operands are widened to f32 (dtype code 6) for the matmul and bias add,
    // and the sum is narrowed back to f16 (dtype code 5) in %5299.
    %5288 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the [18432,3072] weight and transpose dims 0 and 1 to get [3072,18432].
    %__auto.sampler.double_blocks.15.txt_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.15.txt_mod.lin.weight : tensor<18432x3072xf16>
    %5289 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.txt_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_7407 = torch.constant.int 0
    %int1_7408 = torch.constant.int 1
    %5290 = torch.aten.transpose.int %5289, %int0_7407, %int1_7408 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.15.txt_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.15.txt_mod.lin.bias : tensor<18432xf16>
    %5291 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.txt_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Widen bias, activation and transposed weight to f32.
    %int6_7409 = torch.constant.int 6
    %5292 = torch.prims.convert_element_type %5291, %int6_7409 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_7410 = torch.constant.int 6
    %5293 = torch.prims.convert_element_type %5288, %int6_7410 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_7411 = torch.constant.int 6
    %5294 = torch.prims.convert_element_type %5290, %int6_7411 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %5295 = torch.aten.mm %5293, %5294 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Both the product and the bias are multiplied by the constant 1 before
    // being added (presumably alpha/beta factors left over from an addmm
    // decomposition -- TODO confirm).
    %int1_7412 = torch.constant.int 1
    %5296 = torch.aten.mul.Scalar %5295, %int1_7412 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_7413 = torch.constant.int 1
    %5297 = torch.aten.mul.Scalar %5292, %int1_7413 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_7414 = torch.constant.int 1
    %5298 = torch.aten.add.Tensor %5296, %5297, %int1_7414 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_7415 = torch.constant.int 5
    %5299 = torch.prims.convert_element_type %5298, %int5_7415 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Reshape the txt_mod.lin output %5299 to [1,1,18432] and split its last
    // dimension into six consecutive 3072-wide chunks %5303..%5308.
    // The two slices that use end = 9223372036854775807 (INT64 max) span the
    // whole dimension; their result shapes equal their input shapes.
    %int0_7416 = torch.constant.int 0
    %int0_7417 = torch.constant.int 0
    %int9223372036854775807_7418 = torch.constant.int 9223372036854775807
    %int1_7419 = torch.constant.int 1
    %5300 = torch.aten.slice.Tensor %5299, %int0_7416, %int0_7417, %int9223372036854775807_7418, %int1_7419 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a unit dimension at position 1: [1,18432] -> [1,1,18432].
    %int1_7420 = torch.constant.int 1
    %5301 = torch.aten.unsqueeze %5300, %int1_7420 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    %int2_7421 = torch.constant.int 2
    %int0_7422 = torch.constant.int 0
    %int9223372036854775807_7423 = torch.constant.int 9223372036854775807
    %int1_7424 = torch.constant.int 1
    %5302 = torch.aten.slice.Tensor %5301, %int2_7421, %int0_7422, %int9223372036854775807_7423, %int1_7424 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // %5303 = [0, 3072)
    %int-1_7425 = torch.constant.int -1
    %int0_7426 = torch.constant.int 0
    %int3072_7427 = torch.constant.int 3072
    %int1_7428 = torch.constant.int 1
    %5303 = torch.aten.slice.Tensor %5302, %int-1_7425, %int0_7426, %int3072_7427, %int1_7428 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %5304 = [3072, 6144)
    %int-1_7429 = torch.constant.int -1
    %int3072_7430 = torch.constant.int 3072
    %int6144_7431 = torch.constant.int 6144
    %int1_7432 = torch.constant.int 1
    %5304 = torch.aten.slice.Tensor %5302, %int-1_7429, %int3072_7430, %int6144_7431, %int1_7432 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %5305 = [6144, 9216)
    %int-1_7433 = torch.constant.int -1
    %int6144_7434 = torch.constant.int 6144
    %int9216_7435 = torch.constant.int 9216
    %int1_7436 = torch.constant.int 1
    %5305 = torch.aten.slice.Tensor %5302, %int-1_7433, %int6144_7434, %int9216_7435, %int1_7436 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %5306 = [9216, 12288)
    %int-1_7437 = torch.constant.int -1
    %int9216_7438 = torch.constant.int 9216
    %int12288_7439 = torch.constant.int 12288
    %int1_7440 = torch.constant.int 1
    %5306 = torch.aten.slice.Tensor %5302, %int-1_7437, %int9216_7438, %int12288_7439, %int1_7440 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %5307 = [12288, 15360)
    %int-1_7441 = torch.constant.int -1
    %int12288_7442 = torch.constant.int 12288
    %int15360_7443 = torch.constant.int 15360
    %int1_7444 = torch.constant.int 1
    %5307 = torch.aten.slice.Tensor %5302, %int-1_7441, %int12288_7442, %int15360_7443, %int1_7444 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %5308 = [15360, 18432)
    %int-1_7445 = torch.constant.int -1
    %int15360_7446 = torch.constant.int 15360
    %int18432_7447 = torch.constant.int 18432
    %int1_7448 = torch.constant.int 1
    %5308 = torch.aten.slice.Tensor %5302, %int-1_7445, %int15360_7446, %int18432_7447, %int1_7448 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %5206 ([1,4096,3072]) over dim 2 with no learned weight/bias:
    //   (x - mean) * rsqrt(var + eps), eps = 9.9999999999999995E-7 (~1e-6).
    // var and mean come from an f32 copy of the input (correction = 0,
    // keepdim = true); the normalized value is narrowed to f16 in %5315.
    %int6_7449 = torch.constant.int 6
    %5309 = torch.prims.convert_element_type %5206, %int6_7449 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %int2_7450 = torch.constant.int 2
    %5310 = torch.prim.ListConstruct %int2_7450 : (!torch.int) -> !torch.list<int>
    %int0_7451 = torch.constant.int 0
    %true_7452 = torch.constant.bool true
    // result0 is the variance, result1 is the mean.
    %result0_7453, %result1_7454 = torch.aten.var_mean.correction %5309, %5310, %int0_7451, %true_7452 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    %float9.999990e-07_7455 = torch.constant.float 9.9999999999999995E-7
    %int1_7456 = torch.constant.int 1
    %5311 = torch.aten.add.Scalar %result0_7453, %float9.999990e-07_7455, %int1_7456 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %5312 = torch.aten.rsqrt %5311 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // The subtraction takes the original f16 tensor and the f32 mean; the
    // result type is f32.
    %int1_7457 = torch.constant.int 1
    %5313 = torch.aten.sub.Tensor %5206, %result1_7454, %int1_7457 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %5314 = torch.aten.mul.Tensor %5313, %5312 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    %int5_7458 = torch.constant.int 5
    %5315 = torch.prims.convert_element_type %5314, %int5_7458 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Modulate: %5318 = (1 + %5283) * normalized + %5282, broadcasting the
    // [1,1,3072] operands over the 4096 rows.
    // NOTE(review): %5283 / %5282 are defined earlier in the function;
    // presumably the scale and shift chunks of the img modulation -- confirm.
    %int1_7459 = torch.constant.int 1
    %int1_7460 = torch.constant.int 1
    %5316 = torch.aten.add.Scalar %5283, %int1_7459, %int1_7460 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %5317 = torch.aten.mul.Tensor %5316, %5315 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_7461 = torch.constant.int 1
    %5318 = torch.aten.add.Tensor %5317, %5282, %int1_7461 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // double_blocks.15.img_attn.qkv: x @ W^T + b applied to the modulated
    // tensor %5318. The input is flattened to [4096,3072], the matmul and bias
    // add run in f32 (dtype code 6), the result is narrowed to f16 (dtype
    // code 5) and viewed back as [1,4096,9216].
    %int4096_7462 = torch.constant.int 4096
    %int3072_7463 = torch.constant.int 3072
    %5319 = torch.prim.ListConstruct %int4096_7462, %int3072_7463 : (!torch.int, !torch.int) -> !torch.list<int>
    %5320 = torch.aten.view %5318, %5319 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Load the [9216,3072] weight and transpose dims 0 and 1 to get [3072,9216].
    %__auto.sampler.double_blocks.15.img_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.15.img_attn.qkv.weight : tensor<9216x3072xf16>
    %5321 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.img_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_7464 = torch.constant.int 0
    %int1_7465 = torch.constant.int 1
    %5322 = torch.aten.transpose.int %5321, %int0_7464, %int1_7465 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.15.img_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.15.img_attn.qkv.bias : tensor<9216xf16>
    %5323 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.img_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Widen bias, activation and transposed weight to f32.
    %int6_7466 = torch.constant.int 6
    %5324 = torch.prims.convert_element_type %5323, %int6_7466 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_7467 = torch.constant.int 6
    %5325 = torch.prims.convert_element_type %5320, %int6_7467 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_7468 = torch.constant.int 6
    %5326 = torch.prims.convert_element_type %5322, %int6_7468 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %5327 = torch.aten.mm %5325, %5326 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[4096,9216],f32>
    // Product and bias are each multiplied by the constant 1, then summed.
    %int1_7469 = torch.constant.int 1
    %5328 = torch.aten.mul.Scalar %5327, %int1_7469 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int1_7470 = torch.constant.int 1
    %5329 = torch.aten.mul.Scalar %5324, %int1_7470 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_7471 = torch.constant.int 1
    %5330 = torch.aten.add.Tensor %5328, %5329, %int1_7471 : !torch.vtensor<[4096,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int5_7472 = torch.constant.int 5
    %5331 = torch.prims.convert_element_type %5330, %int5_7472 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f16>
    // Restore the leading unit dimension: [4096,9216] -> [1,4096,9216].
    %int1_7473 = torch.constant.int 1
    %int4096_7474 = torch.constant.int 4096
    %int9216_7475 = torch.constant.int 9216
    %5332 = torch.prim.ListConstruct %int1_7473, %int4096_7474, %int9216_7475 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5333 = torch.aten.view %5331, %5332 : !torch.vtensor<[4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,9216],f16>
    // Trace point "img_qkv": convert %5333 to a builtin tensor, cast it to a
    // fully dynamic shape, read its three dimensions and pass tensor plus dims
    // to flow.tensor.trace. The value is then cast back to the static
    // 1x4096x9216 shape and re-wrapped as the torch tensor %5335, which the
    // following ops consume.
    %5334 = torch_c.to_builtin_tensor %5333 : !torch.vtensor<[1,4096,9216],f16> -> tensor<1x4096x9216xf16>
    %cast_7476 = tensor.cast %5334 : tensor<1x4096x9216xf16> to tensor<?x?x?xf16>
    %c0_7477 = arith.constant 0 : index
    %dim_7478 = tensor.dim %cast_7476, %c0_7477 : tensor<?x?x?xf16>
    %c1_7479 = arith.constant 1 : index
    %dim_7480 = tensor.dim %cast_7476, %c1_7479 : tensor<?x?x?xf16>
    %c2_7481 = arith.constant 2 : index
    %dim_7482 = tensor.dim %cast_7476, %c2_7481 : tensor<?x?x?xf16>
    flow.tensor.trace "img_qkv" = [%cast_7476 : tensor<?x?x?xf16>{%dim_7478, %dim_7480, %dim_7482}]
    %cast_7483 = tensor.cast %cast_7476 : tensor<?x?x?xf16> to tensor<1x4096x9216xf16>
    %5335 = torch_c.from_builtin_tensor %cast_7483 : tensor<1x4096x9216xf16> -> !torch.vtensor<[1,4096,9216],f16>
    // Extract slice 0 of the img qkv tensor and normalize it.
    // View [1,4096,9216] as [1,4096,3,24,128], permute with order (2,0,3,1,4)
    // to [3,1,24,4096,128] and select index 0 on dim 0.
    // NOTE(review): the (3,24,128) factors are presumably (q/k/v, heads,
    // head_dim); the use of query_norm.scale below suggests index 0 is the
    // query -- confirm against the model definition.
    %int1_7484 = torch.constant.int 1
    %int4096_7485 = torch.constant.int 4096
    %int3_7486 = torch.constant.int 3
    %int24_7487 = torch.constant.int 24
    %int128_7488 = torch.constant.int 128
    %5336 = torch.prim.ListConstruct %int1_7484, %int4096_7485, %int3_7486, %int24_7487, %int128_7488 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5337 = torch.aten.view %5335, %5336 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_7489 = torch.constant.int 2
    %int0_7490 = torch.constant.int 0
    %int3_7491 = torch.constant.int 3
    %int1_7492 = torch.constant.int 1
    %int4_7493 = torch.constant.int 4
    %5338 = torch.prim.ListConstruct %int2_7489, %int0_7490, %int3_7491, %int1_7492, %int4_7493 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5339 = torch.aten.permute %5337, %5338 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    %int0_7494 = torch.constant.int 0
    %int0_7495 = torch.constant.int 0
    %5340 = torch.aten.select.int %5339, %int0_7494, %int0_7495 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // RMS normalization over the last dim, computed in f32:
    //   x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps), eps ~ 1e-6.
    %int6_7496 = torch.constant.int 6
    %5341 = torch.prims.convert_element_type %5340, %int6_7496 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_7497 = torch.constant.int 2
    %5342 = torch.aten.pow.Tensor_Scalar %5341, %int2_7497 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_7498 = torch.constant.int -1
    %5343 = torch.prim.ListConstruct %int-1_7498 : (!torch.int) -> !torch.list<int>
    %true_7499 = torch.constant.bool true
    %none_7500 = torch.constant.none
    %5344 = torch.aten.mean.dim %5342, %5343, %true_7499, %none_7500 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_7501 = torch.constant.float 9.9999999999999995E-7
    %int1_7502 = torch.constant.int 1
    %5345 = torch.aten.add.Scalar %5344, %float9.999990e-07_7501, %int1_7502 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %5346 = torch.aten.rsqrt %5345 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %5347 = torch.aten.mul.Tensor %5341, %5346 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Narrow to f16, then multiply by the [128] query_norm.scale parameter.
    %int5_7503 = torch.constant.int 5
    %5348 = torch.prims.convert_element_type %5347, %int5_7503 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.15.img_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.15.img_attn.norm.query_norm.scale : tensor<128xf16>
    %5349 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.img_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %5350 = torch.aten.mul.Tensor %5348, %5349 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // Extract slice 1 of the img qkv tensor %5335 and normalize it.
    // The same view ([1,4096,3,24,128]) and permute (2,0,3,1,4) as used for
    // slice 0 are rebuilt here from %5335, followed by select index 1 on dim 0.
    // NOTE(review): the use of key_norm.scale below suggests index 1 is the
    // key -- confirm against the model definition.
    %int1_7504 = torch.constant.int 1
    %int4096_7505 = torch.constant.int 4096
    %int3_7506 = torch.constant.int 3
    %int24_7507 = torch.constant.int 24
    %int128_7508 = torch.constant.int 128
    %5351 = torch.prim.ListConstruct %int1_7504, %int4096_7505, %int3_7506, %int24_7507, %int128_7508 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5352 = torch.aten.view %5335, %5351 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_7509 = torch.constant.int 2
    %int0_7510 = torch.constant.int 0
    %int3_7511 = torch.constant.int 3
    %int1_7512 = torch.constant.int 1
    %int4_7513 = torch.constant.int 4
    %5353 = torch.prim.ListConstruct %int2_7509, %int0_7510, %int3_7511, %int1_7512, %int4_7513 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5354 = torch.aten.permute %5352, %5353 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    %int0_7514 = torch.constant.int 0
    %int1_7515 = torch.constant.int 1
    %5355 = torch.aten.select.int %5354, %int0_7514, %int1_7515 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // RMS normalization over the last dim, computed in f32:
    //   x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps), eps ~ 1e-6.
    %int6_7516 = torch.constant.int 6
    %5356 = torch.prims.convert_element_type %5355, %int6_7516 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_7517 = torch.constant.int 2
    %5357 = torch.aten.pow.Tensor_Scalar %5356, %int2_7517 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_7518 = torch.constant.int -1
    %5358 = torch.prim.ListConstruct %int-1_7518 : (!torch.int) -> !torch.list<int>
    %true_7519 = torch.constant.bool true
    %none_7520 = torch.constant.none
    %5359 = torch.aten.mean.dim %5357, %5358, %true_7519, %none_7520 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_7521 = torch.constant.float 9.9999999999999995E-7
    %int1_7522 = torch.constant.int 1
    %5360 = torch.aten.add.Scalar %5359, %float9.999990e-07_7521, %int1_7522 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %5361 = torch.aten.rsqrt %5360 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %5362 = torch.aten.mul.Tensor %5356, %5361 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Narrow to f16, then multiply by the [128] key_norm.scale parameter.
    %int5_7523 = torch.constant.int 5
    %5363 = torch.prims.convert_element_type %5362, %int5_7523 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.15.img_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.15.img_attn.norm.key_norm.scale : tensor<128xf16>
    %5364 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.img_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %5365 = torch.aten.mul.Tensor %5363, %5364 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // Convert the normalized img tensors %5350 and %5365 with dtype code 5.
    // Input and result types are both f16, so the element type is unchanged;
    // the results %5366 / %5367 are what the later concatenations consume.
    %int5_7524 = torch.constant.int 5
    %5366 = torch.prims.convert_element_type %5350, %int5_7524 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_7525 = torch.constant.int 5
    %5367 = torch.prims.convert_element_type %5365, %int5_7525 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Normalize %5266 ([1,512,3072]) over dim 2 with no learned weight/bias:
    //   (x - mean) * rsqrt(var + eps), eps = 9.9999999999999995E-7 (~1e-6).
    // var and mean come from an f32 copy of the input (correction = 0,
    // keepdim = true); the normalized value is narrowed to f16 in %5374.
    %int6_7526 = torch.constant.int 6
    %5368 = torch.prims.convert_element_type %5266, %int6_7526 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %int2_7527 = torch.constant.int 2
    %5369 = torch.prim.ListConstruct %int2_7527 : (!torch.int) -> !torch.list<int>
    %int0_7528 = torch.constant.int 0
    %true_7529 = torch.constant.bool true
    // result0 is the variance, result1 is the mean.
    %result0_7530, %result1_7531 = torch.aten.var_mean.correction %5368, %5369, %int0_7528, %true_7529 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    %float9.999990e-07_7532 = torch.constant.float 9.9999999999999995E-7
    %int1_7533 = torch.constant.int 1
    %5370 = torch.aten.add.Scalar %result0_7530, %float9.999990e-07_7532, %int1_7533 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %5371 = torch.aten.rsqrt %5370 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // The subtraction takes the original f16 tensor and the f32 mean; the
    // result type is f32.
    %int1_7534 = torch.constant.int 1
    %5372 = torch.aten.sub.Tensor %5266, %result1_7531, %int1_7534 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %5373 = torch.aten.mul.Tensor %5372, %5371 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_7535 = torch.constant.int 5
    %5374 = torch.prims.convert_element_type %5373, %int5_7535 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Modulate: %5377 = (1 + %5304) * normalized + %5303, broadcasting the
    // [1,1,3072] operands over the 512 rows. %5304 and %5303 are the
    // [3072, 6144) and [0, 3072) slices of the txt_mod.lin output.
    %int1_7536 = torch.constant.int 1
    %int1_7537 = torch.constant.int 1
    %5375 = torch.aten.add.Scalar %5304, %int1_7536, %int1_7537 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %5376 = torch.aten.mul.Tensor %5375, %5374 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_7538 = torch.constant.int 1
    %5377 = torch.aten.add.Tensor %5376, %5303, %int1_7538 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // double_blocks.15.txt_attn.qkv: x @ W^T + b applied to the modulated
    // tensor %5377. The input is flattened to [512,3072], the matmul and bias
    // add run in f32 (dtype code 6), the result is narrowed to f16 (dtype
    // code 5) and viewed back as [1,512,9216].
    %int512_7539 = torch.constant.int 512
    %int3072_7540 = torch.constant.int 3072
    %5378 = torch.prim.ListConstruct %int512_7539, %int3072_7540 : (!torch.int, !torch.int) -> !torch.list<int>
    %5379 = torch.aten.view %5377, %5378 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Load the [9216,3072] weight and transpose dims 0 and 1 to get [3072,9216].
    %__auto.sampler.double_blocks.15.txt_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.15.txt_attn.qkv.weight : tensor<9216x3072xf16>
    %5380 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.txt_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_7541 = torch.constant.int 0
    %int1_7542 = torch.constant.int 1
    %5381 = torch.aten.transpose.int %5380, %int0_7541, %int1_7542 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.15.txt_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.15.txt_attn.qkv.bias : tensor<9216xf16>
    %5382 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.txt_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Widen bias, activation and transposed weight to f32.
    %int6_7543 = torch.constant.int 6
    %5383 = torch.prims.convert_element_type %5382, %int6_7543 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_7544 = torch.constant.int 6
    %5384 = torch.prims.convert_element_type %5379, %int6_7544 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_7545 = torch.constant.int 6
    %5385 = torch.prims.convert_element_type %5381, %int6_7545 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %5386 = torch.aten.mm %5384, %5385 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[512,9216],f32>
    // Product and bias are each multiplied by the constant 1, then summed.
    %int1_7546 = torch.constant.int 1
    %5387 = torch.aten.mul.Scalar %5386, %int1_7546 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int1_7547 = torch.constant.int 1
    %5388 = torch.aten.mul.Scalar %5383, %int1_7547 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_7548 = torch.constant.int 1
    %5389 = torch.aten.add.Tensor %5387, %5388, %int1_7548 : !torch.vtensor<[512,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int5_7549 = torch.constant.int 5
    %5390 = torch.prims.convert_element_type %5389, %int5_7549 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f16>
    // Restore the leading unit dimension: [512,9216] -> [1,512,9216].
    %int1_7550 = torch.constant.int 1
    %int512_7551 = torch.constant.int 512
    %int9216_7552 = torch.constant.int 9216
    %5391 = torch.prim.ListConstruct %int1_7550, %int512_7551, %int9216_7552 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5392 = torch.aten.view %5390, %5391 : !torch.vtensor<[512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,9216],f16>
    // Trace point "txt_qkv": convert %5392 to a builtin tensor, cast it to a
    // fully dynamic shape, read its three dimensions and pass tensor plus dims
    // to flow.tensor.trace. The value is then cast back to the static
    // 1x512x9216 shape and re-wrapped as the torch tensor %5394, which the
    // following ops consume.
    %5393 = torch_c.to_builtin_tensor %5392 : !torch.vtensor<[1,512,9216],f16> -> tensor<1x512x9216xf16>
    %cast_7553 = tensor.cast %5393 : tensor<1x512x9216xf16> to tensor<?x?x?xf16>
    %c0_7554 = arith.constant 0 : index
    %dim_7555 = tensor.dim %cast_7553, %c0_7554 : tensor<?x?x?xf16>
    %c1_7556 = arith.constant 1 : index
    %dim_7557 = tensor.dim %cast_7553, %c1_7556 : tensor<?x?x?xf16>
    %c2_7558 = arith.constant 2 : index
    %dim_7559 = tensor.dim %cast_7553, %c2_7558 : tensor<?x?x?xf16>
    flow.tensor.trace "txt_qkv" = [%cast_7553 : tensor<?x?x?xf16>{%dim_7555, %dim_7557, %dim_7559}]
    %cast_7560 = tensor.cast %cast_7553 : tensor<?x?x?xf16> to tensor<1x512x9216xf16>
    %5394 = torch_c.from_builtin_tensor %cast_7560 : tensor<1x512x9216xf16> -> !torch.vtensor<[1,512,9216],f16>
    // Extract slice 0 of the txt qkv tensor and normalize it.
    // View [1,512,9216] as [1,512,3,24,128], permute with order (2,0,3,1,4)
    // to [3,1,24,512,128] and select index 0 on dim 0.
    // NOTE(review): the (3,24,128) factors are presumably (q/k/v, heads,
    // head_dim); the use of query_norm.scale below suggests index 0 is the
    // query -- confirm against the model definition.
    %int1_7561 = torch.constant.int 1
    %int512_7562 = torch.constant.int 512
    %int3_7563 = torch.constant.int 3
    %int24_7564 = torch.constant.int 24
    %int128_7565 = torch.constant.int 128
    %5395 = torch.prim.ListConstruct %int1_7561, %int512_7562, %int3_7563, %int24_7564, %int128_7565 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5396 = torch.aten.view %5394, %5395 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_7566 = torch.constant.int 2
    %int0_7567 = torch.constant.int 0
    %int3_7568 = torch.constant.int 3
    %int1_7569 = torch.constant.int 1
    %int4_7570 = torch.constant.int 4
    %5397 = torch.prim.ListConstruct %int2_7566, %int0_7567, %int3_7568, %int1_7569, %int4_7570 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5398 = torch.aten.permute %5396, %5397 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    %int0_7571 = torch.constant.int 0
    %int0_7572 = torch.constant.int 0
    %5399 = torch.aten.select.int %5398, %int0_7571, %int0_7572 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // RMS normalization over the last dim, computed in f32:
    //   x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps), eps ~ 1e-6.
    %int6_7573 = torch.constant.int 6
    %5400 = torch.prims.convert_element_type %5399, %int6_7573 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_7574 = torch.constant.int 2
    %5401 = torch.aten.pow.Tensor_Scalar %5400, %int2_7574 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_7575 = torch.constant.int -1
    %5402 = torch.prim.ListConstruct %int-1_7575 : (!torch.int) -> !torch.list<int>
    %true_7576 = torch.constant.bool true
    %none_7577 = torch.constant.none
    %5403 = torch.aten.mean.dim %5401, %5402, %true_7576, %none_7577 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_7578 = torch.constant.float 9.9999999999999995E-7
    %int1_7579 = torch.constant.int 1
    %5404 = torch.aten.add.Scalar %5403, %float9.999990e-07_7578, %int1_7579 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %5405 = torch.aten.rsqrt %5404 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %5406 = torch.aten.mul.Tensor %5400, %5405 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    // Narrow to f16, then multiply by the [128] query_norm.scale parameter.
    %int5_7580 = torch.constant.int 5
    %5407 = torch.prims.convert_element_type %5406, %int5_7580 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.15.txt_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.15.txt_attn.norm.query_norm.scale : tensor<128xf16>
    %5408 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.txt_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %5409 = torch.aten.mul.Tensor %5407, %5408 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // Extract slice 1 of the txt qkv tensor %5394 and normalize it.
    // The same view ([1,512,3,24,128]) and permute (2,0,3,1,4) as used for
    // slice 0 are rebuilt here from %5394, followed by select index 1 on dim 0.
    // NOTE(review): the use of key_norm.scale below suggests index 1 is the
    // key -- confirm against the model definition.
    %int1_7581 = torch.constant.int 1
    %int512_7582 = torch.constant.int 512
    %int3_7583 = torch.constant.int 3
    %int24_7584 = torch.constant.int 24
    %int128_7585 = torch.constant.int 128
    %5410 = torch.prim.ListConstruct %int1_7581, %int512_7582, %int3_7583, %int24_7584, %int128_7585 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5411 = torch.aten.view %5394, %5410 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_7586 = torch.constant.int 2
    %int0_7587 = torch.constant.int 0
    %int3_7588 = torch.constant.int 3
    %int1_7589 = torch.constant.int 1
    %int4_7590 = torch.constant.int 4
    %5412 = torch.prim.ListConstruct %int2_7586, %int0_7587, %int3_7588, %int1_7589, %int4_7590 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5413 = torch.aten.permute %5411, %5412 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    %int0_7591 = torch.constant.int 0
    %int1_7592 = torch.constant.int 1
    %5414 = torch.aten.select.int %5413, %int0_7591, %int1_7592 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // RMS normalization over the last dim, computed in f32:
    //   x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps), eps ~ 1e-6.
    %int6_7593 = torch.constant.int 6
    %5415 = torch.prims.convert_element_type %5414, %int6_7593 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_7594 = torch.constant.int 2
    %5416 = torch.aten.pow.Tensor_Scalar %5415, %int2_7594 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_7595 = torch.constant.int -1
    %5417 = torch.prim.ListConstruct %int-1_7595 : (!torch.int) -> !torch.list<int>
    %true_7596 = torch.constant.bool true
    %none_7597 = torch.constant.none
    %5418 = torch.aten.mean.dim %5416, %5417, %true_7596, %none_7597 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_7598 = torch.constant.float 9.9999999999999995E-7
    %int1_7599 = torch.constant.int 1
    %5419 = torch.aten.add.Scalar %5418, %float9.999990e-07_7598, %int1_7599 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %5420 = torch.aten.rsqrt %5419 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %5421 = torch.aten.mul.Tensor %5415, %5420 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    // Narrow to f16, then multiply by the [128] key_norm.scale parameter.
    %int5_7600 = torch.constant.int 5
    %5422 = torch.prims.convert_element_type %5421, %int5_7600 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.15.txt_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.15.txt_attn.norm.key_norm.scale : tensor<128xf16>
    %5423 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.txt_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %5424 = torch.aten.mul.Tensor %5422, %5423 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // Join the txt and img streams along dim 2, txt first:
    //   %5428 = cat(%5425 [1,24,512,128], %5366 [1,24,4096,128]) -> [1,24,4608,128]
    //   %5430 = cat(%5426 [1,24,512,128], %5367 [1,24,4096,128]) -> [1,24,4608,128]
    // %5425 / %5426 are f16 -> f16 converts (dtype code 5) of the txt tensors
    // scaled by query_norm.scale (%5409) and key_norm.scale (%5424).
    %int5_7601 = torch.constant.int 5
    %5425 = torch.prims.convert_element_type %5409, %int5_7601 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int5_7602 = torch.constant.int 5
    %5426 = torch.prims.convert_element_type %5424, %int5_7602 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %5427 = torch.prim.ListConstruct %5425, %5366 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_7603 = torch.constant.int 2
    %5428 = torch.aten.cat %5427, %int2_7603 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %5429 = torch.prim.ListConstruct %5426, %5367 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_7604 = torch.constant.int 2
    %5430 = torch.aten.cat %5429, %int2_7604 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Extract slice 2 of both qkv tensors and join them along dim 2, txt first.
    // Unlike slices 0 and 1, no normalization or scale is applied to these.
    // NOTE(review): index 2 is presumably the attention value -- confirm.
    //
    // txt: view %5394 as [1,512,3,24,128], permute (2,0,3,1,4), select index 2.
    %int1_7605 = torch.constant.int 1
    %int512_7606 = torch.constant.int 512
    %int3_7607 = torch.constant.int 3
    %int24_7608 = torch.constant.int 24
    %int128_7609 = torch.constant.int 128
    %5431 = torch.prim.ListConstruct %int1_7605, %int512_7606, %int3_7607, %int24_7608, %int128_7609 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5432 = torch.aten.view %5394, %5431 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_7610 = torch.constant.int 2
    %int0_7611 = torch.constant.int 0
    %int3_7612 = torch.constant.int 3
    %int1_7613 = torch.constant.int 1
    %int4_7614 = torch.constant.int 4
    %5433 = torch.prim.ListConstruct %int2_7610, %int0_7611, %int3_7612, %int1_7613, %int4_7614 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5434 = torch.aten.permute %5432, %5433 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    %int0_7615 = torch.constant.int 0
    %int2_7616 = torch.constant.int 2
    %5435 = torch.aten.select.int %5434, %int0_7615, %int2_7616 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // img: view %5335 as [1,4096,3,24,128], permute (2,0,3,1,4), select index 2.
    %int1_7617 = torch.constant.int 1
    %int4096_7618 = torch.constant.int 4096
    %int3_7619 = torch.constant.int 3
    %int24_7620 = torch.constant.int 24
    %int128_7621 = torch.constant.int 128
    %5436 = torch.prim.ListConstruct %int1_7617, %int4096_7618, %int3_7619, %int24_7620, %int128_7621 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5437 = torch.aten.view %5335, %5436 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_7622 = torch.constant.int 2
    %int0_7623 = torch.constant.int 0
    %int3_7624 = torch.constant.int 3
    %int1_7625 = torch.constant.int 1
    %int4_7626 = torch.constant.int 4
    %5438 = torch.prim.ListConstruct %int2_7622, %int0_7623, %int3_7624, %int1_7625, %int4_7626 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5439 = torch.aten.permute %5437, %5438 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    %int0_7627 = torch.constant.int 0
    %int2_7628 = torch.constant.int 2
    %5440 = torch.aten.select.int %5439, %int0_7627, %int2_7628 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Concatenate [1,24,512,128] and [1,24,4096,128] -> [1,24,4608,128].
    %5441 = torch.prim.ListConstruct %5435, %5440 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_7629 = torch.constant.int 2
    %5442 = torch.aten.cat %5441, %int2_7629 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Emit trace points for the three attention operands under the keys "q", "k" and "v".
    // Each operand follows the same round trip:
    //   torch vtensor -> builtin tensor -> cast to fully dynamic shape -> query the 4 dims
    //   -> flow.tensor.trace -> cast back to the static shape -> torch vtensor.
    // Downstream code consumes the round-tripped values (%5444, %5446, %5448), not the
    // original concatenation results.
    // --- "q": %5428 -> %5444 ---
    %5443 = torch_c.to_builtin_tensor %5428 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_7630 = tensor.cast %5443 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_7631 = arith.constant 0 : index
    %dim_7632 = tensor.dim %cast_7630, %c0_7631 : tensor<?x?x?x?xf16>
    %c1_7633 = arith.constant 1 : index
    %dim_7634 = tensor.dim %cast_7630, %c1_7633 : tensor<?x?x?x?xf16>
    %c2_7635 = arith.constant 2 : index
    %dim_7636 = tensor.dim %cast_7630, %c2_7635 : tensor<?x?x?x?xf16>
    %c3_7637 = arith.constant 3 : index
    %dim_7638 = tensor.dim %cast_7630, %c3_7637 : tensor<?x?x?x?xf16>
    flow.tensor.trace "q" = [%cast_7630 : tensor<?x?x?x?xf16>{%dim_7632, %dim_7634, %dim_7636, %dim_7638}]
    %cast_7639 = tensor.cast %cast_7630 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %5444 = torch_c.from_builtin_tensor %cast_7639 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // --- "k": %5430 -> %5446 ---
    %5445 = torch_c.to_builtin_tensor %5430 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_7640 = tensor.cast %5445 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_7641 = arith.constant 0 : index
    %dim_7642 = tensor.dim %cast_7640, %c0_7641 : tensor<?x?x?x?xf16>
    %c1_7643 = arith.constant 1 : index
    %dim_7644 = tensor.dim %cast_7640, %c1_7643 : tensor<?x?x?x?xf16>
    %c2_7645 = arith.constant 2 : index
    %dim_7646 = tensor.dim %cast_7640, %c2_7645 : tensor<?x?x?x?xf16>
    %c3_7647 = arith.constant 3 : index
    %dim_7648 = tensor.dim %cast_7640, %c3_7647 : tensor<?x?x?x?xf16>
    flow.tensor.trace "k" = [%cast_7640 : tensor<?x?x?x?xf16>{%dim_7642, %dim_7644, %dim_7646, %dim_7648}]
    %cast_7649 = tensor.cast %cast_7640 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %5446 = torch_c.from_builtin_tensor %cast_7649 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // --- "v": %5442 -> %5448 ---
    %5447 = torch_c.to_builtin_tensor %5442 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_7650 = tensor.cast %5447 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_7651 = arith.constant 0 : index
    %dim_7652 = tensor.dim %cast_7650, %c0_7651 : tensor<?x?x?x?xf16>
    %c1_7653 = arith.constant 1 : index
    %dim_7654 = tensor.dim %cast_7650, %c1_7653 : tensor<?x?x?x?xf16>
    %c2_7655 = arith.constant 2 : index
    %dim_7656 = tensor.dim %cast_7650, %c2_7655 : tensor<?x?x?x?xf16>
    %c3_7657 = arith.constant 3 : index
    %dim_7658 = tensor.dim %cast_7650, %c3_7657 : tensor<?x?x?x?xf16>
    flow.tensor.trace "v" = [%cast_7650 : tensor<?x?x?x?xf16>{%dim_7652, %dim_7654, %dim_7656, %dim_7658}]
    %cast_7659 = tensor.cast %cast_7650 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %5448 = torch_c.from_builtin_tensor %cast_7659 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Transform q (%5444) and k (%5446) with the per-position table %211
    // ([1,1,4608,64,2,2], f32). For each operand x:
    //   x32 = x cast to f32, viewed as [1,24,4608,64,1,2]   (128 channels -> 64 pairs)
    //   out = %211[..., 0] * x32[..., 0] + %211[..., 1] * x32[..., 1]
    // where [..., i] is a select on dim 5. The size-1 axes broadcast, giving
    // [1,24,4608,64,2], which is viewed back to [1,24,4608,128] and cast to f16.
    // NOTE(review): this has the shape of a rotary position-embedding application
    // (a 2x2 matrix per channel pair and position); %211 is defined outside this
    // excerpt - confirm.
    // --- q: cast to f32 and split the last axis into 64 pairs ---
    %int6_7660 = torch.constant.int 6
    %5449 = torch.prims.convert_element_type %5444, %int6_7660 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_7661 = torch.constant.int 1
    %int24_7662 = torch.constant.int 24
    %int4608_7663 = torch.constant.int 4608
    %int-1_7664 = torch.constant.int -1
    %int1_7665 = torch.constant.int 1
    %int2_7666 = torch.constant.int 2
    // The -1 entry is the inferred dimension; the result type shows it resolves to 64.
    %5450 = torch.prim.ListConstruct %int1_7661, %int24_7662, %int4608_7663, %int-1_7664, %int1_7665, %int2_7666 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5451 = torch.aten.view %5449, %5450 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // --- k: same cast and view ---
    %int6_7667 = torch.constant.int 6
    %5452 = torch.prims.convert_element_type %5446, %int6_7667 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_7668 = torch.constant.int 1
    %int24_7669 = torch.constant.int 24
    %int4608_7670 = torch.constant.int 4608
    %int-1_7671 = torch.constant.int -1
    %int1_7672 = torch.constant.int 1
    %int2_7673 = torch.constant.int 2
    %5453 = torch.prim.ListConstruct %int1_7668, %int24_7669, %int4608_7670, %int-1_7671, %int1_7672, %int2_7673 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5454 = torch.aten.view %5452, %5453 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // --- q: first term, %211[..., 0] * q[..., 0] ---
    %int5_7674 = torch.constant.int 5
    %int0_7675 = torch.constant.int 0
    %5455 = torch.aten.select.int %211, %int5_7674, %int0_7675 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_7676 = torch.constant.int 5
    %int0_7677 = torch.constant.int 0
    %5456 = torch.aten.select.int %5451, %int5_7676, %int0_7677 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %5457 = torch.aten.mul.Tensor %5455, %5456 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // --- q: second term, %211[..., 1] * q[..., 1] ---
    %int5_7678 = torch.constant.int 5
    %int1_7679 = torch.constant.int 1
    %5458 = torch.aten.select.int %211, %int5_7678, %int1_7679 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_7680 = torch.constant.int 5
    %int1_7681 = torch.constant.int 1
    %5459 = torch.aten.select.int %5451, %int5_7680, %int1_7681 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %5460 = torch.aten.mul.Tensor %5458, %5459 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two terms (the trailing integer operand is the add's alpha, here 1).
    %int1_7682 = torch.constant.int 1
    %5461 = torch.aten.add.Tensor %5457, %5460, %int1_7682 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // --- k: first term, %211[..., 0] * k[..., 0] ---
    %int5_7683 = torch.constant.int 5
    %int0_7684 = torch.constant.int 0
    %5462 = torch.aten.select.int %211, %int5_7683, %int0_7684 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_7685 = torch.constant.int 5
    %int0_7686 = torch.constant.int 0
    %5463 = torch.aten.select.int %5454, %int5_7685, %int0_7686 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %5464 = torch.aten.mul.Tensor %5462, %5463 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // --- k: second term, %211[..., 1] * k[..., 1] ---
    %int5_7687 = torch.constant.int 5
    %int1_7688 = torch.constant.int 1
    %5465 = torch.aten.select.int %211, %int5_7687, %int1_7688 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_7689 = torch.constant.int 5
    %int1_7690 = torch.constant.int 1
    %5466 = torch.aten.select.int %5454, %int5_7689, %int1_7690 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %5467 = torch.aten.mul.Tensor %5465, %5466 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_7691 = torch.constant.int 1
    %5468 = torch.aten.add.Tensor %5464, %5467, %int1_7691 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // --- q: merge the (64,2) pair axes back into 128 channels and return to f16 ---
    %int1_7692 = torch.constant.int 1
    %int24_7693 = torch.constant.int 24
    %int4608_7694 = torch.constant.int 4608
    %int128_7695 = torch.constant.int 128
    %5469 = torch.prim.ListConstruct %int1_7692, %int24_7693, %int4608_7694, %int128_7695 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5470 = torch.aten.view %5461, %5469 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_7696 = torch.constant.int 5
    %5471 = torch.prims.convert_element_type %5470, %int5_7696 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // --- k: same merge and cast ---
    %int1_7697 = torch.constant.int 1
    %int24_7698 = torch.constant.int 24
    %int4608_7699 = torch.constant.int 4608
    %int128_7700 = torch.constant.int 128
    %5472 = torch.prim.ListConstruct %int1_7697, %int24_7698, %int4608_7699, %int128_7700 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5473 = torch.aten.view %5468, %5472 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_7701 = torch.constant.int 5
    %5474 = torch.prims.convert_element_type %5473, %int5_7701 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Joint attention over all 4608 rows, then fold the 24 heads back into one 3072-wide axis.
    // Operands of the SDPA operator, in order: transformed q (%5471), transformed k (%5474),
    // v (%5448), float 0.0, bool false, none, none.
    // NOTE(review): per the ATen schema for _scaled_dot_product_flash_attention_for_cpu these
    // trailing operands should be dropout_p, is_causal, attn_mask and scale - confirm against
    // the PyTorch version used for export.
    %float0.000000e00_7702 = torch.constant.float 0.000000e+00
    %false_7703 = torch.constant.bool false
    %none_7704 = torch.constant.none
    %none_7705 = torch.constant.none
    // Two results are produced; only #0 ([1,24,4608,128], f16) is consumed in this excerpt.
    %5475:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%5471, %5474, %5448, %float0.000000e00_7702, %false_7703, %none_7704, %none_7705) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Permute (0,2,1,3): [1,24,4608,128] -> [1,4608,24,128].
    %int0_7706 = torch.constant.int 0
    %int2_7707 = torch.constant.int 2
    %int1_7708 = torch.constant.int 1
    %int3_7709 = torch.constant.int 3
    %5476 = torch.prim.ListConstruct %int0_7706, %int2_7707, %int1_7708, %int3_7709 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5477 = torch.aten.permute %5475#0, %5476 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the two trailing axes: 24 * 128 = 3072.
    %int1_7710 = torch.constant.int 1
    %int4608_7711 = torch.constant.int 4608
    %int3072_7712 = torch.constant.int 3072
    %5478 = torch.prim.ListConstruct %int1_7710, %int4608_7711, %int3072_7712 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5479 = torch.aten.view %5477, %5478 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Split the attention output %5479 ([1,4608,3072]) back into its two row ranges:
    //   %5481 = rows [0, 512)    -> [1,512,3072]   (consumed by txt_attn.proj below)
    //   %5483 = rows [512, 4608) -> [1,4096,3072]  (consumed by img_attn.proj below)
    // slice.Tensor operands are (input, dim, start, end, step); 9223372036854775807 is the
    // largest signed 64-bit integer, used here as an open-ended "to the end" bound.
    // Dim-0 slice from 0 to the end with step 1: the shape is unchanged.
    %int0_7713 = torch.constant.int 0
    %int0_7714 = torch.constant.int 0
    %int9223372036854775807_7715 = torch.constant.int 9223372036854775807
    %int1_7716 = torch.constant.int 1
    %5480 = torch.aten.slice.Tensor %5479, %int0_7713, %int0_7714, %int9223372036854775807_7715, %int1_7716 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Dim-1 slice [0, 512).
    %int1_7717 = torch.constant.int 1
    %int0_7718 = torch.constant.int 0
    %int512_7719 = torch.constant.int 512
    %int1_7720 = torch.constant.int 1
    %5481 = torch.aten.slice.Tensor %5480, %int1_7717, %int0_7718, %int512_7719, %int1_7720 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Same shape-preserving dim-0 slice, taken again from %5479.
    %int0_7721 = torch.constant.int 0
    %int0_7722 = torch.constant.int 0
    %int9223372036854775807_7723 = torch.constant.int 9223372036854775807
    %int1_7724 = torch.constant.int 1
    %5482 = torch.aten.slice.Tensor %5479, %int0_7721, %int0_7722, %int9223372036854775807_7723, %int1_7724 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Dim-1 slice [512, end) -> 4096 rows.
    %int1_7725 = torch.constant.int 1
    %int512_7726 = torch.constant.int 512
    %int9223372036854775807_7727 = torch.constant.int 9223372036854775807
    %int1_7728 = torch.constant.int 1
    %5483 = torch.aten.slice.Tensor %5482, %int1_7725, %int512_7726, %int9223372036854775807_7727, %int1_7728 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Image-stream attention output projection followed by a gated residual add:
    //   y     = f16( f32(x) @ f32(W^T) + f32(b) )   with W, b = double_blocks.15.img_attn.proj
    //   %5500 = %5206 + %5284 * y
    // The matmul and bias add are carried out in f32 and the result is cast back to f16.
    // NOTE(review): %5284 ([1,1,3072], broadcast over rows) is presumably a modulation gate
    // and %5206 the image-stream residual input; both are defined above this excerpt - confirm.
    // Drop the unit batch axis so the projection is a plain 2-D matmul.
    %int4096_7729 = torch.constant.int 4096
    %int3072_7730 = torch.constant.int 3072
    %5484 = torch.prim.ListConstruct %int4096_7729, %int3072_7730 : (!torch.int, !torch.int) -> !torch.list<int>
    %5485 = torch.aten.view %5483, %5484 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.15.img_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.15.img_attn.proj.weight : tensor<3072x3072xf16>
    %5486 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.img_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    // Transpose dims 0 and 1 of the weight so it can be the right-hand matmul operand.
    %int0_7731 = torch.constant.int 0
    %int1_7732 = torch.constant.int 1
    %5487 = torch.aten.transpose.int %5486, %int0_7731, %int1_7732 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.15.img_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.15.img_attn.proj.bias : tensor<3072xf16>
    %5488 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.img_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Promote bias, activations and weight to f32 before the matmul.
    %int6_7733 = torch.constant.int 6
    %5489 = torch.prims.convert_element_type %5488, %int6_7733 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_7734 = torch.constant.int 6
    %5490 = torch.prims.convert_element_type %5485, %int6_7734 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_7735 = torch.constant.int 6
    %5491 = torch.prims.convert_element_type %5487, %int6_7735 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %5492 = torch.aten.mm %5490, %5491 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Both scalings multiply by the integer constant 1, so values are unchanged
    // (presumably left over from an addmm-style alpha/beta decomposition).
    %int1_7736 = torch.constant.int 1
    %5493 = torch.aten.mul.Scalar %5492, %int1_7736 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_7737 = torch.constant.int 1
    %5494 = torch.aten.mul.Scalar %5489, %int1_7737 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Bias add; the [3072] bias broadcasts across the 4096 rows.
    %int1_7738 = torch.constant.int 1
    %5495 = torch.aten.add.Tensor %5493, %5494, %int1_7738 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_7739 = torch.constant.int 5
    %5496 = torch.prims.convert_element_type %5495, %int5_7739 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    // Restore the unit batch axis.
    %int1_7740 = torch.constant.int 1
    %int4096_7741 = torch.constant.int 4096
    %int3072_7742 = torch.constant.int 3072
    %5497 = torch.prim.ListConstruct %int1_7740, %int4096_7741, %int3072_7742 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5498 = torch.aten.view %5496, %5497 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Gate, then residual add (alpha = 1).
    %5499 = torch.aten.mul.Tensor %5284, %5498 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_7743 = torch.constant.int 1
    %5500 = torch.aten.add.Tensor %5206, %5499, %int1_7743 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Normalize %5500 over its last axis (no learned weight or bias is applied in this
    // excerpt), then apply a scale and shift:
    //   n     = (x - mean(x)) * rsqrt(var(x) + 9.9999999999999995E-7)    computed in f32
    //   %5510 = (1 + %5286) * f16(n) + %5285
    // NOTE(review): %5286 and %5285 ([1,1,3072]) are presumably the modulation scale and
    // shift for the image MLP branch; both are defined above this excerpt - confirm.
    // 1 + %5286.
    %int1_7744 = torch.constant.int 1
    %int1_7745 = torch.constant.int 1
    %5501 = torch.aten.add.Scalar %5286, %int1_7744, %int1_7745 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int6_7746 = torch.constant.int 6
    %5502 = torch.prims.convert_element_type %5500, %int6_7746 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // Variance and mean over dim 2 with correction = 0 and keepdim = true.
    // result0 is used as the variance and result1 as the mean below.
    %int2_7747 = torch.constant.int 2
    %5503 = torch.prim.ListConstruct %int2_7747 : (!torch.int) -> !torch.list<int>
    %int0_7748 = torch.constant.int 0
    %true_7749 = torch.constant.bool true
    %result0_7750, %result1_7751 = torch.aten.var_mean.correction %5502, %5503, %int0_7748, %true_7749 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // Epsilon (approximately 1e-6) added to the variance before the reciprocal square root.
    %float9.999990e-07_7752 = torch.constant.float 9.9999999999999995E-7
    %int1_7753 = torch.constant.int 1
    %5504 = torch.aten.add.Scalar %result0_7750, %float9.999990e-07_7752, %int1_7753 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %5505 = torch.aten.rsqrt %5504 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // The f16 tensor %5500 minus the f32 mean; the result type is f32.
    %int1_7754 = torch.constant.int 1
    %5506 = torch.aten.sub.Tensor %5500, %result1_7751, %int1_7754 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %5507 = torch.aten.mul.Tensor %5506, %5505 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    %int5_7755 = torch.constant.int 5
    %5508 = torch.prims.convert_element_type %5507, %int5_7755 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Scale by (1 + %5286), then shift by %5285.
    %5509 = torch.aten.mul.Tensor %5501, %5508 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_7756 = torch.constant.int 1
    %5510 = torch.aten.add.Tensor %5509, %5285, %int1_7756 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // First image-stream MLP layer (3072 -> 12288) followed by a tanh-approximated GELU:
    //   %5526 = gelu_tanh( f16( f32(x) @ f32(W^T) + f32(b) ) )   with W, b = double_blocks.15.img_mlp.0
    // The matmul and bias add are carried out in f32 and the result is cast back to f16.
    // Drop the unit batch axis for the 2-D matmul.
    %int4096_7757 = torch.constant.int 4096
    %int3072_7758 = torch.constant.int 3072
    %5511 = torch.prim.ListConstruct %int4096_7757, %int3072_7758 : (!torch.int, !torch.int) -> !torch.list<int>
    %5512 = torch.aten.view %5510, %5511 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.15.img_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.15.img_mlp.0.weight : tensor<12288x3072xf16>
    %5513 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.img_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    // [12288,3072] -> [3072,12288] so the weight can be the right-hand matmul operand.
    %int0_7759 = torch.constant.int 0
    %int1_7760 = torch.constant.int 1
    %5514 = torch.aten.transpose.int %5513, %int0_7759, %int1_7760 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.15.img_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.15.img_mlp.0.bias : tensor<12288xf16>
    %5515 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.img_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Promote bias, activations and weight to f32.
    %int6_7761 = torch.constant.int 6
    %5516 = torch.prims.convert_element_type %5515, %int6_7761 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_7762 = torch.constant.int 6
    %5517 = torch.prims.convert_element_type %5512, %int6_7762 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_7763 = torch.constant.int 6
    %5518 = torch.prims.convert_element_type %5514, %int6_7763 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %5519 = torch.aten.mm %5517, %5518 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[4096,12288],f32>
    // Multiplications by the integer constant 1: values are unchanged.
    %int1_7764 = torch.constant.int 1
    %5520 = torch.aten.mul.Scalar %5519, %int1_7764 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int1_7765 = torch.constant.int 1
    %5521 = torch.aten.mul.Scalar %5516, %int1_7765 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    // Bias add, broadcast across rows.
    %int1_7766 = torch.constant.int 1
    %5522 = torch.aten.add.Tensor %5520, %5521, %int1_7766 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int5_7767 = torch.constant.int 5
    %5523 = torch.prims.convert_element_type %5522, %int5_7767 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f16>
    // Restore the unit batch axis.
    %int1_7768 = torch.constant.int 1
    %int4096_7769 = torch.constant.int 4096
    %int12288_7770 = torch.constant.int 12288
    %5524 = torch.prim.ListConstruct %int1_7768, %int4096_7769, %int12288_7770 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5525 = torch.aten.view %5523, %5524 : !torch.vtensor<[4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,12288],f16>
    // GELU with approximate = "tanh", evaluated on the f16 tensor.
    %str_7771 = torch.constant.str "tanh"
    %5526 = torch.aten.gelu %5525, %str_7771 : !torch.vtensor<[1,4096,12288],f16>, !torch.str -> !torch.vtensor<[1,4096,12288],f16>
    // Second image-stream MLP layer (12288 -> 3072) followed by a gated residual add:
    //   y     = f16( f32(h) @ f32(W^T) + f32(b) )   with W, b = double_blocks.15.img_mlp.2
    //   %5543 = %5500 + %5287 * y
    // %5500 is the image-stream value produced by the attention residual add earlier in
    // this function.
    // NOTE(review): %5287 ([1,1,3072]) is presumably the MLP modulation gate; it is defined
    // above this excerpt - confirm.
    // Drop the unit batch axis for the 2-D matmul.
    %int4096_7772 = torch.constant.int 4096
    %int12288_7773 = torch.constant.int 12288
    %5527 = torch.prim.ListConstruct %int4096_7772, %int12288_7773 : (!torch.int, !torch.int) -> !torch.list<int>
    %5528 = torch.aten.view %5526, %5527 : !torch.vtensor<[1,4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[4096,12288],f16>
    %__auto.sampler.double_blocks.15.img_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.15.img_mlp.2.weight : tensor<3072x12288xf16>
    %5529 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.img_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    // [3072,12288] -> [12288,3072] so the weight can be the right-hand matmul operand.
    %int0_7774 = torch.constant.int 0
    %int1_7775 = torch.constant.int 1
    %5530 = torch.aten.transpose.int %5529, %int0_7774, %int1_7775 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.15.img_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.15.img_mlp.2.bias : tensor<3072xf16>
    %5531 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.img_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Promote bias, activations and weight to f32.
    %int6_7776 = torch.constant.int 6
    %5532 = torch.prims.convert_element_type %5531, %int6_7776 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_7777 = torch.constant.int 6
    %5533 = torch.prims.convert_element_type %5528, %int6_7777 : !torch.vtensor<[4096,12288],f16>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int6_7778 = torch.constant.int 6
    %5534 = torch.prims.convert_element_type %5530, %int6_7778 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %5535 = torch.aten.mm %5533, %5534 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Multiplications by the integer constant 1: values are unchanged.
    %int1_7779 = torch.constant.int 1
    %5536 = torch.aten.mul.Scalar %5535, %int1_7779 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_7780 = torch.constant.int 1
    %5537 = torch.aten.mul.Scalar %5532, %int1_7780 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Bias add, broadcast across rows.
    %int1_7781 = torch.constant.int 1
    %5538 = torch.aten.add.Tensor %5536, %5537, %int1_7781 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_7782 = torch.constant.int 5
    %5539 = torch.prims.convert_element_type %5538, %int5_7782 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    // Restore the unit batch axis.
    %int1_7783 = torch.constant.int 1
    %int4096_7784 = torch.constant.int 4096
    %int3072_7785 = torch.constant.int 3072
    %5540 = torch.prim.ListConstruct %int1_7783, %int4096_7784, %int3072_7785 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5541 = torch.aten.view %5539, %5540 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Gate, then residual add (alpha = 1).
    %5542 = torch.aten.mul.Tensor %5287, %5541 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_7786 = torch.constant.int 1
    %5543 = torch.aten.add.Tensor %5500, %5542, %int1_7786 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Text-stream attention output projection followed by a gated residual add:
    //   y     = f16( f32(x) @ f32(W^T) + f32(b) )   with W, b = double_blocks.15.txt_attn.proj
    //   %5560 = %5266 + %5305 * y
    // x is %5481, the first 512 rows of the joint attention output.
    // NOTE(review): %5305 ([1,1,3072]) is presumably the text attention gate and %5266 the
    // text-stream residual input; both are defined above this excerpt - confirm.
    // Drop the unit batch axis for the 2-D matmul.
    %int512_7787 = torch.constant.int 512
    %int3072_7788 = torch.constant.int 3072
    %5544 = torch.prim.ListConstruct %int512_7787, %int3072_7788 : (!torch.int, !torch.int) -> !torch.list<int>
    %5545 = torch.aten.view %5481, %5544 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.15.txt_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.15.txt_attn.proj.weight : tensor<3072x3072xf16>
    %5546 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.txt_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    // Transpose dims 0 and 1 of the weight so it can be the right-hand matmul operand.
    %int0_7789 = torch.constant.int 0
    %int1_7790 = torch.constant.int 1
    %5547 = torch.aten.transpose.int %5546, %int0_7789, %int1_7790 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.15.txt_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.15.txt_attn.proj.bias : tensor<3072xf16>
    %5548 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.txt_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Promote bias, activations and weight to f32.
    %int6_7791 = torch.constant.int 6
    %5549 = torch.prims.convert_element_type %5548, %int6_7791 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_7792 = torch.constant.int 6
    %5550 = torch.prims.convert_element_type %5545, %int6_7792 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_7793 = torch.constant.int 6
    %5551 = torch.prims.convert_element_type %5547, %int6_7793 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %5552 = torch.aten.mm %5550, %5551 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Multiplications by the integer constant 1: values are unchanged.
    %int1_7794 = torch.constant.int 1
    %5553 = torch.aten.mul.Scalar %5552, %int1_7794 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_7795 = torch.constant.int 1
    %5554 = torch.aten.mul.Scalar %5549, %int1_7795 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Bias add, broadcast across rows.
    %int1_7796 = torch.constant.int 1
    %5555 = torch.aten.add.Tensor %5553, %5554, %int1_7796 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_7797 = torch.constant.int 5
    %5556 = torch.prims.convert_element_type %5555, %int5_7797 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    // Restore the unit batch axis.
    %int1_7798 = torch.constant.int 1
    %int512_7799 = torch.constant.int 512
    %int3072_7800 = torch.constant.int 3072
    %5557 = torch.prim.ListConstruct %int1_7798, %int512_7799, %int3072_7800 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5558 = torch.aten.view %5556, %5557 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Gate, then residual add (alpha = 1).
    %5559 = torch.aten.mul.Tensor %5305, %5558 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_7801 = torch.constant.int 1
    %5560 = torch.aten.add.Tensor %5266, %5559, %int1_7801 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Normalize %5560 over its last axis (no learned weight or bias is applied in this
    // excerpt), then apply a scale and shift:
    //   n     = (x - mean(x)) * rsqrt(var(x) + 9.9999999999999995E-7)    computed in f32
    //   %5570 = (1 + %5307) * f16(n) + %5306
    // NOTE(review): %5307 and %5306 ([1,1,3072]) are presumably the modulation scale and
    // shift for the text MLP branch; both are defined above this excerpt - confirm.
    // 1 + %5307.
    %int1_7802 = torch.constant.int 1
    %int1_7803 = torch.constant.int 1
    %5561 = torch.aten.add.Scalar %5307, %int1_7802, %int1_7803 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int6_7804 = torch.constant.int 6
    %5562 = torch.prims.convert_element_type %5560, %int6_7804 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // Variance and mean over dim 2 with correction = 0 and keepdim = true.
    // result0 is used as the variance and result1 as the mean below.
    %int2_7805 = torch.constant.int 2
    %5563 = torch.prim.ListConstruct %int2_7805 : (!torch.int) -> !torch.list<int>
    %int0_7806 = torch.constant.int 0
    %true_7807 = torch.constant.bool true
    %result0_7808, %result1_7809 = torch.aten.var_mean.correction %5562, %5563, %int0_7806, %true_7807 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // Epsilon (approximately 1e-6) added to the variance before the reciprocal square root.
    %float9.999990e-07_7810 = torch.constant.float 9.9999999999999995E-7
    %int1_7811 = torch.constant.int 1
    %5564 = torch.aten.add.Scalar %result0_7808, %float9.999990e-07_7810, %int1_7811 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %5565 = torch.aten.rsqrt %5564 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // The f16 tensor %5560 minus the f32 mean; the result type is f32.
    %int1_7812 = torch.constant.int 1
    %5566 = torch.aten.sub.Tensor %5560, %result1_7809, %int1_7812 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %5567 = torch.aten.mul.Tensor %5566, %5565 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_7813 = torch.constant.int 5
    %5568 = torch.prims.convert_element_type %5567, %int5_7813 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Scale by (1 + %5307), then shift by %5306.
    %5569 = torch.aten.mul.Tensor %5561, %5568 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_7814 = torch.constant.int 1
    %5570 = torch.aten.add.Tensor %5569, %5306, %int1_7814 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // First text-stream MLP layer (3072 -> 12288) followed by a tanh-approximated GELU:
    //   %5586 = gelu_tanh( f16( f32(x) @ f32(W^T) + f32(b) ) )   with W, b = double_blocks.15.txt_mlp.0
    // The matmul and bias add are carried out in f32 and the result is cast back to f16.
    // The activation is finally flattened to [512,12288] (%5588) for whatever consumes it
    // after this excerpt.
    // Drop the unit batch axis for the 2-D matmul.
    %int512_7815 = torch.constant.int 512
    %int3072_7816 = torch.constant.int 3072
    %5571 = torch.prim.ListConstruct %int512_7815, %int3072_7816 : (!torch.int, !torch.int) -> !torch.list<int>
    %5572 = torch.aten.view %5570, %5571 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.15.txt_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.15.txt_mlp.0.weight : tensor<12288x3072xf16>
    %5573 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.txt_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    // [12288,3072] -> [3072,12288] so the weight can be the right-hand matmul operand.
    %int0_7817 = torch.constant.int 0
    %int1_7818 = torch.constant.int 1
    %5574 = torch.aten.transpose.int %5573, %int0_7817, %int1_7818 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.15.txt_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.15.txt_mlp.0.bias : tensor<12288xf16>
    %5575 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.txt_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Promote bias, activations and weight to f32.
    %int6_7819 = torch.constant.int 6
    %5576 = torch.prims.convert_element_type %5575, %int6_7819 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_7820 = torch.constant.int 6
    %5577 = torch.prims.convert_element_type %5572, %int6_7820 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_7821 = torch.constant.int 6
    %5578 = torch.prims.convert_element_type %5574, %int6_7821 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %5579 = torch.aten.mm %5577, %5578 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[512,12288],f32>
    // Multiplications by the integer constant 1: values are unchanged.
    %int1_7822 = torch.constant.int 1
    %5580 = torch.aten.mul.Scalar %5579, %int1_7822 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int1_7823 = torch.constant.int 1
    %5581 = torch.aten.mul.Scalar %5576, %int1_7823 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    // Bias add, broadcast across rows.
    %int1_7824 = torch.constant.int 1
    %5582 = torch.aten.add.Tensor %5580, %5581, %int1_7824 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int5_7825 = torch.constant.int 5
    %5583 = torch.prims.convert_element_type %5582, %int5_7825 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f16>
    // Restore the unit batch axis.
    %int1_7826 = torch.constant.int 1
    %int512_7827 = torch.constant.int 512
    %int12288_7828 = torch.constant.int 12288
    %5584 = torch.prim.ListConstruct %int1_7826, %int512_7827, %int12288_7828 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5585 = torch.aten.view %5583, %5584 : !torch.vtensor<[512,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,512,12288],f16>
    // GELU with approximate = "tanh", evaluated on the f16 tensor.
    %str_7829 = torch.constant.str "tanh"
    %5586 = torch.aten.gelu %5585, %str_7829 : !torch.vtensor<[1,512,12288],f16>, !torch.str -> !torch.vtensor<[1,512,12288],f16>
    // Flatten back to 2-D.
    %int512_7830 = torch.constant.int 512
    %int12288_7831 = torch.constant.int 12288
    %5587 = torch.prim.ListConstruct %int512_7830, %int12288_7831 : (!torch.int, !torch.int) -> !torch.list<int>
    %5588 = torch.aten.view %5586, %5587 : !torch.vtensor<[1,512,12288],f16>, !torch.list<int> -> !torch.vtensor<[512,12288],f16>
    %__auto.sampler.double_blocks.15.txt_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.15.txt_mlp.2.weight : tensor<3072x12288xf16>
    %5589 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.txt_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_7832 = torch.constant.int 0
    %int1_7833 = torch.constant.int 1
    %5590 = torch.aten.transpose.int %5589, %int0_7832, %int1_7833 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.15.txt_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.15.txt_mlp.2.bias : tensor<3072xf16>
    %5591 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.txt_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_7834 = torch.constant.int 6
    %5592 = torch.prims.convert_element_type %5591, %int6_7834 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_7835 = torch.constant.int 6
    %5593 = torch.prims.convert_element_type %5588, %int6_7835 : !torch.vtensor<[512,12288],f16>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int6_7836 = torch.constant.int 6
    %5594 = torch.prims.convert_element_type %5590, %int6_7836 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %5595 = torch.aten.mm %5593, %5594 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[512,3072],f32>
    %int1_7837 = torch.constant.int 1
    %5596 = torch.aten.mul.Scalar %5595, %int1_7837 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_7838 = torch.constant.int 1
    %5597 = torch.aten.mul.Scalar %5592, %int1_7838 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_7839 = torch.constant.int 1
    %5598 = torch.aten.add.Tensor %5596, %5597, %int1_7839 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_7840 = torch.constant.int 5
    %5599 = torch.prims.convert_element_type %5598, %int5_7840 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_7841 = torch.constant.int 1
    %int512_7842 = torch.constant.int 512
    %int3072_7843 = torch.constant.int 3072
    %5600 = torch.prim.ListConstruct %int1_7841, %int512_7842, %int3072_7843 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5601 = torch.aten.view %5599, %5600 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    %5602 = torch.aten.mul.Tensor %5308, %5601 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_7844 = torch.constant.int 1
    %5603 = torch.aten.add.Tensor %5560, %5602, %int1_7844 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // ---- double_blocks.16 img_mod -------------------------------------------
    // silu(%119) -> linear with img_mod.lin (W: [18432,3072]) -> split the
    // 18432-wide result into six [1,1,3072] chunks.
    // NOTE(review): %119 is defined outside this view; presumably the
    // conditioning vector shared by all blocks — confirm.
    %5604 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.double_blocks.16.img_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.16.img_mod.lin.weight : tensor<18432x3072xf16>
    %5605 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.img_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_7845 = torch.constant.int 0
    %int1_7846 = torch.constant.int 1
    %5606 = torch.aten.transpose.int %5605, %int0_7845, %int1_7846 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.16.img_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.16.img_mod.lin.bias : tensor<18432xf16>
    %5607 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.img_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Upcast bias, activation and transposed weight to f32 (dtype code 6) so the
    // matmul and bias add run in f32; the sum is cast back to f16 (code 5).
    %int6_7847 = torch.constant.int 6
    %5608 = torch.prims.convert_element_type %5607, %int6_7847 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_7848 = torch.constant.int 6
    %5609 = torch.prims.convert_element_type %5604, %int6_7848 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_7849 = torch.constant.int 6
    %5610 = torch.prims.convert_element_type %5606, %int6_7849 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %5611 = torch.aten.mm %5609, %5610 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Scalings by the constant 1 leave the product and the bias unchanged.
    %int1_7850 = torch.constant.int 1
    %5612 = torch.aten.mul.Scalar %5611, %int1_7850 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_7851 = torch.constant.int 1
    %5613 = torch.aten.mul.Scalar %5608, %int1_7851 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_7852 = torch.constant.int 1
    %5614 = torch.aten.add.Tensor %5612, %5613, %int1_7852 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_7853 = torch.constant.int 5
    %5615 = torch.prims.convert_element_type %5614, %int5_7853 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): the result
    // type equals the input type, so no elements are dropped.
    %int0_7854 = torch.constant.int 0
    %int0_7855 = torch.constant.int 0
    %int9223372036854775807_7856 = torch.constant.int 9223372036854775807
    %int1_7857 = torch.constant.int 1
    %5616 = torch.aten.slice.Tensor %5615, %int0_7854, %int0_7855, %int9223372036854775807_7856, %int1_7857 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a unit dim at position 1 so the chunks broadcast over the sequence
    // axis of [1,N,3072] activations.
    %int1_7858 = torch.constant.int 1
    %5617 = torch.aten.unsqueeze %5616, %int1_7858 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Another full-range slice, this time on dim 2.
    %int2_7859 = torch.constant.int 2
    %int0_7860 = torch.constant.int 0
    %int9223372036854775807_7861 = torch.constant.int 9223372036854775807
    %int1_7862 = torch.constant.int 1
    %5618 = torch.aten.slice.Tensor %5617, %int2_7859, %int0_7860, %int9223372036854775807_7861, %int1_7862 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Six consecutive 3072-wide slices along the last dim.
    // Chunk 0, [0, 3072): used later in this file chunk as the additive term
    // applied after the img normalization.
    %int-1_7863 = torch.constant.int -1
    %int0_7864 = torch.constant.int 0
    %int3072_7865 = torch.constant.int 3072
    %int1_7866 = torch.constant.int 1
    %5619 = torch.aten.slice.Tensor %5618, %int-1_7863, %int0_7864, %int3072_7865, %int1_7866 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1, [3072, 6144): used later in this file chunk as the (1 + x)
    // multiplier applied after the img normalization.
    %int-1_7867 = torch.constant.int -1
    %int3072_7868 = torch.constant.int 3072
    %int6144_7869 = torch.constant.int 6144
    %int1_7870 = torch.constant.int 1
    %5620 = torch.aten.slice.Tensor %5618, %int-1_7867, %int3072_7868, %int6144_7869, %int1_7870 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunks 2..5 (%5621..%5624) have no uses inside this file chunk.
    // NOTE(review): presumably gate1 / shift2 / scale2 / gate2 — confirm
    // against their uses further down the function.
    %int-1_7871 = torch.constant.int -1
    %int6144_7872 = torch.constant.int 6144
    %int9216_7873 = torch.constant.int 9216
    %int1_7874 = torch.constant.int 1
    %5621 = torch.aten.slice.Tensor %5618, %int-1_7871, %int6144_7872, %int9216_7873, %int1_7874 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_7875 = torch.constant.int -1
    %int9216_7876 = torch.constant.int 9216
    %int12288_7877 = torch.constant.int 12288
    %int1_7878 = torch.constant.int 1
    %5622 = torch.aten.slice.Tensor %5618, %int-1_7875, %int9216_7876, %int12288_7877, %int1_7878 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_7879 = torch.constant.int -1
    %int12288_7880 = torch.constant.int 12288
    %int15360_7881 = torch.constant.int 15360
    %int1_7882 = torch.constant.int 1
    %5623 = torch.aten.slice.Tensor %5618, %int-1_7879, %int12288_7880, %int15360_7881, %int1_7882 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_7883 = torch.constant.int -1
    %int15360_7884 = torch.constant.int 15360
    %int18432_7885 = torch.constant.int 18432
    %int1_7886 = torch.constant.int 1
    %5624 = torch.aten.slice.Tensor %5618, %int-1_7883, %int15360_7884, %int18432_7885, %int1_7886 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // ---- double_blocks.16 txt_mod -------------------------------------------
    // Same structure as the img_mod computation, with the txt_mod.lin
    // parameters: silu(%119) -> linear (W: [18432,3072]) -> six [1,1,3072] chunks.
    // The silu of %119 is recomputed here rather than reusing an earlier result.
    %5625 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.double_blocks.16.txt_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.16.txt_mod.lin.weight : tensor<18432x3072xf16>
    %5626 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.txt_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_7887 = torch.constant.int 0
    %int1_7888 = torch.constant.int 1
    %5627 = torch.aten.transpose.int %5626, %int0_7887, %int1_7888 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.16.txt_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.16.txt_mod.lin.bias : tensor<18432xf16>
    %5628 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.txt_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // f32 (dtype code 6) is used for the matmul and bias add; the result is cast
    // back to f16 (dtype code 5).
    %int6_7889 = torch.constant.int 6
    %5629 = torch.prims.convert_element_type %5628, %int6_7889 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_7890 = torch.constant.int 6
    %5630 = torch.prims.convert_element_type %5625, %int6_7890 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_7891 = torch.constant.int 6
    %5631 = torch.prims.convert_element_type %5627, %int6_7891 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %5632 = torch.aten.mm %5630, %5631 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Scalings by the constant 1 leave the product and the bias unchanged.
    %int1_7892 = torch.constant.int 1
    %5633 = torch.aten.mul.Scalar %5632, %int1_7892 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_7893 = torch.constant.int 1
    %5634 = torch.aten.mul.Scalar %5629, %int1_7893 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_7894 = torch.constant.int 1
    %5635 = torch.aten.add.Tensor %5633, %5634, %int1_7894 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_7895 = torch.constant.int 5
    %5636 = torch.prims.convert_element_type %5635, %int5_7895 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (result type equals input type), unsqueeze at
    // dim 1, then a full-range slice on dim 2.
    %int0_7896 = torch.constant.int 0
    %int0_7897 = torch.constant.int 0
    %int9223372036854775807_7898 = torch.constant.int 9223372036854775807
    %int1_7899 = torch.constant.int 1
    %5637 = torch.aten.slice.Tensor %5636, %int0_7896, %int0_7897, %int9223372036854775807_7898, %int1_7899 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    %int1_7900 = torch.constant.int 1
    %5638 = torch.aten.unsqueeze %5637, %int1_7900 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    %int2_7901 = torch.constant.int 2
    %int0_7902 = torch.constant.int 0
    %int9223372036854775807_7903 = torch.constant.int 9223372036854775807
    %int1_7904 = torch.constant.int 1
    %5639 = torch.aten.slice.Tensor %5638, %int2_7901, %int0_7902, %int9223372036854775807_7903, %int1_7904 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Six consecutive 3072-wide slices along the last dim.
    // Chunk 0, [0, 3072): used later in this file chunk as the additive term
    // applied after the txt normalization.
    %int-1_7905 = torch.constant.int -1
    %int0_7906 = torch.constant.int 0
    %int3072_7907 = torch.constant.int 3072
    %int1_7908 = torch.constant.int 1
    %5640 = torch.aten.slice.Tensor %5639, %int-1_7905, %int0_7906, %int3072_7907, %int1_7908 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1, [3072, 6144): used later in this file chunk as the (1 + x)
    // multiplier applied after the txt normalization.
    %int-1_7909 = torch.constant.int -1
    %int3072_7910 = torch.constant.int 3072
    %int6144_7911 = torch.constant.int 6144
    %int1_7912 = torch.constant.int 1
    %5641 = torch.aten.slice.Tensor %5639, %int-1_7909, %int3072_7910, %int6144_7911, %int1_7912 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunks 2..5 (%5642..%5645) have no uses inside this file chunk.
    // NOTE(review): presumably gate1 / shift2 / scale2 / gate2 — confirm
    // against their uses further down the function.
    %int-1_7913 = torch.constant.int -1
    %int6144_7914 = torch.constant.int 6144
    %int9216_7915 = torch.constant.int 9216
    %int1_7916 = torch.constant.int 1
    %5642 = torch.aten.slice.Tensor %5639, %int-1_7913, %int6144_7914, %int9216_7915, %int1_7916 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_7917 = torch.constant.int -1
    %int9216_7918 = torch.constant.int 9216
    %int12288_7919 = torch.constant.int 12288
    %int1_7920 = torch.constant.int 1
    %5643 = torch.aten.slice.Tensor %5639, %int-1_7917, %int9216_7918, %int12288_7919, %int1_7920 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_7921 = torch.constant.int -1
    %int12288_7922 = torch.constant.int 12288
    %int15360_7923 = torch.constant.int 15360
    %int1_7924 = torch.constant.int 1
    %5644 = torch.aten.slice.Tensor %5639, %int-1_7921, %int12288_7922, %int15360_7923, %int1_7924 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_7925 = torch.constant.int -1
    %int15360_7926 = torch.constant.int 15360
    %int18432_7927 = torch.constant.int 18432
    %int1_7928 = torch.constant.int 1
    %5645 = torch.aten.slice.Tensor %5639, %int-1_7925, %int15360_7926, %int18432_7927, %int1_7928 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // ---- double_blocks.16 img stream: normalize, then scale and shift -------
    // %5543 ([1,4096,3072], f16) is defined above this view.
    // NOTE(review): presumably the img residual stream leaving block 15 — confirm.
    // Statistics are computed in f32 (dtype code 6).
    %int6_7929 = torch.constant.int 6
    %5646 = torch.prims.convert_element_type %5543, %int6_7929 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // Variance (result0) and mean (result1) over dim 2, correction = 0,
    // keepdim = true.
    %int2_7930 = torch.constant.int 2
    %5647 = torch.prim.ListConstruct %int2_7930 : (!torch.int) -> !torch.list<int>
    %int0_7931 = torch.constant.int 0
    %true_7932 = torch.constant.bool true
    %result0_7933, %result1_7934 = torch.aten.var_mean.correction %5646, %5647, %int0_7931, %true_7932 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(var + eps), with eps being the double nearest to 1e-6.
    %float9.999990e-07_7935 = torch.constant.float 9.9999999999999995E-7
    %int1_7936 = torch.constant.int 1
    %5648 = torch.aten.add.Scalar %result0_7933, %float9.999990e-07_7935, %int1_7936 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %5649 = torch.aten.rsqrt %5648 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // (x - mean) * rsqrt(var + eps). The subtraction takes the f16 input
    // directly and yields f32 per the result type.
    %int1_7937 = torch.constant.int 1
    %5650 = torch.aten.sub.Tensor %5543, %result1_7934, %int1_7937 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %5651 = torch.aten.mul.Tensor %5650, %5649 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    %int5_7938 = torch.constant.int 5
    %5652 = torch.prims.convert_element_type %5651, %int5_7938 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // (1 + %5620) * normalized + %5619, in f16. No learned weight or bias
    // tensors are applied to the normalized value here.
    %int1_7939 = torch.constant.int 1
    %int1_7940 = torch.constant.int 1
    %5653 = torch.aten.add.Scalar %5620, %int1_7939, %int1_7940 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %5654 = torch.aten.mul.Tensor %5653, %5652 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_7941 = torch.constant.int 1
    %5655 = torch.aten.add.Tensor %5654, %5619, %int1_7941 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Flatten to [4096,3072] for the 2-D matmul that follows.
    %int4096_7942 = torch.constant.int 4096
    %int3072_7943 = torch.constant.int 3072
    %5656 = torch.prim.ListConstruct %int4096_7942, %int3072_7943 : (!torch.int, !torch.int) -> !torch.list<int>
    %5657 = torch.aten.view %5655, %5656 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // ---- double_blocks.16 img_attn.qkv projection ---------------------------
    // x @ W^T + b with W of shape [9216,3072], computed in f32 and cast back to
    // f16; the result is reshaped to [1,4096,9216].
    %__auto.sampler.double_blocks.16.img_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.16.img_attn.qkv.weight : tensor<9216x3072xf16>
    %5658 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.img_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_7944 = torch.constant.int 0
    %int1_7945 = torch.constant.int 1
    %5659 = torch.aten.transpose.int %5658, %int0_7944, %int1_7945 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.16.img_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.16.img_attn.qkv.bias : tensor<9216xf16>
    %5660 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.img_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, activation and transposed weight to f32 (dtype code 6).
    %int6_7946 = torch.constant.int 6
    %5661 = torch.prims.convert_element_type %5660, %int6_7946 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_7947 = torch.constant.int 6
    %5662 = torch.prims.convert_element_type %5657, %int6_7947 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_7948 = torch.constant.int 6
    %5663 = torch.prims.convert_element_type %5659, %int6_7948 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %5664 = torch.aten.mm %5662, %5663 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[4096,9216],f32>
    // Scalings by the constant 1 leave the product and the bias unchanged.
    %int1_7949 = torch.constant.int 1
    %5665 = torch.aten.mul.Scalar %5664, %int1_7949 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int1_7950 = torch.constant.int 1
    %5666 = torch.aten.mul.Scalar %5661, %int1_7950 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_7951 = torch.constant.int 1
    %5667 = torch.aten.add.Tensor %5665, %5666, %int1_7951 : !torch.vtensor<[4096,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int5_7952 = torch.constant.int 5
    %5668 = torch.prims.convert_element_type %5667, %int5_7952 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f16>
    %int1_7953 = torch.constant.int 1
    %int4096_7954 = torch.constant.int 4096
    %int9216_7955 = torch.constant.int 9216
    %5669 = torch.prim.ListConstruct %int1_7953, %int4096_7954, %int9216_7955 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5670 = torch.aten.view %5668, %5669 : !torch.vtensor<[4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,9216],f16>
    // Trace point: convert to a builtin tensor, cast to a fully dynamic shape,
    // query the three dims, and hand the tensor to flow.tensor.trace under the
    // key "img_qkv".
    // NOTE(review): what the trace emits at runtime depends on IREE tracing
    // configuration — confirm before relying on it.
    %5671 = torch_c.to_builtin_tensor %5670 : !torch.vtensor<[1,4096,9216],f16> -> tensor<1x4096x9216xf16>
    %cast_7956 = tensor.cast %5671 : tensor<1x4096x9216xf16> to tensor<?x?x?xf16>
    %c0_7957 = arith.constant 0 : index
    %dim_7958 = tensor.dim %cast_7956, %c0_7957 : tensor<?x?x?xf16>
    %c1_7959 = arith.constant 1 : index
    %dim_7960 = tensor.dim %cast_7956, %c1_7959 : tensor<?x?x?xf16>
    %c2_7961 = arith.constant 2 : index
    %dim_7962 = tensor.dim %cast_7956, %c2_7961 : tensor<?x?x?xf16>
    flow.tensor.trace "img_qkv" = [%cast_7956 : tensor<?x?x?xf16>{%dim_7958, %dim_7960, %dim_7962}]
    // Cast back to the static shape and re-enter the torch dialect; %5672 is the
    // value the q/k extraction below reads.
    %cast_7963 = tensor.cast %cast_7956 : tensor<?x?x?xf16> to tensor<1x4096x9216xf16>
    %5672 = torch_c.from_builtin_tensor %cast_7963 : tensor<1x4096x9216xf16> -> !torch.vtensor<[1,4096,9216],f16>
    // ---- double_blocks.16 img query: extract and normalize ------------------
    // View the fused projection [1,4096,9216] as [1,4096,3,24,128] and permute
    // with (2,0,3,1,4) to [3,1,24,4096,128].
    // NOTE(review): the dims presumably mean (qkv, batch, heads, seq, head_dim)
    // — confirm against the model definition.
    %int1_7964 = torch.constant.int 1
    %int4096_7965 = torch.constant.int 4096
    %int3_7966 = torch.constant.int 3
    %int24_7967 = torch.constant.int 24
    %int128_7968 = torch.constant.int 128
    %5673 = torch.prim.ListConstruct %int1_7964, %int4096_7965, %int3_7966, %int24_7967, %int128_7968 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5674 = torch.aten.view %5672, %5673 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_7969 = torch.constant.int 2
    %int0_7970 = torch.constant.int 0
    %int3_7971 = torch.constant.int 3
    %int1_7972 = torch.constant.int 1
    %int4_7973 = torch.constant.int 4
    %5675 = torch.prim.ListConstruct %int2_7969, %int0_7970, %int3_7971, %int1_7972, %int4_7973 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5676 = torch.aten.permute %5674, %5675 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // Select index 0 along dim 0; this slice is the one scaled by
    // query_norm.scale below.
    %int0_7974 = torch.constant.int 0
    %int0_7975 = torch.constant.int 0
    %5677 = torch.aten.select.int %5676, %int0_7974, %int0_7975 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Mean-square normalization in f32: x * rsqrt(mean(x^2, dim=-1) + eps),
    // with eps being the double nearest to 1e-6. No mean is subtracted.
    %int6_7976 = torch.constant.int 6
    %5678 = torch.prims.convert_element_type %5677, %int6_7976 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_7977 = torch.constant.int 2
    %5679 = torch.aten.pow.Tensor_Scalar %5678, %int2_7977 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_7978 = torch.constant.int -1
    %5680 = torch.prim.ListConstruct %int-1_7978 : (!torch.int) -> !torch.list<int>
    %true_7979 = torch.constant.bool true
    %none_7980 = torch.constant.none
    %5681 = torch.aten.mean.dim %5679, %5680, %true_7979, %none_7980 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_7981 = torch.constant.float 9.9999999999999995E-7
    %int1_7982 = torch.constant.int 1
    %5682 = torch.aten.add.Scalar %5681, %float9.999990e-07_7981, %int1_7982 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %5683 = torch.aten.rsqrt %5682 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %5684 = torch.aten.mul.Tensor %5678, %5683 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Cast back to f16 (dtype code 5), then apply the learned [128] scale,
    // broadcast over the leading dims.
    %int5_7983 = torch.constant.int 5
    %5685 = torch.prims.convert_element_type %5684, %int5_7983 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.16.img_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.16.img_attn.norm.query_norm.scale : tensor<128xf16>
    %5686 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.img_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %5687 = torch.aten.mul.Tensor %5685, %5686 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // ---- double_blocks.16 img key: extract and normalize --------------------
    // Repeats the view + permute of %5672 (the same [1,4096,3,24,128] ->
    // [3,1,24,4096,128] rearrangement the file applies just above for the query
    // path) instead of reusing the earlier permuted value.
    %int1_7984 = torch.constant.int 1
    %int4096_7985 = torch.constant.int 4096
    %int3_7986 = torch.constant.int 3
    %int24_7987 = torch.constant.int 24
    %int128_7988 = torch.constant.int 128
    %5688 = torch.prim.ListConstruct %int1_7984, %int4096_7985, %int3_7986, %int24_7987, %int128_7988 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5689 = torch.aten.view %5672, %5688 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_7989 = torch.constant.int 2
    %int0_7990 = torch.constant.int 0
    %int3_7991 = torch.constant.int 3
    %int1_7992 = torch.constant.int 1
    %int4_7993 = torch.constant.int 4
    %5690 = torch.prim.ListConstruct %int2_7989, %int0_7990, %int3_7991, %int1_7992, %int4_7993 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5691 = torch.aten.permute %5689, %5690 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // Select index 1 along dim 0; this slice is the one scaled by
    // key_norm.scale below.
    %int0_7994 = torch.constant.int 0
    %int1_7995 = torch.constant.int 1
    %5692 = torch.aten.select.int %5691, %int0_7994, %int1_7995 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Mean-square normalization in f32: x * rsqrt(mean(x^2, dim=-1) + eps),
    // with eps being the double nearest to 1e-6. No mean is subtracted.
    %int6_7996 = torch.constant.int 6
    %5693 = torch.prims.convert_element_type %5692, %int6_7996 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_7997 = torch.constant.int 2
    %5694 = torch.aten.pow.Tensor_Scalar %5693, %int2_7997 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_7998 = torch.constant.int -1
    %5695 = torch.prim.ListConstruct %int-1_7998 : (!torch.int) -> !torch.list<int>
    %true_7999 = torch.constant.bool true
    %none_8000 = torch.constant.none
    %5696 = torch.aten.mean.dim %5694, %5695, %true_7999, %none_8000 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_8001 = torch.constant.float 9.9999999999999995E-7
    %int1_8002 = torch.constant.int 1
    %5697 = torch.aten.add.Scalar %5696, %float9.999990e-07_8001, %int1_8002 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %5698 = torch.aten.rsqrt %5697 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %5699 = torch.aten.mul.Tensor %5693, %5698 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Cast back to f16 (dtype code 5), then apply the learned [128] scale.
    %int5_8003 = torch.constant.int 5
    %5700 = torch.prims.convert_element_type %5699, %int5_8003 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.16.img_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.16.img_attn.norm.key_norm.scale : tensor<128xf16>
    %5701 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.img_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %5702 = torch.aten.mul.Tensor %5700, %5701 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // f16 -> f16 conversions of the normalized query (%5687) and key (%5702);
    // input and result types are identical, so the element type is unchanged.
    // NOTE(review): presumably a `.to(dtype)` in the original model that is a
    // no-op at this precision — confirm.
    %int5_8004 = torch.constant.int 5
    %5703 = torch.prims.convert_element_type %5687, %int5_8004 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_8005 = torch.constant.int 5
    %5704 = torch.prims.convert_element_type %5702, %int5_8005 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // ---- double_blocks.16 txt stream: normalize, then scale and shift -------
    // Input is %5603 ([1,512,3072], f16). Statistics are computed in f32
    // (dtype code 6).
    %int6_8006 = torch.constant.int 6
    %5705 = torch.prims.convert_element_type %5603, %int6_8006 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // Variance (result0) and mean (result1) over dim 2, correction = 0,
    // keepdim = true.
    %int2_8007 = torch.constant.int 2
    %5706 = torch.prim.ListConstruct %int2_8007 : (!torch.int) -> !torch.list<int>
    %int0_8008 = torch.constant.int 0
    %true_8009 = torch.constant.bool true
    %result0_8010, %result1_8011 = torch.aten.var_mean.correction %5705, %5706, %int0_8008, %true_8009 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(var + eps), with eps being the double nearest to 1e-6.
    %float9.999990e-07_8012 = torch.constant.float 9.9999999999999995E-7
    %int1_8013 = torch.constant.int 1
    %5707 = torch.aten.add.Scalar %result0_8010, %float9.999990e-07_8012, %int1_8013 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %5708 = torch.aten.rsqrt %5707 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // (x - mean) * rsqrt(var + eps). The subtraction takes the f16 input
    // directly and yields f32 per the result type.
    %int1_8014 = torch.constant.int 1
    %5709 = torch.aten.sub.Tensor %5603, %result1_8011, %int1_8014 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %5710 = torch.aten.mul.Tensor %5709, %5708 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_8015 = torch.constant.int 5
    %5711 = torch.prims.convert_element_type %5710, %int5_8015 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // (1 + %5641) * normalized + %5640, in f16.
    %int1_8016 = torch.constant.int 1
    %int1_8017 = torch.constant.int 1
    %5712 = torch.aten.add.Scalar %5641, %int1_8016, %int1_8017 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %5713 = torch.aten.mul.Tensor %5712, %5711 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_8018 = torch.constant.int 1
    %5714 = torch.aten.add.Tensor %5713, %5640, %int1_8018 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Flatten to [512,3072] for the 2-D matmul that follows.
    %int512_8019 = torch.constant.int 512
    %int3072_8020 = torch.constant.int 3072
    %5715 = torch.prim.ListConstruct %int512_8019, %int3072_8020 : (!torch.int, !torch.int) -> !torch.list<int>
    %5716 = torch.aten.view %5714, %5715 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Linear layer double_blocks.16.txt_attn.qkv: out = x @ W^T + b, computed in f32 and
    // cast back to f16. Input %5716 is [512,3072]; output %5729 is [1,512,9216].
    // Load the [9216,3072] weight and transpose it to [3072,9216].
    %__auto.sampler.double_blocks.16.txt_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.16.txt_attn.qkv.weight : tensor<9216x3072xf16>
    %5717 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.txt_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_8021 = torch.constant.int 0
    %int1_8022 = torch.constant.int 1
    %5718 = torch.aten.transpose.int %5717, %int0_8021, %int1_8022 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    // Load the [9216] bias.
    %__auto.sampler.double_blocks.16.txt_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.16.txt_attn.qkv.bias : tensor<9216xf16>
    %5719 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.txt_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, activations and weight to f32 (dtype code 6 yields f32 per the result types).
    %int6_8023 = torch.constant.int 6
    %5720 = torch.prims.convert_element_type %5719, %int6_8023 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_8024 = torch.constant.int 6
    %5721 = torch.prims.convert_element_type %5716, %int6_8024 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_8025 = torch.constant.int 6
    %5722 = torch.prims.convert_element_type %5718, %int6_8025 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %5723 = torch.aten.mm %5721, %5722 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[512,9216],f32>
    // Both operands are scaled by the integer 1 before the add, which leaves the values unchanged.
    // NOTE(review): looks like the alpha/beta factors of a decomposed addmm -- confirm with the exporter.
    %int1_8026 = torch.constant.int 1
    %5724 = torch.aten.mul.Scalar %5723, %int1_8026 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int1_8027 = torch.constant.int 1
    %5725 = torch.aten.mul.Scalar %5720, %int1_8027 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_8028 = torch.constant.int 1
    %5726 = torch.aten.add.Tensor %5724, %5725, %int1_8028 : !torch.vtensor<[512,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    // Downcast to f16 and restore the batch dim: [512,9216] -> [1,512,9216].
    %int5_8029 = torch.constant.int 5
    %5727 = torch.prims.convert_element_type %5726, %int5_8029 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f16>
    %int1_8030 = torch.constant.int 1
    %int512_8031 = torch.constant.int 512
    %int9216_8032 = torch.constant.int 9216
    %5728 = torch.prim.ListConstruct %int1_8030, %int512_8031, %int9216_8032 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5729 = torch.aten.view %5727, %5728 : !torch.vtensor<[512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,9216],f16>
    // Trace point "txt_qkv": convert to a builtin tensor, cast to a fully dynamic shape,
    // query each dim for the trace op's shape operands, then cast back to the static
    // shape and re-wrap as a torch tensor (%5731) for the ops that follow.
    %5730 = torch_c.to_builtin_tensor %5729 : !torch.vtensor<[1,512,9216],f16> -> tensor<1x512x9216xf16>
    %cast_8033 = tensor.cast %5730 : tensor<1x512x9216xf16> to tensor<?x?x?xf16>
    %c0_8034 = arith.constant 0 : index
    %dim_8035 = tensor.dim %cast_8033, %c0_8034 : tensor<?x?x?xf16>
    %c1_8036 = arith.constant 1 : index
    %dim_8037 = tensor.dim %cast_8033, %c1_8036 : tensor<?x?x?xf16>
    %c2_8038 = arith.constant 2 : index
    %dim_8039 = tensor.dim %cast_8033, %c2_8038 : tensor<?x?x?xf16>
    flow.tensor.trace "txt_qkv" = [%cast_8033 : tensor<?x?x?xf16>{%dim_8035, %dim_8037, %dim_8039}]
    %cast_8040 = tensor.cast %cast_8033 : tensor<?x?x?xf16> to tensor<1x512x9216xf16>
    %5731 = torch_c.from_builtin_tensor %cast_8040 : tensor<1x512x9216xf16> -> !torch.vtensor<[1,512,9216],f16>
    // Extract slice 0 of the fused qkv tensor and normalize it.
    // View [1,512,9216] as [1,512,3,24,128], permute with (2,0,3,1,4) to [3,1,24,512,128],
    // then select index 0 on dim 0 -> [1,24,512,128].
    %int1_8041 = torch.constant.int 1
    %int512_8042 = torch.constant.int 512
    %int3_8043 = torch.constant.int 3
    %int24_8044 = torch.constant.int 24
    %int128_8045 = torch.constant.int 128
    %5732 = torch.prim.ListConstruct %int1_8041, %int512_8042, %int3_8043, %int24_8044, %int128_8045 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5733 = torch.aten.view %5731, %5732 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_8046 = torch.constant.int 2
    %int0_8047 = torch.constant.int 0
    %int3_8048 = torch.constant.int 3
    %int1_8049 = torch.constant.int 1
    %int4_8050 = torch.constant.int 4
    %5734 = torch.prim.ListConstruct %int2_8046, %int0_8047, %int3_8048, %int1_8049, %int4_8050 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5735 = torch.aten.permute %5733, %5734 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    %int0_8051 = torch.constant.int 0
    %int0_8052 = torch.constant.int 0
    %5736 = torch.aten.select.int %5735, %int0_8051, %int0_8052 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Normalization in f32: x * rsqrt(mean(x^2, dim=-1, keepdim=true) + ~1e-6).
    %int6_8053 = torch.constant.int 6
    %5737 = torch.prims.convert_element_type %5736, %int6_8053 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_8054 = torch.constant.int 2
    %5738 = torch.aten.pow.Tensor_Scalar %5737, %int2_8054 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_8055 = torch.constant.int -1
    %5739 = torch.prim.ListConstruct %int-1_8055 : (!torch.int) -> !torch.list<int>
    %true_8056 = torch.constant.bool true
    %none_8057 = torch.constant.none
    %5740 = torch.aten.mean.dim %5738, %5739, %true_8056, %none_8057 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_8058 = torch.constant.float 9.9999999999999995E-7
    %int1_8059 = torch.constant.int 1
    %5741 = torch.aten.add.Scalar %5740, %float9.999990e-07_8058, %int1_8059 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %5742 = torch.aten.rsqrt %5741 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %5743 = torch.aten.mul.Tensor %5737, %5742 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    // Back to f16, then multiply by the learned [128] query_norm.scale (broadcast over the last dim).
    %int5_8060 = torch.constant.int 5
    %5744 = torch.prims.convert_element_type %5743, %int5_8060 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.16.txt_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.16.txt_attn.norm.query_norm.scale : tensor<128xf16>
    %5745 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.txt_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %5746 = torch.aten.mul.Tensor %5744, %5745 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // Extract slice 1 of the fused qkv tensor and normalize it.
    // The view/permute of %5731 is recomputed here with the same shapes as the query path;
    // only the select index differs (1 instead of 0) -> [1,24,512,128].
    %int1_8061 = torch.constant.int 1
    %int512_8062 = torch.constant.int 512
    %int3_8063 = torch.constant.int 3
    %int24_8064 = torch.constant.int 24
    %int128_8065 = torch.constant.int 128
    %5747 = torch.prim.ListConstruct %int1_8061, %int512_8062, %int3_8063, %int24_8064, %int128_8065 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5748 = torch.aten.view %5731, %5747 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_8066 = torch.constant.int 2
    %int0_8067 = torch.constant.int 0
    %int3_8068 = torch.constant.int 3
    %int1_8069 = torch.constant.int 1
    %int4_8070 = torch.constant.int 4
    %5749 = torch.prim.ListConstruct %int2_8066, %int0_8067, %int3_8068, %int1_8069, %int4_8070 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5750 = torch.aten.permute %5748, %5749 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    %int0_8071 = torch.constant.int 0
    %int1_8072 = torch.constant.int 1
    %5751 = torch.aten.select.int %5750, %int0_8071, %int1_8072 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Normalization in f32: x * rsqrt(mean(x^2, dim=-1, keepdim=true) + ~1e-6).
    %int6_8073 = torch.constant.int 6
    %5752 = torch.prims.convert_element_type %5751, %int6_8073 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_8074 = torch.constant.int 2
    %5753 = torch.aten.pow.Tensor_Scalar %5752, %int2_8074 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_8075 = torch.constant.int -1
    %5754 = torch.prim.ListConstruct %int-1_8075 : (!torch.int) -> !torch.list<int>
    %true_8076 = torch.constant.bool true
    %none_8077 = torch.constant.none
    %5755 = torch.aten.mean.dim %5753, %5754, %true_8076, %none_8077 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_8078 = torch.constant.float 9.9999999999999995E-7
    %int1_8079 = torch.constant.int 1
    %5756 = torch.aten.add.Scalar %5755, %float9.999990e-07_8078, %int1_8079 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %5757 = torch.aten.rsqrt %5756 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %5758 = torch.aten.mul.Tensor %5752, %5757 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    // Back to f16, then multiply by the learned [128] key_norm.scale (broadcast over the last dim).
    %int5_8080 = torch.constant.int 5
    %5759 = torch.prims.convert_element_type %5758, %int5_8080 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.16.txt_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.16.txt_attn.norm.key_norm.scale : tensor<128xf16>
    %5760 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.txt_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %5761 = torch.aten.mul.Tensor %5759, %5760 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // f16 -> f16 conversions of the normalized Q (%5746) and K (%5761); the element type
    // is unchanged by these two ops.
    %int5_8081 = torch.constant.int 5
    %5762 = torch.prims.convert_element_type %5746, %int5_8081 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int5_8082 = torch.constant.int 5
    %5763 = torch.prims.convert_element_type %5761, %int5_8082 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Concatenate along dim 2 (the token dim): the 512-token tensor goes first, followed
    // by the 4096-token tensor, giving 4608 tokens.
    // NOTE(review): %5703 / %5704 are defined before this chunk; presumably the image-stream
    // Q and K after their own normalization -- confirm.
    %5764 = torch.prim.ListConstruct %5762, %5703 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_8083 = torch.constant.int 2
    %5765 = torch.aten.cat %5764, %int2_8083 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %5766 = torch.prim.ListConstruct %5763, %5704 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_8084 = torch.constant.int 2
    %5767 = torch.aten.cat %5766, %int2_8084 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Extract slice 2 of the 512-token fused qkv tensor (%5731): same view/permute as the
    // Q and K paths, select index 2 on dim 0 -> [1,24,512,128]. No normalization is applied.
    %int1_8085 = torch.constant.int 1
    %int512_8086 = torch.constant.int 512
    %int3_8087 = torch.constant.int 3
    %int24_8088 = torch.constant.int 24
    %int128_8089 = torch.constant.int 128
    %5768 = torch.prim.ListConstruct %int1_8085, %int512_8086, %int3_8087, %int24_8088, %int128_8089 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5769 = torch.aten.view %5731, %5768 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_8090 = torch.constant.int 2
    %int0_8091 = torch.constant.int 0
    %int3_8092 = torch.constant.int 3
    %int1_8093 = torch.constant.int 1
    %int4_8094 = torch.constant.int 4
    %5770 = torch.prim.ListConstruct %int2_8090, %int0_8091, %int3_8092, %int1_8093, %int4_8094 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5771 = torch.aten.permute %5769, %5770 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    %int0_8095 = torch.constant.int 0
    %int2_8096 = torch.constant.int 2
    %5772 = torch.aten.select.int %5771, %int0_8095, %int2_8096 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Same extraction for the 4096-token tensor %5672 ([1,4096,9216]) -> [1,24,4096,128].
    // NOTE(review): %5672 is defined before this chunk; presumably the image-stream qkv
    // projection output -- confirm.
    %int1_8097 = torch.constant.int 1
    %int4096_8098 = torch.constant.int 4096
    %int3_8099 = torch.constant.int 3
    %int24_8100 = torch.constant.int 24
    %int128_8101 = torch.constant.int 128
    %5773 = torch.prim.ListConstruct %int1_8097, %int4096_8098, %int3_8099, %int24_8100, %int128_8101 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5774 = torch.aten.view %5672, %5773 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_8102 = torch.constant.int 2
    %int0_8103 = torch.constant.int 0
    %int3_8104 = torch.constant.int 3
    %int1_8105 = torch.constant.int 1
    %int4_8106 = torch.constant.int 4
    %5775 = torch.prim.ListConstruct %int2_8102, %int0_8103, %int3_8104, %int1_8105, %int4_8106 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5776 = torch.aten.permute %5774, %5775 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    %int0_8107 = torch.constant.int 0
    %int2_8108 = torch.constant.int 2
    %5777 = torch.aten.select.int %5776, %int0_8107, %int2_8108 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Concatenate along dim 2 in the same order as Q and K (512 tokens first, then 4096).
    %5778 = torch.prim.ListConstruct %5772, %5777 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_8109 = torch.constant.int 2
    %5779 = torch.aten.cat %5778, %int2_8109 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Trace points "q", "k" and "v" for the concatenated [1,24,4608,128] tensors.
    // Each follows the same pattern: to builtin tensor, cast to a fully dynamic shape,
    // query the four dims for the trace op, cast back to the static shape, and re-wrap
    // as a torch tensor (%5781, %5783, %5785) for downstream use.
    // --- "q" (from %5765) ---
    %5780 = torch_c.to_builtin_tensor %5765 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_8110 = tensor.cast %5780 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_8111 = arith.constant 0 : index
    %dim_8112 = tensor.dim %cast_8110, %c0_8111 : tensor<?x?x?x?xf16>
    %c1_8113 = arith.constant 1 : index
    %dim_8114 = tensor.dim %cast_8110, %c1_8113 : tensor<?x?x?x?xf16>
    %c2_8115 = arith.constant 2 : index
    %dim_8116 = tensor.dim %cast_8110, %c2_8115 : tensor<?x?x?x?xf16>
    %c3_8117 = arith.constant 3 : index
    %dim_8118 = tensor.dim %cast_8110, %c3_8117 : tensor<?x?x?x?xf16>
    flow.tensor.trace "q" = [%cast_8110 : tensor<?x?x?x?xf16>{%dim_8112, %dim_8114, %dim_8116, %dim_8118}]
    %cast_8119 = tensor.cast %cast_8110 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %5781 = torch_c.from_builtin_tensor %cast_8119 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // --- "k" (from %5767) ---
    %5782 = torch_c.to_builtin_tensor %5767 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_8120 = tensor.cast %5782 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_8121 = arith.constant 0 : index
    %dim_8122 = tensor.dim %cast_8120, %c0_8121 : tensor<?x?x?x?xf16>
    %c1_8123 = arith.constant 1 : index
    %dim_8124 = tensor.dim %cast_8120, %c1_8123 : tensor<?x?x?x?xf16>
    %c2_8125 = arith.constant 2 : index
    %dim_8126 = tensor.dim %cast_8120, %c2_8125 : tensor<?x?x?x?xf16>
    %c3_8127 = arith.constant 3 : index
    %dim_8128 = tensor.dim %cast_8120, %c3_8127 : tensor<?x?x?x?xf16>
    flow.tensor.trace "k" = [%cast_8120 : tensor<?x?x?x?xf16>{%dim_8122, %dim_8124, %dim_8126, %dim_8128}]
    %cast_8129 = tensor.cast %cast_8120 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %5783 = torch_c.from_builtin_tensor %cast_8129 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // --- "v" (from %5779) ---
    %5784 = torch_c.to_builtin_tensor %5779 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_8130 = tensor.cast %5784 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_8131 = arith.constant 0 : index
    %dim_8132 = tensor.dim %cast_8130, %c0_8131 : tensor<?x?x?x?xf16>
    %c1_8133 = arith.constant 1 : index
    %dim_8134 = tensor.dim %cast_8130, %c1_8133 : tensor<?x?x?x?xf16>
    %c2_8135 = arith.constant 2 : index
    %dim_8136 = tensor.dim %cast_8130, %c2_8135 : tensor<?x?x?x?xf16>
    %c3_8137 = arith.constant 3 : index
    %dim_8138 = tensor.dim %cast_8130, %c3_8137 : tensor<?x?x?x?xf16>
    flow.tensor.trace "v" = [%cast_8130 : tensor<?x?x?x?xf16>{%dim_8132, %dim_8134, %dim_8136, %dim_8138}]
    %cast_8139 = tensor.cast %cast_8130 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %5785 = torch_c.from_builtin_tensor %cast_8139 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Apply the position-dependent table %211 ([1,1,4608,64,2,2], f32) to Q and K.
    // NOTE(review): %211 is defined far above this chunk; the shapes and arithmetic match
    // a rotary position embedding (per-pair 2x2 matrices) -- confirm against its producer.
    // For each tensor x in {Q, K}:
    //   x6  = view(x as f32, [1,24,4608,-1,1,2])          -> [1,24,4608,64,1,2]
    //   out = %211[..., 0] * x6[..., 0] + %211[..., 1] * x6[..., 1]   -> [1,24,4608,64,2]
    //   out = view(out, [1,24,4608,128]) cast back to f16
    // Upcast Q to f32 and split the last dim of 128 into 64 pairs.
    %int6_8140 = torch.constant.int 6
    %5786 = torch.prims.convert_element_type %5781, %int6_8140 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_8141 = torch.constant.int 1
    %int24_8142 = torch.constant.int 24
    %int4608_8143 = torch.constant.int 4608
    %int-1_8144 = torch.constant.int -1
    %int1_8145 = torch.constant.int 1
    %int2_8146 = torch.constant.int 2
    %5787 = torch.prim.ListConstruct %int1_8141, %int24_8142, %int4608_8143, %int-1_8144, %int1_8145, %int2_8146 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5788 = torch.aten.view %5786, %5787 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Same upcast and reshape for K.
    %int6_8147 = torch.constant.int 6
    %5789 = torch.prims.convert_element_type %5783, %int6_8147 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_8148 = torch.constant.int 1
    %int24_8149 = torch.constant.int 24
    %int4608_8150 = torch.constant.int 4608
    %int-1_8151 = torch.constant.int -1
    %int1_8152 = torch.constant.int 1
    %int2_8153 = torch.constant.int 2
    %5790 = torch.prim.ListConstruct %int1_8148, %int24_8149, %int4608_8150, %int-1_8151, %int1_8152, %int2_8153 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5791 = torch.aten.view %5789, %5790 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Q: first term, %211[..., 0] * q[..., 0] (the table broadcasts over the 24 heads).
    %int5_8154 = torch.constant.int 5
    %int0_8155 = torch.constant.int 0
    %5792 = torch.aten.select.int %211, %int5_8154, %int0_8155 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_8156 = torch.constant.int 5
    %int0_8157 = torch.constant.int 0
    %5793 = torch.aten.select.int %5788, %int5_8156, %int0_8157 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %5794 = torch.aten.mul.Tensor %5792, %5793 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Q: second term, %211[..., 1] * q[..., 1], then the sum of both terms.
    %int5_8158 = torch.constant.int 5
    %int1_8159 = torch.constant.int 1
    %5795 = torch.aten.select.int %211, %int5_8158, %int1_8159 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_8160 = torch.constant.int 5
    %int1_8161 = torch.constant.int 1
    %5796 = torch.aten.select.int %5788, %int5_8160, %int1_8161 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %5797 = torch.aten.mul.Tensor %5795, %5796 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_8162 = torch.constant.int 1
    %5798 = torch.aten.add.Tensor %5794, %5797, %int1_8162 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // K: first term, %211[..., 0] * k[..., 0].
    %int5_8163 = torch.constant.int 5
    %int0_8164 = torch.constant.int 0
    %5799 = torch.aten.select.int %211, %int5_8163, %int0_8164 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_8165 = torch.constant.int 5
    %int0_8166 = torch.constant.int 0
    %5800 = torch.aten.select.int %5791, %int5_8165, %int0_8166 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %5801 = torch.aten.mul.Tensor %5799, %5800 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // K: second term, %211[..., 1] * k[..., 1], then the sum of both terms.
    %int5_8167 = torch.constant.int 5
    %int1_8168 = torch.constant.int 1
    %5802 = torch.aten.select.int %211, %int5_8167, %int1_8168 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_8169 = torch.constant.int 5
    %int1_8170 = torch.constant.int 1
    %5803 = torch.aten.select.int %5791, %int5_8169, %int1_8170 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %5804 = torch.aten.mul.Tensor %5802, %5803 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_8171 = torch.constant.int 1
    %5805 = torch.aten.add.Tensor %5801, %5804, %int1_8171 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Merge the (64,2) pair dims back into 128 and downcast Q to f16.
    %int1_8172 = torch.constant.int 1
    %int24_8173 = torch.constant.int 24
    %int4608_8174 = torch.constant.int 4608
    %int128_8175 = torch.constant.int 128
    %5806 = torch.prim.ListConstruct %int1_8172, %int24_8173, %int4608_8174, %int128_8175 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5807 = torch.aten.view %5798, %5806 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_8176 = torch.constant.int 5
    %5808 = torch.prims.convert_element_type %5807, %int5_8176 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Same merge and downcast for K.
    %int1_8177 = torch.constant.int 1
    %int24_8178 = torch.constant.int 24
    %int4608_8179 = torch.constant.int 4608
    %int128_8180 = torch.constant.int 128
    %5809 = torch.prim.ListConstruct %int1_8177, %int24_8178, %int4608_8179, %int128_8180 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5810 = torch.aten.view %5805, %5809 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_8181 = torch.constant.int 5
    %5811 = torch.prims.convert_element_type %5810, %int5_8181 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Joint attention over all 4608 tokens with 24 heads of width 128.
    // Operands: Q = %5808, K = %5811, V = %5785, followed by 0.0, false, none, none.
    // NOTE(review): per the PyTorch schema for this op the trailing operands are presumably
    // dropout_p, is_causal, attn_mask and scale -- confirm against the op definition.
    // Only result #0 (the attention output) is used below; result #1 is [1,24,4608] f32.
    %float0.000000e00_8182 = torch.constant.float 0.000000e+00
    %false_8183 = torch.constant.bool false
    %none_8184 = torch.constant.none
    %none_8185 = torch.constant.none
    %5812:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%5808, %5811, %5785, %float0.000000e00_8182, %false_8183, %none_8184, %none_8185) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2 ([1,24,4608,128] -> [1,4608,24,128]) and merge the 24 x 128 head
    // dims into a single 3072-wide feature dim.
    %int0_8186 = torch.constant.int 0
    %int2_8187 = torch.constant.int 2
    %int1_8188 = torch.constant.int 1
    %int3_8189 = torch.constant.int 3
    %5813 = torch.prim.ListConstruct %int0_8186, %int2_8187, %int1_8188, %int3_8189 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5814 = torch.aten.permute %5812#0, %5813 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_8190 = torch.constant.int 1
    %int4608_8191 = torch.constant.int 4608
    %int3072_8192 = torch.constant.int 3072
    %5815 = torch.prim.ListConstruct %int1_8190, %int4608_8191, %int3072_8192 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5816 = torch.aten.view %5814, %5815 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Split the attention output along the token dim (dim 1) into its two parts.
    // The dim-0 slices (start 0, end INT64_MAX, step 1) keep the full batch dim.
    // First part: tokens [0, 512) -> %5818 [1,512,3072]. These positions correspond to the
    // 512-token tensor that was placed first in the Q/K/V concatenations.
    // %5818 is not consumed within this chunk.
    %int0_8193 = torch.constant.int 0
    %int0_8194 = torch.constant.int 0
    %int9223372036854775807_8195 = torch.constant.int 9223372036854775807
    %int1_8196 = torch.constant.int 1
    %5817 = torch.aten.slice.Tensor %5816, %int0_8193, %int0_8194, %int9223372036854775807_8195, %int1_8196 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_8197 = torch.constant.int 1
    %int0_8198 = torch.constant.int 0
    %int512_8199 = torch.constant.int 512
    %int1_8200 = torch.constant.int 1
    %5818 = torch.aten.slice.Tensor %5817, %int1_8197, %int0_8198, %int512_8199, %int1_8200 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Second part: tokens [512, end) -> %5820 [1,4096,3072].
    %int0_8201 = torch.constant.int 0
    %int0_8202 = torch.constant.int 0
    %int9223372036854775807_8203 = torch.constant.int 9223372036854775807
    %int1_8204 = torch.constant.int 1
    %5819 = torch.aten.slice.Tensor %5816, %int0_8201, %int0_8202, %int9223372036854775807_8203, %int1_8204 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_8205 = torch.constant.int 1
    %int512_8206 = torch.constant.int 512
    %int9223372036854775807_8207 = torch.constant.int 9223372036854775807
    %int1_8208 = torch.constant.int 1
    %5820 = torch.aten.slice.Tensor %5819, %int1_8205, %int512_8206, %int9223372036854775807_8207, %int1_8208 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Flatten the 4096-token part to 2-D ([4096,3072]) for the projection matmul.
    %int4096_8209 = torch.constant.int 4096
    %int3072_8210 = torch.constant.int 3072
    %5821 = torch.prim.ListConstruct %int4096_8209, %int3072_8210 : (!torch.int, !torch.int) -> !torch.list<int>
    %5822 = torch.aten.view %5820, %5821 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Linear layer double_blocks.16.img_attn.proj: out = x @ W^T + b, computed in f32 and
    // cast back to f16. Input %5822 is [4096,3072]; output %5835 is [1,4096,3072].
    // Load the [3072,3072] weight and transpose it.
    %__auto.sampler.double_blocks.16.img_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.16.img_attn.proj.weight : tensor<3072x3072xf16>
    %5823 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.img_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_8211 = torch.constant.int 0
    %int1_8212 = torch.constant.int 1
    %5824 = torch.aten.transpose.int %5823, %int0_8211, %int1_8212 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    // Load the [3072] bias.
    %__auto.sampler.double_blocks.16.img_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.16.img_attn.proj.bias : tensor<3072xf16>
    %5825 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.img_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, activations and weight to f32 before the matmul.
    %int6_8213 = torch.constant.int 6
    %5826 = torch.prims.convert_element_type %5825, %int6_8213 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_8214 = torch.constant.int 6
    %5827 = torch.prims.convert_element_type %5822, %int6_8214 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_8215 = torch.constant.int 6
    %5828 = torch.prims.convert_element_type %5824, %int6_8215 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %5829 = torch.aten.mm %5827, %5828 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Both operands are scaled by the integer 1 before the add, which leaves the values unchanged.
    // NOTE(review): looks like the alpha/beta factors of a decomposed addmm -- confirm with the exporter.
    %int1_8216 = torch.constant.int 1
    %5830 = torch.aten.mul.Scalar %5829, %int1_8216 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_8217 = torch.constant.int 1
    %5831 = torch.aten.mul.Scalar %5826, %int1_8217 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_8218 = torch.constant.int 1
    %5832 = torch.aten.add.Tensor %5830, %5831, %int1_8218 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    // Downcast to f16 and restore the batch dim: [4096,3072] -> [1,4096,3072].
    %int5_8219 = torch.constant.int 5
    %5833 = torch.prims.convert_element_type %5832, %int5_8219 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_8220 = torch.constant.int 1
    %int4096_8221 = torch.constant.int 4096
    %int3072_8222 = torch.constant.int 3072
    %5834 = torch.prim.ListConstruct %int1_8220, %int4096_8221, %int3072_8222 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5835 = torch.aten.view %5833, %5834 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Gated residual: %5837 = %5543 + %5621 * proj_out, with %5621 broadcasting [1,1,3072]
    // over the 4096 tokens.
    // NOTE(review): %5543 and %5621 are defined before this chunk; presumably the image-stream
    // residual input and a modulation gate -- confirm.
    %5836 = torch.aten.mul.Tensor %5621, %5835 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_8223 = torch.constant.int 1
    %5837 = torch.aten.add.Tensor %5543, %5836, %int1_8223 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Precompute (1 + %5623), used as the multiplicative factor after normalization.
    // NOTE(review): %5623 comes from outside this chunk; presumably a modulation scale -- confirm.
    %int1_8224 = torch.constant.int 1
    %int1_8225 = torch.constant.int 1
    %5838 = torch.aten.add.Scalar %5623, %int1_8224, %int1_8225 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize over the feature dim (dim 2) with no learned weight or bias:
    // variance (correction = 0) and mean are computed in f32 with keepdim = true,
    // then (x - mean) * rsqrt(var + ~1e-6).
    %int6_8226 = torch.constant.int 6
    %5839 = torch.prims.convert_element_type %5837, %int6_8226 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %int2_8227 = torch.constant.int 2
    %5840 = torch.prim.ListConstruct %int2_8227 : (!torch.int) -> !torch.list<int>
    %int0_8228 = torch.constant.int 0
    %true_8229 = torch.constant.bool true
    %result0_8230, %result1_8231 = torch.aten.var_mean.correction %5839, %5840, %int0_8228, %true_8229 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    %float9.999990e-07_8232 = torch.constant.float 9.9999999999999995E-7
    %int1_8233 = torch.constant.int 1
    %5841 = torch.aten.add.Scalar %result0_8230, %float9.999990e-07_8232, %int1_8233 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %5842 = torch.aten.rsqrt %5841 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // The subtraction takes the f16 tensor %5837 (not the f32 copy %5839) and an f32 mean;
    // the result type is f32.
    %int1_8234 = torch.constant.int 1
    %5843 = torch.aten.sub.Tensor %5837, %result1_8231, %int1_8234 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %5844 = torch.aten.mul.Tensor %5843, %5842 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    %int5_8235 = torch.constant.int 5
    %5845 = torch.prims.convert_element_type %5844, %int5_8235 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Modulation: (1 + %5623) * normalized + %5622.
    // NOTE(review): %5622 comes from outside this chunk; presumably a modulation shift -- confirm.
    %5846 = torch.aten.mul.Tensor %5838, %5845 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_8236 = torch.constant.int 1
    %5847 = torch.aten.add.Tensor %5846, %5622, %int1_8236 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Flatten to 2-D ([4096,3072]) for the MLP matmul.
    %int4096_8237 = torch.constant.int 4096
    %int3072_8238 = torch.constant.int 3072
    %5848 = torch.prim.ListConstruct %int4096_8237, %int3072_8238 : (!torch.int, !torch.int) -> !torch.list<int>
    %5849 = torch.aten.view %5847, %5848 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Linear layer double_blocks.16.img_mlp.0 followed by a tanh-approximated GELU.
    // Input %5849 is [4096,3072]; the linear expands to 12288 features; output %5865 is
    // [4096,12288] in f16.
    // Load the [12288,3072] weight and transpose it to [3072,12288].
    %__auto.sampler.double_blocks.16.img_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.16.img_mlp.0.weight : tensor<12288x3072xf16>
    %5850 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.img_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_8239 = torch.constant.int 0
    %int1_8240 = torch.constant.int 1
    %5851 = torch.aten.transpose.int %5850, %int0_8239, %int1_8240 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    // Load the [12288] bias.
    %__auto.sampler.double_blocks.16.img_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.16.img_mlp.0.bias : tensor<12288xf16>
    %5852 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.img_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Upcast bias, activations and weight to f32 before the matmul.
    %int6_8241 = torch.constant.int 6
    %5853 = torch.prims.convert_element_type %5852, %int6_8241 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_8242 = torch.constant.int 6
    %5854 = torch.prims.convert_element_type %5849, %int6_8242 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_8243 = torch.constant.int 6
    %5855 = torch.prims.convert_element_type %5851, %int6_8243 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %5856 = torch.aten.mm %5854, %5855 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[4096,12288],f32>
    // Both operands are scaled by the integer 1 before the add, which leaves the values unchanged.
    // NOTE(review): looks like the alpha/beta factors of a decomposed addmm -- confirm with the exporter.
    %int1_8244 = torch.constant.int 1
    %5857 = torch.aten.mul.Scalar %5856, %int1_8244 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int1_8245 = torch.constant.int 1
    %5858 = torch.aten.mul.Scalar %5853, %int1_8245 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_8246 = torch.constant.int 1
    %5859 = torch.aten.add.Tensor %5857, %5858, %int1_8246 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    // Downcast to f16 and restore the batch dim: [4096,12288] -> [1,4096,12288].
    %int5_8247 = torch.constant.int 5
    %5860 = torch.prims.convert_element_type %5859, %int5_8247 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f16>
    %int1_8248 = torch.constant.int 1
    %int4096_8249 = torch.constant.int 4096
    %int12288_8250 = torch.constant.int 12288
    %5861 = torch.prim.ListConstruct %int1_8248, %int4096_8249, %int12288_8250 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5862 = torch.aten.view %5860, %5861 : !torch.vtensor<[4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,12288],f16>
    // GELU with the "tanh" approximation mode, applied in f16.
    %str_8251 = torch.constant.str "tanh"
    %5863 = torch.aten.gelu %5862, %str_8251 : !torch.vtensor<[1,4096,12288],f16>, !torch.str -> !torch.vtensor<[1,4096,12288],f16>
    // Flatten back to 2-D ([4096,12288]) for the next matmul.
    %int4096_8252 = torch.constant.int 4096
    %int12288_8253 = torch.constant.int 12288
    %5864 = torch.prim.ListConstruct %int4096_8252, %int12288_8253 : (!torch.int, !torch.int) -> !torch.list<int>
    %5865 = torch.aten.view %5863, %5864 : !torch.vtensor<[1,4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[4096,12288],f16>
    %__auto.sampler.double_blocks.16.img_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.16.img_mlp.2.weight : tensor<3072x12288xf16>
    %5866 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.img_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_8254 = torch.constant.int 0
    %int1_8255 = torch.constant.int 1
    %5867 = torch.aten.transpose.int %5866, %int0_8254, %int1_8255 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.16.img_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.16.img_mlp.2.bias : tensor<3072xf16>
    %5868 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.img_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_8256 = torch.constant.int 6
    %5869 = torch.prims.convert_element_type %5868, %int6_8256 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_8257 = torch.constant.int 6
    %5870 = torch.prims.convert_element_type %5865, %int6_8257 : !torch.vtensor<[4096,12288],f16>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int6_8258 = torch.constant.int 6
    %5871 = torch.prims.convert_element_type %5867, %int6_8258 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %5872 = torch.aten.mm %5870, %5871 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    %int1_8259 = torch.constant.int 1
    %5873 = torch.aten.mul.Scalar %5872, %int1_8259 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_8260 = torch.constant.int 1
    %5874 = torch.aten.mul.Scalar %5869, %int1_8260 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_8261 = torch.constant.int 1
    %5875 = torch.aten.add.Tensor %5873, %5874, %int1_8261 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_8262 = torch.constant.int 5
    %5876 = torch.prims.convert_element_type %5875, %int5_8262 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_8263 = torch.constant.int 1
    %int4096_8264 = torch.constant.int 4096
    %int3072_8265 = torch.constant.int 3072
    %5877 = torch.prim.ListConstruct %int1_8263, %int4096_8264, %int3072_8265 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5878 = torch.aten.view %5876, %5877 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    %5879 = torch.aten.mul.Tensor %5624, %5878 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_8266 = torch.constant.int 1
    %5880 = torch.aten.add.Tensor %5837, %5879, %int1_8266 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // ---- double_blocks.16, text stream: txt_attn.proj linear, then scaled residual add ----
    // Collapse %5818 from [1,512,3072] to [512,3072] for the 2-D matmul.
    // NOTE(review): %5818 is defined above this excerpt - presumably the text slice of the
    // attention output; confirm.
    %int512_8267 = torch.constant.int 512
    %int3072_8268 = torch.constant.int 3072
    %5881 = torch.prim.ListConstruct %int512_8267, %int3072_8268 : (!torch.int, !torch.int) -> !torch.list<int>
    %5882 = torch.aten.view %5818, %5881 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Load the [3072,3072] projection weight (transposed before use) and its bias.
    %__auto.sampler.double_blocks.16.txt_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.16.txt_attn.proj.weight : tensor<3072x3072xf16>
    %5883 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.txt_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_8269 = torch.constant.int 0
    %int1_8270 = torch.constant.int 1
    %5884 = torch.aten.transpose.int %5883, %int0_8269, %int1_8270 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.16.txt_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.16.txt_attn.proj.bias : tensor<3072xf16>
    %5885 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.txt_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Promote operands to f32 (dtype code 6), run mm + bias add in f32, then cast back to f16 (code 5).
    %int6_8271 = torch.constant.int 6
    %5886 = torch.prims.convert_element_type %5885, %int6_8271 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_8272 = torch.constant.int 6
    %5887 = torch.prims.convert_element_type %5882, %int6_8272 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_8273 = torch.constant.int 6
    %5888 = torch.prims.convert_element_type %5884, %int6_8273 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %5889 = torch.aten.mm %5887, %5888 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Multiplications by the constant 1 leave the values unchanged.
    %int1_8274 = torch.constant.int 1
    %5890 = torch.aten.mul.Scalar %5889, %int1_8274 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_8275 = torch.constant.int 1
    %5891 = torch.aten.mul.Scalar %5886, %int1_8275 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_8276 = torch.constant.int 1
    %5892 = torch.aten.add.Tensor %5890, %5891, %int1_8276 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_8277 = torch.constant.int 5
    %5893 = torch.prims.convert_element_type %5892, %int5_8277 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_8278 = torch.constant.int 1
    %int512_8279 = torch.constant.int 512
    %int3072_8280 = torch.constant.int 3072
    %5894 = torch.prim.ListConstruct %int1_8278, %int512_8279, %int3072_8280 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5895 = torch.aten.view %5893, %5894 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Scale the projection by %5642 ([1,1,3072]) and add it onto %5603 with alpha = 1.
    // NOTE(review): %5642 and %5603 come from above this excerpt - presumably the attention gate
    // and the incoming text residual; confirm.
    %5896 = torch.aten.mul.Tensor %5642, %5895 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_8281 = torch.constant.int 1
    %5897 = torch.aten.add.Tensor %5603, %5896, %int1_8281 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // ---- double_blocks.16, text stream: normalization over the last dim + scale/shift modulation ----
    // Form (%5644 + 1). NOTE(review): %5644 is defined above this excerpt - presumably the
    // modulation scale; confirm.
    %int1_8282 = torch.constant.int 1
    %int1_8283 = torch.constant.int 1
    %5898 = torch.aten.add.Scalar %5644, %int1_8282, %int1_8283 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Statistics are taken on an f32 copy of %5897: variance and mean over dim 2,
    // correction = 0, keepdim = true, giving two [1,512,1] tensors.
    %int6_8284 = torch.constant.int 6
    %5899 = torch.prims.convert_element_type %5897, %int6_8284 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %int2_8285 = torch.constant.int 2
    %5900 = torch.prim.ListConstruct %int2_8285 : (!torch.int) -> !torch.list<int>
    %int0_8286 = torch.constant.int 0
    %true_8287 = torch.constant.bool true
    %result0_8288, %result1_8289 = torch.aten.var_mean.correction %5899, %5900, %int0_8286, %true_8287 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(var + eps), with eps being the float constant below (~1e-6).
    %float9.999990e-07_8290 = torch.constant.float 9.9999999999999995E-7
    %int1_8291 = torch.constant.int 1
    %5901 = torch.aten.add.Scalar %result0_8288, %float9.999990e-07_8290, %int1_8291 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %5902 = torch.aten.rsqrt %5901 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // (x - mean) * rsqrt(var + eps). The subtraction takes the f16 tensor %5897 and the f32 mean
    // and produces an f32 result, which is then cast to f16 (dtype code 5).
    %int1_8292 = torch.constant.int 1
    %5903 = torch.aten.sub.Tensor %5897, %result1_8289, %int1_8292 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %5904 = torch.aten.mul.Tensor %5903, %5902 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_8293 = torch.constant.int 5
    %5905 = torch.prims.convert_element_type %5904, %int5_8293 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Modulate: (%5644 + 1) * normalized + %5643.
    // NOTE(review): %5643 is defined above this excerpt - presumably the modulation shift; confirm.
    %5906 = torch.aten.mul.Tensor %5898, %5905 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_8294 = torch.constant.int 1
    %5907 = torch.aten.add.Tensor %5906, %5643, %int1_8294 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // ---- double_blocks.16, text stream: txt_mlp feed-forward (linear -> tanh-GELU -> linear) ----
    // Collapse the modulated activation [1,512,3072] -> [512,3072] for the 2-D matmul.
    %int512_8295 = torch.constant.int 512
    %int3072_8296 = torch.constant.int 3072
    %5908 = torch.prim.ListConstruct %int512_8295, %int3072_8296 : (!torch.int, !torch.int) -> !torch.list<int>
    %5909 = torch.aten.view %5907, %5908 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // txt_mlp.0: load the [12288,3072] weight, transpose to [3072,12288], load the bias.
    %__auto.sampler.double_blocks.16.txt_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.16.txt_mlp.0.weight : tensor<12288x3072xf16>
    %5910 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.txt_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_8297 = torch.constant.int 0
    %int1_8298 = torch.constant.int 1
    %5911 = torch.aten.transpose.int %5910, %int0_8297, %int1_8298 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.16.txt_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.16.txt_mlp.0.bias : tensor<12288xf16>
    %5912 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.txt_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Promote operands to f32 (dtype code 6); matmul and bias add run in f32.
    %int6_8299 = torch.constant.int 6
    %5913 = torch.prims.convert_element_type %5912, %int6_8299 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_8300 = torch.constant.int 6
    %5914 = torch.prims.convert_element_type %5909, %int6_8300 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_8301 = torch.constant.int 6
    %5915 = torch.prims.convert_element_type %5911, %int6_8301 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %5916 = torch.aten.mm %5914, %5915 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[512,12288],f32>
    // Multiplications by the constant 1 leave the values unchanged.
    %int1_8302 = torch.constant.int 1
    %5917 = torch.aten.mul.Scalar %5916, %int1_8302 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int1_8303 = torch.constant.int 1
    %5918 = torch.aten.mul.Scalar %5913, %int1_8303 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_8304 = torch.constant.int 1
    %5919 = torch.aten.add.Tensor %5917, %5918, %int1_8304 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    // Back to f16 (dtype code 5) and restore the leading unit dim before the activation.
    %int5_8305 = torch.constant.int 5
    %5920 = torch.prims.convert_element_type %5919, %int5_8305 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f16>
    %int1_8306 = torch.constant.int 1
    %int512_8307 = torch.constant.int 512
    %int12288_8308 = torch.constant.int 12288
    %5921 = torch.prim.ListConstruct %int1_8306, %int512_8307, %int12288_8308 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5922 = torch.aten.view %5920, %5921 : !torch.vtensor<[512,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,512,12288],f16>
    // GELU with the "tanh" approximation, applied in f16.
    %str_8309 = torch.constant.str "tanh"
    %5923 = torch.aten.gelu %5922, %str_8309 : !torch.vtensor<[1,512,12288],f16>, !torch.str -> !torch.vtensor<[1,512,12288],f16>
    %int512_8310 = torch.constant.int 512
    %int12288_8311 = torch.constant.int 12288
    %5924 = torch.prim.ListConstruct %int512_8310, %int12288_8311 : (!torch.int, !torch.int) -> !torch.list<int>
    %5925 = torch.aten.view %5923, %5924 : !torch.vtensor<[1,512,12288],f16>, !torch.list<int> -> !torch.vtensor<[512,12288],f16>
    // txt_mlp.2: project 12288 -> 3072 with the same transpose / f32 matmul / bias add / f16 cast sequence.
    %__auto.sampler.double_blocks.16.txt_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.16.txt_mlp.2.weight : tensor<3072x12288xf16>
    %5926 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.txt_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_8312 = torch.constant.int 0
    %int1_8313 = torch.constant.int 1
    %5927 = torch.aten.transpose.int %5926, %int0_8312, %int1_8313 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.16.txt_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.16.txt_mlp.2.bias : tensor<3072xf16>
    %5928 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.txt_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_8314 = torch.constant.int 6
    %5929 = torch.prims.convert_element_type %5928, %int6_8314 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_8315 = torch.constant.int 6
    %5930 = torch.prims.convert_element_type %5925, %int6_8315 : !torch.vtensor<[512,12288],f16>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int6_8316 = torch.constant.int 6
    %5931 = torch.prims.convert_element_type %5927, %int6_8316 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %5932 = torch.aten.mm %5930, %5931 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[512,3072],f32>
    %int1_8317 = torch.constant.int 1
    %5933 = torch.aten.mul.Scalar %5932, %int1_8317 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_8318 = torch.constant.int 1
    %5934 = torch.aten.mul.Scalar %5929, %int1_8318 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_8319 = torch.constant.int 1
    %5935 = torch.aten.add.Tensor %5933, %5934, %int1_8319 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_8320 = torch.constant.int 5
    %5936 = torch.prims.convert_element_type %5935, %int5_8320 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_8321 = torch.constant.int 1
    %int512_8322 = torch.constant.int 512
    %int3072_8323 = torch.constant.int 3072
    %5937 = torch.prim.ListConstruct %int1_8321, %int512_8322, %int3072_8323 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5938 = torch.aten.view %5936, %5937 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Scale the MLP output by %5645 ([1,1,3072]) and add it onto the text activation %5897 (alpha = 1).
    // NOTE(review): %5645 is defined above this excerpt - presumably the MLP gate; confirm.
    // The consumer of %5940 is not visible in this excerpt.
    %5939 = torch.aten.mul.Tensor %5645, %5938 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_8324 = torch.constant.int 1
    %5940 = torch.aten.add.Tensor %5897, %5939, %int1_8324 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // ---- double_blocks.17, image stream: img_mod.lin on silu(%119), split into six 3072-wide chunks ----
    // NOTE(review): %119 ([1,3072]) is defined far above this excerpt - presumably the combined
    // conditioning vector; confirm.
    %5941 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the [18432,3072] weight (transposed to [3072,18432]) and the [18432] bias.
    %__auto.sampler.double_blocks.17.img_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.17.img_mod.lin.weight : tensor<18432x3072xf16>
    %5942 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.img_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_8325 = torch.constant.int 0
    %int1_8326 = torch.constant.int 1
    %5943 = torch.aten.transpose.int %5942, %int0_8325, %int1_8326 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.17.img_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.17.img_mod.lin.bias : tensor<18432xf16>
    %5944 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.img_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Promote operands to f32 (dtype code 6), run mm + bias add in f32, cast back to f16 (code 5).
    %int6_8327 = torch.constant.int 6
    %5945 = torch.prims.convert_element_type %5944, %int6_8327 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_8328 = torch.constant.int 6
    %5946 = torch.prims.convert_element_type %5941, %int6_8328 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_8329 = torch.constant.int 6
    %5947 = torch.prims.convert_element_type %5943, %int6_8329 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %5948 = torch.aten.mm %5946, %5947 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Multiplications by the constant 1 leave the values unchanged.
    %int1_8330 = torch.constant.int 1
    %5949 = torch.aten.mul.Scalar %5948, %int1_8330 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_8331 = torch.constant.int 1
    %5950 = torch.aten.mul.Scalar %5945, %int1_8331 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_8332 = torch.constant.int 1
    %5951 = torch.aten.add.Tensor %5949, %5950, %int1_8332 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_8333 = torch.constant.int 5
    %5952 = torch.prims.convert_element_type %5951, %int5_8333 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): the shape stays [1,18432].
    %int0_8334 = torch.constant.int 0
    %int0_8335 = torch.constant.int 0
    %int9223372036854775807_8336 = torch.constant.int 9223372036854775807
    %int1_8337 = torch.constant.int 1
    %5953 = torch.aten.slice.Tensor %5952, %int0_8334, %int0_8335, %int9223372036854775807_8336, %int1_8337 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a unit dim at position 1 -> [1,1,18432], so the chunks broadcast over the sequence dim.
    %int1_8338 = torch.constant.int 1
    %5954 = torch.aten.unsqueeze %5953, %int1_8338 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Full-range slice on dim 2: again shape-preserving.
    %int2_8339 = torch.constant.int 2
    %int0_8340 = torch.constant.int 0
    %int9223372036854775807_8341 = torch.constant.int 9223372036854775807
    %int1_8342 = torch.constant.int 1
    %5955 = torch.aten.slice.Tensor %5954, %int2_8339, %int0_8340, %int9223372036854775807_8341, %int1_8342 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Six consecutive 3072-wide slices along the last dim (-1), each [1,1,3072].
    // Chunk 0: [0, 3072) -> %5956 (added after the scaled normalization of the image stream below).
    %int-1_8343 = torch.constant.int -1
    %int0_8344 = torch.constant.int 0
    %int3072_8345 = torch.constant.int 3072
    %int1_8346 = torch.constant.int 1
    %5956 = torch.aten.slice.Tensor %5955, %int-1_8343, %int0_8344, %int3072_8345, %int1_8346 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1: [3072, 6144) -> %5957 (used below as the (1 + x) multiplier on the normalized image stream).
    %int-1_8347 = torch.constant.int -1
    %int3072_8348 = torch.constant.int 3072
    %int6144_8349 = torch.constant.int 6144
    %int1_8350 = torch.constant.int 1
    %5957 = torch.aten.slice.Tensor %5955, %int-1_8347, %int3072_8348, %int6144_8349, %int1_8350 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunks 2-5 (%5958..%5961): their consumers are not visible in this excerpt.
    // Chunk 2: [6144, 9216).
    %int-1_8351 = torch.constant.int -1
    %int6144_8352 = torch.constant.int 6144
    %int9216_8353 = torch.constant.int 9216
    %int1_8354 = torch.constant.int 1
    %5958 = torch.aten.slice.Tensor %5955, %int-1_8351, %int6144_8352, %int9216_8353, %int1_8354 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 3: [9216, 12288).
    %int-1_8355 = torch.constant.int -1
    %int9216_8356 = torch.constant.int 9216
    %int12288_8357 = torch.constant.int 12288
    %int1_8358 = torch.constant.int 1
    %5959 = torch.aten.slice.Tensor %5955, %int-1_8355, %int9216_8356, %int12288_8357, %int1_8358 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 4: [12288, 15360).
    %int-1_8359 = torch.constant.int -1
    %int12288_8360 = torch.constant.int 12288
    %int15360_8361 = torch.constant.int 15360
    %int1_8362 = torch.constant.int 1
    %5960 = torch.aten.slice.Tensor %5955, %int-1_8359, %int12288_8360, %int15360_8361, %int1_8362 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 5: [15360, 18432).
    %int-1_8363 = torch.constant.int -1
    %int15360_8364 = torch.constant.int 15360
    %int18432_8365 = torch.constant.int 18432
    %int1_8366 = torch.constant.int 1
    %5961 = torch.aten.slice.Tensor %5955, %int-1_8363, %int15360_8364, %int18432_8365, %int1_8366 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // ---- double_blocks.17, text stream: txt_mod.lin on silu(%119), split into six 3072-wide chunks ----
    // silu(%119) is recomputed here rather than reusing the identical op emitted for img_mod.
    // NOTE(review): %119 ([1,3072]) is defined far above this excerpt - presumably the combined
    // conditioning vector; confirm.
    %5962 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the [18432,3072] weight (transposed to [3072,18432]) and the [18432] bias.
    %__auto.sampler.double_blocks.17.txt_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.17.txt_mod.lin.weight : tensor<18432x3072xf16>
    %5963 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.txt_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_8367 = torch.constant.int 0
    %int1_8368 = torch.constant.int 1
    %5964 = torch.aten.transpose.int %5963, %int0_8367, %int1_8368 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.17.txt_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.17.txt_mod.lin.bias : tensor<18432xf16>
    %5965 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.txt_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Promote operands to f32 (dtype code 6), run mm + bias add in f32, cast back to f16 (code 5).
    %int6_8369 = torch.constant.int 6
    %5966 = torch.prims.convert_element_type %5965, %int6_8369 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_8370 = torch.constant.int 6
    %5967 = torch.prims.convert_element_type %5962, %int6_8370 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_8371 = torch.constant.int 6
    %5968 = torch.prims.convert_element_type %5964, %int6_8371 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %5969 = torch.aten.mm %5967, %5968 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Multiplications by the constant 1 leave the values unchanged.
    %int1_8372 = torch.constant.int 1
    %5970 = torch.aten.mul.Scalar %5969, %int1_8372 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_8373 = torch.constant.int 1
    %5971 = torch.aten.mul.Scalar %5966, %int1_8373 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_8374 = torch.constant.int 1
    %5972 = torch.aten.add.Tensor %5970, %5971, %int1_8374 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_8375 = torch.constant.int 5
    %5973 = torch.prims.convert_element_type %5972, %int5_8375 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): the shape stays [1,18432].
    %int0_8376 = torch.constant.int 0
    %int0_8377 = torch.constant.int 0
    %int9223372036854775807_8378 = torch.constant.int 9223372036854775807
    %int1_8379 = torch.constant.int 1
    %5974 = torch.aten.slice.Tensor %5973, %int0_8376, %int0_8377, %int9223372036854775807_8378, %int1_8379 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a unit dim at position 1 -> [1,1,18432].
    %int1_8380 = torch.constant.int 1
    %5975 = torch.aten.unsqueeze %5974, %int1_8380 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Full-range slice on dim 2: again shape-preserving.
    %int2_8381 = torch.constant.int 2
    %int0_8382 = torch.constant.int 0
    %int9223372036854775807_8383 = torch.constant.int 9223372036854775807
    %int1_8384 = torch.constant.int 1
    %5976 = torch.aten.slice.Tensor %5975, %int2_8381, %int0_8382, %int9223372036854775807_8383, %int1_8384 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Six consecutive 3072-wide slices along the last dim (-1), each [1,1,3072] (%5977..%5982).
    // None of them is consumed inside this excerpt.
    // Chunk 0: [0, 3072).
    %int-1_8385 = torch.constant.int -1
    %int0_8386 = torch.constant.int 0
    %int3072_8387 = torch.constant.int 3072
    %int1_8388 = torch.constant.int 1
    %5977 = torch.aten.slice.Tensor %5976, %int-1_8385, %int0_8386, %int3072_8387, %int1_8388 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1: [3072, 6144).
    %int-1_8389 = torch.constant.int -1
    %int3072_8390 = torch.constant.int 3072
    %int6144_8391 = torch.constant.int 6144
    %int1_8392 = torch.constant.int 1
    %5978 = torch.aten.slice.Tensor %5976, %int-1_8389, %int3072_8390, %int6144_8391, %int1_8392 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2: [6144, 9216).
    %int-1_8393 = torch.constant.int -1
    %int6144_8394 = torch.constant.int 6144
    %int9216_8395 = torch.constant.int 9216
    %int1_8396 = torch.constant.int 1
    %5979 = torch.aten.slice.Tensor %5976, %int-1_8393, %int6144_8394, %int9216_8395, %int1_8396 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 3: [9216, 12288).
    %int-1_8397 = torch.constant.int -1
    %int9216_8398 = torch.constant.int 9216
    %int12288_8399 = torch.constant.int 12288
    %int1_8400 = torch.constant.int 1
    %5980 = torch.aten.slice.Tensor %5976, %int-1_8397, %int9216_8398, %int12288_8399, %int1_8400 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 4: [12288, 15360).
    %int-1_8401 = torch.constant.int -1
    %int12288_8402 = torch.constant.int 12288
    %int15360_8403 = torch.constant.int 15360
    %int1_8404 = torch.constant.int 1
    %5981 = torch.aten.slice.Tensor %5976, %int-1_8401, %int12288_8402, %int15360_8403, %int1_8404 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 5: [15360, 18432).
    %int-1_8405 = torch.constant.int -1
    %int15360_8406 = torch.constant.int 15360
    %int18432_8407 = torch.constant.int 18432
    %int1_8408 = torch.constant.int 1
    %5982 = torch.aten.slice.Tensor %5976, %int-1_8405, %int15360_8406, %int18432_8407, %int1_8408 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // ---- double_blocks.17, image stream: normalization over the last dim + scale/shift modulation ----
    // Statistics are taken on an f32 copy (dtype code 6) of the image activation %5880:
    // variance and mean over dim 2, correction = 0, keepdim = true, giving two [1,4096,1] tensors.
    %int6_8409 = torch.constant.int 6
    %5983 = torch.prims.convert_element_type %5880, %int6_8409 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %int2_8410 = torch.constant.int 2
    %5984 = torch.prim.ListConstruct %int2_8410 : (!torch.int) -> !torch.list<int>
    %int0_8411 = torch.constant.int 0
    %true_8412 = torch.constant.bool true
    %result0_8413, %result1_8414 = torch.aten.var_mean.correction %5983, %5984, %int0_8411, %true_8412 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(var + eps), with eps being the float constant below (~1e-6).
    %float9.999990e-07_8415 = torch.constant.float 9.9999999999999995E-7
    %int1_8416 = torch.constant.int 1
    %5985 = torch.aten.add.Scalar %result0_8413, %float9.999990e-07_8415, %int1_8416 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %5986 = torch.aten.rsqrt %5985 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // (x - mean) * rsqrt(var + eps). The subtraction takes the f16 tensor %5880 and the f32 mean
    // and produces an f32 result, which is then cast to f16 (dtype code 5).
    %int1_8417 = torch.constant.int 1
    %5987 = torch.aten.sub.Tensor %5880, %result1_8414, %int1_8417 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %5988 = torch.aten.mul.Tensor %5987, %5986 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    %int5_8418 = torch.constant.int 5
    %5989 = torch.prims.convert_element_type %5988, %int5_8418 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Modulate with the img_mod chunks: (%5957 + 1) * normalized + %5956, where %5957 is the
    // [3072, 6144) slice and %5956 the [0, 3072) slice of the img_mod.lin output.
    %int1_8419 = torch.constant.int 1
    %int1_8420 = torch.constant.int 1
    %5990 = torch.aten.add.Scalar %5957, %int1_8419, %int1_8420 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %5991 = torch.aten.mul.Tensor %5990, %5989 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_8421 = torch.constant.int 1
    %5992 = torch.aten.add.Tensor %5991, %5956, %int1_8421 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // double_blocks.17 img_attn.qkv linear projection.
    // Flattens %5992 from [1,4096,3072] to [4096,3072], multiplies by the transposed
    // qkv weight in f32, adds the bias, casts back to f16 and restores the leading
    // batch dimension, giving [1,4096,9216].
    %int4096_8422 = torch.constant.int 4096
    %int3072_8423 = torch.constant.int 3072
    %5993 = torch.prim.ListConstruct %int4096_8422, %int3072_8423 : (!torch.int, !torch.int) -> !torch.list<int>
    %5994 = torch.aten.view %5992, %5993 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Weight is stored as [out=9216, in=3072]; transpose to [3072,9216] for the matmul.
    %__auto.sampler.double_blocks.17.img_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.17.img_attn.qkv.weight : tensor<9216x3072xf16>
    %5995 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.img_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_8424 = torch.constant.int 0
    %int1_8425 = torch.constant.int 1
    %5996 = torch.aten.transpose.int %5995, %int0_8424, %int1_8425 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.17.img_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.17.img_attn.qkv.bias : tensor<9216xf16>
    %5997 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.img_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // dtype code 6 yields f32 results here: bias, activations and weight are all upcast
    // so the matmul and bias add run in f32.
    %int6_8426 = torch.constant.int 6
    %5998 = torch.prims.convert_element_type %5997, %int6_8426 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_8427 = torch.constant.int 6
    %5999 = torch.prims.convert_element_type %5994, %int6_8427 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_8428 = torch.constant.int 6
    %6000 = torch.prims.convert_element_type %5996, %int6_8428 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %6001 = torch.aten.mm %5999, %6000 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[4096,9216],f32>
    // Both products are scaled by the constant 1 before being summed.
    // NOTE(review): looks like a decomposed addmm with alpha = beta = 1 - confirm.
    %int1_8429 = torch.constant.int 1
    %6002 = torch.aten.mul.Scalar %6001, %int1_8429 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int1_8430 = torch.constant.int 1
    %6003 = torch.aten.mul.Scalar %5998, %int1_8430 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_8431 = torch.constant.int 1
    %6004 = torch.aten.add.Tensor %6002, %6003, %int1_8431 : !torch.vtensor<[4096,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    // dtype code 5 yields an f16 result: cast the projection back to half precision.
    %int5_8432 = torch.constant.int 5
    %6005 = torch.prims.convert_element_type %6004, %int5_8432 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f16>
    %int1_8433 = torch.constant.int 1
    %int4096_8434 = torch.constant.int 4096
    %int9216_8435 = torch.constant.int 9216
    %6006 = torch.prim.ListConstruct %int1_8433, %int4096_8434, %int9216_8435 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6007 = torch.aten.view %6005, %6006 : !torch.vtensor<[4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,9216],f16>
    // Trace point "img_qkv": the torch value is converted to a builtin tensor, cast to a
    // fully dynamic shape, its three dims are queried, and the tensor is passed to
    // flow.tensor.trace. It is then cast back to the static shape and re-wrapped as a
    // torch value (%6009), which the later q/k/v slicing reads from.
    %6008 = torch_c.to_builtin_tensor %6007 : !torch.vtensor<[1,4096,9216],f16> -> tensor<1x4096x9216xf16>
    %cast_8436 = tensor.cast %6008 : tensor<1x4096x9216xf16> to tensor<?x?x?xf16>
    %c0_8437 = arith.constant 0 : index
    %dim_8438 = tensor.dim %cast_8436, %c0_8437 : tensor<?x?x?xf16>
    %c1_8439 = arith.constant 1 : index
    %dim_8440 = tensor.dim %cast_8436, %c1_8439 : tensor<?x?x?xf16>
    %c2_8441 = arith.constant 2 : index
    %dim_8442 = tensor.dim %cast_8436, %c2_8441 : tensor<?x?x?xf16>
    flow.tensor.trace "img_qkv" = [%cast_8436 : tensor<?x?x?xf16>{%dim_8438, %dim_8440, %dim_8442}]
    %cast_8443 = tensor.cast %cast_8436 : tensor<?x?x?xf16> to tensor<1x4096x9216xf16>
    %6009 = torch_c.from_builtin_tensor %cast_8443 : tensor<1x4096x9216xf16> -> !torch.vtensor<[1,4096,9216],f16>
    // Image query: view the fused projection as [1,4096,3,24,128], permute to
    // [3,1,24,4096,128], and take index 0 along dim 0.
    // NOTE(review): the 3/24/128 factors presumably mean (q|k|v, heads, head_dim) - confirm.
    %int1_8444 = torch.constant.int 1
    %int4096_8445 = torch.constant.int 4096
    %int3_8446 = torch.constant.int 3
    %int24_8447 = torch.constant.int 24
    %int128_8448 = torch.constant.int 128
    %6010 = torch.prim.ListConstruct %int1_8444, %int4096_8445, %int3_8446, %int24_8447, %int128_8448 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6011 = torch.aten.view %6009, %6010 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_8449 = torch.constant.int 2
    %int0_8450 = torch.constant.int 0
    %int3_8451 = torch.constant.int 3
    %int1_8452 = torch.constant.int 1
    %int4_8453 = torch.constant.int 4
    %6012 = torch.prim.ListConstruct %int2_8449, %int0_8450, %int3_8451, %int1_8452, %int4_8453 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6013 = torch.aten.permute %6011, %6012 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    %int0_8454 = torch.constant.int 0
    %int0_8455 = torch.constant.int 0
    %6014 = torch.aten.select.int %6013, %int0_8454, %int0_8455 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Root-mean-square normalisation over the last dim, computed in f32:
    //   y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + 1e-6)
    %int6_8456 = torch.constant.int 6
    %6015 = torch.prims.convert_element_type %6014, %int6_8456 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_8457 = torch.constant.int 2
    %6016 = torch.aten.pow.Tensor_Scalar %6015, %int2_8457 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_8458 = torch.constant.int -1
    %6017 = torch.prim.ListConstruct %int-1_8458 : (!torch.int) -> !torch.list<int>
    %true_8459 = torch.constant.bool true
    %none_8460 = torch.constant.none
    %6018 = torch.aten.mean.dim %6016, %6017, %true_8459, %none_8460 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_8461 = torch.constant.float 9.9999999999999995E-7
    %int1_8462 = torch.constant.int 1
    %6019 = torch.aten.add.Scalar %6018, %float9.999990e-07_8461, %int1_8462 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %6020 = torch.aten.rsqrt %6019 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %6021 = torch.aten.mul.Tensor %6015, %6020 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Cast back to f16, then multiply by the 128-element query_norm.scale parameter.
    %int5_8463 = torch.constant.int 5
    %6022 = torch.prims.convert_element_type %6021, %int5_8463 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.17.img_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.17.img_attn.norm.query_norm.scale : tensor<128xf16>
    %6023 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.img_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %6024 = torch.aten.mul.Tensor %6022, %6023 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // Image key: the same view/permute of %6009 is rebuilt, and index 1 along dim 0 is
    // selected.
    %int1_8464 = torch.constant.int 1
    %int4096_8465 = torch.constant.int 4096
    %int3_8466 = torch.constant.int 3
    %int24_8467 = torch.constant.int 24
    %int128_8468 = torch.constant.int 128
    %6025 = torch.prim.ListConstruct %int1_8464, %int4096_8465, %int3_8466, %int24_8467, %int128_8468 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6026 = torch.aten.view %6009, %6025 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_8469 = torch.constant.int 2
    %int0_8470 = torch.constant.int 0
    %int3_8471 = torch.constant.int 3
    %int1_8472 = torch.constant.int 1
    %int4_8473 = torch.constant.int 4
    %6027 = torch.prim.ListConstruct %int2_8469, %int0_8470, %int3_8471, %int1_8472, %int4_8473 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6028 = torch.aten.permute %6026, %6027 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    %int0_8474 = torch.constant.int 0
    %int1_8475 = torch.constant.int 1
    %6029 = torch.aten.select.int %6028, %int0_8474, %int1_8475 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Root-mean-square normalisation over the last dim in f32:
    //   y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + 1e-6)
    %int6_8476 = torch.constant.int 6
    %6030 = torch.prims.convert_element_type %6029, %int6_8476 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_8477 = torch.constant.int 2
    %6031 = torch.aten.pow.Tensor_Scalar %6030, %int2_8477 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_8478 = torch.constant.int -1
    %6032 = torch.prim.ListConstruct %int-1_8478 : (!torch.int) -> !torch.list<int>
    %true_8479 = torch.constant.bool true
    %none_8480 = torch.constant.none
    %6033 = torch.aten.mean.dim %6031, %6032, %true_8479, %none_8480 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_8481 = torch.constant.float 9.9999999999999995E-7
    %int1_8482 = torch.constant.int 1
    %6034 = torch.aten.add.Scalar %6033, %float9.999990e-07_8481, %int1_8482 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %6035 = torch.aten.rsqrt %6034 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %6036 = torch.aten.mul.Tensor %6030, %6035 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Cast back to f16, then multiply by the 128-element key_norm.scale parameter.
    %int5_8483 = torch.constant.int 5
    %6037 = torch.prims.convert_element_type %6036, %int5_8483 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.17.img_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.17.img_attn.norm.key_norm.scale : tensor<128xf16>
    %6038 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.img_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %6039 = torch.aten.mul.Tensor %6037, %6038 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // f16 -> f16 conversions of the normalised image query (%6024) and key (%6039);
    // operand and result element types are identical, so these do not change the dtype.
    %int5_8484 = torch.constant.int 5
    %6040 = torch.prims.convert_element_type %6024, %int5_8484 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_8485 = torch.constant.int 5
    %6041 = torch.prims.convert_element_type %6039, %int5_8485 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Normalisation of %5940 ([1,512,3072]) over dim 2 with no learned affine terms,
    // followed by an elementwise modulation using %5978 and %5977.
    // %5940, %5977 and %5978 are defined before this excerpt.
    // NOTE(review): %5940 is presumably the text-stream hidden state, and %5978 / %5977
    // the modulation scale / shift - confirm against the producer ops.
    %int6_8486 = torch.constant.int 6
    %6042 = torch.prims.convert_element_type %5940, %int6_8486 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // Variance (result0) and mean (result1) over dim 2 with correction = 0, keepdim = true.
    %int2_8487 = torch.constant.int 2
    %6043 = torch.prim.ListConstruct %int2_8487 : (!torch.int) -> !torch.list<int>
    %int0_8488 = torch.constant.int 0
    %true_8489 = torch.constant.bool true
    %result0_8490, %result1_8491 = torch.aten.var_mean.correction %6042, %6043, %int0_8488, %true_8489 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rstd = rsqrt(var + 1e-6)
    %float9.999990e-07_8492 = torch.constant.float 9.9999999999999995E-7
    %int1_8493 = torch.constant.int 1
    %6044 = torch.aten.add.Scalar %result0_8490, %float9.999990e-07_8492, %int1_8493 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %6045 = torch.aten.rsqrt %6044 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // The subtraction takes the original f16 %5940 (not the f32 copy %6042) and the f32
    // mean; the declared result type is f32.
    %int1_8494 = torch.constant.int 1
    %6046 = torch.aten.sub.Tensor %5940, %result1_8491, %int1_8494 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %6047 = torch.aten.mul.Tensor %6046, %6045 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_8495 = torch.constant.int 5
    %6048 = torch.prims.convert_element_type %6047, %int5_8495 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Modulation: (%5978 + 1) * normalised + %5977, with the [1,1,3072] operands
    // broadcast across the 512 rows.
    %int1_8496 = torch.constant.int 1
    %int1_8497 = torch.constant.int 1
    %6049 = torch.aten.add.Scalar %5978, %int1_8496, %int1_8497 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %6050 = torch.aten.mul.Tensor %6049, %6048 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_8498 = torch.constant.int 1
    %6051 = torch.aten.add.Tensor %6050, %5977, %int1_8498 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // double_blocks.17 txt_attn.qkv linear projection.
    // Flattens %6051 from [1,512,3072] to [512,3072], multiplies by the transposed qkv
    // weight in f32, adds the bias, casts back to f16 and restores the leading batch
    // dimension, giving [1,512,9216].
    %int512_8499 = torch.constant.int 512
    %int3072_8500 = torch.constant.int 3072
    %6052 = torch.prim.ListConstruct %int512_8499, %int3072_8500 : (!torch.int, !torch.int) -> !torch.list<int>
    %6053 = torch.aten.view %6051, %6052 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Weight is stored as [out=9216, in=3072]; transpose to [3072,9216] for the matmul.
    %__auto.sampler.double_blocks.17.txt_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.17.txt_attn.qkv.weight : tensor<9216x3072xf16>
    %6054 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.txt_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_8501 = torch.constant.int 0
    %int1_8502 = torch.constant.int 1
    %6055 = torch.aten.transpose.int %6054, %int0_8501, %int1_8502 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.17.txt_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.17.txt_attn.qkv.bias : tensor<9216xf16>
    %6056 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.txt_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, activations and weight to f32 (dtype code 6) before the matmul.
    %int6_8503 = torch.constant.int 6
    %6057 = torch.prims.convert_element_type %6056, %int6_8503 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_8504 = torch.constant.int 6
    %6058 = torch.prims.convert_element_type %6053, %int6_8504 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_8505 = torch.constant.int 6
    %6059 = torch.prims.convert_element_type %6055, %int6_8505 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %6060 = torch.aten.mm %6058, %6059 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[512,9216],f32>
    // Both products are scaled by the constant 1 before being summed.
    // NOTE(review): looks like a decomposed addmm with alpha = beta = 1 - confirm.
    %int1_8506 = torch.constant.int 1
    %6061 = torch.aten.mul.Scalar %6060, %int1_8506 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int1_8507 = torch.constant.int 1
    %6062 = torch.aten.mul.Scalar %6057, %int1_8507 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_8508 = torch.constant.int 1
    %6063 = torch.aten.add.Tensor %6061, %6062, %int1_8508 : !torch.vtensor<[512,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    // Cast the projection back to f16 (dtype code 5) and restore the batch dimension.
    %int5_8509 = torch.constant.int 5
    %6064 = torch.prims.convert_element_type %6063, %int5_8509 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f16>
    %int1_8510 = torch.constant.int 1
    %int512_8511 = torch.constant.int 512
    %int9216_8512 = torch.constant.int 9216
    %6065 = torch.prim.ListConstruct %int1_8510, %int512_8511, %int9216_8512 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6066 = torch.aten.view %6064, %6065 : !torch.vtensor<[512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,9216],f16>
    // Trace point "txt_qkv": the torch value is converted to a builtin tensor, cast to a
    // fully dynamic shape, its three dims are queried, and the tensor is passed to
    // flow.tensor.trace. It is then cast back to the static shape and re-wrapped as a
    // torch value (%6068), which the later q/k/v slicing reads from.
    %6067 = torch_c.to_builtin_tensor %6066 : !torch.vtensor<[1,512,9216],f16> -> tensor<1x512x9216xf16>
    %cast_8513 = tensor.cast %6067 : tensor<1x512x9216xf16> to tensor<?x?x?xf16>
    %c0_8514 = arith.constant 0 : index
    %dim_8515 = tensor.dim %cast_8513, %c0_8514 : tensor<?x?x?xf16>
    %c1_8516 = arith.constant 1 : index
    %dim_8517 = tensor.dim %cast_8513, %c1_8516 : tensor<?x?x?xf16>
    %c2_8518 = arith.constant 2 : index
    %dim_8519 = tensor.dim %cast_8513, %c2_8518 : tensor<?x?x?xf16>
    flow.tensor.trace "txt_qkv" = [%cast_8513 : tensor<?x?x?xf16>{%dim_8515, %dim_8517, %dim_8519}]
    %cast_8520 = tensor.cast %cast_8513 : tensor<?x?x?xf16> to tensor<1x512x9216xf16>
    %6068 = torch_c.from_builtin_tensor %cast_8520 : tensor<1x512x9216xf16> -> !torch.vtensor<[1,512,9216],f16>
    // Text query: view the fused projection as [1,512,3,24,128], permute to
    // [3,1,24,512,128], and take index 0 along dim 0.
    // NOTE(review): the 3/24/128 factors presumably mean (q|k|v, heads, head_dim) - confirm.
    %int1_8521 = torch.constant.int 1
    %int512_8522 = torch.constant.int 512
    %int3_8523 = torch.constant.int 3
    %int24_8524 = torch.constant.int 24
    %int128_8525 = torch.constant.int 128
    %6069 = torch.prim.ListConstruct %int1_8521, %int512_8522, %int3_8523, %int24_8524, %int128_8525 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6070 = torch.aten.view %6068, %6069 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_8526 = torch.constant.int 2
    %int0_8527 = torch.constant.int 0
    %int3_8528 = torch.constant.int 3
    %int1_8529 = torch.constant.int 1
    %int4_8530 = torch.constant.int 4
    %6071 = torch.prim.ListConstruct %int2_8526, %int0_8527, %int3_8528, %int1_8529, %int4_8530 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6072 = torch.aten.permute %6070, %6071 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    %int0_8531 = torch.constant.int 0
    %int0_8532 = torch.constant.int 0
    %6073 = torch.aten.select.int %6072, %int0_8531, %int0_8532 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Root-mean-square normalisation over the last dim in f32:
    //   y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + 1e-6)
    %int6_8533 = torch.constant.int 6
    %6074 = torch.prims.convert_element_type %6073, %int6_8533 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_8534 = torch.constant.int 2
    %6075 = torch.aten.pow.Tensor_Scalar %6074, %int2_8534 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_8535 = torch.constant.int -1
    %6076 = torch.prim.ListConstruct %int-1_8535 : (!torch.int) -> !torch.list<int>
    %true_8536 = torch.constant.bool true
    %none_8537 = torch.constant.none
    %6077 = torch.aten.mean.dim %6075, %6076, %true_8536, %none_8537 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_8538 = torch.constant.float 9.9999999999999995E-7
    %int1_8539 = torch.constant.int 1
    %6078 = torch.aten.add.Scalar %6077, %float9.999990e-07_8538, %int1_8539 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %6079 = torch.aten.rsqrt %6078 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %6080 = torch.aten.mul.Tensor %6074, %6079 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    // Cast back to f16, then multiply by the 128-element query_norm.scale parameter.
    %int5_8540 = torch.constant.int 5
    %6081 = torch.prims.convert_element_type %6080, %int5_8540 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.17.txt_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.17.txt_attn.norm.query_norm.scale : tensor<128xf16>
    %6082 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.txt_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %6083 = torch.aten.mul.Tensor %6081, %6082 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // Text key: the same view/permute of %6068 is rebuilt, and index 1 along dim 0 is
    // selected.
    %int1_8541 = torch.constant.int 1
    %int512_8542 = torch.constant.int 512
    %int3_8543 = torch.constant.int 3
    %int24_8544 = torch.constant.int 24
    %int128_8545 = torch.constant.int 128
    %6084 = torch.prim.ListConstruct %int1_8541, %int512_8542, %int3_8543, %int24_8544, %int128_8545 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6085 = torch.aten.view %6068, %6084 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_8546 = torch.constant.int 2
    %int0_8547 = torch.constant.int 0
    %int3_8548 = torch.constant.int 3
    %int1_8549 = torch.constant.int 1
    %int4_8550 = torch.constant.int 4
    %6086 = torch.prim.ListConstruct %int2_8546, %int0_8547, %int3_8548, %int1_8549, %int4_8550 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6087 = torch.aten.permute %6085, %6086 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    %int0_8551 = torch.constant.int 0
    %int1_8552 = torch.constant.int 1
    %6088 = torch.aten.select.int %6087, %int0_8551, %int1_8552 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Root-mean-square normalisation over the last dim in f32:
    //   y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + 1e-6)
    %int6_8553 = torch.constant.int 6
    %6089 = torch.prims.convert_element_type %6088, %int6_8553 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_8554 = torch.constant.int 2
    %6090 = torch.aten.pow.Tensor_Scalar %6089, %int2_8554 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_8555 = torch.constant.int -1
    %6091 = torch.prim.ListConstruct %int-1_8555 : (!torch.int) -> !torch.list<int>
    %true_8556 = torch.constant.bool true
    %none_8557 = torch.constant.none
    %6092 = torch.aten.mean.dim %6090, %6091, %true_8556, %none_8557 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_8558 = torch.constant.float 9.9999999999999995E-7
    %int1_8559 = torch.constant.int 1
    %6093 = torch.aten.add.Scalar %6092, %float9.999990e-07_8558, %int1_8559 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %6094 = torch.aten.rsqrt %6093 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %6095 = torch.aten.mul.Tensor %6089, %6094 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    // Cast back to f16, then multiply by the 128-element key_norm.scale parameter.
    %int5_8560 = torch.constant.int 5
    %6096 = torch.prims.convert_element_type %6095, %int5_8560 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.17.txt_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.17.txt_attn.norm.key_norm.scale : tensor<128xf16>
    %6097 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.txt_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %6098 = torch.aten.mul.Tensor %6096, %6097 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // f16 -> f16 conversions of the normalised text query (%6083) and key (%6098);
    // operand and result element types are identical, so these do not change the dtype.
    %int5_8561 = torch.constant.int 5
    %6099 = torch.prims.convert_element_type %6083, %int5_8561 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int5_8562 = torch.constant.int 5
    %6100 = torch.prims.convert_element_type %6098, %int5_8562 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Joint query and key: concatenate the text tensor (512 rows) followed by the image
    // tensor (4096 rows) along dim 2, giving 4608 rows. Text comes first in both lists.
    %6101 = torch.prim.ListConstruct %6099, %6040 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_8563 = torch.constant.int 2
    %6102 = torch.aten.cat %6101, %int2_8563 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %6103 = torch.prim.ListConstruct %6100, %6041 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_8564 = torch.constant.int 2
    %6104 = torch.aten.cat %6103, %int2_8564 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Joint value: take index 2 along dim 0 of the permuted text (%6068) and image
    // (%6009) projections and concatenate them, text first, along dim 2.
    // Unlike the index-0 and index-1 slices, no normalisation or scale is applied here.
    %int1_8565 = torch.constant.int 1
    %int512_8566 = torch.constant.int 512
    %int3_8567 = torch.constant.int 3
    %int24_8568 = torch.constant.int 24
    %int128_8569 = torch.constant.int 128
    %6105 = torch.prim.ListConstruct %int1_8565, %int512_8566, %int3_8567, %int24_8568, %int128_8569 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6106 = torch.aten.view %6068, %6105 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_8570 = torch.constant.int 2
    %int0_8571 = torch.constant.int 0
    %int3_8572 = torch.constant.int 3
    %int1_8573 = torch.constant.int 1
    %int4_8574 = torch.constant.int 4
    %6107 = torch.prim.ListConstruct %int2_8570, %int0_8571, %int3_8572, %int1_8573, %int4_8574 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6108 = torch.aten.permute %6106, %6107 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    %int0_8575 = torch.constant.int 0
    %int2_8576 = torch.constant.int 2
    %6109 = torch.aten.select.int %6108, %int0_8575, %int2_8576 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Same slicing for the image projection.
    %int1_8577 = torch.constant.int 1
    %int4096_8578 = torch.constant.int 4096
    %int3_8579 = torch.constant.int 3
    %int24_8580 = torch.constant.int 24
    %int128_8581 = torch.constant.int 128
    %6110 = torch.prim.ListConstruct %int1_8577, %int4096_8578, %int3_8579, %int24_8580, %int128_8581 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6111 = torch.aten.view %6009, %6110 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_8582 = torch.constant.int 2
    %int0_8583 = torch.constant.int 0
    %int3_8584 = torch.constant.int 3
    %int1_8585 = torch.constant.int 1
    %int4_8586 = torch.constant.int 4
    %6112 = torch.prim.ListConstruct %int2_8582, %int0_8583, %int3_8584, %int1_8585, %int4_8586 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6113 = torch.aten.permute %6111, %6112 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    %int0_8587 = torch.constant.int 0
    %int2_8588 = torch.constant.int 2
    %6114 = torch.aten.select.int %6113, %int0_8587, %int2_8588 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %6115 = torch.prim.ListConstruct %6109, %6114 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_8589 = torch.constant.int 2
    %6116 = torch.aten.cat %6115, %int2_8589 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Trace points "q", "k" and "v" for the concatenated tensors. Each one converts to a
    // builtin tensor, casts to a fully dynamic shape, queries its four dims, passes the
    // tensor to flow.tensor.trace, then casts back to the static shape and re-wraps it
    // as a torch value.
    // Trace "q": %6102 -> %6118.
    %6117 = torch_c.to_builtin_tensor %6102 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_8590 = tensor.cast %6117 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_8591 = arith.constant 0 : index
    %dim_8592 = tensor.dim %cast_8590, %c0_8591 : tensor<?x?x?x?xf16>
    %c1_8593 = arith.constant 1 : index
    %dim_8594 = tensor.dim %cast_8590, %c1_8593 : tensor<?x?x?x?xf16>
    %c2_8595 = arith.constant 2 : index
    %dim_8596 = tensor.dim %cast_8590, %c2_8595 : tensor<?x?x?x?xf16>
    %c3_8597 = arith.constant 3 : index
    %dim_8598 = tensor.dim %cast_8590, %c3_8597 : tensor<?x?x?x?xf16>
    flow.tensor.trace "q" = [%cast_8590 : tensor<?x?x?x?xf16>{%dim_8592, %dim_8594, %dim_8596, %dim_8598}]
    %cast_8599 = tensor.cast %cast_8590 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %6118 = torch_c.from_builtin_tensor %cast_8599 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Trace "k": %6104 -> %6120.
    %6119 = torch_c.to_builtin_tensor %6104 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_8600 = tensor.cast %6119 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_8601 = arith.constant 0 : index
    %dim_8602 = tensor.dim %cast_8600, %c0_8601 : tensor<?x?x?x?xf16>
    %c1_8603 = arith.constant 1 : index
    %dim_8604 = tensor.dim %cast_8600, %c1_8603 : tensor<?x?x?x?xf16>
    %c2_8605 = arith.constant 2 : index
    %dim_8606 = tensor.dim %cast_8600, %c2_8605 : tensor<?x?x?x?xf16>
    %c3_8607 = arith.constant 3 : index
    %dim_8608 = tensor.dim %cast_8600, %c3_8607 : tensor<?x?x?x?xf16>
    flow.tensor.trace "k" = [%cast_8600 : tensor<?x?x?x?xf16>{%dim_8602, %dim_8604, %dim_8606, %dim_8608}]
    %cast_8609 = tensor.cast %cast_8600 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %6120 = torch_c.from_builtin_tensor %cast_8609 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Trace "v": %6116 -> %6122.
    %6121 = torch_c.to_builtin_tensor %6116 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_8610 = tensor.cast %6121 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_8611 = arith.constant 0 : index
    %dim_8612 = tensor.dim %cast_8610, %c0_8611 : tensor<?x?x?x?xf16>
    %c1_8613 = arith.constant 1 : index
    %dim_8614 = tensor.dim %cast_8610, %c1_8613 : tensor<?x?x?x?xf16>
    %c2_8615 = arith.constant 2 : index
    %dim_8616 = tensor.dim %cast_8610, %c2_8615 : tensor<?x?x?x?xf16>
    %c3_8617 = arith.constant 3 : index
    %dim_8618 = tensor.dim %cast_8610, %c3_8617 : tensor<?x?x?x?xf16>
    flow.tensor.trace "v" = [%cast_8610 : tensor<?x?x?x?xf16>{%dim_8612, %dim_8614, %dim_8616, %dim_8618}]
    %cast_8619 = tensor.cast %cast_8610 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %6122 = torch_c.from_builtin_tensor %cast_8619 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Upcast the traced q (%6118) and k (%6120) to f32 and view the last dim of 128 as
    // [64,1,2]. The -1 entry in the shape list resolves to 64 per the result type.
    %int6_8620 = torch.constant.int 6
    %6123 = torch.prims.convert_element_type %6118, %int6_8620 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_8621 = torch.constant.int 1
    %int24_8622 = torch.constant.int 24
    %int4608_8623 = torch.constant.int 4608
    %int-1_8624 = torch.constant.int -1
    %int1_8625 = torch.constant.int 1
    %int2_8626 = torch.constant.int 2
    %6124 = torch.prim.ListConstruct %int1_8621, %int24_8622, %int4608_8623, %int-1_8624, %int1_8625, %int2_8626 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6125 = torch.aten.view %6123, %6124 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Same upcast and view for k.
    %int6_8627 = torch.constant.int 6
    %6126 = torch.prims.convert_element_type %6120, %int6_8627 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_8628 = torch.constant.int 1
    %int24_8629 = torch.constant.int 24
    %int4608_8630 = torch.constant.int 4608
    %int-1_8631 = torch.constant.int -1
    %int1_8632 = torch.constant.int 1
    %int2_8633 = torch.constant.int 2
    %6127 = torch.prim.ListConstruct %int1_8628, %int24_8629, %int4608_8630, %int-1_8631, %int1_8632, %int2_8633 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6128 = torch.aten.view %6126, %6127 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Apply the [1,1,4608,64,2,2] table %211 to the paired q and k views. For each pair
    // (x0, x1) taken from the last dim of the view:
    //   out[..., i] = %211[..., i, 0] * x0 + %211[..., i, 1] * x1
    // The size-1 dim 1 of %211 broadcasts across the 24 entries of dim 1 of q / k.
    // %211 is defined before this excerpt.
    // NOTE(review): it looks like a precomputed rotary position-embedding table of 2x2
    // matrices - confirm against its producer.
    // --- q (%6125) ---
    %int5_8634 = torch.constant.int 5
    %int0_8635 = torch.constant.int 0
    %6129 = torch.aten.select.int %211, %int5_8634, %int0_8635 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_8636 = torch.constant.int 5
    %int0_8637 = torch.constant.int 0
    %6130 = torch.aten.select.int %6125, %int5_8636, %int0_8637 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6131 = torch.aten.mul.Tensor %6129, %6130 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_8638 = torch.constant.int 5
    %int1_8639 = torch.constant.int 1
    %6132 = torch.aten.select.int %211, %int5_8638, %int1_8639 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_8640 = torch.constant.int 5
    %int1_8641 = torch.constant.int 1
    %6133 = torch.aten.select.int %6125, %int5_8640, %int1_8641 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6134 = torch.aten.mul.Tensor %6132, %6133 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_8642 = torch.constant.int 1
    %6135 = torch.aten.add.Tensor %6131, %6134, %int1_8642 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // --- k (%6128), same computation ---
    %int5_8643 = torch.constant.int 5
    %int0_8644 = torch.constant.int 0
    %6136 = torch.aten.select.int %211, %int5_8643, %int0_8644 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_8645 = torch.constant.int 5
    %int0_8646 = torch.constant.int 0
    %6137 = torch.aten.select.int %6128, %int5_8645, %int0_8646 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6138 = torch.aten.mul.Tensor %6136, %6137 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_8647 = torch.constant.int 5
    %int1_8648 = torch.constant.int 1
    %6139 = torch.aten.select.int %211, %int5_8647, %int1_8648 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_8649 = torch.constant.int 5
    %int1_8650 = torch.constant.int 1
    %6140 = torch.aten.select.int %6128, %int5_8649, %int1_8650 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6141 = torch.aten.mul.Tensor %6139, %6140 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_8651 = torch.constant.int 1
    %6142 = torch.aten.add.Tensor %6138, %6141, %int1_8651 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_8652 = torch.constant.int 1
    %int24_8653 = torch.constant.int 24
    %int4608_8654 = torch.constant.int 4608
    %int128_8655 = torch.constant.int 128
    %6143 = torch.prim.ListConstruct %int1_8652, %int24_8653, %int4608_8654, %int128_8655 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6144 = torch.aten.view %6135, %6143 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_8656 = torch.constant.int 5
    %6145 = torch.prims.convert_element_type %6144, %int5_8656 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_8657 = torch.constant.int 1
    %int24_8658 = torch.constant.int 24
    %int4608_8659 = torch.constant.int 4608
    %int128_8660 = torch.constant.int 128
    %6146 = torch.prim.ListConstruct %int1_8657, %int24_8658, %int4608_8659, %int128_8660 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6147 = torch.aten.view %6142, %6146 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_8661 = torch.constant.int 5
    %6148 = torch.prims.convert_element_type %6147, %int5_8661 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled-dot-product attention over the 4608-row sequence with 24 heads of width 128.
    // Operands are (%6145, %6148, %6122, 0.0, false, none, none).
    // NOTE(review): per the aten signature these should be
    // (query, key, value, dropout_p, is_causal, attn_mask, scale) — confirm against PyTorch docs.
    // Only result #0 is consumed within this excerpt.
    %float0.000000e00_8662 = torch.constant.float 0.000000e+00
    %false_8663 = torch.constant.bool false
    %none_8664 = torch.constant.none
    %none_8665 = torch.constant.none
    %6149:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%6145, %6148, %6122, %float0.000000e00_8662, %false_8663, %none_8664, %none_8665) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2 ([1,24,4608,128] -> [1,4608,24,128]) and merge the last two dims
    // into 3072 (= 24 * 128) channels per row.
    %int0_8666 = torch.constant.int 0
    %int2_8667 = torch.constant.int 2
    %int1_8668 = torch.constant.int 1
    %int3_8669 = torch.constant.int 3
    %6150 = torch.prim.ListConstruct %int0_8666, %int2_8667, %int1_8668, %int3_8669 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6151 = torch.aten.permute %6149#0, %6150 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_8670 = torch.constant.int 1
    %int4608_8671 = torch.constant.int 4608
    %int3072_8672 = torch.constant.int 3072
    %6152 = torch.prim.ListConstruct %int1_8670, %int4608_8671, %int3072_8672 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6153 = torch.aten.view %6151, %6152 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Split %6153 along dim 1. Rows [0, 512) become %6155, which feeds txt_attn.proj below.
    // The dim-0 slice with bounds 0..INT64_MAX is a full-range slice that keeps the shape unchanged.
    %int0_8673 = torch.constant.int 0
    %int0_8674 = torch.constant.int 0
    %int9223372036854775807_8675 = torch.constant.int 9223372036854775807
    %int1_8676 = torch.constant.int 1
    %6154 = torch.aten.slice.Tensor %6153, %int0_8673, %int0_8674, %int9223372036854775807_8675, %int1_8676 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_8677 = torch.constant.int 1
    %int0_8678 = torch.constant.int 0
    %int512_8679 = torch.constant.int 512
    %int1_8680 = torch.constant.int 1
    %6155 = torch.aten.slice.Tensor %6154, %int1_8677, %int0_8678, %int512_8679, %int1_8680 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Rows [512, end) become %6157 (4096 rows), which feeds img_attn.proj below.
    %int0_8681 = torch.constant.int 0
    %int0_8682 = torch.constant.int 0
    %int9223372036854775807_8683 = torch.constant.int 9223372036854775807
    %int1_8684 = torch.constant.int 1
    %6156 = torch.aten.slice.Tensor %6153, %int0_8681, %int0_8682, %int9223372036854775807_8683, %int1_8684 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_8685 = torch.constant.int 1
    %int512_8686 = torch.constant.int 512
    %int9223372036854775807_8687 = torch.constant.int 9223372036854775807
    %int1_8688 = torch.constant.int 1
    %6157 = torch.aten.slice.Tensor %6156, %int1_8685, %int512_8686, %int9223372036854775807_8687, %int1_8688 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear projection of the 4096-row attention output with double_blocks.17.img_attn.proj:
    // flatten to [4096,3072], load and transpose the weight, upcast bias, input and weight to f32
    // (dtype code 6 yields f32 here), matmul, add the bias, then cast back to f16 and restore the
    // leading batch dim.
    %int4096_8689 = torch.constant.int 4096
    %int3072_8690 = torch.constant.int 3072
    %6158 = torch.prim.ListConstruct %int4096_8689, %int3072_8690 : (!torch.int, !torch.int) -> !torch.list<int>
    %6159 = torch.aten.view %6157, %6158 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.17.img_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.17.img_attn.proj.weight : tensor<3072x3072xf16>
    %6160 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.img_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_8691 = torch.constant.int 0
    %int1_8692 = torch.constant.int 1
    %6161 = torch.aten.transpose.int %6160, %int0_8691, %int1_8692 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.17.img_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.17.img_attn.proj.bias : tensor<3072xf16>
    %6162 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.img_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_8693 = torch.constant.int 6
    %6163 = torch.prims.convert_element_type %6162, %int6_8693 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_8694 = torch.constant.int 6
    %6164 = torch.prims.convert_element_type %6159, %int6_8694 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_8695 = torch.constant.int 6
    %6165 = torch.prims.convert_element_type %6161, %int6_8695 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %6166 = torch.aten.mm %6164, %6165 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Both the product and the bias are multiplied by the integer constant 1 before the add.
    // NOTE(review): presumably the alpha/beta factors of a decomposed addmm — confirm.
    %int1_8696 = torch.constant.int 1
    %6167 = torch.aten.mul.Scalar %6166, %int1_8696 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_8697 = torch.constant.int 1
    %6168 = torch.aten.mul.Scalar %6163, %int1_8697 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_8698 = torch.constant.int 1
    %6169 = torch.aten.add.Tensor %6167, %6168, %int1_8698 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_8699 = torch.constant.int 5
    %6170 = torch.prims.convert_element_type %6169, %int5_8699 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_8700 = torch.constant.int 1
    %int4096_8701 = torch.constant.int 4096
    %int3072_8702 = torch.constant.int 3072
    %6171 = torch.prim.ListConstruct %int1_8700, %int4096_8701, %int3072_8702 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6172 = torch.aten.view %6170, %6171 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Gated residual: %6174 = %5880 + %5958 * projection. %5958 ([1,1,3072]) broadcasts over the
    // 4096 rows; both %5880 and %5958 are defined before this excerpt.
    %6173 = torch.aten.mul.Tensor %5958, %6172 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_8703 = torch.constant.int 1
    %6174 = torch.aten.add.Tensor %5880, %6173, %int1_8703 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Normalize %6174 over its last dim and modulate it:
    //   %6184 = (1 + %5960) * ((%6174 - mean) * rsqrt(var + eps)) + %5959
    // %6175 holds the (1 + %5960) factor; %5959 and %5960 are defined before this excerpt.
    // NOTE(review): this matches a LayerNorm without learned affine parameters followed by
    // scale/shift modulation — confirm against the model definition.
    %int1_8704 = torch.constant.int 1
    %int1_8705 = torch.constant.int 1
    %6175 = torch.aten.add.Scalar %5960, %int1_8704, %int1_8705 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Statistics are computed in f32 over dim 2 with correction 0 and keepdim = true.
    %int6_8706 = torch.constant.int 6
    %6176 = torch.prims.convert_element_type %6174, %int6_8706 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %int2_8707 = torch.constant.int 2
    %6177 = torch.prim.ListConstruct %int2_8707 : (!torch.int) -> !torch.list<int>
    %int0_8708 = torch.constant.int 0
    %true_8709 = torch.constant.bool true
    %result0_8710, %result1_8711 = torch.aten.var_mean.correction %6176, %6177, %int0_8708, %true_8709 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // result0 is the variance and result1 the mean; eps is the literal 9.9999999999999995E-7.
    %float9.999990e-07_8712 = torch.constant.float 9.9999999999999995E-7
    %int1_8713 = torch.constant.int 1
    %6178 = torch.aten.add.Scalar %result0_8710, %float9.999990e-07_8712, %int1_8713 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %6179 = torch.aten.rsqrt %6178 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // The subtraction takes the f16 tensor %6174 and the f32 mean and produces an f32 result.
    %int1_8714 = torch.constant.int 1
    %6180 = torch.aten.sub.Tensor %6174, %result1_8711, %int1_8714 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %6181 = torch.aten.mul.Tensor %6180, %6179 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    %int5_8715 = torch.constant.int 5
    %6182 = torch.prims.convert_element_type %6181, %int5_8715 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Apply the modulation in f16.
    %6183 = torch.aten.mul.Tensor %6175, %6182 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_8716 = torch.constant.int 1
    %6184 = torch.aten.add.Tensor %6183, %5959, %int1_8716 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Two-layer MLP on the modulated 4096-row tensor %6184 using double_blocks.17.img_mlp.0 and
    // img_mlp.2, with a tanh-approximated GELU in between.
    // First linear (3072 -> 12288): flatten, transpose the weight, compute in f32, add the bias,
    // cast back to f16.
    %int4096_8717 = torch.constant.int 4096
    %int3072_8718 = torch.constant.int 3072
    %6185 = torch.prim.ListConstruct %int4096_8717, %int3072_8718 : (!torch.int, !torch.int) -> !torch.list<int>
    %6186 = torch.aten.view %6184, %6185 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.17.img_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.17.img_mlp.0.weight : tensor<12288x3072xf16>
    %6187 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.img_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_8719 = torch.constant.int 0
    %int1_8720 = torch.constant.int 1
    %6188 = torch.aten.transpose.int %6187, %int0_8719, %int1_8720 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.17.img_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.17.img_mlp.0.bias : tensor<12288xf16>
    %6189 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.img_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    %int6_8721 = torch.constant.int 6
    %6190 = torch.prims.convert_element_type %6189, %int6_8721 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_8722 = torch.constant.int 6
    %6191 = torch.prims.convert_element_type %6186, %int6_8722 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_8723 = torch.constant.int 6
    %6192 = torch.prims.convert_element_type %6188, %int6_8723 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %6193 = torch.aten.mm %6191, %6192 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[4096,12288],f32>
    %int1_8724 = torch.constant.int 1
    %6194 = torch.aten.mul.Scalar %6193, %int1_8724 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int1_8725 = torch.constant.int 1
    %6195 = torch.aten.mul.Scalar %6190, %int1_8725 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_8726 = torch.constant.int 1
    %6196 = torch.aten.add.Tensor %6194, %6195, %int1_8726 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int5_8727 = torch.constant.int 5
    %6197 = torch.prims.convert_element_type %6196, %int5_8727 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f16>
    %int1_8728 = torch.constant.int 1
    %int4096_8729 = torch.constant.int 4096
    %int12288_8730 = torch.constant.int 12288
    %6198 = torch.prim.ListConstruct %int1_8728, %int4096_8729, %int12288_8730 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6199 = torch.aten.view %6197, %6198 : !torch.vtensor<[4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,12288],f16>
    // GELU with approximate = "tanh", evaluated on the f16 tensor.
    %str_8731 = torch.constant.str "tanh"
    %6200 = torch.aten.gelu %6199, %str_8731 : !torch.vtensor<[1,4096,12288],f16>, !torch.str -> !torch.vtensor<[1,4096,12288],f16>
    // Second linear (12288 -> 3072), same f32-compute / f16-store pattern as the first.
    %int4096_8732 = torch.constant.int 4096
    %int12288_8733 = torch.constant.int 12288
    %6201 = torch.prim.ListConstruct %int4096_8732, %int12288_8733 : (!torch.int, !torch.int) -> !torch.list<int>
    %6202 = torch.aten.view %6200, %6201 : !torch.vtensor<[1,4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[4096,12288],f16>
    %__auto.sampler.double_blocks.17.img_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.17.img_mlp.2.weight : tensor<3072x12288xf16>
    %6203 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.img_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_8734 = torch.constant.int 0
    %int1_8735 = torch.constant.int 1
    %6204 = torch.aten.transpose.int %6203, %int0_8734, %int1_8735 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.17.img_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.17.img_mlp.2.bias : tensor<3072xf16>
    %6205 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.img_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_8736 = torch.constant.int 6
    %6206 = torch.prims.convert_element_type %6205, %int6_8736 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_8737 = torch.constant.int 6
    %6207 = torch.prims.convert_element_type %6202, %int6_8737 : !torch.vtensor<[4096,12288],f16>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int6_8738 = torch.constant.int 6
    %6208 = torch.prims.convert_element_type %6204, %int6_8738 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %6209 = torch.aten.mm %6207, %6208 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    %int1_8739 = torch.constant.int 1
    %6210 = torch.aten.mul.Scalar %6209, %int1_8739 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_8740 = torch.constant.int 1
    %6211 = torch.aten.mul.Scalar %6206, %int1_8740 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_8741 = torch.constant.int 1
    %6212 = torch.aten.add.Tensor %6210, %6211, %int1_8741 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_8742 = torch.constant.int 5
    %6213 = torch.prims.convert_element_type %6212, %int5_8742 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_8743 = torch.constant.int 1
    %int4096_8744 = torch.constant.int 4096
    %int3072_8745 = torch.constant.int 3072
    %6214 = torch.prim.ListConstruct %int1_8743, %int4096_8744, %int3072_8745 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6215 = torch.aten.view %6213, %6214 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Gated residual: %6217 = %6174 + %5961 * mlp_output (%5961 is defined before this excerpt).
    %6216 = torch.aten.mul.Tensor %5961, %6215 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_8746 = torch.constant.int 1
    %6217 = torch.aten.add.Tensor %6174, %6216, %int1_8746 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear projection of the 512-row attention slice %6155 with double_blocks.17.txt_attn.proj:
    // flatten to [512,3072], transpose the weight, compute matmul + bias in f32, cast back to f16
    // and restore the leading batch dim.
    %int512_8747 = torch.constant.int 512
    %int3072_8748 = torch.constant.int 3072
    %6218 = torch.prim.ListConstruct %int512_8747, %int3072_8748 : (!torch.int, !torch.int) -> !torch.list<int>
    %6219 = torch.aten.view %6155, %6218 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.17.txt_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.17.txt_attn.proj.weight : tensor<3072x3072xf16>
    %6220 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.txt_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_8749 = torch.constant.int 0
    %int1_8750 = torch.constant.int 1
    %6221 = torch.aten.transpose.int %6220, %int0_8749, %int1_8750 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.17.txt_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.17.txt_attn.proj.bias : tensor<3072xf16>
    %6222 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.txt_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_8751 = torch.constant.int 6
    %6223 = torch.prims.convert_element_type %6222, %int6_8751 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_8752 = torch.constant.int 6
    %6224 = torch.prims.convert_element_type %6219, %int6_8752 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_8753 = torch.constant.int 6
    %6225 = torch.prims.convert_element_type %6221, %int6_8753 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %6226 = torch.aten.mm %6224, %6225 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // The product and the bias are each multiplied by the integer constant 1 before the add.
    %int1_8754 = torch.constant.int 1
    %6227 = torch.aten.mul.Scalar %6226, %int1_8754 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_8755 = torch.constant.int 1
    %6228 = torch.aten.mul.Scalar %6223, %int1_8755 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_8756 = torch.constant.int 1
    %6229 = torch.aten.add.Tensor %6227, %6228, %int1_8756 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_8757 = torch.constant.int 5
    %6230 = torch.prims.convert_element_type %6229, %int5_8757 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_8758 = torch.constant.int 1
    %int512_8759 = torch.constant.int 512
    %int3072_8760 = torch.constant.int 3072
    %6231 = torch.prim.ListConstruct %int1_8758, %int512_8759, %int3072_8760 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6232 = torch.aten.view %6230, %6231 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Gated residual: %6234 = %5940 + %5979 * projection (%5940 and %5979 are defined before
    // this excerpt).
    %6233 = torch.aten.mul.Tensor %5979, %6232 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_8761 = torch.constant.int 1
    %6234 = torch.aten.add.Tensor %5940, %6233, %int1_8761 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Normalize %6234 over its last dim and modulate it:
    //   %6244 = (1 + %5981) * ((%6234 - mean) * rsqrt(var + eps)) + %5980
    // %6235 holds the (1 + %5981) factor; %5980 and %5981 are defined before this excerpt.
    // NOTE(review): same structure as the 4096-row normalization earlier in this function;
    // presumably an affine-free LayerNorm with scale/shift modulation — confirm.
    %int1_8762 = torch.constant.int 1
    %int1_8763 = torch.constant.int 1
    %6235 = torch.aten.add.Scalar %5981, %int1_8762, %int1_8763 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Statistics are computed in f32 over dim 2 with correction 0 and keepdim = true.
    %int6_8764 = torch.constant.int 6
    %6236 = torch.prims.convert_element_type %6234, %int6_8764 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %int2_8765 = torch.constant.int 2
    %6237 = torch.prim.ListConstruct %int2_8765 : (!torch.int) -> !torch.list<int>
    %int0_8766 = torch.constant.int 0
    %true_8767 = torch.constant.bool true
    %result0_8768, %result1_8769 = torch.aten.var_mean.correction %6236, %6237, %int0_8766, %true_8767 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // result0 is the variance and result1 the mean; eps is the literal 9.9999999999999995E-7.
    %float9.999990e-07_8770 = torch.constant.float 9.9999999999999995E-7
    %int1_8771 = torch.constant.int 1
    %6238 = torch.aten.add.Scalar %result0_8768, %float9.999990e-07_8770, %int1_8771 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %6239 = torch.aten.rsqrt %6238 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // The subtraction takes the f16 tensor %6234 and the f32 mean and produces an f32 result.
    %int1_8772 = torch.constant.int 1
    %6240 = torch.aten.sub.Tensor %6234, %result1_8769, %int1_8772 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %6241 = torch.aten.mul.Tensor %6240, %6239 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_8773 = torch.constant.int 5
    %6242 = torch.prims.convert_element_type %6241, %int5_8773 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Apply the modulation in f16.
    %6243 = torch.aten.mul.Tensor %6235, %6242 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_8774 = torch.constant.int 1
    %6244 = torch.aten.add.Tensor %6243, %5980, %int1_8774 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Two-layer MLP on the modulated 512-row tensor %6244 using double_blocks.17.txt_mlp.0 and
    // txt_mlp.2, with a tanh-approximated GELU in between.
    // First linear (3072 -> 12288): flatten, transpose the weight, compute in f32, add the bias,
    // cast back to f16.
    %int512_8775 = torch.constant.int 512
    %int3072_8776 = torch.constant.int 3072
    %6245 = torch.prim.ListConstruct %int512_8775, %int3072_8776 : (!torch.int, !torch.int) -> !torch.list<int>
    %6246 = torch.aten.view %6244, %6245 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.17.txt_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.17.txt_mlp.0.weight : tensor<12288x3072xf16>
    %6247 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.txt_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_8777 = torch.constant.int 0
    %int1_8778 = torch.constant.int 1
    %6248 = torch.aten.transpose.int %6247, %int0_8777, %int1_8778 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.17.txt_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.17.txt_mlp.0.bias : tensor<12288xf16>
    %6249 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.txt_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    %int6_8779 = torch.constant.int 6
    %6250 = torch.prims.convert_element_type %6249, %int6_8779 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_8780 = torch.constant.int 6
    %6251 = torch.prims.convert_element_type %6246, %int6_8780 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_8781 = torch.constant.int 6
    %6252 = torch.prims.convert_element_type %6248, %int6_8781 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %6253 = torch.aten.mm %6251, %6252 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[512,12288],f32>
    %int1_8782 = torch.constant.int 1
    %6254 = torch.aten.mul.Scalar %6253, %int1_8782 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int1_8783 = torch.constant.int 1
    %6255 = torch.aten.mul.Scalar %6250, %int1_8783 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_8784 = torch.constant.int 1
    %6256 = torch.aten.add.Tensor %6254, %6255, %int1_8784 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int5_8785 = torch.constant.int 5
    %6257 = torch.prims.convert_element_type %6256, %int5_8785 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f16>
    %int1_8786 = torch.constant.int 1
    %int512_8787 = torch.constant.int 512
    %int12288_8788 = torch.constant.int 12288
    %6258 = torch.prim.ListConstruct %int1_8786, %int512_8787, %int12288_8788 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6259 = torch.aten.view %6257, %6258 : !torch.vtensor<[512,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,512,12288],f16>
    // GELU with approximate = "tanh", evaluated on the f16 tensor.
    %str_8789 = torch.constant.str "tanh"
    %6260 = torch.aten.gelu %6259, %str_8789 : !torch.vtensor<[1,512,12288],f16>, !torch.str -> !torch.vtensor<[1,512,12288],f16>
    // Second linear (12288 -> 3072), same f32-compute / f16-store pattern as the first.
    %int512_8790 = torch.constant.int 512
    %int12288_8791 = torch.constant.int 12288
    %6261 = torch.prim.ListConstruct %int512_8790, %int12288_8791 : (!torch.int, !torch.int) -> !torch.list<int>
    %6262 = torch.aten.view %6260, %6261 : !torch.vtensor<[1,512,12288],f16>, !torch.list<int> -> !torch.vtensor<[512,12288],f16>
    %__auto.sampler.double_blocks.17.txt_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.17.txt_mlp.2.weight : tensor<3072x12288xf16>
    %6263 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.txt_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_8792 = torch.constant.int 0
    %int1_8793 = torch.constant.int 1
    %6264 = torch.aten.transpose.int %6263, %int0_8792, %int1_8793 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.17.txt_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.17.txt_mlp.2.bias : tensor<3072xf16>
    %6265 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.txt_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_8794 = torch.constant.int 6
    %6266 = torch.prims.convert_element_type %6265, %int6_8794 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_8795 = torch.constant.int 6
    %6267 = torch.prims.convert_element_type %6262, %int6_8795 : !torch.vtensor<[512,12288],f16>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int6_8796 = torch.constant.int 6
    %6268 = torch.prims.convert_element_type %6264, %int6_8796 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %6269 = torch.aten.mm %6267, %6268 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[512,3072],f32>
    %int1_8797 = torch.constant.int 1
    %6270 = torch.aten.mul.Scalar %6269, %int1_8797 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_8798 = torch.constant.int 1
    %6271 = torch.aten.mul.Scalar %6266, %int1_8798 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_8799 = torch.constant.int 1
    %6272 = torch.aten.add.Tensor %6270, %6271, %int1_8799 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_8800 = torch.constant.int 5
    %6273 = torch.prims.convert_element_type %6272, %int5_8800 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_8801 = torch.constant.int 1
    %int512_8802 = torch.constant.int 512
    %int3072_8803 = torch.constant.int 3072
    %6274 = torch.prim.ListConstruct %int1_8801, %int512_8802, %int3072_8803 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6275 = torch.aten.view %6273, %6274 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Gated residual: %6277 = %6234 + %5982 * mlp_output (%5982 is defined before this excerpt).
    %6276 = torch.aten.mul.Tensor %5982, %6275 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_8804 = torch.constant.int 1
    %6277 = torch.aten.add.Tensor %6234, %6276, %int1_8804 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %6278 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.double_blocks.18.img_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.18.img_mod.lin.weight : tensor<18432x3072xf16>
    %6279 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.img_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_8805 = torch.constant.int 0
    %int1_8806 = torch.constant.int 1
    %6280 = torch.aten.transpose.int %6279, %int0_8805, %int1_8806 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.18.img_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.18.img_mod.lin.bias : tensor<18432xf16>
    %6281 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.img_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    %int6_8807 = torch.constant.int 6
    %6282 = torch.prims.convert_element_type %6281, %int6_8807 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_8808 = torch.constant.int 6
    %6283 = torch.prims.convert_element_type %6278, %int6_8808 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_8809 = torch.constant.int 6
    %6284 = torch.prims.convert_element_type %6280, %int6_8809 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %6285 = torch.aten.mm %6283, %6284 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    %int1_8810 = torch.constant.int 1
    // Tail of a bias-add whose operands (%6285 : [1,18432] f32 and %6282 : [18432] f32)
    // are defined before this point.
    // NOTE(review): presumably the matmul result and bias of a preceding modulation
    // linear layer — confirm against the ops above.
    // Each operand is multiplied by the integer 1, the two are added (alpha = 1),
    // and the sum is converted to f16 (dtype code 5).
    %6286 = torch.aten.mul.Scalar %6285, %int1_8810 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_8811 = torch.constant.int 1
    %6287 = torch.aten.mul.Scalar %6282, %int1_8811 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_8812 = torch.constant.int 1
    %6288 = torch.aten.add.Tensor %6286, %6287, %int1_8812 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_8813 = torch.constant.int 5
    %6289 = torch.prims.convert_element_type %6288, %int5_8813 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (start 0, end INT64 max, step 1): the shape is unchanged.
    %int0_8814 = torch.constant.int 0
    %int0_8815 = torch.constant.int 0
    %int9223372036854775807_8816 = torch.constant.int 9223372036854775807
    %int1_8817 = torch.constant.int 1
    %6290 = torch.aten.slice.Tensor %6289, %int0_8814, %int0_8815, %int9223372036854775807_8816, %int1_8817 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a size-1 axis at position 1: [1,18432] -> [1,1,18432].
    %int1_8818 = torch.constant.int 1
    %6291 = torch.aten.unsqueeze %6290, %int1_8818 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Full-range slice on dim 2: again shape-preserving.
    %int2_8819 = torch.constant.int 2
    %int0_8820 = torch.constant.int 0
    %int9223372036854775807_8821 = torch.constant.int 9223372036854775807
    %int1_8822 = torch.constant.int 1
    %6292 = torch.aten.slice.Tensor %6291, %int2_8819, %int0_8820, %int9223372036854775807_8821, %int1_8822 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Split the last axis of %6292 into six consecutive 3072-wide chunks, %6293..%6298.
    // Chunk 0: [0, 3072) — added to the normalized image stream further down.
    %int-1_8823 = torch.constant.int -1
    %int0_8824 = torch.constant.int 0
    %int3072_8825 = torch.constant.int 3072
    %int1_8826 = torch.constant.int 1
    %6293 = torch.aten.slice.Tensor %6292, %int-1_8823, %int0_8824, %int3072_8825, %int1_8826 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1: [3072, 6144) — used as (1 + chunk) multiplier on the normalized image stream further down.
    %int-1_8827 = torch.constant.int -1
    %int3072_8828 = torch.constant.int 3072
    %int6144_8829 = torch.constant.int 6144
    %int1_8830 = torch.constant.int 1
    %6294 = torch.aten.slice.Tensor %6292, %int-1_8827, %int3072_8828, %int6144_8829, %int1_8830 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2: [6144, 9216).
    %int-1_8831 = torch.constant.int -1
    %int6144_8832 = torch.constant.int 6144
    %int9216_8833 = torch.constant.int 9216
    %int1_8834 = torch.constant.int 1
    %6295 = torch.aten.slice.Tensor %6292, %int-1_8831, %int6144_8832, %int9216_8833, %int1_8834 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 3: [9216, 12288).
    %int-1_8835 = torch.constant.int -1
    %int9216_8836 = torch.constant.int 9216
    %int12288_8837 = torch.constant.int 12288
    %int1_8838 = torch.constant.int 1
    %6296 = torch.aten.slice.Tensor %6292, %int-1_8835, %int9216_8836, %int12288_8837, %int1_8838 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 4: [12288, 15360).
    %int-1_8839 = torch.constant.int -1
    %int12288_8840 = torch.constant.int 12288
    %int15360_8841 = torch.constant.int 15360
    %int1_8842 = torch.constant.int 1
    %6297 = torch.aten.slice.Tensor %6292, %int-1_8839, %int12288_8840, %int15360_8841, %int1_8842 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 5: [15360, 18432).
    %int-1_8843 = torch.constant.int -1
    %int15360_8844 = torch.constant.int 15360
    %int18432_8845 = torch.constant.int 18432
    %int1_8846 = torch.constant.int 1
    %6298 = torch.aten.slice.Tensor %6292, %int-1_8843, %int15360_8844, %int18432_8845, %int1_8846 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // double_blocks.18.txt_mod.lin: SiLU on %119 ([1,3072] f16, defined earlier in the function),
    // followed by a linear projection to 18432 features.
    %6299 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the [18432,3072] weight and transpose it to [3072,18432] so it can be the right-hand matmul operand.
    %__auto.sampler.double_blocks.18.txt_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.18.txt_mod.lin.weight : tensor<18432x3072xf16>
    %6300 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.txt_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_8847 = torch.constant.int 0
    %int1_8848 = torch.constant.int 1
    %6301 = torch.aten.transpose.int %6300, %int0_8847, %int1_8848 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.18.txt_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.18.txt_mod.lin.bias : tensor<18432xf16>
    %6302 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.txt_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Bias, activation and weight are all converted to f32 (dtype code 6) before the matmul.
    %int6_8849 = torch.constant.int 6
    %6303 = torch.prims.convert_element_type %6302, %int6_8849 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_8850 = torch.constant.int 6
    %6304 = torch.prims.convert_element_type %6299, %int6_8850 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_8851 = torch.constant.int 6
    %6305 = torch.prims.convert_element_type %6301, %int6_8851 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    // [1,3072] x [3072,18432] -> [1,18432] in f32.
    %6306 = torch.aten.mm %6304, %6305 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Product and bias are each multiplied by the integer 1, then added (alpha = 1).
    %int1_8852 = torch.constant.int 1
    %6307 = torch.aten.mul.Scalar %6306, %int1_8852 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_8853 = torch.constant.int 1
    %6308 = torch.aten.mul.Scalar %6303, %int1_8853 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_8854 = torch.constant.int 1
    %6309 = torch.aten.add.Tensor %6307, %6308, %int1_8854 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    // Convert the result back to f16 (dtype code 5).
    %int5_8855 = torch.constant.int 5
    %6310 = torch.prims.convert_element_type %6309, %int5_8855 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (end = INT64 max): shape-preserving.
    %int0_8856 = torch.constant.int 0
    %int0_8857 = torch.constant.int 0
    %int9223372036854775807_8858 = torch.constant.int 9223372036854775807
    %int1_8859 = torch.constant.int 1
    %6311 = torch.aten.slice.Tensor %6310, %int0_8856, %int0_8857, %int9223372036854775807_8858, %int1_8859 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a size-1 axis at position 1: [1,18432] -> [1,1,18432].
    %int1_8860 = torch.constant.int 1
    %6312 = torch.aten.unsqueeze %6311, %int1_8860 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Full-range slice on dim 2: shape-preserving.
    %int2_8861 = torch.constant.int 2
    %int0_8862 = torch.constant.int 0
    %int9223372036854775807_8863 = torch.constant.int 9223372036854775807
    %int1_8864 = torch.constant.int 1
    %6313 = torch.aten.slice.Tensor %6312, %int2_8861, %int0_8862, %int9223372036854775807_8863, %int1_8864 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Split the last axis of %6313 into six consecutive 3072-wide chunks, %6314..%6319.
    // Chunk 0: [0, 3072) — added to the normalized text stream further down.
    %int-1_8865 = torch.constant.int -1
    %int0_8866 = torch.constant.int 0
    %int3072_8867 = torch.constant.int 3072
    %int1_8868 = torch.constant.int 1
    %6314 = torch.aten.slice.Tensor %6313, %int-1_8865, %int0_8866, %int3072_8867, %int1_8868 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1: [3072, 6144) — used as (1 + chunk) multiplier on the normalized text stream further down.
    %int-1_8869 = torch.constant.int -1
    %int3072_8870 = torch.constant.int 3072
    %int6144_8871 = torch.constant.int 6144
    %int1_8872 = torch.constant.int 1
    %6315 = torch.aten.slice.Tensor %6313, %int-1_8869, %int3072_8870, %int6144_8871, %int1_8872 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2: [6144, 9216).
    %int-1_8873 = torch.constant.int -1
    %int6144_8874 = torch.constant.int 6144
    %int9216_8875 = torch.constant.int 9216
    %int1_8876 = torch.constant.int 1
    %6316 = torch.aten.slice.Tensor %6313, %int-1_8873, %int6144_8874, %int9216_8875, %int1_8876 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 3: [9216, 12288).
    %int-1_8877 = torch.constant.int -1
    %int9216_8878 = torch.constant.int 9216
    %int12288_8879 = torch.constant.int 12288
    %int1_8880 = torch.constant.int 1
    %6317 = torch.aten.slice.Tensor %6313, %int-1_8877, %int9216_8878, %int12288_8879, %int1_8880 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 4: [12288, 15360).
    %int-1_8881 = torch.constant.int -1
    %int12288_8882 = torch.constant.int 12288
    %int15360_8883 = torch.constant.int 15360
    %int1_8884 = torch.constant.int 1
    %6318 = torch.aten.slice.Tensor %6313, %int-1_8881, %int12288_8882, %int15360_8883, %int1_8884 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 5: [15360, 18432).
    %int-1_8885 = torch.constant.int -1
    %int15360_8886 = torch.constant.int 15360
    %int18432_8887 = torch.constant.int 18432
    %int1_8888 = torch.constant.int 1
    %6319 = torch.aten.slice.Tensor %6313, %int-1_8885, %int15360_8886, %int18432_8887, %int1_8888 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Image stream: normalize %6217 ([1,4096,3072] f16, defined earlier) over its last axis,
    // then apply (1 + %6294) * normalized + %6293.
    // Statistics are computed in f32 (dtype code 6).
    %int6_8889 = torch.constant.int 6
    %6320 = torch.prims.convert_element_type %6217, %int6_8889 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // Variance (%result0_8893) and mean (%result1_8894) over dim 2, correction = 0, keepdim = true.
    %int2_8890 = torch.constant.int 2
    %6321 = torch.prim.ListConstruct %int2_8890 : (!torch.int) -> !torch.list<int>
    %int0_8891 = torch.constant.int 0
    %true_8892 = torch.constant.bool true
    %result0_8893, %result1_8894 = torch.aten.var_mean.correction %6320, %6321, %int0_8891, %true_8892 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(variance + eps), with eps being the double nearest to 1e-6.
    %float9.999990e-07_8895 = torch.constant.float 9.9999999999999995E-7
    %int1_8896 = torch.constant.int 1
    %6322 = torch.aten.add.Scalar %result0_8893, %float9.999990e-07_8895, %int1_8896 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %6323 = torch.aten.rsqrt %6322 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // The subtraction takes the original f16 tensor and the f32 mean; the result type is f32.
    %int1_8897 = torch.constant.int 1
    %6324 = torch.aten.sub.Tensor %6217, %result1_8894, %int1_8897 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %6325 = torch.aten.mul.Tensor %6324, %6323 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Back to f16 (dtype code 5) for the modulation arithmetic.
    %int5_8898 = torch.constant.int 5
    %6326 = torch.prims.convert_element_type %6325, %int5_8898 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // (1 + %6294), broadcast from [1,1,3072] across the 4096 rows.
    // NOTE(review): %6294 / %6293 look like the scale / shift terms of the modulation — confirm against the model definition.
    %int1_8899 = torch.constant.int 1
    %int1_8900 = torch.constant.int 1
    %6327 = torch.aten.add.Scalar %6294, %int1_8899, %int1_8900 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %6328 = torch.aten.mul.Tensor %6327, %6326 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_8901 = torch.constant.int 1
    %6329 = torch.aten.add.Tensor %6328, %6293, %int1_8901 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Drop the leading size-1 axis: [1,4096,3072] -> [4096,3072], ready for the 2-D matmul below.
    %int4096_8902 = torch.constant.int 4096
    %int3072_8903 = torch.constant.int 3072
    %6330 = torch.prim.ListConstruct %int4096_8902, %int3072_8903 : (!torch.int, !torch.int) -> !torch.list<int>
    %6331 = torch.aten.view %6329, %6330 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // double_blocks.18.img_attn.qkv: linear projection of the modulated image stream, 3072 -> 9216 features.
    // Load the [9216,3072] weight and transpose it to [3072,9216].
    %__auto.sampler.double_blocks.18.img_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.18.img_attn.qkv.weight : tensor<9216x3072xf16>
    %6332 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.img_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_8904 = torch.constant.int 0
    %int1_8905 = torch.constant.int 1
    %6333 = torch.aten.transpose.int %6332, %int0_8904, %int1_8905 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.18.img_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.18.img_attn.qkv.bias : tensor<9216xf16>
    %6334 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.img_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Bias, activation and weight are converted to f32 (dtype code 6) before the matmul.
    %int6_8906 = torch.constant.int 6
    %6335 = torch.prims.convert_element_type %6334, %int6_8906 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_8907 = torch.constant.int 6
    %6336 = torch.prims.convert_element_type %6331, %int6_8907 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_8908 = torch.constant.int 6
    %6337 = torch.prims.convert_element_type %6333, %int6_8908 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    // [4096,3072] x [3072,9216] -> [4096,9216] in f32.
    %6338 = torch.aten.mm %6336, %6337 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[4096,9216],f32>
    // Product and bias are each multiplied by the integer 1, then added (alpha = 1).
    %int1_8909 = torch.constant.int 1
    %6339 = torch.aten.mul.Scalar %6338, %int1_8909 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int1_8910 = torch.constant.int 1
    %6340 = torch.aten.mul.Scalar %6335, %int1_8910 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_8911 = torch.constant.int 1
    %6341 = torch.aten.add.Tensor %6339, %6340, %int1_8911 : !torch.vtensor<[4096,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    // Convert back to f16 (dtype code 5) and restore the leading size-1 axis: [4096,9216] -> [1,4096,9216].
    %int5_8912 = torch.constant.int 5
    %6342 = torch.prims.convert_element_type %6341, %int5_8912 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f16>
    %int1_8913 = torch.constant.int 1
    %int4096_8914 = torch.constant.int 4096
    %int9216_8915 = torch.constant.int 9216
    %6343 = torch.prim.ListConstruct %int1_8913, %int4096_8914, %int9216_8915 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6344 = torch.aten.view %6342, %6343 : !torch.vtensor<[4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,9216],f16>
    // Trace point: the tensor leaves the torch dialect, is cast to a fully dynamic type,
    // its three dimensions are queried, and it is handed to flow.tensor.trace under the key "img_qkv".
    %6345 = torch_c.to_builtin_tensor %6344 : !torch.vtensor<[1,4096,9216],f16> -> tensor<1x4096x9216xf16>
    %cast_8916 = tensor.cast %6345 : tensor<1x4096x9216xf16> to tensor<?x?x?xf16>
    %c0_8917 = arith.constant 0 : index
    %dim_8918 = tensor.dim %cast_8916, %c0_8917 : tensor<?x?x?xf16>
    %c1_8919 = arith.constant 1 : index
    %dim_8920 = tensor.dim %cast_8916, %c1_8919 : tensor<?x?x?xf16>
    %c2_8921 = arith.constant 2 : index
    %dim_8922 = tensor.dim %cast_8916, %c2_8921 : tensor<?x?x?xf16>
    flow.tensor.trace "img_qkv" = [%cast_8916 : tensor<?x?x?xf16>{%dim_8918, %dim_8920, %dim_8922}]
    // Cast back to the static type and re-enter the torch dialect; %6346 is what the ops below consume.
    %cast_8923 = tensor.cast %cast_8916 : tensor<?x?x?xf16> to tensor<1x4096x9216xf16>
    %6346 = torch_c.from_builtin_tensor %cast_8923 : tensor<1x4096x9216xf16> -> !torch.vtensor<[1,4096,9216],f16>
    // Image stream, slot 0 of the fused projection (the query, judging by the query_norm scale applied at the end).
    // View [1,4096,9216] as [1,4096,3,24,128].
    %int1_8924 = torch.constant.int 1
    %int4096_8925 = torch.constant.int 4096
    %int3_8926 = torch.constant.int 3
    %int24_8927 = torch.constant.int 24
    %int128_8928 = torch.constant.int 128
    %6347 = torch.prim.ListConstruct %int1_8924, %int4096_8925, %int3_8926, %int24_8927, %int128_8928 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6348 = torch.aten.view %6346, %6347 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    // Permute with order (2,0,3,1,4): result is [3,1,24,4096,128].
    %int2_8929 = torch.constant.int 2
    %int0_8930 = torch.constant.int 0
    %int3_8931 = torch.constant.int 3
    %int1_8932 = torch.constant.int 1
    %int4_8933 = torch.constant.int 4
    %6349 = torch.prim.ListConstruct %int2_8929, %int0_8930, %int3_8931, %int1_8932, %int4_8933 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6350 = torch.aten.permute %6348, %6349 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // Take index 0 along dim 0 -> [1,24,4096,128].
    %int0_8934 = torch.constant.int 0
    %int0_8935 = torch.constant.int 0
    %6351 = torch.aten.select.int %6350, %int0_8934, %int0_8935 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // RMS normalization over the last axis, computed in f32: x * rsqrt(mean(x^2, dim=-1, keepdim) + eps).
    %int6_8936 = torch.constant.int 6
    %6352 = torch.prims.convert_element_type %6351, %int6_8936 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_8937 = torch.constant.int 2
    %6353 = torch.aten.pow.Tensor_Scalar %6352, %int2_8937 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_8938 = torch.constant.int -1
    %6354 = torch.prim.ListConstruct %int-1_8938 : (!torch.int) -> !torch.list<int>
    %true_8939 = torch.constant.bool true
    %none_8940 = torch.constant.none
    %6355 = torch.aten.mean.dim %6353, %6354, %true_8939, %none_8940 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    // eps is the double nearest to 1e-6.
    %float9.999990e-07_8941 = torch.constant.float 9.9999999999999995E-7
    %int1_8942 = torch.constant.int 1
    %6356 = torch.aten.add.Scalar %6355, %float9.999990e-07_8941, %int1_8942 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %6357 = torch.aten.rsqrt %6356 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %6358 = torch.aten.mul.Tensor %6352, %6357 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Back to f16 (dtype code 5), then multiply by the learned [128] scale, broadcast over the last axis.
    %int5_8943 = torch.constant.int 5
    %6359 = torch.prims.convert_element_type %6358, %int5_8943 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.18.img_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.18.img_attn.norm.query_norm.scale : tensor<128xf16>
    %6360 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.img_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %6361 = torch.aten.mul.Tensor %6359, %6360 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // Image stream, slot 1 of the fused projection (the key, judging by the key_norm scale applied at the end).
    // The view and permute of %6346 are repeated here rather than reusing %6348 / %6350.
    %int1_8944 = torch.constant.int 1
    %int4096_8945 = torch.constant.int 4096
    %int3_8946 = torch.constant.int 3
    %int24_8947 = torch.constant.int 24
    %int128_8948 = torch.constant.int 128
    %6362 = torch.prim.ListConstruct %int1_8944, %int4096_8945, %int3_8946, %int24_8947, %int128_8948 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6363 = torch.aten.view %6346, %6362 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    // Permute with order (2,0,3,1,4): result is [3,1,24,4096,128].
    %int2_8949 = torch.constant.int 2
    %int0_8950 = torch.constant.int 0
    %int3_8951 = torch.constant.int 3
    %int1_8952 = torch.constant.int 1
    %int4_8953 = torch.constant.int 4
    %6364 = torch.prim.ListConstruct %int2_8949, %int0_8950, %int3_8951, %int1_8952, %int4_8953 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6365 = torch.aten.permute %6363, %6364 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // Take index 1 along dim 0 -> [1,24,4096,128].
    %int0_8954 = torch.constant.int 0
    %int1_8955 = torch.constant.int 1
    %6366 = torch.aten.select.int %6365, %int0_8954, %int1_8955 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // RMS normalization over the last axis, computed in f32: x * rsqrt(mean(x^2, dim=-1, keepdim) + eps).
    %int6_8956 = torch.constant.int 6
    %6367 = torch.prims.convert_element_type %6366, %int6_8956 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_8957 = torch.constant.int 2
    %6368 = torch.aten.pow.Tensor_Scalar %6367, %int2_8957 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_8958 = torch.constant.int -1
    %6369 = torch.prim.ListConstruct %int-1_8958 : (!torch.int) -> !torch.list<int>
    %true_8959 = torch.constant.bool true
    %none_8960 = torch.constant.none
    %6370 = torch.aten.mean.dim %6368, %6369, %true_8959, %none_8960 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    // eps is the double nearest to 1e-6.
    %float9.999990e-07_8961 = torch.constant.float 9.9999999999999995E-7
    %int1_8962 = torch.constant.int 1
    %6371 = torch.aten.add.Scalar %6370, %float9.999990e-07_8961, %int1_8962 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %6372 = torch.aten.rsqrt %6371 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %6373 = torch.aten.mul.Tensor %6367, %6372 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Back to f16 (dtype code 5), then multiply by the learned [128] scale, broadcast over the last axis.
    %int5_8963 = torch.constant.int 5
    %6374 = torch.prims.convert_element_type %6373, %int5_8963 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.18.img_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.18.img_attn.norm.key_norm.scale : tensor<128xf16>
    %6375 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.img_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %6376 = torch.aten.mul.Tensor %6374, %6375 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // Convert the normalized image tensors %6361 and %6376 to dtype code 5 (f16).
    // Both inputs are already f16, so the element type does not change.
    // The results %6377 / %6378 feed the txt+img concatenations further down.
    %int5_8964 = torch.constant.int 5
    %6377 = torch.prims.convert_element_type %6361, %int5_8964 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_8965 = torch.constant.int 5
    %6378 = torch.prims.convert_element_type %6376, %int5_8965 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Text stream: normalize %6277 ([1,512,3072] f16, defined earlier) over its last axis,
    // then apply (1 + %6315) * normalized + %6314.
    // Statistics are computed in f32 (dtype code 6).
    %int6_8966 = torch.constant.int 6
    %6379 = torch.prims.convert_element_type %6277, %int6_8966 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // Variance (%result0_8970) and mean (%result1_8971) over dim 2, correction = 0, keepdim = true.
    %int2_8967 = torch.constant.int 2
    %6380 = torch.prim.ListConstruct %int2_8967 : (!torch.int) -> !torch.list<int>
    %int0_8968 = torch.constant.int 0
    %true_8969 = torch.constant.bool true
    %result0_8970, %result1_8971 = torch.aten.var_mean.correction %6379, %6380, %int0_8968, %true_8969 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(variance + eps), with eps being the double nearest to 1e-6.
    %float9.999990e-07_8972 = torch.constant.float 9.9999999999999995E-7
    %int1_8973 = torch.constant.int 1
    %6381 = torch.aten.add.Scalar %result0_8970, %float9.999990e-07_8972, %int1_8973 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %6382 = torch.aten.rsqrt %6381 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // The subtraction takes the original f16 tensor and the f32 mean; the result type is f32.
    %int1_8974 = torch.constant.int 1
    %6383 = torch.aten.sub.Tensor %6277, %result1_8971, %int1_8974 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %6384 = torch.aten.mul.Tensor %6383, %6382 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    // Back to f16 (dtype code 5) for the modulation arithmetic.
    %int5_8975 = torch.constant.int 5
    %6385 = torch.prims.convert_element_type %6384, %int5_8975 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // (1 + %6315), broadcast from [1,1,3072] across the 512 rows.
    // NOTE(review): %6315 / %6314 look like the scale / shift terms of the modulation — confirm against the model definition.
    %int1_8976 = torch.constant.int 1
    %int1_8977 = torch.constant.int 1
    %6386 = torch.aten.add.Scalar %6315, %int1_8976, %int1_8977 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %6387 = torch.aten.mul.Tensor %6386, %6385 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_8978 = torch.constant.int 1
    %6388 = torch.aten.add.Tensor %6387, %6314, %int1_8978 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Drop the leading size-1 axis: [1,512,3072] -> [512,3072], ready for the 2-D matmul below.
    %int512_8979 = torch.constant.int 512
    %int3072_8980 = torch.constant.int 3072
    %6389 = torch.prim.ListConstruct %int512_8979, %int3072_8980 : (!torch.int, !torch.int) -> !torch.list<int>
    %6390 = torch.aten.view %6388, %6389 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // double_blocks.18.txt_attn.qkv: linear projection of the modulated text stream, 3072 -> 9216 features.
    // Load the [9216,3072] weight and transpose it to [3072,9216].
    %__auto.sampler.double_blocks.18.txt_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.18.txt_attn.qkv.weight : tensor<9216x3072xf16>
    %6391 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.txt_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_8981 = torch.constant.int 0
    %int1_8982 = torch.constant.int 1
    %6392 = torch.aten.transpose.int %6391, %int0_8981, %int1_8982 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.18.txt_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.18.txt_attn.qkv.bias : tensor<9216xf16>
    %6393 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.txt_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Bias, activation and weight are converted to f32 (dtype code 6) before the matmul.
    %int6_8983 = torch.constant.int 6
    %6394 = torch.prims.convert_element_type %6393, %int6_8983 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_8984 = torch.constant.int 6
    %6395 = torch.prims.convert_element_type %6390, %int6_8984 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_8985 = torch.constant.int 6
    %6396 = torch.prims.convert_element_type %6392, %int6_8985 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    // [512,3072] x [3072,9216] -> [512,9216] in f32.
    %6397 = torch.aten.mm %6395, %6396 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[512,9216],f32>
    // Product and bias are each multiplied by the integer 1, then added (alpha = 1).
    %int1_8986 = torch.constant.int 1
    %6398 = torch.aten.mul.Scalar %6397, %int1_8986 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int1_8987 = torch.constant.int 1
    %6399 = torch.aten.mul.Scalar %6394, %int1_8987 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_8988 = torch.constant.int 1
    %6400 = torch.aten.add.Tensor %6398, %6399, %int1_8988 : !torch.vtensor<[512,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    // Convert back to f16 (dtype code 5) and restore the leading size-1 axis: [512,9216] -> [1,512,9216].
    %int5_8989 = torch.constant.int 5
    %6401 = torch.prims.convert_element_type %6400, %int5_8989 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f16>
    %int1_8990 = torch.constant.int 1
    %int512_8991 = torch.constant.int 512
    %int9216_8992 = torch.constant.int 9216
    %6402 = torch.prim.ListConstruct %int1_8990, %int512_8991, %int9216_8992 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6403 = torch.aten.view %6401, %6402 : !torch.vtensor<[512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,9216],f16>
    // Trace point: the tensor leaves the torch dialect, is cast to a fully dynamic type,
    // its three dimensions are queried, and it is handed to flow.tensor.trace under the key "txt_qkv".
    %6404 = torch_c.to_builtin_tensor %6403 : !torch.vtensor<[1,512,9216],f16> -> tensor<1x512x9216xf16>
    %cast_8993 = tensor.cast %6404 : tensor<1x512x9216xf16> to tensor<?x?x?xf16>
    %c0_8994 = arith.constant 0 : index
    %dim_8995 = tensor.dim %cast_8993, %c0_8994 : tensor<?x?x?xf16>
    %c1_8996 = arith.constant 1 : index
    %dim_8997 = tensor.dim %cast_8993, %c1_8996 : tensor<?x?x?xf16>
    %c2_8998 = arith.constant 2 : index
    %dim_8999 = tensor.dim %cast_8993, %c2_8998 : tensor<?x?x?xf16>
    flow.tensor.trace "txt_qkv" = [%cast_8993 : tensor<?x?x?xf16>{%dim_8995, %dim_8997, %dim_8999}]
    // Cast back to the static type and re-enter the torch dialect; %6405 is what the ops below consume.
    %cast_9000 = tensor.cast %cast_8993 : tensor<?x?x?xf16> to tensor<1x512x9216xf16>
    %6405 = torch_c.from_builtin_tensor %cast_9000 : tensor<1x512x9216xf16> -> !torch.vtensor<[1,512,9216],f16>
    // Text stream, slot 0 of the fused projection (the query, judging by the query_norm scale applied at the end).
    // View [1,512,9216] as [1,512,3,24,128].
    %int1_9001 = torch.constant.int 1
    %int512_9002 = torch.constant.int 512
    %int3_9003 = torch.constant.int 3
    %int24_9004 = torch.constant.int 24
    %int128_9005 = torch.constant.int 128
    %6406 = torch.prim.ListConstruct %int1_9001, %int512_9002, %int3_9003, %int24_9004, %int128_9005 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6407 = torch.aten.view %6405, %6406 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    // Permute with order (2,0,3,1,4): result is [3,1,24,512,128].
    %int2_9006 = torch.constant.int 2
    %int0_9007 = torch.constant.int 0
    %int3_9008 = torch.constant.int 3
    %int1_9009 = torch.constant.int 1
    %int4_9010 = torch.constant.int 4
    %6408 = torch.prim.ListConstruct %int2_9006, %int0_9007, %int3_9008, %int1_9009, %int4_9010 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6409 = torch.aten.permute %6407, %6408 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // Take index 0 along dim 0 -> [1,24,512,128].
    %int0_9011 = torch.constant.int 0
    %int0_9012 = torch.constant.int 0
    %6410 = torch.aten.select.int %6409, %int0_9011, %int0_9012 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // RMS normalization over the last axis, computed in f32: x * rsqrt(mean(x^2, dim=-1, keepdim) + eps).
    %int6_9013 = torch.constant.int 6
    %6411 = torch.prims.convert_element_type %6410, %int6_9013 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_9014 = torch.constant.int 2
    %6412 = torch.aten.pow.Tensor_Scalar %6411, %int2_9014 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_9015 = torch.constant.int -1
    %6413 = torch.prim.ListConstruct %int-1_9015 : (!torch.int) -> !torch.list<int>
    %true_9016 = torch.constant.bool true
    %none_9017 = torch.constant.none
    %6414 = torch.aten.mean.dim %6412, %6413, %true_9016, %none_9017 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    // eps is the double nearest to 1e-6.
    %float9.999990e-07_9018 = torch.constant.float 9.9999999999999995E-7
    %int1_9019 = torch.constant.int 1
    %6415 = torch.aten.add.Scalar %6414, %float9.999990e-07_9018, %int1_9019 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %6416 = torch.aten.rsqrt %6415 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %6417 = torch.aten.mul.Tensor %6411, %6416 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    // Back to f16 (dtype code 5), then multiply by the learned [128] scale, broadcast over the last axis.
    %int5_9020 = torch.constant.int 5
    %6418 = torch.prims.convert_element_type %6417, %int5_9020 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.18.txt_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.18.txt_attn.norm.query_norm.scale : tensor<128xf16>
    %6419 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.txt_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %6420 = torch.aten.mul.Tensor %6418, %6419 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // Text stream, slot 1 of the fused projection (the key, judging by the key_norm scale applied at the end).
    // The view and permute of %6405 are repeated here rather than reusing %6407 / %6409.
    %int1_9021 = torch.constant.int 1
    %int512_9022 = torch.constant.int 512
    %int3_9023 = torch.constant.int 3
    %int24_9024 = torch.constant.int 24
    %int128_9025 = torch.constant.int 128
    %6421 = torch.prim.ListConstruct %int1_9021, %int512_9022, %int3_9023, %int24_9024, %int128_9025 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6422 = torch.aten.view %6405, %6421 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    // Permute with order (2,0,3,1,4): result is [3,1,24,512,128].
    %int2_9026 = torch.constant.int 2
    %int0_9027 = torch.constant.int 0
    %int3_9028 = torch.constant.int 3
    %int1_9029 = torch.constant.int 1
    %int4_9030 = torch.constant.int 4
    %6423 = torch.prim.ListConstruct %int2_9026, %int0_9027, %int3_9028, %int1_9029, %int4_9030 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6424 = torch.aten.permute %6422, %6423 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // Take index 1 along dim 0 -> [1,24,512,128].
    %int0_9031 = torch.constant.int 0
    %int1_9032 = torch.constant.int 1
    %6425 = torch.aten.select.int %6424, %int0_9031, %int1_9032 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // RMS normalization over the last axis, computed in f32: x * rsqrt(mean(x^2, dim=-1, keepdim) + eps).
    %int6_9033 = torch.constant.int 6
    %6426 = torch.prims.convert_element_type %6425, %int6_9033 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_9034 = torch.constant.int 2
    %6427 = torch.aten.pow.Tensor_Scalar %6426, %int2_9034 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_9035 = torch.constant.int -1
    %6428 = torch.prim.ListConstruct %int-1_9035 : (!torch.int) -> !torch.list<int>
    %true_9036 = torch.constant.bool true
    %none_9037 = torch.constant.none
    %6429 = torch.aten.mean.dim %6427, %6428, %true_9036, %none_9037 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    // eps is the double nearest to 1e-6.
    %float9.999990e-07_9038 = torch.constant.float 9.9999999999999995E-7
    %int1_9039 = torch.constant.int 1
    %6430 = torch.aten.add.Scalar %6429, %float9.999990e-07_9038, %int1_9039 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %6431 = torch.aten.rsqrt %6430 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %6432 = torch.aten.mul.Tensor %6426, %6431 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    // Back to f16 (dtype code 5), then multiply by the learned [128] scale, broadcast over the last axis.
    %int5_9040 = torch.constant.int 5
    %6433 = torch.prims.convert_element_type %6432, %int5_9040 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.18.txt_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.18.txt_attn.norm.key_norm.scale : tensor<128xf16>
    %6434 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.txt_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %6435 = torch.aten.mul.Tensor %6433, %6434 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // f16 -> f16 conversions of %6420 and %6435: operand and result element
    // types are identical, so the element type is unchanged.
    %int5_9041 = torch.constant.int 5
    %6436 = torch.prims.convert_element_type %6420, %int5_9041 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int5_9042 = torch.constant.int 5
    %6437 = torch.prims.convert_element_type %6435, %int5_9042 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Concatenate the 512-row tensor with the 4096-row tensor %6377 (defined
    // earlier) along dim 2 -> [1,24,4608,128]; the 512 rows come first.
    %6438 = torch.prim.ListConstruct %6436, %6377 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_9043 = torch.constant.int 2
    %6439 = torch.aten.cat %6438, %int2_9043 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Same concatenation for the normalized tensor %6437 with %6378 (defined earlier).
    %6440 = torch.prim.ListConstruct %6437, %6378 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_9044 = torch.constant.int 2
    %6441 = torch.aten.cat %6440, %int2_9044 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // 512-row stream: view %6405 [1,512,9216] as [1,512,3,24,128], permute with
    // order (2,0,3,1,4) to [3,1,24,512,128], and take index 2 along dim 0.
    // NOTE(review): index 2 is presumably the value slice of a packed q/k/v
    // projection -- confirm against the model source.
    %int1_9045 = torch.constant.int 1
    %int512_9046 = torch.constant.int 512
    %int3_9047 = torch.constant.int 3
    %int24_9048 = torch.constant.int 24
    %int128_9049 = torch.constant.int 128
    %6442 = torch.prim.ListConstruct %int1_9045, %int512_9046, %int3_9047, %int24_9048, %int128_9049 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6443 = torch.aten.view %6405, %6442 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_9050 = torch.constant.int 2
    %int0_9051 = torch.constant.int 0
    %int3_9052 = torch.constant.int 3
    %int1_9053 = torch.constant.int 1
    %int4_9054 = torch.constant.int 4
    %6444 = torch.prim.ListConstruct %int2_9050, %int0_9051, %int3_9052, %int1_9053, %int4_9054 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6445 = torch.aten.permute %6443, %6444 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    %int0_9055 = torch.constant.int 0
    %int2_9056 = torch.constant.int 2
    %6446 = torch.aten.select.int %6445, %int0_9055, %int2_9056 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // 4096-row stream: the same view / permute / select(index 2) applied to
    // %6346 [1,4096,9216].
    %int1_9057 = torch.constant.int 1
    %int4096_9058 = torch.constant.int 4096
    %int3_9059 = torch.constant.int 3
    %int24_9060 = torch.constant.int 24
    %int128_9061 = torch.constant.int 128
    %6447 = torch.prim.ListConstruct %int1_9057, %int4096_9058, %int3_9059, %int24_9060, %int128_9061 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6448 = torch.aten.view %6346, %6447 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_9062 = torch.constant.int 2
    %int0_9063 = torch.constant.int 0
    %int3_9064 = torch.constant.int 3
    %int1_9065 = torch.constant.int 1
    %int4_9066 = torch.constant.int 4
    %6449 = torch.prim.ListConstruct %int2_9062, %int0_9063, %int3_9064, %int1_9065, %int4_9066 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6450 = torch.aten.permute %6448, %6449 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    %int0_9067 = torch.constant.int 0
    %int2_9068 = torch.constant.int 2
    %6451 = torch.aten.select.int %6450, %int0_9067, %int2_9068 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Concatenate along dim 2 (512 rows first, then 4096) -> [1,24,4608,128].
    %6452 = torch.prim.ListConstruct %6446, %6451 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_9069 = torch.constant.int 2
    %6453 = torch.aten.cat %6452, %int2_9069 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Trace the three attention inputs. Each tensor goes through the same
    // round trip: torch vtensor -> builtin tensor -> cast to fully dynamic
    // shape -> query the four dims -> flow.tensor.trace -> cast back to the
    // static shape -> torch vtensor. The values themselves are not modified.

    // Trace "q" (from %6439).
    %6454 = torch_c.to_builtin_tensor %6439 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_9070 = tensor.cast %6454 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_9071 = arith.constant 0 : index
    %dim_9072 = tensor.dim %cast_9070, %c0_9071 : tensor<?x?x?x?xf16>
    %c1_9073 = arith.constant 1 : index
    %dim_9074 = tensor.dim %cast_9070, %c1_9073 : tensor<?x?x?x?xf16>
    %c2_9075 = arith.constant 2 : index
    %dim_9076 = tensor.dim %cast_9070, %c2_9075 : tensor<?x?x?x?xf16>
    %c3_9077 = arith.constant 3 : index
    %dim_9078 = tensor.dim %cast_9070, %c3_9077 : tensor<?x?x?x?xf16>
    flow.tensor.trace "q" = [%cast_9070 : tensor<?x?x?x?xf16>{%dim_9072, %dim_9074, %dim_9076, %dim_9078}]
    %cast_9079 = tensor.cast %cast_9070 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %6455 = torch_c.from_builtin_tensor %cast_9079 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Trace "k" (from %6441).
    %6456 = torch_c.to_builtin_tensor %6441 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_9080 = tensor.cast %6456 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_9081 = arith.constant 0 : index
    %dim_9082 = tensor.dim %cast_9080, %c0_9081 : tensor<?x?x?x?xf16>
    %c1_9083 = arith.constant 1 : index
    %dim_9084 = tensor.dim %cast_9080, %c1_9083 : tensor<?x?x?x?xf16>
    %c2_9085 = arith.constant 2 : index
    %dim_9086 = tensor.dim %cast_9080, %c2_9085 : tensor<?x?x?x?xf16>
    %c3_9087 = arith.constant 3 : index
    %dim_9088 = tensor.dim %cast_9080, %c3_9087 : tensor<?x?x?x?xf16>
    flow.tensor.trace "k" = [%cast_9080 : tensor<?x?x?x?xf16>{%dim_9082, %dim_9084, %dim_9086, %dim_9088}]
    %cast_9089 = tensor.cast %cast_9080 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %6457 = torch_c.from_builtin_tensor %cast_9089 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Trace "v" (from %6453).
    %6458 = torch_c.to_builtin_tensor %6453 : !torch.vtensor<[1,24,4608,128],f16> -> tensor<1x24x4608x128xf16>
    %cast_9090 = tensor.cast %6458 : tensor<1x24x4608x128xf16> to tensor<?x?x?x?xf16>
    %c0_9091 = arith.constant 0 : index
    %dim_9092 = tensor.dim %cast_9090, %c0_9091 : tensor<?x?x?x?xf16>
    %c1_9093 = arith.constant 1 : index
    %dim_9094 = tensor.dim %cast_9090, %c1_9093 : tensor<?x?x?x?xf16>
    %c2_9095 = arith.constant 2 : index
    %dim_9096 = tensor.dim %cast_9090, %c2_9095 : tensor<?x?x?x?xf16>
    %c3_9097 = arith.constant 3 : index
    %dim_9098 = tensor.dim %cast_9090, %c3_9097 : tensor<?x?x?x?xf16>
    flow.tensor.trace "v" = [%cast_9090 : tensor<?x?x?x?xf16>{%dim_9092, %dim_9094, %dim_9096, %dim_9098}]
    %cast_9099 = tensor.cast %cast_9090 : tensor<?x?x?x?xf16> to tensor<1x24x4608x128xf16>
    %6459 = torch_c.from_builtin_tensor %cast_9099 : tensor<1x24x4608x128xf16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Apply the per-position 2x2 coefficients in %211 ([1,1,4608,64,2,2]) to
    // q (%6455) and k (%6457). Each input is cast to f32 and viewed as
    // [1,24,4608,64,1,2], i.e. the 128-wide last dim becomes 64 pairs. For
    // each pair: out[..., i] = %211[..., i, 0] * x[..., 0] + %211[..., i, 1] * x[..., 1].
    // NOTE(review): this matches a rotary position embedding in matrix form;
    // presumably %211 holds the precomputed rotation terms -- confirm where
    // %211 is defined.
    %int6_9100 = torch.constant.int 6
    %6460 = torch.prims.convert_element_type %6455, %int6_9100 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_9101 = torch.constant.int 1
    %int24_9102 = torch.constant.int 24
    %int4608_9103 = torch.constant.int 4608
    %int-1_9104 = torch.constant.int -1
    %int1_9105 = torch.constant.int 1
    %int2_9106 = torch.constant.int 2
    %6461 = torch.prim.ListConstruct %int1_9101, %int24_9102, %int4608_9103, %int-1_9104, %int1_9105, %int2_9106 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6462 = torch.aten.view %6460, %6461 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    %int6_9107 = torch.constant.int 6
    %6463 = torch.prims.convert_element_type %6457, %int6_9107 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_9108 = torch.constant.int 1
    %int24_9109 = torch.constant.int 24
    %int4608_9110 = torch.constant.int 4608
    %int-1_9111 = torch.constant.int -1
    %int1_9112 = torch.constant.int 1
    %int2_9113 = torch.constant.int 2
    %6464 = torch.prim.ListConstruct %int1_9108, %int24_9109, %int4608_9110, %int-1_9111, %int1_9112, %int2_9113 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6465 = torch.aten.view %6463, %6464 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // q: first term, %211[..., 0] * q[..., 0]; the size-1 dims broadcast to
    // [1,24,4608,64,2].
    %int5_9114 = torch.constant.int 5
    %int0_9115 = torch.constant.int 0
    %6466 = torch.aten.select.int %211, %int5_9114, %int0_9115 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9116 = torch.constant.int 5
    %int0_9117 = torch.constant.int 0
    %6467 = torch.aten.select.int %6462, %int5_9116, %int0_9117 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6468 = torch.aten.mul.Tensor %6466, %6467 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // q: second term, %211[..., 1] * q[..., 1], then the sum of both terms.
    %int5_9118 = torch.constant.int 5
    %int1_9119 = torch.constant.int 1
    %6469 = torch.aten.select.int %211, %int5_9118, %int1_9119 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9120 = torch.constant.int 5
    %int1_9121 = torch.constant.int 1
    %6470 = torch.aten.select.int %6462, %int5_9120, %int1_9121 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6471 = torch.aten.mul.Tensor %6469, %6470 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_9122 = torch.constant.int 1
    %6472 = torch.aten.add.Tensor %6468, %6471, %int1_9122 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k: the same two-term computation using %6465.
    %int5_9123 = torch.constant.int 5
    %int0_9124 = torch.constant.int 0
    %6473 = torch.aten.select.int %211, %int5_9123, %int0_9124 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9125 = torch.constant.int 5
    %int0_9126 = torch.constant.int 0
    %6474 = torch.aten.select.int %6465, %int5_9125, %int0_9126 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6475 = torch.aten.mul.Tensor %6473, %6474 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_9127 = torch.constant.int 5
    %int1_9128 = torch.constant.int 1
    %6476 = torch.aten.select.int %211, %int5_9127, %int1_9128 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9129 = torch.constant.int 5
    %int1_9130 = torch.constant.int 1
    %6477 = torch.aten.select.int %6465, %int5_9129, %int1_9130 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6478 = torch.aten.mul.Tensor %6476, %6477 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_9131 = torch.constant.int 1
    %6479 = torch.aten.add.Tensor %6475, %6478, %int1_9131 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Collapse the (64,2) pairs back to 128 and cast to f16: %6482 is the
    // transformed q, %6485 the transformed k.
    %int1_9132 = torch.constant.int 1
    %int24_9133 = torch.constant.int 24
    %int4608_9134 = torch.constant.int 4608
    %int128_9135 = torch.constant.int 128
    %6480 = torch.prim.ListConstruct %int1_9132, %int24_9133, %int4608_9134, %int128_9135 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6481 = torch.aten.view %6472, %6480 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_9136 = torch.constant.int 5
    %6482 = torch.prims.convert_element_type %6481, %int5_9136 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_9137 = torch.constant.int 1
    %int24_9138 = torch.constant.int 24
    %int4608_9139 = torch.constant.int 4608
    %int128_9140 = torch.constant.int 128
    %6483 = torch.prim.ListConstruct %int1_9137, %int24_9138, %int4608_9139, %int128_9140 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6484 = torch.aten.view %6479, %6483 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_9141 = torch.constant.int 5
    %6485 = torch.prims.convert_element_type %6484, %int5_9141 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Attention over the joint 4608-row sequence with 24 heads of width 128.
    // Operands are the transformed q (%6482), transformed k (%6485) and v (%6459).
    // NOTE(review): the trailing operands (0.0, false, none, none) are
    // presumably dropout_p, is_causal, attn_mask and scale -- confirm against
    // the aten op schema.
    %float0.000000e00_9142 = torch.constant.float 0.000000e+00
    %false_9143 = torch.constant.bool false
    %none_9144 = torch.constant.none
    %none_9145 = torch.constant.none
    %6486:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%6482, %6485, %6459, %float0.000000e00_9142, %false_9143, %none_9144, %none_9145) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Only result #0 is consumed here: swap dims 1 and 2 to get
    // [1,4608,24,128], then merge heads (24 * 128 = 3072) -> [1,4608,3072].
    %int0_9146 = torch.constant.int 0
    %int2_9147 = torch.constant.int 2
    %int1_9148 = torch.constant.int 1
    %int3_9149 = torch.constant.int 3
    %6487 = torch.prim.ListConstruct %int0_9146, %int2_9147, %int1_9148, %int3_9149 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6488 = torch.aten.permute %6486#0, %6487 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_9150 = torch.constant.int 1
    %int4608_9151 = torch.constant.int 4608
    %int3072_9152 = torch.constant.int 3072
    %6489 = torch.prim.ListConstruct %int1_9150, %int4608_9151, %int3072_9152 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6490 = torch.aten.view %6488, %6489 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Split the attention output along dim 1, mirroring the concatenation
    // order used for q/k/v (512 rows first, then 4096).
    // The dim-0 slices [0, INT64_MAX) step 1 keep the full extent (shape unchanged).
    %int0_9153 = torch.constant.int 0
    %int0_9154 = torch.constant.int 0
    %int9223372036854775807_9155 = torch.constant.int 9223372036854775807
    %int1_9156 = torch.constant.int 1
    %6491 = torch.aten.slice.Tensor %6490, %int0_9153, %int0_9154, %int9223372036854775807_9155, %int1_9156 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Rows [0, 512) -> %6492 [1,512,3072].
    %int1_9157 = torch.constant.int 1
    %int0_9158 = torch.constant.int 0
    %int512_9159 = torch.constant.int 512
    %int1_9160 = torch.constant.int 1
    %6492 = torch.aten.slice.Tensor %6491, %int1_9157, %int0_9158, %int512_9159, %int1_9160 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %int0_9161 = torch.constant.int 0
    %int0_9162 = torch.constant.int 0
    %int9223372036854775807_9163 = torch.constant.int 9223372036854775807
    %int1_9164 = torch.constant.int 1
    %6493 = torch.aten.slice.Tensor %6490, %int0_9161, %int0_9162, %int9223372036854775807_9163, %int1_9164 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Rows [512, end) -> %6494 [1,4096,3072].
    %int1_9165 = torch.constant.int 1
    %int512_9166 = torch.constant.int 512
    %int9223372036854775807_9167 = torch.constant.int 9223372036854775807
    %int1_9168 = torch.constant.int 1
    %6494 = torch.aten.slice.Tensor %6493, %int1_9165, %int512_9166, %int9223372036854775807_9167, %int1_9168 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // img_attn.proj: linear layer on the 4096-row attention output.
    // Flatten to [4096,3072], compute x @ W^T + b in f32, cast back to f16.
    %int4096_9169 = torch.constant.int 4096
    %int3072_9170 = torch.constant.int 3072
    %6495 = torch.prim.ListConstruct %int4096_9169, %int3072_9170 : (!torch.int, !torch.int) -> !torch.list<int>
    %6496 = torch.aten.view %6494, %6495 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.18.img_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.18.img_attn.proj.weight : tensor<3072x3072xf16>
    %6497 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.img_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_9171 = torch.constant.int 0
    %int1_9172 = torch.constant.int 1
    %6498 = torch.aten.transpose.int %6497, %int0_9171, %int1_9172 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.18.img_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.18.img_attn.proj.bias : tensor<3072xf16>
    %6499 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.img_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Bias, activations and transposed weight are all cast to f32 before the matmul.
    %int6_9173 = torch.constant.int 6
    %6500 = torch.prims.convert_element_type %6499, %int6_9173 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_9174 = torch.constant.int 6
    %6501 = torch.prims.convert_element_type %6496, %int6_9174 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_9175 = torch.constant.int 6
    %6502 = torch.prims.convert_element_type %6498, %int6_9175 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %6503 = torch.aten.mm %6501, %6502 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Both mul.Scalar ops multiply by the integer constant 1.
    %int1_9176 = torch.constant.int 1
    %6504 = torch.aten.mul.Scalar %6503, %int1_9176 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_9177 = torch.constant.int 1
    %6505 = torch.aten.mul.Scalar %6500, %int1_9177 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_9178 = torch.constant.int 1
    %6506 = torch.aten.add.Tensor %6504, %6505, %int1_9178 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_9179 = torch.constant.int 5
    %6507 = torch.prims.convert_element_type %6506, %int5_9179 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_9180 = torch.constant.int 1
    %int4096_9181 = torch.constant.int 4096
    %int3072_9182 = torch.constant.int 3072
    %6508 = torch.prim.ListConstruct %int1_9180, %int4096_9181, %int3072_9182 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6509 = torch.aten.view %6507, %6508 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Scale the projection by %6295 ([1,1,3072], broadcast over rows) and add
    // it to %6217: %6511 = %6217 + %6295 * proj.
    // NOTE(review): %6295 is presumably a modulation gate and %6217 the
    // residual image stream; both are defined outside this span -- confirm.
    %6510 = torch.aten.mul.Tensor %6295, %6509 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_9183 = torch.constant.int 1
    %6511 = torch.aten.add.Tensor %6217, %6510, %int1_9183 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Normalize %6511 over dim 2 without learned affine parameters, then
    // apply a scale and shift: %6521 = (1 + %6297) * norm(%6511) + %6296.
    // NOTE(review): %6297 / %6296 are presumably modulation scale / shift
    // vectors computed earlier -- confirm at their definitions.
    %int1_9184 = torch.constant.int 1
    %int1_9185 = torch.constant.int 1
    %6512 = torch.aten.add.Scalar %6297, %int1_9184, %int1_9185 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Variance and mean in f32 over dim 2, correction = 0, keepdim = true.
    %int6_9186 = torch.constant.int 6
    %6513 = torch.prims.convert_element_type %6511, %int6_9186 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %int2_9187 = torch.constant.int 2
    %6514 = torch.prim.ListConstruct %int2_9187 : (!torch.int) -> !torch.list<int>
    %int0_9188 = torch.constant.int 0
    %true_9189 = torch.constant.bool true
    %result0_9190, %result1_9191 = torch.aten.var_mean.correction %6513, %6514, %int0_9188, %true_9189 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // (x - mean) * rsqrt(var + eps), eps approximately 1e-6. The subtraction
    // mixes the f16 input with the f32 mean and yields f32.
    %float9.999990e-07_9192 = torch.constant.float 9.9999999999999995E-7
    %int1_9193 = torch.constant.int 1
    %6515 = torch.aten.add.Scalar %result0_9190, %float9.999990e-07_9192, %int1_9193 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %6516 = torch.aten.rsqrt %6515 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    %int1_9194 = torch.constant.int 1
    %6517 = torch.aten.sub.Tensor %6511, %result1_9191, %int1_9194 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %6518 = torch.aten.mul.Tensor %6517, %6516 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Back to f16, then scale by (1 + %6297) and shift by %6296.
    %int5_9195 = torch.constant.int 5
    %6519 = torch.prims.convert_element_type %6518, %int5_9195 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %6520 = torch.aten.mul.Tensor %6512, %6519 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_9196 = torch.constant.int 1
    %6521 = torch.aten.add.Tensor %6520, %6296, %int1_9196 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // img_mlp.0: linear layer 3072 -> 12288 on the normalized image stream,
    // followed by GELU with the "tanh" approximation.
    // Flatten to [4096,3072], compute x @ W^T + b in f32, cast back to f16.
    %int4096_9197 = torch.constant.int 4096
    %int3072_9198 = torch.constant.int 3072
    %6522 = torch.prim.ListConstruct %int4096_9197, %int3072_9198 : (!torch.int, !torch.int) -> !torch.list<int>
    %6523 = torch.aten.view %6521, %6522 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.18.img_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.18.img_mlp.0.weight : tensor<12288x3072xf16>
    %6524 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.img_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_9199 = torch.constant.int 0
    %int1_9200 = torch.constant.int 1
    %6525 = torch.aten.transpose.int %6524, %int0_9199, %int1_9200 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.18.img_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.18.img_mlp.0.bias : tensor<12288xf16>
    %6526 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.img_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    %int6_9201 = torch.constant.int 6
    %6527 = torch.prims.convert_element_type %6526, %int6_9201 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_9202 = torch.constant.int 6
    %6528 = torch.prims.convert_element_type %6523, %int6_9202 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_9203 = torch.constant.int 6
    %6529 = torch.prims.convert_element_type %6525, %int6_9203 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %6530 = torch.aten.mm %6528, %6529 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[4096,12288],f32>
    // Both mul.Scalar ops multiply by the integer constant 1.
    %int1_9204 = torch.constant.int 1
    %6531 = torch.aten.mul.Scalar %6530, %int1_9204 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int1_9205 = torch.constant.int 1
    %6532 = torch.aten.mul.Scalar %6527, %int1_9205 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_9206 = torch.constant.int 1
    %6533 = torch.aten.add.Tensor %6531, %6532, %int1_9206 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int5_9207 = torch.constant.int 5
    %6534 = torch.prims.convert_element_type %6533, %int5_9207 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f16>
    %int1_9208 = torch.constant.int 1
    %int4096_9209 = torch.constant.int 4096
    %int12288_9210 = torch.constant.int 12288
    %6535 = torch.prim.ListConstruct %int1_9208, %int4096_9209, %int12288_9210 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6536 = torch.aten.view %6534, %6535 : !torch.vtensor<[4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,12288],f16>
    // Activation: GELU, approximate = "tanh", evaluated on the f16 tensor.
    %str_9211 = torch.constant.str "tanh"
    %6537 = torch.aten.gelu %6536, %str_9211 : !torch.vtensor<[1,4096,12288],f16>, !torch.str -> !torch.vtensor<[1,4096,12288],f16>
    // img_mlp.2: linear layer 12288 -> 3072 on the GELU output %6537.
    // Flatten to [4096,12288], compute x @ W^T + b in f32, cast back to f16.
    %int4096_9212 = torch.constant.int 4096
    %int12288_9213 = torch.constant.int 12288
    %6538 = torch.prim.ListConstruct %int4096_9212, %int12288_9213 : (!torch.int, !torch.int) -> !torch.list<int>
    %6539 = torch.aten.view %6537, %6538 : !torch.vtensor<[1,4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[4096,12288],f16>
    %__auto.sampler.double_blocks.18.img_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.18.img_mlp.2.weight : tensor<3072x12288xf16>
    %6540 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.img_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_9214 = torch.constant.int 0
    %int1_9215 = torch.constant.int 1
    %6541 = torch.aten.transpose.int %6540, %int0_9214, %int1_9215 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.18.img_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.18.img_mlp.2.bias : tensor<3072xf16>
    %6542 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.img_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_9216 = torch.constant.int 6
    %6543 = torch.prims.convert_element_type %6542, %int6_9216 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_9217 = torch.constant.int 6
    %6544 = torch.prims.convert_element_type %6539, %int6_9217 : !torch.vtensor<[4096,12288],f16>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int6_9218 = torch.constant.int 6
    %6545 = torch.prims.convert_element_type %6541, %int6_9218 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %6546 = torch.aten.mm %6544, %6545 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Both mul.Scalar ops multiply by the integer constant 1.
    %int1_9219 = torch.constant.int 1
    %6547 = torch.aten.mul.Scalar %6546, %int1_9219 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_9220 = torch.constant.int 1
    %6548 = torch.aten.mul.Scalar %6543, %int1_9220 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_9221 = torch.constant.int 1
    %6549 = torch.aten.add.Tensor %6547, %6548, %int1_9221 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_9222 = torch.constant.int 5
    %6550 = torch.prims.convert_element_type %6549, %int5_9222 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_9223 = torch.constant.int 1
    %int4096_9224 = torch.constant.int 4096
    %int3072_9225 = torch.constant.int 3072
    %6551 = torch.prim.ListConstruct %int1_9223, %int4096_9224, %int3072_9225 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6552 = torch.aten.view %6550, %6551 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Scale the MLP output by %6298 ([1,1,3072]) and add it to %6511:
    // %6554 = %6511 + %6298 * mlp_out.
    // NOTE(review): %6298 is presumably the MLP modulation gate, defined
    // outside this span -- confirm.
    %6553 = torch.aten.mul.Tensor %6298, %6552 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_9226 = torch.constant.int 1
    %6554 = torch.aten.add.Tensor %6511, %6553, %int1_9226 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // txt_attn.proj: linear layer on the 512-row attention output %6492.
    // Flatten to [512,3072], compute x @ W^T + b in f32, cast back to f16.
    %int512_9227 = torch.constant.int 512
    %int3072_9228 = torch.constant.int 3072
    %6555 = torch.prim.ListConstruct %int512_9227, %int3072_9228 : (!torch.int, !torch.int) -> !torch.list<int>
    %6556 = torch.aten.view %6492, %6555 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.18.txt_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.18.txt_attn.proj.weight : tensor<3072x3072xf16>
    %6557 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.txt_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_9229 = torch.constant.int 0
    %int1_9230 = torch.constant.int 1
    %6558 = torch.aten.transpose.int %6557, %int0_9229, %int1_9230 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.18.txt_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.18.txt_attn.proj.bias : tensor<3072xf16>
    %6559 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.txt_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_9231 = torch.constant.int 6
    %6560 = torch.prims.convert_element_type %6559, %int6_9231 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_9232 = torch.constant.int 6
    %6561 = torch.prims.convert_element_type %6556, %int6_9232 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_9233 = torch.constant.int 6
    %6562 = torch.prims.convert_element_type %6558, %int6_9233 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %6563 = torch.aten.mm %6561, %6562 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Both mul.Scalar ops multiply by the integer constant 1.
    %int1_9234 = torch.constant.int 1
    %6564 = torch.aten.mul.Scalar %6563, %int1_9234 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_9235 = torch.constant.int 1
    %6565 = torch.aten.mul.Scalar %6560, %int1_9235 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_9236 = torch.constant.int 1
    %6566 = torch.aten.add.Tensor %6564, %6565, %int1_9236 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_9237 = torch.constant.int 5
    %6567 = torch.prims.convert_element_type %6566, %int5_9237 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_9238 = torch.constant.int 1
    %int512_9239 = torch.constant.int 512
    %int3072_9240 = torch.constant.int 3072
    %6568 = torch.prim.ListConstruct %int1_9238, %int512_9239, %int3072_9240 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6569 = torch.aten.view %6567, %6568 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Scale the projection by %6316 ([1,1,3072]) and add it to %6277:
    // %6571 = %6277 + %6316 * proj.
    // NOTE(review): %6316 is presumably a modulation gate and %6277 the
    // residual text stream; both are defined outside this span -- confirm.
    %6570 = torch.aten.mul.Tensor %6316, %6569 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_9241 = torch.constant.int 1
    %6571 = torch.aten.add.Tensor %6277, %6570, %int1_9241 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Normalize %6571 over its last axis and apply an elementwise scale and shift.
    // %6572 = %6318 + 1 is the multiplicative factor applied after normalization.
    // NOTE(review): %6318 / %6317 are presumably the modulation scale / shift — confirm at their definitions.
    %int1_9242 = torch.constant.int 1
    %int1_9243 = torch.constant.int 1
    %6572 = torch.aten.add.Scalar %6318, %int1_9242, %int1_9243 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Variance and mean are computed in f32 (dtype code 6) over dim 2 with correction 0 and keepdim = true.
    %int6_9244 = torch.constant.int 6
    %6573 = torch.prims.convert_element_type %6571, %int6_9244 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %int2_9245 = torch.constant.int 2
    %6574 = torch.prim.ListConstruct %int2_9245 : (!torch.int) -> !torch.list<int>
    %int0_9246 = torch.constant.int 0
    %true_9247 = torch.constant.bool true
    %result0_9248, %result1_9249 = torch.aten.var_mean.correction %6573, %6574, %int0_9246, %true_9247 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(variance + epsilon), with the epsilon constant spelled 9.9999999999999995E-7.
    %float9.999990e-07_9250 = torch.constant.float 9.9999999999999995E-7
    %int1_9251 = torch.constant.int 1
    %6575 = torch.aten.add.Scalar %result0_9248, %float9.999990e-07_9250, %int1_9251 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %6576 = torch.aten.rsqrt %6575 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // (x - mean) * rsqrt(...): the subtraction takes the f16 tensor %6571 and the f32 mean and yields f32.
    %int1_9252 = torch.constant.int 1
    %6577 = torch.aten.sub.Tensor %6571, %result1_9249, %int1_9252 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %6578 = torch.aten.mul.Tensor %6577, %6576 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    // Back to f16 (dtype code 5), then (1 + %6318) * normalized + %6317.
    %int5_9253 = torch.constant.int 5
    %6579 = torch.prims.convert_element_type %6578, %int5_9253 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %6580 = torch.aten.mul.Tensor %6572, %6579 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_9254 = torch.constant.int 1
    %6581 = torch.aten.add.Tensor %6580, %6317, %int1_9254 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // double_blocks.18.txt_mlp.0: linear layer 3072 -> 12288 followed by tanh-approximated GELU.
    // Flatten [1,512,3072] -> [512,3072] so the layer can be expressed as a 2-D matmul.
    %int512_9255 = torch.constant.int 512
    %int3072_9256 = torch.constant.int 3072
    %6582 = torch.prim.ListConstruct %int512_9255, %int3072_9256 : (!torch.int, !torch.int) -> !torch.list<int>
    %6583 = torch.aten.view %6581, %6582 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Load the [12288,3072] weight and transpose it to [3072,12288]; load the [12288] bias.
    %__auto.sampler.double_blocks.18.txt_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.18.txt_mlp.0.weight : tensor<12288x3072xf16>
    %6584 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.txt_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_9257 = torch.constant.int 0
    %int1_9258 = torch.constant.int 1
    %6585 = torch.aten.transpose.int %6584, %int0_9257, %int1_9258 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.18.txt_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.18.txt_mlp.0.bias : tensor<12288xf16>
    %6586 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.txt_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Bias, activations and weight are all cast to f32 (dtype code 6) before the matmul.
    %int6_9259 = torch.constant.int 6
    %6587 = torch.prims.convert_element_type %6586, %int6_9259 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_9260 = torch.constant.int 6
    %6588 = torch.prims.convert_element_type %6583, %int6_9260 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_9261 = torch.constant.int 6
    %6589 = torch.prims.convert_element_type %6585, %int6_9261 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %6590 = torch.aten.mm %6588, %6589 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[512,12288],f32>
    // Both operands are multiplied by the integer constant 1 before the bias add (alpha = 1).
    %int1_9262 = torch.constant.int 1
    %6591 = torch.aten.mul.Scalar %6590, %int1_9262 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int1_9263 = torch.constant.int 1
    %6592 = torch.aten.mul.Scalar %6587, %int1_9263 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_9264 = torch.constant.int 1
    %6593 = torch.aten.add.Tensor %6591, %6592, %int1_9264 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    // Cast back to f16 (dtype code 5) and restore the batch dim: [512,12288] -> [1,512,12288].
    %int5_9265 = torch.constant.int 5
    %6594 = torch.prims.convert_element_type %6593, %int5_9265 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f16>
    %int1_9266 = torch.constant.int 1
    %int512_9267 = torch.constant.int 512
    %int12288_9268 = torch.constant.int 12288
    %6595 = torch.prim.ListConstruct %int1_9266, %int512_9267, %int12288_9268 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6596 = torch.aten.view %6594, %6595 : !torch.vtensor<[512,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,512,12288],f16>
    // GELU with approximate = "tanh", then flatten again to [512,12288].
    %str_9269 = torch.constant.str "tanh"
    %6597 = torch.aten.gelu %6596, %str_9269 : !torch.vtensor<[1,512,12288],f16>, !torch.str -> !torch.vtensor<[1,512,12288],f16>
    %int512_9270 = torch.constant.int 512
    %int12288_9271 = torch.constant.int 12288
    %6598 = torch.prim.ListConstruct %int512_9270, %int12288_9271 : (!torch.int, !torch.int) -> !torch.list<int>
    %6599 = torch.aten.view %6597, %6598 : !torch.vtensor<[1,512,12288],f16>, !torch.list<int> -> !torch.vtensor<[512,12288],f16>
    // double_blocks.18.txt_mlp.2: linear layer 12288 -> 3072, then a scaled residual add onto %6571.
    // Load the [3072,12288] weight and transpose it to [12288,3072]; load the [3072] bias.
    %__auto.sampler.double_blocks.18.txt_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.18.txt_mlp.2.weight : tensor<3072x12288xf16>
    %6600 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.txt_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_9272 = torch.constant.int 0
    %int1_9273 = torch.constant.int 1
    %6601 = torch.aten.transpose.int %6600, %int0_9272, %int1_9273 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.18.txt_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.18.txt_mlp.2.bias : tensor<3072xf16>
    %6602 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.txt_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Bias, activations and weight are cast to f32 (dtype code 6); the matmul and bias add run in f32.
    %int6_9274 = torch.constant.int 6
    %6603 = torch.prims.convert_element_type %6602, %int6_9274 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_9275 = torch.constant.int 6
    %6604 = torch.prims.convert_element_type %6599, %int6_9275 : !torch.vtensor<[512,12288],f16>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int6_9276 = torch.constant.int 6
    %6605 = torch.prims.convert_element_type %6601, %int6_9276 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %6606 = torch.aten.mm %6604, %6605 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Both operands are multiplied by the integer constant 1 before the bias add (alpha = 1).
    %int1_9277 = torch.constant.int 1
    %6607 = torch.aten.mul.Scalar %6606, %int1_9277 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_9278 = torch.constant.int 1
    %6608 = torch.aten.mul.Scalar %6603, %int1_9278 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_9279 = torch.constant.int 1
    %6609 = torch.aten.add.Tensor %6607, %6608, %int1_9279 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    // Cast back to f16 (dtype code 5) and restore the batch dim: [512,3072] -> [1,512,3072].
    %int5_9280 = torch.constant.int 5
    %6610 = torch.prims.convert_element_type %6609, %int5_9280 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_9281 = torch.constant.int 1
    %int512_9282 = torch.constant.int 512
    %int3072_9283 = torch.constant.int 3072
    %6611 = torch.prim.ListConstruct %int1_9281, %int512_9282, %int3072_9283 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6612 = torch.aten.view %6610, %6611 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Multiply by %6319 ([1,1,3072], broadcast over the 512 axis) and add the product to %6571.
    // NOTE(review): %6319 is presumably the MLP modulation gate — confirm at its definition.
    %6613 = torch.aten.mul.Tensor %6319, %6612 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_9284 = torch.constant.int 1
    %6614 = torch.aten.add.Tensor %6571, %6613, %int1_9284 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Concatenate %6614 ([1,512,3072]) and %6554 ([1,4096,3072]) along dim 1 -> [1,4608,3072];
    // the 512 rows of %6614 come first.
    // NOTE(review): %6554 is presumably the image-token stream of the last double block — confirm.
    %6615 = torch.prim.ListConstruct %6614, %6554 : (!torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,4096,3072],f16>) -> !torch.list<vtensor>
    %int1_9285 = torch.constant.int 1
    %6616 = torch.aten.cat %6615, %int1_9285 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.0.modulation: SiLU of %119, then the modulation.lin linear layer 3072 -> 9216,
    // whose output is split into three 3072-wide chunks.
    // NOTE(review): %119 ([1,3072]) is presumably the conditioning vector — confirm at its definition.
    %6617 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the [9216,3072] weight and transpose it to [3072,9216]; load the [9216] bias.
    %__auto.sampler.single_blocks.0.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.0.modulation.lin.weight : tensor<9216x3072xf16>
    %6618 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.0.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_9286 = torch.constant.int 0
    %int1_9287 = torch.constant.int 1
    %6619 = torch.aten.transpose.int %6618, %int0_9286, %int1_9287 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.0.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.0.modulation.lin.bias : tensor<9216xf16>
    %6620 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.0.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Bias, activations and weight are cast to f32 (dtype code 6); the matmul and bias add run in f32.
    %int6_9288 = torch.constant.int 6
    %6621 = torch.prims.convert_element_type %6620, %int6_9288 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_9289 = torch.constant.int 6
    %6622 = torch.prims.convert_element_type %6617, %int6_9289 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_9290 = torch.constant.int 6
    %6623 = torch.prims.convert_element_type %6619, %int6_9290 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %6624 = torch.aten.mm %6622, %6623 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Both operands are multiplied by the integer constant 1 before the bias add (alpha = 1).
    %int1_9291 = torch.constant.int 1
    %6625 = torch.aten.mul.Scalar %6624, %int1_9291 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_9292 = torch.constant.int 1
    %6626 = torch.aten.mul.Scalar %6621, %int1_9292 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_9293 = torch.constant.int 1
    %6627 = torch.aten.add.Tensor %6625, %6626, %int1_9293 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_9294 = torch.constant.int 5
    %6628 = torch.prims.convert_element_type %6627, %int5_9294 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Slice on dim 0 with start 0, end 9223372036854775807, step 1: the result type is unchanged ([1,9216]).
    %int0_9295 = torch.constant.int 0
    %int0_9296 = torch.constant.int 0
    %int9223372036854775807_9297 = torch.constant.int 9223372036854775807
    %int1_9298 = torch.constant.int 1
    %6629 = torch.aten.slice.Tensor %6628, %int0_9295, %int0_9296, %int9223372036854775807_9297, %int1_9298 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a size-1 axis at position 1: [1,9216] -> [1,1,9216].
    %int1_9299 = torch.constant.int 1
    %6630 = torch.aten.unsqueeze %6629, %int1_9299 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Same start-0 / max-end / step-1 slice on dim 2; the result type is again unchanged.
    %int2_9300 = torch.constant.int 2
    %int0_9301 = torch.constant.int 0
    %int9223372036854775807_9302 = torch.constant.int 9223372036854775807
    %int1_9303 = torch.constant.int 1
    %6631 = torch.aten.slice.Tensor %6630, %int2_9300, %int0_9301, %int9223372036854775807_9302, %int1_9303 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Split the last axis: %6632 = [0,3072), %6633 = [3072,6144), %6634 = [6144,9216).
    // NOTE(review): presumably shift / scale / gate in that order — confirm against the model definition.
    %int-1_9304 = torch.constant.int -1
    %int0_9305 = torch.constant.int 0
    %int3072_9306 = torch.constant.int 3072
    %int1_9307 = torch.constant.int 1
    %6632 = torch.aten.slice.Tensor %6631, %int-1_9304, %int0_9305, %int3072_9306, %int1_9307 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_9308 = torch.constant.int -1
    %int3072_9309 = torch.constant.int 3072
    %int6144_9310 = torch.constant.int 6144
    %int1_9311 = torch.constant.int 1
    %6633 = torch.aten.slice.Tensor %6631, %int-1_9308, %int3072_9309, %int6144_9310, %int1_9311 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_9312 = torch.constant.int -1
    %int6144_9313 = torch.constant.int 6144
    %int9216_9314 = torch.constant.int 9216
    %int1_9315 = torch.constant.int 1
    %6634 = torch.aten.slice.Tensor %6631, %int-1_9312, %int6144_9313, %int9216_9314, %int1_9315 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize the concatenated sequence %6616 over its last axis, then apply
    // (1 + %6633) * normalized + %6632 using two of the modulation chunks.
    %int1_9316 = torch.constant.int 1
    %int1_9317 = torch.constant.int 1
    %6635 = torch.aten.add.Scalar %6633, %int1_9316, %int1_9317 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Variance and mean are computed in f32 (dtype code 6) over dim 2 with correction 0 and keepdim = true.
    %int6_9318 = torch.constant.int 6
    %6636 = torch.prims.convert_element_type %6616, %int6_9318 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_9319 = torch.constant.int 2
    %6637 = torch.prim.ListConstruct %int2_9319 : (!torch.int) -> !torch.list<int>
    %int0_9320 = torch.constant.int 0
    %true_9321 = torch.constant.bool true
    %result0_9322, %result1_9323 = torch.aten.var_mean.correction %6636, %6637, %int0_9320, %true_9321 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // rsqrt(variance + epsilon), with the epsilon constant spelled 9.9999999999999995E-7.
    %float9.999990e-07_9324 = torch.constant.float 9.9999999999999995E-7
    %int1_9325 = torch.constant.int 1
    %6638 = torch.aten.add.Scalar %result0_9322, %float9.999990e-07_9324, %int1_9325 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %6639 = torch.aten.rsqrt %6638 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // (x - mean) * rsqrt(...): the subtraction takes the f16 tensor %6616 and the f32 mean and yields f32.
    %int1_9326 = torch.constant.int 1
    %6640 = torch.aten.sub.Tensor %6616, %result1_9323, %int1_9326 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %6641 = torch.aten.mul.Tensor %6640, %6639 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    // Back to f16 (dtype code 5), multiply by (1 + %6633), then add %6632; both broadcast over the 4608 axis.
    %int5_9327 = torch.constant.int 5
    %6642 = torch.prims.convert_element_type %6641, %int5_9327 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %6643 = torch.aten.mul.Tensor %6635, %6642 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_9328 = torch.constant.int 1
    %6644 = torch.aten.add.Tensor %6643, %6632, %int1_9328 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.0.linear1: linear layer 3072 -> 21504, whose output is split along the
    // last axis into a 9216-wide part and a 12288-wide part.
    // Flatten [1,4608,3072] -> [4608,3072] so the layer can be expressed as a 2-D matmul.
    %int4608_9329 = torch.constant.int 4608
    %int3072_9330 = torch.constant.int 3072
    %6645 = torch.prim.ListConstruct %int4608_9329, %int3072_9330 : (!torch.int, !torch.int) -> !torch.list<int>
    %6646 = torch.aten.view %6644, %6645 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // Load the [21504,3072] weight and transpose it to [3072,21504]; load the [21504] bias.
    %__auto.sampler.single_blocks.0.linear1.weight = util.global.load @__auto.sampler.single_blocks.0.linear1.weight : tensor<21504x3072xf16>
    %6647 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.0.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_9331 = torch.constant.int 0
    %int1_9332 = torch.constant.int 1
    %6648 = torch.aten.transpose.int %6647, %int0_9331, %int1_9332 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.0.linear1.bias = util.global.load @__auto.sampler.single_blocks.0.linear1.bias : tensor<21504xf16>
    %6649 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.0.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Bias, activations and weight are cast to f32 (dtype code 6); the matmul and bias add run in f32.
    %int6_9333 = torch.constant.int 6
    %6650 = torch.prims.convert_element_type %6649, %int6_9333 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_9334 = torch.constant.int 6
    %6651 = torch.prims.convert_element_type %6646, %int6_9334 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_9335 = torch.constant.int 6
    %6652 = torch.prims.convert_element_type %6648, %int6_9335 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %6653 = torch.aten.mm %6651, %6652 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Both operands are multiplied by the integer constant 1 before the bias add (alpha = 1).
    %int1_9336 = torch.constant.int 1
    %6654 = torch.aten.mul.Scalar %6653, %int1_9336 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_9337 = torch.constant.int 1
    %6655 = torch.aten.mul.Scalar %6650, %int1_9337 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_9338 = torch.constant.int 1
    %6656 = torch.aten.add.Tensor %6654, %6655, %int1_9338 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    // Cast back to f16 (dtype code 5) and restore the batch dim: [4608,21504] -> [1,4608,21504].
    %int5_9339 = torch.constant.int 5
    %6657 = torch.prims.convert_element_type %6656, %int5_9339 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_9340 = torch.constant.int 1
    %int4608_9341 = torch.constant.int 4608
    %int21504 = torch.constant.int 21504
    %6658 = torch.prim.ListConstruct %int1_9340, %int4608_9341, %int21504 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6659 = torch.aten.view %6657, %6658 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the last axis: %6660 = columns [0,9216), %6661 = columns [9216,21504).
    %int-1_9342 = torch.constant.int -1
    %int0_9343 = torch.constant.int 0
    %int9216_9344 = torch.constant.int 9216
    %int1_9345 = torch.constant.int 1
    %6660 = torch.aten.slice.Tensor %6659, %int-1_9342, %int0_9343, %int9216_9344, %int1_9345 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_9346 = torch.constant.int -1
    %int9216_9347 = torch.constant.int 9216
    %int21504_9348 = torch.constant.int 21504
    %int1_9349 = torch.constant.int 1
    %6661 = torch.aten.slice.Tensor %6659, %int-1_9346, %int9216_9347, %int21504_9348, %int1_9349 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // Reshape the 9216-wide slice %6660 into [1,4608,3,24,128], permute it with (2,0,3,1,4)
    // to [3,1,24,4608,128], and select the three tensors along the leading axis.
    %int1_9350 = torch.constant.int 1
    %int4608_9351 = torch.constant.int 4608
    %int3_9352 = torch.constant.int 3
    %int24_9353 = torch.constant.int 24
    %int128_9354 = torch.constant.int 128
    %6662 = torch.prim.ListConstruct %int1_9350, %int4608_9351, %int3_9352, %int24_9353, %int128_9354 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6663 = torch.aten.view %6660, %6662 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    %int2_9355 = torch.constant.int 2
    %int0_9356 = torch.constant.int 0
    %int3_9357 = torch.constant.int 3
    %int1_9358 = torch.constant.int 1
    %int4_9359 = torch.constant.int 4
    %6664 = torch.prim.ListConstruct %int2_9355, %int0_9356, %int3_9357, %int1_9358, %int4_9359 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6665 = torch.aten.permute %6663, %6664 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0 -> %6666 (later scaled by norm.query_norm.scale), index 1 -> %6667 (later scaled by
    // norm.key_norm.scale), index 2 -> %6668 (third tensor operand of the attention op).
    %int0_9360 = torch.constant.int 0
    %int0_9361 = torch.constant.int 0
    %6666 = torch.aten.select.int %6665, %int0_9360, %int0_9361 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_9362 = torch.constant.int 0
    %int1_9363 = torch.constant.int 1
    %6667 = torch.aten.select.int %6665, %int0_9362, %int1_9363 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_9364 = torch.constant.int 0
    %int2_9365 = torch.constant.int 2
    %6668 = torch.aten.select.int %6665, %int0_9364, %int2_9365 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.0.norm: root-mean-square normalization of %6666 and %6667 over the last axis,
    // each followed by a learned [128] scale (query_norm.scale and key_norm.scale respectively).
    // Query path: x * rsqrt(mean(x^2, dim = -1, keepdim = true) + epsilon), computed in f32 (dtype code 6).
    %int6_9366 = torch.constant.int 6
    %6669 = torch.prims.convert_element_type %6666, %int6_9366 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_9367 = torch.constant.int 2
    %6670 = torch.aten.pow.Tensor_Scalar %6669, %int2_9367 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_9368 = torch.constant.int -1
    %6671 = torch.prim.ListConstruct %int-1_9368 : (!torch.int) -> !torch.list<int>
    %true_9369 = torch.constant.bool true
    %none_9370 = torch.constant.none
    %6672 = torch.aten.mean.dim %6670, %6671, %true_9369, %none_9370 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_9371 = torch.constant.float 9.9999999999999995E-7
    %int1_9372 = torch.constant.int 1
    %6673 = torch.aten.add.Scalar %6672, %float9.999990e-07_9371, %int1_9372 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %6674 = torch.aten.rsqrt %6673 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %6675 = torch.aten.mul.Tensor %6669, %6674 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    // Cast to f16 (dtype code 5) and multiply by query_norm.scale, broadcast over the leading axes.
    %int5_9373 = torch.constant.int 5
    %6676 = torch.prims.convert_element_type %6675, %int5_9373 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.0.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.0.norm.query_norm.scale : tensor<128xf16>
    %6677 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.0.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %6678 = torch.aten.mul.Tensor %6676, %6677 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Key path: the same sequence of ops applied to %6667, scaled by key_norm.scale.
    %int6_9374 = torch.constant.int 6
    %6679 = torch.prims.convert_element_type %6667, %int6_9374 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_9375 = torch.constant.int 2
    %6680 = torch.aten.pow.Tensor_Scalar %6679, %int2_9375 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_9376 = torch.constant.int -1
    %6681 = torch.prim.ListConstruct %int-1_9376 : (!torch.int) -> !torch.list<int>
    %true_9377 = torch.constant.bool true
    %none_9378 = torch.constant.none
    %6682 = torch.aten.mean.dim %6680, %6681, %true_9377, %none_9378 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_9379 = torch.constant.float 9.9999999999999995E-7
    %int1_9380 = torch.constant.int 1
    %6683 = torch.aten.add.Scalar %6682, %float9.999990e-07_9379, %int1_9380 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %6684 = torch.aten.rsqrt %6683 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %6685 = torch.aten.mul.Tensor %6679, %6684 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_9381 = torch.constant.int 5
    %6686 = torch.prims.convert_element_type %6685, %int5_9381 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.0.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.0.norm.key_norm.scale : tensor<128xf16>
    %6687 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.0.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %6688 = torch.aten.mul.Tensor %6686, %6687 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Apply the per-position 2x2 coefficients in %211 to consecutive element pairs of the
    // normalized query (%6678) and key (%6688).
    // NOTE(review): %211 ([1,1,4608,64,2,2], f32) is presumably the rotary position-embedding
    // table — confirm at its definition.
    // These two converts use dtype code 5 on inputs that are already f16, so the element type is unchanged.
    %int5_9382 = torch.constant.int 5
    %6689 = torch.prims.convert_element_type %6678, %int5_9382 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_9383 = torch.constant.int 5
    %6690 = torch.prims.convert_element_type %6688, %int5_9383 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Query: cast to f32 (dtype code 6) and view the 128-wide last axis as [64,1,2] (64 pairs).
    %int6_9384 = torch.constant.int 6
    %6691 = torch.prims.convert_element_type %6689, %int6_9384 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_9385 = torch.constant.int 1
    %int24_9386 = torch.constant.int 24
    %int4608_9387 = torch.constant.int 4608
    %int64_9388 = torch.constant.int 64
    %int1_9389 = torch.constant.int 1
    %int2_9390 = torch.constant.int 2
    %6692 = torch.prim.ListConstruct %int1_9385, %int24_9386, %int4608_9387, %int64_9388, %int1_9389, %int2_9390 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6693 = torch.aten.view %6691, %6692 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Key: same cast and view.
    %int6_9391 = torch.constant.int 6
    %6694 = torch.prims.convert_element_type %6690, %int6_9391 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_9392 = torch.constant.int 1
    %int24_9393 = torch.constant.int 24
    %int4608_9394 = torch.constant.int 4608
    %int64_9395 = torch.constant.int 64
    %int1_9396 = torch.constant.int 1
    %int2_9397 = torch.constant.int 2
    %6695 = torch.prim.ListConstruct %int1_9392, %int24_9393, %int4608_9394, %int64_9395, %int1_9396, %int2_9397 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6696 = torch.aten.view %6694, %6695 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Query result %6703 = %211[..., 0] * q[..., 0] + %211[..., 1] * q[..., 1]; the selects index
    // dim 5 and the products broadcast to [1,24,4608,64,2].
    %int5_9398 = torch.constant.int 5
    %int0_9399 = torch.constant.int 0
    %6697 = torch.aten.select.int %211, %int5_9398, %int0_9399 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9400 = torch.constant.int 5
    %int0_9401 = torch.constant.int 0
    %6698 = torch.aten.select.int %6693, %int5_9400, %int0_9401 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6699 = torch.aten.mul.Tensor %6697, %6698 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_9402 = torch.constant.int 5
    %int1_9403 = torch.constant.int 1
    %6700 = torch.aten.select.int %211, %int5_9402, %int1_9403 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9404 = torch.constant.int 5
    %int1_9405 = torch.constant.int 1
    %6701 = torch.aten.select.int %6693, %int5_9404, %int1_9405 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6702 = torch.aten.mul.Tensor %6700, %6701 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_9406 = torch.constant.int 1
    %6703 = torch.aten.add.Tensor %6699, %6702, %int1_9406 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key result %6710: the same combination applied to %6696.
    %int5_9407 = torch.constant.int 5
    %int0_9408 = torch.constant.int 0
    %6704 = torch.aten.select.int %211, %int5_9407, %int0_9408 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9409 = torch.constant.int 5
    %int0_9410 = torch.constant.int 0
    %6705 = torch.aten.select.int %6696, %int5_9409, %int0_9410 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6706 = torch.aten.mul.Tensor %6704, %6705 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_9411 = torch.constant.int 5
    %int1_9412 = torch.constant.int 1
    %6707 = torch.aten.select.int %211, %int5_9411, %int1_9412 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9413 = torch.constant.int 5
    %int1_9414 = torch.constant.int 1
    %6708 = torch.aten.select.int %6696, %int5_9413, %int1_9414 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6709 = torch.aten.mul.Tensor %6707, %6708 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_9415 = torch.constant.int 1
    %6710 = torch.aten.add.Tensor %6706, %6709, %int1_9415 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Merge the [64,2] pair axes back into 128 and cast to f16 (dtype code 5): query -> %6713.
    %int1_9416 = torch.constant.int 1
    %int24_9417 = torch.constant.int 24
    %int4608_9418 = torch.constant.int 4608
    %int128_9419 = torch.constant.int 128
    %6711 = torch.prim.ListConstruct %int1_9416, %int24_9417, %int4608_9418, %int128_9419 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6712 = torch.aten.view %6703, %6711 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_9420 = torch.constant.int 5
    %6713 = torch.prims.convert_element_type %6712, %int5_9420 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Same merge and cast for the key -> %6716.
    %int1_9421 = torch.constant.int 1
    %int24_9422 = torch.constant.int 24
    %int4608_9423 = torch.constant.int 4608
    %int128_9424 = torch.constant.int 128
    %6714 = torch.prim.ListConstruct %int1_9421, %int24_9422, %int4608_9423, %int128_9424 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6715 = torch.aten.view %6710, %6714 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_9425 = torch.constant.int 5
    %6716 = torch.prims.convert_element_type %6715, %int5_9425 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled-dot-product attention over %6713, %6716 and %6668, followed by merging the 24 heads.
    // The scalar operands are, in order: float 0.0, bool false, none, none.
    // NOTE(review): per the PyTorch op signature these are presumably dropout_p, is_causal,
    // attn_mask and scale — confirm against the op schema.
    %float0.000000e00_9426 = torch.constant.float 0.000000e+00
    %false_9427 = torch.constant.bool false
    %none_9428 = torch.constant.none
    %none_9429 = torch.constant.none
    // Result #0 is the [1,24,4608,128] f16 output; result #1 ([1,24,4608], f32) is not used in the ops that follow here.
    %6717:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%6713, %6716, %6668, %float0.000000e00_9426, %false_9427, %none_9428, %none_9429) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Permute (0,2,1,3): [1,24,4608,128] -> [1,4608,24,128].
    %int0_9430 = torch.constant.int 0
    %int2_9431 = torch.constant.int 2
    %int1_9432 = torch.constant.int 1
    %int3_9433 = torch.constant.int 3
    %6718 = torch.prim.ListConstruct %int0_9430, %int2_9431, %int1_9432, %int3_9433 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6719 = torch.aten.permute %6717#0, %6718 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the [24,128] axes into 3072: [1,4608,24,128] -> [1,4608,3072].
    %int1_9434 = torch.constant.int 1
    %int4608_9435 = torch.constant.int 4608
    %int3072_9436 = torch.constant.int 3072
    %6720 = torch.prim.ListConstruct %int1_9434, %int4608_9435, %int3072_9436 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6721 = torch.aten.view %6719, %6720 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // GELU with the "tanh" approximation on %6661 (12288-wide, defined above this
    // chunk), concatenated after the attention output along axis 2:
    // 3072 + 12288 = 15360.
    %str_9437 = torch.constant.str "tanh"
    %6722 = torch.aten.gelu %6661, %str_9437 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %6723 = torch.prim.ListConstruct %6721, %6722 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_9438 = torch.constant.int 2
    %6724 = torch.aten.cat %6723, %int2_9438 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Drop the leading unit axis so the result can feed a 2-D matmul.
    %int4608_9439 = torch.constant.int 4608
    %int15360_9440 = torch.constant.int 15360
    %6725 = torch.prim.ListConstruct %int4608_9439, %int15360_9440 : (!torch.int, !torch.int) -> !torch.list<int>
    %6726 = torch.aten.view %6724, %6725 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // single_blocks.0.linear2: x @ W^T + b, computed in f32 and cast back to f16.
    // The weight is stored as [3072,15360] and transposed to [15360,3072] for mm.
    %__auto.sampler.single_blocks.0.linear2.weight = util.global.load @__auto.sampler.single_blocks.0.linear2.weight : tensor<3072x15360xf16>
    %6727 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.0.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_9441 = torch.constant.int 0
    %int1_9442 = torch.constant.int 1
    %6728 = torch.aten.transpose.int %6727, %int0_9441, %int1_9442 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.0.linear2.bias = util.global.load @__auto.sampler.single_blocks.0.linear2.bias : tensor<3072xf16>
    %6729 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.0.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // dtype code 6 selects f32 here (see the result types): bias, activations
    // and weight are all widened before the matmul.
    %int6_9443 = torch.constant.int 6
    %6730 = torch.prims.convert_element_type %6729, %int6_9443 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_9444 = torch.constant.int 6
    %6731 = torch.prims.convert_element_type %6726, %int6_9444 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_9445 = torch.constant.int 6
    %6732 = torch.prims.convert_element_type %6728, %int6_9445 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %6733 = torch.aten.mm %6731, %6732 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Both products are scaled by the integer constant 1, i.e. numerically
    // unchanged (presumably left over from an addmm alpha/beta decomposition).
    %int1_9446 = torch.constant.int 1
    %6734 = torch.aten.mul.Scalar %6733, %int1_9446 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_9447 = torch.constant.int 1
    %6735 = torch.aten.mul.Scalar %6730, %int1_9447 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_9448 = torch.constant.int 1
    %6736 = torch.aten.add.Tensor %6734, %6735, %int1_9448 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_9449 = torch.constant.int 5
    %6737 = torch.prims.convert_element_type %6736, %int5_9449 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    // Restore the leading unit axis.
    %int1_9450 = torch.constant.int 1
    %int4608_9451 = torch.constant.int 4608
    %int3072_9452 = torch.constant.int 3072
    %6738 = torch.prim.ListConstruct %int1_9450, %int4608_9451, %int3072_9452 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6739 = torch.aten.view %6737, %6738 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Multiply by %6634 ([1,1,3072], broadcast over the 4608 axis) and add to
    // %6616. Both are defined above this chunk; presumably the block's gate and
    // its residual input. %6741 is the input to single_blocks.1 below.
    %6740 = torch.aten.mul.Tensor %6634, %6739 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_9453 = torch.constant.int 1
    %6741 = torch.aten.add.Tensor %6616, %6740, %int1_9453 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.1.modulation: SiLU on %119 ([1,3072], defined above this
    // chunk) followed by the modulation.lin linear layer (3072 -> 9216).
    %6742 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.1.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.1.modulation.lin.weight : tensor<9216x3072xf16>
    %6743 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.1.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_9454 = torch.constant.int 0
    %int1_9455 = torch.constant.int 1
    %6744 = torch.aten.transpose.int %6743, %int0_9454, %int1_9455 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.1.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.1.modulation.lin.bias : tensor<9216xf16>
    %6745 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.1.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Widen bias, input and weight to f32 (dtype code 6) for the matmul.
    %int6_9456 = torch.constant.int 6
    %6746 = torch.prims.convert_element_type %6745, %int6_9456 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_9457 = torch.constant.int 6
    %6747 = torch.prims.convert_element_type %6742, %int6_9457 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_9458 = torch.constant.int 6
    %6748 = torch.prims.convert_element_type %6744, %int6_9458 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %6749 = torch.aten.mm %6747, %6748 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Scaling by the integer constant 1 leaves both terms numerically unchanged.
    %int1_9459 = torch.constant.int 1
    %6750 = torch.aten.mul.Scalar %6749, %int1_9459 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_9460 = torch.constant.int 1
    %6751 = torch.aten.mul.Scalar %6746, %int1_9460 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_9461 = torch.constant.int 1
    %6752 = torch.aten.add.Tensor %6750, %6751, %int1_9461 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_9462 = torch.constant.int 5
    %6753 = torch.prims.convert_element_type %6752, %int5_9462 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Full-range slice on axis 0 (start 0, end INT64_MAX, step 1): the shape is
    // unchanged. Then insert a unit axis at position 1 -> [1,1,9216].
    %int0_9463 = torch.constant.int 0
    %int0_9464 = torch.constant.int 0
    %int9223372036854775807_9465 = torch.constant.int 9223372036854775807
    %int1_9466 = torch.constant.int 1
    %6754 = torch.aten.slice.Tensor %6753, %int0_9463, %int0_9464, %int9223372036854775807_9465, %int1_9466 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    %int1_9467 = torch.constant.int 1
    %6755 = torch.aten.unsqueeze %6754, %int1_9467 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Another full-range slice, this time on axis 2; the shape is unchanged.
    %int2_9468 = torch.constant.int 2
    %int0_9469 = torch.constant.int 0
    %int9223372036854775807_9470 = torch.constant.int 9223372036854775807
    %int1_9471 = torch.constant.int 1
    %6756 = torch.aten.slice.Tensor %6755, %int2_9468, %int0_9469, %int9223372036854775807_9470, %int1_9471 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Split the last axis into three 3072-wide chunks:
    //   %6757 = [0,3072)     added after the normalization below
    //   %6758 = [3072,6144)  has 1 added (%6760) and multiplies the normalized input
    //   %6759 = [6144,9216)  multiplies the linear2 output before the residual add
    %int-1_9472 = torch.constant.int -1
    %int0_9473 = torch.constant.int 0
    %int3072_9474 = torch.constant.int 3072
    %int1_9475 = torch.constant.int 1
    %6757 = torch.aten.slice.Tensor %6756, %int-1_9472, %int0_9473, %int3072_9474, %int1_9475 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_9476 = torch.constant.int -1
    %int3072_9477 = torch.constant.int 3072
    %int6144_9478 = torch.constant.int 6144
    %int1_9479 = torch.constant.int 1
    %6758 = torch.aten.slice.Tensor %6756, %int-1_9476, %int3072_9477, %int6144_9478, %int1_9479 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_9480 = torch.constant.int -1
    %int6144_9481 = torch.constant.int 6144
    %int9216_9482 = torch.constant.int 9216
    %int1_9483 = torch.constant.int 1
    %6759 = torch.aten.slice.Tensor %6756, %int-1_9480, %int6144_9481, %int9216_9482, %int1_9483 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %6760 = %6758 + 1 (the second integer operand is the alpha multiplier, also 1).
    %int1_9484 = torch.constant.int 1
    %int1_9485 = torch.constant.int 1
    %6760 = torch.aten.add.Scalar %6758, %int1_9484, %int1_9485 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %6741 over its last axis (size 3072) without learned parameters:
    // (x - mean) * rsqrt(var + eps), with the statistics computed in f32.
    %int6_9486 = torch.constant.int 6
    %6761 = torch.prims.convert_element_type %6741, %int6_9486 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    // var_mean over axis 2 with correction 0 and keepdim = true;
    // result0 is the variance and result1 the mean, each [1,4608,1].
    %int2_9487 = torch.constant.int 2
    %6762 = torch.prim.ListConstruct %int2_9487 : (!torch.int) -> !torch.list<int>
    %int0_9488 = torch.constant.int 0
    %true_9489 = torch.constant.bool true
    %result0_9490, %result1_9491 = torch.aten.var_mean.correction %6761, %6762, %int0_9488, %true_9489 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // eps is approximately 1e-6.
    %float9.999990e-07_9492 = torch.constant.float 9.9999999999999995E-7
    %int1_9493 = torch.constant.int 1
    %6763 = torch.aten.add.Scalar %result0_9490, %float9.999990e-07_9492, %int1_9493 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %6764 = torch.aten.rsqrt %6763 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the f16 tensor %6741 (not the f32 copy %6761) with
    // the f32 mean; the declared result type is f32.
    %int1_9494 = torch.constant.int 1
    %6765 = torch.aten.sub.Tensor %6741, %result1_9491, %int1_9494 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %6766 = torch.aten.mul.Tensor %6765, %6764 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_9495 = torch.constant.int 5
    %6767 = torch.prims.convert_element_type %6766, %int5_9495 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Modulate: %6760 * normalized + %6757, both broadcast over the 4608 axis.
    %6768 = torch.aten.mul.Tensor %6760, %6767 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_9496 = torch.constant.int 1
    %6769 = torch.aten.add.Tensor %6768, %6757, %int1_9496 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Flatten to 2-D for the linear1 matmul.
    %int4608_9497 = torch.constant.int 4608
    %int3072_9498 = torch.constant.int 3072
    %6770 = torch.prim.ListConstruct %int4608_9497, %int3072_9498 : (!torch.int, !torch.int) -> !torch.list<int>
    %6771 = torch.aten.view %6769, %6770 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // single_blocks.1.linear1: x @ W^T + b (3072 -> 21504), computed in f32 and
    // cast back to f16.
    %__auto.sampler.single_blocks.1.linear1.weight = util.global.load @__auto.sampler.single_blocks.1.linear1.weight : tensor<21504x3072xf16>
    %6772 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.1.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_9499 = torch.constant.int 0
    %int1_9500 = torch.constant.int 1
    %6773 = torch.aten.transpose.int %6772, %int0_9499, %int1_9500 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.1.linear1.bias = util.global.load @__auto.sampler.single_blocks.1.linear1.bias : tensor<21504xf16>
    %6774 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.1.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Widen bias, activations and weight to f32 (dtype code 6).
    %int6_9501 = torch.constant.int 6
    %6775 = torch.prims.convert_element_type %6774, %int6_9501 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_9502 = torch.constant.int 6
    %6776 = torch.prims.convert_element_type %6771, %int6_9502 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_9503 = torch.constant.int 6
    %6777 = torch.prims.convert_element_type %6773, %int6_9503 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %6778 = torch.aten.mm %6776, %6777 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Scaling by the integer constant 1 leaves both terms numerically unchanged.
    %int1_9504 = torch.constant.int 1
    %6779 = torch.aten.mul.Scalar %6778, %int1_9504 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_9505 = torch.constant.int 1
    %6780 = torch.aten.mul.Scalar %6775, %int1_9505 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_9506 = torch.constant.int 1
    %6781 = torch.aten.add.Tensor %6779, %6780, %int1_9506 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_9507 = torch.constant.int 5
    %6782 = torch.prims.convert_element_type %6781, %int5_9507 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_9508 = torch.constant.int 1
    %int4608_9509 = torch.constant.int 4608
    %int21504_9510 = torch.constant.int 21504
    %6783 = torch.prim.ListConstruct %int1_9508, %int4608_9509, %int21504_9510 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6784 = torch.aten.view %6782, %6783 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the last axis: [0,9216) feeds the attention path (%6785) and
    // [9216,21504) feeds the GELU path (%6786).
    %int-1_9511 = torch.constant.int -1
    %int0_9512 = torch.constant.int 0
    %int9216_9513 = torch.constant.int 9216
    %int1_9514 = torch.constant.int 1
    %6785 = torch.aten.slice.Tensor %6784, %int-1_9511, %int0_9512, %int9216_9513, %int1_9514 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_9515 = torch.constant.int -1
    %int9216_9516 = torch.constant.int 9216
    %int21504_9517 = torch.constant.int 21504
    %int1_9518 = torch.constant.int 1
    %6786 = torch.aten.slice.Tensor %6784, %int-1_9515, %int9216_9516, %int21504_9517, %int1_9518 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View the 9216-wide slice as [3, 24, 128] per position, then permute so the
    // size-3 axis leads: [1,4608,3,24,128] -> [3,1,24,4608,128].
    %int1_9519 = torch.constant.int 1
    %int4608_9520 = torch.constant.int 4608
    %int3_9521 = torch.constant.int 3
    %int24_9522 = torch.constant.int 24
    %int128_9523 = torch.constant.int 128
    %6787 = torch.prim.ListConstruct %int1_9519, %int4608_9520, %int3_9521, %int24_9522, %int128_9523 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6788 = torch.aten.view %6785, %6787 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    %int2_9524 = torch.constant.int 2
    %int0_9525 = torch.constant.int 0
    %int3_9526 = torch.constant.int 3
    %int1_9527 = torch.constant.int 1
    %int4_9528 = torch.constant.int 4
    %6789 = torch.prim.ListConstruct %int2_9524, %int0_9525, %int3_9526, %int1_9527, %int4_9528 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6790 = torch.aten.permute %6788, %6789 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index the leading axis. After the normalization and positional steps
    // below, index 0 (%6791) and index 1 (%6792) become the first and second
    // attention operands; index 2 (%6793) is passed unchanged as the third.
    %int0_9529 = torch.constant.int 0
    %int0_9530 = torch.constant.int 0
    %6791 = torch.aten.select.int %6790, %int0_9529, %int0_9530 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_9531 = torch.constant.int 0
    %int1_9532 = torch.constant.int 1
    %6792 = torch.aten.select.int %6790, %int0_9531, %int1_9532 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_9533 = torch.constant.int 0
    %int2_9534 = torch.constant.int 2
    %6793 = torch.aten.select.int %6790, %int0_9533, %int2_9534 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Root-mean-square normalization of %6791 over the last axis (size 128):
    // x * rsqrt(mean(x^2) + eps), computed in f32, cast to f16, then multiplied
    // by the learned query_norm.scale vector.
    %int6_9535 = torch.constant.int 6
    %6794 = torch.prims.convert_element_type %6791, %int6_9535 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_9536 = torch.constant.int 2
    %6795 = torch.aten.pow.Tensor_Scalar %6794, %int2_9536 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean over axis -1 with keepdim = true and no dtype override.
    %int-1_9537 = torch.constant.int -1
    %6796 = torch.prim.ListConstruct %int-1_9537 : (!torch.int) -> !torch.list<int>
    %true_9538 = torch.constant.bool true
    %none_9539 = torch.constant.none
    %6797 = torch.aten.mean.dim %6795, %6796, %true_9538, %none_9539 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // eps is approximately 1e-6.
    %float9.999990e-07_9540 = torch.constant.float 9.9999999999999995E-7
    %int1_9541 = torch.constant.int 1
    %6798 = torch.aten.add.Scalar %6797, %float9.999990e-07_9540, %int1_9541 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %6799 = torch.aten.rsqrt %6798 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %6800 = torch.aten.mul.Tensor %6794, %6799 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_9542 = torch.constant.int 5
    %6801 = torch.prims.convert_element_type %6800, %int5_9542 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.1.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.1.norm.query_norm.scale : tensor<128xf16>
    %6802 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.1.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %6803 = torch.aten.mul.Tensor %6801, %6802 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Same normalization for %6792, scaled by key_norm.scale.
    %int6_9543 = torch.constant.int 6
    %6804 = torch.prims.convert_element_type %6792, %int6_9543 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_9544 = torch.constant.int 2
    %6805 = torch.aten.pow.Tensor_Scalar %6804, %int2_9544 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_9545 = torch.constant.int -1
    %6806 = torch.prim.ListConstruct %int-1_9545 : (!torch.int) -> !torch.list<int>
    %true_9546 = torch.constant.bool true
    %none_9547 = torch.constant.none
    %6807 = torch.aten.mean.dim %6805, %6806, %true_9546, %none_9547 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_9548 = torch.constant.float 9.9999999999999995E-7
    %int1_9549 = torch.constant.int 1
    %6808 = torch.aten.add.Scalar %6807, %float9.999990e-07_9548, %int1_9549 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %6809 = torch.aten.rsqrt %6808 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %6810 = torch.aten.mul.Tensor %6804, %6809 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_9550 = torch.constant.int 5
    %6811 = torch.prims.convert_element_type %6810, %int5_9550 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.1.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.1.norm.key_norm.scale : tensor<128xf16>
    %6812 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.1.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %6813 = torch.aten.mul.Tensor %6811, %6812 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Mix each consecutive pair of features in the normalized tensors %6803 and
    // %6813 using %211 ([1,1,4608,64,2,2], defined above this chunk).
    // NOTE(review): this looks like rotary position embedding, with %211 holding
    // a 2x2 matrix per (position, pair) - confirm where %211 is built.
    //
    // These two casts are f16 -> f16, so the element type does not change.
    %int5_9551 = torch.constant.int 5
    %6814 = torch.prims.convert_element_type %6803, %int5_9551 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_9552 = torch.constant.int 5
    %6815 = torch.prims.convert_element_type %6813, %int5_9552 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Widen to f32 and view the 128 features as 64 pairs: [...,128] -> [...,64,1,2].
    %int6_9553 = torch.constant.int 6
    %6816 = torch.prims.convert_element_type %6814, %int6_9553 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_9554 = torch.constant.int 1
    %int24_9555 = torch.constant.int 24
    %int4608_9556 = torch.constant.int 4608
    %int64_9557 = torch.constant.int 64
    %int1_9558 = torch.constant.int 1
    %int2_9559 = torch.constant.int 2
    %6817 = torch.prim.ListConstruct %int1_9554, %int24_9555, %int4608_9556, %int64_9557, %int1_9558, %int2_9559 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6818 = torch.aten.view %6816, %6817 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    %int6_9560 = torch.constant.int 6
    %6819 = torch.prims.convert_element_type %6815, %int6_9560 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_9561 = torch.constant.int 1
    %int24_9562 = torch.constant.int 24
    %int4608_9563 = torch.constant.int 4608
    %int64_9564 = torch.constant.int 64
    %int1_9565 = torch.constant.int 1
    %int2_9566 = torch.constant.int 2
    %6820 = torch.prim.ListConstruct %int1_9561, %int24_9562, %int4608_9563, %int64_9564, %int1_9565, %int2_9566 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6821 = torch.aten.view %6819, %6820 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // First tensor (%6818): out = %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1].
    // %211 broadcasts over the 24-head axis and x over the size-2 axis -> [1,24,4608,64,2].
    %int5_9567 = torch.constant.int 5
    %int0_9568 = torch.constant.int 0
    %6822 = torch.aten.select.int %211, %int5_9567, %int0_9568 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9569 = torch.constant.int 5
    %int0_9570 = torch.constant.int 0
    %6823 = torch.aten.select.int %6818, %int5_9569, %int0_9570 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6824 = torch.aten.mul.Tensor %6822, %6823 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_9571 = torch.constant.int 5
    %int1_9572 = torch.constant.int 1
    %6825 = torch.aten.select.int %211, %int5_9571, %int1_9572 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9573 = torch.constant.int 5
    %int1_9574 = torch.constant.int 1
    %6826 = torch.aten.select.int %6818, %int5_9573, %int1_9574 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6827 = torch.aten.mul.Tensor %6825, %6826 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_9575 = torch.constant.int 1
    %6828 = torch.aten.add.Tensor %6824, %6827, %int1_9575 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second tensor (%6821): the same computation with the same %211 slices.
    %int5_9576 = torch.constant.int 5
    %int0_9577 = torch.constant.int 0
    %6829 = torch.aten.select.int %211, %int5_9576, %int0_9577 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9578 = torch.constant.int 5
    %int0_9579 = torch.constant.int 0
    %6830 = torch.aten.select.int %6821, %int5_9578, %int0_9579 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6831 = torch.aten.mul.Tensor %6829, %6830 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_9580 = torch.constant.int 5
    %int1_9581 = torch.constant.int 1
    %6832 = torch.aten.select.int %211, %int5_9580, %int1_9581 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9582 = torch.constant.int 5
    %int1_9583 = torch.constant.int 1
    %6833 = torch.aten.select.int %6821, %int5_9582, %int1_9583 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6834 = torch.aten.mul.Tensor %6832, %6833 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_9584 = torch.constant.int 1
    %6835 = torch.aten.add.Tensor %6831, %6834, %int1_9584 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Merge the [64,2] pair axes back into 128 and narrow to f16 for attention.
    %int1_9585 = torch.constant.int 1
    %int24_9586 = torch.constant.int 24
    %int4608_9587 = torch.constant.int 4608
    %int128_9588 = torch.constant.int 128
    %6836 = torch.prim.ListConstruct %int1_9585, %int24_9586, %int4608_9587, %int128_9588 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6837 = torch.aten.view %6828, %6836 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_9589 = torch.constant.int 5
    %6838 = torch.prims.convert_element_type %6837, %int5_9589 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_9590 = torch.constant.int 1
    %int24_9591 = torch.constant.int 24
    %int4608_9592 = torch.constant.int 4608
    %int128_9593 = torch.constant.int 128
    %6839 = torch.prim.ListConstruct %int1_9590, %int24_9591, %int4608_9592, %int128_9593 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6840 = torch.aten.view %6835, %6839 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_9594 = torch.constant.int 5
    %6841 = torch.prims.convert_element_type %6840, %int5_9594 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Attention with tensor operands %6838, %6841 and %6793 (the position-mixed
    // pair and the untouched third qkv slice). Scalar operands are 0.0, false,
    // none, none; by the PyTorch signature these would be dropout_p, is_causal,
    // attn_mask and scale (NOTE(review): confirm against the export version).
    %float0.000000e00_9595 = torch.constant.float 0.000000e+00
    %false_9596 = torch.constant.bool false
    %none_9597 = torch.constant.none
    %none_9598 = torch.constant.none
    %6842:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%6838, %6841, %6793, %float0.000000e00_9595, %false_9596, %none_9597, %none_9598) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Only result #0 is consumed in the visible code. Swap axes 1 and 2, then
    // merge the 24x128 head axes into 3072.
    %int0_9599 = torch.constant.int 0
    %int2_9600 = torch.constant.int 2
    %int1_9601 = torch.constant.int 1
    %int3_9602 = torch.constant.int 3
    %6843 = torch.prim.ListConstruct %int0_9599, %int2_9600, %int1_9601, %int3_9602 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6844 = torch.aten.permute %6842#0, %6843 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_9603 = torch.constant.int 1
    %int4608_9604 = torch.constant.int 4608
    %int3072_9605 = torch.constant.int 3072
    %6845 = torch.prim.ListConstruct %int1_9603, %int4608_9604, %int3072_9605 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6846 = torch.aten.view %6844, %6845 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // GELU ("tanh" approximation) on the 12288-wide linear1 slice, concatenated
    // after the attention output along axis 2: 3072 + 12288 = 15360.
    %str_9606 = torch.constant.str "tanh"
    %6847 = torch.aten.gelu %6786, %str_9606 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %6848 = torch.prim.ListConstruct %6846, %6847 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_9607 = torch.constant.int 2
    %6849 = torch.aten.cat %6848, %int2_9607 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Flatten to 2-D for the linear2 matmul.
    %int4608_9608 = torch.constant.int 4608
    %int15360_9609 = torch.constant.int 15360
    %6850 = torch.prim.ListConstruct %int4608_9608, %int15360_9609 : (!torch.int, !torch.int) -> !torch.list<int>
    %6851 = torch.aten.view %6849, %6850 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // single_blocks.1.linear2: x @ W^T + b (15360 -> 3072), computed in f32 and
    // cast back to f16.
    %__auto.sampler.single_blocks.1.linear2.weight = util.global.load @__auto.sampler.single_blocks.1.linear2.weight : tensor<3072x15360xf16>
    %6852 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.1.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_9610 = torch.constant.int 0
    %int1_9611 = torch.constant.int 1
    %6853 = torch.aten.transpose.int %6852, %int0_9610, %int1_9611 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.1.linear2.bias = util.global.load @__auto.sampler.single_blocks.1.linear2.bias : tensor<3072xf16>
    %6854 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.1.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Widen bias, activations and weight to f32 (dtype code 6).
    %int6_9612 = torch.constant.int 6
    %6855 = torch.prims.convert_element_type %6854, %int6_9612 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_9613 = torch.constant.int 6
    %6856 = torch.prims.convert_element_type %6851, %int6_9613 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_9614 = torch.constant.int 6
    %6857 = torch.prims.convert_element_type %6853, %int6_9614 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %6858 = torch.aten.mm %6856, %6857 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Scaling by the integer constant 1 leaves both terms numerically unchanged.
    %int1_9615 = torch.constant.int 1
    %6859 = torch.aten.mul.Scalar %6858, %int1_9615 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_9616 = torch.constant.int 1
    %6860 = torch.aten.mul.Scalar %6855, %int1_9616 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_9617 = torch.constant.int 1
    %6861 = torch.aten.add.Tensor %6859, %6860, %int1_9617 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_9618 = torch.constant.int 5
    %6862 = torch.prims.convert_element_type %6861, %int5_9618 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    // Restore the leading unit axis.
    %int1_9619 = torch.constant.int 1
    %int4608_9620 = torch.constant.int 4608
    %int3072_9621 = torch.constant.int 3072
    %6863 = torch.prim.ListConstruct %int1_9619, %int4608_9620, %int3072_9621 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6864 = torch.aten.view %6862, %6863 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Multiply by the third modulation chunk %6759 (broadcast over the 4608
    // axis) and add to the block input %6741; %6866 feeds single_blocks.2.
    %6865 = torch.aten.mul.Tensor %6759, %6864 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_9622 = torch.constant.int 1
    %6866 = torch.aten.add.Tensor %6741, %6865, %int1_9622 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // ---- single_blocks.2: modulation ----
    // Applies SiLU to %119 (a [1,3072] f16 value defined earlier in the function;
    // presumably the shared conditioning vector -- confirm at its definition) and
    // projects it through modulation.lin to [1,9216]. The result is then cut into
    // three [1,1,3072] pieces that are consumed further down in this block:
    //   %6882  -> added after the normalization (shift)
    //   %6885  -> (%6883 + 1), multiplied with the normalized activations (scale)
    //   %6884  -> multiplied with the linear2 output before the residual add (gate)
    %6867 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.2.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.2.modulation.lin.weight : tensor<9216x3072xf16>
    %6868 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.2.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    // The weight is stored as [9216,3072]; swap dims 0 and 1 so the mm below is [1,3072] x [3072,9216].
    %int0_9623 = torch.constant.int 0
    %int1_9624 = torch.constant.int 1
    %6869 = torch.aten.transpose.int %6868, %int0_9623, %int1_9624 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.2.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.2.modulation.lin.bias : tensor<9216xf16>
    %6870 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.2.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Bias, input and weight are all widened to f32 (dtype code 6) before the matmul.
    %int6_9625 = torch.constant.int 6
    %6871 = torch.prims.convert_element_type %6870, %int6_9625 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_9626 = torch.constant.int 6
    %6872 = torch.prims.convert_element_type %6867, %int6_9626 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_9627 = torch.constant.int 6
    %6873 = torch.prims.convert_element_type %6869, %int6_9627 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %6874 = torch.aten.mm %6872, %6873 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Both mul.Scalar ops multiply by the integer 1, so values are unchanged
    // (presumably leftover alpha/beta factors from an addmm decomposition).
    %int1_9628 = torch.constant.int 1
    %6875 = torch.aten.mul.Scalar %6874, %int1_9628 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_9629 = torch.constant.int 1
    %6876 = torch.aten.mul.Scalar %6871, %int1_9629 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_9630 = torch.constant.int 1
    %6877 = torch.aten.add.Tensor %6875, %6876, %int1_9630 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    // Narrow the projection result back to f16 (dtype code 5).
    %int5_9631 = torch.constant.int 5
    %6878 = torch.prims.convert_element_type %6877, %int5_9631 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Slice over dim 0 from 0 to INT64_MAX with step 1: the shape stays [1,9216].
    %int0_9632 = torch.constant.int 0
    %int0_9633 = torch.constant.int 0
    %int9223372036854775807_9634 = torch.constant.int 9223372036854775807
    %int1_9635 = torch.constant.int 1
    %6879 = torch.aten.slice.Tensor %6878, %int0_9632, %int0_9633, %int9223372036854775807_9634, %int1_9635 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a size-1 dim at position 1 so the pieces broadcast against [1,4608,3072].
    %int1_9636 = torch.constant.int 1
    %6880 = torch.aten.unsqueeze %6879, %int1_9636 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Slice over dim 2 from 0 to INT64_MAX with step 1: the shape stays [1,1,9216].
    %int2_9637 = torch.constant.int 2
    %int0_9638 = torch.constant.int 0
    %int9223372036854775807_9639 = torch.constant.int 9223372036854775807
    %int1_9640 = torch.constant.int 1
    %6881 = torch.aten.slice.Tensor %6880, %int2_9637, %int0_9638, %int9223372036854775807_9639, %int1_9640 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Last-dim range [0, 3072): the additive term used after normalization.
    %int-1_9641 = torch.constant.int -1
    %int0_9642 = torch.constant.int 0
    %int3072_9643 = torch.constant.int 3072
    %int1_9644 = torch.constant.int 1
    %6882 = torch.aten.slice.Tensor %6881, %int-1_9641, %int0_9642, %int3072_9643, %int1_9644 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Last-dim range [3072, 6144): the multiplicative term (gets +1 below).
    %int-1_9645 = torch.constant.int -1
    %int3072_9646 = torch.constant.int 3072
    %int6144_9647 = torch.constant.int 6144
    %int1_9648 = torch.constant.int 1
    %6883 = torch.aten.slice.Tensor %6881, %int-1_9645, %int3072_9646, %int6144_9647, %int1_9648 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Last-dim range [6144, 9216): the factor applied to the linear2 output.
    %int-1_9649 = torch.constant.int -1
    %int6144_9650 = torch.constant.int 6144
    %int9216_9651 = torch.constant.int 9216
    %int1_9652 = torch.constant.int 1
    %6884 = torch.aten.slice.Tensor %6881, %int-1_9649, %int6144_9650, %int9216_9651, %int1_9652 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %6885 = %6883 + 1 * 1, i.e. (1 + scale).
    %int1_9653 = torch.constant.int 1
    %int1_9654 = torch.constant.int 1
    %6885 = torch.aten.add.Scalar %6883, %int1_9653, %int1_9654 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // ---- single_blocks.2: normalization + modulation of the block input ----
    // Normalizes %6866 ([1,4608,3072] f16, the output of the previous residual add)
    // over dim 2 with no learned weight or bias visible here, then applies
    // (1 + scale) * x_norm + shift using %6885 and %6882.
    %int6_9655 = torch.constant.int 6
    %6886 = torch.prims.convert_element_type %6866, %int6_9655 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_9656 = torch.constant.int 2
    %6887 = torch.prim.ListConstruct %int2_9656 : (!torch.int) -> !torch.list<int>
    // var_mean over dim 2 with correction = 0 and keepdim = true;
    // result0 is the variance, result1 is the mean (both [1,4608,1] f32).
    %int0_9657 = torch.constant.int 0
    %true_9658 = torch.constant.bool true
    %result0_9659, %result1_9660 = torch.aten.var_mean.correction %6886, %6887, %int0_9657, %true_9658 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // rsqrt(variance + ~1e-6).
    %float9.999990e-07_9661 = torch.constant.float 9.9999999999999995E-7
    %int1_9662 = torch.constant.int 1
    %6888 = torch.aten.add.Scalar %result0_9659, %float9.999990e-07_9661, %int1_9662 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %6889 = torch.aten.rsqrt %6888 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // Note: the subtraction takes the f16 tensor %6866 (not the f32 copy %6886)
    // together with the f32 mean; the declared result type is f32.
    %int1_9663 = torch.constant.int 1
    %6890 = torch.aten.sub.Tensor %6866, %result1_9660, %int1_9663 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %6891 = torch.aten.mul.Tensor %6890, %6889 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_9664 = torch.constant.int 5
    %6892 = torch.prims.convert_element_type %6891, %int5_9664 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Modulate in f16: (1 + scale) * x_norm, then + shift.
    %6893 = torch.aten.mul.Tensor %6885, %6892 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_9665 = torch.constant.int 1
    %6894 = torch.aten.add.Tensor %6893, %6882, %int1_9665 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Drop the leading size-1 dim so the result can feed a 2-D mm.
    %int4608_9666 = torch.constant.int 4608
    %int3072_9667 = torch.constant.int 3072
    %6895 = torch.prim.ListConstruct %int4608_9666, %int3072_9667 : (!torch.int, !torch.int) -> !torch.list<int>
    %6896 = torch.aten.view %6894, %6895 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // ---- single_blocks.2: linear1 projection ----
    // Projects the modulated activations %6896 ([4608,3072]) to 21504 features
    // using linear1.weight / linear1.bias. The matmul and bias add run in f32;
    // the result is narrowed to f16 and reshaped to [1,4608,21504].
    %__auto.sampler.single_blocks.2.linear1.weight = util.global.load @__auto.sampler.single_blocks.2.linear1.weight : tensor<21504x3072xf16>
    %6897 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.2.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    // Weight is stored as [21504,3072]; transpose to [3072,21504] for the mm.
    %int0_9668 = torch.constant.int 0
    %int1_9669 = torch.constant.int 1
    %6898 = torch.aten.transpose.int %6897, %int0_9668, %int1_9669 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.2.linear1.bias = util.global.load @__auto.sampler.single_blocks.2.linear1.bias : tensor<21504xf16>
    %6899 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.2.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Widen bias, input and weight to f32 (dtype code 6).
    %int6_9670 = torch.constant.int 6
    %6900 = torch.prims.convert_element_type %6899, %int6_9670 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_9671 = torch.constant.int 6
    %6901 = torch.prims.convert_element_type %6896, %int6_9671 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_9672 = torch.constant.int 6
    %6902 = torch.prims.convert_element_type %6898, %int6_9672 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %6903 = torch.aten.mm %6901, %6902 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    %int1_9673 = torch.constant.int 1
    %6904 = torch.aten.mul.Scalar %6903, %int1_9673 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_9674 = torch.constant.int 1
    %6905 = torch.aten.mul.Scalar %6900, %int1_9674 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_9675 = torch.constant.int 1
    %6906 = torch.aten.add.Tensor %6904, %6905, %int1_9675 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    // Narrow back to f16 (dtype code 5) and restore the leading size-1 dim.
    %int5_9676 = torch.constant.int 5
    %6907 = torch.prims.convert_element_type %6906, %int5_9676 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_9677 = torch.constant.int 1
    %int4608_9678 = torch.constant.int 4608
    %int21504_9679 = torch.constant.int 21504
    %6908 = torch.prim.ListConstruct %int1_9677, %int4608_9678, %int21504_9679 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6909 = torch.aten.view %6907, %6908 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // ---- single_blocks.2: split linear1 output into attention and MLP parts ----
    // Last-dim range [0, 9216) of %6909 becomes the packed q/k/v tensor %6910;
    // range [9216, 21504) becomes %6911 ([1,4608,12288]), which is fed to GELU later.
    %int-1_9680 = torch.constant.int -1
    %int0_9681 = torch.constant.int 0
    %int9216_9682 = torch.constant.int 9216
    %int1_9683 = torch.constant.int 1
    %6910 = torch.aten.slice.Tensor %6909, %int-1_9680, %int0_9681, %int9216_9682, %int1_9683 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_9684 = torch.constant.int -1
    %int9216_9685 = torch.constant.int 9216
    %int21504_9686 = torch.constant.int 21504
    %int1_9687 = torch.constant.int 1
    %6911 = torch.aten.slice.Tensor %6909, %int-1_9684, %int9216_9685, %int21504_9686, %int1_9687 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // Unpack 9216 = 3 * 24 * 128 into [1,4608,3,24,128].
    %int1_9688 = torch.constant.int 1
    %int4608_9689 = torch.constant.int 4608
    %int3_9690 = torch.constant.int 3
    %int24_9691 = torch.constant.int 24
    %int128_9692 = torch.constant.int 128
    %6912 = torch.prim.ListConstruct %int1_9688, %int4608_9689, %int3_9690, %int24_9691, %int128_9692 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6913 = torch.aten.view %6910, %6912 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2,0,3,1,4) -> [3,1,24,4608,128], moving the size-3 axis to the front.
    %int2_9693 = torch.constant.int 2
    %int0_9694 = torch.constant.int 0
    %int3_9695 = torch.constant.int 3
    %int1_9696 = torch.constant.int 1
    %int4_9697 = torch.constant.int 4
    %6914 = torch.prim.ListConstruct %int2_9693, %int0_9694, %int3_9695, %int1_9696, %int4_9697 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6915 = torch.aten.permute %6913, %6914 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0 along dim 0: %6916, normalized below with query_norm.scale.
    %int0_9698 = torch.constant.int 0
    %int0_9699 = torch.constant.int 0
    %6916 = torch.aten.select.int %6915, %int0_9698, %int0_9699 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 1 along dim 0: %6917, normalized below with key_norm.scale.
    %int0_9700 = torch.constant.int 0
    %int1_9701 = torch.constant.int 1
    %6917 = torch.aten.select.int %6915, %int0_9700, %int1_9701 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 2 along dim 0: %6918, passed unmodified as the third attention operand.
    %int0_9702 = torch.constant.int 0
    %int2_9703 = torch.constant.int 2
    %6918 = torch.aten.select.int %6915, %int0_9702, %int2_9703 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- single_blocks.2: query / key normalization ----
    // Each of %6916 and %6917 goes through the same sequence:
    //   x_f32 * rsqrt(mean(x_f32^2 over the last dim, keepdim) + ~1e-6),
    // narrowed to f16 and multiplied by a learned [128] scale vector.
    // No mean is subtracted, i.e. this is a root-mean-square style normalization.

    // Query path: %6916 -> %6928, scaled by norm.query_norm.scale.
    %int6_9704 = torch.constant.int 6
    %6919 = torch.prims.convert_element_type %6916, %int6_9704 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_9705 = torch.constant.int 2
    %6920 = torch.aten.pow.Tensor_Scalar %6919, %int2_9705 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_9706 = torch.constant.int -1
    %6921 = torch.prim.ListConstruct %int-1_9706 : (!torch.int) -> !torch.list<int>
    %true_9707 = torch.constant.bool true
    %none_9708 = torch.constant.none
    %6922 = torch.aten.mean.dim %6920, %6921, %true_9707, %none_9708 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_9709 = torch.constant.float 9.9999999999999995E-7
    %int1_9710 = torch.constant.int 1
    %6923 = torch.aten.add.Scalar %6922, %float9.999990e-07_9709, %int1_9710 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %6924 = torch.aten.rsqrt %6923 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %6925 = torch.aten.mul.Tensor %6919, %6924 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_9711 = torch.constant.int 5
    %6926 = torch.prims.convert_element_type %6925, %int5_9711 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.2.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.2.norm.query_norm.scale : tensor<128xf16>
    %6927 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.2.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %6928 = torch.aten.mul.Tensor %6926, %6927 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Key path: %6917 -> %6938, scaled by norm.key_norm.scale.
    %int6_9712 = torch.constant.int 6
    %6929 = torch.prims.convert_element_type %6917, %int6_9712 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_9713 = torch.constant.int 2
    %6930 = torch.aten.pow.Tensor_Scalar %6929, %int2_9713 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_9714 = torch.constant.int -1
    %6931 = torch.prim.ListConstruct %int-1_9714 : (!torch.int) -> !torch.list<int>
    %true_9715 = torch.constant.bool true
    %none_9716 = torch.constant.none
    %6932 = torch.aten.mean.dim %6930, %6931, %true_9715, %none_9716 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_9717 = torch.constant.float 9.9999999999999995E-7
    %int1_9718 = torch.constant.int 1
    %6933 = torch.aten.add.Scalar %6932, %float9.999990e-07_9717, %int1_9718 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %6934 = torch.aten.rsqrt %6933 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %6935 = torch.aten.mul.Tensor %6929, %6934 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_9719 = torch.constant.int 5
    %6936 = torch.prims.convert_element_type %6935, %int5_9719 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.2.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.2.norm.key_norm.scale : tensor<128xf16>
    %6937 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.2.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %6938 = torch.aten.mul.Tensor %6936, %6937 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- single_blocks.2: combine normalized q / k with the table %211 ----
    // %211 is a [1,1,4608,64,2,2] f32 tensor defined earlier in the function
    // (presumably the rotary position-embedding table -- confirm at its definition).
    // For each of the normalized tensors %6928 and %6938 the code computes
    //   out = %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]
    // on x viewed as [1,24,4608,64,1,2], then flattens back to [1,24,4608,128] f16.

    // These two casts go f16 -> f16 (dtype code 5), so the element type is unchanged.
    %int5_9720 = torch.constant.int 5
    %6939 = torch.prims.convert_element_type %6928, %int5_9720 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_9721 = torch.constant.int 5
    %6940 = torch.prims.convert_element_type %6938, %int5_9721 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Widen the first tensor to f32 and split the last dim 128 into (64, 1, 2).
    %int6_9722 = torch.constant.int 6
    %6941 = torch.prims.convert_element_type %6939, %int6_9722 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_9723 = torch.constant.int 1
    %int24_9724 = torch.constant.int 24
    %int4608_9725 = torch.constant.int 4608
    %int64_9726 = torch.constant.int 64
    %int1_9727 = torch.constant.int 1
    %int2_9728 = torch.constant.int 2
    %6942 = torch.prim.ListConstruct %int1_9723, %int24_9724, %int4608_9725, %int64_9726, %int1_9727, %int2_9728 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6943 = torch.aten.view %6941, %6942 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Same widening and reshape for the second tensor.
    %int6_9729 = torch.constant.int 6
    %6944 = torch.prims.convert_element_type %6940, %int6_9729 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_9730 = torch.constant.int 1
    %int24_9731 = torch.constant.int 24
    %int4608_9732 = torch.constant.int 4608
    %int64_9733 = torch.constant.int 64
    %int1_9734 = torch.constant.int 1
    %int2_9735 = torch.constant.int 2
    %6945 = torch.prim.ListConstruct %int1_9730, %int24_9731, %int4608_9732, %int64_9733, %int1_9734, %int2_9735 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6946 = torch.aten.view %6944, %6945 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // First tensor, term 0: %211[..., 0] * x[..., 0] (broadcast to [1,24,4608,64,2]).
    %int5_9736 = torch.constant.int 5
    %int0_9737 = torch.constant.int 0
    %6947 = torch.aten.select.int %211, %int5_9736, %int0_9737 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9738 = torch.constant.int 5
    %int0_9739 = torch.constant.int 0
    %6948 = torch.aten.select.int %6943, %int5_9738, %int0_9739 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6949 = torch.aten.mul.Tensor %6947, %6948 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // First tensor, term 1: %211[..., 1] * x[..., 1].
    %int5_9740 = torch.constant.int 5
    %int1_9741 = torch.constant.int 1
    %6950 = torch.aten.select.int %211, %int5_9740, %int1_9741 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9742 = torch.constant.int 5
    %int1_9743 = torch.constant.int 1
    %6951 = torch.aten.select.int %6943, %int5_9742, %int1_9743 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6952 = torch.aten.mul.Tensor %6950, %6951 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two terms for the first tensor.
    %int1_9744 = torch.constant.int 1
    %6953 = torch.aten.add.Tensor %6949, %6952, %int1_9744 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second tensor, term 0: %211[..., 0] * x[..., 0].
    %int5_9745 = torch.constant.int 5
    %int0_9746 = torch.constant.int 0
    %6954 = torch.aten.select.int %211, %int5_9745, %int0_9746 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9747 = torch.constant.int 5
    %int0_9748 = torch.constant.int 0
    %6955 = torch.aten.select.int %6946, %int5_9747, %int0_9748 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6956 = torch.aten.mul.Tensor %6954, %6955 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second tensor, term 1: %211[..., 1] * x[..., 1].
    %int5_9749 = torch.constant.int 5
    %int1_9750 = torch.constant.int 1
    %6957 = torch.aten.select.int %211, %int5_9749, %int1_9750 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9751 = torch.constant.int 5
    %int1_9752 = torch.constant.int 1
    %6958 = torch.aten.select.int %6946, %int5_9751, %int1_9752 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6959 = torch.aten.mul.Tensor %6957, %6958 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two terms for the second tensor.
    %int1_9753 = torch.constant.int 1
    %6960 = torch.aten.add.Tensor %6956, %6959, %int1_9753 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Flatten (64, 2) back into 128 and narrow to f16: %6963 is the first attention operand.
    %int1_9754 = torch.constant.int 1
    %int24_9755 = torch.constant.int 24
    %int4608_9756 = torch.constant.int 4608
    %int128_9757 = torch.constant.int 128
    %6961 = torch.prim.ListConstruct %int1_9754, %int24_9755, %int4608_9756, %int128_9757 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6962 = torch.aten.view %6953, %6961 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_9758 = torch.constant.int 5
    %6963 = torch.prims.convert_element_type %6962, %int5_9758 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Same flatten and narrowing: %6966 is the second attention operand.
    %int1_9759 = torch.constant.int 1
    %int24_9760 = torch.constant.int 24
    %int4608_9761 = torch.constant.int 4608
    %int128_9762 = torch.constant.int 128
    %6964 = torch.prim.ListConstruct %int1_9759, %int24_9760, %int4608_9761, %int128_9762 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6965 = torch.aten.view %6960, %6964 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_9763 = torch.constant.int 5
    %6966 = torch.prims.convert_element_type %6965, %int5_9763 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- single_blocks.2: scaled dot-product attention ----
    // Calls the ATen flash-attention-for-CPU op on (%6963, %6966, %6918), each
    // [1,24,4608,128] f16. The trailing operands are 0.0, false, none, none; per
    // the ATen schema these are presumably dropout_p, is_causal, attn_mask and
    // scale -- confirm against the op definition.
    // Result #0 is the attention output; result #1 ([1,24,4608] f32) is not
    // referenced in the lines that follow here.
    %float0.000000e00_9764 = torch.constant.float 0.000000e+00
    %false_9765 = torch.constant.bool false
    %none_9766 = torch.constant.none
    %none_9767 = torch.constant.none
    %6967:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%6963, %6966, %6918, %float0.000000e00_9764, %false_9765, %none_9766, %none_9767) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Permute with order (0,2,1,3): [1,24,4608,128] -> [1,4608,24,128].
    %int0_9768 = torch.constant.int 0
    %int2_9769 = torch.constant.int 2
    %int1_9770 = torch.constant.int 1
    %int3_9771 = torch.constant.int 3
    %6968 = torch.prim.ListConstruct %int0_9768, %int2_9769, %int1_9770, %int3_9771 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6969 = torch.aten.permute %6967#0, %6968 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the (24, 128) axes into 3072 features: [1,4608,3072].
    %int1_9772 = torch.constant.int 1
    %int4608_9773 = torch.constant.int 4608
    %int3072_9774 = torch.constant.int 3072
    %6970 = torch.prim.ListConstruct %int1_9772, %int4608_9773, %int3072_9774 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6971 = torch.aten.view %6969, %6970 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // ---- single_blocks.2: MLP activation and concatenation ----
    // Applies GELU with the "tanh" approximation argument to the 12288-wide slice
    // %6911, then concatenates it after the attention output %6971 along dim 2
    // (3072 + 12288 = 15360) and flattens to [4608,15360] for the next mm.
    %str_9775 = torch.constant.str "tanh"
    %6972 = torch.aten.gelu %6911, %str_9775 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %6973 = torch.prim.ListConstruct %6971, %6972 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_9776 = torch.constant.int 2
    %6974 = torch.aten.cat %6973, %int2_9776 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    %int4608_9777 = torch.constant.int 4608
    %int15360_9778 = torch.constant.int 15360
    %6975 = torch.prim.ListConstruct %int4608_9777, %int15360_9778 : (!torch.int, !torch.int) -> !torch.list<int>
    %6976 = torch.aten.view %6974, %6975 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // ---- single_blocks.2: linear2 projection and gated residual ----
    // Projects the concatenated [4608,15360] activations back to 3072 features
    // using linear2.weight / linear2.bias (matmul and bias add in f32, result in
    // f16), multiplies by the gate %6884 and adds the block input %6866.
    // %6991 is the output of single_blocks.2.
    %__auto.sampler.single_blocks.2.linear2.weight = util.global.load @__auto.sampler.single_blocks.2.linear2.weight : tensor<3072x15360xf16>
    %6977 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.2.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    // Weight is stored as [3072,15360]; transpose to [15360,3072] for the mm.
    %int0_9779 = torch.constant.int 0
    %int1_9780 = torch.constant.int 1
    %6978 = torch.aten.transpose.int %6977, %int0_9779, %int1_9780 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.2.linear2.bias = util.global.load @__auto.sampler.single_blocks.2.linear2.bias : tensor<3072xf16>
    %6979 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.2.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Widen bias, input and weight to f32 (dtype code 6).
    %int6_9781 = torch.constant.int 6
    %6980 = torch.prims.convert_element_type %6979, %int6_9781 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_9782 = torch.constant.int 6
    %6981 = torch.prims.convert_element_type %6976, %int6_9782 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_9783 = torch.constant.int 6
    %6982 = torch.prims.convert_element_type %6978, %int6_9783 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %6983 = torch.aten.mm %6981, %6982 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    %int1_9784 = torch.constant.int 1
    %6984 = torch.aten.mul.Scalar %6983, %int1_9784 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_9785 = torch.constant.int 1
    %6985 = torch.aten.mul.Scalar %6980, %int1_9785 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_9786 = torch.constant.int 1
    %6986 = torch.aten.add.Tensor %6984, %6985, %int1_9786 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    // Narrow back to f16 (dtype code 5) and restore the leading size-1 dim.
    %int5_9787 = torch.constant.int 5
    %6987 = torch.prims.convert_element_type %6986, %int5_9787 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_9788 = torch.constant.int 1
    %int4608_9789 = torch.constant.int 4608
    %int3072_9790 = torch.constant.int 3072
    %6988 = torch.prim.ListConstruct %int1_9788, %int4608_9789, %int3072_9790 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6989 = torch.aten.view %6987, %6988 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Gated residual: %6991 = %6866 + %6884 * %6989 (gate broadcasts over dim 1).
    %6990 = torch.aten.mul.Tensor %6884, %6989 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_9791 = torch.constant.int 1
    %6991 = torch.aten.add.Tensor %6866, %6990, %int1_9791 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // ---- single_blocks.3: modulation ----
    // Same op sequence as the single_blocks.2 modulation, using the
    // single_blocks.3 parameters: SiLU on %119 ([1,3072] f16, defined earlier in
    // the function), projection through modulation.lin to [1,9216], then three
    // [1,1,3072] slices of the last dim:
    //   %7007  -> added after the normalization (shift)
    //   %7010  -> (%7008 + 1), multiplied with the normalized activations (scale)
    //   %7009  -> not consumed in the visible lines; presumably the gate applied
    //             after linear2, by analogy with single_blocks.2 -- confirm below.
    %6992 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.3.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.3.modulation.lin.weight : tensor<9216x3072xf16>
    %6993 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.3.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    // Weight is stored as [9216,3072]; transpose to [3072,9216] for the mm.
    %int0_9792 = torch.constant.int 0
    %int1_9793 = torch.constant.int 1
    %6994 = torch.aten.transpose.int %6993, %int0_9792, %int1_9793 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.3.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.3.modulation.lin.bias : tensor<9216xf16>
    %6995 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.3.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Widen bias, input and weight to f32 (dtype code 6).
    %int6_9794 = torch.constant.int 6
    %6996 = torch.prims.convert_element_type %6995, %int6_9794 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_9795 = torch.constant.int 6
    %6997 = torch.prims.convert_element_type %6992, %int6_9795 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_9796 = torch.constant.int 6
    %6998 = torch.prims.convert_element_type %6994, %int6_9796 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %6999 = torch.aten.mm %6997, %6998 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    %int1_9797 = torch.constant.int 1
    %7000 = torch.aten.mul.Scalar %6999, %int1_9797 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_9798 = torch.constant.int 1
    %7001 = torch.aten.mul.Scalar %6996, %int1_9798 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_9799 = torch.constant.int 1
    %7002 = torch.aten.add.Tensor %7000, %7001, %int1_9799 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    // Narrow back to f16 (dtype code 5).
    %int5_9800 = torch.constant.int 5
    %7003 = torch.prims.convert_element_type %7002, %int5_9800 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Slice over dim 0 from 0 to INT64_MAX with step 1: the shape stays [1,9216].
    %int0_9801 = torch.constant.int 0
    %int0_9802 = torch.constant.int 0
    %int9223372036854775807_9803 = torch.constant.int 9223372036854775807
    %int1_9804 = torch.constant.int 1
    %7004 = torch.aten.slice.Tensor %7003, %int0_9801, %int0_9802, %int9223372036854775807_9803, %int1_9804 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a size-1 dim at position 1 so the pieces broadcast against [1,4608,3072].
    %int1_9805 = torch.constant.int 1
    %7005 = torch.aten.unsqueeze %7004, %int1_9805 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Slice over dim 2 from 0 to INT64_MAX with step 1: the shape stays [1,1,9216].
    %int2_9806 = torch.constant.int 2
    %int0_9807 = torch.constant.int 0
    %int9223372036854775807_9808 = torch.constant.int 9223372036854775807
    %int1_9809 = torch.constant.int 1
    %7006 = torch.aten.slice.Tensor %7005, %int2_9806, %int0_9807, %int9223372036854775807_9808, %int1_9809 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Last-dim range [0, 3072).
    %int-1_9810 = torch.constant.int -1
    %int0_9811 = torch.constant.int 0
    %int3072_9812 = torch.constant.int 3072
    %int1_9813 = torch.constant.int 1
    %7007 = torch.aten.slice.Tensor %7006, %int-1_9810, %int0_9811, %int3072_9812, %int1_9813 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Last-dim range [3072, 6144).
    %int-1_9814 = torch.constant.int -1
    %int3072_9815 = torch.constant.int 3072
    %int6144_9816 = torch.constant.int 6144
    %int1_9817 = torch.constant.int 1
    %7008 = torch.aten.slice.Tensor %7006, %int-1_9814, %int3072_9815, %int6144_9816, %int1_9817 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Last-dim range [6144, 9216).
    %int-1_9818 = torch.constant.int -1
    %int6144_9819 = torch.constant.int 6144
    %int9216_9820 = torch.constant.int 9216
    %int1_9821 = torch.constant.int 1
    %7009 = torch.aten.slice.Tensor %7006, %int-1_9818, %int6144_9819, %int9216_9820, %int1_9821 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %7010 = %7008 + 1 * 1, i.e. (1 + scale).
    %int1_9822 = torch.constant.int 1
    %int1_9823 = torch.constant.int 1
    %7010 = torch.aten.add.Scalar %7008, %int1_9822, %int1_9823 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %6991 ([1,4608,3072] f16) over dimension 2 with no learned
    // weight or bias, then apply the modulation terms %7010 and %7007.
    // Statistics are computed in f32 (dtype code 6 yields an f32 result here).
    %int6_9824 = torch.constant.int 6
    %7011 = torch.prims.convert_element_type %6991, %int6_9824 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    // var_mean over dim 2 with correction = 0 and keepdim = true;
    // %result0_9828 is the variance, %result1_9829 the mean.
    %int2_9825 = torch.constant.int 2
    %7012 = torch.prim.ListConstruct %int2_9825 : (!torch.int) -> !torch.list<int>
    %int0_9826 = torch.constant.int 0
    %true_9827 = torch.constant.bool true
    %result0_9828, %result1_9829 = torch.aten.var_mean.correction %7011, %7012, %int0_9826, %true_9827 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // %7014 = rsqrt(variance + eps), eps being the float constant below (about 1e-6).
    %float9.999990e-07_9830 = torch.constant.float 9.9999999999999995E-7
    %int1_9831 = torch.constant.int 1
    %7013 = torch.aten.add.Scalar %result0_9828, %float9.999990e-07_9830, %int1_9831 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %7014 = torch.aten.rsqrt %7013 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // (x - mean) * rsqrt(var + eps). The subtraction takes the f16 input and
    // the f32 mean and yields f32.
    %int1_9832 = torch.constant.int 1
    %7015 = torch.aten.sub.Tensor %6991, %result1_9829, %int1_9832 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %7016 = torch.aten.mul.Tensor %7015, %7014 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    // Back to f16 (dtype code 5 yields an f16 result here).
    %int5_9833 = torch.constant.int 5
    %7017 = torch.prims.convert_element_type %7016, %int5_9833 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Modulation: %7019 = %7010 * normalized + %7007, broadcast over the 4608 rows.
    %7018 = torch.aten.mul.Tensor %7010, %7017 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_9834 = torch.constant.int 1
    %7019 = torch.aten.add.Tensor %7018, %7007, %int1_9834 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.3.linear1: project 3072 -> 21504 features.
    // Drop the leading unit dimension so the input is a 2-D matrix for mm.
    %int4608_9835 = torch.constant.int 4608
    %int3072_9836 = torch.constant.int 3072
    %7020 = torch.prim.ListConstruct %int4608_9835, %int3072_9836 : (!torch.int, !torch.int) -> !torch.list<int>
    %7021 = torch.aten.view %7019, %7020 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // Weight is stored as [21504,3072]; transpose it to [3072,21504] for x @ W^T.
    %__auto.sampler.single_blocks.3.linear1.weight = util.global.load @__auto.sampler.single_blocks.3.linear1.weight : tensor<21504x3072xf16>
    %7022 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.3.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_9837 = torch.constant.int 0
    %int1_9838 = torch.constant.int 1
    %7023 = torch.aten.transpose.int %7022, %int0_9837, %int1_9838 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.3.linear1.bias = util.global.load @__auto.sampler.single_blocks.3.linear1.bias : tensor<21504xf16>
    %7024 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.3.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Bias, activations and weight are all cast to f32 before the matmul.
    %int6_9839 = torch.constant.int 6
    %7025 = torch.prims.convert_element_type %7024, %int6_9839 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_9840 = torch.constant.int 6
    %7026 = torch.prims.convert_element_type %7021, %int6_9840 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_9841 = torch.constant.int 6
    %7027 = torch.prims.convert_element_type %7023, %int6_9841 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %7028 = torch.aten.mm %7026, %7027 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Both operands are multiplied by the integer 1, which leaves values unchanged.
    // NOTE(review): looks like residue of an addmm-style decomposition with
    // alpha = beta = 1 - confirm against the exporter.
    %int1_9842 = torch.constant.int 1
    %7029 = torch.aten.mul.Scalar %7028, %int1_9842 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_9843 = torch.constant.int 1
    %7030 = torch.aten.mul.Scalar %7025, %int1_9843 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    // Add the bias (broadcast over rows), then cast the result back to f16.
    %int1_9844 = torch.constant.int 1
    %7031 = torch.aten.add.Tensor %7029, %7030, %int1_9844 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_9845 = torch.constant.int 5
    %7032 = torch.prims.convert_element_type %7031, %int5_9845 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    // Restore the leading unit dimension: [4608,21504] -> [1,4608,21504].
    %int1_9846 = torch.constant.int 1
    %int4608_9847 = torch.constant.int 4608
    %int21504_9848 = torch.constant.int 21504
    %7033 = torch.prim.ListConstruct %int1_9846, %int4608_9847, %int21504_9848 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7034 = torch.aten.view %7032, %7033 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the linear1 output along the last dimension:
    //   %7035 = columns [0, 9216)      -> reshaped below into three per-head tensors
    //   %7036 = columns [9216, 21504)  -> 12288 features, passed to gelu further down
    %int-1_9849 = torch.constant.int -1
    %int0_9850 = torch.constant.int 0
    %int9216_9851 = torch.constant.int 9216
    %int1_9852 = torch.constant.int 1
    %7035 = torch.aten.slice.Tensor %7034, %int-1_9849, %int0_9850, %int9216_9851, %int1_9852 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_9853 = torch.constant.int -1
    %int9216_9854 = torch.constant.int 9216
    %int21504_9855 = torch.constant.int 21504
    %int1_9856 = torch.constant.int 1
    %7036 = torch.aten.slice.Tensor %7034, %int-1_9853, %int9216_9854, %int21504_9855, %int1_9856 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View 9216 as 3 x 24 x 128, giving [1,4608,3,24,128].
    %int1_9857 = torch.constant.int 1
    %int4608_9858 = torch.constant.int 4608
    %int3_9859 = torch.constant.int 3
    %int24_9860 = torch.constant.int 24
    %int128_9861 = torch.constant.int 128
    %7037 = torch.prim.ListConstruct %int1_9857, %int4608_9858, %int3_9859, %int24_9860, %int128_9861 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7038 = torch.aten.view %7035, %7037 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2,0,3,1,4): [1,4608,3,24,128] -> [3,1,24,4608,128].
    %int2_9862 = torch.constant.int 2
    %int0_9863 = torch.constant.int 0
    %int3_9864 = torch.constant.int 3
    %int1_9865 = torch.constant.int 1
    %int4_9866 = torch.constant.int 4
    %7039 = torch.prim.ListConstruct %int2_9862, %int0_9863, %int3_9864, %int1_9865, %int4_9866 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7040 = torch.aten.permute %7038, %7039 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Select indices 0, 1 and 2 along dim 0, each giving [1,24,4608,128].
    // %7041 is later scaled by query_norm.scale and %7042 by key_norm.scale.
    // %7043 is passed unchanged as the third tensor operand of the attention op
    // (presumably the value tensor - confirm).
    %int0_9867 = torch.constant.int 0
    %int0_9868 = torch.constant.int 0
    %7041 = torch.aten.select.int %7040, %int0_9867, %int0_9868 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_9869 = torch.constant.int 0
    %int1_9870 = torch.constant.int 1
    %7042 = torch.aten.select.int %7040, %int0_9869, %int1_9870 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_9871 = torch.constant.int 0
    %int2_9872 = torch.constant.int 2
    %7043 = torch.aten.select.int %7040, %int0_9871, %int2_9872 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Root-mean-square normalization of %7041 over the last dimension (128),
    // followed by the learned per-channel scale query_norm.scale:
    //   %7053 = f16( x * rsqrt(mean(x^2, dim=-1, keepdim) + eps) ) * scale
    // The statistics are computed in f32.
    %int6_9873 = torch.constant.int 6
    %7044 = torch.prims.convert_element_type %7041, %int6_9873 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_9874 = torch.constant.int 2
    %7045 = torch.aten.pow.Tensor_Scalar %7044, %int2_9874 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_9875 = torch.constant.int -1
    %7046 = torch.prim.ListConstruct %int-1_9875 : (!torch.int) -> !torch.list<int>
    %true_9876 = torch.constant.bool true
    %none_9877 = torch.constant.none
    %7047 = torch.aten.mean.dim %7045, %7046, %true_9876, %none_9877 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // eps is the float constant below (about 1e-6).
    %float9.999990e-07_9878 = torch.constant.float 9.9999999999999995E-7
    %int1_9879 = torch.constant.int 1
    %7048 = torch.aten.add.Scalar %7047, %float9.999990e-07_9878, %int1_9879 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %7049 = torch.aten.rsqrt %7048 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %7050 = torch.aten.mul.Tensor %7044, %7049 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_9880 = torch.constant.int 5
    %7051 = torch.prims.convert_element_type %7050, %int5_9880 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.3.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.3.norm.query_norm.scale : tensor<128xf16>
    %7052 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.3.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %7053 = torch.aten.mul.Tensor %7051, %7052 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Same normalization applied to %7042, scaled by key_norm.scale, giving %7063.
    %int6_9881 = torch.constant.int 6
    %7054 = torch.prims.convert_element_type %7042, %int6_9881 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_9882 = torch.constant.int 2
    %7055 = torch.aten.pow.Tensor_Scalar %7054, %int2_9882 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_9883 = torch.constant.int -1
    %7056 = torch.prim.ListConstruct %int-1_9883 : (!torch.int) -> !torch.list<int>
    %true_9884 = torch.constant.bool true
    %none_9885 = torch.constant.none
    %7057 = torch.aten.mean.dim %7055, %7056, %true_9884, %none_9885 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_9886 = torch.constant.float 9.9999999999999995E-7
    %int1_9887 = torch.constant.int 1
    %7058 = torch.aten.add.Scalar %7057, %float9.999990e-07_9886, %int1_9887 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %7059 = torch.aten.rsqrt %7058 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %7060 = torch.aten.mul.Tensor %7054, %7059 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_9888 = torch.constant.int 5
    %7061 = torch.prims.convert_element_type %7060, %int5_9888 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.3.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.3.norm.key_norm.scale : tensor<128xf16>
    %7062 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.3.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %7063 = torch.aten.mul.Tensor %7061, %7062 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Apply the per-position coefficients in %211 ([1,1,4608,64,2,2] f32,
    // defined above this chunk) to the normalized tensors %7053 and %7063.
    // Each 128-wide feature vector is viewed as 64 pairs (x0, x1) and mapped to
    //   out[..., i] = %211[..., i, 0] * x0 + %211[..., i, 1] * x1   for i in {0, 1}
    // NOTE(review): this matches a rotary-position-embedding style 2x2
    // transform - confirm where %211 is constructed.
    //
    // These two casts are f16 -> f16 and do not change the element type.
    %int5_9889 = torch.constant.int 5
    %7064 = torch.prims.convert_element_type %7053, %int5_9889 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_9890 = torch.constant.int 5
    %7065 = torch.prims.convert_element_type %7063, %int5_9890 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // First tensor: cast to f32 and view as [1,24,4608,64,1,2].
    %int6_9891 = torch.constant.int 6
    %7066 = torch.prims.convert_element_type %7064, %int6_9891 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_9892 = torch.constant.int 1
    %int24_9893 = torch.constant.int 24
    %int4608_9894 = torch.constant.int 4608
    %int64_9895 = torch.constant.int 64
    %int1_9896 = torch.constant.int 1
    %int2_9897 = torch.constant.int 2
    %7067 = torch.prim.ListConstruct %int1_9892, %int24_9893, %int4608_9894, %int64_9895, %int1_9896, %int2_9897 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7068 = torch.aten.view %7066, %7067 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Second tensor: same cast and view.
    %int6_9898 = torch.constant.int 6
    %7069 = torch.prims.convert_element_type %7065, %int6_9898 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_9899 = torch.constant.int 1
    %int24_9900 = torch.constant.int 24
    %int4608_9901 = torch.constant.int 4608
    %int64_9902 = torch.constant.int 64
    %int1_9903 = torch.constant.int 1
    %int2_9904 = torch.constant.int 2
    %7070 = torch.prim.ListConstruct %int1_9899, %int24_9900, %int4608_9901, %int64_9902, %int1_9903, %int2_9904 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7071 = torch.aten.view %7069, %7070 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // First tensor, term 0: %211[..., 0] * x[..., 0], broadcast over the 24 heads.
    %int5_9905 = torch.constant.int 5
    %int0_9906 = torch.constant.int 0
    %7072 = torch.aten.select.int %211, %int5_9905, %int0_9906 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9907 = torch.constant.int 5
    %int0_9908 = torch.constant.int 0
    %7073 = torch.aten.select.int %7068, %int5_9907, %int0_9908 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7074 = torch.aten.mul.Tensor %7072, %7073 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // First tensor, term 1: %211[..., 1] * x[..., 1].
    %int5_9909 = torch.constant.int 5
    %int1_9910 = torch.constant.int 1
    %7075 = torch.aten.select.int %211, %int5_9909, %int1_9910 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9911 = torch.constant.int 5
    %int1_9912 = torch.constant.int 1
    %7076 = torch.aten.select.int %7068, %int5_9911, %int1_9912 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7077 = torch.aten.mul.Tensor %7075, %7076 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two terms for the first tensor.
    %int1_9913 = torch.constant.int 1
    %7078 = torch.aten.add.Tensor %7074, %7077, %int1_9913 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second tensor: the same two terms and sum, reading %7071 instead of %7068.
    %int5_9914 = torch.constant.int 5
    %int0_9915 = torch.constant.int 0
    %7079 = torch.aten.select.int %211, %int5_9914, %int0_9915 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9916 = torch.constant.int 5
    %int0_9917 = torch.constant.int 0
    %7080 = torch.aten.select.int %7071, %int5_9916, %int0_9917 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7081 = torch.aten.mul.Tensor %7079, %7080 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_9918 = torch.constant.int 5
    %int1_9919 = torch.constant.int 1
    %7082 = torch.aten.select.int %211, %int5_9918, %int1_9919 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9920 = torch.constant.int 5
    %int1_9921 = torch.constant.int 1
    %7083 = torch.aten.select.int %7071, %int5_9920, %int1_9921 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7084 = torch.aten.mul.Tensor %7082, %7083 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_9922 = torch.constant.int 1
    %7085 = torch.aten.add.Tensor %7081, %7084, %int1_9922 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Flatten the (64,2) pairs back to 128 features and cast to f16 -> %7088.
    %int1_9923 = torch.constant.int 1
    %int24_9924 = torch.constant.int 24
    %int4608_9925 = torch.constant.int 4608
    %int128_9926 = torch.constant.int 128
    %7086 = torch.prim.ListConstruct %int1_9923, %int24_9924, %int4608_9925, %int128_9926 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7087 = torch.aten.view %7078, %7086 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_9927 = torch.constant.int 5
    %7088 = torch.prims.convert_element_type %7087, %int5_9927 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Same flatten and cast for the second tensor -> %7091.
    %int1_9928 = torch.constant.int 1
    %int24_9929 = torch.constant.int 24
    %int4608_9930 = torch.constant.int 4608
    %int128_9931 = torch.constant.int 128
    %7089 = torch.prim.ListConstruct %int1_9928, %int24_9929, %int4608_9930, %int128_9931 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7090 = torch.aten.view %7085, %7089 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_9932 = torch.constant.int 5
    %7091 = torch.prims.convert_element_type %7090, %int5_9932 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled dot-product attention over %7088, %7091 and %7043, each
    // [1,24,4608,128] f16. It is emitted as a generic torch.operator, so the
    // op name is carried as a string.
    // NOTE(review): the trailing operands (0.0, false, none, none) presumably
    // map to dropout_p, is_causal, attn_mask and scale - confirm against the
    // aten schema.
    // Only result #0 is used in this chunk; result #1 ([1,24,4608] f32) is not
    // referenced here.
    %float0.000000e00_9933 = torch.constant.float 0.000000e+00
    %false_9934 = torch.constant.bool false
    %none_9935 = torch.constant.none
    %none_9936 = torch.constant.none
    %7092:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%7088, %7091, %7043, %float0.000000e00_9933, %false_9934, %none_9935, %none_9936) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Permute with order (0,2,1,3): [1,24,4608,128] -> [1,4608,24,128].
    %int0_9937 = torch.constant.int 0
    %int2_9938 = torch.constant.int 2
    %int1_9939 = torch.constant.int 1
    %int3_9940 = torch.constant.int 3
    %7093 = torch.prim.ListConstruct %int0_9937, %int2_9938, %int1_9939, %int3_9940 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7094 = torch.aten.permute %7092#0, %7093 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the 24 x 128 trailing dims into 3072 features.
    %int1_9941 = torch.constant.int 1
    %int4608_9942 = torch.constant.int 4608
    %int3072_9943 = torch.constant.int 3072
    %7095 = torch.prim.ListConstruct %int1_9941, %int4608_9942, %int3072_9943 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7096 = torch.aten.view %7094, %7095 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // GELU with the "tanh" approximation on the 12288-wide slice %7036.
    %str_9944 = torch.constant.str "tanh"
    %7097 = torch.aten.gelu %7036, %str_9944 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    // Concatenate the attention output (3072) and the GELU output (12288)
    // along dim 2 -> 15360 features.
    %7098 = torch.prim.ListConstruct %7096, %7097 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_9945 = torch.constant.int 2
    %7099 = torch.aten.cat %7098, %int2_9945 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // single_blocks.3.linear2: project 15360 -> 3072 features.
    // Flatten to 2-D for mm.
    %int4608_9946 = torch.constant.int 4608
    %int15360_9947 = torch.constant.int 15360
    %7100 = torch.prim.ListConstruct %int4608_9946, %int15360_9947 : (!torch.int, !torch.int) -> !torch.list<int>
    %7101 = torch.aten.view %7099, %7100 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // Weight is stored as [3072,15360]; transpose it to [15360,3072].
    %__auto.sampler.single_blocks.3.linear2.weight = util.global.load @__auto.sampler.single_blocks.3.linear2.weight : tensor<3072x15360xf16>
    %7102 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.3.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_9948 = torch.constant.int 0
    %int1_9949 = torch.constant.int 1
    %7103 = torch.aten.transpose.int %7102, %int0_9948, %int1_9949 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.3.linear2.bias = util.global.load @__auto.sampler.single_blocks.3.linear2.bias : tensor<3072xf16>
    %7104 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.3.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Bias, activations and weight are cast to f32 before the matmul.
    %int6_9950 = torch.constant.int 6
    %7105 = torch.prims.convert_element_type %7104, %int6_9950 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_9951 = torch.constant.int 6
    %7106 = torch.prims.convert_element_type %7101, %int6_9951 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_9952 = torch.constant.int 6
    %7107 = torch.prims.convert_element_type %7103, %int6_9952 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %7108 = torch.aten.mm %7106, %7107 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Multiplications by the integer 1 leave values unchanged.
    %int1_9953 = torch.constant.int 1
    %7109 = torch.aten.mul.Scalar %7108, %int1_9953 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_9954 = torch.constant.int 1
    %7110 = torch.aten.mul.Scalar %7105, %int1_9954 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Add the bias, cast back to f16, restore the leading unit dimension.
    %int1_9955 = torch.constant.int 1
    %7111 = torch.aten.add.Tensor %7109, %7110, %int1_9955 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_9956 = torch.constant.int 5
    %7112 = torch.prims.convert_element_type %7111, %int5_9956 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_9957 = torch.constant.int 1
    %int4608_9958 = torch.constant.int 4608
    %int3072_9959 = torch.constant.int 3072
    %7113 = torch.prim.ListConstruct %int1_9957, %int4608_9958, %int3072_9959 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7114 = torch.aten.view %7112, %7113 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Residual: %7116 = %6991 + %7009 * %7114, where %7009 ([1,1,3072]) is
    // broadcast over the 4608 rows. %7116 is the input to the following block.
    %7115 = torch.aten.mul.Tensor %7009, %7114 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_9960 = torch.constant.int 1
    %7116 = torch.aten.add.Tensor %6991, %7115, %int1_9960 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.4.modulation.lin: SiLU on %119 ([1,3072] f16, defined above
    // this chunk), then a 3072 -> 9216 linear layer.
    %7117 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Weight is stored as [9216,3072]; transpose it to [3072,9216].
    %__auto.sampler.single_blocks.4.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.4.modulation.lin.weight : tensor<9216x3072xf16>
    %7118 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.4.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_9961 = torch.constant.int 0
    %int1_9962 = torch.constant.int 1
    %7119 = torch.aten.transpose.int %7118, %int0_9961, %int1_9962 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.4.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.4.modulation.lin.bias : tensor<9216xf16>
    %7120 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.4.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Bias, activations and weight are cast to f32 before the matmul.
    %int6_9963 = torch.constant.int 6
    %7121 = torch.prims.convert_element_type %7120, %int6_9963 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_9964 = torch.constant.int 6
    %7122 = torch.prims.convert_element_type %7117, %int6_9964 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_9965 = torch.constant.int 6
    %7123 = torch.prims.convert_element_type %7119, %int6_9965 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %7124 = torch.aten.mm %7122, %7123 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Multiplications by the integer 1 leave values unchanged.
    %int1_9966 = torch.constant.int 1
    %7125 = torch.aten.mul.Scalar %7124, %int1_9966 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_9967 = torch.constant.int 1
    %7126 = torch.aten.mul.Scalar %7121, %int1_9967 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    // Add the bias and cast back to f16.
    %int1_9968 = torch.constant.int 1
    %7127 = torch.aten.add.Tensor %7125, %7126, %int1_9968 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_9969 = torch.constant.int 5
    %7128 = torch.prims.convert_element_type %7127, %int5_9969 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): the result
    // type is identical to the input type.
    %int0_9970 = torch.constant.int 0
    %int0_9971 = torch.constant.int 0
    %int9223372036854775807_9972 = torch.constant.int 9223372036854775807
    %int1_9973 = torch.constant.int 1
    %7129 = torch.aten.slice.Tensor %7128, %int0_9970, %int0_9971, %int9223372036854775807_9972, %int1_9973 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a unit dimension at position 1: [1,9216] -> [1,1,9216].
    %int1_9974 = torch.constant.int 1
    %7130 = torch.aten.unsqueeze %7129, %int1_9974 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Full-range slice on dim 2; again the result type equals the input type.
    %int2_9975 = torch.constant.int 2
    %int0_9976 = torch.constant.int 0
    %int9223372036854775807_9977 = torch.constant.int 9223372036854775807
    %int1_9978 = torch.constant.int 1
    %7131 = torch.aten.slice.Tensor %7130, %int2_9975, %int0_9976, %int9223372036854775807_9977, %int1_9978 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Split %7131 ([1,1,9216] f16) into three 3072-wide pieces along the last
    // dimension:
    //   %7132 = columns [0, 3072)     -> added after normalization further down
    //   %7133 = columns [3072, 6144)  -> becomes %7135 = %7133 + 1, a multiplier
    //   %7134 = columns [6144, 9216)  -> not consumed within the visible lines
    %int-1_9979 = torch.constant.int -1
    %int0_9980 = torch.constant.int 0
    %int3072_9981 = torch.constant.int 3072
    %int1_9982 = torch.constant.int 1
    %7132 = torch.aten.slice.Tensor %7131, %int-1_9979, %int0_9980, %int3072_9981, %int1_9982 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_9983 = torch.constant.int -1
    %int3072_9984 = torch.constant.int 3072
    %int6144_9985 = torch.constant.int 6144
    %int1_9986 = torch.constant.int 1
    %7133 = torch.aten.slice.Tensor %7131, %int-1_9983, %int3072_9984, %int6144_9985, %int1_9986 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_9987 = torch.constant.int -1
    %int6144_9988 = torch.constant.int 6144
    %int9216_9989 = torch.constant.int 9216
    %int1_9990 = torch.constant.int 1
    %7134 = torch.aten.slice.Tensor %7131, %int-1_9987, %int6144_9988, %int9216_9989, %int1_9990 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %7135 = %7133 + 1 (scalar 1, alpha 1).
    %int1_9991 = torch.constant.int 1
    %int1_9992 = torch.constant.int 1
    %7135 = torch.aten.add.Scalar %7133, %int1_9991, %int1_9992 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %7116 ([1,4608,3072] f16) over dimension 2 with no learned
    // weight or bias, then apply the modulation terms %7135 and %7132.
    // Statistics are computed in f32.
    %int6_9993 = torch.constant.int 6
    %7136 = torch.prims.convert_element_type %7116, %int6_9993 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    // var_mean over dim 2 with correction = 0 and keepdim = true;
    // %result0_9997 is the variance, %result1_9998 the mean.
    %int2_9994 = torch.constant.int 2
    %7137 = torch.prim.ListConstruct %int2_9994 : (!torch.int) -> !torch.list<int>
    %int0_9995 = torch.constant.int 0
    %true_9996 = torch.constant.bool true
    %result0_9997, %result1_9998 = torch.aten.var_mean.correction %7136, %7137, %int0_9995, %true_9996 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // %7139 = rsqrt(variance + eps), eps being the float constant below (about 1e-6).
    %float9.999990e-07_9999 = torch.constant.float 9.9999999999999995E-7
    %int1_10000 = torch.constant.int 1
    %7138 = torch.aten.add.Scalar %result0_9997, %float9.999990e-07_9999, %int1_10000 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %7139 = torch.aten.rsqrt %7138 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // (x - mean) * rsqrt(var + eps); the f16 input minus the f32 mean yields f32.
    %int1_10001 = torch.constant.int 1
    %7140 = torch.aten.sub.Tensor %7116, %result1_9998, %int1_10001 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %7141 = torch.aten.mul.Tensor %7140, %7139 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    // Back to f16.
    %int5_10002 = torch.constant.int 5
    %7142 = torch.prims.convert_element_type %7141, %int5_10002 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Modulation: %7144 = %7135 * normalized + %7132, broadcast over the 4608 rows.
    %7143 = torch.aten.mul.Tensor %7135, %7142 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_10003 = torch.constant.int 1
    %7144 = torch.aten.add.Tensor %7143, %7132, %int1_10003 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.4.linear1: project 3072 -> 21504 features.
    // Drop the leading unit dimension so the input is a 2-D matrix for mm.
    %int4608_10004 = torch.constant.int 4608
    %int3072_10005 = torch.constant.int 3072
    %7145 = torch.prim.ListConstruct %int4608_10004, %int3072_10005 : (!torch.int, !torch.int) -> !torch.list<int>
    %7146 = torch.aten.view %7144, %7145 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // Weight is stored as [21504,3072]; transpose it to [3072,21504].
    %__auto.sampler.single_blocks.4.linear1.weight = util.global.load @__auto.sampler.single_blocks.4.linear1.weight : tensor<21504x3072xf16>
    %7147 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.4.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_10006 = torch.constant.int 0
    %int1_10007 = torch.constant.int 1
    %7148 = torch.aten.transpose.int %7147, %int0_10006, %int1_10007 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.4.linear1.bias = util.global.load @__auto.sampler.single_blocks.4.linear1.bias : tensor<21504xf16>
    %7149 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.4.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Bias, activations and weight are cast to f32 before the matmul.
    %int6_10008 = torch.constant.int 6
    %7150 = torch.prims.convert_element_type %7149, %int6_10008 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_10009 = torch.constant.int 6
    %7151 = torch.prims.convert_element_type %7146, %int6_10009 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_10010 = torch.constant.int 6
    %7152 = torch.prims.convert_element_type %7148, %int6_10010 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %7153 = torch.aten.mm %7151, %7152 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Multiplications by the integer 1 leave values unchanged.
    %int1_10011 = torch.constant.int 1
    %7154 = torch.aten.mul.Scalar %7153, %int1_10011 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_10012 = torch.constant.int 1
    %7155 = torch.aten.mul.Scalar %7150, %int1_10012 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    // Add the bias (broadcast over rows), then cast the result back to f16.
    %int1_10013 = torch.constant.int 1
    %7156 = torch.aten.add.Tensor %7154, %7155, %int1_10013 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_10014 = torch.constant.int 5
    %7157 = torch.prims.convert_element_type %7156, %int5_10014 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    // Restore the leading unit dimension: [4608,21504] -> [1,4608,21504].
    %int1_10015 = torch.constant.int 1
    %int4608_10016 = torch.constant.int 4608
    %int21504_10017 = torch.constant.int 21504
    %7158 = torch.prim.ListConstruct %int1_10015, %int4608_10016, %int21504_10017 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7159 = torch.aten.view %7157, %7158 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the fused [1,4608,21504] f16 projection (%7159) along its last dim:
    //   columns [0, 9216)     -> %7160 (reshaped into per-head tensors below)
    //   columns [9216, 21504) -> %7161 (12288 wide; fed to the tanh-GELU later)
    %int-1_10018 = torch.constant.int -1
    %int0_10019 = torch.constant.int 0
    %int9216_10020 = torch.constant.int 9216
    %int1_10021 = torch.constant.int 1
    %7160 = torch.aten.slice.Tensor %7159, %int-1_10018, %int0_10019, %int9216_10020, %int1_10021 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_10022 = torch.constant.int -1
    %int9216_10023 = torch.constant.int 9216
    %int21504_10024 = torch.constant.int 21504
    %int1_10025 = torch.constant.int 1
    %7161 = torch.aten.slice.Tensor %7159, %int-1_10022, %int9216_10023, %int21504_10024, %int1_10025 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // Reinterpret the 9216 columns as 3 x 24 x 128 (9216 = 3 * 24 * 128).
    %int1_10026 = torch.constant.int 1
    %int4608_10027 = torch.constant.int 4608
    %int3_10028 = torch.constant.int 3
    %int24_10029 = torch.constant.int 24
    %int128_10030 = torch.constant.int 128
    %7162 = torch.prim.ListConstruct %int1_10026, %int4608_10027, %int3_10028, %int24_10029, %int128_10030 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7163 = torch.aten.view %7160, %7162 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2,0,3,1,4): [1,4608,3,24,128] -> [3,1,24,4608,128],
    // bringing the size-3 axis to the front so it can be indexed with select.int.
    %int2_10031 = torch.constant.int 2
    %int0_10032 = torch.constant.int 0
    %int3_10033 = torch.constant.int 3
    %int1_10034 = torch.constant.int 1
    %int4_10035 = torch.constant.int 4
    %7164 = torch.prim.ListConstruct %int2_10031, %int0_10032, %int3_10033, %int1_10034, %int4_10035 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7165 = torch.aten.permute %7163, %7164 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0 of the leading axis -> %7166; it is the tensor later scaled by
    // single_blocks.4.norm.query_norm.scale.
    %int0_10036 = torch.constant.int 0
    %int0_10037 = torch.constant.int 0
    %7166 = torch.aten.select.int %7165, %int0_10036, %int0_10037 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 1 -> %7167; later scaled by single_blocks.4.norm.key_norm.scale.
    %int0_10038 = torch.constant.int 0
    %int1_10039 = torch.constant.int 1
    %7167 = torch.aten.select.int %7165, %int0_10038, %int1_10039 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 2 -> %7168; passed unnormalized as the third operand of the attention op.
    %int0_10040 = torch.constant.int 0
    %int2_10041 = torch.constant.int 2
    %7168 = torch.aten.select.int %7165, %int0_10040, %int2_10041 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // RMS-style normalization of %7166 over its last (128-wide) dim, followed by
    // the learned single_blocks.4 query_norm scale:
    //   %7178 = f16( x * rsqrt(mean(x^2, dim=-1, keepdim) + eps) ) * scale
    // The statistics are computed in f32 (dtype code 6 yields f32 per the result
    // type); the normalized value is cast to f16 (code 5) before the scale multiply.
    %int6_10042 = torch.constant.int 6
    %7169 = torch.prims.convert_element_type %7166, %int6_10042 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // x^2
    %int2_10043 = torch.constant.int 2
    %7170 = torch.aten.pow.Tensor_Scalar %7169, %int2_10043 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // mean over dim -1 with keepdim=true -> [1,24,4608,1]
    %int-1_10044 = torch.constant.int -1
    %7171 = torch.prim.ListConstruct %int-1_10044 : (!torch.int) -> !torch.list<int>
    %true_10045 = torch.constant.bool true
    %none_10046 = torch.constant.none
    %7172 = torch.aten.mean.dim %7170, %7171, %true_10045, %none_10046 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // + eps (the constant is the double nearest to 1e-6), then reciprocal square root
    %float9.999990e-07_10047 = torch.constant.float 9.9999999999999995E-7
    %int1_10048 = torch.constant.int 1
    %7173 = torch.aten.add.Scalar %7172, %float9.999990e-07_10047, %int1_10048 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %7174 = torch.aten.rsqrt %7173 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    // Broadcast the [.,.,.,1] factor across the 128 channels.
    %7175 = torch.aten.mul.Tensor %7169, %7174 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10049 = torch.constant.int 5
    %7176 = torch.prims.convert_element_type %7175, %int5_10049 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Per-channel scale ([128] f16 parameter), broadcast over the leading dims.
    %__auto.sampler.single_blocks.4.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.4.norm.query_norm.scale : tensor<128xf16>
    %7177 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.4.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %7178 = torch.aten.mul.Tensor %7176, %7177 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Same normalization as applied to %7166, now on %7167 with the
    // single_blocks.4 key_norm scale:
    //   %7188 = f16( x * rsqrt(mean(x^2, dim=-1, keepdim) + eps) ) * scale
    // Statistics in f32, result cast to f16 before the scale multiply.
    %int6_10050 = torch.constant.int 6
    %7179 = torch.prims.convert_element_type %7167, %int6_10050 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // x^2
    %int2_10051 = torch.constant.int 2
    %7180 = torch.aten.pow.Tensor_Scalar %7179, %int2_10051 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // mean over dim -1 with keepdim=true -> [1,24,4608,1]
    %int-1_10052 = torch.constant.int -1
    %7181 = torch.prim.ListConstruct %int-1_10052 : (!torch.int) -> !torch.list<int>
    %true_10053 = torch.constant.bool true
    %none_10054 = torch.constant.none
    %7182 = torch.aten.mean.dim %7180, %7181, %true_10053, %none_10054 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // + eps (the double nearest to 1e-6), then reciprocal square root
    %float9.999990e-07_10055 = torch.constant.float 9.9999999999999995E-7
    %int1_10056 = torch.constant.int 1
    %7183 = torch.aten.add.Scalar %7182, %float9.999990e-07_10055, %int1_10056 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %7184 = torch.aten.rsqrt %7183 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    // Broadcast the [.,.,.,1] factor across the 128 channels.
    %7185 = torch.aten.mul.Tensor %7179, %7184 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10057 = torch.constant.int 5
    %7186 = torch.prims.convert_element_type %7185, %int5_10057 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Per-channel scale ([128] f16 parameter), broadcast over the leading dims.
    %__auto.sampler.single_blocks.4.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.4.norm.key_norm.scale : tensor<128xf16>
    %7187 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.4.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %7188 = torch.aten.mul.Tensor %7186, %7187 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Combine the normalized tensors %7178 and %7188 with the table %211
    // ([1,1,4608,64,2,2] f32). NOTE(review): %211 is defined outside this chunk;
    // the arithmetic matches a rotary position embedding with %211 holding 2x2
    // rotation matrices per (position, channel pair) - confirm at its definition.
    // For each input x, viewed as [1,24,4608,64,1,2]:
    //   out = %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]
    // with the size-1 head dim of %211 broadcast over the 24 heads.
    //
    // These two casts are f16 -> f16 (input and result types are identical).
    %int5_10058 = torch.constant.int 5
    %7189 = torch.prims.convert_element_type %7178, %int5_10058 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_10059 = torch.constant.int 5
    %7190 = torch.prims.convert_element_type %7188, %int5_10059 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Upcast the first input to f32 and view its 128 channels as 64 x 1 x 2.
    %int6_10060 = torch.constant.int 6
    %7191 = torch.prims.convert_element_type %7189, %int6_10060 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_10061 = torch.constant.int 1
    %int24_10062 = torch.constant.int 24
    %int4608_10063 = torch.constant.int 4608
    %int64_10064 = torch.constant.int 64
    %int1_10065 = torch.constant.int 1
    %int2_10066 = torch.constant.int 2
    %7192 = torch.prim.ListConstruct %int1_10061, %int24_10062, %int4608_10063, %int64_10064, %int1_10065, %int2_10066 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7193 = torch.aten.view %7191, %7192 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Same upcast and view for the second input.
    %int6_10067 = torch.constant.int 6
    %7194 = torch.prims.convert_element_type %7190, %int6_10067 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_10068 = torch.constant.int 1
    %int24_10069 = torch.constant.int 24
    %int4608_10070 = torch.constant.int 4608
    %int64_10071 = torch.constant.int 64
    %int1_10072 = torch.constant.int 1
    %int2_10073 = torch.constant.int 2
    %7195 = torch.prim.ListConstruct %int1_10068, %int24_10069, %int4608_10070, %int64_10071, %int1_10072, %int2_10073 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7196 = torch.aten.view %7194, %7195 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // First input, term 0: %211[..., 0] ([.,.,.,64,2]) * x[..., 0] ([.,.,.,64,1]).
    %int5_10074 = torch.constant.int 5
    %int0_10075 = torch.constant.int 0
    %7197 = torch.aten.select.int %211, %int5_10074, %int0_10075 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10076 = torch.constant.int 5
    %int0_10077 = torch.constant.int 0
    %7198 = torch.aten.select.int %7193, %int5_10076, %int0_10077 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7199 = torch.aten.mul.Tensor %7197, %7198 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // First input, term 1: %211[..., 1] * x[..., 1].
    %int5_10078 = torch.constant.int 5
    %int1_10079 = torch.constant.int 1
    %7200 = torch.aten.select.int %211, %int5_10078, %int1_10079 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10080 = torch.constant.int 5
    %int1_10081 = torch.constant.int 1
    %7201 = torch.aten.select.int %7193, %int5_10080, %int1_10081 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7202 = torch.aten.mul.Tensor %7200, %7201 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two terms (alpha = 1).
    %int1_10082 = torch.constant.int 1
    %7203 = torch.aten.add.Tensor %7199, %7202, %int1_10082 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second input, term 0 (the same slice of %211 is re-selected).
    %int5_10083 = torch.constant.int 5
    %int0_10084 = torch.constant.int 0
    %7204 = torch.aten.select.int %211, %int5_10083, %int0_10084 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10085 = torch.constant.int 5
    %int0_10086 = torch.constant.int 0
    %7205 = torch.aten.select.int %7196, %int5_10085, %int0_10086 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7206 = torch.aten.mul.Tensor %7204, %7205 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second input, term 1.
    %int5_10087 = torch.constant.int 5
    %int1_10088 = torch.constant.int 1
    %7207 = torch.aten.select.int %211, %int5_10087, %int1_10088 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10089 = torch.constant.int 5
    %int1_10090 = torch.constant.int 1
    %7208 = torch.aten.select.int %7196, %int5_10089, %int1_10090 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7209 = torch.aten.mul.Tensor %7207, %7208 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_10091 = torch.constant.int 1
    %7210 = torch.aten.add.Tensor %7206, %7209, %int1_10091 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Flatten [64,2] back to 128 channels and return to f16: first result -> %7213.
    %int1_10092 = torch.constant.int 1
    %int24_10093 = torch.constant.int 24
    %int4608_10094 = torch.constant.int 4608
    %int128_10095 = torch.constant.int 128
    %7211 = torch.prim.ListConstruct %int1_10092, %int24_10093, %int4608_10094, %int128_10095 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7212 = torch.aten.view %7203, %7211 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10096 = torch.constant.int 5
    %7213 = torch.prims.convert_element_type %7212, %int5_10096 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Same flatten and cast for the second result -> %7216.
    %int1_10097 = torch.constant.int 1
    %int24_10098 = torch.constant.int 24
    %int4608_10099 = torch.constant.int 4608
    %int128_10100 = torch.constant.int 128
    %7214 = torch.prim.ListConstruct %int1_10097, %int24_10098, %int4608_10099, %int128_10100 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7215 = torch.aten.view %7210, %7214 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10101 = torch.constant.int 5
    %7216 = torch.prims.convert_element_type %7215, %int5_10101 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled dot-product attention over [1,24,4608,128] f16 operands:
    // %7213 and %7216 are the position-combined query/key tensors, %7168 is the
    // third (value) operand. The trailing operands are 0.0, false, none, none;
    // in the PyTorch op signature these are dropout_p, is_causal, attn_mask and
    // scale (none for scale means the op's default scaling is used).
    // The op returns two results; only #0 is consumed in the visible code.
    %float0.000000e00_10102 = torch.constant.float 0.000000e+00
    %false_10103 = torch.constant.bool false
    %none_10104 = torch.constant.none
    %none_10105 = torch.constant.none
    %7217:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%7213, %7216, %7168, %float0.000000e00_10102, %false_10103, %none_10104, %none_10105) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2: [1,24,4608,128] -> [1,4608,24,128].
    %int0_10106 = torch.constant.int 0
    %int2_10107 = torch.constant.int 2
    %int1_10108 = torch.constant.int 1
    %int3_10109 = torch.constant.int 3
    %7218 = torch.prim.ListConstruct %int0_10106, %int2_10107, %int1_10108, %int3_10109 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7219 = torch.aten.permute %7217#0, %7218 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the [24,128] dims into 3072 features (24 * 128 = 3072).
    %int1_10110 = torch.constant.int 1
    %int4608_10111 = torch.constant.int 4608
    %int3072_10112 = torch.constant.int 3072
    %7220 = torch.prim.ListConstruct %int1_10110, %int4608_10111, %int3072_10112 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7221 = torch.aten.view %7219, %7220 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Apply GELU with the "tanh" approximation to the 12288-wide slice %7161.
    %str_10113 = torch.constant.str "tanh"
    %7222 = torch.aten.gelu %7161, %str_10113 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    // Concatenate the attention output (%7221, 3072 wide) with the GELU output
    // along dim 2: 3072 + 12288 = 15360 features.
    %7223 = torch.prim.ListConstruct %7221, %7222 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_10114 = torch.constant.int 2
    %7224 = torch.aten.cat %7223, %int2_10114 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Drop the leading size-1 dim so the result can feed a 2-D matmul.
    %int4608_10115 = torch.constant.int 4608
    %int15360_10116 = torch.constant.int 15360
    %7225 = torch.prim.ListConstruct %int4608_10115, %int15360_10116 : (!torch.int, !torch.int) -> !torch.list<int>
    %7226 = torch.aten.view %7224, %7225 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // single_blocks.4.linear2: %7239 = f16( %7226 @ W^T + b ), computed in f32.
    // The weight is stored as [3072,15360] and transposed to [15360,3072] for the mm.
    %__auto.sampler.single_blocks.4.linear2.weight = util.global.load @__auto.sampler.single_blocks.4.linear2.weight : tensor<3072x15360xf16>
    %7227 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.4.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_10117 = torch.constant.int 0
    %int1_10118 = torch.constant.int 1
    %7228 = torch.aten.transpose.int %7227, %int0_10117, %int1_10118 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.4.linear2.bias = util.global.load @__auto.sampler.single_blocks.4.linear2.bias : tensor<3072xf16>
    %7229 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.4.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, activations and weight to f32 (dtype code 6) before the matmul.
    %int6_10119 = torch.constant.int 6
    %7230 = torch.prims.convert_element_type %7229, %int6_10119 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_10120 = torch.constant.int 6
    %7231 = torch.prims.convert_element_type %7226, %int6_10120 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_10121 = torch.constant.int 6
    %7232 = torch.prims.convert_element_type %7228, %int6_10121 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %7233 = torch.aten.mm %7231, %7232 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Both operands are multiplied by the integer 1 before the add.
    // NOTE(review): presumably leftover alpha/beta factors from an addmm
    // decomposition - confirm against the exporter.
    %int1_10122 = torch.constant.int 1
    %7234 = torch.aten.mul.Scalar %7233, %int1_10122 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_10123 = torch.constant.int 1
    %7235 = torch.aten.mul.Scalar %7230, %int1_10123 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Add the [3072] bias, broadcast over the 4608 rows.
    %int1_10124 = torch.constant.int 1
    %7236 = torch.aten.add.Tensor %7234, %7235, %int1_10124 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    // Back to f16 (dtype code 5) and restore the leading size-1 dim.
    %int5_10125 = torch.constant.int 5
    %7237 = torch.prims.convert_element_type %7236, %int5_10125 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_10126 = torch.constant.int 1
    %int4608_10127 = torch.constant.int 4608
    %int3072_10128 = torch.constant.int 3072
    %7238 = torch.prim.ListConstruct %int1_10126, %int4608_10127, %int3072_10128 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7239 = torch.aten.view %7237, %7238 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Scale the linear2 output by %7134 ([1,1,3072], broadcast over the 4608 rows)
    // and add the result to %7116 (alpha = 1).
    // NOTE(review): %7134 and %7116 are defined outside this chunk; presumably the
    // block's modulation gate and its input (residual) tensor - confirm there.
    %7240 = torch.aten.mul.Tensor %7134, %7239 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_10129 = torch.constant.int 1
    %7241 = torch.aten.add.Tensor %7116, %7240, %int1_10129 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.5 modulation: project SiLU(%119) through modulation.lin to
    // 9216 features and split them into three 3072-wide chunks.
    // NOTE(review): %119 ([1,3072] f16) is defined outside this chunk; presumably
    // the conditioning vector shared by all blocks - confirm at its definition.
    %7242 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Weight is stored as [9216,3072] and transposed to [3072,9216] for the mm.
    %__auto.sampler.single_blocks.5.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.5.modulation.lin.weight : tensor<9216x3072xf16>
    %7243 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.5.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_10130 = torch.constant.int 0
    %int1_10131 = torch.constant.int 1
    %7244 = torch.aten.transpose.int %7243, %int0_10130, %int1_10131 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.5.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.5.modulation.lin.bias : tensor<9216xf16>
    %7245 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.5.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, input and weight to f32, matmul, add bias, cast back to f16.
    %int6_10132 = torch.constant.int 6
    %7246 = torch.prims.convert_element_type %7245, %int6_10132 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_10133 = torch.constant.int 6
    %7247 = torch.prims.convert_element_type %7242, %int6_10133 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_10134 = torch.constant.int 6
    %7248 = torch.prims.convert_element_type %7244, %int6_10134 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %7249 = torch.aten.mm %7247, %7248 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Multiplications by the integer 1 on both add operands (values unchanged).
    %int1_10135 = torch.constant.int 1
    %7250 = torch.aten.mul.Scalar %7249, %int1_10135 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_10136 = torch.constant.int 1
    %7251 = torch.aten.mul.Scalar %7246, %int1_10136 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_10137 = torch.constant.int 1
    %7252 = torch.aten.add.Tensor %7250, %7251, %int1_10137 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_10138 = torch.constant.int 5
    %7253 = torch.prims.convert_element_type %7252, %int5_10138 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): shape is unchanged.
    %int0_10139 = torch.constant.int 0
    %int0_10140 = torch.constant.int 0
    %int9223372036854775807_10141 = torch.constant.int 9223372036854775807
    %int1_10142 = torch.constant.int 1
    %7254 = torch.aten.slice.Tensor %7253, %int0_10139, %int0_10140, %int9223372036854775807_10141, %int1_10142 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a size-1 dim at position 1 so the chunks broadcast against [1,4608,3072].
    %int1_10143 = torch.constant.int 1
    %7255 = torch.aten.unsqueeze %7254, %int1_10143 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Full-range slice on dim 2: shape is unchanged.
    %int2_10144 = torch.constant.int 2
    %int0_10145 = torch.constant.int 0
    %int9223372036854775807_10146 = torch.constant.int 9223372036854775807
    %int1_10147 = torch.constant.int 1
    %7256 = torch.aten.slice.Tensor %7255, %int2_10144, %int0_10145, %int9223372036854775807_10146, %int1_10147 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Chunk [0, 3072) -> %7257: added after the normalization below (shift term).
    %int-1_10148 = torch.constant.int -1
    %int0_10149 = torch.constant.int 0
    %int3072_10150 = torch.constant.int 3072
    %int1_10151 = torch.constant.int 1
    %7257 = torch.aten.slice.Tensor %7256, %int-1_10148, %int0_10149, %int3072_10150, %int1_10151 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk [3072, 6144) -> %7258: becomes the multiplicative term via %7260.
    %int-1_10152 = torch.constant.int -1
    %int3072_10153 = torch.constant.int 3072
    %int6144_10154 = torch.constant.int 6144
    %int1_10155 = torch.constant.int 1
    %7258 = torch.aten.slice.Tensor %7256, %int-1_10152, %int3072_10153, %int6144_10154, %int1_10155 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk [6144, 9216) -> %7259. NOTE(review): not consumed in the visible code;
    // presumably the gate applied after this block's linear2 - confirm downstream.
    %int-1_10156 = torch.constant.int -1
    %int6144_10157 = torch.constant.int 6144
    %int9216_10158 = torch.constant.int 9216
    %int1_10159 = torch.constant.int 1
    %7259 = torch.aten.slice.Tensor %7256, %int-1_10156, %int6144_10157, %int9216_10158, %int1_10159 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %7260 = %7258 + 1 (alpha = 1).
    %int1_10160 = torch.constant.int 1
    %int1_10161 = torch.constant.int 1
    %7260 = torch.aten.add.Scalar %7258, %int1_10160, %int1_10161 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %7241 over its last dim (no learned affine parameters appear here)
    // and apply the modulation terms:
    //   %7269 = %7260 * f16( (x - mean) * rsqrt(var + eps) ) + %7257
    // Statistics are computed in f32 from the upcast copy %7261.
    %int6_10162 = torch.constant.int 6
    %7261 = torch.prims.convert_element_type %7241, %int6_10162 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    // var_mean over dim 2 with correction = 0 and keepdim = true;
    // result0 is the variance, result1 the mean, both [1,4608,1].
    %int2_10163 = torch.constant.int 2
    %7262 = torch.prim.ListConstruct %int2_10163 : (!torch.int) -> !torch.list<int>
    %int0_10164 = torch.constant.int 0
    %true_10165 = torch.constant.bool true
    %result0_10166, %result1_10167 = torch.aten.var_mean.correction %7261, %7262, %int0_10164, %true_10165 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // rsqrt(var + eps), eps being the double nearest to 1e-6.
    %float9.999990e-07_10168 = torch.constant.float 9.9999999999999995E-7
    %int1_10169 = torch.constant.int 1
    %7263 = torch.aten.add.Scalar %result0_10166, %float9.999990e-07_10168, %int1_10169 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %7264 = torch.aten.rsqrt %7263 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the f16 tensor %7241 (not the f32 copy %7261) and the
    // f32 mean; the result type is f32.
    %int1_10170 = torch.constant.int 1
    %7265 = torch.aten.sub.Tensor %7241, %result1_10167, %int1_10170 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %7266 = torch.aten.mul.Tensor %7265, %7264 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    // Cast the normalized tensor to f16 before modulation.
    %int5_10171 = torch.constant.int 5
    %7267 = torch.prims.convert_element_type %7266, %int5_10171 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Multiply by %7260 and add %7257; both are [1,1,3072] and broadcast over rows.
    %7268 = torch.aten.mul.Tensor %7260, %7267 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_10172 = torch.constant.int 1
    %7269 = torch.aten.add.Tensor %7268, %7257, %int1_10172 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int4608_10173 = torch.constant.int 4608
    %int3072_10174 = torch.constant.int 3072
    %7270 = torch.prim.ListConstruct %int4608_10173, %int3072_10174 : (!torch.int, !torch.int) -> !torch.list<int>
    %7271 = torch.aten.view %7269, %7270 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    %__auto.sampler.single_blocks.5.linear1.weight = util.global.load @__auto.sampler.single_blocks.5.linear1.weight : tensor<21504x3072xf16>
    %7272 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.5.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_10175 = torch.constant.int 0
    %int1_10176 = torch.constant.int 1
    %7273 = torch.aten.transpose.int %7272, %int0_10175, %int1_10176 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.5.linear1.bias = util.global.load @__auto.sampler.single_blocks.5.linear1.bias : tensor<21504xf16>
    %7274 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.5.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    %int6_10177 = torch.constant.int 6
    %7275 = torch.prims.convert_element_type %7274, %int6_10177 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_10178 = torch.constant.int 6
    %7276 = torch.prims.convert_element_type %7271, %int6_10178 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_10179 = torch.constant.int 6
    %7277 = torch.prims.convert_element_type %7273, %int6_10179 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %7278 = torch.aten.mm %7276, %7277 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    %int1_10180 = torch.constant.int 1
    %7279 = torch.aten.mul.Scalar %7278, %int1_10180 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_10181 = torch.constant.int 1
    %7280 = torch.aten.mul.Scalar %7275, %int1_10181 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_10182 = torch.constant.int 1
    %7281 = torch.aten.add.Tensor %7279, %7280, %int1_10182 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_10183 = torch.constant.int 5
    %7282 = torch.prims.convert_element_type %7281, %int5_10183 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_10184 = torch.constant.int 1
    %int4608_10185 = torch.constant.int 4608
    %int21504_10186 = torch.constant.int 21504
    %7283 = torch.prim.ListConstruct %int1_10184, %int4608_10185, %int21504_10186 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7284 = torch.aten.view %7282, %7283 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    %int-1_10187 = torch.constant.int -1
    %int0_10188 = torch.constant.int 0
    %int9216_10189 = torch.constant.int 9216
    %int1_10190 = torch.constant.int 1
    %7285 = torch.aten.slice.Tensor %7284, %int-1_10187, %int0_10188, %int9216_10189, %int1_10190 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_10191 = torch.constant.int -1
    %int9216_10192 = torch.constant.int 9216
    %int21504_10193 = torch.constant.int 21504
    %int1_10194 = torch.constant.int 1
    %7286 = torch.aten.slice.Tensor %7284, %int-1_10191, %int9216_10192, %int21504_10193, %int1_10194 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    %int1_10195 = torch.constant.int 1
    %int4608_10196 = torch.constant.int 4608
    %int3_10197 = torch.constant.int 3
    %int24_10198 = torch.constant.int 24
    %int128_10199 = torch.constant.int 128
    %7287 = torch.prim.ListConstruct %int1_10195, %int4608_10196, %int3_10197, %int24_10198, %int128_10199 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7288 = torch.aten.view %7285, %7287 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    %int2_10200 = torch.constant.int 2
    %int0_10201 = torch.constant.int 0
    %int3_10202 = torch.constant.int 3
    %int1_10203 = torch.constant.int 1
    %int4_10204 = torch.constant.int 4
    %7289 = torch.prim.ListConstruct %int2_10200, %int0_10201, %int3_10202, %int1_10203, %int4_10204 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7290 = torch.aten.permute %7288, %7289 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    %int0_10205 = torch.constant.int 0
    %int0_10206 = torch.constant.int 0
    %7291 = torch.aten.select.int %7290, %int0_10205, %int0_10206 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_10207 = torch.constant.int 0
    %int1_10208 = torch.constant.int 1
    %7292 = torch.aten.select.int %7290, %int0_10207, %int1_10208 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_10209 = torch.constant.int 0
    %int2_10210 = torch.constant.int 2
    %7293 = torch.aten.select.int %7290, %int0_10209, %int2_10210 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int6_10211 = torch.constant.int 6
    %7294 = torch.prims.convert_element_type %7291, %int6_10211 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_10212 = torch.constant.int 2
    %7295 = torch.aten.pow.Tensor_Scalar %7294, %int2_10212 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_10213 = torch.constant.int -1
    %7296 = torch.prim.ListConstruct %int-1_10213 : (!torch.int) -> !torch.list<int>
    %true_10214 = torch.constant.bool true
    %none_10215 = torch.constant.none
    %7297 = torch.aten.mean.dim %7295, %7296, %true_10214, %none_10215 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_10216 = torch.constant.float 9.9999999999999995E-7
    %int1_10217 = torch.constant.int 1
    %7298 = torch.aten.add.Scalar %7297, %float9.999990e-07_10216, %int1_10217 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %7299 = torch.aten.rsqrt %7298 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %7300 = torch.aten.mul.Tensor %7294, %7299 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10218 = torch.constant.int 5
    %7301 = torch.prims.convert_element_type %7300, %int5_10218 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.5.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.5.norm.query_norm.scale : tensor<128xf16>
    %7302 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.5.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %7303 = torch.aten.mul.Tensor %7301, %7302 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.5 key RMS norm: same op sequence as the query norm, applied to %7292
    // and scaled by single_blocks.5.norm.key_norm.scale.
    // NOTE(review): %7292 is defined before this chunk; presumably the key slice of the
    // fused QKV projection - confirm.
    %int6_10219 = torch.constant.int 6
    %7304 = torch.prims.convert_element_type %7292, %int6_10219 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_10220 = torch.constant.int 2
    %7305 = torch.aten.pow.Tensor_Scalar %7304, %int2_10220 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_10221 = torch.constant.int -1
    %7306 = torch.prim.ListConstruct %int-1_10221 : (!torch.int) -> !torch.list<int>
    %true_10222 = torch.constant.bool true
    %none_10223 = torch.constant.none
    // Mean of squares over the last dim, keepdim=true.
    %7307 = torch.aten.mean.dim %7305, %7306, %true_10222, %none_10223 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_10224 = torch.constant.float 9.9999999999999995E-7
    %int1_10225 = torch.constant.int 1
    %7308 = torch.aten.add.Scalar %7307, %float9.999990e-07_10224, %int1_10225 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %7309 = torch.aten.rsqrt %7308 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %7310 = torch.aten.mul.Tensor %7304, %7309 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10226 = torch.constant.int 5
    %7311 = torch.prims.convert_element_type %7310, %int5_10226 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.5.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.5.norm.key_norm.scale : tensor<128xf16>
    %7312 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.5.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %7313 = torch.aten.mul.Tensor %7311, %7312 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.5: prepare normalised q (%7303) and k (%7313) for the pairwise rotation.
    // The first two converts are f16 -> f16 (operand and result types are identical),
    // i.e. they do not change the element type.
    %int5_10227 = torch.constant.int 5
    %7314 = torch.prims.convert_element_type %7303, %int5_10227 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_10228 = torch.constant.int 5
    %7315 = torch.prims.convert_element_type %7313, %int5_10228 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // q: upcast to f32 and view the last dim (128) as 64 x 1 x 2.
    %int6_10229 = torch.constant.int 6
    %7316 = torch.prims.convert_element_type %7314, %int6_10229 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_10230 = torch.constant.int 1
    %int24_10231 = torch.constant.int 24
    %int4608_10232 = torch.constant.int 4608
    %int64_10233 = torch.constant.int 64
    %int1_10234 = torch.constant.int 1
    %int2_10235 = torch.constant.int 2
    %7317 = torch.prim.ListConstruct %int1_10230, %int24_10231, %int4608_10232, %int64_10233, %int1_10234, %int2_10235 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7318 = torch.aten.view %7316, %7317 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // k: same upcast and view.
    %int6_10236 = torch.constant.int 6
    %7319 = torch.prims.convert_element_type %7315, %int6_10236 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_10237 = torch.constant.int 1
    %int24_10238 = torch.constant.int 24
    %int4608_10239 = torch.constant.int 4608
    %int64_10240 = torch.constant.int 64
    %int1_10241 = torch.constant.int 1
    %int2_10242 = torch.constant.int 2
    %7320 = torch.prim.ListConstruct %int1_10237, %int24_10238, %int4608_10239, %int64_10240, %int1_10241, %int2_10242 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7321 = torch.aten.view %7319, %7320 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // single_blocks.5: rotate q. For each pair, computes
    //   out[..., i] = %211[..., i, 0] * q[..., 0] + %211[..., i, 1] * q[..., 1]
    // with %211 broadcast across the 24-sized dim 1.
    // NOTE(review): %211 is defined outside this chunk; presumably the rotary
    // position-embedding table (2x2 matrix per position/frequency) - confirm.
    %int5_10243 = torch.constant.int 5
    %int0_10244 = torch.constant.int 0
    %7322 = torch.aten.select.int %211, %int5_10243, %int0_10244 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10245 = torch.constant.int 5
    %int0_10246 = torch.constant.int 0
    %7323 = torch.aten.select.int %7318, %int5_10245, %int0_10246 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    // First term: index 0 of the last dim of both operands.
    %7324 = torch.aten.mul.Tensor %7322, %7323 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_10247 = torch.constant.int 5
    %int1_10248 = torch.constant.int 1
    %7325 = torch.aten.select.int %211, %int5_10247, %int1_10248 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10249 = torch.constant.int 5
    %int1_10250 = torch.constant.int 1
    %7326 = torch.aten.select.int %7318, %int5_10249, %int1_10250 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    // Second term: index 1 of the last dim of both operands.
    %7327 = torch.aten.mul.Tensor %7325, %7326 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_10251 = torch.constant.int 1
    %7328 = torch.aten.add.Tensor %7324, %7327, %int1_10251 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // single_blocks.5: rotate k with the same %211 table and the same formula used for q:
    //   out[..., i] = %211[..., i, 0] * k[..., 0] + %211[..., i, 1] * k[..., 1]
    // The selects on %211 repeat those already emitted for q (%7322 / %7325).
    %int5_10252 = torch.constant.int 5
    %int0_10253 = torch.constant.int 0
    %7329 = torch.aten.select.int %211, %int5_10252, %int0_10253 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10254 = torch.constant.int 5
    %int0_10255 = torch.constant.int 0
    %7330 = torch.aten.select.int %7321, %int5_10254, %int0_10255 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7331 = torch.aten.mul.Tensor %7329, %7330 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_10256 = torch.constant.int 5
    %int1_10257 = torch.constant.int 1
    %7332 = torch.aten.select.int %211, %int5_10256, %int1_10257 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10258 = torch.constant.int 5
    %int1_10259 = torch.constant.int 1
    %7333 = torch.aten.select.int %7321, %int5_10258, %int1_10259 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7334 = torch.aten.mul.Tensor %7332, %7333 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_10260 = torch.constant.int 1
    %7335 = torch.aten.add.Tensor %7331, %7334, %int1_10260 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // single_blocks.5: collapse the rotated (64, 2) pairs back into a 128-wide last dim
    // and downcast f32 -> f16, first for q (%7328 -> %7338) then for k (%7335 -> %7341).
    %int1_10261 = torch.constant.int 1
    %int24_10262 = torch.constant.int 24
    %int4608_10263 = torch.constant.int 4608
    %int128_10264 = torch.constant.int 128
    %7336 = torch.prim.ListConstruct %int1_10261, %int24_10262, %int4608_10263, %int128_10264 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7337 = torch.aten.view %7328, %7336 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10265 = torch.constant.int 5
    %7338 = torch.prims.convert_element_type %7337, %int5_10265 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_10266 = torch.constant.int 1
    %int24_10267 = torch.constant.int 24
    %int4608_10268 = torch.constant.int 4608
    %int128_10269 = torch.constant.int 128
    %7339 = torch.prim.ListConstruct %int1_10266, %int24_10267, %int4608_10268, %int128_10269 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7340 = torch.aten.view %7335, %7339 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10270 = torch.constant.int 5
    %7341 = torch.prims.convert_element_type %7340, %int5_10270 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.5 attention over rotated q (%7338) and k (%7341) with %7293 as the
    // third tensor operand, followed by merging the 24 x 128 dims into 3072.
    // Scalar operands in order: 0.0, false, none, none.
    // NOTE(review): per the PyTorch op schema these are presumably dropout_p, is_causal,
    // attn_mask and scale, and %7293 (defined before this chunk) the value tensor - confirm.
    %float0.000000e00_10271 = torch.constant.float 0.000000e+00
    %false_10272 = torch.constant.bool false
    %none_10273 = torch.constant.none
    %none_10274 = torch.constant.none
    // Two results; only result #0 ([1,24,4608,128] f16) is consumed below.
    %7342:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%7338, %7341, %7293, %float0.000000e00_10271, %false_10272, %none_10273, %none_10274) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    %int0_10275 = torch.constant.int 0
    %int2_10276 = torch.constant.int 2
    %int1_10277 = torch.constant.int 1
    %int3_10278 = torch.constant.int 3
    %7343 = torch.prim.ListConstruct %int0_10275, %int2_10276, %int1_10277, %int3_10278 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // Swap dims 1 and 2: [1,24,4608,128] -> [1,4608,24,128].
    %7344 = torch.aten.permute %7342#0, %7343 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_10279 = torch.constant.int 1
    %int4608_10280 = torch.constant.int 4608
    %int3072_10281 = torch.constant.int 3072
    %7345 = torch.prim.ListConstruct %int1_10279, %int4608_10280, %int3072_10281 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // Merge the trailing 24 x 128 into 3072.
    %7346 = torch.aten.view %7344, %7345 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.5: apply GELU (approximate mode "tanh") to %7286 and concatenate it
    // after the attention output along dim 2: 3072 + 12288 = 15360 features.
    // NOTE(review): %7286 is defined before this chunk; presumably the MLP slice of the
    // linear1 output - confirm.
    %str_10282 = torch.constant.str "tanh"
    %7347 = torch.aten.gelu %7286, %str_10282 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %7348 = torch.prim.ListConstruct %7346, %7347 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_10283 = torch.constant.int 2
    %7349 = torch.aten.cat %7348, %int2_10283 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // single_blocks.5 linear2: y = x @ W^T + b, mapping 15360 -> 3072 features.
    // Input, weight and bias are upcast to f32, the matmul and bias add run in f32,
    // and the result is downcast to f16 and viewed back as [1,4608,3072].
    %int4608_10284 = torch.constant.int 4608
    %int15360_10285 = torch.constant.int 15360
    %7350 = torch.prim.ListConstruct %int4608_10284, %int15360_10285 : (!torch.int, !torch.int) -> !torch.list<int>
    // Drop the leading size-1 dim so the input is 2-D for mm.
    %7351 = torch.aten.view %7349, %7350 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    %__auto.sampler.single_blocks.5.linear2.weight = util.global.load @__auto.sampler.single_blocks.5.linear2.weight : tensor<3072x15360xf16>
    %7352 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.5.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_10286 = torch.constant.int 0
    %int1_10287 = torch.constant.int 1
    // Weight is stored [out, in]; transpose to [in, out] for mm.
    %7353 = torch.aten.transpose.int %7352, %int0_10286, %int1_10287 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.5.linear2.bias = util.global.load @__auto.sampler.single_blocks.5.linear2.bias : tensor<3072xf16>
    %7354 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.5.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_10288 = torch.constant.int 6
    %7355 = torch.prims.convert_element_type %7354, %int6_10288 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_10289 = torch.constant.int 6
    %7356 = torch.prims.convert_element_type %7351, %int6_10289 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_10290 = torch.constant.int 6
    %7357 = torch.prims.convert_element_type %7353, %int6_10290 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %7358 = torch.aten.mm %7356, %7357 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Both mul.Scalar ops multiply by the integer constant 1.
    // NOTE(review): presumably alpha/beta left over from an addmm decomposition - confirm.
    %int1_10291 = torch.constant.int 1
    %7359 = torch.aten.mul.Scalar %7358, %int1_10291 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_10292 = torch.constant.int 1
    %7360 = torch.aten.mul.Scalar %7355, %int1_10292 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_10293 = torch.constant.int 1
    // Bias add; the [3072] bias broadcasts over the 4608 rows.
    %7361 = torch.aten.add.Tensor %7359, %7360, %int1_10293 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_10294 = torch.constant.int 5
    %7362 = torch.prims.convert_element_type %7361, %int5_10294 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_10295 = torch.constant.int 1
    %int4608_10296 = torch.constant.int 4608
    %int3072_10297 = torch.constant.int 3072
    %7363 = torch.prim.ListConstruct %int1_10295, %int4608_10296, %int3072_10297 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7364 = torch.aten.view %7362, %7363 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.5 output: multiply the linear2 result by %7259 ([1,1,3072], broadcast
    // over the 4608 dim) and add it to %7241. %7366 is the input to the next block's norm.
    // NOTE(review): %7259 and %7241 are defined before this chunk; presumably the
    // modulation gate and the block's residual input respectively - confirm.
    %7365 = torch.aten.mul.Tensor %7259, %7364 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_10298 = torch.constant.int 1
    %7366 = torch.aten.add.Tensor %7241, %7365, %int1_10298 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.6 modulation: SiLU(%119) followed by a linear layer 3072 -> 9216
    // (y = x @ W^T + b), computed in f32 and downcast to f16.
    // NOTE(review): %119 is defined outside this chunk; presumably the conditioning
    // vector shared by all blocks - confirm.
    %7367 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.6.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.6.modulation.lin.weight : tensor<9216x3072xf16>
    %7368 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.6.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_10299 = torch.constant.int 0
    %int1_10300 = torch.constant.int 1
    // Weight is stored [out, in]; transpose to [in, out] for mm.
    %7369 = torch.aten.transpose.int %7368, %int0_10299, %int1_10300 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.6.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.6.modulation.lin.bias : tensor<9216xf16>
    %7370 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.6.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    %int6_10301 = torch.constant.int 6
    %7371 = torch.prims.convert_element_type %7370, %int6_10301 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_10302 = torch.constant.int 6
    %7372 = torch.prims.convert_element_type %7367, %int6_10302 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_10303 = torch.constant.int 6
    %7373 = torch.prims.convert_element_type %7369, %int6_10303 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %7374 = torch.aten.mm %7372, %7373 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Both mul.Scalar ops multiply by the integer constant 1.
    %int1_10304 = torch.constant.int 1
    %7375 = torch.aten.mul.Scalar %7374, %int1_10304 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_10305 = torch.constant.int 1
    %7376 = torch.aten.mul.Scalar %7371, %int1_10305 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_10306 = torch.constant.int 1
    %7377 = torch.aten.add.Tensor %7375, %7376, %int1_10306 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_10307 = torch.constant.int 5
    %7378 = torch.prims.convert_element_type %7377, %int5_10307 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // single_blocks.6: reshape the modulation vector to [1,1,9216] and cut it into three
    // 3072-wide pieces along the last dim:
    //   %7382 = [0:3072]     (added after the norm below)
    //   %7383 = [3072:6144]  (multiplier; 1 is added to it to form %7385)
    //   %7384 = [6144:9216]  (not consumed within this chunk)
    // NOTE(review): presumably shift / scale / gate respectively - confirm.
    %int0_10308 = torch.constant.int 0
    %int0_10309 = torch.constant.int 0
    %int9223372036854775807_10310 = torch.constant.int 9223372036854775807
    %int1_10311 = torch.constant.int 1
    // Full-range slice on dim 0 (end = INT64 max); the result type is unchanged.
    %7379 = torch.aten.slice.Tensor %7378, %int0_10308, %int0_10309, %int9223372036854775807_10310, %int1_10311 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    %int1_10312 = torch.constant.int 1
    %7380 = torch.aten.unsqueeze %7379, %int1_10312 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int2_10313 = torch.constant.int 2
    %int0_10314 = torch.constant.int 0
    %int9223372036854775807_10315 = torch.constant.int 9223372036854775807
    %int1_10316 = torch.constant.int 1
    // Full-range slice on dim 2; the result type is unchanged.
    %7381 = torch.aten.slice.Tensor %7380, %int2_10313, %int0_10314, %int9223372036854775807_10315, %int1_10316 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int-1_10317 = torch.constant.int -1
    %int0_10318 = torch.constant.int 0
    %int3072_10319 = torch.constant.int 3072
    %int1_10320 = torch.constant.int 1
    %7382 = torch.aten.slice.Tensor %7381, %int-1_10317, %int0_10318, %int3072_10319, %int1_10320 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_10321 = torch.constant.int -1
    %int3072_10322 = torch.constant.int 3072
    %int6144_10323 = torch.constant.int 6144
    %int1_10324 = torch.constant.int 1
    %7383 = torch.aten.slice.Tensor %7381, %int-1_10321, %int3072_10322, %int6144_10323, %int1_10324 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_10325 = torch.constant.int -1
    %int6144_10326 = torch.constant.int 6144
    %int9216_10327 = torch.constant.int 9216
    %int1_10328 = torch.constant.int 1
    %7384 = torch.aten.slice.Tensor %7381, %int-1_10325, %int6144_10326, %int9216_10327, %int1_10328 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int1_10329 = torch.constant.int 1
    %int1_10330 = torch.constant.int 1
    // %7385 = %7383 + 1.
    %7385 = torch.aten.add.Scalar %7383, %int1_10329, %int1_10330 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // single_blocks.6: normalise %7366 over dim 2 (3072) with no learned weight or bias,
    // then modulate:  %7394 = %7385 * ((x - mean) * rsqrt(var + ~1e-6)) + %7382.
    // Statistics are computed in f32; var_mean uses correction = 0 and keepdim = true.
    %int6_10331 = torch.constant.int 6
    %7386 = torch.prims.convert_element_type %7366, %int6_10331 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_10332 = torch.constant.int 2
    %7387 = torch.prim.ListConstruct %int2_10332 : (!torch.int) -> !torch.list<int>
    %int0_10333 = torch.constant.int 0
    %true_10334 = torch.constant.bool true
    // result0 is the variance (consumed by the eps add), result1 the mean (subtracted below).
    %result0_10335, %result1_10336 = torch.aten.var_mean.correction %7386, %7387, %int0_10333, %true_10334 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    %float9.999990e-07_10337 = torch.constant.float 9.9999999999999995E-7
    %int1_10338 = torch.constant.int 1
    %7388 = torch.aten.add.Scalar %result0_10335, %float9.999990e-07_10337, %int1_10338 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %7389 = torch.aten.rsqrt %7388 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    %int1_10339 = torch.constant.int 1
    // The subtraction takes the f16 tensor %7366 (not the f32 copy %7386) with the f32
    // mean; the declared result type is f32.
    %7390 = torch.aten.sub.Tensor %7366, %result1_10336, %int1_10339 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %7391 = torch.aten.mul.Tensor %7390, %7389 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_10340 = torch.constant.int 5
    %7392 = torch.prims.convert_element_type %7391, %int5_10340 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Modulation in f16: multiply by %7385, then add %7382 (both broadcast over 4608).
    %7393 = torch.aten.mul.Tensor %7385, %7392 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_10341 = torch.constant.int 1
    %7394 = torch.aten.add.Tensor %7393, %7382, %int1_10341 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.6 linear1: y = x @ W^T + b, mapping 3072 -> 21504 features.
    // Operands are upcast to f32 for the matmul and bias add; the result is downcast
    // to f16 and viewed back as [1,4608,21504].
    %int4608_10342 = torch.constant.int 4608
    %int3072_10343 = torch.constant.int 3072
    %7395 = torch.prim.ListConstruct %int4608_10342, %int3072_10343 : (!torch.int, !torch.int) -> !torch.list<int>
    // Drop the leading size-1 dim so the input is 2-D for mm.
    %7396 = torch.aten.view %7394, %7395 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    %__auto.sampler.single_blocks.6.linear1.weight = util.global.load @__auto.sampler.single_blocks.6.linear1.weight : tensor<21504x3072xf16>
    %7397 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.6.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_10344 = torch.constant.int 0
    %int1_10345 = torch.constant.int 1
    // Weight is stored [out, in]; transpose to [in, out] for mm.
    %7398 = torch.aten.transpose.int %7397, %int0_10344, %int1_10345 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.6.linear1.bias = util.global.load @__auto.sampler.single_blocks.6.linear1.bias : tensor<21504xf16>
    %7399 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.6.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    %int6_10346 = torch.constant.int 6
    %7400 = torch.prims.convert_element_type %7399, %int6_10346 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_10347 = torch.constant.int 6
    %7401 = torch.prims.convert_element_type %7396, %int6_10347 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_10348 = torch.constant.int 6
    %7402 = torch.prims.convert_element_type %7398, %int6_10348 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %7403 = torch.aten.mm %7401, %7402 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Both mul.Scalar ops multiply by the integer constant 1.
    %int1_10349 = torch.constant.int 1
    %7404 = torch.aten.mul.Scalar %7403, %int1_10349 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_10350 = torch.constant.int 1
    %7405 = torch.aten.mul.Scalar %7400, %int1_10350 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_10351 = torch.constant.int 1
    %7406 = torch.aten.add.Tensor %7404, %7405, %int1_10351 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_10352 = torch.constant.int 5
    %7407 = torch.prims.convert_element_type %7406, %int5_10352 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_10353 = torch.constant.int 1
    %int4608_10354 = torch.constant.int 4608
    %int21504_10355 = torch.constant.int 21504
    %7408 = torch.prim.ListConstruct %int1_10353, %int4608_10354, %int21504_10355 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7409 = torch.aten.view %7407, %7408 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // single_blocks.6: split the linear1 output along the last dim into
    //   %7410 = [0:9216]      -> reshaped below into three [1,24,4608,128] tensors
    //   %7411 = [9216:21504]  -> 12288 features, not consumed within this chunk
    // NOTE(review): presumably %7411 is the MLP branch fed to GELU later - confirm.
    %int-1_10356 = torch.constant.int -1
    %int0_10357 = torch.constant.int 0
    %int9216_10358 = torch.constant.int 9216
    %int1_10359 = torch.constant.int 1
    %7410 = torch.aten.slice.Tensor %7409, %int-1_10356, %int0_10357, %int9216_10358, %int1_10359 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_10360 = torch.constant.int -1
    %int9216_10361 = torch.constant.int 9216
    %int21504_10362 = torch.constant.int 21504
    %int1_10363 = torch.constant.int 1
    %7411 = torch.aten.slice.Tensor %7409, %int-1_10360, %int9216_10361, %int21504_10362, %int1_10363 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    %int1_10364 = torch.constant.int 1
    %int4608_10365 = torch.constant.int 4608
    %int3_10366 = torch.constant.int 3
    %int24_10367 = torch.constant.int 24
    %int128_10368 = torch.constant.int 128
    %7412 = torch.prim.ListConstruct %int1_10364, %int4608_10365, %int3_10366, %int24_10367, %int128_10368 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // View 9216 as 3 x 24 x 128.
    %7413 = torch.aten.view %7410, %7412 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    %int2_10369 = torch.constant.int 2
    %int0_10370 = torch.constant.int 0
    %int3_10371 = torch.constant.int 3
    %int1_10372 = torch.constant.int 1
    %int4_10373 = torch.constant.int 4
    %7414 = torch.prim.ListConstruct %int2_10369, %int0_10370, %int3_10371, %int1_10372, %int4_10373 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // Permute (2,0,3,1,4): move the size-3 dim to the front -> [3,1,24,4608,128].
    %7415 = torch.aten.permute %7413, %7414 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0 feeds the query norm and index 1 the key norm below; index 2 (%7418)
    // is not consumed within this chunk (presumably the value tensor - confirm).
    %int0_10374 = torch.constant.int 0
    %int0_10375 = torch.constant.int 0
    %7416 = torch.aten.select.int %7415, %int0_10374, %int0_10375 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_10376 = torch.constant.int 0
    %int1_10377 = torch.constant.int 1
    %7417 = torch.aten.select.int %7415, %int0_10376, %int1_10377 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_10378 = torch.constant.int 0
    %int2_10379 = torch.constant.int 2
    %7418 = torch.aten.select.int %7415, %int0_10378, %int2_10379 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.6 query RMS norm: upcast %7416 to f32, multiply by
    // rsqrt(mean(x^2 over the last dim, keepdim) + ~1e-6), downcast to f16, then multiply
    // by the learned [128] scale from single_blocks.6.norm.query_norm.scale.
    %int6_10380 = torch.constant.int 6
    %7419 = torch.prims.convert_element_type %7416, %int6_10380 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_10381 = torch.constant.int 2
    %7420 = torch.aten.pow.Tensor_Scalar %7419, %int2_10381 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_10382 = torch.constant.int -1
    %7421 = torch.prim.ListConstruct %int-1_10382 : (!torch.int) -> !torch.list<int>
    %true_10383 = torch.constant.bool true
    %none_10384 = torch.constant.none
    // Mean of squares over the last dim, keepdim=true.
    %7422 = torch.aten.mean.dim %7420, %7421, %true_10383, %none_10384 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_10385 = torch.constant.float 9.9999999999999995E-7
    %int1_10386 = torch.constant.int 1
    %7423 = torch.aten.add.Scalar %7422, %float9.999990e-07_10385, %int1_10386 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %7424 = torch.aten.rsqrt %7423 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %7425 = torch.aten.mul.Tensor %7419, %7424 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10387 = torch.constant.int 5
    %7426 = torch.prims.convert_element_type %7425, %int5_10387 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.6.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.6.norm.query_norm.scale : tensor<128xf16>
    %7427 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.6.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %7428 = torch.aten.mul.Tensor %7426, %7427 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.6 key RMS norm: same op sequence as the query norm, applied to %7417
    // and scaled by single_blocks.6.norm.key_norm.scale.
    %int6_10388 = torch.constant.int 6
    %7429 = torch.prims.convert_element_type %7417, %int6_10388 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_10389 = torch.constant.int 2
    %7430 = torch.aten.pow.Tensor_Scalar %7429, %int2_10389 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_10390 = torch.constant.int -1
    %7431 = torch.prim.ListConstruct %int-1_10390 : (!torch.int) -> !torch.list<int>
    %true_10391 = torch.constant.bool true
    %none_10392 = torch.constant.none
    // Mean of squares over the last dim, keepdim=true.
    %7432 = torch.aten.mean.dim %7430, %7431, %true_10391, %none_10392 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_10393 = torch.constant.float 9.9999999999999995E-7
    %int1_10394 = torch.constant.int 1
    %7433 = torch.aten.add.Scalar %7432, %float9.999990e-07_10393, %int1_10394 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %7434 = torch.aten.rsqrt %7433 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %7435 = torch.aten.mul.Tensor %7429, %7434 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10395 = torch.constant.int 5
    %7436 = torch.prims.convert_element_type %7435, %int5_10395 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.6.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.6.norm.key_norm.scale : tensor<128xf16>
    %7437 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.6.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %7438 = torch.aten.mul.Tensor %7436, %7437 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.6: prepare normalised q (%7428) and k (%7438) for the pairwise rotation.
    // The first two converts are f16 -> f16 (operand and result types are identical),
    // i.e. they do not change the element type.
    %int5_10396 = torch.constant.int 5
    %7439 = torch.prims.convert_element_type %7428, %int5_10396 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_10397 = torch.constant.int 5
    %7440 = torch.prims.convert_element_type %7438, %int5_10397 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // q: upcast to f32 and view the last dim (128) as 64 x 1 x 2.
    %int6_10398 = torch.constant.int 6
    %7441 = torch.prims.convert_element_type %7439, %int6_10398 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_10399 = torch.constant.int 1
    %int24_10400 = torch.constant.int 24
    %int4608_10401 = torch.constant.int 4608
    %int64_10402 = torch.constant.int 64
    %int1_10403 = torch.constant.int 1
    %int2_10404 = torch.constant.int 2
    %7442 = torch.prim.ListConstruct %int1_10399, %int24_10400, %int4608_10401, %int64_10402, %int1_10403, %int2_10404 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7443 = torch.aten.view %7441, %7442 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // k: same upcast and view.
    %int6_10405 = torch.constant.int 6
    %7444 = torch.prims.convert_element_type %7440, %int6_10405 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_10406 = torch.constant.int 1
    %int24_10407 = torch.constant.int 24
    %int4608_10408 = torch.constant.int 4608
    %int64_10409 = torch.constant.int 64
    %int1_10410 = torch.constant.int 1
    %int2_10411 = torch.constant.int 2
    %7445 = torch.prim.ListConstruct %int1_10406, %int24_10407, %int4608_10408, %int64_10409, %int1_10410, %int2_10411 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7446 = torch.aten.view %7444, %7445 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // single_blocks.6: rotate q. For each pair, computes
    //   out[..., i] = %211[..., i, 0] * q[..., 0] + %211[..., i, 1] * q[..., 1]
    // with %211 broadcast across the 24-sized dim 1.
    // NOTE(review): %211 is defined outside this chunk; presumably the rotary
    // position-embedding table (2x2 matrix per position/frequency) - confirm.
    %int5_10412 = torch.constant.int 5
    %int0_10413 = torch.constant.int 0
    %7447 = torch.aten.select.int %211, %int5_10412, %int0_10413 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10414 = torch.constant.int 5
    %int0_10415 = torch.constant.int 0
    %7448 = torch.aten.select.int %7443, %int5_10414, %int0_10415 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    // First term: index 0 of the last dim of both operands.
    %7449 = torch.aten.mul.Tensor %7447, %7448 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_10416 = torch.constant.int 5
    %int1_10417 = torch.constant.int 1
    %7450 = torch.aten.select.int %211, %int5_10416, %int1_10417 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10418 = torch.constant.int 5
    %int1_10419 = torch.constant.int 1
    %7451 = torch.aten.select.int %7443, %int5_10418, %int1_10419 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    // Second term: index 1 of the last dim of both operands.
    %7452 = torch.aten.mul.Tensor %7450, %7451 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_10420 = torch.constant.int 1
    %7453 = torch.aten.add.Tensor %7449, %7452, %int1_10420 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_10421 = torch.constant.int 5
    %int0_10422 = torch.constant.int 0
    %7454 = torch.aten.select.int %211, %int5_10421, %int0_10422 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10423 = torch.constant.int 5
    %int0_10424 = torch.constant.int 0
    %7455 = torch.aten.select.int %7446, %int5_10423, %int0_10424 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7456 = torch.aten.mul.Tensor %7454, %7455 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_10425 = torch.constant.int 5
    %int1_10426 = torch.constant.int 1
    %7457 = torch.aten.select.int %211, %int5_10425, %int1_10426 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10427 = torch.constant.int 5
    %int1_10428 = torch.constant.int 1
    %7458 = torch.aten.select.int %7446, %int5_10427, %int1_10428 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7459 = torch.aten.mul.Tensor %7457, %7458 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_10429 = torch.constant.int 1
    %7460 = torch.aten.add.Tensor %7456, %7459, %int1_10429 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Collapse the trailing [64,2] axes of %7453 and %7460 back into a single
    // axis of 128 and cast the f32 results to f16 (dtype code 5 yields f16 here).
    // Outputs: %7463 (from %7453) and %7466 (from %7460), both [1,24,4608,128] f16.
    %int1_10430 = torch.constant.int 1
    %int24_10431 = torch.constant.int 24
    %int4608_10432 = torch.constant.int 4608
    %int128_10433 = torch.constant.int 128
    %7461 = torch.prim.ListConstruct %int1_10430, %int24_10431, %int4608_10432, %int128_10433 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7462 = torch.aten.view %7453, %7461 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10434 = torch.constant.int 5
    %7463 = torch.prims.convert_element_type %7462, %int5_10434 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_10435 = torch.constant.int 1
    %int24_10436 = torch.constant.int 24
    %int4608_10437 = torch.constant.int 4608
    %int128_10438 = torch.constant.int 128
    %7464 = torch.prim.ListConstruct %int1_10435, %int24_10436, %int4608_10437, %int128_10438 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7465 = torch.aten.view %7460, %7464 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10439 = torch.constant.int 5
    %7466 = torch.prims.convert_element_type %7465, %int5_10439 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled-dot-product attention over %7463, %7466 and %7418 (the latter is
    // defined above this span), all [1,24,4608,128] f16. The scalar operands are
    // 0.0, false, none, none.
    // NOTE(review): per the PyTorch op signature these are presumably
    // dropout_p, is_causal, attn_mask and scale -- confirm against the op schema.
    // Only result #0 is consumed in this span; result #1 ([1,24,4608] f32) is unused here.
    %float0.000000e00_10440 = torch.constant.float 0.000000e+00
    %false_10441 = torch.constant.bool false
    %none_10442 = torch.constant.none
    %none_10443 = torch.constant.none
    %7467:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%7463, %7466, %7418, %float0.000000e00_10440, %false_10441, %none_10442, %none_10443) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap axes 1 and 2 ([1,24,4608,128] -> [1,4608,24,128]) and merge the
    // last two axes (24 * 128 = 3072) to get [1,4608,3072].
    %int0_10444 = torch.constant.int 0
    %int2_10445 = torch.constant.int 2
    %int1_10446 = torch.constant.int 1
    %int3_10447 = torch.constant.int 3
    %7468 = torch.prim.ListConstruct %int0_10444, %int2_10445, %int1_10446, %int3_10447 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7469 = torch.aten.permute %7467#0, %7468 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_10448 = torch.constant.int 1
    %int4608_10449 = torch.constant.int 4608
    %int3072_10450 = torch.constant.int 3072
    %7470 = torch.prim.ListConstruct %int1_10448, %int4608_10449, %int3072_10450 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7471 = torch.aten.view %7469, %7470 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Apply GELU with the "tanh" approximation to %7411 ([1,4608,12288], defined
    // above this span), concatenate it after the attention output %7471 along
    // dim 2 (3072 + 12288 = 15360), and drop the leading unit axis so the result
    // is a 2-D [4608,15360] matrix.
    %str_10451 = torch.constant.str "tanh"
    %7472 = torch.aten.gelu %7411, %str_10451 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %7473 = torch.prim.ListConstruct %7471, %7472 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_10452 = torch.constant.int 2
    %7474 = torch.aten.cat %7473, %int2_10452 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    %int4608_10453 = torch.constant.int 4608
    %int15360_10454 = torch.constant.int 15360
    %7475 = torch.prim.ListConstruct %int4608_10453, %int15360_10454 : (!torch.int, !torch.int) -> !torch.list<int>
    %7476 = torch.aten.view %7474, %7475 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // single_blocks.6.linear2: y = %7476 @ W^T + b, mapping 15360 -> 3072 features.
    // The f16 weight [3072,15360] is transposed to [15360,3072]; input, weight and
    // bias are upcast to f32, the matmul and bias add run in f32, and the result
    // is cast back to f16 and viewed as [1,4608,3072].
    %__auto.sampler.single_blocks.6.linear2.weight = util.global.load @__auto.sampler.single_blocks.6.linear2.weight : tensor<3072x15360xf16>
    %7477 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.6.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_10455 = torch.constant.int 0
    %int1_10456 = torch.constant.int 1
    %7478 = torch.aten.transpose.int %7477, %int0_10455, %int1_10456 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.6.linear2.bias = util.global.load @__auto.sampler.single_blocks.6.linear2.bias : tensor<3072xf16>
    %7479 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.6.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_10457 = torch.constant.int 6
    %7480 = torch.prims.convert_element_type %7479, %int6_10457 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_10458 = torch.constant.int 6
    %7481 = torch.prims.convert_element_type %7476, %int6_10458 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_10459 = torch.constant.int 6
    %7482 = torch.prims.convert_element_type %7478, %int6_10459 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %7483 = torch.aten.mm %7481, %7482 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Both the product and the bias are multiplied by the integer constant 1.
    // NOTE(review): presumably the alpha/beta factors of a decomposed addmm -- confirm.
    %int1_10460 = torch.constant.int 1
    %7484 = torch.aten.mul.Scalar %7483, %int1_10460 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_10461 = torch.constant.int 1
    %7485 = torch.aten.mul.Scalar %7480, %int1_10461 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Bias [3072] broadcasts over the 4608 rows.
    %int1_10462 = torch.constant.int 1
    %7486 = torch.aten.add.Tensor %7484, %7485, %int1_10462 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_10463 = torch.constant.int 5
    %7487 = torch.prims.convert_element_type %7486, %int5_10463 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_10464 = torch.constant.int 1
    %int4608_10465 = torch.constant.int 4608
    %int3072_10466 = torch.constant.int 3072
    %7488 = torch.prim.ListConstruct %int1_10464, %int4608_10465, %int3072_10466 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7489 = torch.aten.view %7487, %7488 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Scale %7489 element-wise by %7384 ([1,1,3072], broadcast over the 4608
    // axis) and add the product to %7366; both %7384 and %7366 are defined above
    // this span.
    // NOTE(review): presumably %7384 is the modulation gate and %7366 the
    // residual input of single_blocks.6 -- confirm upstream.
    %7490 = torch.aten.mul.Tensor %7384, %7489 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_10467 = torch.constant.int 1
    %7491 = torch.aten.add.Tensor %7366, %7490, %int1_10467 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.7.modulation.lin: apply SiLU to %119 ([1,3072], defined
    // outside this span) and project 3072 -> 9216 features: y = silu(%119) @ W^T + b.
    // Operands are upcast to f32 for the matmul and bias add; the [1,9216]
    // result is cast back to f16.
    %7492 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.7.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.7.modulation.lin.weight : tensor<9216x3072xf16>
    %7493 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.7.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_10468 = torch.constant.int 0
    %int1_10469 = torch.constant.int 1
    %7494 = torch.aten.transpose.int %7493, %int0_10468, %int1_10469 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.7.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.7.modulation.lin.bias : tensor<9216xf16>
    %7495 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.7.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    %int6_10470 = torch.constant.int 6
    %7496 = torch.prims.convert_element_type %7495, %int6_10470 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_10471 = torch.constant.int 6
    %7497 = torch.prims.convert_element_type %7492, %int6_10471 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_10472 = torch.constant.int 6
    %7498 = torch.prims.convert_element_type %7494, %int6_10472 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %7499 = torch.aten.mm %7497, %7498 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Multiplications by the integer constant 1 leave the values unchanged.
    // NOTE(review): presumably the alpha/beta factors of a decomposed addmm -- confirm.
    %int1_10473 = torch.constant.int 1
    %7500 = torch.aten.mul.Scalar %7499, %int1_10473 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_10474 = torch.constant.int 1
    %7501 = torch.aten.mul.Scalar %7496, %int1_10474 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_10475 = torch.constant.int 1
    %7502 = torch.aten.add.Tensor %7500, %7501, %int1_10475 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_10476 = torch.constant.int 5
    %7503 = torch.prims.convert_element_type %7502, %int5_10476 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Reshape the [1,9216] modulation vector %7503 to [1,1,9216] and split its
    // last axis into three chunks of 3072:
    //   %7507 = [0, 3072), %7508 = [3072, 6144), %7509 = [6144, 9216).
    // The two slices with end 9223372036854775807 (int64 max) and step 1 keep the
    // whole axis, as their unchanged result types show.
    // %7510 = %7508 + 1.
    // NOTE(review): presumably %7507/%7508/%7509 are shift/scale/gate -- within
    // this span %7510 is used as a multiplier and %7507 as an addend; %7509 is
    // not consumed in the visible lines.
    %int0_10477 = torch.constant.int 0
    %int0_10478 = torch.constant.int 0
    %int9223372036854775807_10479 = torch.constant.int 9223372036854775807
    %int1_10480 = torch.constant.int 1
    %7504 = torch.aten.slice.Tensor %7503, %int0_10477, %int0_10478, %int9223372036854775807_10479, %int1_10480 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    %int1_10481 = torch.constant.int 1
    %7505 = torch.aten.unsqueeze %7504, %int1_10481 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int2_10482 = torch.constant.int 2
    %int0_10483 = torch.constant.int 0
    %int9223372036854775807_10484 = torch.constant.int 9223372036854775807
    %int1_10485 = torch.constant.int 1
    %7506 = torch.aten.slice.Tensor %7505, %int2_10482, %int0_10483, %int9223372036854775807_10484, %int1_10485 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Chunk 0: elements [0, 3072) of the last axis (dim -1).
    %int-1_10486 = torch.constant.int -1
    %int0_10487 = torch.constant.int 0
    %int3072_10488 = torch.constant.int 3072
    %int1_10489 = torch.constant.int 1
    %7507 = torch.aten.slice.Tensor %7506, %int-1_10486, %int0_10487, %int3072_10488, %int1_10489 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1: elements [3072, 6144).
    %int-1_10490 = torch.constant.int -1
    %int3072_10491 = torch.constant.int 3072
    %int6144_10492 = torch.constant.int 6144
    %int1_10493 = torch.constant.int 1
    %7508 = torch.aten.slice.Tensor %7506, %int-1_10490, %int3072_10491, %int6144_10492, %int1_10493 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2: elements [6144, 9216).
    %int-1_10494 = torch.constant.int -1
    %int6144_10495 = torch.constant.int 6144
    %int9216_10496 = torch.constant.int 9216
    %int1_10497 = torch.constant.int 1
    %7509 = torch.aten.slice.Tensor %7506, %int-1_10494, %int6144_10495, %int9216_10496, %int1_10497 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Add the scalar 1 to chunk 1 (the trailing int operand, alpha, is also 1).
    %int1_10498 = torch.constant.int 1
    %int1_10499 = torch.constant.int 1
    %7510 = torch.aten.add.Scalar %7508, %int1_10498, %int1_10499 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %7491 over its last axis (dim 2, size 3072) and modulate it:
    //   var, mean = var_mean(f32(%7491), dim=[2], correction=0, keepdim=true)
    //   %7517     = f16((%7491 - mean) * rsqrt(var + 9.9999999999999995E-7))
    //   %7519     = %7510 * %7517 + %7507
    // No learned weight or bias is applied to the normalized value in this span;
    // %7510 and %7507 ([1,1,3072]) broadcast over the 4608 axis.
    %int6_10500 = torch.constant.int 6
    %7511 = torch.prims.convert_element_type %7491, %int6_10500 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_10501 = torch.constant.int 2
    %7512 = torch.prim.ListConstruct %int2_10501 : (!torch.int) -> !torch.list<int>
    %int0_10502 = torch.constant.int 0
    %true_10503 = torch.constant.bool true
    // %result0_10504 is used below as the variance (eps added, then rsqrt) and
    // %result1_10505 as the mean (subtracted from the input).
    %result0_10504, %result1_10505 = torch.aten.var_mean.correction %7511, %7512, %int0_10502, %true_10503 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    %float9.999990e-07_10506 = torch.constant.float 9.9999999999999995E-7
    %int1_10507 = torch.constant.int 1
    %7513 = torch.aten.add.Scalar %result0_10504, %float9.999990e-07_10506, %int1_10507 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %7514 = torch.aten.rsqrt %7513 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the f16 tensor %7491 (not its f32 copy %7511) with
    // the f32 mean; the result type is f32.
    %int1_10508 = torch.constant.int 1
    %7515 = torch.aten.sub.Tensor %7491, %result1_10505, %int1_10508 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %7516 = torch.aten.mul.Tensor %7515, %7514 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_10509 = torch.constant.int 5
    %7517 = torch.prims.convert_element_type %7516, %int5_10509 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Modulation in f16: multiply by %7510, then add %7507.
    %7518 = torch.aten.mul.Tensor %7510, %7517 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_10510 = torch.constant.int 1
    %7519 = torch.aten.add.Tensor %7518, %7507, %int1_10510 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.7.linear1: flatten %7519 to [4608,3072] and compute
    // y = x @ W^T + b, mapping 3072 -> 21504 features. The f16 weight
    // [21504,3072] is transposed; input, weight and bias are upcast to f32 for
    // the matmul and bias add; the result is cast back to f16 and viewed as
    // [1,4608,21504].
    %int4608_10511 = torch.constant.int 4608
    %int3072_10512 = torch.constant.int 3072
    %7520 = torch.prim.ListConstruct %int4608_10511, %int3072_10512 : (!torch.int, !torch.int) -> !torch.list<int>
    %7521 = torch.aten.view %7519, %7520 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    %__auto.sampler.single_blocks.7.linear1.weight = util.global.load @__auto.sampler.single_blocks.7.linear1.weight : tensor<21504x3072xf16>
    %7522 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.7.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_10513 = torch.constant.int 0
    %int1_10514 = torch.constant.int 1
    %7523 = torch.aten.transpose.int %7522, %int0_10513, %int1_10514 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.7.linear1.bias = util.global.load @__auto.sampler.single_blocks.7.linear1.bias : tensor<21504xf16>
    %7524 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.7.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    %int6_10515 = torch.constant.int 6
    %7525 = torch.prims.convert_element_type %7524, %int6_10515 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_10516 = torch.constant.int 6
    %7526 = torch.prims.convert_element_type %7521, %int6_10516 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_10517 = torch.constant.int 6
    %7527 = torch.prims.convert_element_type %7523, %int6_10517 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %7528 = torch.aten.mm %7526, %7527 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Multiplications by the integer constant 1 leave the values unchanged.
    // NOTE(review): presumably the alpha/beta factors of a decomposed addmm -- confirm.
    %int1_10518 = torch.constant.int 1
    %7529 = torch.aten.mul.Scalar %7528, %int1_10518 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_10519 = torch.constant.int 1
    %7530 = torch.aten.mul.Scalar %7525, %int1_10519 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_10520 = torch.constant.int 1
    %7531 = torch.aten.add.Tensor %7529, %7530, %int1_10520 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_10521 = torch.constant.int 5
    %7532 = torch.prims.convert_element_type %7531, %int5_10521 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_10522 = torch.constant.int 1
    %int4608_10523 = torch.constant.int 4608
    %int21504_10524 = torch.constant.int 21504
    %7533 = torch.prim.ListConstruct %int1_10522, %int4608_10523, %int21504_10524 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7534 = torch.aten.view %7532, %7533 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the linear1 output %7534 along its last axis into
    //   %7535 = [0, 9216)      -> [1,4608,9216]
    //   %7536 = [9216, 21504)  -> [1,4608,12288]
    // then view %7535 as [1,4608,3,24,128], permute it to [3,1,24,4608,128] and
    // take indices 0, 1, 2 of the leading axis as %7541, %7542, %7543.
    // NOTE(review): presumably these are query, key and value for 24 heads of
    // size 128, and %7536 is the MLP branch -- later visible uses are consistent
    // with that (norm weights named query_norm/key_norm, GELU on %7536).
    %int-1_10525 = torch.constant.int -1
    %int0_10526 = torch.constant.int 0
    %int9216_10527 = torch.constant.int 9216
    %int1_10528 = torch.constant.int 1
    %7535 = torch.aten.slice.Tensor %7534, %int-1_10525, %int0_10526, %int9216_10527, %int1_10528 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_10529 = torch.constant.int -1
    %int9216_10530 = torch.constant.int 9216
    %int21504_10531 = torch.constant.int 21504
    %int1_10532 = torch.constant.int 1
    %7536 = torch.aten.slice.Tensor %7534, %int-1_10529, %int9216_10530, %int21504_10531, %int1_10532 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // 9216 = 3 * 24 * 128.
    %int1_10533 = torch.constant.int 1
    %int4608_10534 = torch.constant.int 4608
    %int3_10535 = torch.constant.int 3
    %int24_10536 = torch.constant.int 24
    %int128_10537 = torch.constant.int 128
    %7537 = torch.prim.ListConstruct %int1_10533, %int4608_10534, %int3_10535, %int24_10536, %int128_10537 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7538 = torch.aten.view %7535, %7537 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permutation [2,0,3,1,4] moves the size-3 axis to the front and places the
    // size-24 axis before the size-4608 axis.
    %int2_10538 = torch.constant.int 2
    %int0_10539 = torch.constant.int 0
    %int3_10540 = torch.constant.int 3
    %int1_10541 = torch.constant.int 1
    %int4_10542 = torch.constant.int 4
    %7539 = torch.prim.ListConstruct %int2_10538, %int0_10539, %int3_10540, %int1_10541, %int4_10542 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7540 = torch.aten.permute %7538, %7539 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    %int0_10543 = torch.constant.int 0
    %int0_10544 = torch.constant.int 0
    %7541 = torch.aten.select.int %7540, %int0_10543, %int0_10544 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_10545 = torch.constant.int 0
    %int1_10546 = torch.constant.int 1
    %7542 = torch.aten.select.int %7540, %int0_10545, %int1_10546 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_10547 = torch.constant.int 0
    %int2_10548 = torch.constant.int 2
    %7543 = torch.aten.select.int %7540, %int0_10547, %int2_10548 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Root-mean-square normalization of %7541 over its last axis (size 128),
    // computed in f32:
    //   %7550 = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + 9.9999999999999995E-7)
    // The result is cast to f16 and multiplied element-wise by the learned
    // [128] vector single_blocks.7.norm.query_norm.scale.
    %int6_10549 = torch.constant.int 6
    %7544 = torch.prims.convert_element_type %7541, %int6_10549 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_10550 = torch.constant.int 2
    %7545 = torch.aten.pow.Tensor_Scalar %7544, %int2_10550 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_10551 = torch.constant.int -1
    %7546 = torch.prim.ListConstruct %int-1_10551 : (!torch.int) -> !torch.list<int>
    %true_10552 = torch.constant.bool true
    %none_10553 = torch.constant.none
    %7547 = torch.aten.mean.dim %7545, %7546, %true_10552, %none_10553 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_10554 = torch.constant.float 9.9999999999999995E-7
    %int1_10555 = torch.constant.int 1
    %7548 = torch.aten.add.Scalar %7547, %float9.999990e-07_10554, %int1_10555 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %7549 = torch.aten.rsqrt %7548 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %7550 = torch.aten.mul.Tensor %7544, %7549 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10556 = torch.constant.int 5
    %7551 = torch.prims.convert_element_type %7550, %int5_10556 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.7.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.7.norm.query_norm.scale : tensor<128xf16>
    %7552 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.7.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %7553 = torch.aten.mul.Tensor %7551, %7552 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Root-mean-square normalization of %7542 over its last axis (size 128),
    // computed in f32:
    //   %7560 = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + 9.9999999999999995E-7)
    // The result is cast to f16 and multiplied element-wise by the learned
    // [128] vector single_blocks.7.norm.key_norm.scale.
    %int6_10557 = torch.constant.int 6
    %7554 = torch.prims.convert_element_type %7542, %int6_10557 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_10558 = torch.constant.int 2
    %7555 = torch.aten.pow.Tensor_Scalar %7554, %int2_10558 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_10559 = torch.constant.int -1
    %7556 = torch.prim.ListConstruct %int-1_10559 : (!torch.int) -> !torch.list<int>
    %true_10560 = torch.constant.bool true
    %none_10561 = torch.constant.none
    %7557 = torch.aten.mean.dim %7555, %7556, %true_10560, %none_10561 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_10562 = torch.constant.float 9.9999999999999995E-7
    %int1_10563 = torch.constant.int 1
    %7558 = torch.aten.add.Scalar %7557, %float9.999990e-07_10562, %int1_10563 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %7559 = torch.aten.rsqrt %7558 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %7560 = torch.aten.mul.Tensor %7554, %7559 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10564 = torch.constant.int 5
    %7561 = torch.prims.convert_element_type %7560, %int5_10564 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.7.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.7.norm.key_norm.scale : tensor<128xf16>
    %7562 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.7.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %7563 = torch.aten.mul.Tensor %7561, %7562 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Prepare %7553 and %7563 for the pairwise combination with %211:
    //  1. f16 -> f16 conversions (%7564, %7565); source and result types are
    //     identical, so these do not change the element type.
    //  2. Upcast each to f32 and view [1,24,4608,128] as [1,24,4608,64,1,2]
    //     (last axis of 128 becomes 64 pairs with a unit axis before the pair axis).
    %int5_10565 = torch.constant.int 5
    %7564 = torch.prims.convert_element_type %7553, %int5_10565 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_10566 = torch.constant.int 5
    %7565 = torch.prims.convert_element_type %7563, %int5_10566 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int6_10567 = torch.constant.int 6
    %7566 = torch.prims.convert_element_type %7564, %int6_10567 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_10568 = torch.constant.int 1
    %int24_10569 = torch.constant.int 24
    %int4608_10570 = torch.constant.int 4608
    %int64_10571 = torch.constant.int 64
    %int1_10572 = torch.constant.int 1
    %int2_10573 = torch.constant.int 2
    %7567 = torch.prim.ListConstruct %int1_10568, %int24_10569, %int4608_10570, %int64_10571, %int1_10572, %int2_10573 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7568 = torch.aten.view %7566, %7567 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    %int6_10574 = torch.constant.int 6
    %7569 = torch.prims.convert_element_type %7565, %int6_10574 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_10575 = torch.constant.int 1
    %int24_10576 = torch.constant.int 24
    %int4608_10577 = torch.constant.int 4608
    %int64_10578 = torch.constant.int 64
    %int1_10579 = torch.constant.int 1
    %int2_10580 = torch.constant.int 2
    %7570 = torch.prim.ListConstruct %int1_10575, %int24_10576, %int4608_10577, %int64_10578, %int1_10579, %int2_10580 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7571 = torch.aten.view %7569, %7570 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Combine the pair-view tensors with %211 (shape [1,1,4608,64,2,2], defined
    // outside this span). For x in {%7568, %7571}:
    //   out = %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]
    // where [..., i] is a select on dim 5. The size-1 axes broadcast, giving
    // [1,24,4608,64,2]. Results: %7578 (from %7568) and %7585 (from %7571).
    // NOTE(review): this has the form of a rotary position embedding
    // application -- confirm that %211 is the rotary table.

    // --- %7568 path ---
    %int5_10581 = torch.constant.int 5
    %int0_10582 = torch.constant.int 0
    %7572 = torch.aten.select.int %211, %int5_10581, %int0_10582 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10583 = torch.constant.int 5
    %int0_10584 = torch.constant.int 0
    %7573 = torch.aten.select.int %7568, %int5_10583, %int0_10584 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7574 = torch.aten.mul.Tensor %7572, %7573 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_10585 = torch.constant.int 5
    %int1_10586 = torch.constant.int 1
    %7575 = torch.aten.select.int %211, %int5_10585, %int1_10586 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10587 = torch.constant.int 5
    %int1_10588 = torch.constant.int 1
    %7576 = torch.aten.select.int %7568, %int5_10587, %int1_10588 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7577 = torch.aten.mul.Tensor %7575, %7576 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two products; the trailing int operand (alpha) is 1.
    %int1_10589 = torch.constant.int 1
    %7578 = torch.aten.add.Tensor %7574, %7577, %int1_10589 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // --- %7571 path (same computation; %211 is re-selected rather than reused) ---
    %int5_10590 = torch.constant.int 5
    %int0_10591 = torch.constant.int 0
    %7579 = torch.aten.select.int %211, %int5_10590, %int0_10591 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10592 = torch.constant.int 5
    %int0_10593 = torch.constant.int 0
    %7580 = torch.aten.select.int %7571, %int5_10592, %int0_10593 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7581 = torch.aten.mul.Tensor %7579, %7580 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_10594 = torch.constant.int 5
    %int1_10595 = torch.constant.int 1
    %7582 = torch.aten.select.int %211, %int5_10594, %int1_10595 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10596 = torch.constant.int 5
    %int1_10597 = torch.constant.int 1
    %7583 = torch.aten.select.int %7571, %int5_10596, %int1_10597 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7584 = torch.aten.mul.Tensor %7582, %7583 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_10598 = torch.constant.int 1
    %7585 = torch.aten.add.Tensor %7581, %7584, %int1_10598 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Collapse the trailing [64,2] axes of %7578 and %7585 back into a single
    // axis of 128 and cast the f32 results to f16 (dtype code 5 yields f16 here).
    // Outputs: %7588 (from %7578) and %7591 (from %7585), both [1,24,4608,128] f16.
    %int1_10599 = torch.constant.int 1
    %int24_10600 = torch.constant.int 24
    %int4608_10601 = torch.constant.int 4608
    %int128_10602 = torch.constant.int 128
    %7586 = torch.prim.ListConstruct %int1_10599, %int24_10600, %int4608_10601, %int128_10602 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7587 = torch.aten.view %7578, %7586 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10603 = torch.constant.int 5
    %7588 = torch.prims.convert_element_type %7587, %int5_10603 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_10604 = torch.constant.int 1
    %int24_10605 = torch.constant.int 24
    %int4608_10606 = torch.constant.int 4608
    %int128_10607 = torch.constant.int 128
    %7589 = torch.prim.ListConstruct %int1_10604, %int24_10605, %int4608_10606, %int128_10607 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7590 = torch.aten.view %7585, %7589 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10608 = torch.constant.int 5
    %7591 = torch.prims.convert_element_type %7590, %int5_10608 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled-dot-product attention over %7588, %7591 and %7543, all
    // [1,24,4608,128] f16. The scalar operands are 0.0, false, none, none.
    // NOTE(review): per the PyTorch op signature these are presumably
    // dropout_p, is_causal, attn_mask and scale -- confirm against the op schema.
    // Only result #0 is consumed in this span; result #1 ([1,24,4608] f32) is unused here.
    %float0.000000e00_10609 = torch.constant.float 0.000000e+00
    %false_10610 = torch.constant.bool false
    %none_10611 = torch.constant.none
    %none_10612 = torch.constant.none
    %7592:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%7588, %7591, %7543, %float0.000000e00_10609, %false_10610, %none_10611, %none_10612) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap axes 1 and 2 ([1,24,4608,128] -> [1,4608,24,128]) and merge the
    // last two axes (24 * 128 = 3072) to get [1,4608,3072].
    %int0_10613 = torch.constant.int 0
    %int2_10614 = torch.constant.int 2
    %int1_10615 = torch.constant.int 1
    %int3_10616 = torch.constant.int 3
    %7593 = torch.prim.ListConstruct %int0_10613, %int2_10614, %int1_10615, %int3_10616 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7594 = torch.aten.permute %7592#0, %7593 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_10617 = torch.constant.int 1
    %int4608_10618 = torch.constant.int 4608
    %int3072_10619 = torch.constant.int 3072
    %7595 = torch.prim.ListConstruct %int1_10617, %int4608_10618, %int3072_10619 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7596 = torch.aten.view %7594, %7595 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // GELU with the "tanh" approximation string is applied to %7536 ([1,4608,12288]); the
    // result is concatenated after %7596 along dim 2:
    // [1,4608,3072] ++ [1,4608,12288] -> [1,4608,15360].
    %str_10620 = torch.constant.str "tanh"
    %7597 = torch.aten.gelu %7536, %str_10620 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,15360],f16>
    // single_blocks.8.modulation: silu(%119) followed by the `lin` projection
    // [1,3072] x [3072,9216] -> [1,9216].
    %7617 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.8.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.8.modulation.lin.weight : tensor<9216x3072xf16>
    %7618 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.8.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_10637 = torch.constant.int 0
    %int1_10638 = torch.constant.int 1
    %7619 = torch.aten.transpose.int %7618, %int0_10637, %int1_10638 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.8.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.8.modulation.lin.bias : tensor<9216xf16>
    %7620 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.8.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Bias, activation and weight are cast to f32 (dtype code 6) for the matmul.
    %int6_10639 = torch.constant.int 6
    %7621 = torch.prims.convert_element_type %7620, %int6_10639 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_10640 = torch.constant.int 6
    %7622 = torch.prims.convert_element_type %7617, %int6_10640 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_10641 = torch.constant.int 6
    %7623 = torch.prims.convert_element_type %7619, %int6_10641 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %7624 = torch.aten.mm %7622, %7623 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Both mul.Scalar ops multiply by the integer constant 1.
    // NOTE(review): presumably left over from an addmm alpha/beta decomposition — confirm.
    %int1_10642 = torch.constant.int 1
    %7625 = torch.aten.mul.Scalar %7624, %int1_10642 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_10643 = torch.constant.int 1
    %7626 = torch.aten.mul.Scalar %7621, %int1_10643 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_10644 = torch.constant.int 1
    %7627 = torch.aten.add.Tensor %7625, %7626, %int1_10644 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    // Cast back to f16 (dtype code 5).
    %int5_10645 = torch.constant.int 5
    %7628 = torch.prims.convert_element_type %7627, %int5_10645 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Slice over dim 0 with start 0, end 9223372036854775807 (INT64_MAX), step 1; the
    // result type is the same [1,9216] shape.
    %int0_10646 = torch.constant.int 0
    %int0_10647 = torch.constant.int 0
    %int9223372036854775807_10648 = torch.constant.int 9223372036854775807
    %int1_10649 = torch.constant.int 1
    %7629 = torch.aten.slice.Tensor %7628, %int0_10646, %int0_10647, %int9223372036854775807_10648, %int1_10649 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a size-1 dim at position 1: [1,9216] -> [1,1,9216].
    %int1_10650 = torch.constant.int 1
    %7630 = torch.aten.unsqueeze %7629, %int1_10650 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Another full-range slice, this time over dim 2; shape stays [1,1,9216].
    %int2_10651 = torch.constant.int 2
    %int0_10652 = torch.constant.int 0
    %int9223372036854775807_10653 = torch.constant.int 9223372036854775807
    %int1_10654 = torch.constant.int 1
    %7631 = torch.aten.slice.Tensor %7630, %int2_10651, %int0_10652, %int9223372036854775807_10653, %int1_10654 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Split the last dim into three 3072-wide chunks:
    //   %7632 = [0, 3072), %7633 = [3072, 6144), %7634 = [6144, 9216).
    // NOTE(review): presumably the shift / scale / gate vectors of the modulation — the
    // names are inferred from how the chunks are used, not stated in the IR.
    %int-1_10655 = torch.constant.int -1
    %int0_10656 = torch.constant.int 0
    %int3072_10657 = torch.constant.int 3072
    %int1_10658 = torch.constant.int 1
    %7632 = torch.aten.slice.Tensor %7631, %int-1_10655, %int0_10656, %int3072_10657, %int1_10658 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_10659 = torch.constant.int -1
    %int3072_10660 = torch.constant.int 3072
    %int6144_10661 = torch.constant.int 6144
    %int1_10662 = torch.constant.int 1
    %7633 = torch.aten.slice.Tensor %7631, %int-1_10659, %int3072_10660, %int6144_10661, %int1_10662 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_10663 = torch.constant.int -1
    %int6144_10664 = torch.constant.int 6144
    %int9216_10665 = torch.constant.int 9216
    %int1_10666 = torch.constant.int 1
    %7634 = torch.aten.slice.Tensor %7631, %int-1_10663, %int6144_10664, %int9216_10665, %int1_10666 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %7635 = %7633 + 1 (scalar 1 with alpha 1).
    %int1_10667 = torch.constant.int 1
    %int1_10668 = torch.constant.int 1
    %7635 = torch.aten.add.Scalar %7633, %int1_10667, %int1_10668 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalizes %7616 over dim 2: variance and mean are computed on an f32 copy
    // (correction 0, keepdim true), then (x - mean) * rsqrt(var + eps) with eps ~1e-6.
    // No learned weight or bias is applied in these ops.
    %int6_10669 = torch.constant.int 6
    %7636 = torch.prims.convert_element_type %7616, %int6_10669 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_10670 = torch.constant.int 2
    %7637 = torch.prim.ListConstruct %int2_10670 : (!torch.int) -> !torch.list<int>
    %int0_10671 = torch.constant.int 0
    %true_10672 = torch.constant.bool true
    // %result0_10673 is the variance, %result1_10674 the mean; both are [1,4608,1] f32.
    %result0_10673, %result1_10674 = torch.aten.var_mean.correction %7636, %7637, %int0_10671, %true_10672 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    %float9.999990e-07_10675 = torch.constant.float 9.9999999999999995E-7
    %int1_10676 = torch.constant.int 1
    %7638 = torch.aten.add.Scalar %result0_10673, %float9.999990e-07_10675, %int1_10676 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %7639 = torch.aten.rsqrt %7638 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the f16 value %7616 (not the f32 copy %7636) and the f32
    // mean; the declared result type is f32.
    %int1_10677 = torch.constant.int 1
    %7640 = torch.aten.sub.Tensor %7616, %result1_10674, %int1_10677 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %7641 = torch.aten.mul.Tensor %7640, %7639 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    // Cast back to f16 (dtype code 5).
    %int5_10678 = torch.constant.int 5
    %7642 = torch.prims.convert_element_type %7641, %int5_10678 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Affine modulation: %7644 = %7635 * normalized + %7632, with the [1,1,3072]
    // operands broadcast against [1,4608,3072].
    %7643 = torch.aten.mul.Tensor %7635, %7642 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_10679 = torch.constant.int 1
    %7644 = torch.aten.add.Tensor %7643, %7632, %int1_10679 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.8.linear1: %7644 is flattened to [4608,3072] and multiplied by the
    // transposed weight [3072,21504] in f32, the bias is added, and the result is cast
    // back to f16 and reshaped to [1,4608,21504].
    %int4608_10680 = torch.constant.int 4608
    %int3072_10681 = torch.constant.int 3072
    %7645 = torch.prim.ListConstruct %int4608_10680, %int3072_10681 : (!torch.int, !torch.int) -> !torch.list<int>
    %7646 = torch.aten.view %7644, %7645 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    %__auto.sampler.single_blocks.8.linear1.weight = util.global.load @__auto.sampler.single_blocks.8.linear1.weight : tensor<21504x3072xf16>
    %7647 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.8.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_10682 = torch.constant.int 0
    %int1_10683 = torch.constant.int 1
    %7648 = torch.aten.transpose.int %7647, %int0_10682, %int1_10683 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.8.linear1.bias = util.global.load @__auto.sampler.single_blocks.8.linear1.bias : tensor<21504xf16>
    %7649 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.8.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Bias, activation and weight are cast to f32 (dtype code 6) for the matmul.
    %int6_10684 = torch.constant.int 6
    %7650 = torch.prims.convert_element_type %7649, %int6_10684 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_10685 = torch.constant.int 6
    %7651 = torch.prims.convert_element_type %7646, %int6_10685 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_10686 = torch.constant.int 6
    %7652 = torch.prims.convert_element_type %7648, %int6_10686 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %7653 = torch.aten.mm %7651, %7652 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Both mul.Scalar ops multiply by the integer constant 1.
    %int1_10687 = torch.constant.int 1
    %7654 = torch.aten.mul.Scalar %7653, %int1_10687 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_10688 = torch.constant.int 1
    %7655 = torch.aten.mul.Scalar %7650, %int1_10688 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_10689 = torch.constant.int 1
    %7656 = torch.aten.add.Tensor %7654, %7655, %int1_10689 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    // Cast back to f16 (dtype code 5) and restore the leading size-1 dim.
    %int5_10690 = torch.constant.int 5
    %7657 = torch.prims.convert_element_type %7656, %int5_10690 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_10691 = torch.constant.int 1
    %int4608_10692 = torch.constant.int 4608
    %int21504_10693 = torch.constant.int 21504
    %7658 = torch.prim.ListConstruct %int1_10691, %int4608_10692, %int21504_10693 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7659 = torch.aten.view %7657, %7658 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the last dim: %7660 = [0, 9216) (later reshaped into 3 x 24 x 128) and
    // %7661 = [9216, 21504), 12288 wide (later passed through GELU).
    %int-1_10694 = torch.constant.int -1
    %int0_10695 = torch.constant.int 0
    %int9216_10696 = torch.constant.int 9216
    %int1_10697 = torch.constant.int 1
    %7660 = torch.aten.slice.Tensor %7659, %int-1_10694, %int0_10695, %int9216_10696, %int1_10697 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_10698 = torch.constant.int -1
    %int9216_10699 = torch.constant.int 9216
    %int21504_10700 = torch.constant.int 21504
    %int1_10701 = torch.constant.int 1
    %7661 = torch.aten.slice.Tensor %7659, %int-1_10698, %int9216_10699, %int21504_10700, %int1_10701 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // Reshape %7660 from [1,4608,9216] to [1,4608,3,24,128], then permute with order
    // (2,0,3,1,4) to [3,1,24,4608,128].
    %int1_10702 = torch.constant.int 1
    %int4608_10703 = torch.constant.int 4608
    %int3_10704 = torch.constant.int 3
    %int24_10705 = torch.constant.int 24
    %int128_10706 = torch.constant.int 128
    %7662 = torch.prim.ListConstruct %int1_10702, %int4608_10703, %int3_10704, %int24_10705, %int128_10706 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7663 = torch.aten.view %7660, %7662 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    %int2_10707 = torch.constant.int 2
    %int0_10708 = torch.constant.int 0
    %int3_10709 = torch.constant.int 3
    %int1_10710 = torch.constant.int 1
    %int4_10711 = torch.constant.int 4
    %7664 = torch.prim.ListConstruct %int2_10707, %int0_10708, %int3_10709, %int1_10710, %int4_10711 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7665 = torch.aten.permute %7663, %7664 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Select indices 0, 1 and 2 along dim 0, each giving [1,24,4608,128].
    // %7666 later goes through the query_norm scale, %7667 through the key_norm scale,
    // and %7668 is passed unchanged as the third operand of the attention operator.
    %int0_10712 = torch.constant.int 0
    %int0_10713 = torch.constant.int 0
    %7666 = torch.aten.select.int %7665, %int0_10712, %int0_10713 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_10714 = torch.constant.int 0
    %int1_10715 = torch.constant.int 1
    %7667 = torch.aten.select.int %7665, %int0_10714, %int1_10715 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_10716 = torch.constant.int 0
    %int2_10717 = torch.constant.int 2
    %7668 = torch.aten.select.int %7665, %int0_10716, %int2_10717 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.8.norm.query_norm applied to %7666: in f32,
    // x * rsqrt(mean(x^2 over dim -1, keepdim) + eps) with eps ~1e-6, then cast to f16
    // and multiplied by the [128] scale parameter.
    %int6_10718 = torch.constant.int 6
    %7669 = torch.prims.convert_element_type %7666, %int6_10718 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_10719 = torch.constant.int 2
    %7670 = torch.aten.pow.Tensor_Scalar %7669, %int2_10719 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_10720 = torch.constant.int -1
    %7671 = torch.prim.ListConstruct %int-1_10720 : (!torch.int) -> !torch.list<int>
    %true_10721 = torch.constant.bool true
    %none_10722 = torch.constant.none
    %7672 = torch.aten.mean.dim %7670, %7671, %true_10721, %none_10722 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_10723 = torch.constant.float 9.9999999999999995E-7
    %int1_10724 = torch.constant.int 1
    %7673 = torch.aten.add.Scalar %7672, %float9.999990e-07_10723, %int1_10724 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %7674 = torch.aten.rsqrt %7673 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %7675 = torch.aten.mul.Tensor %7669, %7674 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10725 = torch.constant.int 5
    %7676 = torch.prims.convert_element_type %7675, %int5_10725 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.8.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.8.norm.query_norm.scale : tensor<128xf16>
    %7677 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.8.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %7678 = torch.aten.mul.Tensor %7676, %7677 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.8.norm.key_norm applied to %7667: the same sequence of ops, using
    // the key_norm scale parameter.
    %int6_10726 = torch.constant.int 6
    %7679 = torch.prims.convert_element_type %7667, %int6_10726 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_10727 = torch.constant.int 2
    %7680 = torch.aten.pow.Tensor_Scalar %7679, %int2_10727 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_10728 = torch.constant.int -1
    %7681 = torch.prim.ListConstruct %int-1_10728 : (!torch.int) -> !torch.list<int>
    %true_10729 = torch.constant.bool true
    %none_10730 = torch.constant.none
    %7682 = torch.aten.mean.dim %7680, %7681, %true_10729, %none_10730 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_10731 = torch.constant.float 9.9999999999999995E-7
    %int1_10732 = torch.constant.int 1
    %7683 = torch.aten.add.Scalar %7682, %float9.999990e-07_10731, %int1_10732 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %7684 = torch.aten.rsqrt %7683 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %7685 = torch.aten.mul.Tensor %7679, %7684 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10733 = torch.constant.int 5
    %7686 = torch.prims.convert_element_type %7685, %int5_10733 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.8.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.8.norm.key_norm.scale : tensor<128xf16>
    %7687 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.8.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %7688 = torch.aten.mul.Tensor %7686, %7687 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Combines %7678 and %7688 pairwise with the [1,1,4608,64,2,2] f32 tensor %211.
    // NOTE(review): presumably a rotary position embedding with %211 holding precomputed
    // rotation coefficients — %211 is defined outside this section; confirm there.
    //
    // These two conversions use dtype code 5 on tensors that are already f16; operand
    // and result types are identical.
    %int5_10734 = torch.constant.int 5
    %7689 = torch.prims.convert_element_type %7678, %int5_10734 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_10735 = torch.constant.int 5
    %7690 = torch.prims.convert_element_type %7688, %int5_10735 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Cast each to f32 and view the last dim (128) as 64 x 1 x 2.
    %int6_10736 = torch.constant.int 6
    %7691 = torch.prims.convert_element_type %7689, %int6_10736 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_10737 = torch.constant.int 1
    %int24_10738 = torch.constant.int 24
    %int4608_10739 = torch.constant.int 4608
    %int64_10740 = torch.constant.int 64
    %int1_10741 = torch.constant.int 1
    %int2_10742 = torch.constant.int 2
    %7692 = torch.prim.ListConstruct %int1_10737, %int24_10738, %int4608_10739, %int64_10740, %int1_10741, %int2_10742 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7693 = torch.aten.view %7691, %7692 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    %int6_10743 = torch.constant.int 6
    %7694 = torch.prims.convert_element_type %7690, %int6_10743 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_10744 = torch.constant.int 1
    %int24_10745 = torch.constant.int 24
    %int4608_10746 = torch.constant.int 4608
    %int64_10747 = torch.constant.int 64
    %int1_10748 = torch.constant.int 1
    %int2_10749 = torch.constant.int 2
    %7695 = torch.prim.ListConstruct %int1_10744, %int24_10745, %int4608_10746, %int64_10747, %int1_10748, %int2_10749 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7696 = torch.aten.view %7694, %7695 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // First tensor (%7693): %7703 = %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1],
    // where the index is taken along dim 5; the result is [1,24,4608,64,2].
    %int5_10750 = torch.constant.int 5
    %int0_10751 = torch.constant.int 0
    %7697 = torch.aten.select.int %211, %int5_10750, %int0_10751 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10752 = torch.constant.int 5
    %int0_10753 = torch.constant.int 0
    %7698 = torch.aten.select.int %7693, %int5_10752, %int0_10753 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7699 = torch.aten.mul.Tensor %7697, %7698 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_10754 = torch.constant.int 5
    %int1_10755 = torch.constant.int 1
    %7700 = torch.aten.select.int %211, %int5_10754, %int1_10755 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10756 = torch.constant.int 5
    %int1_10757 = torch.constant.int 1
    %7701 = torch.aten.select.int %7693, %int5_10756, %int1_10757 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7702 = torch.aten.mul.Tensor %7700, %7701 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_10758 = torch.constant.int 1
    %7703 = torch.aten.add.Tensor %7699, %7702, %int1_10758 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second tensor (%7696): the same combination, producing %7710.
    %int5_10759 = torch.constant.int 5
    %int0_10760 = torch.constant.int 0
    %7704 = torch.aten.select.int %211, %int5_10759, %int0_10760 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10761 = torch.constant.int 5
    %int0_10762 = torch.constant.int 0
    %7705 = torch.aten.select.int %7696, %int5_10761, %int0_10762 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7706 = torch.aten.mul.Tensor %7704, %7705 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_10763 = torch.constant.int 5
    %int1_10764 = torch.constant.int 1
    %7707 = torch.aten.select.int %211, %int5_10763, %int1_10764 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10765 = torch.constant.int 5
    %int1_10766 = torch.constant.int 1
    %7708 = torch.aten.select.int %7696, %int5_10765, %int1_10766 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7709 = torch.aten.mul.Tensor %7707, %7708 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_10767 = torch.constant.int 1
    %7710 = torch.aten.add.Tensor %7706, %7709, %int1_10767 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Both results are viewed back to [1,24,4608,128] and cast to f16 (dtype code 5).
    %int1_10768 = torch.constant.int 1
    %int24_10769 = torch.constant.int 24
    %int4608_10770 = torch.constant.int 4608
    %int128_10771 = torch.constant.int 128
    %7711 = torch.prim.ListConstruct %int1_10768, %int24_10769, %int4608_10770, %int128_10771 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7712 = torch.aten.view %7703, %7711 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10772 = torch.constant.int 5
    %7713 = torch.prims.convert_element_type %7712, %int5_10772 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_10773 = torch.constant.int 1
    %int24_10774 = torch.constant.int 24
    %int4608_10775 = torch.constant.int 4608
    %int128_10776 = torch.constant.int 128
    %7714 = torch.prim.ListConstruct %int1_10773, %int24_10774, %int4608_10775, %int128_10776 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7715 = torch.aten.view %7710, %7714 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10777 = torch.constant.int 5
    %7716 = torch.prims.convert_element_type %7715, %int5_10777 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Attention step: calls the _scaled_dot_product_flash_attention_for_cpu operator on
    // %7713, %7716 and %7668 (all [1,24,4608,128] f16) with constant operands
    // 0.0, false, none, none.
    // NOTE(review): operand order is presumably (query, key, value, dropout_p, is_causal,
    // attn_mask, scale) — confirm against the ATen schema.
    %float0.000000e00_10778 = torch.constant.float 0.000000e+00
    %false_10779 = torch.constant.bool false
    %none_10780 = torch.constant.none
    %none_10781 = torch.constant.none
    %7717:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%7713, %7716, %7668, %float0.000000e00_10778, %false_10779, %none_10780, %none_10781) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Result #0 is permuted with order (0,2,1,3): [1,24,4608,128] -> [1,4608,24,128].
    %int0_10782 = torch.constant.int 0
    %int2_10783 = torch.constant.int 2
    %int1_10784 = torch.constant.int 1
    %int3_10785 = torch.constant.int 3
    %7718 = torch.prim.ListConstruct %int0_10782, %int2_10783, %int1_10784, %int3_10785 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7719 = torch.aten.permute %7717#0, %7718 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // The last two dims (24 x 128) are merged into one of size 3072: -> [1,4608,3072].
    %int1_10786 = torch.constant.int 1
    %int4608_10787 = torch.constant.int 4608
    %int3072_10788 = torch.constant.int 3072
    %7720 = torch.prim.ListConstruct %int1_10786, %int4608_10787, %int3072_10788 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7721 = torch.aten.view %7719, %7720 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // GELU with the "tanh" approximation string is applied to %7661 ([1,4608,12288]); the
    // result is concatenated after the attention output %7721 along dim 2:
    // [1,4608,3072] ++ [1,4608,12288] -> [1,4608,15360].
    %str_10789 = torch.constant.str "tanh"
    %7722 = torch.aten.gelu %7661, %str_10789 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %7723 = torch.prim.ListConstruct %7721, %7722 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_10790 = torch.constant.int 2
    %7724 = torch.aten.cat %7723, %int2_10790 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // single_blocks.8.linear2: flatten to [4608,15360], multiply by the transposed weight
    // [15360,3072] in f32, add the bias, then cast back to f16.
    %int4608_10791 = torch.constant.int 4608
    %int15360_10792 = torch.constant.int 15360
    %7725 = torch.prim.ListConstruct %int4608_10791, %int15360_10792 : (!torch.int, !torch.int) -> !torch.list<int>
    %7726 = torch.aten.view %7724, %7725 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    %__auto.sampler.single_blocks.8.linear2.weight = util.global.load @__auto.sampler.single_blocks.8.linear2.weight : tensor<3072x15360xf16>
    %7727 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.8.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_10793 = torch.constant.int 0
    %int1_10794 = torch.constant.int 1
    %7728 = torch.aten.transpose.int %7727, %int0_10793, %int1_10794 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.8.linear2.bias = util.global.load @__auto.sampler.single_blocks.8.linear2.bias : tensor<3072xf16>
    %7729 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.8.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Bias, activation and weight are cast to f32 (dtype code 6) for the matmul.
    %int6_10795 = torch.constant.int 6
    %7730 = torch.prims.convert_element_type %7729, %int6_10795 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_10796 = torch.constant.int 6
    %7731 = torch.prims.convert_element_type %7726, %int6_10796 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_10797 = torch.constant.int 6
    %7732 = torch.prims.convert_element_type %7728, %int6_10797 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %7733 = torch.aten.mm %7731, %7732 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Both mul.Scalar ops multiply by the integer constant 1.
    // NOTE(review): presumably left over from an addmm alpha/beta decomposition — confirm.
    %int1_10798 = torch.constant.int 1
    %7734 = torch.aten.mul.Scalar %7733, %int1_10798 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_10799 = torch.constant.int 1
    %7735 = torch.aten.mul.Scalar %7730, %int1_10799 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_10800 = torch.constant.int 1
    %7736 = torch.aten.add.Tensor %7734, %7735, %int1_10800 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    // Cast back to f16 (dtype code 5) and restore the leading size-1 dim.
    %int5_10801 = torch.constant.int 5
    %7737 = torch.prims.convert_element_type %7736, %int5_10801 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_10802 = torch.constant.int 1
    %int4608_10803 = torch.constant.int 4608
    %int3072_10804 = torch.constant.int 3072
    %7738 = torch.prim.ListConstruct %int1_10802, %int4608_10803, %int3072_10804 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7739 = torch.aten.view %7737, %7738 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // %7741 = %7616 + %7634 * linear2_output, where %7634 is the [6144, 9216) chunk of
    // the single_blocks.8 modulation output and %7616 is the tensor this block normalized.
    // NOTE(review): presumably a gated residual connection — confirm.
    %7740 = torch.aten.mul.Tensor %7634, %7739 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_10805 = torch.constant.int 1
    %7741 = torch.aten.add.Tensor %7616, %7740, %int1_10805 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.9 modulation: apply SiLU to %119 ([1,3072] f16, defined earlier in the
    // function), project it with modulation.lin to 9216 features, and split the result
    // into three [1,1,3072] slices.
    %7742 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the [9216,3072] weight and transpose it to [3072,9216] for the matmul.
    %__auto.sampler.single_blocks.9.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.9.modulation.lin.weight : tensor<9216x3072xf16>
    %7743 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.9.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_10806 = torch.constant.int 0
    %int1_10807 = torch.constant.int 1
    %7744 = torch.aten.transpose.int %7743, %int0_10806, %int1_10807 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.9.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.9.modulation.lin.bias : tensor<9216xf16>
    %7745 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.9.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Promote bias, input and weight to f32 (dtype code 6) before the matmul.
    %int6_10808 = torch.constant.int 6
    %7746 = torch.prims.convert_element_type %7745, %int6_10808 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_10809 = torch.constant.int 6
    %7747 = torch.prims.convert_element_type %7742, %int6_10809 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_10810 = torch.constant.int 6
    %7748 = torch.prims.convert_element_type %7744, %int6_10810 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %7749 = torch.aten.mm %7747, %7748 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Product and bias are each multiplied by the integer constant 1 (values unchanged),
    // summed, and the sum is cast back to f16 (dtype code 5).
    %int1_10811 = torch.constant.int 1
    %7750 = torch.aten.mul.Scalar %7749, %int1_10811 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_10812 = torch.constant.int 1
    %7751 = torch.aten.mul.Scalar %7746, %int1_10812 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_10813 = torch.constant.int 1
    %7752 = torch.aten.add.Tensor %7750, %7751, %int1_10813 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_10814 = torch.constant.int 5
    %7753 = torch.prims.convert_element_type %7752, %int5_10814 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Slice dim 0 over [0, 2^63-1) with step 1: the end bound exceeds the dimension, so
    // every element is kept and the shape stays [1,9216].
    %int0_10815 = torch.constant.int 0
    %int0_10816 = torch.constant.int 0
    %int9223372036854775807_10817 = torch.constant.int 9223372036854775807
    %int1_10818 = torch.constant.int 1
    %7754 = torch.aten.slice.Tensor %7753, %int0_10815, %int0_10816, %int9223372036854775807_10817, %int1_10818 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a size-1 dim at position 1 -> [1,1,9216]; the following full-range slice on
    // dim 2 again leaves the shape unchanged.
    %int1_10819 = torch.constant.int 1
    %7755 = torch.aten.unsqueeze %7754, %int1_10819 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int2_10820 = torch.constant.int 2
    %int0_10821 = torch.constant.int 0
    %int9223372036854775807_10822 = torch.constant.int 9223372036854775807
    %int1_10823 = torch.constant.int 1
    %7756 = torch.aten.slice.Tensor %7755, %int2_10820, %int0_10821, %int9223372036854775807_10822, %int1_10823 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // %7757 = last-dim elements [0, 3072); added to the normalized activations below.
    %int-1_10824 = torch.constant.int -1
    %int0_10825 = torch.constant.int 0
    %int3072_10826 = torch.constant.int 3072
    %int1_10827 = torch.constant.int 1
    %7757 = torch.aten.slice.Tensor %7756, %int-1_10824, %int0_10825, %int3072_10826, %int1_10827 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %7758 = last-dim elements [3072, 6144); becomes the multiplicative factor %7760.
    %int-1_10828 = torch.constant.int -1
    %int3072_10829 = torch.constant.int 3072
    %int6144_10830 = torch.constant.int 6144
    %int1_10831 = torch.constant.int 1
    %7758 = torch.aten.slice.Tensor %7756, %int-1_10828, %int3072_10829, %int6144_10830, %int1_10831 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %7759 = last-dim elements [6144, 9216); multiplies the linear2 output of this block
    // before the residual add.
    %int-1_10832 = torch.constant.int -1
    %int6144_10833 = torch.constant.int 6144
    %int9216_10834 = torch.constant.int 9216
    %int1_10835 = torch.constant.int 1
    %7759 = torch.aten.slice.Tensor %7756, %int-1_10832, %int6144_10833, %int9216_10834, %int1_10835 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %7760 = %7758 + 1 (scalar 1 with multiplier 1).
    %int1_10836 = torch.constant.int 1
    %int1_10837 = torch.constant.int 1
    %7760 = torch.aten.add.Scalar %7758, %int1_10836, %int1_10837 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %7741 over its last dimension (3072) without learned parameters, then
    // apply the modulation factors %7760 (multiply) and %7757 (add).
    // Statistics are computed on an f32 copy of the f16 input.
    %int6_10838 = torch.constant.int 6
    %7761 = torch.prims.convert_element_type %7741, %int6_10838 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    // var_mean over dim 2 with correction 0 and keepdim = true; both results are
    // [1,4608,1]. The first result is the variance, the second the mean.
    %int2_10839 = torch.constant.int 2
    %7762 = torch.prim.ListConstruct %int2_10839 : (!torch.int) -> !torch.list<int>
    %int0_10840 = torch.constant.int 0
    %true_10841 = torch.constant.bool true
    %result0_10842, %result1_10843 = torch.aten.var_mean.correction %7761, %7762, %int0_10840, %true_10841 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // rsqrt(variance + eps), with eps ~= 1e-6.
    %float9.999990e-07_10844 = torch.constant.float 9.9999999999999995E-7
    %int1_10845 = torch.constant.int 1
    %7763 = torch.aten.add.Scalar %result0_10842, %float9.999990e-07_10844, %int1_10845 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %7764 = torch.aten.rsqrt %7763 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // Subtract the mean. The minuend is the original f16 tensor %7741 (not the f32 copy
    // %7761); the declared result type is f32.
    %int1_10846 = torch.constant.int 1
    %7765 = torch.aten.sub.Tensor %7741, %result1_10843, %int1_10846 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %7766 = torch.aten.mul.Tensor %7765, %7764 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    // Back to f16 (dtype code 5), then modulate: %7760 * normalized + %7757, with the
    // [1,1,3072] factors broadcasting over the 4608 positions.
    %int5_10847 = torch.constant.int 5
    %7767 = torch.prims.convert_element_type %7766, %int5_10847 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %7768 = torch.aten.mul.Tensor %7760, %7767 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_10848 = torch.constant.int 1
    %7769 = torch.aten.add.Tensor %7768, %7757, %int1_10848 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.9 linear1: project the modulated activations from 3072 to 21504
    // features. The matmul and bias add run in f32; the result is cast back to f16.
    // Drop the leading batch dim so the input is a 2-D [4608,3072] matrix.
    %int4608_10849 = torch.constant.int 4608
    %int3072_10850 = torch.constant.int 3072
    %7770 = torch.prim.ListConstruct %int4608_10849, %int3072_10850 : (!torch.int, !torch.int) -> !torch.list<int>
    %7771 = torch.aten.view %7769, %7770 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // Load the [21504,3072] weight and transpose it to [3072,21504].
    %__auto.sampler.single_blocks.9.linear1.weight = util.global.load @__auto.sampler.single_blocks.9.linear1.weight : tensor<21504x3072xf16>
    %7772 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.9.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_10851 = torch.constant.int 0
    %int1_10852 = torch.constant.int 1
    %7773 = torch.aten.transpose.int %7772, %int0_10851, %int1_10852 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.9.linear1.bias = util.global.load @__auto.sampler.single_blocks.9.linear1.bias : tensor<21504xf16>
    %7774 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.9.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Promote bias, input and weight to f32 (dtype code 6).
    %int6_10853 = torch.constant.int 6
    %7775 = torch.prims.convert_element_type %7774, %int6_10853 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_10854 = torch.constant.int 6
    %7776 = torch.prims.convert_element_type %7771, %int6_10854 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_10855 = torch.constant.int 6
    %7777 = torch.prims.convert_element_type %7773, %int6_10855 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    // [4608,3072] x [3072,21504] -> [4608,21504].
    %7778 = torch.aten.mm %7776, %7777 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Product and bias are each multiplied by the integer constant 1 (values unchanged)
    // and summed; the bias broadcasts over rows.
    %int1_10856 = torch.constant.int 1
    %7779 = torch.aten.mul.Scalar %7778, %int1_10856 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_10857 = torch.constant.int 1
    %7780 = torch.aten.mul.Scalar %7775, %int1_10857 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_10858 = torch.constant.int 1
    %7781 = torch.aten.add.Tensor %7779, %7780, %int1_10858 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    // Cast to f16 (dtype code 5) and restore the batch dim -> [1,4608,21504].
    %int5_10859 = torch.constant.int 5
    %7782 = torch.prims.convert_element_type %7781, %int5_10859 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_10860 = torch.constant.int 1
    %int4608_10861 = torch.constant.int 4608
    %int21504_10862 = torch.constant.int 21504
    %7783 = torch.prim.ListConstruct %int1_10860, %int4608_10861, %int21504_10862 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7784 = torch.aten.view %7782, %7783 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the linear1 output along its last dim into a 9216-wide slice (%7785), which
    // is reshaped into three per-head tensors, and a 12288-wide slice (%7786), which is
    // later fed to GELU.
    // %7785 = last-dim elements [0, 9216).
    %int-1_10863 = torch.constant.int -1
    %int0_10864 = torch.constant.int 0
    %int9216_10865 = torch.constant.int 9216
    %int1_10866 = torch.constant.int 1
    %7785 = torch.aten.slice.Tensor %7784, %int-1_10863, %int0_10864, %int9216_10865, %int1_10866 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    // %7786 = last-dim elements [9216, 21504).
    %int-1_10867 = torch.constant.int -1
    %int9216_10868 = torch.constant.int 9216
    %int21504_10869 = torch.constant.int 21504
    %int1_10870 = torch.constant.int 1
    %7786 = torch.aten.slice.Tensor %7784, %int-1_10867, %int9216_10868, %int21504_10869, %int1_10870 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View 9216 as 3 x 24 x 128 -> [1,4608,3,24,128].
    %int1_10871 = torch.constant.int 1
    %int4608_10872 = torch.constant.int 4608
    %int3_10873 = torch.constant.int 3
    %int24_10874 = torch.constant.int 24
    %int128_10875 = torch.constant.int 128
    %7787 = torch.prim.ListConstruct %int1_10871, %int4608_10872, %int3_10873, %int24_10874, %int128_10875 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7788 = torch.aten.view %7785, %7787 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2,0,3,1,4) -> [3,1,24,4608,128], moving the size-3 axis first.
    %int2_10876 = torch.constant.int 2
    %int0_10877 = torch.constant.int 0
    %int3_10878 = torch.constant.int 3
    %int1_10879 = torch.constant.int 1
    %int4_10880 = torch.constant.int 4
    %7789 = torch.prim.ListConstruct %int2_10876, %int0_10877, %int3_10878, %int1_10879, %int4_10880 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7790 = torch.aten.permute %7788, %7789 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0 along dim 0 -> %7791; later scaled by norm.query_norm.scale.
    %int0_10881 = torch.constant.int 0
    %int0_10882 = torch.constant.int 0
    %7791 = torch.aten.select.int %7790, %int0_10881, %int0_10882 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 1 along dim 0 -> %7792; later scaled by norm.key_norm.scale.
    %int0_10883 = torch.constant.int 0
    %int1_10884 = torch.constant.int 1
    %7792 = torch.aten.select.int %7790, %int0_10883, %int1_10884 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 2 along dim 0 -> %7793; passed unmodified as the third attention operand.
    %int0_10885 = torch.constant.int 0
    %int2_10886 = torch.constant.int 2
    %7793 = torch.aten.select.int %7790, %int0_10885, %int2_10886 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Query/key normalization. Each of %7791 and %7792 is computed as
    //   x * rsqrt(mean(x^2 over last dim) + eps) * scale
    // (a root-mean-square normalization over the 128-wide last dim), with the statistics
    // in f32 and the learned [128] scale applied in f16.
    // --- query path (%7791 -> %7803) ---
    %int6_10887 = torch.constant.int 6
    %7794 = torch.prims.convert_element_type %7791, %int6_10887 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_10888 = torch.constant.int 2
    %7795 = torch.aten.pow.Tensor_Scalar %7794, %int2_10888 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean of squares over dim -1 with keepdim = true -> [1,24,4608,1].
    %int-1_10889 = torch.constant.int -1
    %7796 = torch.prim.ListConstruct %int-1_10889 : (!torch.int) -> !torch.list<int>
    %true_10890 = torch.constant.bool true
    %none_10891 = torch.constant.none
    %7797 = torch.aten.mean.dim %7795, %7796, %true_10890, %none_10891 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // eps ~= 1e-6.
    %float9.999990e-07_10892 = torch.constant.float 9.9999999999999995E-7
    %int1_10893 = torch.constant.int 1
    %7798 = torch.aten.add.Scalar %7797, %float9.999990e-07_10892, %int1_10893 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %7799 = torch.aten.rsqrt %7798 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %7800 = torch.aten.mul.Tensor %7794, %7799 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10894 = torch.constant.int 5
    %7801 = torch.prims.convert_element_type %7800, %int5_10894 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.9.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.9.norm.query_norm.scale : tensor<128xf16>
    %7802 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.9.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %7803 = torch.aten.mul.Tensor %7801, %7802 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // --- key path (%7792 -> %7813), same sequence of ops with key_norm.scale ---
    %int6_10895 = torch.constant.int 6
    %7804 = torch.prims.convert_element_type %7792, %int6_10895 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_10896 = torch.constant.int 2
    %7805 = torch.aten.pow.Tensor_Scalar %7804, %int2_10896 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_10897 = torch.constant.int -1
    %7806 = torch.prim.ListConstruct %int-1_10897 : (!torch.int) -> !torch.list<int>
    %true_10898 = torch.constant.bool true
    %none_10899 = torch.constant.none
    %7807 = torch.aten.mean.dim %7805, %7806, %true_10898, %none_10899 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_10900 = torch.constant.float 9.9999999999999995E-7
    %int1_10901 = torch.constant.int 1
    %7808 = torch.aten.add.Scalar %7807, %float9.999990e-07_10900, %int1_10901 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %7809 = torch.aten.rsqrt %7808 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %7810 = torch.aten.mul.Tensor %7804, %7809 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10902 = torch.constant.int 5
    %7811 = torch.prims.convert_element_type %7810, %int5_10902 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.9.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.9.norm.key_norm.scale : tensor<128xf16>
    %7812 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.9.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %7813 = torch.aten.mul.Tensor %7811, %7812 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Combine the normalized query (%7803) and key (%7813) with the table %211
    // ([1,1,4608,64,2,2] f32, defined earlier in the function). For each tensor x viewed
    // as [1,24,4608,64,1,2] the result is
    //   out = %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]
    // which is then flattened back to [1,24,4608,128] and cast to f16.
    // NOTE(review): %211 is presumably the rotary position-embedding table -- confirm at
    // its definition.
    // These two converts use dtype code 5 on f16 inputs; input and result types match.
    %int5_10903 = torch.constant.int 5
    %7814 = torch.prims.convert_element_type %7803, %int5_10903 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_10904 = torch.constant.int 5
    %7815 = torch.prims.convert_element_type %7813, %int5_10904 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Query: promote to f32 and view the 128-wide last dim as 64 x 1 x 2.
    %int6_10905 = torch.constant.int 6
    %7816 = torch.prims.convert_element_type %7814, %int6_10905 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_10906 = torch.constant.int 1
    %int24_10907 = torch.constant.int 24
    %int4608_10908 = torch.constant.int 4608
    %int64_10909 = torch.constant.int 64
    %int1_10910 = torch.constant.int 1
    %int2_10911 = torch.constant.int 2
    %7817 = torch.prim.ListConstruct %int1_10906, %int24_10907, %int4608_10908, %int64_10909, %int1_10910, %int2_10911 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7818 = torch.aten.view %7816, %7817 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Key: same promotion and view.
    %int6_10912 = torch.constant.int 6
    %7819 = torch.prims.convert_element_type %7815, %int6_10912 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_10913 = torch.constant.int 1
    %int24_10914 = torch.constant.int 24
    %int4608_10915 = torch.constant.int 4608
    %int64_10916 = torch.constant.int 64
    %int1_10917 = torch.constant.int 1
    %int2_10918 = torch.constant.int 2
    %7820 = torch.prim.ListConstruct %int1_10913, %int24_10914, %int4608_10915, %int64_10916, %int1_10917, %int2_10918 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7821 = torch.aten.view %7819, %7820 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Query, first term: %211[..., 0] * q[..., 0]; the [.,1,.,.,2] and [.,24,.,.,1]
    // operands broadcast to [1,24,4608,64,2].
    %int5_10919 = torch.constant.int 5
    %int0_10920 = torch.constant.int 0
    %7822 = torch.aten.select.int %211, %int5_10919, %int0_10920 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10921 = torch.constant.int 5
    %int0_10922 = torch.constant.int 0
    %7823 = torch.aten.select.int %7818, %int5_10921, %int0_10922 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7824 = torch.aten.mul.Tensor %7822, %7823 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Query, second term: %211[..., 1] * q[..., 1].
    %int5_10923 = torch.constant.int 5
    %int1_10924 = torch.constant.int 1
    %7825 = torch.aten.select.int %211, %int5_10923, %int1_10924 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10925 = torch.constant.int 5
    %int1_10926 = torch.constant.int 1
    %7826 = torch.aten.select.int %7818, %int5_10925, %int1_10926 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7827 = torch.aten.mul.Tensor %7825, %7826 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Query result = first term + second term.
    %int1_10927 = torch.constant.int 1
    %7828 = torch.aten.add.Tensor %7824, %7827, %int1_10927 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key, first term: %211[..., 0] * k[..., 0].
    %int5_10928 = torch.constant.int 5
    %int0_10929 = torch.constant.int 0
    %7829 = torch.aten.select.int %211, %int5_10928, %int0_10929 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10930 = torch.constant.int 5
    %int0_10931 = torch.constant.int 0
    %7830 = torch.aten.select.int %7821, %int5_10930, %int0_10931 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7831 = torch.aten.mul.Tensor %7829, %7830 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key, second term: %211[..., 1] * k[..., 1].
    %int5_10932 = torch.constant.int 5
    %int1_10933 = torch.constant.int 1
    %7832 = torch.aten.select.int %211, %int5_10932, %int1_10933 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10934 = torch.constant.int 5
    %int1_10935 = torch.constant.int 1
    %7833 = torch.aten.select.int %7821, %int5_10934, %int1_10935 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7834 = torch.aten.mul.Tensor %7832, %7833 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key result = first term + second term.
    %int1_10936 = torch.constant.int 1
    %7835 = torch.aten.add.Tensor %7831, %7834, %int1_10936 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Flatten the query result back to [1,24,4608,128] and cast to f16 -> %7838.
    %int1_10937 = torch.constant.int 1
    %int24_10938 = torch.constant.int 24
    %int4608_10939 = torch.constant.int 4608
    %int128_10940 = torch.constant.int 128
    %7836 = torch.prim.ListConstruct %int1_10937, %int24_10938, %int4608_10939, %int128_10940 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7837 = torch.aten.view %7828, %7836 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10941 = torch.constant.int 5
    %7838 = torch.prims.convert_element_type %7837, %int5_10941 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Flatten the key result back to [1,24,4608,128] and cast to f16 -> %7841.
    %int1_10942 = torch.constant.int 1
    %int24_10943 = torch.constant.int 24
    %int4608_10944 = torch.constant.int 4608
    %int128_10945 = torch.constant.int 128
    %7839 = torch.prim.ListConstruct %int1_10942, %int24_10943, %int4608_10944, %int128_10945 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7840 = torch.aten.view %7835, %7839 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10946 = torch.constant.int 5
    %7841 = torch.prims.convert_element_type %7840, %int5_10946 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled dot-product attention over 24 heads of width 128, with %7838 as query,
    // %7841 as key and %7793 as value. The op is emitted as a generic torch.operator.
    // NOTE(review): per the ATen signature the trailing operands are presumably
    // dropout_p = 0.0, is_causal = false, attn_mask = none, scale = none -- confirm
    // against the PyTorch version used for export.
    %float0.000000e00_10947 = torch.constant.float 0.000000e+00
    %false_10948 = torch.constant.bool false
    %none_10949 = torch.constant.none
    %none_10950 = torch.constant.none
    %7842:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%7838, %7841, %7793, %float0.000000e00_10947, %false_10948, %none_10949, %none_10950) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Only result #0 is consumed below. Permute it with order (0,2,1,3) to
    // [1,4608,24,128], then merge the 24 x 128 trailing dims into 3072.
    %int0_10951 = torch.constant.int 0
    %int2_10952 = torch.constant.int 2
    %int1_10953 = torch.constant.int 1
    %int3_10954 = torch.constant.int 3
    %7843 = torch.prim.ListConstruct %int0_10951, %int2_10952, %int1_10953, %int3_10954 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7844 = torch.aten.permute %7842#0, %7843 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_10955 = torch.constant.int 1
    %int4608_10956 = torch.constant.int 4608
    %int3072_10957 = torch.constant.int 3072
    %7845 = torch.prim.ListConstruct %int1_10955, %int4608_10956, %int3072_10957 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7846 = torch.aten.view %7844, %7845 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.9 output stage: apply GELU to the 12288-wide slice, concatenate it
    // with the attention output, project back to 3072 features with linear2, multiply by
    // the modulation slice %7759, and add the result to the block input %7741.
    // GELU with the "tanh" approximation on %7786.
    %str_10958 = torch.constant.str "tanh"
    %7847 = torch.aten.gelu %7786, %str_10958 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    // Concatenate along dim 2: 3072 (attention) + 12288 (GELU) = 15360.
    %7848 = torch.prim.ListConstruct %7846, %7847 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_10959 = torch.constant.int 2
    %7849 = torch.aten.cat %7848, %int2_10959 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Drop the batch dim for the 2-D matmul.
    %int4608_10960 = torch.constant.int 4608
    %int15360_10961 = torch.constant.int 15360
    %7850 = torch.prim.ListConstruct %int4608_10960, %int15360_10961 : (!torch.int, !torch.int) -> !torch.list<int>
    %7851 = torch.aten.view %7849, %7850 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // Load the [3072,15360] weight and transpose it to [15360,3072].
    %__auto.sampler.single_blocks.9.linear2.weight = util.global.load @__auto.sampler.single_blocks.9.linear2.weight : tensor<3072x15360xf16>
    %7852 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.9.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_10962 = torch.constant.int 0
    %int1_10963 = torch.constant.int 1
    %7853 = torch.aten.transpose.int %7852, %int0_10962, %int1_10963 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.9.linear2.bias = util.global.load @__auto.sampler.single_blocks.9.linear2.bias : tensor<3072xf16>
    %7854 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.9.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Promote bias, input and weight to f32 (dtype code 6) and multiply.
    %int6_10964 = torch.constant.int 6
    %7855 = torch.prims.convert_element_type %7854, %int6_10964 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_10965 = torch.constant.int 6
    %7856 = torch.prims.convert_element_type %7851, %int6_10965 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_10966 = torch.constant.int 6
    %7857 = torch.prims.convert_element_type %7853, %int6_10966 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %7858 = torch.aten.mm %7856, %7857 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Product and bias are each multiplied by the integer constant 1 (values unchanged)
    // and summed; the bias broadcasts over rows.
    %int1_10967 = torch.constant.int 1
    %7859 = torch.aten.mul.Scalar %7858, %int1_10967 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_10968 = torch.constant.int 1
    %7860 = torch.aten.mul.Scalar %7855, %int1_10968 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_10969 = torch.constant.int 1
    %7861 = torch.aten.add.Tensor %7859, %7860, %int1_10969 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    // Cast to f16 (dtype code 5) and restore the batch dim -> [1,4608,3072].
    %int5_10970 = torch.constant.int 5
    %7862 = torch.prims.convert_element_type %7861, %int5_10970 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_10971 = torch.constant.int 1
    %int4608_10972 = torch.constant.int 4608
    %int3072_10973 = torch.constant.int 3072
    %7863 = torch.prim.ListConstruct %int1_10971, %int4608_10972, %int3072_10973 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7864 = torch.aten.view %7862, %7863 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Gated residual: %7866 = %7741 + %7759 * linear2_output.
    %7865 = torch.aten.mul.Tensor %7759, %7864 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_10974 = torch.constant.int 1
    %7866 = torch.aten.add.Tensor %7741, %7865, %int1_10974 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %7867 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.10.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.10.modulation.lin.weight : tensor<9216x3072xf16>
    %7868 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.10.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_10975 = torch.constant.int 0
    %int1_10976 = torch.constant.int 1
    %7869 = torch.aten.transpose.int %7868, %int0_10975, %int1_10976 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.10.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.10.modulation.lin.bias : tensor<9216xf16>
    %7870 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.10.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    %int6_10977 = torch.constant.int 6
    %7871 = torch.prims.convert_element_type %7870, %int6_10977 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_10978 = torch.constant.int 6
    %7872 = torch.prims.convert_element_type %7867, %int6_10978 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_10979 = torch.constant.int 6
    %7873 = torch.prims.convert_element_type %7869, %int6_10979 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %7874 = torch.aten.mm %7872, %7873 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    %int1_10980 = torch.constant.int 1
    %7875 = torch.aten.mul.Scalar %7874, %int1_10980 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_10981 = torch.constant.int 1
    // Tail of a bias-add: %7871 is multiplied by the scalar 1, added (alpha = 1)
    // to %7875, and the f32 sum is cast to f16 (dtype code 5).
    // %7871 and %7875 are defined earlier in the function; presumably this is the
    // modulation projection feeding single_blocks.10 -- TODO confirm.
    %7876 = torch.aten.mul.Scalar %7871, %int1_10981 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_10982 = torch.constant.int 1
    %7877 = torch.aten.add.Tensor %7875, %7876, %int1_10982 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_10983 = torch.constant.int 5
    %7878 = torch.prims.convert_element_type %7877, %int5_10983 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): shape unchanged.
    %int0_10984 = torch.constant.int 0
    %int0_10985 = torch.constant.int 0
    %int9223372036854775807_10986 = torch.constant.int 9223372036854775807
    %int1_10987 = torch.constant.int 1
    %7879 = torch.aten.slice.Tensor %7878, %int0_10984, %int0_10985, %int9223372036854775807_10986, %int1_10987 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a unit dim at position 1 so the vector broadcasts against
    // [1,4608,3072] activations: [1,9216] -> [1,1,9216].
    %int1_10988 = torch.constant.int 1
    %7880 = torch.aten.unsqueeze %7879, %int1_10988 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Full-range slice on dim 2: shape unchanged.
    %int2_10989 = torch.constant.int 2
    %int0_10990 = torch.constant.int 0
    %int9223372036854775807_10991 = torch.constant.int 9223372036854775807
    %int1_10992 = torch.constant.int 1
    %7881 = torch.aten.slice.Tensor %7880, %int2_10989, %int0_10990, %int9223372036854775807_10991, %int1_10992 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Columns [0, 3072) of the last dim. %7882 is later used as the additive
    // term after normalisation (see the add producing %7894).
    %int-1_10993 = torch.constant.int -1
    %int0_10994 = torch.constant.int 0
    %int3072_10995 = torch.constant.int 3072
    %int1_10996 = torch.constant.int 1
    %7882 = torch.aten.slice.Tensor %7881, %int-1_10993, %int0_10994, %int3072_10995, %int1_10996 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Columns [3072, 6144): the multiplicative term (used as 1 + %7883 below).
    %int-1_10997 = torch.constant.int -1
    %int3072_10998 = torch.constant.int 3072
    %int6144_10999 = torch.constant.int 6144
    %int1_11000 = torch.constant.int 1
    %7883 = torch.aten.slice.Tensor %7881, %int-1_10997, %int3072_10998, %int6144_10999, %int1_11000 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Columns [6144, 9216). %7884 later multiplies the linear2 output before
    // the residual add (see %7990).
    %int-1_11001 = torch.constant.int -1
    %int6144_11002 = torch.constant.int 6144
    %int9216_11003 = torch.constant.int 9216
    %int1_11004 = torch.constant.int 1
    %7884 = torch.aten.slice.Tensor %7881, %int-1_11001, %int6144_11002, %int9216_11003, %int1_11004 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %7885 = %7883 + 1 (alpha = 1).
    %int1_11005 = torch.constant.int 1
    %int1_11006 = torch.constant.int 1
    %7885 = torch.aten.add.Scalar %7883, %int1_11005, %int1_11006 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalise %7866 over its last dim without learned affine parameters, then
    // apply the modulation terms: %7894 = (1 + %7883) * norm(%7866) + %7882.
    // Statistics are computed in f32 (dtype code 6).
    %int6_11007 = torch.constant.int 6
    %7886 = torch.prims.convert_element_type %7866, %int6_11007 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    // var_mean over dim 2 with correction = 0 and keepdim = true.
    // result0 is the variance, result1 the mean; both are [1,4608,1].
    %int2_11008 = torch.constant.int 2
    %7887 = torch.prim.ListConstruct %int2_11008 : (!torch.int) -> !torch.list<int>
    %int0_11009 = torch.constant.int 0
    %true_11010 = torch.constant.bool true
    %result0_11011, %result1_11012 = torch.aten.var_mean.correction %7886, %7887, %int0_11009, %true_11010 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // rsqrt(variance + eps), with eps ~= 1e-6.
    %float9.999990e-07_11013 = torch.constant.float 9.9999999999999995E-7
    %int1_11014 = torch.constant.int 1
    %7888 = torch.aten.add.Scalar %result0_11011, %float9.999990e-07_11013, %int1_11014 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %7889 = torch.aten.rsqrt %7888 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // (x - mean) * rsqrt(var + eps). The f16 input is subtracted from the f32
    // mean directly, producing an f32 result.
    %int1_11015 = torch.constant.int 1
    %7890 = torch.aten.sub.Tensor %7866, %result1_11012, %int1_11015 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %7891 = torch.aten.mul.Tensor %7890, %7889 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    // Back to f16 (dtype code 5) before applying the modulation.
    %int5_11016 = torch.constant.int 5
    %7892 = torch.prims.convert_element_type %7891, %int5_11016 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Multiply by (1 + %7883), broadcast over the 4608 rows, then add %7882.
    %7893 = torch.aten.mul.Tensor %7885, %7892 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_11017 = torch.constant.int 1
    %7894 = torch.aten.add.Tensor %7893, %7882, %int1_11017 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.10.linear1: y = x @ W^T + b, mapping 3072 -> 21504 features.
    // Inputs are upcast so the matmul and bias add run in f32; the result is
    // cast back to f16.
    // Drop the leading unit dim for the 2-D matmul: [1,4608,3072] -> [4608,3072].
    %int4608_11018 = torch.constant.int 4608
    %int3072_11019 = torch.constant.int 3072
    %7895 = torch.prim.ListConstruct %int4608_11018, %int3072_11019 : (!torch.int, !torch.int) -> !torch.list<int>
    %7896 = torch.aten.view %7894, %7895 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // Load the weight parameter and transpose it to [3072,21504].
    %__auto.sampler.single_blocks.10.linear1.weight = util.global.load @__auto.sampler.single_blocks.10.linear1.weight : tensor<21504x3072xf16>
    %7897 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.10.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_11020 = torch.constant.int 0
    %int1_11021 = torch.constant.int 1
    %7898 = torch.aten.transpose.int %7897, %int0_11020, %int1_11021 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    // Load the bias parameter.
    %__auto.sampler.single_blocks.10.linear1.bias = util.global.load @__auto.sampler.single_blocks.10.linear1.bias : tensor<21504xf16>
    %7899 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.10.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Upcast bias, activations and weight to f32 (dtype code 6).
    %int6_11022 = torch.constant.int 6
    %7900 = torch.prims.convert_element_type %7899, %int6_11022 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_11023 = torch.constant.int 6
    %7901 = torch.prims.convert_element_type %7896, %int6_11023 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_11024 = torch.constant.int 6
    %7902 = torch.prims.convert_element_type %7898, %int6_11024 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %7903 = torch.aten.mm %7901, %7902 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Both the product and the bias are multiplied by the scalar 1 before being
    // summed (alpha = 1).
    %int1_11025 = torch.constant.int 1
    %7904 = torch.aten.mul.Scalar %7903, %int1_11025 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_11026 = torch.constant.int 1
    %7905 = torch.aten.mul.Scalar %7900, %int1_11026 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_11027 = torch.constant.int 1
    %7906 = torch.aten.add.Tensor %7904, %7905, %int1_11027 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    // Cast back to f16 (dtype code 5) and restore the leading unit dim.
    %int5_11028 = torch.constant.int 5
    %7907 = torch.prims.convert_element_type %7906, %int5_11028 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_11029 = torch.constant.int 1
    %int4608_11030 = torch.constant.int 4608
    %int21504_11031 = torch.constant.int 21504
    %7908 = torch.prim.ListConstruct %int1_11029, %int4608_11030, %int21504_11031 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7909 = torch.aten.view %7907, %7908 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the linear1 output along the last dim.
    // Columns [0, 9216) are reshaped into three [1,24,4608,128] tensors below.
    %int-1_11032 = torch.constant.int -1
    %int0_11033 = torch.constant.int 0
    %int9216_11034 = torch.constant.int 9216
    %int1_11035 = torch.constant.int 1
    %7910 = torch.aten.slice.Tensor %7909, %int-1_11032, %int0_11033, %int9216_11034, %int1_11035 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    // Columns [9216, 21504), 12288 wide. %7911 later goes through GELU and is
    // concatenated with the attention output.
    %int-1_11036 = torch.constant.int -1
    %int9216_11037 = torch.constant.int 9216
    %int21504_11038 = torch.constant.int 21504
    %int1_11039 = torch.constant.int 1
    %7911 = torch.aten.slice.Tensor %7909, %int-1_11036, %int9216_11037, %int21504_11038, %int1_11039 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View the 9216 columns as 3 x 24 x 128 (presumably {q,k,v} x heads x head_dim).
    %int1_11040 = torch.constant.int 1
    %int4608_11041 = torch.constant.int 4608
    %int3_11042 = torch.constant.int 3
    %int24_11043 = torch.constant.int 24
    %int128_11044 = torch.constant.int 128
    %7912 = torch.prim.ListConstruct %int1_11040, %int4608_11041, %int3_11042, %int24_11043, %int128_11044 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7913 = torch.aten.view %7910, %7912 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2,0,3,1,4): [1,4608,3,24,128] -> [3,1,24,4608,128].
    %int2_11045 = torch.constant.int 2
    %int0_11046 = torch.constant.int 0
    %int3_11047 = torch.constant.int 3
    %int1_11048 = torch.constant.int 1
    %int4_11049 = torch.constant.int 4
    %7914 = torch.prim.ListConstruct %int2_11045, %int0_11046, %int3_11047, %int1_11048, %int4_11049 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7915 = torch.aten.permute %7913, %7914 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0 along dim 0. %7916 ends up as the first (query) operand of the
    // attention op.
    %int0_11050 = torch.constant.int 0
    %int0_11051 = torch.constant.int 0
    %7916 = torch.aten.select.int %7915, %int0_11050, %int0_11051 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 1 along dim 0. %7917 ends up as the second (key) operand.
    %int0_11052 = torch.constant.int 0
    %int1_11053 = torch.constant.int 1
    %7917 = torch.aten.select.int %7915, %int0_11052, %int1_11053 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 2 along dim 0. %7918 is passed unchanged as the third (value) operand.
    %int0_11054 = torch.constant.int 0
    %int2_11055 = torch.constant.int 2
    %7918 = torch.aten.select.int %7915, %int0_11054, %int2_11055 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // RMS-normalise %7916 over the last dim in f32, then multiply by the learned
    // query_norm scale: x * rsqrt(mean(x^2) + eps) * scale, with eps ~= 1e-6.
    %int6_11056 = torch.constant.int 6
    %7919 = torch.prims.convert_element_type %7916, %int6_11056 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_11057 = torch.constant.int 2
    %7920 = torch.aten.pow.Tensor_Scalar %7919, %int2_11057 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean of squares over dim -1 with keepdim = true and no dtype override.
    %int-1_11058 = torch.constant.int -1
    %7921 = torch.prim.ListConstruct %int-1_11058 : (!torch.int) -> !torch.list<int>
    %true_11059 = torch.constant.bool true
    %none_11060 = torch.constant.none
    %7922 = torch.aten.mean.dim %7920, %7921, %true_11059, %none_11060 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_11061 = torch.constant.float 9.9999999999999995E-7
    %int1_11062 = torch.constant.int 1
    %7923 = torch.aten.add.Scalar %7922, %float9.999990e-07_11061, %int1_11062 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %7924 = torch.aten.rsqrt %7923 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %7925 = torch.aten.mul.Tensor %7919, %7924 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    // Cast to f16 (dtype code 5) before applying the f16 scale.
    %int5_11063 = torch.constant.int 5
    %7926 = torch.prims.convert_element_type %7925, %int5_11063 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.10.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.10.norm.query_norm.scale : tensor<128xf16>
    %7927 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.10.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %7928 = torch.aten.mul.Tensor %7926, %7927 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Same RMS normalisation for %7917, using the key_norm scale.
    %int6_11064 = torch.constant.int 6
    %7929 = torch.prims.convert_element_type %7917, %int6_11064 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_11065 = torch.constant.int 2
    %7930 = torch.aten.pow.Tensor_Scalar %7929, %int2_11065 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_11066 = torch.constant.int -1
    %7931 = torch.prim.ListConstruct %int-1_11066 : (!torch.int) -> !torch.list<int>
    %true_11067 = torch.constant.bool true
    %none_11068 = torch.constant.none
    %7932 = torch.aten.mean.dim %7930, %7931, %true_11067, %none_11068 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_11069 = torch.constant.float 9.9999999999999995E-7
    %int1_11070 = torch.constant.int 1
    %7933 = torch.aten.add.Scalar %7932, %float9.999990e-07_11069, %int1_11070 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %7934 = torch.aten.rsqrt %7933 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %7935 = torch.aten.mul.Tensor %7929, %7934 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11071 = torch.constant.int 5
    %7936 = torch.prims.convert_element_type %7935, %int5_11071 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.10.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.10.norm.key_norm.scale : tensor<128xf16>
    %7937 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.10.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %7938 = torch.aten.mul.Tensor %7936, %7937 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Apply a per-position 2x2 linear map from %211 to each consecutive pair of
    // features in the normalised q (%7928) and k (%7938):
    //   out[..., i] = %211[..., i, 0] * x[..., 0] + %211[..., i, 1] * x[..., 1]
    // %211 is defined earlier in the function; presumably it is the rotary
    // position-embedding table -- TODO confirm. Its dim 1 has size 1, so it
    // broadcasts over the 24-sized dim of q and k.
    // These two casts are f16 -> f16 (dtype code 5), so the element type is unchanged.
    %int5_11072 = torch.constant.int 5
    %7939 = torch.prims.convert_element_type %7928, %int5_11072 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_11073 = torch.constant.int 5
    %7940 = torch.prims.convert_element_type %7938, %int5_11073 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // q: upcast to f32 and view the last dim 128 as 64 pairs: [...,128] -> [...,64,1,2].
    %int6_11074 = torch.constant.int 6
    %7941 = torch.prims.convert_element_type %7939, %int6_11074 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_11075 = torch.constant.int 1
    %int24_11076 = torch.constant.int 24
    %int4608_11077 = torch.constant.int 4608
    %int64_11078 = torch.constant.int 64
    %int1_11079 = torch.constant.int 1
    %int2_11080 = torch.constant.int 2
    %7942 = torch.prim.ListConstruct %int1_11075, %int24_11076, %int4608_11077, %int64_11078, %int1_11079, %int2_11080 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7943 = torch.aten.view %7941, %7942 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // k: same upcast and view.
    %int6_11081 = torch.constant.int 6
    %7944 = torch.prims.convert_element_type %7940, %int6_11081 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_11082 = torch.constant.int 1
    %int24_11083 = torch.constant.int 24
    %int4608_11084 = torch.constant.int 4608
    %int64_11085 = torch.constant.int 64
    %int1_11086 = torch.constant.int 1
    %int2_11087 = torch.constant.int 2
    %7945 = torch.prim.ListConstruct %int1_11082, %int24_11083, %int4608_11084, %int64_11085, %int1_11086, %int2_11087 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7946 = torch.aten.view %7944, %7945 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // q, first term: index 0 of the last dim of %211 times element 0 of each pair.
    %int5_11088 = torch.constant.int 5
    %int0_11089 = torch.constant.int 0
    %7947 = torch.aten.select.int %211, %int5_11088, %int0_11089 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11090 = torch.constant.int 5
    %int0_11091 = torch.constant.int 0
    %7948 = torch.aten.select.int %7943, %int5_11090, %int0_11091 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7949 = torch.aten.mul.Tensor %7947, %7948 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // q, second term: index 1 of the last dim of %211 times element 1 of each pair.
    %int5_11092 = torch.constant.int 5
    %int1_11093 = torch.constant.int 1
    %7950 = torch.aten.select.int %211, %int5_11092, %int1_11093 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11094 = torch.constant.int 5
    %int1_11095 = torch.constant.int 1
    %7951 = torch.aten.select.int %7943, %int5_11094, %int1_11095 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7952 = torch.aten.mul.Tensor %7950, %7951 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // q: sum of the two terms.
    %int1_11096 = torch.constant.int 1
    %7953 = torch.aten.add.Tensor %7949, %7952, %int1_11096 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k, first term.
    %int5_11097 = torch.constant.int 5
    %int0_11098 = torch.constant.int 0
    %7954 = torch.aten.select.int %211, %int5_11097, %int0_11098 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11099 = torch.constant.int 5
    %int0_11100 = torch.constant.int 0
    %7955 = torch.aten.select.int %7946, %int5_11099, %int0_11100 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7956 = torch.aten.mul.Tensor %7954, %7955 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k, second term.
    %int5_11101 = torch.constant.int 5
    %int1_11102 = torch.constant.int 1
    %7957 = torch.aten.select.int %211, %int5_11101, %int1_11102 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11103 = torch.constant.int 5
    %int1_11104 = torch.constant.int 1
    %7958 = torch.aten.select.int %7946, %int5_11103, %int1_11104 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7959 = torch.aten.mul.Tensor %7957, %7958 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k: sum of the two terms.
    %int1_11105 = torch.constant.int 1
    %7960 = torch.aten.add.Tensor %7956, %7959, %int1_11105 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // q: merge the pairs back to 128 features and cast to f16.
    %int1_11106 = torch.constant.int 1
    %int24_11107 = torch.constant.int 24
    %int4608_11108 = torch.constant.int 4608
    %int128_11109 = torch.constant.int 128
    %7961 = torch.prim.ListConstruct %int1_11106, %int24_11107, %int4608_11108, %int128_11109 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7962 = torch.aten.view %7953, %7961 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11110 = torch.constant.int 5
    %7963 = torch.prims.convert_element_type %7962, %int5_11110 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // k: merge the pairs back to 128 features and cast to f16.
    %int1_11111 = torch.constant.int 1
    %int24_11112 = torch.constant.int 24
    %int4608_11113 = torch.constant.int 4608
    %int128_11114 = torch.constant.int 128
    %7964 = torch.prim.ListConstruct %int1_11111, %int24_11112, %int4608_11113, %int128_11114 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7965 = torch.aten.view %7960, %7964 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11115 = torch.constant.int 5
    %7966 = torch.prims.convert_element_type %7965, %int5_11115 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled dot-product attention over (q = %7963, k = %7966, v = %7918).
    // Remaining operands, in order: 0.0, false, none, none. Assuming the usual
    // aten signature these are dropout_p, is_causal, attn_mask and scale
    // (none = default) -- TODO confirm against the op schema.
    // The op has two results; only #0, shaped [1,24,4608,128], is consumed in
    // the lines that follow.
    %float0.000000e00_11116 = torch.constant.float 0.000000e+00
    %false_11117 = torch.constant.bool false
    %none_11118 = torch.constant.none
    %none_11119 = torch.constant.none
    %7967:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%7963, %7966, %7918, %float0.000000e00_11116, %false_11117, %none_11118, %none_11119) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2 with permutation (0,2,1,3): [1,24,4608,128] -> [1,4608,24,128].
    %int0_11120 = torch.constant.int 0
    %int2_11121 = torch.constant.int 2
    %int1_11122 = torch.constant.int 1
    %int3_11123 = torch.constant.int 3
    %7968 = torch.prim.ListConstruct %int0_11120, %int2_11121, %int1_11122, %int3_11123 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7969 = torch.aten.permute %7967#0, %7968 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the trailing 24 x 128 dims into 3072 features per row.
    %int1_11124 = torch.constant.int 1
    %int4608_11125 = torch.constant.int 4608
    %int3072_11126 = torch.constant.int 3072
    %7970 = torch.prim.ListConstruct %int1_11124, %int4608_11125, %int3072_11126 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7971 = torch.aten.view %7969, %7970 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Output stage of single_blocks.10: GELU on the 12288-wide slice %7911,
    // concat with the attention output, linear2 projection (15360 -> 3072),
    // then a residual add scaled by %7884.
    // GELU with the "tanh" approximation.
    %str_11127 = torch.constant.str "tanh"
    %7972 = torch.aten.gelu %7911, %str_11127 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    // Concatenate [attention (3072), gelu (12288)] along dim 2 -> 15360 features.
    %7973 = torch.prim.ListConstruct %7971, %7972 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_11128 = torch.constant.int 2
    %7974 = torch.aten.cat %7973, %int2_11128 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Drop the leading unit dim for the 2-D matmul.
    %int4608_11129 = torch.constant.int 4608
    %int15360_11130 = torch.constant.int 15360
    %7975 = torch.prim.ListConstruct %int4608_11129, %int15360_11130 : (!torch.int, !torch.int) -> !torch.list<int>
    %7976 = torch.aten.view %7974, %7975 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // linear2: y = x @ W^T + b, computed in f32 and cast back to f16.
    %__auto.sampler.single_blocks.10.linear2.weight = util.global.load @__auto.sampler.single_blocks.10.linear2.weight : tensor<3072x15360xf16>
    %7977 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.10.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_11131 = torch.constant.int 0
    %int1_11132 = torch.constant.int 1
    %7978 = torch.aten.transpose.int %7977, %int0_11131, %int1_11132 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.10.linear2.bias = util.global.load @__auto.sampler.single_blocks.10.linear2.bias : tensor<3072xf16>
    %7979 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.10.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, activations and weight to f32 (dtype code 6).
    %int6_11133 = torch.constant.int 6
    %7980 = torch.prims.convert_element_type %7979, %int6_11133 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_11134 = torch.constant.int 6
    %7981 = torch.prims.convert_element_type %7976, %int6_11134 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_11135 = torch.constant.int 6
    %7982 = torch.prims.convert_element_type %7978, %int6_11135 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %7983 = torch.aten.mm %7981, %7982 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Product and bias are each multiplied by the scalar 1, then summed (alpha = 1).
    %int1_11136 = torch.constant.int 1
    %7984 = torch.aten.mul.Scalar %7983, %int1_11136 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_11137 = torch.constant.int 1
    %7985 = torch.aten.mul.Scalar %7980, %int1_11137 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_11138 = torch.constant.int 1
    %7986 = torch.aten.add.Tensor %7984, %7985, %int1_11138 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    // Cast back to f16 (dtype code 5) and restore the leading unit dim.
    %int5_11139 = torch.constant.int 5
    %7987 = torch.prims.convert_element_type %7986, %int5_11139 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_11140 = torch.constant.int 1
    %int4608_11141 = torch.constant.int 4608
    %int3072_11142 = torch.constant.int 3072
    %7988 = torch.prim.ListConstruct %int1_11140, %int4608_11141, %int3072_11142 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7989 = torch.aten.view %7987, %7988 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Gated residual: %7991 = %7866 + %7884 * linear2_out, where %7884 is
    // columns [6144, 9216) of the modulation vector, broadcast over the 4608 rows.
    %7990 = torch.aten.mul.Tensor %7884, %7989 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_11143 = torch.constant.int 1
    %7991 = torch.aten.add.Tensor %7866, %7990, %int1_11143 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.11 modulation: silu(%119) -> linear (3072 -> 9216) -> split
    // into three 3072-wide pieces.
    // %119 is defined earlier in the function; presumably it is the conditioning
    // vector shared by all blocks -- TODO confirm.
    %7992 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the weight parameter and transpose it to [3072,9216].
    %__auto.sampler.single_blocks.11.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.11.modulation.lin.weight : tensor<9216x3072xf16>
    %7993 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.11.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_11144 = torch.constant.int 0
    %int1_11145 = torch.constant.int 1
    %7994 = torch.aten.transpose.int %7993, %int0_11144, %int1_11145 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.11.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.11.modulation.lin.bias : tensor<9216xf16>
    %7995 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.11.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, activations and weight to f32 (dtype code 6) for the matmul.
    %int6_11146 = torch.constant.int 6
    %7996 = torch.prims.convert_element_type %7995, %int6_11146 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_11147 = torch.constant.int 6
    %7997 = torch.prims.convert_element_type %7992, %int6_11147 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_11148 = torch.constant.int 6
    %7998 = torch.prims.convert_element_type %7994, %int6_11148 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %7999 = torch.aten.mm %7997, %7998 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Product and bias are each multiplied by the scalar 1, then summed (alpha = 1).
    %int1_11149 = torch.constant.int 1
    %8000 = torch.aten.mul.Scalar %7999, %int1_11149 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_11150 = torch.constant.int 1
    %8001 = torch.aten.mul.Scalar %7996, %int1_11150 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_11151 = torch.constant.int 1
    %8002 = torch.aten.add.Tensor %8000, %8001, %int1_11151 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    // Cast back to f16 (dtype code 5).
    %int5_11152 = torch.constant.int 5
    %8003 = torch.prims.convert_element_type %8002, %int5_11152 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): shape unchanged.
    %int0_11153 = torch.constant.int 0
    %int0_11154 = torch.constant.int 0
    %int9223372036854775807_11155 = torch.constant.int 9223372036854775807
    %int1_11156 = torch.constant.int 1
    %8004 = torch.aten.slice.Tensor %8003, %int0_11153, %int0_11154, %int9223372036854775807_11155, %int1_11156 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a unit dim at position 1: [1,9216] -> [1,1,9216].
    %int1_11157 = torch.constant.int 1
    %8005 = torch.aten.unsqueeze %8004, %int1_11157 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Full-range slice on dim 2: shape unchanged.
    %int2_11158 = torch.constant.int 2
    %int0_11159 = torch.constant.int 0
    %int9223372036854775807_11160 = torch.constant.int 9223372036854775807
    %int1_11161 = torch.constant.int 1
    %8006 = torch.aten.slice.Tensor %8005, %int2_11158, %int0_11159, %int9223372036854775807_11160, %int1_11161 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Columns [0, 3072). %8007 is later the additive term after normalisation
    // (see %8019).
    %int-1_11162 = torch.constant.int -1
    %int0_11163 = torch.constant.int 0
    %int3072_11164 = torch.constant.int 3072
    %int1_11165 = torch.constant.int 1
    %8007 = torch.aten.slice.Tensor %8006, %int-1_11162, %int0_11163, %int3072_11164, %int1_11165 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Columns [3072, 6144): the multiplicative term (used as 1 + %8008 below).
    %int-1_11166 = torch.constant.int -1
    %int3072_11167 = torch.constant.int 3072
    %int6144_11168 = torch.constant.int 6144
    %int1_11169 = torch.constant.int 1
    %8008 = torch.aten.slice.Tensor %8006, %int-1_11166, %int3072_11167, %int6144_11168, %int1_11169 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Columns [6144, 9216). By analogy with single_blocks.10, %8009 presumably
    // gates the block output later in the function -- TODO confirm.
    %int-1_11170 = torch.constant.int -1
    %int6144_11171 = torch.constant.int 6144
    %int9216_11172 = torch.constant.int 9216
    %int1_11173 = torch.constant.int 1
    %8009 = torch.aten.slice.Tensor %8006, %int-1_11170, %int6144_11171, %int9216_11172, %int1_11173 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %8010 = %8008 + 1 (alpha = 1).
    %int1_11174 = torch.constant.int 1
    %int1_11175 = torch.constant.int 1
    %8010 = torch.aten.add.Scalar %8008, %int1_11174, %int1_11175 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalise %7991 (the output of single_blocks.10) over its last dim without
    // learned affine parameters, then apply the modulation terms:
    //   %8019 = (1 + %8008) * norm(%7991) + %8007.
    // Statistics are computed in f32 (dtype code 6).
    %int6_11176 = torch.constant.int 6
    %8011 = torch.prims.convert_element_type %7991, %int6_11176 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    // var_mean over dim 2 with correction = 0 and keepdim = true.
    // result0 is the variance, result1 the mean; both are [1,4608,1].
    %int2_11177 = torch.constant.int 2
    %8012 = torch.prim.ListConstruct %int2_11177 : (!torch.int) -> !torch.list<int>
    %int0_11178 = torch.constant.int 0
    %true_11179 = torch.constant.bool true
    %result0_11180, %result1_11181 = torch.aten.var_mean.correction %8011, %8012, %int0_11178, %true_11179 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // rsqrt(variance + eps), with eps ~= 1e-6.
    %float9.999990e-07_11182 = torch.constant.float 9.9999999999999995E-7
    %int1_11183 = torch.constant.int 1
    %8013 = torch.aten.add.Scalar %result0_11180, %float9.999990e-07_11182, %int1_11183 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %8014 = torch.aten.rsqrt %8013 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // (x - mean) * rsqrt(var + eps). The f16 input is subtracted from the f32
    // mean directly, producing an f32 result.
    %int1_11184 = torch.constant.int 1
    %8015 = torch.aten.sub.Tensor %7991, %result1_11181, %int1_11184 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %8016 = torch.aten.mul.Tensor %8015, %8014 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    // Back to f16 (dtype code 5) before applying the modulation.
    %int5_11185 = torch.constant.int 5
    %8017 = torch.prims.convert_element_type %8016, %int5_11185 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Multiply by (1 + %8008), broadcast over the 4608 rows, then add %8007.
    %8018 = torch.aten.mul.Tensor %8010, %8017 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_11186 = torch.constant.int 1
    %8019 = torch.aten.add.Tensor %8018, %8007, %int1_11186 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int4608_11187 = torch.constant.int 4608
    %int3072_11188 = torch.constant.int 3072
    %8020 = torch.prim.ListConstruct %int4608_11187, %int3072_11188 : (!torch.int, !torch.int) -> !torch.list<int>
    %8021 = torch.aten.view %8019, %8020 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // single_blocks.11.linear1: computes %8021 @ W^T + b.
    // The [21504,3072] f16 weight is transposed to [3072,21504]; bias, activations and
    // weight are all upcast to f32 (dtype code 6 produces f32 here) before the matmul.
    %__auto.sampler.single_blocks.11.linear1.weight = util.global.load @__auto.sampler.single_blocks.11.linear1.weight : tensor<21504x3072xf16>
    %8022 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.11.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_11189 = torch.constant.int 0
    %int1_11190 = torch.constant.int 1
    %8023 = torch.aten.transpose.int %8022, %int0_11189, %int1_11190 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.11.linear1.bias = util.global.load @__auto.sampler.single_blocks.11.linear1.bias : tensor<21504xf16>
    %8024 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.11.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    %int6_11191 = torch.constant.int 6
    %8025 = torch.prims.convert_element_type %8024, %int6_11191 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_11192 = torch.constant.int 6
    %8026 = torch.prims.convert_element_type %8021, %int6_11192 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_11193 = torch.constant.int 6
    %8027 = torch.prims.convert_element_type %8023, %int6_11193 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %8028 = torch.aten.mm %8026, %8027 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Both products below multiply by the integer constant 1, so they leave the values
    // unchanged. NOTE(review): presumably the alpha/beta factors of a decomposed addmm.
    %int1_11194 = torch.constant.int 1
    %8029 = torch.aten.mul.Scalar %8028, %int1_11194 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_11195 = torch.constant.int 1
    %8030 = torch.aten.mul.Scalar %8025, %int1_11195 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    // Bias add: the [21504] bias broadcasts over the 4608 rows.
    %int1_11196 = torch.constant.int 1
    %8031 = torch.aten.add.Tensor %8029, %8030, %int1_11196 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    // Cast the f32 result back to f16 and restore the leading batch dimension of 1.
    %int5_11197 = torch.constant.int 5
    %8032 = torch.prims.convert_element_type %8031, %int5_11197 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_11198 = torch.constant.int 1
    %int4608_11199 = torch.constant.int 4608
    %int21504_11200 = torch.constant.int 21504
    %8033 = torch.prim.ListConstruct %int1_11198, %int4608_11199, %int21504_11200 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8034 = torch.aten.view %8032, %8033 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the linear1 output along the last dimension:
    //   %8035 = columns [0, 9216)      -> reshaped below into 3 x 24 x 128
    //   %8036 = columns [9216, 21504)  -> 12288 columns, consumed later by the GELU.
    %int-1_11201 = torch.constant.int -1
    %int0_11202 = torch.constant.int 0
    %int9216_11203 = torch.constant.int 9216
    %int1_11204 = torch.constant.int 1
    %8035 = torch.aten.slice.Tensor %8034, %int-1_11201, %int0_11202, %int9216_11203, %int1_11204 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_11205 = torch.constant.int -1
    %int9216_11206 = torch.constant.int 9216
    %int21504_11207 = torch.constant.int 21504
    %int1_11208 = torch.constant.int 1
    %8036 = torch.aten.slice.Tensor %8034, %int-1_11205, %int9216_11206, %int21504_11207, %int1_11208 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View the 9216 columns as [3, 24, 128], then permute with order (2,0,3,1,4) to get
    // [3, 1, 24, 4608, 128], so the size-3 axis is outermost.
    %int1_11209 = torch.constant.int 1
    %int4608_11210 = torch.constant.int 4608
    %int3_11211 = torch.constant.int 3
    %int24_11212 = torch.constant.int 24
    %int128_11213 = torch.constant.int 128
    %8037 = torch.prim.ListConstruct %int1_11209, %int4608_11210, %int3_11211, %int24_11212, %int128_11213 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8038 = torch.aten.view %8035, %8037 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    %int2_11214 = torch.constant.int 2
    %int0_11215 = torch.constant.int 0
    %int3_11216 = torch.constant.int 3
    %int1_11217 = torch.constant.int 1
    %int4_11218 = torch.constant.int 4
    %8039 = torch.prim.ListConstruct %int2_11214, %int0_11215, %int3_11216, %int1_11217, %int4_11218 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8040 = torch.aten.permute %8038, %8039 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index the leading axis: %8041 = slot 0, %8042 = slot 1, %8043 = slot 2.
    // Slots 0 and 1 are normalized with the query_norm / key_norm scales below; slot 2
    // is passed to the attention op as its third tensor operand.
    %int0_11219 = torch.constant.int 0
    %int0_11220 = torch.constant.int 0
    %8041 = torch.aten.select.int %8040, %int0_11219, %int0_11220 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_11221 = torch.constant.int 0
    %int1_11222 = torch.constant.int 1
    %8042 = torch.aten.select.int %8040, %int0_11221, %int1_11222 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_11223 = torch.constant.int 0
    %int2_11224 = torch.constant.int 2
    %8043 = torch.aten.select.int %8040, %int0_11223, %int2_11224 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Query normalization (RMS form), computed in f32:
    //   %8050 = x * rsqrt(mean(x^2 over the last dim, keepdim) + 9.9999999999999995E-7)
    // with x = %8041 upcast to f32. The result is cast to f16 and multiplied by the
    // per-channel [128] query_norm.scale parameter.
    %int6_11225 = torch.constant.int 6
    %8044 = torch.prims.convert_element_type %8041, %int6_11225 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_11226 = torch.constant.int 2
    %8045 = torch.aten.pow.Tensor_Scalar %8044, %int2_11226 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_11227 = torch.constant.int -1
    %8046 = torch.prim.ListConstruct %int-1_11227 : (!torch.int) -> !torch.list<int>
    %true_11228 = torch.constant.bool true
    %none_11229 = torch.constant.none
    %8047 = torch.aten.mean.dim %8045, %8046, %true_11228, %none_11229 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_11230 = torch.constant.float 9.9999999999999995E-7
    %int1_11231 = torch.constant.int 1
    %8048 = torch.aten.add.Scalar %8047, %float9.999990e-07_11230, %int1_11231 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %8049 = torch.aten.rsqrt %8048 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %8050 = torch.aten.mul.Tensor %8044, %8049 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11232 = torch.constant.int 5
    %8051 = torch.prims.convert_element_type %8050, %int5_11232 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.11.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.11.norm.query_norm.scale : tensor<128xf16>
    %8052 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.11.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %8053 = torch.aten.mul.Tensor %8051, %8052 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Key normalization: the same sequence of ops applied to %8042, scaled by key_norm.scale.
    %int6_11233 = torch.constant.int 6
    %8054 = torch.prims.convert_element_type %8042, %int6_11233 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_11234 = torch.constant.int 2
    %8055 = torch.aten.pow.Tensor_Scalar %8054, %int2_11234 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_11235 = torch.constant.int -1
    %8056 = torch.prim.ListConstruct %int-1_11235 : (!torch.int) -> !torch.list<int>
    %true_11236 = torch.constant.bool true
    %none_11237 = torch.constant.none
    %8057 = torch.aten.mean.dim %8055, %8056, %true_11236, %none_11237 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_11238 = torch.constant.float 9.9999999999999995E-7
    %int1_11239 = torch.constant.int 1
    %8058 = torch.aten.add.Scalar %8057, %float9.999990e-07_11238, %int1_11239 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %8059 = torch.aten.rsqrt %8058 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %8060 = torch.aten.mul.Tensor %8054, %8059 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11240 = torch.constant.int 5
    %8061 = torch.prims.convert_element_type %8060, %int5_11240 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.11.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.11.norm.key_norm.scale : tensor<128xf16>
    %8062 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.11.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %8063 = torch.aten.mul.Tensor %8061, %8062 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Combine the normalized query (%8053) and key (%8063) with the table %211.
    // NOTE(review): %211 ([1,1,4608,64,2,2] f32) is defined outside this chunk; the
    // arithmetic below looks like a rotary position embedding -- confirm at its definition.
    // The first two casts are f16 -> f16, i.e. they do not change the element type.
    %int5_11241 = torch.constant.int 5
    %8064 = torch.prims.convert_element_type %8053, %int5_11241 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_11242 = torch.constant.int 5
    %8065 = torch.prims.convert_element_type %8063, %int5_11242 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Upcast each tensor to f32 and view the 128 channels as [64, 1, 2].
    %int6_11243 = torch.constant.int 6
    %8066 = torch.prims.convert_element_type %8064, %int6_11243 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_11244 = torch.constant.int 1
    %int24_11245 = torch.constant.int 24
    %int4608_11246 = torch.constant.int 4608
    %int64_11247 = torch.constant.int 64
    %int1_11248 = torch.constant.int 1
    %int2_11249 = torch.constant.int 2
    %8067 = torch.prim.ListConstruct %int1_11244, %int24_11245, %int4608_11246, %int64_11247, %int1_11248, %int2_11249 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8068 = torch.aten.view %8066, %8067 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    %int6_11250 = torch.constant.int 6
    %8069 = torch.prims.convert_element_type %8065, %int6_11250 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_11251 = torch.constant.int 1
    %int24_11252 = torch.constant.int 24
    %int4608_11253 = torch.constant.int 4608
    %int64_11254 = torch.constant.int 64
    %int1_11255 = torch.constant.int 1
    %int2_11256 = torch.constant.int 2
    %8070 = torch.prim.ListConstruct %int1_11251, %int24_11252, %int4608_11253, %int64_11254, %int1_11255, %int2_11256 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8071 = torch.aten.view %8069, %8070 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Query path: %8078 = %211[..., 0] * q[..., 0] + %211[..., 1] * q[..., 1]
    // (indexing the last dimension; the size-1 dims broadcast to [1,24,4608,64,2]).
    %int5_11257 = torch.constant.int 5
    %int0_11258 = torch.constant.int 0
    %8072 = torch.aten.select.int %211, %int5_11257, %int0_11258 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11259 = torch.constant.int 5
    %int0_11260 = torch.constant.int 0
    %8073 = torch.aten.select.int %8068, %int5_11259, %int0_11260 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8074 = torch.aten.mul.Tensor %8072, %8073 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_11261 = torch.constant.int 5
    %int1_11262 = torch.constant.int 1
    %8075 = torch.aten.select.int %211, %int5_11261, %int1_11262 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11263 = torch.constant.int 5
    %int1_11264 = torch.constant.int 1
    %8076 = torch.aten.select.int %8068, %int5_11263, %int1_11264 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8077 = torch.aten.mul.Tensor %8075, %8076 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_11265 = torch.constant.int 1
    %8078 = torch.aten.add.Tensor %8074, %8077, %int1_11265 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key path: the same combination applied to %8071, producing %8085.
    %int5_11266 = torch.constant.int 5
    %int0_11267 = torch.constant.int 0
    %8079 = torch.aten.select.int %211, %int5_11266, %int0_11267 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11268 = torch.constant.int 5
    %int0_11269 = torch.constant.int 0
    %8080 = torch.aten.select.int %8071, %int5_11268, %int0_11269 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8081 = torch.aten.mul.Tensor %8079, %8080 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_11270 = torch.constant.int 5
    %int1_11271 = torch.constant.int 1
    %8082 = torch.aten.select.int %211, %int5_11270, %int1_11271 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11272 = torch.constant.int 5
    %int1_11273 = torch.constant.int 1
    %8083 = torch.aten.select.int %8071, %int5_11272, %int1_11273 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8084 = torch.aten.mul.Tensor %8082, %8083 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_11274 = torch.constant.int 1
    %8085 = torch.aten.add.Tensor %8081, %8084, %int1_11274 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Merge the trailing [64, 2] dims back into 128 channels and cast both results to f16.
    %int1_11275 = torch.constant.int 1
    %int24_11276 = torch.constant.int 24
    %int4608_11277 = torch.constant.int 4608
    %int128_11278 = torch.constant.int 128
    %8086 = torch.prim.ListConstruct %int1_11275, %int24_11276, %int4608_11277, %int128_11278 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8087 = torch.aten.view %8078, %8086 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11279 = torch.constant.int 5
    %8088 = torch.prims.convert_element_type %8087, %int5_11279 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_11280 = torch.constant.int 1
    %int24_11281 = torch.constant.int 24
    %int4608_11282 = torch.constant.int 4608
    %int128_11283 = torch.constant.int 128
    %8089 = torch.prim.ListConstruct %int1_11280, %int24_11281, %int4608_11282, %int128_11283 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8090 = torch.aten.view %8085, %8089 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11284 = torch.constant.int 5
    %8091 = torch.prims.convert_element_type %8090, %int5_11284 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Attention over 24 heads of width 128 across 4608 positions, in f16.
    // Tensor operands, in order: %8088 (from the query path), %8091 (from the key path)
    // and %8043 (slot 2 of the split above). The remaining operands are the float 0.0,
    // the bool false, and two `none` values.
    // NOTE(review): per the aten op signature these should be dropout_p, is_causal,
    // attn_mask and scale -- confirm against the PyTorch operator schema.
    %float0.000000e00_11285 = torch.constant.float 0.000000e+00
    %false_11286 = torch.constant.bool false
    %none_11287 = torch.constant.none
    %none_11288 = torch.constant.none
    %8092:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%8088, %8091, %8043, %float0.000000e00_11285, %false_11286, %none_11287, %none_11288) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Only result #0 is used in the visible code. Swap dims 1 and 2 to get
    // [1,4608,24,128], then merge the last two dims into 24 * 128 = 3072 channels.
    %int0_11289 = torch.constant.int 0
    %int2_11290 = torch.constant.int 2
    %int1_11291 = torch.constant.int 1
    %int3_11292 = torch.constant.int 3
    %8093 = torch.prim.ListConstruct %int0_11289, %int2_11290, %int1_11291, %int3_11292 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8094 = torch.aten.permute %8092#0, %8093 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_11293 = torch.constant.int 1
    %int4608_11294 = torch.constant.int 4608
    %int3072_11295 = torch.constant.int 3072
    %8095 = torch.prim.ListConstruct %int1_11293, %int4608_11294, %int3072_11295 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8096 = torch.aten.view %8094, %8095 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Apply GELU with the "tanh" approximation to the 12288-column slice %8036, then
    // concatenate [attention output (3072) | GELU output (12288)] along dim 2 to get
    // 15360 channels, and flatten to [4608,15360] for the linear2 matmul.
    %str_11296 = torch.constant.str "tanh"
    %8097 = torch.aten.gelu %8036, %str_11296 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %8098 = torch.prim.ListConstruct %8096, %8097 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_11297 = torch.constant.int 2
    %8099 = torch.aten.cat %8098, %int2_11297 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    %int4608_11298 = torch.constant.int 4608
    %int15360_11299 = torch.constant.int 15360
    %8100 = torch.prim.ListConstruct %int4608_11298, %int15360_11299 : (!torch.int, !torch.int) -> !torch.list<int>
    %8101 = torch.aten.view %8099, %8100 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // single_blocks.11.linear2: computes %8101 @ W^T + b, mapping 15360 -> 3072 channels.
    // As with linear1, the weight is transposed and all operands are upcast to f32 for
    // the matmul; the mul.Scalar ops multiply by the integer constant 1.
    %__auto.sampler.single_blocks.11.linear2.weight = util.global.load @__auto.sampler.single_blocks.11.linear2.weight : tensor<3072x15360xf16>
    %8102 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.11.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_11300 = torch.constant.int 0
    %int1_11301 = torch.constant.int 1
    %8103 = torch.aten.transpose.int %8102, %int0_11300, %int1_11301 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.11.linear2.bias = util.global.load @__auto.sampler.single_blocks.11.linear2.bias : tensor<3072xf16>
    %8104 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.11.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_11302 = torch.constant.int 6
    %8105 = torch.prims.convert_element_type %8104, %int6_11302 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_11303 = torch.constant.int 6
    %8106 = torch.prims.convert_element_type %8101, %int6_11303 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_11304 = torch.constant.int 6
    %8107 = torch.prims.convert_element_type %8103, %int6_11304 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %8108 = torch.aten.mm %8106, %8107 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    %int1_11305 = torch.constant.int 1
    %8109 = torch.aten.mul.Scalar %8108, %int1_11305 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_11306 = torch.constant.int 1
    %8110 = torch.aten.mul.Scalar %8105, %int1_11306 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_11307 = torch.constant.int 1
    %8111 = torch.aten.add.Tensor %8109, %8110, %int1_11307 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_11308 = torch.constant.int 5
    %8112 = torch.prims.convert_element_type %8111, %int5_11308 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_11309 = torch.constant.int 1
    %int4608_11310 = torch.constant.int 4608
    %int3072_11311 = torch.constant.int 3072
    %8113 = torch.prim.ListConstruct %int1_11309, %int4608_11310, %int3072_11311 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8114 = torch.aten.view %8112, %8113 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Multiply the linear2 output by %8009 (broadcast over the 4608 positions) and add
    // the product to %7991; %8116 is the input to the next block's normalization.
    // NOTE(review): %8009 and %7991 are defined outside this chunk; they look like the
    // block's modulation gate and its residual input -- confirm.
    %8115 = torch.aten.mul.Tensor %8009, %8114 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_11312 = torch.constant.int 1
    %8116 = torch.aten.add.Tensor %7991, %8115, %int1_11312 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.12.modulation: apply SiLU to %119, then the modulation.lin layer
    // (3072 -> 9216 channels, matmul in f32, result cast to f16).
    // NOTE(review): %119 ([1,3072] f16) is defined outside this chunk; presumably the
    // conditioning vector shared by all blocks -- confirm.
    %8117 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.12.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.12.modulation.lin.weight : tensor<9216x3072xf16>
    %8118 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.12.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_11313 = torch.constant.int 0
    %int1_11314 = torch.constant.int 1
    %8119 = torch.aten.transpose.int %8118, %int0_11313, %int1_11314 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.12.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.12.modulation.lin.bias : tensor<9216xf16>
    %8120 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.12.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    %int6_11315 = torch.constant.int 6
    %8121 = torch.prims.convert_element_type %8120, %int6_11315 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_11316 = torch.constant.int 6
    %8122 = torch.prims.convert_element_type %8117, %int6_11316 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_11317 = torch.constant.int 6
    %8123 = torch.prims.convert_element_type %8119, %int6_11317 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %8124 = torch.aten.mm %8122, %8123 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    %int1_11318 = torch.constant.int 1
    %8125 = torch.aten.mul.Scalar %8124, %int1_11318 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_11319 = torch.constant.int 1
    %8126 = torch.aten.mul.Scalar %8121, %int1_11319 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_11320 = torch.constant.int 1
    %8127 = torch.aten.add.Tensor %8125, %8126, %int1_11320 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_11321 = torch.constant.int 5
    %8128 = torch.prims.convert_element_type %8127, %int5_11321 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1), unsqueeze at dim 1, and
    // full-range slice on dim 2: together these turn [1,9216] into [1,1,9216] without
    // dropping any elements.
    %int0_11322 = torch.constant.int 0
    %int0_11323 = torch.constant.int 0
    %int9223372036854775807_11324 = torch.constant.int 9223372036854775807
    %int1_11325 = torch.constant.int 1
    %8129 = torch.aten.slice.Tensor %8128, %int0_11322, %int0_11323, %int9223372036854775807_11324, %int1_11325 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    %int1_11326 = torch.constant.int 1
    %8130 = torch.aten.unsqueeze %8129, %int1_11326 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int2_11327 = torch.constant.int 2
    %int0_11328 = torch.constant.int 0
    %int9223372036854775807_11329 = torch.constant.int 9223372036854775807
    %int1_11330 = torch.constant.int 1
    %8131 = torch.aten.slice.Tensor %8130, %int2_11327, %int0_11328, %int9223372036854775807_11329, %int1_11330 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Split the 9216 channels into three 3072-wide chunks:
    //   %8132 = [0, 3072)     -> added after normalization below (acts as the shift)
    //   %8133 = [3072, 6144)  -> incremented by 1 into %8135 and multiplied in below (scale)
    //   %8134 = [6144, 9216)  -> not used in the visible code.
    //                            NOTE(review): presumably the output gate -- confirm downstream.
    %int-1_11331 = torch.constant.int -1
    %int0_11332 = torch.constant.int 0
    %int3072_11333 = torch.constant.int 3072
    %int1_11334 = torch.constant.int 1
    %8132 = torch.aten.slice.Tensor %8131, %int-1_11331, %int0_11332, %int3072_11333, %int1_11334 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_11335 = torch.constant.int -1
    %int3072_11336 = torch.constant.int 3072
    %int6144_11337 = torch.constant.int 6144
    %int1_11338 = torch.constant.int 1
    %8133 = torch.aten.slice.Tensor %8131, %int-1_11335, %int3072_11336, %int6144_11337, %int1_11338 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_11339 = torch.constant.int -1
    %int6144_11340 = torch.constant.int 6144
    %int9216_11341 = torch.constant.int 9216
    %int1_11342 = torch.constant.int 1
    %8134 = torch.aten.slice.Tensor %8131, %int-1_11339, %int6144_11340, %int9216_11341, %int1_11342 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %8135 = %8133 + 1 (scalar add with alpha = 1).
    %int1_11343 = torch.constant.int 1
    %int1_11344 = torch.constant.int 1
    %8135 = torch.aten.add.Scalar %8133, %int1_11343, %int1_11344 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalization of %8116 over the channel dim (dim 2) with no learned affine term:
    //   var, mean = var_mean(x in f32, dim=2, correction=0, keepdim=true)
    //   %8141     = (x - mean) * rsqrt(var + 9.9999999999999995E-7)
    %int6_11345 = torch.constant.int 6
    %8136 = torch.prims.convert_element_type %8116, %int6_11345 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_11346 = torch.constant.int 2
    %8137 = torch.prim.ListConstruct %int2_11346 : (!torch.int) -> !torch.list<int>
    %int0_11347 = torch.constant.int 0
    %true_11348 = torch.constant.bool true
    %result0_11349, %result1_11350 = torch.aten.var_mean.correction %8136, %8137, %int0_11347, %true_11348 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    %float9.999990e-07_11351 = torch.constant.float 9.9999999999999995E-7
    %int1_11352 = torch.constant.int 1
    %8138 = torch.aten.add.Scalar %result0_11349, %float9.999990e-07_11351, %int1_11352 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %8139 = torch.aten.rsqrt %8138 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the f16 tensor %8116 and the f32 mean directly; its result
    // type is f32, so the centered values stay in f32 until the cast below.
    %int1_11353 = torch.constant.int 1
    %8140 = torch.aten.sub.Tensor %8116, %result1_11350, %int1_11353 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %8141 = torch.aten.mul.Tensor %8140, %8139 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_11354 = torch.constant.int 5
    %8142 = torch.prims.convert_element_type %8141, %int5_11354 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Modulate in f16: %8144 = %8135 * normalized + %8132, where %8135 = 1 + %8133.
    %8143 = torch.aten.mul.Tensor %8135, %8142 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_11355 = torch.constant.int 1
    %8144 = torch.aten.add.Tensor %8143, %8132, %int1_11355 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Flatten [1,4608,3072] -> [4608,3072] for the linear1 matmul.
    %int4608_11356 = torch.constant.int 4608
    %int3072_11357 = torch.constant.int 3072
    %8145 = torch.prim.ListConstruct %int4608_11356, %int3072_11357 : (!torch.int, !torch.int) -> !torch.list<int>
    %8146 = torch.aten.view %8144, %8145 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // single_blocks.12.linear1: computes %8146 @ W^T + b, mapping 3072 -> 21504 channels.
    // The [21504,3072] f16 weight is transposed; bias, activations and weight are upcast
    // to f32 (dtype code 6 produces f32 here) before the matmul.
    %__auto.sampler.single_blocks.12.linear1.weight = util.global.load @__auto.sampler.single_blocks.12.linear1.weight : tensor<21504x3072xf16>
    %8147 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.12.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_11358 = torch.constant.int 0
    %int1_11359 = torch.constant.int 1
    %8148 = torch.aten.transpose.int %8147, %int0_11358, %int1_11359 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.12.linear1.bias = util.global.load @__auto.sampler.single_blocks.12.linear1.bias : tensor<21504xf16>
    %8149 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.12.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    %int6_11360 = torch.constant.int 6
    %8150 = torch.prims.convert_element_type %8149, %int6_11360 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_11361 = torch.constant.int 6
    %8151 = torch.prims.convert_element_type %8146, %int6_11361 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_11362 = torch.constant.int 6
    %8152 = torch.prims.convert_element_type %8148, %int6_11362 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %8153 = torch.aten.mm %8151, %8152 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Both products below multiply by the integer constant 1, so they leave the values unchanged.
    %int1_11363 = torch.constant.int 1
    %8154 = torch.aten.mul.Scalar %8153, %int1_11363 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_11364 = torch.constant.int 1
    %8155 = torch.aten.mul.Scalar %8150, %int1_11364 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    // Bias add (broadcast over rows), cast to f16, and restore the leading batch dim of 1.
    %int1_11365 = torch.constant.int 1
    %8156 = torch.aten.add.Tensor %8154, %8155, %int1_11365 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_11366 = torch.constant.int 5
    %8157 = torch.prims.convert_element_type %8156, %int5_11366 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_11367 = torch.constant.int 1
    %int4608_11368 = torch.constant.int 4608
    %int21504_11369 = torch.constant.int 21504
    %8158 = torch.prim.ListConstruct %int1_11367, %int4608_11368, %int21504_11369 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8159 = torch.aten.view %8157, %8158 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the linear1 output along the last dimension:
    //   %8160 = columns [0, 9216)      -> reshaped below into 3 x 24 x 128
    //   %8161 = columns [9216, 21504)  -> 12288 columns, not used in the visible code.
    //                                     NOTE(review): presumably the MLP branch -- confirm downstream.
    %int-1_11370 = torch.constant.int -1
    %int0_11371 = torch.constant.int 0
    %int9216_11372 = torch.constant.int 9216
    %int1_11373 = torch.constant.int 1
    %8160 = torch.aten.slice.Tensor %8159, %int-1_11370, %int0_11371, %int9216_11372, %int1_11373 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_11374 = torch.constant.int -1
    %int9216_11375 = torch.constant.int 9216
    %int21504_11376 = torch.constant.int 21504
    %int1_11377 = torch.constant.int 1
    %8161 = torch.aten.slice.Tensor %8159, %int-1_11374, %int9216_11375, %int21504_11376, %int1_11377 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View the 9216 columns as [3, 24, 128], then permute with order (2,0,3,1,4) to get
    // [3, 1, 24, 4608, 128], so the size-3 axis is outermost.
    %int1_11378 = torch.constant.int 1
    %int4608_11379 = torch.constant.int 4608
    %int3_11380 = torch.constant.int 3
    %int24_11381 = torch.constant.int 24
    %int128_11382 = torch.constant.int 128
    %8162 = torch.prim.ListConstruct %int1_11378, %int4608_11379, %int3_11380, %int24_11381, %int128_11382 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8163 = torch.aten.view %8160, %8162 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    %int2_11383 = torch.constant.int 2
    %int0_11384 = torch.constant.int 0
    %int3_11385 = torch.constant.int 3
    %int1_11386 = torch.constant.int 1
    %int4_11387 = torch.constant.int 4
    %8164 = torch.prim.ListConstruct %int2_11383, %int0_11384, %int3_11385, %int1_11386, %int4_11387 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8165 = torch.aten.permute %8163, %8164 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index the leading axis: %8166 = slot 0, %8167 = slot 1, %8168 = slot 2.
    // NOTE(review): by analogy with the preceding block these are query, key and value;
    // the use of %8168 is beyond the end of this chunk -- confirm.
    %int0_11388 = torch.constant.int 0
    %int0_11389 = torch.constant.int 0
    %8166 = torch.aten.select.int %8165, %int0_11388, %int0_11389 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_11390 = torch.constant.int 0
    %int1_11391 = torch.constant.int 1
    %8167 = torch.aten.select.int %8165, %int0_11390, %int1_11391 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_11392 = torch.constant.int 0
    %int2_11393 = torch.constant.int 2
    %8168 = torch.aten.select.int %8165, %int0_11392, %int2_11393 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.12 query normalization (RMS form), computed in f32:
    //   %8178 = f16( x * rsqrt(mean(x^2, dim=-1, keepdim=true) + ~1e-6) ) * query_norm.scale
    // Dtype code 6 yields f32 and code 5 yields f16, as the result types below show.
    %int6_11394 = torch.constant.int 6
    %8169 = torch.prims.convert_element_type %8166, %int6_11394 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_11395 = torch.constant.int 2
    %8170 = torch.aten.pow.Tensor_Scalar %8169, %int2_11395 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean of squares over the last (size-128) axis, keeping it as size 1 for broadcasting.
    %int-1_11396 = torch.constant.int -1
    %8171 = torch.prim.ListConstruct %int-1_11396 : (!torch.int) -> !torch.list<int>
    %true_11397 = torch.constant.bool true
    %none_11398 = torch.constant.none
    %8172 = torch.aten.mean.dim %8170, %8171, %true_11397, %none_11398 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // Epsilon added before rsqrt; the literal is the nearest double to 1e-6.
    %float9.999990e-07_11399 = torch.constant.float 9.9999999999999995E-7
    %int1_11400 = torch.constant.int 1
    %8173 = torch.aten.add.Scalar %8172, %float9.999990e-07_11399, %int1_11400 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %8174 = torch.aten.rsqrt %8173 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %8175 = torch.aten.mul.Tensor %8169, %8174 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    // Cast back to f16 before applying the learned per-channel scale (itself f16).
    %int5_11401 = torch.constant.int 5
    %8176 = torch.prims.convert_element_type %8175, %int5_11401 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.12.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.12.norm.query_norm.scale : tensor<128xf16>
    %8177 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.12.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %8178 = torch.aten.mul.Tensor %8176, %8177 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.12 key normalization: same op sequence as the query path, applied to %8167
    // and scaled by key_norm.scale:
    //   %8188 = f16( x * rsqrt(mean(x^2, dim=-1, keepdim=true) + ~1e-6) ) * key_norm.scale
    %int6_11402 = torch.constant.int 6
    %8179 = torch.prims.convert_element_type %8167, %int6_11402 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_11403 = torch.constant.int 2
    %8180 = torch.aten.pow.Tensor_Scalar %8179, %int2_11403 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean of squares over the last axis, keepdim=true.
    %int-1_11404 = torch.constant.int -1
    %8181 = torch.prim.ListConstruct %int-1_11404 : (!torch.int) -> !torch.list<int>
    %true_11405 = torch.constant.bool true
    %none_11406 = torch.constant.none
    %8182 = torch.aten.mean.dim %8180, %8181, %true_11405, %none_11406 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // Epsilon (~1e-6) added before rsqrt.
    %float9.999990e-07_11407 = torch.constant.float 9.9999999999999995E-7
    %int1_11408 = torch.constant.int 1
    %8183 = torch.aten.add.Scalar %8182, %float9.999990e-07_11407, %int1_11408 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %8184 = torch.aten.rsqrt %8183 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %8185 = torch.aten.mul.Tensor %8179, %8184 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    // Back to f16, then apply the learned f16 scale of length 128.
    %int5_11409 = torch.constant.int 5
    %8186 = torch.prims.convert_element_type %8185, %int5_11409 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.12.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.12.norm.key_norm.scale : tensor<128xf16>
    %8187 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.12.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %8188 = torch.aten.mul.Tensor %8186, %8187 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.12: prepare the normalized tensors %8178 and %8188 for the pairwise transform that follows.
    // The first two casts are f16 -> f16, i.e. type-preserving (presumably left over from a
    // `.to(dtype)` in the traced model -- confirm); they are kept as exported.
    %int5_11410 = torch.constant.int 5
    %8189 = torch.prims.convert_element_type %8178, %int5_11410 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_11411 = torch.constant.int 5
    %8190 = torch.prims.convert_element_type %8188, %int5_11411 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Upcast %8189 to f32 and view the last axis 128 as (64, 1, 2): 64 pairs of adjacent elements.
    %int6_11412 = torch.constant.int 6
    %8191 = torch.prims.convert_element_type %8189, %int6_11412 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_11413 = torch.constant.int 1
    %int24_11414 = torch.constant.int 24
    %int4608_11415 = torch.constant.int 4608
    %int64_11416 = torch.constant.int 64
    %int1_11417 = torch.constant.int 1
    %int2_11418 = torch.constant.int 2
    %8192 = torch.prim.ListConstruct %int1_11413, %int24_11414, %int4608_11415, %int64_11416, %int1_11417, %int2_11418 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8193 = torch.aten.view %8191, %8192 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Same upcast and pair view for %8190.
    %int6_11419 = torch.constant.int 6
    %8194 = torch.prims.convert_element_type %8190, %int6_11419 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_11420 = torch.constant.int 1
    %int24_11421 = torch.constant.int 24
    %int4608_11422 = torch.constant.int 4608
    %int64_11423 = torch.constant.int 64
    %int1_11424 = torch.constant.int 1
    %int2_11425 = torch.constant.int 2
    %8195 = torch.prim.ListConstruct %int1_11420, %int24_11421, %int4608_11422, %int64_11423, %int1_11424, %int2_11425 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8196 = torch.aten.view %8194, %8195 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // single_blocks.12: apply the per-position 2x2 table %211 ([1,1,4608,64,2,2] f32, defined outside
    // this chunk) to each element pair of %8193 and %8196. With T = %211 and x = the pair view:
    //   out[..., p, j] = T[..., p, j, 0] * x[..., p, 0, 0] + T[..., p, j, 1] * x[..., p, 0, 1]
    // The size-1 axes (dim 1 of T, dim 4 of the selected x) broadcast across the 24-axis and j.
    // NOTE(review): this has the shape of a rotary position embedding; %211 is presumably the
    // cos/sin table -- confirm where %211 is built.
    //
    // First operand (%8193): term for column 0 of T.
    %int5_11426 = torch.constant.int 5
    %int0_11427 = torch.constant.int 0
    %8197 = torch.aten.select.int %211, %int5_11426, %int0_11427 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11428 = torch.constant.int 5
    %int0_11429 = torch.constant.int 0
    %8198 = torch.aten.select.int %8193, %int5_11428, %int0_11429 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8199 = torch.aten.mul.Tensor %8197, %8198 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // First operand: term for column 1 of T.
    %int5_11430 = torch.constant.int 5
    %int1_11431 = torch.constant.int 1
    %8200 = torch.aten.select.int %211, %int5_11430, %int1_11431 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11432 = torch.constant.int 5
    %int1_11433 = torch.constant.int 1
    %8201 = torch.aten.select.int %8193, %int5_11432, %int1_11433 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8202 = torch.aten.mul.Tensor %8200, %8201 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two terms (alpha = 1).
    %int1_11434 = torch.constant.int 1
    %8203 = torch.aten.add.Tensor %8199, %8202, %int1_11434 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second operand (%8196): identical sequence, re-selecting the same slices of %211.
    %int5_11435 = torch.constant.int 5
    %int0_11436 = torch.constant.int 0
    %8204 = torch.aten.select.int %211, %int5_11435, %int0_11436 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11437 = torch.constant.int 5
    %int0_11438 = torch.constant.int 0
    %8205 = torch.aten.select.int %8196, %int5_11437, %int0_11438 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8206 = torch.aten.mul.Tensor %8204, %8205 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_11439 = torch.constant.int 5
    %int1_11440 = torch.constant.int 1
    %8207 = torch.aten.select.int %211, %int5_11439, %int1_11440 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11441 = torch.constant.int 5
    %int1_11442 = torch.constant.int 1
    %8208 = torch.aten.select.int %8196, %int5_11441, %int1_11442 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8209 = torch.aten.mul.Tensor %8207, %8208 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_11443 = torch.constant.int 1
    %8210 = torch.aten.add.Tensor %8206, %8209, %int1_11443 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Flatten the (64, 2) pair axes back to 128 and cast to f16: %8213 from the first operand.
    %int1_11444 = torch.constant.int 1
    %int24_11445 = torch.constant.int 24
    %int4608_11446 = torch.constant.int 4608
    %int128_11447 = torch.constant.int 128
    %8211 = torch.prim.ListConstruct %int1_11444, %int24_11445, %int4608_11446, %int128_11447 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8212 = torch.aten.view %8203, %8211 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11448 = torch.constant.int 5
    %8213 = torch.prims.convert_element_type %8212, %int5_11448 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Same flatten and cast for the second operand: %8216.
    %int1_11449 = torch.constant.int 1
    %int24_11450 = torch.constant.int 24
    %int4608_11451 = torch.constant.int 4608
    %int128_11452 = torch.constant.int 128
    %8214 = torch.prim.ListConstruct %int1_11449, %int24_11450, %int4608_11451, %int128_11452 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8215 = torch.aten.view %8210, %8214 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11453 = torch.constant.int 5
    %8216 = torch.prims.convert_element_type %8215, %int5_11453 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.12 attention over [1,24,4608,128] f16 operands: %8213, %8216 (both transformed above)
    // and %8168 (taken directly from the split, with no norm or pair transform).
    // The trailing operands are 0.0, false, none, none. Per the ATen signature these are presumably
    // dropout_p, is_causal, attn_mask and scale -- confirm against native_functions.yaml.
    // Only result #0 is used in this chunk; result #1 ([1,24,4608] f32) is not consumed here.
    %float0.000000e00_11454 = torch.constant.float 0.000000e+00
    %false_11455 = torch.constant.bool false
    %none_11456 = torch.constant.none
    %none_11457 = torch.constant.none
    %8217:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%8213, %8216, %8168, %float0.000000e00_11454, %false_11455, %none_11456, %none_11457) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Permute (0,2,1,3): [1,24,4608,128] -> [1,4608,24,128].
    %int0_11458 = torch.constant.int 0
    %int2_11459 = torch.constant.int 2
    %int1_11460 = torch.constant.int 1
    %int3_11461 = torch.constant.int 3
    %8218 = torch.prim.ListConstruct %int0_11458, %int2_11459, %int1_11460, %int3_11461 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8219 = torch.aten.permute %8217#0, %8218 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the last two axes: 24 * 128 = 3072.
    %int1_11462 = torch.constant.int 1
    %int4608_11463 = torch.constant.int 4608
    %int3072_11464 = torch.constant.int 3072
    %8220 = torch.prim.ListConstruct %int1_11462, %int4608_11463, %int3072_11464 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8221 = torch.aten.view %8219, %8220 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.12 output stage:
    //   1. tanh-approximated GELU on %8161 ([1,4608,12288], the 9216:21504 slice of the fused projection);
    //   2. concatenate with the attention output %8221 on the last axis -> [1,4608,15360];
    //   3. linear2 (weight [3072,15360], bias [3072]) evaluated in f32, result cast to f16;
    //   4. multiply by %8134 and add to %8116 (both defined outside this chunk; presumably the
    //      modulation gate and the block input / residual stream -- confirm).
    %str_11465 = torch.constant.str "tanh"
    %8222 = torch.aten.gelu %8161, %str_11465 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %8223 = torch.prim.ListConstruct %8221, %8222 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_11466 = torch.constant.int 2
    %8224 = torch.aten.cat %8223, %int2_11466 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Drop the leading size-1 axis so the projection is a plain 2-D matmul.
    %int4608_11467 = torch.constant.int 4608
    %int15360_11468 = torch.constant.int 15360
    %8225 = torch.prim.ListConstruct %int4608_11467, %int15360_11468 : (!torch.int, !torch.int) -> !torch.list<int>
    %8226 = torch.aten.view %8224, %8225 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // Weight is stored [out=3072, in=15360]; transpose to [15360,3072] for x @ W^T.
    %__auto.sampler.single_blocks.12.linear2.weight = util.global.load @__auto.sampler.single_blocks.12.linear2.weight : tensor<3072x15360xf16>
    %8227 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.12.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_11469 = torch.constant.int 0
    %int1_11470 = torch.constant.int 1
    %8228 = torch.aten.transpose.int %8227, %int0_11469, %int1_11470 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.12.linear2.bias = util.global.load @__auto.sampler.single_blocks.12.linear2.bias : tensor<3072xf16>
    %8229 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.12.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, activations and weight to f32 so the matmul and bias add run in f32.
    %int6_11471 = torch.constant.int 6
    %8230 = torch.prims.convert_element_type %8229, %int6_11471 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_11472 = torch.constant.int 6
    %8231 = torch.prims.convert_element_type %8226, %int6_11472 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_11473 = torch.constant.int 6
    %8232 = torch.prims.convert_element_type %8228, %int6_11473 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %8233 = torch.aten.mm %8231, %8232 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Both multiplications by the integer 1 are numerically identity
    // (presumably the alpha/beta factors of a decomposed addmm -- confirm).
    %int1_11474 = torch.constant.int 1
    %8234 = torch.aten.mul.Scalar %8233, %int1_11474 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_11475 = torch.constant.int 1
    %8235 = torch.aten.mul.Scalar %8230, %int1_11475 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_11476 = torch.constant.int 1
    %8236 = torch.aten.add.Tensor %8234, %8235, %int1_11476 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    // Back to f16 and restore the leading size-1 axis.
    %int5_11477 = torch.constant.int 5
    %8237 = torch.prims.convert_element_type %8236, %int5_11477 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_11478 = torch.constant.int 1
    %int4608_11479 = torch.constant.int 4608
    %int3072_11480 = torch.constant.int 3072
    %8238 = torch.prim.ListConstruct %int1_11478, %int4608_11479, %int3072_11480 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8239 = torch.aten.view %8237, %8238 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // %8134 ([1,1,3072]) broadcasts over the 4608 axis; %8241 is the input to the next block's pre-norm.
    %8240 = torch.aten.mul.Tensor %8134, %8239 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_11481 = torch.constant.int 1
    %8241 = torch.aten.add.Tensor %8116, %8240, %int1_11481 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.13 modulation: SiLU of %119 ([1,3072] f16, defined outside this chunk; presumably the
    // conditioning vector -- confirm), then modulation.lin (weight [9216,3072], bias [9216]) evaluated
    // in f32 and cast to f16. The [1,1,9216] result is cut into three [1,1,3072] pieces:
    //   %8257 = [0:3072]      added after the pre-norm (L21207)
    //   %8258 = [3072:6144]   turned into %8260 = 1 + %8258, which multiplies the pre-norm output (L21205)
    //   %8259 = [6144:9216]   not consumed in this chunk (presumably the gate for this block's
    //                         linear2 output, by analogy with %8134 at L21133 -- confirm)
    %8242 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Weight is stored [out=9216, in=3072]; transpose for x @ W^T.
    %__auto.sampler.single_blocks.13.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.13.modulation.lin.weight : tensor<9216x3072xf16>
    %8243 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.13.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_11482 = torch.constant.int 0
    %int1_11483 = torch.constant.int 1
    %8244 = torch.aten.transpose.int %8243, %int0_11482, %int1_11483 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.13.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.13.modulation.lin.bias : tensor<9216xf16>
    %8245 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.13.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, input and weight to f32 for the matmul and bias add.
    %int6_11484 = torch.constant.int 6
    %8246 = torch.prims.convert_element_type %8245, %int6_11484 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_11485 = torch.constant.int 6
    %8247 = torch.prims.convert_element_type %8242, %int6_11485 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_11486 = torch.constant.int 6
    %8248 = torch.prims.convert_element_type %8244, %int6_11486 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %8249 = torch.aten.mm %8247, %8248 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Multiplications by the integer 1 are numerically identity.
    %int1_11487 = torch.constant.int 1
    %8250 = torch.aten.mul.Scalar %8249, %int1_11487 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_11488 = torch.constant.int 1
    %8251 = torch.aten.mul.Scalar %8246, %int1_11488 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_11489 = torch.constant.int 1
    %8252 = torch.aten.add.Tensor %8250, %8251, %int1_11489 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_11490 = torch.constant.int 5
    %8253 = torch.prims.convert_element_type %8252, %int5_11490 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Full-range slice on dim 0 (end = INT64_MAX); the result shape is unchanged.
    %int0_11491 = torch.constant.int 0
    %int0_11492 = torch.constant.int 0
    %int9223372036854775807_11493 = torch.constant.int 9223372036854775807
    %int1_11494 = torch.constant.int 1
    %8254 = torch.aten.slice.Tensor %8253, %int0_11491, %int0_11492, %int9223372036854775807_11493, %int1_11494 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a size-1 axis at position 1 so the pieces broadcast over the 4608 axis later.
    %int1_11495 = torch.constant.int 1
    %8255 = torch.aten.unsqueeze %8254, %int1_11495 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Full-range slice on dim 2; the result shape is unchanged.
    %int2_11496 = torch.constant.int 2
    %int0_11497 = torch.constant.int 0
    %int9223372036854775807_11498 = torch.constant.int 9223372036854775807
    %int1_11499 = torch.constant.int 1
    %8256 = torch.aten.slice.Tensor %8255, %int2_11496, %int0_11497, %int9223372036854775807_11498, %int1_11499 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Piece 0: last-axis range [0, 3072).
    %int-1_11500 = torch.constant.int -1
    %int0_11501 = torch.constant.int 0
    %int3072_11502 = torch.constant.int 3072
    %int1_11503 = torch.constant.int 1
    %8257 = torch.aten.slice.Tensor %8256, %int-1_11500, %int0_11501, %int3072_11502, %int1_11503 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Piece 1: last-axis range [3072, 6144).
    %int-1_11504 = torch.constant.int -1
    %int3072_11505 = torch.constant.int 3072
    %int6144_11506 = torch.constant.int 6144
    %int1_11507 = torch.constant.int 1
    %8258 = torch.aten.slice.Tensor %8256, %int-1_11504, %int3072_11505, %int6144_11506, %int1_11507 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Piece 2: last-axis range [6144, 9216).
    %int-1_11508 = torch.constant.int -1
    %int6144_11509 = torch.constant.int 6144
    %int9216_11510 = torch.constant.int 9216
    %int1_11511 = torch.constant.int 1
    %8259 = torch.aten.slice.Tensor %8256, %int-1_11508, %int6144_11509, %int9216_11510, %int1_11511 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %8260 = %8258 + 1 (scalar 1 with alpha 1), computed in f16.
    %int1_11512 = torch.constant.int 1
    %int1_11513 = torch.constant.int 1
    %8260 = torch.aten.add.Scalar %8258, %int1_11512, %int1_11513 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // single_blocks.13 pre-norm and modulation of %8241 ([1,4608,3072] f16):
    //   normed = f16( (x - mean) * rsqrt(var + ~1e-6) )   statistics over dim 2 in f32, no learned affine here
    //   %8269  = %8260 * normed + %8257                    i.e. (1 + %8258) * normed + %8257
    %int6_11514 = torch.constant.int 6
    %8261 = torch.prims.convert_element_type %8241, %int6_11514 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    // var_mean over dim 2 with correction = 0 and keepdim = true.
    // By the op name, result0 is the variance and result1 the mean.
    %int2_11515 = torch.constant.int 2
    %8262 = torch.prim.ListConstruct %int2_11515 : (!torch.int) -> !torch.list<int>
    %int0_11516 = torch.constant.int 0
    %true_11517 = torch.constant.bool true
    %result0_11518, %result1_11519 = torch.aten.var_mean.correction %8261, %8262, %int0_11516, %true_11517 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // Epsilon (~1e-6) added to the variance before rsqrt.
    %float9.999990e-07_11520 = torch.constant.float 9.9999999999999995E-7
    %int1_11521 = torch.constant.int 1
    %8263 = torch.aten.add.Scalar %result0_11518, %float9.999990e-07_11520, %int1_11521 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %8264 = torch.aten.rsqrt %8263 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the f16 tensor %8241 (not its f32 copy %8261) and an f32 mean;
    // the declared result type is f32.
    %int1_11522 = torch.constant.int 1
    %8265 = torch.aten.sub.Tensor %8241, %result1_11519, %int1_11522 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %8266 = torch.aten.mul.Tensor %8265, %8264 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_11523 = torch.constant.int 5
    %8267 = torch.prims.convert_element_type %8266, %int5_11523 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Modulate in f16; both [1,1,3072] operands broadcast over the 4608 axis.
    %8268 = torch.aten.mul.Tensor %8260, %8267 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_11524 = torch.constant.int 1
    %8269 = torch.aten.add.Tensor %8268, %8257, %int1_11524 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.13 linear1: project the modulated activations %8269 from 3072 to 21504 features
    // (weight [21504,3072], bias [21504]) in f32, cast to f16, then slice the last axis into
    //   %8285 = [0:9216]       reshaped into three [1,24,4608,128] tensors below
    //   %8286 = [9216:21504]   12288 features, not consumed in this chunk (presumably the MLP branch
    //                          fed to GELU, by analogy with %8161 at L21098 -- confirm)
    // Drop the leading size-1 axis for a 2-D matmul.
    %int4608_11525 = torch.constant.int 4608
    %int3072_11526 = torch.constant.int 3072
    %8270 = torch.prim.ListConstruct %int4608_11525, %int3072_11526 : (!torch.int, !torch.int) -> !torch.list<int>
    %8271 = torch.aten.view %8269, %8270 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // Weight is stored [out=21504, in=3072]; transpose for x @ W^T.
    %__auto.sampler.single_blocks.13.linear1.weight = util.global.load @__auto.sampler.single_blocks.13.linear1.weight : tensor<21504x3072xf16>
    %8272 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.13.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_11527 = torch.constant.int 0
    %int1_11528 = torch.constant.int 1
    %8273 = torch.aten.transpose.int %8272, %int0_11527, %int1_11528 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.13.linear1.bias = util.global.load @__auto.sampler.single_blocks.13.linear1.bias : tensor<21504xf16>
    %8274 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.13.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Upcast bias, activations and weight to f32 for the matmul and bias add.
    %int6_11529 = torch.constant.int 6
    %8275 = torch.prims.convert_element_type %8274, %int6_11529 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_11530 = torch.constant.int 6
    %8276 = torch.prims.convert_element_type %8271, %int6_11530 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_11531 = torch.constant.int 6
    %8277 = torch.prims.convert_element_type %8273, %int6_11531 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %8278 = torch.aten.mm %8276, %8277 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Multiplications by the integer 1 are numerically identity.
    %int1_11532 = torch.constant.int 1
    %8279 = torch.aten.mul.Scalar %8278, %int1_11532 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_11533 = torch.constant.int 1
    %8280 = torch.aten.mul.Scalar %8275, %int1_11533 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_11534 = torch.constant.int 1
    %8281 = torch.aten.add.Tensor %8279, %8280, %int1_11534 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    // Back to f16 and restore the leading size-1 axis.
    %int5_11535 = torch.constant.int 5
    %8282 = torch.prims.convert_element_type %8281, %int5_11535 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_11536 = torch.constant.int 1
    %int4608_11537 = torch.constant.int 4608
    %int21504_11538 = torch.constant.int 21504
    %8283 = torch.prim.ListConstruct %int1_11536, %int4608_11537, %int21504_11538 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8284 = torch.aten.view %8282, %8283 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Last-axis range [0, 9216).
    %int-1_11539 = torch.constant.int -1
    %int0_11540 = torch.constant.int 0
    %int9216_11541 = torch.constant.int 9216
    %int1_11542 = torch.constant.int 1
    %8285 = torch.aten.slice.Tensor %8284, %int-1_11539, %int0_11540, %int9216_11541, %int1_11542 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    // Last-axis range [9216, 21504).
    %int-1_11543 = torch.constant.int -1
    %int9216_11544 = torch.constant.int 9216
    %int21504_11545 = torch.constant.int 21504
    %int1_11546 = torch.constant.int 1
    %8286 = torch.aten.slice.Tensor %8284, %int-1_11543, %int9216_11544, %int21504_11545, %int1_11546 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // single_blocks.13: split the [1,4608,9216] slice %8285 into three [1,24,4608,128] tensors.
    // 9216 = 3 * 24 * 128 (presumably 24 attention heads of width 128 -- confirm).
    %int1_11547 = torch.constant.int 1
    %int4608_11548 = torch.constant.int 4608
    %int3_11549 = torch.constant.int 3
    %int24_11550 = torch.constant.int 24
    %int128_11551 = torch.constant.int 128
    %8287 = torch.prim.ListConstruct %int1_11547, %int4608_11548, %int3_11549, %int24_11550, %int128_11551 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8288 = torch.aten.view %8285, %8287 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute (2,0,3,1,4): size-3 axis first, 4608 and 24 axes swapped -> [3,1,24,4608,128].
    %int2_11552 = torch.constant.int 2
    %int0_11553 = torch.constant.int 0
    %int3_11554 = torch.constant.int 3
    %int1_11555 = torch.constant.int 1
    %int4_11556 = torch.constant.int 4
    %8289 = torch.prim.ListConstruct %int2_11552, %int0_11553, %int3_11554, %int1_11555, %int4_11556 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8290 = torch.aten.permute %8288, %8289 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0: later scaled by query_norm.scale.
    %int0_11557 = torch.constant.int 0
    %int0_11558 = torch.constant.int 0
    %8291 = torch.aten.select.int %8290, %int0_11557, %int0_11558 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 1: later scaled by key_norm.scale.
    %int0_11559 = torch.constant.int 0
    %int1_11560 = torch.constant.int 1
    %8292 = torch.aten.select.int %8290, %int0_11559, %int1_11560 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 2: not consumed in this chunk (presumably the attention value operand, by analogy
    // with %8168 at L21085 -- confirm).
    %int0_11561 = torch.constant.int 0
    %int2_11562 = torch.constant.int 2
    %8293 = torch.aten.select.int %8290, %int0_11561, %int2_11562 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.13 query normalization (RMS form), computed in f32:
    //   %8303 = f16( x * rsqrt(mean(x^2, dim=-1, keepdim=true) + ~1e-6) ) * query_norm.scale
    // Dtype code 6 yields f32 and code 5 yields f16, as the result types below show.
    %int6_11563 = torch.constant.int 6
    %8294 = torch.prims.convert_element_type %8291, %int6_11563 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_11564 = torch.constant.int 2
    %8295 = torch.aten.pow.Tensor_Scalar %8294, %int2_11564 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean of squares over the last axis, keepdim=true.
    %int-1_11565 = torch.constant.int -1
    %8296 = torch.prim.ListConstruct %int-1_11565 : (!torch.int) -> !torch.list<int>
    %true_11566 = torch.constant.bool true
    %none_11567 = torch.constant.none
    %8297 = torch.aten.mean.dim %8295, %8296, %true_11566, %none_11567 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // Epsilon (~1e-6) added before rsqrt.
    %float9.999990e-07_11568 = torch.constant.float 9.9999999999999995E-7
    %int1_11569 = torch.constant.int 1
    %8298 = torch.aten.add.Scalar %8297, %float9.999990e-07_11568, %int1_11569 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %8299 = torch.aten.rsqrt %8298 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %8300 = torch.aten.mul.Tensor %8294, %8299 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    // Back to f16, then apply the learned f16 scale of length 128.
    %int5_11570 = torch.constant.int 5
    %8301 = torch.prims.convert_element_type %8300, %int5_11570 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.13.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.13.norm.query_norm.scale : tensor<128xf16>
    %8302 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.13.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %8303 = torch.aten.mul.Tensor %8301, %8302 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.13 key normalization: same op sequence as the query path, applied to %8292
    // and scaled by key_norm.scale:
    //   %8313 = f16( x * rsqrt(mean(x^2, dim=-1, keepdim=true) + ~1e-6) ) * key_norm.scale
    %int6_11571 = torch.constant.int 6
    %8304 = torch.prims.convert_element_type %8292, %int6_11571 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_11572 = torch.constant.int 2
    %8305 = torch.aten.pow.Tensor_Scalar %8304, %int2_11572 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean of squares over the last axis, keepdim=true.
    %int-1_11573 = torch.constant.int -1
    %8306 = torch.prim.ListConstruct %int-1_11573 : (!torch.int) -> !torch.list<int>
    %true_11574 = torch.constant.bool true
    %none_11575 = torch.constant.none
    %8307 = torch.aten.mean.dim %8305, %8306, %true_11574, %none_11575 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // Epsilon (~1e-6) added before rsqrt.
    %float9.999990e-07_11576 = torch.constant.float 9.9999999999999995E-7
    %int1_11577 = torch.constant.int 1
    %8308 = torch.aten.add.Scalar %8307, %float9.999990e-07_11576, %int1_11577 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %8309 = torch.aten.rsqrt %8308 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %8310 = torch.aten.mul.Tensor %8304, %8309 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    // Back to f16, then apply the learned f16 scale of length 128.
    %int5_11578 = torch.constant.int 5
    %8311 = torch.prims.convert_element_type %8310, %int5_11578 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.13.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.13.norm.key_norm.scale : tensor<128xf16>
    %8312 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.13.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %8313 = torch.aten.mul.Tensor %8311, %8312 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.13: prepare the normalized tensors %8303 and %8313 for the pairwise transform with %211.
    // The first two casts are f16 -> f16, i.e. type-preserving (presumably left over from a
    // `.to(dtype)` in the traced model -- confirm); they are kept as exported.
    %int5_11579 = torch.constant.int 5
    %8314 = torch.prims.convert_element_type %8303, %int5_11579 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_11580 = torch.constant.int 5
    %8315 = torch.prims.convert_element_type %8313, %int5_11580 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Upcast %8314 to f32 and view the last axis 128 as (64, 1, 2): 64 pairs of adjacent elements.
    %int6_11581 = torch.constant.int 6
    %8316 = torch.prims.convert_element_type %8314, %int6_11581 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_11582 = torch.constant.int 1
    %int24_11583 = torch.constant.int 24
    %int4608_11584 = torch.constant.int 4608
    %int64_11585 = torch.constant.int 64
    %int1_11586 = torch.constant.int 1
    %int2_11587 = torch.constant.int 2
    %8317 = torch.prim.ListConstruct %int1_11582, %int24_11583, %int4608_11584, %int64_11585, %int1_11586, %int2_11587 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8318 = torch.aten.view %8316, %8317 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Same upcast and pair view for %8315.
    %int6_11588 = torch.constant.int 6
    %8319 = torch.prims.convert_element_type %8315, %int6_11588 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_11589 = torch.constant.int 1
    %int24_11590 = torch.constant.int 24
    %int4608_11591 = torch.constant.int 4608
    %int64_11592 = torch.constant.int 64
    %int1_11593 = torch.constant.int 1
    %int2_11594 = torch.constant.int 2
    %8320 = torch.prim.ListConstruct %int1_11589, %int24_11590, %int4608_11591, %int64_11592, %int1_11593, %int2_11594 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8321 = torch.aten.view %8319, %8320 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    %int5_11595 = torch.constant.int 5
    %int0_11596 = torch.constant.int 0
    %8322 = torch.aten.select.int %211, %int5_11595, %int0_11596 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11597 = torch.constant.int 5
    %int0_11598 = torch.constant.int 0
    %8323 = torch.aten.select.int %8318, %int5_11597, %int0_11598 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8324 = torch.aten.mul.Tensor %8322, %8323 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_11599 = torch.constant.int 5
    %int1_11600 = torch.constant.int 1
    %8325 = torch.aten.select.int %211, %int5_11599, %int1_11600 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11601 = torch.constant.int 5
    %int1_11602 = torch.constant.int 1
    %8326 = torch.aten.select.int %8318, %int5_11601, %int1_11602 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8327 = torch.aten.mul.Tensor %8325, %8326 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_11603 = torch.constant.int 1
    %8328 = torch.aten.add.Tensor %8324, %8327, %int1_11603 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_11604 = torch.constant.int 5
    %int0_11605 = torch.constant.int 0
    %8329 = torch.aten.select.int %211, %int5_11604, %int0_11605 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11606 = torch.constant.int 5
    %int0_11607 = torch.constant.int 0
    %8330 = torch.aten.select.int %8321, %int5_11606, %int0_11607 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8331 = torch.aten.mul.Tensor %8329, %8330 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_11608 = torch.constant.int 5
    %int1_11609 = torch.constant.int 1
    %8332 = torch.aten.select.int %211, %int5_11608, %int1_11609 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11610 = torch.constant.int 5
    %int1_11611 = torch.constant.int 1
    %8333 = torch.aten.select.int %8321, %int5_11610, %int1_11611 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8334 = torch.aten.mul.Tensor %8332, %8333 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_11612 = torch.constant.int 1
    %8335 = torch.aten.add.Tensor %8331, %8334, %int1_11612 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_11613 = torch.constant.int 1
    %int24_11614 = torch.constant.int 24
    %int4608_11615 = torch.constant.int 4608
    %int128_11616 = torch.constant.int 128
    %8336 = torch.prim.ListConstruct %int1_11613, %int24_11614, %int4608_11615, %int128_11616 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8337 = torch.aten.view %8328, %8336 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11617 = torch.constant.int 5
    %8338 = torch.prims.convert_element_type %8337, %int5_11617 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_11618 = torch.constant.int 1
    %int24_11619 = torch.constant.int 24
    %int4608_11620 = torch.constant.int 4608
    %int128_11621 = torch.constant.int 128
    %8339 = torch.prim.ListConstruct %int1_11618, %int24_11619, %int4608_11620, %int128_11621 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8340 = torch.aten.view %8335, %8339 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11622 = torch.constant.int 5
    %8341 = torch.prims.convert_element_type %8340, %int5_11622 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // --- single_blocks.13: attention and head merge ---
    // Operands are (%8338, %8341, %8293) followed by 0.0, false, none, none.
    // NOTE(review): per the ATen signature these should be query, key, value,
    // dropout_p, is_causal, attn_mask, scale — confirm against the PyTorch
    // schema. %8293 is defined above this chunk.
    %float0.000000e00_11623 = torch.constant.float 0.000000e+00
    %false_11624 = torch.constant.bool false
    %none_11625 = torch.constant.none
    %none_11626 = torch.constant.none
    %8342:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%8338, %8341, %8293, %float0.000000e00_11623, %false_11624, %none_11625, %none_11626) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Only result #0 is consumed here: permute (0,2,1,3) to [1,4608,24,128],
    // then merge the last two dims (24 * 128 = 3072).
    %int0_11627 = torch.constant.int 0
    %int2_11628 = torch.constant.int 2
    %int1_11629 = torch.constant.int 1
    %int3_11630 = torch.constant.int 3
    %8343 = torch.prim.ListConstruct %int0_11627, %int2_11628, %int1_11629, %int3_11630 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8344 = torch.aten.permute %8342#0, %8343 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_11631 = torch.constant.int 1
    %int4608_11632 = torch.constant.int 4608
    %int3072_11633 = torch.constant.int 3072
    %8345 = torch.prim.ListConstruct %int1_11631, %int4608_11632, %int3072_11633 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8346 = torch.aten.view %8344, %8345 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // --- single_blocks.13: activation, concat, linear2, residual update ---
    // tanh-approximated GELU on %8286 ([1,4608,12288], defined above this chunk).
    %str_11634 = torch.constant.str "tanh"
    %8347 = torch.aten.gelu %8286, %str_11634 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    // Concatenate the attention output with the GELU output along dim 2
    // (3072 + 12288 = 15360), then drop the leading size-1 dim for the matmul.
    %8348 = torch.prim.ListConstruct %8346, %8347 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_11635 = torch.constant.int 2
    %8349 = torch.aten.cat %8348, %int2_11635 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    %int4608_11636 = torch.constant.int 4608
    %int15360_11637 = torch.constant.int 15360
    %8350 = torch.prim.ListConstruct %int4608_11636, %int15360_11637 : (!torch.int, !torch.int) -> !torch.list<int>
    %8351 = torch.aten.view %8349, %8350 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // Load linear2 weight [3072,15360] and transpose it to [15360,3072].
    %__auto.sampler.single_blocks.13.linear2.weight = util.global.load @__auto.sampler.single_blocks.13.linear2.weight : tensor<3072x15360xf16>
    %8352 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.13.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_11638 = torch.constant.int 0
    %int1_11639 = torch.constant.int 1
    %8353 = torch.aten.transpose.int %8352, %int0_11638, %int1_11639 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.13.linear2.bias = util.global.load @__auto.sampler.single_blocks.13.linear2.bias : tensor<3072xf16>
    %8354 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.13.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Bias, activations and weight are all promoted to f32 so that the matmul
    // and bias add run in f32; the result is cast back to f16 afterwards.
    %int6_11640 = torch.constant.int 6
    %8355 = torch.prims.convert_element_type %8354, %int6_11640 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_11641 = torch.constant.int 6
    %8356 = torch.prims.convert_element_type %8351, %int6_11641 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_11642 = torch.constant.int 6
    %8357 = torch.prims.convert_element_type %8353, %int6_11642 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %8358 = torch.aten.mm %8356, %8357 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Both mul.Scalar ops multiply by the integer constant 1.
    // NOTE(review): presumably leftover alpha/beta factors from a decomposed
    // addmm — confirm with the exporter.
    %int1_11643 = torch.constant.int 1
    %8359 = torch.aten.mul.Scalar %8358, %int1_11643 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_11644 = torch.constant.int 1
    %8360 = torch.aten.mul.Scalar %8355, %int1_11644 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_11645 = torch.constant.int 1
    %8361 = torch.aten.add.Tensor %8359, %8360, %int1_11645 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_11646 = torch.constant.int 5
    %8362 = torch.prims.convert_element_type %8361, %int5_11646 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    // Restore the leading size-1 dim.
    %int1_11647 = torch.constant.int 1
    %int4608_11648 = torch.constant.int 4608
    %int3072_11649 = torch.constant.int 3072
    %8363 = torch.prim.ListConstruct %int1_11647, %int4608_11648, %int3072_11649 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8364 = torch.aten.view %8362, %8363 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Multiply by %8259 ([1,1,3072], broadcast over dim 1) and add to %8241.
    // Both are defined above this chunk; NOTE(review): presumably the
    // modulation gate and the block input (residual) — confirm upstream.
    %8365 = torch.aten.mul.Tensor %8259, %8364 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_11650 = torch.constant.int 1
    %8366 = torch.aten.add.Tensor %8241, %8365, %int1_11650 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // --- single_blocks.14: modulation projection ---
    // SiLU on %119 ([1,3072], defined outside this chunk; NOTE(review):
    // presumably the conditioning vector — confirm), then a linear layer
    // 3072 -> 9216 evaluated in f32.
    %8367 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.14.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.14.modulation.lin.weight : tensor<9216x3072xf16>
    %8368 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.14.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_11651 = torch.constant.int 0
    %int1_11652 = torch.constant.int 1
    %8369 = torch.aten.transpose.int %8368, %int0_11651, %int1_11652 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.14.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.14.modulation.lin.bias : tensor<9216xf16>
    %8370 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.14.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Promote bias, input and weight to f32 for the matmul and bias add.
    %int6_11653 = torch.constant.int 6
    %8371 = torch.prims.convert_element_type %8370, %int6_11653 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_11654 = torch.constant.int 6
    %8372 = torch.prims.convert_element_type %8367, %int6_11654 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_11655 = torch.constant.int 6
    %8373 = torch.prims.convert_element_type %8369, %int6_11655 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %8374 = torch.aten.mm %8372, %8373 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Both mul.Scalar ops multiply by the integer constant 1.
    %int1_11656 = torch.constant.int 1
    %8375 = torch.aten.mul.Scalar %8374, %int1_11656 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_11657 = torch.constant.int 1
    %8376 = torch.aten.mul.Scalar %8371, %int1_11657 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_11658 = torch.constant.int 1
    %8377 = torch.aten.add.Tensor %8375, %8376, %int1_11658 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_11659 = torch.constant.int 5
    %8378 = torch.prims.convert_element_type %8377, %int5_11659 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Full-range slice on dim 0 (0 .. INT64_MAX, step 1): shape is unchanged.
    %int0_11660 = torch.constant.int 0
    %int0_11661 = torch.constant.int 0
    %int9223372036854775807_11662 = torch.constant.int 9223372036854775807
    %int1_11663 = torch.constant.int 1
    %8379 = torch.aten.slice.Tensor %8378, %int0_11660, %int0_11661, %int9223372036854775807_11662, %int1_11663 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a size-1 dim at position 1 -> [1,1,9216].
    %int1_11664 = torch.constant.int 1
    %8380 = torch.aten.unsqueeze %8379, %int1_11664 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Full-range slice on dim 2: shape is unchanged.
    %int2_11665 = torch.constant.int 2
    %int0_11666 = torch.constant.int 0
    %int9223372036854775807_11667 = torch.constant.int 9223372036854775807
    %int1_11668 = torch.constant.int 1
    %8381 = torch.aten.slice.Tensor %8380, %int2_11665, %int0_11666, %int9223372036854775807_11667, %int1_11668 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Split the last dim into three 3072-wide chunks.
    // %8382 = [0, 3072): added after the normalization below (shift term).
    %int-1_11669 = torch.constant.int -1
    %int0_11670 = torch.constant.int 0
    %int3072_11671 = torch.constant.int 3072
    %int1_11672 = torch.constant.int 1
    %8382 = torch.aten.slice.Tensor %8381, %int-1_11669, %int0_11670, %int3072_11671, %int1_11672 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %8383 = [3072, 6144): becomes the multiplicative term after the +1 below.
    %int-1_11673 = torch.constant.int -1
    %int3072_11674 = torch.constant.int 3072
    %int6144_11675 = torch.constant.int 6144
    %int1_11676 = torch.constant.int 1
    %8383 = torch.aten.slice.Tensor %8381, %int-1_11673, %int3072_11674, %int6144_11675, %int1_11676 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %8384 = [6144, 9216): not consumed within this chunk.
    // NOTE(review): presumably the gate applied after linear2 — confirm downstream.
    %int-1_11677 = torch.constant.int -1
    %int6144_11678 = torch.constant.int 6144
    %int9216_11679 = torch.constant.int 9216
    %int1_11680 = torch.constant.int 1
    %8384 = torch.aten.slice.Tensor %8381, %int-1_11677, %int6144_11678, %int9216_11679, %int1_11680 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %8385 = %8383 + 1.
    %int1_11681 = torch.constant.int 1
    %int1_11682 = torch.constant.int 1
    %8385 = torch.aten.add.Scalar %8383, %int1_11681, %int1_11682 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // --- single_blocks.14: normalize %8366 over the last dim, then modulate ---
    // Variance and mean are computed in f32 over dim 2 with correction = 0 and
    // keepdim = true. No learned weight or bias is applied in this
    // normalization.
    %int6_11683 = torch.constant.int 6
    %8386 = torch.prims.convert_element_type %8366, %int6_11683 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_11684 = torch.constant.int 2
    %8387 = torch.prim.ListConstruct %int2_11684 : (!torch.int) -> !torch.list<int>
    %int0_11685 = torch.constant.int 0
    %true_11686 = torch.constant.bool true
    // result0 = variance, result1 = mean.
    %result0_11687, %result1_11688 = torch.aten.var_mean.correction %8386, %8387, %int0_11685, %true_11686 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // rsqrt(variance + ~1e-6).
    %float9.999990e-07_11689 = torch.constant.float 9.9999999999999995E-7
    %int1_11690 = torch.constant.int 1
    %8388 = torch.aten.add.Scalar %result0_11687, %float9.999990e-07_11689, %int1_11690 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %8389 = torch.aten.rsqrt %8388 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the f16 tensor %8366 (not the f32 copy %8386) and
    // the f32 mean; the declared result type is f32.
    %int1_11691 = torch.constant.int 1
    %8390 = torch.aten.sub.Tensor %8366, %result1_11688, %int1_11691 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %8391 = torch.aten.mul.Tensor %8390, %8389 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_11692 = torch.constant.int 5
    %8392 = torch.prims.convert_element_type %8391, %int5_11692 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // %8394 = %8385 * normalized + %8382, broadcasting the [1,1,3072] operands
    // over dim 1.
    %8393 = torch.aten.mul.Tensor %8385, %8392 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_11693 = torch.constant.int 1
    %8394 = torch.aten.add.Tensor %8393, %8382, %int1_11693 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // --- single_blocks.14: linear1 (3072 -> 21504) and split ---
    // Drop the leading size-1 dim for the 2-D matmul.
    %int4608_11694 = torch.constant.int 4608
    %int3072_11695 = torch.constant.int 3072
    %8395 = torch.prim.ListConstruct %int4608_11694, %int3072_11695 : (!torch.int, !torch.int) -> !torch.list<int>
    %8396 = torch.aten.view %8394, %8395 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // Load linear1 weight [21504,3072] and transpose it to [3072,21504].
    %__auto.sampler.single_blocks.14.linear1.weight = util.global.load @__auto.sampler.single_blocks.14.linear1.weight : tensor<21504x3072xf16>
    %8397 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.14.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_11696 = torch.constant.int 0
    %int1_11697 = torch.constant.int 1
    %8398 = torch.aten.transpose.int %8397, %int0_11696, %int1_11697 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.14.linear1.bias = util.global.load @__auto.sampler.single_blocks.14.linear1.bias : tensor<21504xf16>
    %8399 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.14.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Promote bias, activations and weight to f32; matmul and bias add run in
    // f32 and the result is cast back to f16.
    %int6_11698 = torch.constant.int 6
    %8400 = torch.prims.convert_element_type %8399, %int6_11698 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_11699 = torch.constant.int 6
    %8401 = torch.prims.convert_element_type %8396, %int6_11699 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_11700 = torch.constant.int 6
    %8402 = torch.prims.convert_element_type %8398, %int6_11700 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %8403 = torch.aten.mm %8401, %8402 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Both mul.Scalar ops multiply by the integer constant 1.
    %int1_11701 = torch.constant.int 1
    %8404 = torch.aten.mul.Scalar %8403, %int1_11701 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_11702 = torch.constant.int 1
    %8405 = torch.aten.mul.Scalar %8400, %int1_11702 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_11703 = torch.constant.int 1
    %8406 = torch.aten.add.Tensor %8404, %8405, %int1_11703 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_11704 = torch.constant.int 5
    %8407 = torch.prims.convert_element_type %8406, %int5_11704 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    // Restore the leading size-1 dim.
    %int1_11705 = torch.constant.int 1
    %int4608_11706 = torch.constant.int 4608
    %int21504_11707 = torch.constant.int 21504
    %8408 = torch.prim.ListConstruct %int1_11705, %int4608_11706, %int21504_11707 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8409 = torch.aten.view %8407, %8408 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the last dim: %8410 = [0, 9216) is reshaped into three
    // [24,128] groups below; %8411 = [9216, 21504) is 12288 wide and is not
    // consumed within this chunk.
    %int-1_11708 = torch.constant.int -1
    %int0_11709 = torch.constant.int 0
    %int9216_11710 = torch.constant.int 9216
    %int1_11711 = torch.constant.int 1
    %8410 = torch.aten.slice.Tensor %8409, %int-1_11708, %int0_11709, %int9216_11710, %int1_11711 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_11712 = torch.constant.int -1
    %int9216_11713 = torch.constant.int 9216
    %int21504_11714 = torch.constant.int 21504
    %int1_11715 = torch.constant.int 1
    %8411 = torch.aten.slice.Tensor %8409, %int-1_11712, %int9216_11713, %int21504_11714, %int1_11715 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // --- single_blocks.14: reshape %8410 and split it into three tensors ---
    // View the 9216-wide last dim as 3 x 24 x 128.
    %int1_11716 = torch.constant.int 1
    %int4608_11717 = torch.constant.int 4608
    %int3_11718 = torch.constant.int 3
    %int24_11719 = torch.constant.int 24
    %int128_11720 = torch.constant.int 128
    %8412 = torch.prim.ListConstruct %int1_11716, %int4608_11717, %int3_11718, %int24_11719, %int128_11720 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8413 = torch.aten.view %8410, %8412 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute (2,0,3,1,4) -> [3,1,24,4608,128].
    %int2_11721 = torch.constant.int 2
    %int0_11722 = torch.constant.int 0
    %int3_11723 = torch.constant.int 3
    %int1_11724 = torch.constant.int 1
    %int4_11725 = torch.constant.int 4
    %8414 = torch.prim.ListConstruct %int2_11721, %int0_11722, %int3_11723, %int1_11724, %int4_11725 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8415 = torch.aten.permute %8413, %8414 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0 of dim 0: later scaled by query_norm.scale.
    %int0_11726 = torch.constant.int 0
    %int0_11727 = torch.constant.int 0
    %8416 = torch.aten.select.int %8415, %int0_11726, %int0_11727 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 1 of dim 0: later scaled by key_norm.scale.
    %int0_11728 = torch.constant.int 0
    %int1_11729 = torch.constant.int 1
    %8417 = torch.aten.select.int %8415, %int0_11728, %int1_11729 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 2 of dim 0: passed unnormalized as the third attention operand.
    %int0_11730 = torch.constant.int 0
    %int2_11731 = torch.constant.int 2
    %8418 = torch.aten.select.int %8415, %int0_11730, %int2_11731 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // --- single_blocks.14: RMS normalization of %8416 and %8417 ---
    // For each input x: y = f16(x_f32 * rsqrt(mean(x_f32^2, dim=-1) + ~1e-6)),
    // then y is multiplied by a learned [128] scale.
    // First input (%8416), scaled by norm.query_norm.scale:
    %int6_11732 = torch.constant.int 6
    %8419 = torch.prims.convert_element_type %8416, %int6_11732 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_11733 = torch.constant.int 2
    %8420 = torch.aten.pow.Tensor_Scalar %8419, %int2_11733 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_11734 = torch.constant.int -1
    %8421 = torch.prim.ListConstruct %int-1_11734 : (!torch.int) -> !torch.list<int>
    %true_11735 = torch.constant.bool true
    %none_11736 = torch.constant.none
    %8422 = torch.aten.mean.dim %8420, %8421, %true_11735, %none_11736 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_11737 = torch.constant.float 9.9999999999999995E-7
    %int1_11738 = torch.constant.int 1
    %8423 = torch.aten.add.Scalar %8422, %float9.999990e-07_11737, %int1_11738 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %8424 = torch.aten.rsqrt %8423 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %8425 = torch.aten.mul.Tensor %8419, %8424 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11739 = torch.constant.int 5
    %8426 = torch.prims.convert_element_type %8425, %int5_11739 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.14.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.14.norm.query_norm.scale : tensor<128xf16>
    %8427 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.14.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %8428 = torch.aten.mul.Tensor %8426, %8427 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Second input (%8417), scaled by norm.key_norm.scale:
    %int6_11740 = torch.constant.int 6
    %8429 = torch.prims.convert_element_type %8417, %int6_11740 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_11741 = torch.constant.int 2
    %8430 = torch.aten.pow.Tensor_Scalar %8429, %int2_11741 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_11742 = torch.constant.int -1
    %8431 = torch.prim.ListConstruct %int-1_11742 : (!torch.int) -> !torch.list<int>
    %true_11743 = torch.constant.bool true
    %none_11744 = torch.constant.none
    %8432 = torch.aten.mean.dim %8430, %8431, %true_11743, %none_11744 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_11745 = torch.constant.float 9.9999999999999995E-7
    %int1_11746 = torch.constant.int 1
    %8433 = torch.aten.add.Scalar %8432, %float9.999990e-07_11745, %int1_11746 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %8434 = torch.aten.rsqrt %8433 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %8435 = torch.aten.mul.Tensor %8429, %8434 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11747 = torch.constant.int 5
    %8436 = torch.prims.convert_element_type %8435, %int5_11747 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.14.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.14.norm.key_norm.scale : tensor<128xf16>
    %8437 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.14.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %8438 = torch.aten.mul.Tensor %8436, %8437 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // --- single_blocks.14: combine normalized q (%8428) and k (%8438) with %211 ---
    // %211 has shape [1,1,4608,64,2,2] and is defined outside this chunk
    // (NOTE(review): presumably the rotary position table — confirm).
    // The two f16 -> f16 casts below have identical source and result types.
    %int5_11748 = torch.constant.int 5
    %8439 = torch.prims.convert_element_type %8428, %int5_11748 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_11749 = torch.constant.int 5
    %8440 = torch.prims.convert_element_type %8438, %int5_11749 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Promote q to f32 and view the last dim (128) as 64 x 1 x 2.
    %int6_11750 = torch.constant.int 6
    %8441 = torch.prims.convert_element_type %8439, %int6_11750 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_11751 = torch.constant.int 1
    %int24_11752 = torch.constant.int 24
    %int4608_11753 = torch.constant.int 4608
    %int64_11754 = torch.constant.int 64
    %int1_11755 = torch.constant.int 1
    %int2_11756 = torch.constant.int 2
    %8442 = torch.prim.ListConstruct %int1_11751, %int24_11752, %int4608_11753, %int64_11754, %int1_11755, %int2_11756 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8443 = torch.aten.view %8441, %8442 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Same promotion and view for k.
    %int6_11757 = torch.constant.int 6
    %8444 = torch.prims.convert_element_type %8440, %int6_11757 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_11758 = torch.constant.int 1
    %int24_11759 = torch.constant.int 24
    %int4608_11760 = torch.constant.int 4608
    %int64_11761 = torch.constant.int 64
    %int1_11762 = torch.constant.int 1
    %int2_11763 = torch.constant.int 2
    %8445 = torch.prim.ListConstruct %int1_11758, %int24_11759, %int4608_11760, %int64_11761, %int1_11762, %int2_11763 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8446 = torch.aten.view %8444, %8445 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // q: %8453 = %211[..., 0] * %8443[..., 0] + %211[..., 1] * %8443[..., 1],
    // broadcasting over the head dim (1 -> 24) and the size-1 dim (1 -> 2).
    %int5_11764 = torch.constant.int 5
    %int0_11765 = torch.constant.int 0
    %8447 = torch.aten.select.int %211, %int5_11764, %int0_11765 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11766 = torch.constant.int 5
    %int0_11767 = torch.constant.int 0
    %8448 = torch.aten.select.int %8443, %int5_11766, %int0_11767 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8449 = torch.aten.mul.Tensor %8447, %8448 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_11768 = torch.constant.int 5
    %int1_11769 = torch.constant.int 1
    %8450 = torch.aten.select.int %211, %int5_11768, %int1_11769 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11770 = torch.constant.int 5
    %int1_11771 = torch.constant.int 1
    %8451 = torch.aten.select.int %8443, %int5_11770, %int1_11771 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8452 = torch.aten.mul.Tensor %8450, %8451 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_11772 = torch.constant.int 1
    %8453 = torch.aten.add.Tensor %8449, %8452, %int1_11772 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k: the identical combination on %8446, producing %8460.
    %int5_11773 = torch.constant.int 5
    %int0_11774 = torch.constant.int 0
    %8454 = torch.aten.select.int %211, %int5_11773, %int0_11774 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11775 = torch.constant.int 5
    %int0_11776 = torch.constant.int 0
    %8455 = torch.aten.select.int %8446, %int5_11775, %int0_11776 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8456 = torch.aten.mul.Tensor %8454, %8455 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_11777 = torch.constant.int 5
    %int1_11778 = torch.constant.int 1
    %8457 = torch.aten.select.int %211, %int5_11777, %int1_11778 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11779 = torch.constant.int 5
    %int1_11780 = torch.constant.int 1
    %8458 = torch.aten.select.int %8446, %int5_11779, %int1_11780 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8459 = torch.aten.mul.Tensor %8457, %8458 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_11781 = torch.constant.int 1
    %8460 = torch.aten.add.Tensor %8456, %8459, %int1_11781 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Flatten both results back to [1,24,4608,128] and cast to f16.
    %int1_11782 = torch.constant.int 1
    %int24_11783 = torch.constant.int 24
    %int4608_11784 = torch.constant.int 4608
    %int128_11785 = torch.constant.int 128
    %8461 = torch.prim.ListConstruct %int1_11782, %int24_11783, %int4608_11784, %int128_11785 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8462 = torch.aten.view %8453, %8461 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11786 = torch.constant.int 5
    %8463 = torch.prims.convert_element_type %8462, %int5_11786 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_11787 = torch.constant.int 1
    %int24_11788 = torch.constant.int 24
    %int4608_11789 = torch.constant.int 4608
    %int128_11790 = torch.constant.int 128
    %8464 = torch.prim.ListConstruct %int1_11787, %int24_11788, %int4608_11789, %int128_11790 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8465 = torch.aten.view %8460, %8464 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11791 = torch.constant.int 5
    %8466 = torch.prims.convert_element_type %8465, %int5_11791 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Attention over 24 heads of width 128 on 4608 positions, via the un-registered
    // "torch.aten._scaled_dot_product_flash_attention_for_cpu" operator.
    // Operands: %8463, %8466, %8418, then float 0.0, bool false, none, none.
    // NOTE(review): in the aten schema these trailing operands are dropout_p, is_causal,
    // attn_mask and scale — confirm against the PyTorch version used for export.
    // Only result #0 is consumed in this chunk; result #1 ([1,24,4608] f32) is not.
    %float0.000000e00_11792 = torch.constant.float 0.000000e+00
    %false_11793 = torch.constant.bool false
    %none_11794 = torch.constant.none
    %none_11795 = torch.constant.none
    %8467:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%8463, %8466, %8418, %float0.000000e00_11792, %false_11793, %none_11794, %none_11795) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2: [1,24,4608,128] -> [1,4608,24,128].
    %int0_11796 = torch.constant.int 0
    %int2_11797 = torch.constant.int 2
    %int1_11798 = torch.constant.int 1
    %int3_11799 = torch.constant.int 3
    %8468 = torch.prim.ListConstruct %int0_11796, %int2_11797, %int1_11798, %int3_11799 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8469 = torch.aten.permute %8467#0, %8468 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the [24,128] head dims into one 3072-wide dim.
    %int1_11800 = torch.constant.int 1
    %int4608_11801 = torch.constant.int 4608
    %int3072_11802 = torch.constant.int 3072
    %8470 = torch.prim.ListConstruct %int1_11800, %int4608_11801, %int3072_11802 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8471 = torch.aten.view %8469, %8470 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // GELU with the "tanh" approximation on %8411 ([1,4608,12288], defined above this chunk),
    // concatenated after the attention output along dim 2: 3072 + 12288 = 15360.
    %str_11803 = torch.constant.str "tanh"
    %8472 = torch.aten.gelu %8411, %str_11803 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %8473 = torch.prim.ListConstruct %8471, %8472 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_11804 = torch.constant.int 2
    %8474 = torch.aten.cat %8473, %int2_11804 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Drop the leading size-1 dim so the result can feed a 2-D matmul.
    %int4608_11805 = torch.constant.int 4608
    %int15360_11806 = torch.constant.int 15360
    %8475 = torch.prim.ListConstruct %int4608_11805, %int15360_11806 : (!torch.int, !torch.int) -> !torch.list<int>
    %8476 = torch.aten.view %8474, %8475 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // single_blocks.14.linear2: %8476 [4608,15360] x weight^T [15360,3072] + bias [3072].
    // Weight, bias and activation are f16 but are widened to f32 (dtype code 6) before the
    // matmul and bias add; the sum is narrowed back to f16 (dtype code 5) afterwards.
    %__auto.sampler.single_blocks.14.linear2.weight = util.global.load @__auto.sampler.single_blocks.14.linear2.weight : tensor<3072x15360xf16>
    %8477 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.14.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_11807 = torch.constant.int 0
    %int1_11808 = torch.constant.int 1
    %8478 = torch.aten.transpose.int %8477, %int0_11807, %int1_11808 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.14.linear2.bias = util.global.load @__auto.sampler.single_blocks.14.linear2.bias : tensor<3072xf16>
    %8479 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.14.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_11809 = torch.constant.int 6
    %8480 = torch.prims.convert_element_type %8479, %int6_11809 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_11810 = torch.constant.int 6
    %8481 = torch.prims.convert_element_type %8476, %int6_11810 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_11811 = torch.constant.int 6
    %8482 = torch.prims.convert_element_type %8478, %int6_11811 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %8483 = torch.aten.mul.Scalar %8483, %int1_11812 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    // The two multiplications by the integer 1 leave the values unchanged.
    // NOTE(review): presumably the alpha/beta factors of a decomposed addmm — confirm with the exporter.
    %int1_11812 = torch.constant.int 1
    %8484 = torch.aten.mul.Scalar %8483, %int1_11812 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_11813 = torch.constant.int 1
    %8485 = torch.aten.mul.Scalar %8480, %int1_11813 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_11814 = torch.constant.int 1
    // Bias [3072] broadcasts across the 4608 rows.
    %8486 = torch.aten.add.Tensor %8484, %8485, %int1_11814 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_11815 = torch.constant.int 5
    %8487 = torch.prims.convert_element_type %8486, %int5_11815 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    // Restore the leading size-1 dim: [4608,3072] -> [1,4608,3072].
    %int1_11816 = torch.constant.int 1
    %int4608_11817 = torch.constant.int 4608
    %int3072_11818 = torch.constant.int 3072
    %8488 = torch.prim.ListConstruct %int1_11816, %int4608_11817, %int3072_11818 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8489 = torch.aten.view %8487, %8488 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Scale the linear2 output by %8384 ([1,1,3072], broadcast over 4608 positions) and add it to %8366.
    // NOTE(review): %8384 / %8366 are presumably the block-14 modulation gate and the block input
    // (residual); both are defined above this chunk — confirm there.
    %8490 = torch.aten.mul.Tensor %8384, %8489 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_11819 = torch.constant.int 1
    %8491 = torch.aten.add.Tensor %8366, %8490, %int1_11819 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.15.modulation.lin: SiLU of %119 ([1,3072], defined above this chunk),
    // then [1,3072] x weight^T [3072,9216] + bias [9216], computed in f32 and narrowed to f16.
    %8492 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.15.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.15.modulation.lin.weight : tensor<9216x3072xf16>
    %8493 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.15.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_11820 = torch.constant.int 0
    %int1_11821 = torch.constant.int 1
    %8494 = torch.aten.transpose.int %8493, %int0_11820, %int1_11821 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.15.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.15.modulation.lin.bias : tensor<9216xf16>
    %8495 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.15.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    %int6_11822 = torch.constant.int 6
    %8496 = torch.prims.convert_element_type %8495, %int6_11822 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_11823 = torch.constant.int 6
    %8497 = torch.prims.convert_element_type %8492, %int6_11823 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_11824 = torch.constant.int 6
    %8498 = torch.prims.convert_element_type %8494, %int6_11824 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %8499 = torch.aten.mm %8497, %8498 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Multiplications by the integer 1: values are unchanged.
    %int1_11825 = torch.constant.int 1
    %8500 = torch.aten.mul.Scalar %8499, %int1_11825 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_11826 = torch.constant.int 1
    %8501 = torch.aten.mul.Scalar %8496, %int1_11826 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_11827 = torch.constant.int 1
    %8502 = torch.aten.add.Tensor %8500, %8501, %int1_11827 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_11828 = torch.constant.int 5
    %8503 = torch.prims.convert_element_type %8502, %int5_11828 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Reshape the modulation vector to [1,1,9216] and cut it into three 3072-wide pieces.
    // The slice on dim 0 and the slice on dim 2 both span 0..INT64_MAX with step 1, so their
    // result types equal their input types (shape-preserving).
    %int0_11829 = torch.constant.int 0
    %int0_11830 = torch.constant.int 0
    %int9223372036854775807_11831 = torch.constant.int 9223372036854775807
    %int1_11832 = torch.constant.int 1
    %8504 = torch.aten.slice.Tensor %8503, %int0_11829, %int0_11830, %int9223372036854775807_11831, %int1_11832 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    %int1_11833 = torch.constant.int 1
    %8505 = torch.aten.unsqueeze %8504, %int1_11833 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int2_11834 = torch.constant.int 2
    %int0_11835 = torch.constant.int 0
    %int9223372036854775807_11836 = torch.constant.int 9223372036854775807
    %int1_11837 = torch.constant.int 1
    %8506 = torch.aten.slice.Tensor %8505, %int2_11834, %int0_11835, %int9223372036854775807_11836, %int1_11837 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // %8507 = last-dim range [0, 3072): added after normalization below (additive shift).
    %int-1_11838 = torch.constant.int -1
    %int0_11839 = torch.constant.int 0
    %int3072_11840 = torch.constant.int 3072
    %int1_11841 = torch.constant.int 1
    %8507 = torch.aten.slice.Tensor %8506, %int-1_11838, %int0_11839, %int3072_11840, %int1_11841 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %8508 = last-dim range [3072, 6144): incremented by 1 and used as a multiplier below.
    %int-1_11842 = torch.constant.int -1
    %int3072_11843 = torch.constant.int 3072
    %int6144_11844 = torch.constant.int 6144
    %int1_11845 = torch.constant.int 1
    %8508 = torch.aten.slice.Tensor %8506, %int-1_11842, %int3072_11843, %int6144_11844, %int1_11845 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %8509 = last-dim range [6144, 9216): not consumed within this chunk.
    // NOTE(review): presumably the block-15 output gate, mirroring %8384 in block 14 — confirm at its use.
    %int-1_11846 = torch.constant.int -1
    %int6144_11847 = torch.constant.int 6144
    %int9216_11848 = torch.constant.int 9216
    %int1_11849 = torch.constant.int 1
    %8509 = torch.aten.slice.Tensor %8506, %int-1_11846, %int6144_11847, %int9216_11848, %int1_11849 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %8491 over dim 2 without learned parameters, then modulate:
    //   %8519 = (1 + %8508) * normalize(%8491) + %8507
    // %8510 = %8508 + 1 (the trailing 1 is the add's scalar multiplier).
    %int1_11850 = torch.constant.int 1
    %int1_11851 = torch.constant.int 1
    %8510 = torch.aten.add.Scalar %8508, %int1_11850, %int1_11851 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Statistics are taken on an f32 copy of the activation.
    %int6_11852 = torch.constant.int 6
    %8511 = torch.prims.convert_element_type %8491, %int6_11852 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_11853 = torch.constant.int 2
    %8512 = torch.prim.ListConstruct %int2_11853 : (!torch.int) -> !torch.list<int>
    %int0_11854 = torch.constant.int 0
    %true_11855 = torch.constant.bool true
    // var_mean over dim 2 with correction 0 and keepdim true: result0 is the variance, result1 the mean.
    %result0_11856, %result1_11857 = torch.aten.var_mean.correction %8511, %8512, %int0_11854, %true_11855 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // Epsilon (the double nearest to 1e-6) is added to the variance before the reciprocal square root.
    %float9.999990e-07_11858 = torch.constant.float 9.9999999999999995E-7
    %int1_11859 = torch.constant.int 1
    %8513 = torch.aten.add.Scalar %result0_11856, %float9.999990e-07_11858, %int1_11859 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %8514 = torch.aten.rsqrt %8513 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    %int1_11860 = torch.constant.int 1
    // The subtraction takes the f16 tensor %8491 (not the f32 copy %8511) and the f32 mean; the result type is f32.
    %8515 = torch.aten.sub.Tensor %8491, %result1_11857, %int1_11860 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %8516 = torch.aten.mul.Tensor %8515, %8514 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_11861 = torch.constant.int 5
    %8517 = torch.prims.convert_element_type %8516, %int5_11861 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Modulation in f16: multiply by (1 + %8508), then add %8507; both broadcast over the 4608 positions.
    %8518 = torch.aten.mul.Tensor %8510, %8517 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_11862 = torch.constant.int 1
    %8519 = torch.aten.add.Tensor %8518, %8507, %int1_11862 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.15.linear1: flatten %8519 to [4608,3072], multiply by weight^T [3072,21504],
    // add bias [21504]; computed in f32, narrowed to f16, then viewed as [1,4608,21504].
    %int4608_11863 = torch.constant.int 4608
    %int3072_11864 = torch.constant.int 3072
    %8520 = torch.prim.ListConstruct %int4608_11863, %int3072_11864 : (!torch.int, !torch.int) -> !torch.list<int>
    %8521 = torch.aten.view %8519, %8520 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    %__auto.sampler.single_blocks.15.linear1.weight = util.global.load @__auto.sampler.single_blocks.15.linear1.weight : tensor<21504x3072xf16>
    %8522 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.15.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_11865 = torch.constant.int 0
    %int1_11866 = torch.constant.int 1
    %8523 = torch.aten.transpose.int %8522, %int0_11865, %int1_11866 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.15.linear1.bias = util.global.load @__auto.sampler.single_blocks.15.linear1.bias : tensor<21504xf16>
    %8524 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.15.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    %int6_11867 = torch.constant.int 6
    %8525 = torch.prims.convert_element_type %8524, %int6_11867 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_11868 = torch.constant.int 6
    %8526 = torch.prims.convert_element_type %8521, %int6_11868 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_11869 = torch.constant.int 6
    %8527 = torch.prims.convert_element_type %8523, %int6_11869 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %8528 = torch.aten.mm %8526, %8527 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Multiplications by the integer 1: values are unchanged.
    %int1_11870 = torch.constant.int 1
    %8529 = torch.aten.mul.Scalar %8528, %int1_11870 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_11871 = torch.constant.int 1
    %8530 = torch.aten.mul.Scalar %8525, %int1_11871 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_11872 = torch.constant.int 1
    %8531 = torch.aten.add.Tensor %8529, %8530, %int1_11872 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_11873 = torch.constant.int 5
    %8532 = torch.prims.convert_element_type %8531, %int5_11873 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_11874 = torch.constant.int 1
    %int4608_11875 = torch.constant.int 4608
    %int21504_11876 = torch.constant.int 21504
    %8533 = torch.prim.ListConstruct %int1_11874, %int4608_11875, %int21504_11876 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8534 = torch.aten.view %8532, %8533 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the linear1 output along the last dim:
    //   %8535 = [0, 9216)      -> reshaped below into three [1,24,4608,128] tensors
    //   %8536 = [9216, 21504)  -> 12288-wide branch that goes through GELU further down
    %int-1_11877 = torch.constant.int -1
    %int0_11878 = torch.constant.int 0
    %int9216_11879 = torch.constant.int 9216
    %int1_11880 = torch.constant.int 1
    %8535 = torch.aten.slice.Tensor %8534, %int-1_11877, %int0_11878, %int9216_11879, %int1_11880 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_11881 = torch.constant.int -1
    %int9216_11882 = torch.constant.int 9216
    %int21504_11883 = torch.constant.int 21504
    %int1_11884 = torch.constant.int 1
    %8536 = torch.aten.slice.Tensor %8534, %int-1_11881, %int9216_11882, %int21504_11883, %int1_11884 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // 9216 = 3 * 24 * 128: view as [1,4608,3,24,128].
    %int1_11885 = torch.constant.int 1
    %int4608_11886 = torch.constant.int 4608
    %int3_11887 = torch.constant.int 3
    %int24_11888 = torch.constant.int 24
    %int128_11889 = torch.constant.int 128
    %8537 = torch.prim.ListConstruct %int1_11885, %int4608_11886, %int3_11887, %int24_11888, %int128_11889 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8538 = torch.aten.view %8535, %8537 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2,0,3,1,4): [1,4608,3,24,128] -> [3,1,24,4608,128].
    %int2_11890 = torch.constant.int 2
    %int0_11891 = torch.constant.int 0
    %int3_11892 = torch.constant.int 3
    %int1_11893 = torch.constant.int 1
    %int4_11894 = torch.constant.int 4
    %8539 = torch.prim.ListConstruct %int2_11890, %int0_11891, %int3_11892, %int1_11893, %int4_11894 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8540 = torch.aten.permute %8538, %8539 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index the leading dim: %8541 (index 0) is normalized with query_norm.scale below,
    // %8542 (index 1) with key_norm.scale, and %8543 (index 2) is passed to attention as the
    // third operand without normalization.
    %int0_11895 = torch.constant.int 0
    %int0_11896 = torch.constant.int 0
    %8541 = torch.aten.select.int %8540, %int0_11895, %int0_11896 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_11897 = torch.constant.int 0
    %int1_11898 = torch.constant.int 1
    %8542 = torch.aten.select.int %8540, %int0_11897, %int1_11898 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_11899 = torch.constant.int 0
    %int2_11900 = torch.constant.int 2
    %8543 = torch.aten.select.int %8540, %int0_11899, %int2_11900 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Root-mean-square normalization of %8541 over the last (128-wide) dim, in f32:
    //   x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps), with eps the double nearest to 1e-6.
    // The result is narrowed to f16 and multiplied by the learned [128] query_norm.scale.
    %int6_11901 = torch.constant.int 6
    %8544 = torch.prims.convert_element_type %8541, %int6_11901 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_11902 = torch.constant.int 2
    %8545 = torch.aten.pow.Tensor_Scalar %8544, %int2_11902 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_11903 = torch.constant.int -1
    %8546 = torch.prim.ListConstruct %int-1_11903 : (!torch.int) -> !torch.list<int>
    %true_11904 = torch.constant.bool true
    %none_11905 = torch.constant.none
    %8547 = torch.aten.mean.dim %8545, %8546, %true_11904, %none_11905 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_11906 = torch.constant.float 9.9999999999999995E-7
    %int1_11907 = torch.constant.int 1
    %8548 = torch.aten.add.Scalar %8547, %float9.999990e-07_11906, %int1_11907 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %8549 = torch.aten.rsqrt %8548 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %8550 = torch.aten.mul.Tensor %8544, %8549 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11908 = torch.constant.int 5
    %8551 = torch.prims.convert_element_type %8550, %int5_11908 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.15.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.15.norm.query_norm.scale : tensor<128xf16>
    %8552 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.15.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %8553 = torch.aten.mul.Tensor %8551, %8552 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Same root-mean-square normalization as applied to %8541, here on %8542:
    // f32 compute over the last dim, eps ~1e-6, narrowed to f16, then scaled by the learned
    // [128] key_norm.scale.
    %int6_11909 = torch.constant.int 6
    %8554 = torch.prims.convert_element_type %8542, %int6_11909 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_11910 = torch.constant.int 2
    %8555 = torch.aten.pow.Tensor_Scalar %8554, %int2_11910 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_11911 = torch.constant.int -1
    %8556 = torch.prim.ListConstruct %int-1_11911 : (!torch.int) -> !torch.list<int>
    %true_11912 = torch.constant.bool true
    %none_11913 = torch.constant.none
    %8557 = torch.aten.mean.dim %8555, %8556, %true_11912, %none_11913 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_11914 = torch.constant.float 9.9999999999999995E-7
    %int1_11915 = torch.constant.int 1
    %8558 = torch.aten.add.Scalar %8557, %float9.999990e-07_11914, %int1_11915 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %8559 = torch.aten.rsqrt %8558 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %8560 = torch.aten.mul.Tensor %8554, %8559 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11916 = torch.constant.int 5
    %8561 = torch.prims.convert_element_type %8560, %int5_11916 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.15.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.15.norm.key_norm.scale : tensor<128xf16>
    %8562 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.15.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %8563 = torch.aten.mul.Tensor %8561, %8562 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Prepare the normalized tensors %8553 and %8563 for the pairwise rotation.
    // The first two conversions are f16 -> f16 (input and result types are identical).
    %int5_11917 = torch.constant.int 5
    %8564 = torch.prims.convert_element_type %8553, %int5_11917 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_11918 = torch.constant.int 5
    %8565 = torch.prims.convert_element_type %8563, %int5_11918 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Widen to f32 and view the 128-wide last dim as [64,1,2], i.e. 64 pairs.
    %int6_11919 = torch.constant.int 6
    %8566 = torch.prims.convert_element_type %8564, %int6_11919 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_11920 = torch.constant.int 1
    %int24_11921 = torch.constant.int 24
    %int4608_11922 = torch.constant.int 4608
    %int64_11923 = torch.constant.int 64
    %int1_11924 = torch.constant.int 1
    %int2_11925 = torch.constant.int 2
    %8567 = torch.prim.ListConstruct %int1_11920, %int24_11921, %int4608_11922, %int64_11923, %int1_11924, %int2_11925 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8568 = torch.aten.view %8566, %8567 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    %int6_11926 = torch.constant.int 6
    %8569 = torch.prims.convert_element_type %8565, %int6_11926 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_11927 = torch.constant.int 1
    %int24_11928 = torch.constant.int 24
    %int4608_11929 = torch.constant.int 4608
    %int64_11930 = torch.constant.int 64
    %int1_11931 = torch.constant.int 1
    %int2_11932 = torch.constant.int 2
    %8570 = torch.prim.ListConstruct %int1_11927, %int24_11928, %int4608_11929, %int64_11930, %int1_11931, %int2_11932 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8571 = torch.aten.view %8569, %8570 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Apply the table %211 ([1,1,4608,64,2,2], defined above this chunk) to %8568 and %8571.
    // For each operand X ([1,24,4608,64,1,2]):
    //   out = select(%211, dim 5, 0) * select(X, dim 5, 0) + select(%211, dim 5, 1) * select(X, dim 5, 1)
    // The table broadcasts over the 24 heads; X's size-1 dim broadcasts against the table's size-2 dim.
    // NOTE(review): %211 is presumably the precomputed rotary position-embedding matrix — confirm at its definition.
    // --- operand %8568 (from the query_norm path) ---
    %int5_11933 = torch.constant.int 5
    %int0_11934 = torch.constant.int 0
    %8572 = torch.aten.select.int %211, %int5_11933, %int0_11934 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11935 = torch.constant.int 5
    %int0_11936 = torch.constant.int 0
    %8573 = torch.aten.select.int %8568, %int5_11935, %int0_11936 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8574 = torch.aten.mul.Tensor %8572, %8573 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_11937 = torch.constant.int 5
    %int1_11938 = torch.constant.int 1
    %8575 = torch.aten.select.int %211, %int5_11937, %int1_11938 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11939 = torch.constant.int 5
    %int1_11940 = torch.constant.int 1
    %8576 = torch.aten.select.int %8568, %int5_11939, %int1_11940 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8577 = torch.aten.mul.Tensor %8575, %8576 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_11941 = torch.constant.int 1
    %8578 = torch.aten.add.Tensor %8574, %8577, %int1_11941 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // --- operand %8571 (from the key_norm path); the selects on %211 repeat those above ---
    %int5_11942 = torch.constant.int 5
    %int0_11943 = torch.constant.int 0
    %8579 = torch.aten.select.int %211, %int5_11942, %int0_11943 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11944 = torch.constant.int 5
    %int0_11945 = torch.constant.int 0
    %8580 = torch.aten.select.int %8571, %int5_11944, %int0_11945 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8581 = torch.aten.mul.Tensor %8579, %8580 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_11946 = torch.constant.int 5
    %int1_11947 = torch.constant.int 1
    %8582 = torch.aten.select.int %211, %int5_11946, %int1_11947 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11948 = torch.constant.int 5
    %int1_11949 = torch.constant.int 1
    %8583 = torch.aten.select.int %8571, %int5_11948, %int1_11949 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8584 = torch.aten.mul.Tensor %8582, %8583 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_11950 = torch.constant.int 1
    %8585 = torch.aten.add.Tensor %8581, %8584, %int1_11950 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Collapse [64,2] back to 128 and narrow both results to f16 for the attention op.
    %int1_11951 = torch.constant.int 1
    %int24_11952 = torch.constant.int 24
    %int4608_11953 = torch.constant.int 4608
    %int128_11954 = torch.constant.int 128
    %8586 = torch.prim.ListConstruct %int1_11951, %int24_11952, %int4608_11953, %int128_11954 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8587 = torch.aten.view %8578, %8586 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11955 = torch.constant.int 5
    %8588 = torch.prims.convert_element_type %8587, %int5_11955 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_11956 = torch.constant.int 1
    %int24_11957 = torch.constant.int 24
    %int4608_11958 = torch.constant.int 4608
    %int128_11959 = torch.constant.int 128
    %8589 = torch.prim.ListConstruct %int1_11956, %int24_11957, %int4608_11958, %int128_11959 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8590 = torch.aten.view %8585, %8589 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11960 = torch.constant.int 5
    %8591 = torch.prims.convert_element_type %8590, %int5_11960 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Attention for single_blocks.15 via the un-registered
    // "torch.aten._scaled_dot_product_flash_attention_for_cpu" operator.
    // Operands: %8588 (rotated query_norm path), %8591 (rotated key_norm path), %8543
    // (third select of the linear1 split), then float 0.0, bool false, none, none.
    // NOTE(review): in the aten schema the trailing operands are dropout_p, is_causal,
    // attn_mask and scale — confirm against the PyTorch version used for export.
    // Only result #0 is consumed in this chunk.
    %float0.000000e00_11961 = torch.constant.float 0.000000e+00
    %false_11962 = torch.constant.bool false
    %none_11963 = torch.constant.none
    %none_11964 = torch.constant.none
    %8592:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%8588, %8591, %8543, %float0.000000e00_11961, %false_11962, %none_11963, %none_11964) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2, then merge [24,128] into a single 3072-wide dim.
    %int0_11965 = torch.constant.int 0
    %int2_11966 = torch.constant.int 2
    %int1_11967 = torch.constant.int 1
    %int3_11968 = torch.constant.int 3
    %8593 = torch.prim.ListConstruct %int0_11965, %int2_11966, %int1_11967, %int3_11968 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8594 = torch.aten.permute %8592#0, %8593 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_11969 = torch.constant.int 1
    %int4608_11970 = torch.constant.int 4608
    %int3072_11971 = torch.constant.int 3072
    %8595 = torch.prim.ListConstruct %int1_11969, %int4608_11970, %int3072_11971 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8596 = torch.aten.view %8594, %8595 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // GELU with the "tanh" approximation on %8536 (the [9216, 21504) slice of linear1),
    // concatenated after the attention output along dim 2: 3072 + 12288 = 15360.
    %str_11972 = torch.constant.str "tanh"
    %8597 = torch.aten.gelu %8536, %str_11972 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %8598 = torch.prim.ListConstruct %8596, %8597 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_11973 = torch.constant.int 2
    %8599 = torch.aten.cat %8598, %int2_11973 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Drop the leading size-1 dim so the result can feed a 2-D matmul.
    %int4608_11974 = torch.constant.int 4608
    %int15360_11975 = torch.constant.int 15360
    %8600 = torch.prim.ListConstruct %int4608_11974, %int15360_11975 : (!torch.int, !torch.int) -> !torch.list<int>
    %8601 = torch.aten.view %8599, %8600 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // single_blocks.15.linear2: %8601 x W^T + b, computed in f32.
    // The [3072,15360] weight is transposed to [15360,3072]; bias, input and
    // weight are each converted to f32 (dtype code 6 yields f32 here), the
    // product is formed with aten.mm, the bias is added, and the result is
    // converted back to f16 (dtype code 5) and viewed as [1,4608,3072].
    %__auto.sampler.single_blocks.15.linear2.weight = util.global.load @__auto.sampler.single_blocks.15.linear2.weight : tensor<3072x15360xf16>
    %8602 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.15.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_11976 = torch.constant.int 0
    %int1_11977 = torch.constant.int 1
    %8603 = torch.aten.transpose.int %8602, %int0_11976, %int1_11977 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.15.linear2.bias = util.global.load @__auto.sampler.single_blocks.15.linear2.bias : tensor<3072xf16>
    %8604 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.15.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_11978 = torch.constant.int 6
    %8605 = torch.prims.convert_element_type %8604, %int6_11978 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_11979 = torch.constant.int 6
    %8606 = torch.prims.convert_element_type %8601, %int6_11979 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_11980 = torch.constant.int 6
    %8607 = torch.prims.convert_element_type %8603, %int6_11980 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %8608 = torch.aten.mm %8606, %8607 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Both mul.Scalar ops multiply by the integer constant 1, so they leave
    // the values unchanged.
    // NOTE(review): presumably residue of an addmm alpha/beta decomposition
    // in the exporter -- confirm before relying on it.
    %int1_11981 = torch.constant.int 1
    %8609 = torch.aten.mul.Scalar %8608, %int1_11981 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_11982 = torch.constant.int 1
    %8610 = torch.aten.mul.Scalar %8605, %int1_11982 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Bias add: the [3072] bias broadcasts across the 4608 rows.
    %int1_11983 = torch.constant.int 1
    %8611 = torch.aten.add.Tensor %8609, %8610, %int1_11983 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_11984 = torch.constant.int 5
    %8612 = torch.prims.convert_element_type %8611, %int5_11984 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    // Restore the leading unit axis.
    %int1_11985 = torch.constant.int 1
    %int4608_11986 = torch.constant.int 4608
    %int3072_11987 = torch.constant.int 3072
    %8613 = torch.prim.ListConstruct %int1_11985, %int4608_11986, %int3072_11987 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8614 = torch.aten.view %8612, %8613 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Scale the linear2 output by %8509 ([1,1,3072], broadcast over the 4608
    // axis) and add it to %8491 with alpha = 1. Both %8509 and %8491 are
    // defined earlier in the function.
    // NOTE(review): presumably %8509 is the block-15 gate and %8491 the
    // block-15 input (gated residual) -- confirm against their definitions.
    %8615 = torch.aten.mul.Tensor %8509, %8614 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_11988 = torch.constant.int 1
    %8616 = torch.aten.add.Tensor %8491, %8615, %int1_11988 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.16.modulation: SiLU(%119) -> linear (3072 -> 9216) ->
    // three 3072-wide chunks. %119 ([1,3072], f16) is defined earlier in the
    // function.
    // NOTE(review): %119 is presumably the conditioning vector shared by all
    // blocks -- confirm against its definition.
    %8617 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.16.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.16.modulation.lin.weight : tensor<9216x3072xf16>
    %8618 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.16.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_11989 = torch.constant.int 0
    %int1_11990 = torch.constant.int 1
    %8619 = torch.aten.transpose.int %8618, %int0_11989, %int1_11990 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.16.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.16.modulation.lin.bias : tensor<9216xf16>
    %8620 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.16.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Convert bias, input and transposed weight to f32, multiply, add the
    // bias, then convert back to f16. The mul.Scalar ops multiply by 1 and
    // do not change the values.
    %int6_11991 = torch.constant.int 6
    %8621 = torch.prims.convert_element_type %8620, %int6_11991 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_11992 = torch.constant.int 6
    %8622 = torch.prims.convert_element_type %8617, %int6_11992 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_11993 = torch.constant.int 6
    %8623 = torch.prims.convert_element_type %8619, %int6_11993 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %8624 = torch.aten.mm %8622, %8623 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    %int1_11994 = torch.constant.int 1
    %8625 = torch.aten.mul.Scalar %8624, %int1_11994 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_11995 = torch.constant.int 1
    %8626 = torch.aten.mul.Scalar %8621, %int1_11995 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_11996 = torch.constant.int 1
    %8627 = torch.aten.add.Tensor %8625, %8626, %int1_11996 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_11997 = torch.constant.int 5
    %8628 = torch.prims.convert_element_type %8627, %int5_11997 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): the shape
    // is unchanged.
    %int0_11998 = torch.constant.int 0
    %int0_11999 = torch.constant.int 0
    %int9223372036854775807_12000 = torch.constant.int 9223372036854775807
    %int1_12001 = torch.constant.int 1
    %8629 = torch.aten.slice.Tensor %8628, %int0_11998, %int0_11999, %int9223372036854775807_12000, %int1_12001 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a unit axis at dim 1 so the chunks broadcast over the 4608 axis.
    %int1_12002 = torch.constant.int 1
    %8630 = torch.aten.unsqueeze %8629, %int1_12002 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Another full-range slice, this time on dim 2; the shape is unchanged.
    %int2_12003 = torch.constant.int 2
    %int0_12004 = torch.constant.int 0
    %int9223372036854775807_12005 = torch.constant.int 9223372036854775807
    %int1_12006 = torch.constant.int 1
    %8631 = torch.aten.slice.Tensor %8630, %int2_12003, %int0_12004, %int9223372036854775807_12005, %int1_12006 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Chunk 0, columns [0, 3072): %8632. Later in this function it is the
    // additive term applied after normalization.
    %int-1_12007 = torch.constant.int -1
    %int0_12008 = torch.constant.int 0
    %int3072_12009 = torch.constant.int 3072
    %int1_12010 = torch.constant.int 1
    %8632 = torch.aten.slice.Tensor %8631, %int-1_12007, %int0_12008, %int3072_12009, %int1_12010 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1, columns [3072, 6144): %8633. After the +1 below it is the
    // multiplicative term applied after normalization.
    %int-1_12011 = torch.constant.int -1
    %int3072_12012 = torch.constant.int 3072
    %int6144_12013 = torch.constant.int 6144
    %int1_12014 = torch.constant.int 1
    %8633 = torch.aten.slice.Tensor %8631, %int-1_12011, %int3072_12012, %int6144_12013, %int1_12014 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2, columns [6144, 9216): %8634. Later in this function it
    // multiplies the single_blocks.16.linear2 output before the residual add.
    %int-1_12015 = torch.constant.int -1
    %int6144_12016 = torch.constant.int 6144
    %int9216_12017 = torch.constant.int 9216
    %int1_12018 = torch.constant.int 1
    %8634 = torch.aten.slice.Tensor %8631, %int-1_12015, %int6144_12016, %int9216_12017, %int1_12018 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %8635 = %8633 + 1 (scalar 1, alpha 1).
    %int1_12019 = torch.constant.int 1
    %int1_12020 = torch.constant.int 1
    %8635 = torch.aten.add.Scalar %8633, %int1_12019, %int1_12020 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %8616 over its last axis (dim 2) without learned affine
    // parameters, then apply the modulation terms:
    //   %8644 = %8635 * f16((%8616 - mean) * rsqrt(var + eps)) + %8632
    // Statistics are computed in f32 from the f32 copy %8636, with
    // correction = 0 and keepdim = true.
    %int6_12021 = torch.constant.int 6
    %8636 = torch.prims.convert_element_type %8616, %int6_12021 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_12022 = torch.constant.int 2
    %8637 = torch.prim.ListConstruct %int2_12022 : (!torch.int) -> !torch.list<int>
    %int0_12023 = torch.constant.int 0
    %true_12024 = torch.constant.bool true
    // %result0_12025 is the variance and %result1_12026 the mean, each of
    // shape [1,4608,1].
    %result0_12025, %result1_12026 = torch.aten.var_mean.correction %8636, %8637, %int0_12023, %true_12024 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // eps is approximately 1e-6.
    %float9.999990e-07_12027 = torch.constant.float 9.9999999999999995E-7
    %int1_12028 = torch.constant.int 1
    %8638 = torch.aten.add.Scalar %result0_12025, %float9.999990e-07_12027, %int1_12028 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %8639 = torch.aten.rsqrt %8638 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the f16 tensor %8616 (not the f32 copy %8636)
    // together with the f32 mean; the declared result type is f32.
    %int1_12029 = torch.constant.int 1
    %8640 = torch.aten.sub.Tensor %8616, %result1_12026, %int1_12029 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %8641 = torch.aten.mul.Tensor %8640, %8639 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_12030 = torch.constant.int 5
    %8642 = torch.prims.convert_element_type %8641, %int5_12030 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Modulate in f16: multiply by %8635 and add %8632, both [1,1,3072] and
    // broadcast over the 4608 axis.
    %8643 = torch.aten.mul.Tensor %8635, %8642 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_12031 = torch.constant.int 1
    %8644 = torch.aten.add.Tensor %8643, %8632, %int1_12031 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.16.linear1: project the modulated activations from 3072
    // to 21504 features. The input is flattened to [4608,3072], the
    // [21504,3072] weight is transposed, the matmul and bias add run in f32,
    // and the result is converted back to f16 and viewed as [1,4608,21504].
    %int4608_12032 = torch.constant.int 4608
    %int3072_12033 = torch.constant.int 3072
    %8645 = torch.prim.ListConstruct %int4608_12032, %int3072_12033 : (!torch.int, !torch.int) -> !torch.list<int>
    %8646 = torch.aten.view %8644, %8645 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    %__auto.sampler.single_blocks.16.linear1.weight = util.global.load @__auto.sampler.single_blocks.16.linear1.weight : tensor<21504x3072xf16>
    %8647 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.16.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_12034 = torch.constant.int 0
    %int1_12035 = torch.constant.int 1
    %8648 = torch.aten.transpose.int %8647, %int0_12034, %int1_12035 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.16.linear1.bias = util.global.load @__auto.sampler.single_blocks.16.linear1.bias : tensor<21504xf16>
    %8649 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.16.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Convert bias, input and weight to f32 (dtype code 6 yields f32 here).
    %int6_12036 = torch.constant.int 6
    %8650 = torch.prims.convert_element_type %8649, %int6_12036 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_12037 = torch.constant.int 6
    %8651 = torch.prims.convert_element_type %8646, %int6_12037 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_12038 = torch.constant.int 6
    %8652 = torch.prims.convert_element_type %8648, %int6_12038 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %8653 = torch.aten.mm %8651, %8652 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // The mul.Scalar ops multiply by 1 and do not change the values.
    %int1_12039 = torch.constant.int 1
    %8654 = torch.aten.mul.Scalar %8653, %int1_12039 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_12040 = torch.constant.int 1
    %8655 = torch.aten.mul.Scalar %8650, %int1_12040 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_12041 = torch.constant.int 1
    %8656 = torch.aten.add.Tensor %8654, %8655, %int1_12041 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_12042 = torch.constant.int 5
    %8657 = torch.prims.convert_element_type %8656, %int5_12042 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_12043 = torch.constant.int 1
    %int4608_12044 = torch.constant.int 4608
    %int21504_12045 = torch.constant.int 21504
    %8658 = torch.prim.ListConstruct %int1_12043, %int4608_12044, %int21504_12045 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8659 = torch.aten.view %8657, %8658 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the linear1 output along the last axis into two parts:
    //   %8660 = columns [0, 9216)      -> reshaped into three tensors below
    //   %8661 = columns [9216, 21504)  -> 12288 wide, fed to GELU later
    %int-1_12046 = torch.constant.int -1
    %int0_12047 = torch.constant.int 0
    %int9216_12048 = torch.constant.int 9216
    %int1_12049 = torch.constant.int 1
    %8660 = torch.aten.slice.Tensor %8659, %int-1_12046, %int0_12047, %int9216_12048, %int1_12049 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_12050 = torch.constant.int -1
    %int9216_12051 = torch.constant.int 9216
    %int21504_12052 = torch.constant.int 21504
    %int1_12053 = torch.constant.int 1
    %8661 = torch.aten.slice.Tensor %8659, %int-1_12050, %int9216_12051, %int21504_12052, %int1_12053 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View the 9216 columns as 3 x 24 x 128, giving [1,4608,3,24,128].
    %int1_12054 = torch.constant.int 1
    %int4608_12055 = torch.constant.int 4608
    %int3_12056 = torch.constant.int 3
    %int24_12057 = torch.constant.int 24
    %int128_12058 = torch.constant.int 128
    %8662 = torch.prim.ListConstruct %int1_12054, %int4608_12055, %int3_12056, %int24_12057, %int128_12058 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8663 = torch.aten.view %8660, %8662 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with [2, 0, 3, 1, 4] so the size-3 axis comes first:
    // [1,4608,3,24,128] -> [3,1,24,4608,128].
    %int2_12059 = torch.constant.int 2
    %int0_12060 = torch.constant.int 0
    %int3_12061 = torch.constant.int 3
    %int1_12062 = torch.constant.int 1
    %int4_12063 = torch.constant.int 4
    %8664 = torch.prim.ListConstruct %int2_12059, %int0_12060, %int3_12061, %int1_12062, %int4_12063 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8665 = torch.aten.permute %8663, %8664 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Select indices 0, 1 and 2 along dim 0. Later in this function %8666 is
    // scaled by query_norm.scale, %8667 by key_norm.scale, and %8668 is
    // passed as the third tensor operand of the attention op.
    %int0_12064 = torch.constant.int 0
    %int0_12065 = torch.constant.int 0
    %8666 = torch.aten.select.int %8665, %int0_12064, %int0_12065 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_12066 = torch.constant.int 0
    %int1_12067 = torch.constant.int 1
    %8667 = torch.aten.select.int %8665, %int0_12066, %int1_12067 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_12068 = torch.constant.int 0
    %int2_12069 = torch.constant.int 2
    %8668 = torch.aten.select.int %8665, %int0_12068, %int2_12069 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Root-mean-square normalization of %8666 over the last (128) axis,
    // computed in f32:
    //   y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps), eps ~ 1e-6
    // The result is converted to f16 and multiplied by the learned
    // query_norm.scale vector ([128]).
    %int6_12070 = torch.constant.int 6
    %8669 = torch.prims.convert_element_type %8666, %int6_12070 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_12071 = torch.constant.int 2
    %8670 = torch.aten.pow.Tensor_Scalar %8669, %int2_12071 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_12072 = torch.constant.int -1
    %8671 = torch.prim.ListConstruct %int-1_12072 : (!torch.int) -> !torch.list<int>
    %true_12073 = torch.constant.bool true
    %none_12074 = torch.constant.none
    %8672 = torch.aten.mean.dim %8670, %8671, %true_12073, %none_12074 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_12075 = torch.constant.float 9.9999999999999995E-7
    %int1_12076 = torch.constant.int 1
    %8673 = torch.aten.add.Scalar %8672, %float9.999990e-07_12075, %int1_12076 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %8674 = torch.aten.rsqrt %8673 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %8675 = torch.aten.mul.Tensor %8669, %8674 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_12077 = torch.constant.int 5
    %8676 = torch.prims.convert_element_type %8675, %int5_12077 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.16.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.16.norm.query_norm.scale : tensor<128xf16>
    %8677 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.16.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %8678 = torch.aten.mul.Tensor %8676, %8677 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // The same normalization applied to %8667, scaled by key_norm.scale.
    %int6_12078 = torch.constant.int 6
    %8679 = torch.prims.convert_element_type %8667, %int6_12078 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_12079 = torch.constant.int 2
    %8680 = torch.aten.pow.Tensor_Scalar %8679, %int2_12079 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_12080 = torch.constant.int -1
    %8681 = torch.prim.ListConstruct %int-1_12080 : (!torch.int) -> !torch.list<int>
    %true_12081 = torch.constant.bool true
    %none_12082 = torch.constant.none
    %8682 = torch.aten.mean.dim %8680, %8681, %true_12081, %none_12082 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_12083 = torch.constant.float 9.9999999999999995E-7
    %int1_12084 = torch.constant.int 1
    %8683 = torch.aten.add.Scalar %8682, %float9.999990e-07_12083, %int1_12084 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %8684 = torch.aten.rsqrt %8683 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %8685 = torch.aten.mul.Tensor %8679, %8684 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_12085 = torch.constant.int 5
    %8686 = torch.prims.convert_element_type %8685, %int5_12085 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.16.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.16.norm.key_norm.scale : tensor<128xf16>
    %8687 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.16.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %8688 = torch.aten.mul.Tensor %8686, %8687 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Combine the normalized tensors %8678 and %8688 with %211
    // ([1,1,4608,64,2,2], f32, defined earlier in the function). For each
    // input x viewed as [1,24,4608,64,1,2]:
    //   out = %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]
    // and out ([1,24,4608,64,2]) is viewed back to [1,24,4608,128] and
    // converted to f16.
    // NOTE(review): %211 presumably holds per-position 2x2 rotation matrices
    // (rotary position embedding) -- confirm against its definition.
    //
    // These two conversions are f16 -> f16, so the element type is unchanged.
    %int5_12086 = torch.constant.int 5
    %8689 = torch.prims.convert_element_type %8678, %int5_12086 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_12087 = torch.constant.int 5
    %8690 = torch.prims.convert_element_type %8688, %int5_12087 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Convert %8689 to f32 and split the 128 axis into 64 x 1 x 2.
    %int6_12088 = torch.constant.int 6
    %8691 = torch.prims.convert_element_type %8689, %int6_12088 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_12089 = torch.constant.int 1
    %int24_12090 = torch.constant.int 24
    %int4608_12091 = torch.constant.int 4608
    %int64_12092 = torch.constant.int 64
    %int1_12093 = torch.constant.int 1
    %int2_12094 = torch.constant.int 2
    %8692 = torch.prim.ListConstruct %int1_12089, %int24_12090, %int4608_12091, %int64_12092, %int1_12093, %int2_12094 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8693 = torch.aten.view %8691, %8692 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // The same f32 conversion and view for %8690.
    %int6_12095 = torch.constant.int 6
    %8694 = torch.prims.convert_element_type %8690, %int6_12095 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_12096 = torch.constant.int 1
    %int24_12097 = torch.constant.int 24
    %int4608_12098 = torch.constant.int 4608
    %int64_12099 = torch.constant.int 64
    %int1_12100 = torch.constant.int 1
    %int2_12101 = torch.constant.int 2
    %8695 = torch.prim.ListConstruct %int1_12096, %int24_12097, %int4608_12098, %int64_12099, %int1_12100, %int2_12101 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8696 = torch.aten.view %8694, %8695 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // First input (%8693): term for index 0 of the last axis. The unit axes
    // broadcast (1 -> 24 on dim 1, 1 -> 2 on dim 4).
    %int5_12102 = torch.constant.int 5
    %int0_12103 = torch.constant.int 0
    %8697 = torch.aten.select.int %211, %int5_12102, %int0_12103 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12104 = torch.constant.int 5
    %int0_12105 = torch.constant.int 0
    %8698 = torch.aten.select.int %8693, %int5_12104, %int0_12105 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8699 = torch.aten.mul.Tensor %8697, %8698 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // First input: term for index 1 of the last axis, then the sum of both.
    %int5_12106 = torch.constant.int 5
    %int1_12107 = torch.constant.int 1
    %8700 = torch.aten.select.int %211, %int5_12106, %int1_12107 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12108 = torch.constant.int 5
    %int1_12109 = torch.constant.int 1
    %8701 = torch.aten.select.int %8693, %int5_12108, %int1_12109 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8702 = torch.aten.mul.Tensor %8700, %8701 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_12110 = torch.constant.int 1
    %8703 = torch.aten.add.Tensor %8699, %8702, %int1_12110 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second input (%8696): the same two-term combination.
    %int5_12111 = torch.constant.int 5
    %int0_12112 = torch.constant.int 0
    %8704 = torch.aten.select.int %211, %int5_12111, %int0_12112 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12113 = torch.constant.int 5
    %int0_12114 = torch.constant.int 0
    %8705 = torch.aten.select.int %8696, %int5_12113, %int0_12114 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8706 = torch.aten.mul.Tensor %8704, %8705 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_12115 = torch.constant.int 5
    %int1_12116 = torch.constant.int 1
    %8707 = torch.aten.select.int %211, %int5_12115, %int1_12116 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12117 = torch.constant.int 5
    %int1_12118 = torch.constant.int 1
    %8708 = torch.aten.select.int %8696, %int5_12117, %int1_12118 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8709 = torch.aten.mul.Tensor %8707, %8708 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_12119 = torch.constant.int 1
    %8710 = torch.aten.add.Tensor %8706, %8709, %int1_12119 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Merge 64 x 2 back into 128 and convert to f16 (first input -> %8713).
    %int1_12120 = torch.constant.int 1
    %int24_12121 = torch.constant.int 24
    %int4608_12122 = torch.constant.int 4608
    %int128_12123 = torch.constant.int 128
    %8711 = torch.prim.ListConstruct %int1_12120, %int24_12121, %int4608_12122, %int128_12123 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8712 = torch.aten.view %8703, %8711 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_12124 = torch.constant.int 5
    %8713 = torch.prims.convert_element_type %8712, %int5_12124 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // The same merge and f16 conversion for the second input (-> %8716).
    %int1_12125 = torch.constant.int 1
    %int24_12126 = torch.constant.int 24
    %int4608_12127 = torch.constant.int 4608
    %int128_12128 = torch.constant.int 128
    %8714 = torch.prim.ListConstruct %int1_12125, %int24_12126, %int4608_12127, %int128_12128 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8715 = torch.aten.view %8710, %8714 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_12129 = torch.constant.int 5
    %8716 = torch.prims.convert_element_type %8715, %int5_12129 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled dot-product attention over %8713, %8716 and %8668, each
    // [1,24,4608,128] f16. The op is emitted as a generic torch.operator and
    // returns two results; only result #0 ([1,24,4608,128]) is consumed in
    // the lines that follow.
    // NOTE(review): the trailing operands (0.0, false, none, none) are
    // presumably dropout_p, is_causal, attn_mask and scale per the ATen
    // signature -- confirm against the PyTorch op schema.
    %float0.000000e00_12130 = torch.constant.float 0.000000e+00
    %false_12131 = torch.constant.bool false
    %none_12132 = torch.constant.none
    %none_12133 = torch.constant.none
    %8717:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%8713, %8716, %8668, %float0.000000e00_12130, %false_12131, %none_12132, %none_12133) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Permute with [0, 2, 1, 3] ([1,24,4608,128] -> [1,4608,24,128]) and
    // merge the 24 x 128 axes into 3072.
    %int0_12134 = torch.constant.int 0
    %int2_12135 = torch.constant.int 2
    %int1_12136 = torch.constant.int 1
    %int3_12137 = torch.constant.int 3
    %8718 = torch.prim.ListConstruct %int0_12134, %int2_12135, %int1_12136, %int3_12137 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8719 = torch.aten.permute %8717#0, %8718 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_12138 = torch.constant.int 1
    %int4608_12139 = torch.constant.int 4608
    %int3072_12140 = torch.constant.int 3072
    %8720 = torch.prim.ListConstruct %int1_12138, %int4608_12139, %int3072_12140 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8721 = torch.aten.view %8719, %8720 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Apply GELU ("tanh" approximation) to %8661, the [9216, 21504) column
    // slice of the linear1 output, concatenate it after the attention result
    // %8721 along dim 2 (3072 + 12288 = 15360), and flatten to [4608,15360]
    // for the linear2 matmul.
    %str_12141 = torch.constant.str "tanh"
    %8722 = torch.aten.gelu %8661, %str_12141 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %8723 = torch.prim.ListConstruct %8721, %8722 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_12142 = torch.constant.int 2
    %8724 = torch.aten.cat %8723, %int2_12142 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    %int4608_12143 = torch.constant.int 4608
    %int15360_12144 = torch.constant.int 15360
    %8725 = torch.prim.ListConstruct %int4608_12143, %int15360_12144 : (!torch.int, !torch.int) -> !torch.list<int>
    %8726 = torch.aten.view %8724, %8725 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // single_blocks.16.linear2: project the concatenated [4608,15360]
    // activations back to 3072 features. The [3072,15360] weight is
    // transposed, the matmul and bias add run in f32, and the result is
    // converted back to f16 and viewed as [1,4608,3072].
    %__auto.sampler.single_blocks.16.linear2.weight = util.global.load @__auto.sampler.single_blocks.16.linear2.weight : tensor<3072x15360xf16>
    %8727 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.16.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_12145 = torch.constant.int 0
    %int1_12146 = torch.constant.int 1
    %8728 = torch.aten.transpose.int %8727, %int0_12145, %int1_12146 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.16.linear2.bias = util.global.load @__auto.sampler.single_blocks.16.linear2.bias : tensor<3072xf16>
    %8729 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.16.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Convert bias, input and weight to f32 (dtype code 6 yields f32 here).
    %int6_12147 = torch.constant.int 6
    %8730 = torch.prims.convert_element_type %8729, %int6_12147 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_12148 = torch.constant.int 6
    %8731 = torch.prims.convert_element_type %8726, %int6_12148 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_12149 = torch.constant.int 6
    %8732 = torch.prims.convert_element_type %8728, %int6_12149 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %8733 = torch.aten.mm %8731, %8732 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // The mul.Scalar ops multiply by 1 and do not change the values.
    %int1_12150 = torch.constant.int 1
    %8734 = torch.aten.mul.Scalar %8733, %int1_12150 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_12151 = torch.constant.int 1
    %8735 = torch.aten.mul.Scalar %8730, %int1_12151 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_12152 = torch.constant.int 1
    %8736 = torch.aten.add.Tensor %8734, %8735, %int1_12152 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_12153 = torch.constant.int 5
    %8737 = torch.prims.convert_element_type %8736, %int5_12153 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    // Restore the leading unit axis.
    %int1_12154 = torch.constant.int 1
    %int4608_12155 = torch.constant.int 4608
    %int3072_12156 = torch.constant.int 3072
    %8738 = torch.prim.ListConstruct %int1_12154, %int4608_12155, %int3072_12156 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8739 = torch.aten.view %8737, %8738 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Gated residual: multiply the linear2 output by %8634 (the [6144, 9216)
    // chunk of this block's modulation output, broadcast over the 4608 axis)
    // and add it to %8616 with alpha = 1. %8616 is the same tensor that was
    // normalized at the start of this block.
    %8740 = torch.aten.mul.Tensor %8634, %8739 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_12157 = torch.constant.int 1
    %8741 = torch.aten.add.Tensor %8616, %8740, %int1_12157 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.17.modulation.lin: SiLU(%119) followed by a 3072 -> 9216
    // linear layer. %119 ([1,3072], f16) is defined earlier in the function.
    // The [9216,3072] weight is transposed, the matmul and bias add run in
    // f32, and the [1,9216] result is converted back to f16.
    %8742 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.17.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.17.modulation.lin.weight : tensor<9216x3072xf16>
    %8743 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.17.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_12158 = torch.constant.int 0
    %int1_12159 = torch.constant.int 1
    %8744 = torch.aten.transpose.int %8743, %int0_12158, %int1_12159 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.17.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.17.modulation.lin.bias : tensor<9216xf16>
    %8745 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.17.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Convert bias, input and weight to f32 (dtype code 6 yields f32 here).
    %int6_12160 = torch.constant.int 6
    %8746 = torch.prims.convert_element_type %8745, %int6_12160 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_12161 = torch.constant.int 6
    %8747 = torch.prims.convert_element_type %8742, %int6_12161 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_12162 = torch.constant.int 6
    %8748 = torch.prims.convert_element_type %8744, %int6_12162 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %8749 = torch.aten.mm %8747, %8748 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // The mul.Scalar ops multiply by 1 and do not change the values.
    %int1_12163 = torch.constant.int 1
    %8750 = torch.aten.mul.Scalar %8749, %int1_12163 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_12164 = torch.constant.int 1
    %8751 = torch.aten.mul.Scalar %8746, %int1_12164 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_12165 = torch.constant.int 1
    %8752 = torch.aten.add.Tensor %8750, %8751, %int1_12165 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_12166 = torch.constant.int 5
    %8753 = torch.prims.convert_element_type %8752, %int5_12166 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    %int0_12167 = torch.constant.int 0
    %int0_12168 = torch.constant.int 0
    %int9223372036854775807_12169 = torch.constant.int 9223372036854775807
    %int1_12170 = torch.constant.int 1
    %8754 = torch.aten.slice.Tensor %8753, %int0_12167, %int0_12168, %int9223372036854775807_12169, %int1_12170 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    %int1_12171 = torch.constant.int 1
    %8755 = torch.aten.unsqueeze %8754, %int1_12171 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int2_12172 = torch.constant.int 2
    %int0_12173 = torch.constant.int 0
    %int9223372036854775807_12174 = torch.constant.int 9223372036854775807
    %int1_12175 = torch.constant.int 1
    %8756 = torch.aten.slice.Tensor %8755, %int2_12172, %int0_12173, %int9223372036854775807_12174, %int1_12175 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int-1_12176 = torch.constant.int -1
    %int0_12177 = torch.constant.int 0
    %int3072_12178 = torch.constant.int 3072
    %int1_12179 = torch.constant.int 1
    %8757 = torch.aten.slice.Tensor %8756, %int-1_12176, %int0_12177, %int3072_12178, %int1_12179 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_12180 = torch.constant.int -1
    %int3072_12181 = torch.constant.int 3072
    %int6144_12182 = torch.constant.int 6144
    %int1_12183 = torch.constant.int 1
    %8758 = torch.aten.slice.Tensor %8756, %int-1_12180, %int3072_12181, %int6144_12182, %int1_12183 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_12184 = torch.constant.int -1
    %int6144_12185 = torch.constant.int 6144
    %int9216_12186 = torch.constant.int 9216
    %int1_12187 = torch.constant.int 1
    %8759 = torch.aten.slice.Tensor %8756, %int-1_12184, %int6144_12185, %int9216_12186, %int1_12187 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int1_12188 = torch.constant.int 1
    %int1_12189 = torch.constant.int 1
    %8760 = torch.aten.add.Scalar %8758, %int1_12188, %int1_12189 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // ---- single_blocks.17: normalize %8741 and apply modulation ------------
    // Normalizes %8741 over its last dim (size 3072) with no learned
    // scale/bias: (x - mean) * rsqrt(var + eps), with correction = 0 and
    // eps = 9.9999999999999995E-7. Statistics are computed in f32.
    // Then computes %8760 * normalized + %8757 in f16 and flattens the result
    // to [4608,3072] for the following matmul.
    %int6_12190 = torch.constant.int 6
    %8761 = torch.prims.convert_element_type %8741, %int6_12190 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_12191 = torch.constant.int 2
    %8762 = torch.prim.ListConstruct %int2_12191 : (!torch.int) -> !torch.list<int>
    %int0_12192 = torch.constant.int 0
    %true_12193 = torch.constant.bool true
    // result0 = variance, result1 = mean; keepdim = true gives [1,4608,1].
    %result0_12194, %result1_12195 = torch.aten.var_mean.correction %8761, %8762, %int0_12192, %true_12193 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    %float9.999990e-07_12196 = torch.constant.float 9.9999999999999995E-7
    %int1_12197 = torch.constant.int 1
    %8763 = torch.aten.add.Scalar %result0_12194, %float9.999990e-07_12196, %int1_12197 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %8764 = torch.aten.rsqrt %8763 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    %int1_12198 = torch.constant.int 1
    // Note: the subtraction takes the f16 tensor %8741 (not the f32 copy
    // %8761) together with the f32 mean; the declared result type is f32.
    %8765 = torch.aten.sub.Tensor %8741, %result1_12195, %int1_12198 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %8766 = torch.aten.mul.Tensor %8765, %8764 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_12199 = torch.constant.int 5
    %8767 = torch.prims.convert_element_type %8766, %int5_12199 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Modulation: multiply by %8760 (= chunk 1 + 1), then add %8757 (chunk 0);
    // both are [1,1,3072] and broadcast over the 4608 rows.
    %8768 = torch.aten.mul.Tensor %8760, %8767 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_12200 = torch.constant.int 1
    %8769 = torch.aten.add.Tensor %8768, %8757, %int1_12200 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Drop the leading size-1 dim: [1,4608,3072] -> [4608,3072].
    %int4608_12201 = torch.constant.int 4608
    %int3072_12202 = torch.constant.int 3072
    %8770 = torch.prim.ListConstruct %int4608_12201, %int3072_12202 : (!torch.int, !torch.int) -> !torch.list<int>
    %8771 = torch.aten.view %8769, %8770 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // ---- single_blocks.17: linear1 -----------------------------------------
    // Projects %8771 ([4608,3072]) to 21504 features with
    // single_blocks.17.linear1.{weight,bias}. The matmul and bias add run in
    // f32; the result is cast to f16 and reshaped to [1,4608,21504].
    %__auto.sampler.single_blocks.17.linear1.weight = util.global.load @__auto.sampler.single_blocks.17.linear1.weight : tensor<21504x3072xf16>
    %8772 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.17.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_12203 = torch.constant.int 0
    %int1_12204 = torch.constant.int 1
    // Weight is stored [out, in]; transpose to [in, out] for the matmul.
    %8773 = torch.aten.transpose.int %8772, %int0_12203, %int1_12204 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.17.linear1.bias = util.global.load @__auto.sampler.single_blocks.17.linear1.bias : tensor<21504xf16>
    %8774 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.17.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Upcast bias, input and weight to f32 (dtype code 6, per result types).
    %int6_12205 = torch.constant.int 6
    %8775 = torch.prims.convert_element_type %8774, %int6_12205 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_12206 = torch.constant.int 6
    %8776 = torch.prims.convert_element_type %8771, %int6_12206 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_12207 = torch.constant.int 6
    %8777 = torch.prims.convert_element_type %8773, %int6_12207 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %8778 = torch.aten.mm %8776, %8777 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Multiplications by the integer constant 1 leave the values unchanged.
    %int1_12208 = torch.constant.int 1
    %8779 = torch.aten.mul.Scalar %8778, %int1_12208 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_12209 = torch.constant.int 1
    %8780 = torch.aten.mul.Scalar %8775, %int1_12209 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_12210 = torch.constant.int 1
    %8781 = torch.aten.add.Tensor %8779, %8780, %int1_12210 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    // Back to f16 (dtype code 5) and restore the leading size-1 dim.
    %int5_12211 = torch.constant.int 5
    %8782 = torch.prims.convert_element_type %8781, %int5_12211 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_12212 = torch.constant.int 1
    %int4608_12213 = torch.constant.int 4608
    %int21504_12214 = torch.constant.int 21504
    %8783 = torch.prim.ListConstruct %int1_12212, %int4608_12213, %int21504_12214 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8784 = torch.aten.view %8782, %8783 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // ---- single_blocks.17: split linear1 output ----------------------------
    // Splits %8784 along its last dim:
    //   %8785 = columns [0, 9216)      -> reshaped into three [1,24,4608,128]
    //                                     tensors %8791 / %8792 / %8793
    //   %8786 = columns [9216, 21504)  -> 12288-wide branch fed to GELU later
    // %8791 and %8792 are normalized with query_norm / key_norm below, and
    // %8793 is passed as the third tensor operand of the attention operator.
    %int-1_12215 = torch.constant.int -1
    %int0_12216 = torch.constant.int 0
    %int9216_12217 = torch.constant.int 9216
    %int1_12218 = torch.constant.int 1
    %8785 = torch.aten.slice.Tensor %8784, %int-1_12215, %int0_12216, %int9216_12217, %int1_12218 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_12219 = torch.constant.int -1
    %int9216_12220 = torch.constant.int 9216
    %int21504_12221 = torch.constant.int 21504
    %int1_12222 = torch.constant.int 1
    %8786 = torch.aten.slice.Tensor %8784, %int-1_12219, %int9216_12220, %int21504_12221, %int1_12222 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // Unpack the 9216 columns as (3, 24, 128) -- presumably
    // (q/k/v, heads, head_dim); confirm against the model definition.
    %int1_12223 = torch.constant.int 1
    %int4608_12224 = torch.constant.int 4608
    %int3_12225 = torch.constant.int 3
    %int24_12226 = torch.constant.int 24
    %int128_12227 = torch.constant.int 128
    %8787 = torch.prim.ListConstruct %int1_12223, %int4608_12224, %int3_12225, %int24_12226, %int128_12227 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8788 = torch.aten.view %8785, %8787 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2,0,3,1,4): [1,4608,3,24,128] -> [3,1,24,4608,128],
    // moving the size-3 axis to the front so it can be indexed with select.
    %int2_12228 = torch.constant.int 2
    %int0_12229 = torch.constant.int 0
    %int3_12230 = torch.constant.int 3
    %int1_12231 = torch.constant.int 1
    %int4_12232 = torch.constant.int 4
    %8789 = torch.prim.ListConstruct %int2_12228, %int0_12229, %int3_12230, %int1_12231, %int4_12232 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8790 = torch.aten.permute %8788, %8789 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0 along dim 0.
    %int0_12233 = torch.constant.int 0
    %int0_12234 = torch.constant.int 0
    %8791 = torch.aten.select.int %8790, %int0_12233, %int0_12234 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 1 along dim 0.
    %int0_12235 = torch.constant.int 0
    %int1_12236 = torch.constant.int 1
    %8792 = torch.aten.select.int %8790, %int0_12235, %int1_12236 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 2 along dim 0.
    %int0_12237 = torch.constant.int 0
    %int2_12238 = torch.constant.int 2
    %8793 = torch.aten.select.int %8790, %int0_12237, %int2_12238 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- single_blocks.17: query normalization -----------------------------
    // Root-mean-square normalization of %8791 over its last dim (size 128):
    //   x * rsqrt(mean(x^2) + eps), eps = 9.9999999999999995E-7, in f32.
    // No mean is subtracted. The result is cast to f16 and multiplied by the
    // [128] parameter single_blocks.17.norm.query_norm.scale.
    %int6_12239 = torch.constant.int 6
    %8794 = torch.prims.convert_element_type %8791, %int6_12239 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_12240 = torch.constant.int 2
    %8795 = torch.aten.pow.Tensor_Scalar %8794, %int2_12240 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean of squares over dim -1 with keepdim = true -> [1,24,4608,1].
    %int-1_12241 = torch.constant.int -1
    %8796 = torch.prim.ListConstruct %int-1_12241 : (!torch.int) -> !torch.list<int>
    %true_12242 = torch.constant.bool true
    %none_12243 = torch.constant.none
    %8797 = torch.aten.mean.dim %8795, %8796, %true_12242, %none_12243 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_12244 = torch.constant.float 9.9999999999999995E-7
    %int1_12245 = torch.constant.int 1
    %8798 = torch.aten.add.Scalar %8797, %float9.999990e-07_12244, %int1_12245 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %8799 = torch.aten.rsqrt %8798 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %8800 = torch.aten.mul.Tensor %8794, %8799 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_12246 = torch.constant.int 5
    %8801 = torch.prims.convert_element_type %8800, %int5_12246 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Learned scale, broadcast over the leading dims; applied in f16.
    %__auto.sampler.single_blocks.17.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.17.norm.query_norm.scale : tensor<128xf16>
    %8802 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.17.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %8803 = torch.aten.mul.Tensor %8801, %8802 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- single_blocks.17: key normalization -------------------------------
    // Same computation as the query normalization, applied to %8792:
    //   x * rsqrt(mean(x^2) + eps) over the last dim (size 128), in f32,
    // then cast to f16 and multiplied by the [128] parameter
    // single_blocks.17.norm.key_norm.scale.
    %int6_12247 = torch.constant.int 6
    %8804 = torch.prims.convert_element_type %8792, %int6_12247 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_12248 = torch.constant.int 2
    %8805 = torch.aten.pow.Tensor_Scalar %8804, %int2_12248 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean of squares over dim -1 with keepdim = true -> [1,24,4608,1].
    %int-1_12249 = torch.constant.int -1
    %8806 = torch.prim.ListConstruct %int-1_12249 : (!torch.int) -> !torch.list<int>
    %true_12250 = torch.constant.bool true
    %none_12251 = torch.constant.none
    %8807 = torch.aten.mean.dim %8805, %8806, %true_12250, %none_12251 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_12252 = torch.constant.float 9.9999999999999995E-7
    %int1_12253 = torch.constant.int 1
    %8808 = torch.aten.add.Scalar %8807, %float9.999990e-07_12252, %int1_12253 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %8809 = torch.aten.rsqrt %8808 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %8810 = torch.aten.mul.Tensor %8804, %8809 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_12254 = torch.constant.int 5
    %8811 = torch.prims.convert_element_type %8810, %int5_12254 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Learned scale, broadcast over the leading dims; applied in f16.
    %__auto.sampler.single_blocks.17.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.17.norm.key_norm.scale : tensor<128xf16>
    %8812 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.17.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %8813 = torch.aten.mul.Tensor %8811, %8812 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- single_blocks.17: combine q/k with the table %211 -----------------
    // %211 has type [1,1,4608,64,2,2] f32 and is defined earlier in the
    // function (presumably the precomputed rotary position-embedding table --
    // confirm at its definition). For each of the normalized q (%8803) and
    // k (%8813), the last dim of 128 is viewed as 64 pairs and
    //   out = %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]
    // is computed in f32, then flattened back to [1,24,4608,128] and cast to
    // f16. Outputs: %8838 (from q) and %8841 (from k).
    //
    // The two f16 -> f16 conversions below do not change the element type.
    %int5_12255 = torch.constant.int 5
    %8814 = torch.prims.convert_element_type %8803, %int5_12255 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_12256 = torch.constant.int 5
    %8815 = torch.prims.convert_element_type %8813, %int5_12256 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // q: upcast to f32 and view as [1,24,4608,64,1,2].
    %int6_12257 = torch.constant.int 6
    %8816 = torch.prims.convert_element_type %8814, %int6_12257 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_12258 = torch.constant.int 1
    %int24_12259 = torch.constant.int 24
    %int4608_12260 = torch.constant.int 4608
    %int64_12261 = torch.constant.int 64
    %int1_12262 = torch.constant.int 1
    %int2_12263 = torch.constant.int 2
    %8817 = torch.prim.ListConstruct %int1_12258, %int24_12259, %int4608_12260, %int64_12261, %int1_12262, %int2_12263 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8818 = torch.aten.view %8816, %8817 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // k: upcast to f32 and view as [1,24,4608,64,1,2].
    %int6_12264 = torch.constant.int 6
    %8819 = torch.prims.convert_element_type %8815, %int6_12264 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_12265 = torch.constant.int 1
    %int24_12266 = torch.constant.int 24
    %int4608_12267 = torch.constant.int 4608
    %int64_12268 = torch.constant.int 64
    %int1_12269 = torch.constant.int 1
    %int2_12270 = torch.constant.int 2
    %8820 = torch.prim.ListConstruct %int1_12265, %int24_12266, %int4608_12267, %int64_12268, %int1_12269, %int2_12270 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8821 = torch.aten.view %8819, %8820 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // q, first term: %211[..., 0] * q[..., 0]. The size-1 dims broadcast to
    // [1,24,4608,64,2].
    %int5_12271 = torch.constant.int 5
    %int0_12272 = torch.constant.int 0
    %8822 = torch.aten.select.int %211, %int5_12271, %int0_12272 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12273 = torch.constant.int 5
    %int0_12274 = torch.constant.int 0
    %8823 = torch.aten.select.int %8818, %int5_12273, %int0_12274 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8824 = torch.aten.mul.Tensor %8822, %8823 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // q, second term: %211[..., 1] * q[..., 1].
    %int5_12275 = torch.constant.int 5
    %int1_12276 = torch.constant.int 1
    %8825 = torch.aten.select.int %211, %int5_12275, %int1_12276 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12277 = torch.constant.int 5
    %int1_12278 = torch.constant.int 1
    %8826 = torch.aten.select.int %8818, %int5_12277, %int1_12278 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8827 = torch.aten.mul.Tensor %8825, %8826 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // q: sum of the two terms.
    %int1_12279 = torch.constant.int 1
    %8828 = torch.aten.add.Tensor %8824, %8827, %int1_12279 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k, first term: %211[..., 0] * k[..., 0].
    %int5_12280 = torch.constant.int 5
    %int0_12281 = torch.constant.int 0
    %8829 = torch.aten.select.int %211, %int5_12280, %int0_12281 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12282 = torch.constant.int 5
    %int0_12283 = torch.constant.int 0
    %8830 = torch.aten.select.int %8821, %int5_12282, %int0_12283 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8831 = torch.aten.mul.Tensor %8829, %8830 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k, second term: %211[..., 1] * k[..., 1].
    %int5_12284 = torch.constant.int 5
    %int1_12285 = torch.constant.int 1
    %8832 = torch.aten.select.int %211, %int5_12284, %int1_12285 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12286 = torch.constant.int 5
    %int1_12287 = torch.constant.int 1
    %8833 = torch.aten.select.int %8821, %int5_12286, %int1_12287 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8834 = torch.aten.mul.Tensor %8832, %8833 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k: sum of the two terms.
    %int1_12288 = torch.constant.int 1
    %8835 = torch.aten.add.Tensor %8831, %8834, %int1_12288 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // q: flatten the (64, 2) pairs back to 128 and cast to f16.
    %int1_12289 = torch.constant.int 1
    %int24_12290 = torch.constant.int 24
    %int4608_12291 = torch.constant.int 4608
    %int128_12292 = torch.constant.int 128
    %8836 = torch.prim.ListConstruct %int1_12289, %int24_12290, %int4608_12291, %int128_12292 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8837 = torch.aten.view %8828, %8836 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_12293 = torch.constant.int 5
    %8838 = torch.prims.convert_element_type %8837, %int5_12293 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // k: flatten the (64, 2) pairs back to 128 and cast to f16.
    %int1_12294 = torch.constant.int 1
    %int24_12295 = torch.constant.int 24
    %int4608_12296 = torch.constant.int 4608
    %int128_12297 = torch.constant.int 128
    %8839 = torch.prim.ListConstruct %int1_12294, %int24_12295, %int4608_12296, %int128_12297 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8840 = torch.aten.view %8835, %8839 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_12298 = torch.constant.int 5
    %8841 = torch.prims.convert_element_type %8840, %int5_12298 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- single_blocks.17: attention ---------------------------------------
    // Calls the generic torch.operator
    // "torch.aten._scaled_dot_product_flash_attention_for_cpu" with tensor
    // operands %8838, %8841 and %8793 (all [1,24,4608,128] f16), followed by
    // 0.0, false, none, none. Per the PyTorch op signature these trailing
    // operands are presumably dropout_p, is_causal, attn_mask and scale --
    // confirm against the PyTorch version used for export.
    // Result #0 ([1,24,4608,128] f16) is consumed below; result #1
    // ([1,24,4608] f32) is not referenced in the visible part of the file.
    %float0.000000e00_12299 = torch.constant.float 0.000000e+00
    %false_12300 = torch.constant.bool false
    %none_12301 = torch.constant.none
    %none_12302 = torch.constant.none
    %8842:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%8838, %8841, %8793, %float0.000000e00_12299, %false_12300, %none_12301, %none_12302) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Permute with order (0,2,1,3): [1,24,4608,128] -> [1,4608,24,128].
    %int0_12303 = torch.constant.int 0
    %int2_12304 = torch.constant.int 2
    %int1_12305 = torch.constant.int 1
    %int3_12306 = torch.constant.int 3
    %8843 = torch.prim.ListConstruct %int0_12303, %int2_12304, %int1_12305, %int3_12306 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8844 = torch.aten.permute %8842#0, %8843 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the trailing (24, 128) dims into 3072: -> [1,4608,3072].
    %int1_12307 = torch.constant.int 1
    %int4608_12308 = torch.constant.int 4608
    %int3072_12309 = torch.constant.int 3072
    %8845 = torch.prim.ListConstruct %int1_12307, %int4608_12308, %int3072_12309 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8846 = torch.aten.view %8844, %8845 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // ---- single_blocks.17: GELU branch, linear2, gated residual ------------
    // 1. Applies GELU (approximate = "tanh") to %8786 ([1,4608,12288]).
    // 2. Concatenates the attention output %8846 ([1,4608,3072]) with the GELU
    //    output along dim 2 -> [1,4608,15360].
    // 3. Projects back to 3072 features with
    //    single_blocks.17.linear2.{weight,bias}; matmul and bias add in f32.
    // 4. Multiplies by %8759 (chunk 2 of the modulation output) and adds the
    //    result to the stage input %8741, producing %8866.
    %str_12310 = torch.constant.str "tanh"
    %8847 = torch.aten.gelu %8786, %str_12310 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    // Concatenation order is (attention output, GELU output).
    %8848 = torch.prim.ListConstruct %8846, %8847 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_12311 = torch.constant.int 2
    %8849 = torch.aten.cat %8848, %int2_12311 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Drop the leading size-1 dim for the 2-D matmul.
    %int4608_12312 = torch.constant.int 4608
    %int15360_12313 = torch.constant.int 15360
    %8850 = torch.prim.ListConstruct %int4608_12312, %int15360_12313 : (!torch.int, !torch.int) -> !torch.list<int>
    %8851 = torch.aten.view %8849, %8850 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    %__auto.sampler.single_blocks.17.linear2.weight = util.global.load @__auto.sampler.single_blocks.17.linear2.weight : tensor<3072x15360xf16>
    %8852 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.17.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_12314 = torch.constant.int 0
    %int1_12315 = torch.constant.int 1
    // Weight is stored [out, in]; transpose to [in, out] for the matmul.
    %8853 = torch.aten.transpose.int %8852, %int0_12314, %int1_12315 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.17.linear2.bias = util.global.load @__auto.sampler.single_blocks.17.linear2.bias : tensor<3072xf16>
    %8854 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.17.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, input and weight to f32 (dtype code 6, per result types).
    %int6_12316 = torch.constant.int 6
    %8855 = torch.prims.convert_element_type %8854, %int6_12316 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_12317 = torch.constant.int 6
    %8856 = torch.prims.convert_element_type %8851, %int6_12317 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_12318 = torch.constant.int 6
    %8857 = torch.prims.convert_element_type %8853, %int6_12318 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %8858 = torch.aten.mm %8856, %8857 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Multiplications by the integer constant 1 leave the values unchanged.
    %int1_12319 = torch.constant.int 1
    %8859 = torch.aten.mul.Scalar %8858, %int1_12319 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_12320 = torch.constant.int 1
    %8860 = torch.aten.mul.Scalar %8855, %int1_12320 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_12321 = torch.constant.int 1
    %8861 = torch.aten.add.Tensor %8859, %8860, %int1_12321 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    // Back to f16 (dtype code 5) and restore the leading size-1 dim.
    %int5_12322 = torch.constant.int 5
    %8862 = torch.prims.convert_element_type %8861, %int5_12322 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_12323 = torch.constant.int 1
    %int4608_12324 = torch.constant.int 4608
    %int3072_12325 = torch.constant.int 3072
    %8863 = torch.prim.ListConstruct %int1_12323, %int4608_12324, %int3072_12325 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8864 = torch.aten.view %8862, %8863 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Gate: %8759 ([1,1,3072]) broadcasts over the 4608 rows.
    %8865 = torch.aten.mul.Tensor %8759, %8864 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    // Residual add onto the stage input %8741 (alpha = 1).
    %int1_12326 = torch.constant.int 1
    %8866 = torch.aten.add.Tensor %8741, %8865, %int1_12326 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %8867 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.18.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.18.modulation.lin.weight : tensor<9216x3072xf16>
    %8868 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.18.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_12327 = torch.constant.int 0
    %int1_12328 = torch.constant.int 1
    %8869 = torch.aten.transpose.int %8868, %int0_12327, %int1_12328 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.18.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.18.modulation.lin.bias : tensor<9216xf16>
    %8870 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.18.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    %int6_12329 = torch.constant.int 6
    %8871 = torch.prims.convert_element_type %8870, %int6_12329 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_12330 = torch.constant.int 6
    %8872 = torch.prims.convert_element_type %8867, %int6_12330 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_12331 = torch.constant.int 6
    %8873 = torch.prims.convert_element_type %8869, %int6_12331 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %8874 = torch.aten.mm %8872, %8873 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    %int1_12332 = torch.constant.int 1
    %8875 = torch.aten.mul.Scalar %8874, %int1_12332 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_12333 = torch.constant.int 1
    %8876 = torch.aten.mul.Scalar %8871, %int1_12333 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_12334 = torch.constant.int 1
    %8877 = torch.aten.add.Tensor %8875, %8876, %int1_12334 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_12335 = torch.constant.int 5
    %8878 = torch.prims.convert_element_type %8877, %int5_12335 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    %int0_12336 = torch.constant.int 0
    %int0_12337 = torch.constant.int 0
    %int9223372036854775807_12338 = torch.constant.int 9223372036854775807
    %int1_12339 = torch.constant.int 1
    %8879 = torch.aten.slice.Tensor %8878, %int0_12336, %int0_12337, %int9223372036854775807_12338, %int1_12339 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    %int1_12340 = torch.constant.int 1
    %8880 = torch.aten.unsqueeze %8879, %int1_12340 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int2_12341 = torch.constant.int 2
    %int0_12342 = torch.constant.int 0
    %int9223372036854775807_12343 = torch.constant.int 9223372036854775807
    %int1_12344 = torch.constant.int 1
    %8881 = torch.aten.slice.Tensor %8880, %int2_12341, %int0_12342, %int9223372036854775807_12343, %int1_12344 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int-1_12345 = torch.constant.int -1
    %int0_12346 = torch.constant.int 0
    %int3072_12347 = torch.constant.int 3072
    %int1_12348 = torch.constant.int 1
    %8882 = torch.aten.slice.Tensor %8881, %int-1_12345, %int0_12346, %int3072_12347, %int1_12348 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_12349 = torch.constant.int -1
    %int3072_12350 = torch.constant.int 3072
    %int6144_12351 = torch.constant.int 6144
    %int1_12352 = torch.constant.int 1
    %8883 = torch.aten.slice.Tensor %8881, %int-1_12349, %int3072_12350, %int6144_12351, %int1_12352 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_12353 = torch.constant.int -1
    %int6144_12354 = torch.constant.int 6144
    %int9216_12355 = torch.constant.int 9216
    %int1_12356 = torch.constant.int 1
    %8884 = torch.aten.slice.Tensor %8881, %int-1_12353, %int6144_12354, %int9216_12355, %int1_12356 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int1_12357 = torch.constant.int 1
    %int1_12358 = torch.constant.int 1
    %8885 = torch.aten.add.Scalar %8883, %int1_12357, %int1_12358 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalise %8866 over its last dim (dim 2) without learned weights, then modulate:
    //   %8894 = %8885 * ((%8866 - mean) * rsqrt(var + 1e-6)) + %8882
    // The statistics are computed in f32 (dtype code 6) and the result is cast back to f16 (dtype code 5).
    %int6_12359 = torch.constant.int 6
    %8886 = torch.prims.convert_element_type %8866, %int6_12359 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_12360 = torch.constant.int 2
    %8887 = torch.prim.ListConstruct %int2_12360 : (!torch.int) -> !torch.list<int>
    %int0_12361 = torch.constant.int 0
    %true_12362 = torch.constant.bool true
    // var_mean with correction = 0 and keepdim = true; result0 is the variance, result1 the mean.
    %result0_12363, %result1_12364 = torch.aten.var_mean.correction %8886, %8887, %int0_12361, %true_12362 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    %float9.999990e-07_12365 = torch.constant.float 9.9999999999999995E-7
    %int1_12366 = torch.constant.int 1
    %8888 = torch.aten.add.Scalar %result0_12363, %float9.999990e-07_12365, %int1_12366 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %8889 = torch.aten.rsqrt %8888 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    %int1_12367 = torch.constant.int 1
    // The subtraction takes the f16 input and the f32 mean; the declared result type is f32.
    %8890 = torch.aten.sub.Tensor %8866, %result1_12364, %int1_12367 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %8891 = torch.aten.mul.Tensor %8890, %8889 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_12368 = torch.constant.int 5
    %8892 = torch.prims.convert_element_type %8891, %int5_12368 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Broadcast multiply by the [1,1,3072] factor %8885, then add %8882 (defined above this chunk;
    // presumably the [0:3072] slice of %8881 -- confirm).
    %8893 = torch.aten.mul.Tensor %8885, %8892 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_12369 = torch.constant.int 1
    %8894 = torch.aten.add.Tensor %8893, %8882, %int1_12369 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.18.linear1: y = x @ W^T + b, projecting 3072 -> 21504 features.
    // The input is flattened to 2-D, the matmul and bias add run in f32, and the result is cast
    // back to f16 and reshaped to [1,4608,21504].
    %int4608_12370 = torch.constant.int 4608
    %int3072_12371 = torch.constant.int 3072
    %8895 = torch.prim.ListConstruct %int4608_12370, %int3072_12371 : (!torch.int, !torch.int) -> !torch.list<int>
    %8896 = torch.aten.view %8894, %8895 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    %__auto.sampler.single_blocks.18.linear1.weight = util.global.load @__auto.sampler.single_blocks.18.linear1.weight : tensor<21504x3072xf16>
    %8897 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.18.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_12372 = torch.constant.int 0
    %int1_12373 = torch.constant.int 1
    // The weight is stored as [out_features, in_features]; transpose it to [3072,21504] for the mm.
    %8898 = torch.aten.transpose.int %8897, %int0_12372, %int1_12373 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.18.linear1.bias = util.global.load @__auto.sampler.single_blocks.18.linear1.bias : tensor<21504xf16>
    %8899 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.18.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    %int6_12374 = torch.constant.int 6
    %8900 = torch.prims.convert_element_type %8899, %int6_12374 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_12375 = torch.constant.int 6
    %8901 = torch.prims.convert_element_type %8896, %int6_12375 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_12376 = torch.constant.int 6
    %8902 = torch.prims.convert_element_type %8898, %int6_12376 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %8903 = torch.aten.mm %8901, %8902 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Both mul.Scalar ops multiply by the integer 1 (presumably the alpha/beta factors left over
    // from an addmm decomposition -- confirm against the exporter).
    %int1_12377 = torch.constant.int 1
    %8904 = torch.aten.mul.Scalar %8903, %int1_12377 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_12378 = torch.constant.int 1
    %8905 = torch.aten.mul.Scalar %8900, %int1_12378 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_12379 = torch.constant.int 1
    %8906 = torch.aten.add.Tensor %8904, %8905, %int1_12379 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_12380 = torch.constant.int 5
    %8907 = torch.prims.convert_element_type %8906, %int5_12380 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_12381 = torch.constant.int 1
    %int4608_12382 = torch.constant.int 4608
    %int21504_12383 = torch.constant.int 21504
    %8908 = torch.prim.ListConstruct %int1_12381, %int4608_12382, %int21504_12383 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8909 = torch.aten.view %8907, %8908 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the linear1 output along the last dim:
    //   %8910 = [..., 0:9216]      (reshaped below into q, k and v)
    //   %8911 = [..., 9216:21504]  (12288 features, fed to gelu later in this chunk)
    %int-1_12384 = torch.constant.int -1
    %int0_12385 = torch.constant.int 0
    %int9216_12386 = torch.constant.int 9216
    %int1_12387 = torch.constant.int 1
    %8910 = torch.aten.slice.Tensor %8909, %int-1_12384, %int0_12385, %int9216_12386, %int1_12387 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_12388 = torch.constant.int -1
    %int9216_12389 = torch.constant.int 9216
    %int21504_12390 = torch.constant.int 21504
    %int1_12391 = torch.constant.int 1
    %8911 = torch.aten.slice.Tensor %8909, %int-1_12388, %int9216_12389, %int21504_12390, %int1_12391 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View the 9216 features as [3, 24, 128] and permute with (2,0,3,1,4) to [3,1,24,4608,128].
    %int1_12392 = torch.constant.int 1
    %int4608_12393 = torch.constant.int 4608
    %int3_12394 = torch.constant.int 3
    %int24_12395 = torch.constant.int 24
    %int128_12396 = torch.constant.int 128
    %8912 = torch.prim.ListConstruct %int1_12392, %int4608_12393, %int3_12394, %int24_12395, %int128_12396 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8913 = torch.aten.view %8910, %8912 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    %int2_12397 = torch.constant.int 2
    %int0_12398 = torch.constant.int 0
    %int3_12399 = torch.constant.int 3
    %int1_12400 = torch.constant.int 1
    %int4_12401 = torch.constant.int 4
    %8914 = torch.prim.ListConstruct %int2_12397, %int0_12398, %int3_12399, %int1_12400, %int4_12401 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8915 = torch.aten.permute %8913, %8914 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index the leading dim: 0 -> %8916, 1 -> %8917, 2 -> %8918. Below, %8916 is scaled by
    // query_norm.scale, %8917 by key_norm.scale, and %8918 is passed as the third attention operand.
    %int0_12402 = torch.constant.int 0
    %int0_12403 = torch.constant.int 0
    %8916 = torch.aten.select.int %8915, %int0_12402, %int0_12403 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_12404 = torch.constant.int 0
    %int1_12405 = torch.constant.int 1
    %8917 = torch.aten.select.int %8915, %int0_12404, %int1_12405 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_12406 = torch.constant.int 0
    %int2_12407 = torch.constant.int 2
    %8918 = torch.aten.select.int %8915, %int0_12406, %int2_12407 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Root-mean-square normalisation of %8916 over the last dim (128), computed in f32:
    //   %8928 = f16(x * rsqrt(mean(x^2, dim=-1, keepdim) + 1e-6)) * query_norm.scale
    %int6_12408 = torch.constant.int 6
    %8919 = torch.prims.convert_element_type %8916, %int6_12408 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_12409 = torch.constant.int 2
    %8920 = torch.aten.pow.Tensor_Scalar %8919, %int2_12409 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_12410 = torch.constant.int -1
    %8921 = torch.prim.ListConstruct %int-1_12410 : (!torch.int) -> !torch.list<int>
    %true_12411 = torch.constant.bool true
    %none_12412 = torch.constant.none
    %8922 = torch.aten.mean.dim %8920, %8921, %true_12411, %none_12412 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_12413 = torch.constant.float 9.9999999999999995E-7
    %int1_12414 = torch.constant.int 1
    %8923 = torch.aten.add.Scalar %8922, %float9.999990e-07_12413, %int1_12414 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %8924 = torch.aten.rsqrt %8923 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %8925 = torch.aten.mul.Tensor %8919, %8924 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_12415 = torch.constant.int 5
    %8926 = torch.prims.convert_element_type %8925, %int5_12415 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // The learned scale of shape [128] is broadcast over the last dim.
    %__auto.sampler.single_blocks.18.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.18.norm.query_norm.scale : tensor<128xf16>
    %8927 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.18.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %8928 = torch.aten.mul.Tensor %8926, %8927 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // The same normalisation applied to %8917, scaled by key_norm.scale -> %8938.
    %int6_12416 = torch.constant.int 6
    %8929 = torch.prims.convert_element_type %8917, %int6_12416 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_12417 = torch.constant.int 2
    %8930 = torch.aten.pow.Tensor_Scalar %8929, %int2_12417 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_12418 = torch.constant.int -1
    %8931 = torch.prim.ListConstruct %int-1_12418 : (!torch.int) -> !torch.list<int>
    %true_12419 = torch.constant.bool true
    %none_12420 = torch.constant.none
    %8932 = torch.aten.mean.dim %8930, %8931, %true_12419, %none_12420 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_12421 = torch.constant.float 9.9999999999999995E-7
    %int1_12422 = torch.constant.int 1
    %8933 = torch.aten.add.Scalar %8932, %float9.999990e-07_12421, %int1_12422 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %8934 = torch.aten.rsqrt %8933 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %8935 = torch.aten.mul.Tensor %8929, %8934 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_12423 = torch.constant.int 5
    %8936 = torch.prims.convert_element_type %8935, %int5_12423 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.18.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.18.norm.key_norm.scale : tensor<128xf16>
    %8937 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.18.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %8938 = torch.aten.mul.Tensor %8936, %8937 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Combine the normalised tensors %8928 and %8938 with the f32 table %211 of shape
    // [1,1,4608,64,2,2] (defined outside this chunk; presumably the rotary position-embedding
    // table -- confirm). For each input x viewed as [1,24,4608,64,1,2]:
    //   out = %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]
    // The result is viewed back to [1,24,4608,128] and cast to f16.
    // The first two casts are f16 -> f16, so the element type is unchanged.
    %int5_12424 = torch.constant.int 5
    %8939 = torch.prims.convert_element_type %8928, %int5_12424 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_12425 = torch.constant.int 5
    %8940 = torch.prims.convert_element_type %8938, %int5_12425 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int6_12426 = torch.constant.int 6
    %8941 = torch.prims.convert_element_type %8939, %int6_12426 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_12427 = torch.constant.int 1
    %int24_12428 = torch.constant.int 24
    %int4608_12429 = torch.constant.int 4608
    %int64_12430 = torch.constant.int 64
    %int1_12431 = torch.constant.int 1
    %int2_12432 = torch.constant.int 2
    %8942 = torch.prim.ListConstruct %int1_12427, %int24_12428, %int4608_12429, %int64_12430, %int1_12431, %int2_12432 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8943 = torch.aten.view %8941, %8942 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    %int6_12433 = torch.constant.int 6
    %8944 = torch.prims.convert_element_type %8940, %int6_12433 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_12434 = torch.constant.int 1
    %int24_12435 = torch.constant.int 24
    %int4608_12436 = torch.constant.int 4608
    %int64_12437 = torch.constant.int 64
    %int1_12438 = torch.constant.int 1
    %int2_12439 = torch.constant.int 2
    %8945 = torch.prim.ListConstruct %int1_12434, %int24_12435, %int4608_12436, %int64_12437, %int1_12438, %int2_12439 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8946 = torch.aten.view %8944, %8945 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // First input (%8943, derived from %8928): the two products are summed into %8953.
    %int5_12440 = torch.constant.int 5
    %int0_12441 = torch.constant.int 0
    %8947 = torch.aten.select.int %211, %int5_12440, %int0_12441 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12442 = torch.constant.int 5
    %int0_12443 = torch.constant.int 0
    %8948 = torch.aten.select.int %8943, %int5_12442, %int0_12443 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8949 = torch.aten.mul.Tensor %8947, %8948 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_12444 = torch.constant.int 5
    %int1_12445 = torch.constant.int 1
    %8950 = torch.aten.select.int %211, %int5_12444, %int1_12445 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12446 = torch.constant.int 5
    %int1_12447 = torch.constant.int 1
    %8951 = torch.aten.select.int %8943, %int5_12446, %int1_12447 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8952 = torch.aten.mul.Tensor %8950, %8951 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_12448 = torch.constant.int 1
    %8953 = torch.aten.add.Tensor %8949, %8952, %int1_12448 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second input (%8946, derived from %8938): the same computation, summed into %8960.
    %int5_12449 = torch.constant.int 5
    %int0_12450 = torch.constant.int 0
    %8954 = torch.aten.select.int %211, %int5_12449, %int0_12450 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12451 = torch.constant.int 5
    %int0_12452 = torch.constant.int 0
    %8955 = torch.aten.select.int %8946, %int5_12451, %int0_12452 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8956 = torch.aten.mul.Tensor %8954, %8955 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_12453 = torch.constant.int 5
    %int1_12454 = torch.constant.int 1
    %8957 = torch.aten.select.int %211, %int5_12453, %int1_12454 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12455 = torch.constant.int 5
    %int1_12456 = torch.constant.int 1
    %8958 = torch.aten.select.int %8946, %int5_12455, %int1_12456 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8959 = torch.aten.mul.Tensor %8957, %8958 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_12457 = torch.constant.int 1
    %8960 = torch.aten.add.Tensor %8956, %8959, %int1_12457 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Merge the trailing [64,2] dims back into 128 and cast both results to f16.
    %int1_12458 = torch.constant.int 1
    %int24_12459 = torch.constant.int 24
    %int4608_12460 = torch.constant.int 4608
    %int128_12461 = torch.constant.int 128
    %8961 = torch.prim.ListConstruct %int1_12458, %int24_12459, %int4608_12460, %int128_12461 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8962 = torch.aten.view %8953, %8961 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_12462 = torch.constant.int 5
    %8963 = torch.prims.convert_element_type %8962, %int5_12462 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_12463 = torch.constant.int 1
    %int24_12464 = torch.constant.int 24
    %int4608_12465 = torch.constant.int 4608
    %int128_12466 = torch.constant.int 128
    %8964 = torch.prim.ListConstruct %int1_12463, %int24_12464, %int4608_12465, %int128_12466 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8965 = torch.aten.view %8960, %8964 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_12467 = torch.constant.int 5
    %8966 = torch.prims.convert_element_type %8965, %int5_12467 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled dot-product attention over (%8963, %8966, %8918), each [1,24,4608,128] f16.
    // By position in the PyTorch op schema the trailing operands are presumably dropout_p = 0.0,
    // is_causal = false, attn_mask = None and scale = None -- confirm against the PyTorch schema.
    // Only result #0 is consumed in this chunk.
    %float0.000000e00_12468 = torch.constant.float 0.000000e+00
    %false_12469 = torch.constant.bool false
    %none_12470 = torch.constant.none
    %none_12471 = torch.constant.none
    %8967:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%8963, %8966, %8918, %float0.000000e00_12468, %false_12469, %none_12470, %none_12471) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Permute with (0,2,1,3) to [1,4608,24,128], then merge the last two dims (24*128) into 3072.
    %int0_12472 = torch.constant.int 0
    %int2_12473 = torch.constant.int 2
    %int1_12474 = torch.constant.int 1
    %int3_12475 = torch.constant.int 3
    %8968 = torch.prim.ListConstruct %int0_12472, %int2_12473, %int1_12474, %int3_12475 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8969 = torch.aten.permute %8967#0, %8968 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_12476 = torch.constant.int 1
    %int4608_12477 = torch.constant.int 4608
    %int3072_12478 = torch.constant.int 3072
    %8970 = torch.prim.ListConstruct %int1_12476, %int4608_12477, %int3072_12478 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8971 = torch.aten.view %8969, %8970 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Apply gelu (approximate = "tanh") to %8911, the [9216:21504) slice of the linear1 output,
    // concatenate it after the attention output %8971 along dim 2 (3072 + 12288 = 15360),
    // and flatten to 2-D for the following matmul.
    %str_12479 = torch.constant.str "tanh"
    %8972 = torch.aten.gelu %8911, %str_12479 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %8973 = torch.prim.ListConstruct %8971, %8972 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_12480 = torch.constant.int 2
    %8974 = torch.aten.cat %8973, %int2_12480 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    %int4608_12481 = torch.constant.int 4608
    %int15360_12482 = torch.constant.int 15360
    %8975 = torch.prim.ListConstruct %int4608_12481, %int15360_12482 : (!torch.int, !torch.int) -> !torch.list<int>
    %8976 = torch.aten.view %8974, %8975 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // single_blocks.18.linear2: y = x @ W^T + b, projecting 15360 -> 3072 features.
    // Operands are cast to f32 for the matmul and bias add; the result is cast back to f16 and
    // reshaped to [1,4608,3072].
    %__auto.sampler.single_blocks.18.linear2.weight = util.global.load @__auto.sampler.single_blocks.18.linear2.weight : tensor<3072x15360xf16>
    %8977 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.18.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_12483 = torch.constant.int 0
    %int1_12484 = torch.constant.int 1
    %8978 = torch.aten.transpose.int %8977, %int0_12483, %int1_12484 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.18.linear2.bias = util.global.load @__auto.sampler.single_blocks.18.linear2.bias : tensor<3072xf16>
    %8979 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.18.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_12485 = torch.constant.int 6
    %8980 = torch.prims.convert_element_type %8979, %int6_12485 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_12486 = torch.constant.int 6
    %8981 = torch.prims.convert_element_type %8976, %int6_12486 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_12487 = torch.constant.int 6
    %8982 = torch.prims.convert_element_type %8978, %int6_12487 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %8983 = torch.aten.mm %8981, %8982 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Both mul.Scalar ops multiply by the integer 1 (presumably addmm alpha/beta residue -- confirm).
    %int1_12488 = torch.constant.int 1
    %8984 = torch.aten.mul.Scalar %8983, %int1_12488 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_12489 = torch.constant.int 1
    %8985 = torch.aten.mul.Scalar %8980, %int1_12489 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_12490 = torch.constant.int 1
    %8986 = torch.aten.add.Tensor %8984, %8985, %int1_12490 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_12491 = torch.constant.int 5
    %8987 = torch.prims.convert_element_type %8986, %int5_12491 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_12492 = torch.constant.int 1
    %int4608_12493 = torch.constant.int 4608
    %int3072_12494 = torch.constant.int 3072
    %8988 = torch.prim.ListConstruct %int1_12492, %int4608_12493, %int3072_12494 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8989 = torch.aten.view %8987, %8988 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Gated residual: %8991 = %8866 + %8884 * %8989, where %8884 is the [6144:9216) slice of %8881
    // broadcast over the 4608 positions and %8866 is the input that was normalised earlier.
    %8990 = torch.aten.mul.Tensor %8884, %8989 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_12495 = torch.constant.int 1
    %8991 = torch.aten.add.Tensor %8866, %8990, %int1_12495 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.19.modulation: silu(%119) followed by a linear layer 3072 -> 9216.
    // %119 is a [1,3072] f16 tensor defined outside this chunk (presumably the conditioning
    // vector -- confirm). The matmul and bias add run in f32 and the result is cast to f16.
    %8992 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.19.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.19.modulation.lin.weight : tensor<9216x3072xf16>
    %8993 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.19.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_12496 = torch.constant.int 0
    %int1_12497 = torch.constant.int 1
    %8994 = torch.aten.transpose.int %8993, %int0_12496, %int1_12497 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.19.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.19.modulation.lin.bias : tensor<9216xf16>
    %8995 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.19.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    %int6_12498 = torch.constant.int 6
    %8996 = torch.prims.convert_element_type %8995, %int6_12498 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_12499 = torch.constant.int 6
    %8997 = torch.prims.convert_element_type %8992, %int6_12499 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_12500 = torch.constant.int 6
    %8998 = torch.prims.convert_element_type %8994, %int6_12500 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %8999 = torch.aten.mm %8997, %8998 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Both mul.Scalar ops multiply by the integer 1 (presumably addmm alpha/beta residue -- confirm).
    %int1_12501 = torch.constant.int 1
    %9000 = torch.aten.mul.Scalar %8999, %int1_12501 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_12502 = torch.constant.int 1
    %9001 = torch.aten.mul.Scalar %8996, %int1_12502 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_12503 = torch.constant.int 1
    %9002 = torch.aten.add.Tensor %9000, %9001, %int1_12503 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_12504 = torch.constant.int 5
    %9003 = torch.prims.convert_element_type %9002, %int5_12504 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a middle dim of size 1 into the modulation output ([1,9216] -> [1,1,9216]) and split
    // the last dim into three 3072-wide pieces:
    //   %9007 = [0:3072], %9008 = [3072:6144], %9009 = [6144:9216]
    // The two slices ending at 9223372036854775807 (INT64_MAX) span the whole dim; their result
    // types equal their operand types.
    %int0_12505 = torch.constant.int 0
    %int0_12506 = torch.constant.int 0
    %int9223372036854775807_12507 = torch.constant.int 9223372036854775807
    %int1_12508 = torch.constant.int 1
    %9004 = torch.aten.slice.Tensor %9003, %int0_12505, %int0_12506, %int9223372036854775807_12507, %int1_12508 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    %int1_12509 = torch.constant.int 1
    %9005 = torch.aten.unsqueeze %9004, %int1_12509 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int2_12510 = torch.constant.int 2
    %int0_12511 = torch.constant.int 0
    %int9223372036854775807_12512 = torch.constant.int 9223372036854775807
    %int1_12513 = torch.constant.int 1
    %9006 = torch.aten.slice.Tensor %9005, %int2_12510, %int0_12511, %int9223372036854775807_12512, %int1_12513 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int-1_12514 = torch.constant.int -1
    %int0_12515 = torch.constant.int 0
    %int3072_12516 = torch.constant.int 3072
    %int1_12517 = torch.constant.int 1
    %9007 = torch.aten.slice.Tensor %9006, %int-1_12514, %int0_12515, %int3072_12516, %int1_12517 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_12518 = torch.constant.int -1
    %int3072_12519 = torch.constant.int 3072
    %int6144_12520 = torch.constant.int 6144
    %int1_12521 = torch.constant.int 1
    %9008 = torch.aten.slice.Tensor %9006, %int-1_12518, %int3072_12519, %int6144_12520, %int1_12521 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_12522 = torch.constant.int -1
    %int6144_12523 = torch.constant.int 6144
    %int9216_12524 = torch.constant.int 9216
    %int1_12525 = torch.constant.int 1
    %9009 = torch.aten.slice.Tensor %9006, %int-1_12522, %int6144_12523, %int9216_12524, %int1_12525 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int1_12526 = torch.constant.int 1
    %int1_12527 = torch.constant.int 1
    // %9010 = %9008 + 1; it is used below as the multiplicative factor on the normalised activations.
    %9010 = torch.aten.add.Scalar %9008, %int1_12526, %int1_12527 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalise %8991 over its last dim (dim 2) without learned weights, then modulate:
    //   %9019 = %9010 * ((%8991 - mean) * rsqrt(var + 1e-6)) + %9007
    // The statistics are computed in f32 (dtype code 6) and the result is cast back to f16 (dtype code 5).
    %int6_12528 = torch.constant.int 6
    %9011 = torch.prims.convert_element_type %8991, %int6_12528 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_12529 = torch.constant.int 2
    %9012 = torch.prim.ListConstruct %int2_12529 : (!torch.int) -> !torch.list<int>
    %int0_12530 = torch.constant.int 0
    %true_12531 = torch.constant.bool true
    // var_mean with correction = 0 and keepdim = true; result0 is the variance, result1 the mean.
    %result0_12532, %result1_12533 = torch.aten.var_mean.correction %9011, %9012, %int0_12530, %true_12531 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    %float9.999990e-07_12534 = torch.constant.float 9.9999999999999995E-7
    %int1_12535 = torch.constant.int 1
    %9013 = torch.aten.add.Scalar %result0_12532, %float9.999990e-07_12534, %int1_12535 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %9014 = torch.aten.rsqrt %9013 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    %int1_12536 = torch.constant.int 1
    %9015 = torch.aten.sub.Tensor %8991, %result1_12533, %int1_12536 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %9016 = torch.aten.mul.Tensor %9015, %9014 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_12537 = torch.constant.int 5
    %9017 = torch.prims.convert_element_type %9016, %int5_12537 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Broadcast multiply by the [1,1,3072] factor %9010, then add the first chunk %9007.
    %9018 = torch.aten.mul.Tensor %9010, %9017 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_12538 = torch.constant.int 1
    %9019 = torch.aten.add.Tensor %9018, %9007, %int1_12538 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.19.linear1: y = x @ W^T + b, projecting 3072 -> 21504 features.
    // The input is flattened to 2-D, the matmul and bias add run in f32, and the result is cast
    // back to f16 and reshaped to [1,4608,21504].
    %int4608_12539 = torch.constant.int 4608
    %int3072_12540 = torch.constant.int 3072
    %9020 = torch.prim.ListConstruct %int4608_12539, %int3072_12540 : (!torch.int, !torch.int) -> !torch.list<int>
    %9021 = torch.aten.view %9019, %9020 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    %__auto.sampler.single_blocks.19.linear1.weight = util.global.load @__auto.sampler.single_blocks.19.linear1.weight : tensor<21504x3072xf16>
    %9022 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.19.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_12541 = torch.constant.int 0
    %int1_12542 = torch.constant.int 1
    // The weight is stored as [out_features, in_features]; transpose it to [3072,21504] for the mm.
    %9023 = torch.aten.transpose.int %9022, %int0_12541, %int1_12542 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.19.linear1.bias = util.global.load @__auto.sampler.single_blocks.19.linear1.bias : tensor<21504xf16>
    %9024 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.19.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    %int6_12543 = torch.constant.int 6
    %9025 = torch.prims.convert_element_type %9024, %int6_12543 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_12544 = torch.constant.int 6
    %9026 = torch.prims.convert_element_type %9021, %int6_12544 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_12545 = torch.constant.int 6
    %9027 = torch.prims.convert_element_type %9023, %int6_12545 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %9028 = torch.aten.mm %9026, %9027 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Both mul.Scalar ops multiply by the integer 1 (presumably addmm alpha/beta residue -- confirm).
    %int1_12546 = torch.constant.int 1
    %9029 = torch.aten.mul.Scalar %9028, %int1_12546 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_12547 = torch.constant.int 1
    %9030 = torch.aten.mul.Scalar %9025, %int1_12547 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_12548 = torch.constant.int 1
    %9031 = torch.aten.add.Tensor %9029, %9030, %int1_12548 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_12549 = torch.constant.int 5
    %9032 = torch.prims.convert_element_type %9031, %int5_12549 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_12550 = torch.constant.int 1
    %int4608_12551 = torch.constant.int 4608
    %int21504_12552 = torch.constant.int 21504
    %9033 = torch.prim.ListConstruct %int1_12550, %int4608_12551, %int21504_12552 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9034 = torch.aten.view %9032, %9033 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the 21504-wide projection %9034 along its last dimension:
    //   columns [0, 9216)     -> %9035, reshaped below into three 24x128 groups
    //   columns [9216, 21504) -> %9036 (12288 wide), later fed to the GELU.
    %int-1_12553 = torch.constant.int -1
    %int0_12554 = torch.constant.int 0
    %int9216_12555 = torch.constant.int 9216
    %int1_12556 = torch.constant.int 1
    %9035 = torch.aten.slice.Tensor %9034, %int-1_12553, %int0_12554, %int9216_12555, %int1_12556 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_12557 = torch.constant.int -1
    %int9216_12558 = torch.constant.int 9216
    %int21504_12559 = torch.constant.int 21504
    %int1_12560 = torch.constant.int 1
    %9036 = torch.aten.slice.Tensor %9034, %int-1_12557, %int9216_12558, %int21504_12559, %int1_12560 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View the 9216 columns as [3, 24, 128] (9216 = 3 * 24 * 128).
    %int1_12561 = torch.constant.int 1
    %int4608_12562 = torch.constant.int 4608
    %int3_12563 = torch.constant.int 3
    %int24_12564 = torch.constant.int 24
    %int128_12565 = torch.constant.int 128
    %9037 = torch.prim.ListConstruct %int1_12561, %int4608_12562, %int3_12563, %int24_12564, %int128_12565 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9038 = torch.aten.view %9035, %9037 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2, 0, 3, 1, 4): [1,4608,3,24,128] -> [3,1,24,4608,128],
    // moving the size-3 axis to the front.
    %int2_12566 = torch.constant.int 2
    %int0_12567 = torch.constant.int 0
    %int3_12568 = torch.constant.int 3
    %int1_12569 = torch.constant.int 1
    %int4_12570 = torch.constant.int 4
    %9039 = torch.prim.ListConstruct %int2_12566, %int0_12567, %int3_12568, %int1_12569, %int4_12570 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9040 = torch.aten.permute %9038, %9039 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index the leading axis. Based on how the results are consumed below:
    //   index 0 -> %9041, scaled by query_norm.scale (query)
    //   index 1 -> %9042, scaled by key_norm.scale   (key)
    //   index 2 -> %9043, third operand of the attention op (value)
    %int0_12571 = torch.constant.int 0
    %int0_12572 = torch.constant.int 0
    %9041 = torch.aten.select.int %9040, %int0_12571, %int0_12572 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_12573 = torch.constant.int 0
    %int1_12574 = torch.constant.int 1
    %9042 = torch.aten.select.int %9040, %int0_12573, %int1_12574 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_12575 = torch.constant.int 0
    %int2_12576 = torch.constant.int 2
    %9043 = torch.aten.select.int %9040, %int0_12575, %int2_12576 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // RMS-normalize %9041 over its last (128-wide) dimension, in f32:
    //   y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + 1e-6)
    %int6_12577 = torch.constant.int 6
    %9044 = torch.prims.convert_element_type %9041, %int6_12577 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_12578 = torch.constant.int 2
    %9045 = torch.aten.pow.Tensor_Scalar %9044, %int2_12578 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_12579 = torch.constant.int -1
    %9046 = torch.prim.ListConstruct %int-1_12579 : (!torch.int) -> !torch.list<int>
    %true_12580 = torch.constant.bool true
    %none_12581 = torch.constant.none
    %9047 = torch.aten.mean.dim %9045, %9046, %true_12580, %none_12581 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // Epsilon of ~1e-6 is added before the reciprocal square root.
    %float9.999990e-07_12582 = torch.constant.float 9.9999999999999995E-7
    %int1_12583 = torch.constant.int 1
    %9048 = torch.aten.add.Scalar %9047, %float9.999990e-07_12582, %int1_12583 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %9049 = torch.aten.rsqrt %9048 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %9050 = torch.aten.mul.Tensor %9044, %9049 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    // Narrow to f16, then apply the [128] query_norm.scale parameter, which
    // broadcasts over the last dimension.
    %int5_12584 = torch.constant.int 5
    %9051 = torch.prims.convert_element_type %9050, %int5_12584 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.19.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.19.norm.query_norm.scale : tensor<128xf16>
    %9052 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.19.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %9053 = torch.aten.mul.Tensor %9051, %9052 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Same RMS normalization as the query path, applied to %9042:
    //   y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + 1e-6), computed in f32.
    %int6_12585 = torch.constant.int 6
    %9054 = torch.prims.convert_element_type %9042, %int6_12585 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_12586 = torch.constant.int 2
    %9055 = torch.aten.pow.Tensor_Scalar %9054, %int2_12586 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_12587 = torch.constant.int -1
    %9056 = torch.prim.ListConstruct %int-1_12587 : (!torch.int) -> !torch.list<int>
    %true_12588 = torch.constant.bool true
    %none_12589 = torch.constant.none
    %9057 = torch.aten.mean.dim %9055, %9056, %true_12588, %none_12589 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_12590 = torch.constant.float 9.9999999999999995E-7
    %int1_12591 = torch.constant.int 1
    %9058 = torch.aten.add.Scalar %9057, %float9.999990e-07_12590, %int1_12591 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %9059 = torch.aten.rsqrt %9058 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %9060 = torch.aten.mul.Tensor %9054, %9059 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    // Narrow to f16 and apply the [128] key_norm.scale parameter.
    %int5_12592 = torch.constant.int 5
    %9061 = torch.prims.convert_element_type %9060, %int5_12592 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.19.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.19.norm.key_norm.scale : tensor<128xf16>
    %9062 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.19.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %9063 = torch.aten.mul.Tensor %9061, %9062 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Combine the normalized query (%9053) and key (%9063) with the 2x2
    // coefficient table %211 ([1,1,4608,64,2,2], f32), which is defined outside
    // this chunk — presumably the rotary position-embedding table; TODO confirm.
    //
    // The two casts below are f16 -> f16 and therefore do not change the data.
    %int5_12593 = torch.constant.int 5
    %9064 = torch.prims.convert_element_type %9053, %int5_12593 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_12594 = torch.constant.int 5
    %9065 = torch.prims.convert_element_type %9063, %int5_12594 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Widen the query to f32 and view the 128 features as 64 pairs:
    // [1,24,4608,128] -> [1,24,4608,64,1,2].
    %int6_12595 = torch.constant.int 6
    %9066 = torch.prims.convert_element_type %9064, %int6_12595 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_12596 = torch.constant.int 1
    %int24_12597 = torch.constant.int 24
    %int4608_12598 = torch.constant.int 4608
    %int64_12599 = torch.constant.int 64
    %int1_12600 = torch.constant.int 1
    %int2_12601 = torch.constant.int 2
    %9067 = torch.prim.ListConstruct %int1_12596, %int24_12597, %int4608_12598, %int64_12599, %int1_12600, %int2_12601 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9068 = torch.aten.view %9066, %9067 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Same widening and pair view for the key.
    %int6_12602 = torch.constant.int 6
    %9069 = torch.prims.convert_element_type %9065, %int6_12602 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_12603 = torch.constant.int 1
    %int24_12604 = torch.constant.int 24
    %int4608_12605 = torch.constant.int 4608
    %int64_12606 = torch.constant.int 64
    %int1_12607 = torch.constant.int 1
    %int2_12608 = torch.constant.int 2
    %9070 = torch.prim.ListConstruct %int1_12603, %int24_12604, %int4608_12605, %int64_12606, %int1_12607, %int2_12608 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9071 = torch.aten.view %9069, %9070 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Query: out = %211[..., 0] * q[..., 0] + %211[..., 1] * q[..., 1].
    // The size-1 head axis of %211 broadcasts over the 24 heads, and the
    // trailing size-1 axis of the selected q element broadcasts over the
    // remaining size-2 axis of %211.
    %int5_12609 = torch.constant.int 5
    %int0_12610 = torch.constant.int 0
    %9072 = torch.aten.select.int %211, %int5_12609, %int0_12610 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12611 = torch.constant.int 5
    %int0_12612 = torch.constant.int 0
    %9073 = torch.aten.select.int %9068, %int5_12611, %int0_12612 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9074 = torch.aten.mul.Tensor %9072, %9073 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_12613 = torch.constant.int 5
    %int1_12614 = torch.constant.int 1
    %9075 = torch.aten.select.int %211, %int5_12613, %int1_12614 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12615 = torch.constant.int 5
    %int1_12616 = torch.constant.int 1
    %9076 = torch.aten.select.int %9068, %int5_12615, %int1_12616 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9077 = torch.aten.mul.Tensor %9075, %9076 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_12617 = torch.constant.int 1
    %9078 = torch.aten.add.Tensor %9074, %9077, %int1_12617 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key: identical combination, using %9071 in place of %9068.
    %int5_12618 = torch.constant.int 5
    %int0_12619 = torch.constant.int 0
    %9079 = torch.aten.select.int %211, %int5_12618, %int0_12619 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12620 = torch.constant.int 5
    %int0_12621 = torch.constant.int 0
    %9080 = torch.aten.select.int %9071, %int5_12620, %int0_12621 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9081 = torch.aten.mul.Tensor %9079, %9080 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_12622 = torch.constant.int 5
    %int1_12623 = torch.constant.int 1
    %9082 = torch.aten.select.int %211, %int5_12622, %int1_12623 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12624 = torch.constant.int 5
    %int1_12625 = torch.constant.int 1
    %9083 = torch.aten.select.int %9071, %int5_12624, %int1_12625 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9084 = torch.aten.mul.Tensor %9082, %9083 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_12626 = torch.constant.int 1
    %9085 = torch.aten.add.Tensor %9081, %9084, %int1_12626 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Flatten the pairs back to 128 features and narrow to f16.
    // %9088 is the query and %9091 the key handed to the attention op.
    %int1_12627 = torch.constant.int 1
    %int24_12628 = torch.constant.int 24
    %int4608_12629 = torch.constant.int 4608
    %int128_12630 = torch.constant.int 128
    %9086 = torch.prim.ListConstruct %int1_12627, %int24_12628, %int4608_12629, %int128_12630 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9087 = torch.aten.view %9078, %9086 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_12631 = torch.constant.int 5
    %9088 = torch.prims.convert_element_type %9087, %int5_12631 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_12632 = torch.constant.int 1
    %int24_12633 = torch.constant.int 24
    %int4608_12634 = torch.constant.int 4608
    %int128_12635 = torch.constant.int 128
    %9089 = torch.prim.ListConstruct %int1_12632, %int24_12633, %int4608_12634, %int128_12635 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9090 = torch.aten.view %9085, %9089 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_12636 = torch.constant.int 5
    %9091 = torch.prims.convert_element_type %9090, %int5_12636 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled dot-product attention over (%9088, %9091, %9043). The op is kept as
    // a generic torch.operator. NOTE(review): the trailing operands
    // (0.0, false, none, none) presumably map to dropout_p, is_causal, attn_mask
    // and scale — confirm against the ATen schema.
    %float0.000000e00_12637 = torch.constant.float 0.000000e+00
    %false_12638 = torch.constant.bool false
    %none_12639 = torch.constant.none
    %none_12640 = torch.constant.none
    // Only result #0 is consumed in the lines that follow; result #1
    // ([1,24,4608], f32) is not referenced within this section.
    %9092:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%9088, %9091, %9043, %float0.000000e00_12637, %false_12638, %none_12639, %none_12640) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap axes 1 and 2: [1,24,4608,128] -> [1,4608,24,128].
    %int0_12641 = torch.constant.int 0
    %int2_12642 = torch.constant.int 2
    %int1_12643 = torch.constant.int 1
    %int3_12644 = torch.constant.int 3
    %9093 = torch.prim.ListConstruct %int0_12641, %int2_12642, %int1_12643, %int3_12644 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9094 = torch.aten.permute %9092#0, %9093 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the 24 x 128 trailing axes into 3072 features.
    %int1_12645 = torch.constant.int 1
    %int4608_12646 = torch.constant.int 4608
    %int3072_12647 = torch.constant.int 3072
    %9095 = torch.prim.ListConstruct %int1_12645, %int4608_12646, %int3072_12647 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9096 = torch.aten.view %9094, %9095 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // GELU with the "tanh" approximation on the 12288-wide slice %9036.
    %str_12648 = torch.constant.str "tanh"
    %9097 = torch.aten.gelu %9036, %str_12648 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    // Concatenate the attention output (3072) and the activated slice (12288)
    // along dim 2, giving 15360 features.
    %9098 = torch.prim.ListConstruct %9096, %9097 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_12649 = torch.constant.int 2
    %9099 = torch.aten.cat %9098, %int2_12649 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Drop the leading size-1 dimension so the result can feed a 2-D matmul.
    %int4608_12650 = torch.constant.int 4608
    %int15360_12651 = torch.constant.int 15360
    %9100 = torch.prim.ListConstruct %int4608_12650, %int15360_12651 : (!torch.int, !torch.int) -> !torch.list<int>
    %9101 = torch.aten.view %9099, %9100 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // single_blocks.19.linear2: the weight is stored as [3072,15360] and
    // transposed to [15360,3072] for the matmul.
    %__auto.sampler.single_blocks.19.linear2.weight = util.global.load @__auto.sampler.single_blocks.19.linear2.weight : tensor<3072x15360xf16>
    %9102 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.19.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_12652 = torch.constant.int 0
    %int1_12653 = torch.constant.int 1
    %9103 = torch.aten.transpose.int %9102, %int0_12652, %int1_12653 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.19.linear2.bias = util.global.load @__auto.sampler.single_blocks.19.linear2.bias : tensor<3072xf16>
    %9104 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.19.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Widen bias, activation and weight to f32 for the matmul and bias add.
    %int6_12654 = torch.constant.int 6
    %9105 = torch.prims.convert_element_type %9104, %int6_12654 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_12655 = torch.constant.int 6
    %9106 = torch.prims.convert_element_type %9101, %int6_12655 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_12656 = torch.constant.int 6
    %9107 = torch.prims.convert_element_type %9103, %int6_12656 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %9108 = torch.aten.mm %9106, %9107 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Multiplications by the integer constant 1 leave the values unchanged.
    %int1_12657 = torch.constant.int 1
    %9109 = torch.aten.mul.Scalar %9108, %int1_12657 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_12658 = torch.constant.int 1
    %9110 = torch.aten.mul.Scalar %9105, %int1_12658 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_12659 = torch.constant.int 1
    %9111 = torch.aten.add.Tensor %9109, %9110, %int1_12659 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    // Narrow back to f16 and restore the leading size-1 dimension.
    %int5_12660 = torch.constant.int 5
    %9112 = torch.prims.convert_element_type %9111, %int5_12660 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_12661 = torch.constant.int 1
    %int4608_12662 = torch.constant.int 4608
    %int3072_12663 = torch.constant.int 3072
    %9113 = torch.prim.ListConstruct %int1_12661, %int4608_12662, %int3072_12663 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9114 = torch.aten.view %9112, %9113 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Scale by %9009 ([1,1,3072], broadcast over the 4608 rows) and add to
    // %8991. Both are defined above this chunk — presumably the block's
    // modulation gate and its residual input; TODO confirm.
    %9115 = torch.aten.mul.Tensor %9009, %9114 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_12664 = torch.constant.int 1
    %9116 = torch.aten.add.Tensor %8991, %9115, %int1_12664 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.20 modulation: SiLU on %119 ([1,3072], defined outside this
    // chunk — presumably the conditioning vector; TODO confirm), followed by
    // the modulation.lin projection 3072 -> 9216.
    %9117 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.20.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.20.modulation.lin.weight : tensor<9216x3072xf16>
    %9118 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.20.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    // The weight is stored as [9216,3072]; transpose to [3072,9216] for the mm.
    %int0_12665 = torch.constant.int 0
    %int1_12666 = torch.constant.int 1
    %9119 = torch.aten.transpose.int %9118, %int0_12665, %int1_12666 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.20.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.20.modulation.lin.bias : tensor<9216xf16>
    %9120 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.20.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Widen bias, input and weight to f32; matmul; add bias; narrow to f16.
    %int6_12667 = torch.constant.int 6
    %9121 = torch.prims.convert_element_type %9120, %int6_12667 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_12668 = torch.constant.int 6
    %9122 = torch.prims.convert_element_type %9117, %int6_12668 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_12669 = torch.constant.int 6
    %9123 = torch.prims.convert_element_type %9119, %int6_12669 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %9124 = torch.aten.mm %9122, %9123 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Multiplications by the integer constant 1 leave the values unchanged.
    %int1_12670 = torch.constant.int 1
    %9125 = torch.aten.mul.Scalar %9124, %int1_12670 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_12671 = torch.constant.int 1
    %9126 = torch.aten.mul.Scalar %9121, %int1_12671 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_12672 = torch.constant.int 1
    %9127 = torch.aten.add.Tensor %9125, %9126, %int1_12672 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_12673 = torch.constant.int 5
    %9128 = torch.prims.convert_element_type %9127, %int5_12673 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): the shape is
    // unchanged. Then insert a size-1 axis at dim 1, and take another
    // full-range slice on dim 2.
    %int0_12674 = torch.constant.int 0
    %int0_12675 = torch.constant.int 0
    %int9223372036854775807_12676 = torch.constant.int 9223372036854775807
    %int1_12677 = torch.constant.int 1
    %9129 = torch.aten.slice.Tensor %9128, %int0_12674, %int0_12675, %int9223372036854775807_12676, %int1_12677 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    %int1_12678 = torch.constant.int 1
    %9130 = torch.aten.unsqueeze %9129, %int1_12678 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int2_12679 = torch.constant.int 2
    %int0_12680 = torch.constant.int 0
    %int9223372036854775807_12681 = torch.constant.int 9223372036854775807
    %int1_12682 = torch.constant.int 1
    %9131 = torch.aten.slice.Tensor %9130, %int2_12679, %int0_12680, %int9223372036854775807_12681, %int1_12682 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Split the 9216 outputs into three 3072-wide chunks along the last dim:
    //   %9132 = [0, 3072)     added after the normalization below (shift)
    //   %9133 = [3072, 6144)  becomes the multiplicative factor after +1 (scale)
    //   %9134 = [6144, 9216)  not consumed in the visible lines; presumably the
    //                         gate applied after this block's linear2 — confirm
    %int-1_12683 = torch.constant.int -1
    %int0_12684 = torch.constant.int 0
    %int3072_12685 = torch.constant.int 3072
    %int1_12686 = torch.constant.int 1
    %9132 = torch.aten.slice.Tensor %9131, %int-1_12683, %int0_12684, %int3072_12685, %int1_12686 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_12687 = torch.constant.int -1
    %int3072_12688 = torch.constant.int 3072
    %int6144_12689 = torch.constant.int 6144
    %int1_12690 = torch.constant.int 1
    %9133 = torch.aten.slice.Tensor %9131, %int-1_12687, %int3072_12688, %int6144_12689, %int1_12690 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_12691 = torch.constant.int -1
    %int6144_12692 = torch.constant.int 6144
    %int9216_12693 = torch.constant.int 9216
    %int1_12694 = torch.constant.int 1
    %9134 = torch.aten.slice.Tensor %9131, %int-1_12691, %int6144_12692, %int9216_12693, %int1_12694 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %9135 = 1 + scale.
    %int1_12695 = torch.constant.int 1
    %int1_12696 = torch.constant.int 1
    %9135 = torch.aten.add.Scalar %9133, %int1_12695, %int1_12696 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %9116 over its last (3072-wide) dimension without learned
    // affine parameters. The statistics are computed on an f32 copy (%9136).
    %int6_12697 = torch.constant.int 6
    %9136 = torch.prims.convert_element_type %9116, %int6_12697 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    // var_mean over dim 2 with correction 0 and keepdim=true.
    // result0 is used as the variance, result1 as the mean.
    %int2_12698 = torch.constant.int 2
    %9137 = torch.prim.ListConstruct %int2_12698 : (!torch.int) -> !torch.list<int>
    %int0_12699 = torch.constant.int 0
    %true_12700 = torch.constant.bool true
    %result0_12701, %result1_12702 = torch.aten.var_mean.correction %9136, %9137, %int0_12699, %true_12700 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // rsqrt(var + ~1e-6)
    %float9.999990e-07_12703 = torch.constant.float 9.9999999999999995E-7
    %int1_12704 = torch.constant.int 1
    %9138 = torch.aten.add.Scalar %result0_12701, %float9.999990e-07_12703, %int1_12704 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %9139 = torch.aten.rsqrt %9138 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the original f16 tensor %9116 (not the f32 copy)
    // and the f32 mean; the declared result type is f32.
    %int1_12705 = torch.constant.int 1
    %9140 = torch.aten.sub.Tensor %9116, %result1_12702, %int1_12705 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %9141 = torch.aten.mul.Tensor %9140, %9139 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_12706 = torch.constant.int 5
    %9142 = torch.prims.convert_element_type %9141, %int5_12706 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Modulate: %9144 = %9135 * normalized + %9132, where %9135 and %9132 are
    // [1,1,3072] tensors broadcast over the 4608 rows.
    %9143 = torch.aten.mul.Tensor %9135, %9142 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_12707 = torch.constant.int 1
    %9144 = torch.aten.add.Tensor %9143, %9132, %int1_12707 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.20.linear1: project the modulated activations from 3072 to
    // 21504 features. First drop the leading size-1 dimension for a 2-D matmul.
    %int4608_12708 = torch.constant.int 4608
    %int3072_12709 = torch.constant.int 3072
    %9145 = torch.prim.ListConstruct %int4608_12708, %int3072_12709 : (!torch.int, !torch.int) -> !torch.list<int>
    %9146 = torch.aten.view %9144, %9145 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // The weight is stored as [21504,3072]; transpose to [3072,21504].
    %__auto.sampler.single_blocks.20.linear1.weight = util.global.load @__auto.sampler.single_blocks.20.linear1.weight : tensor<21504x3072xf16>
    %9147 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.20.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_12710 = torch.constant.int 0
    %int1_12711 = torch.constant.int 1
    %9148 = torch.aten.transpose.int %9147, %int0_12710, %int1_12711 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.20.linear1.bias = util.global.load @__auto.sampler.single_blocks.20.linear1.bias : tensor<21504xf16>
    %9149 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.20.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Widen bias, activation and weight to f32 (dtype code 6 yields f32 here).
    %int6_12712 = torch.constant.int 6
    %9150 = torch.prims.convert_element_type %9149, %int6_12712 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_12713 = torch.constant.int 6
    %9151 = torch.prims.convert_element_type %9146, %int6_12713 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_12714 = torch.constant.int 6
    %9152 = torch.prims.convert_element_type %9148, %int6_12714 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %9153 = torch.aten.mm %9151, %9152 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Multiplications by the integer constant 1 leave the values unchanged.
    %int1_12715 = torch.constant.int 1
    %9154 = torch.aten.mul.Scalar %9153, %int1_12715 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_12716 = torch.constant.int 1
    %9155 = torch.aten.mul.Scalar %9150, %int1_12716 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    // Bias add in f32, then narrow to f16 (dtype code 5 yields f16 here).
    %int1_12717 = torch.constant.int 1
    %9156 = torch.aten.add.Tensor %9154, %9155, %int1_12717 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_12718 = torch.constant.int 5
    %9157 = torch.prims.convert_element_type %9156, %int5_12718 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    // Restore the leading size-1 dimension: [4608,21504] -> [1,4608,21504].
    %int1_12719 = torch.constant.int 1
    %int4608_12720 = torch.constant.int 4608
    %int21504_12721 = torch.constant.int 21504
    %9158 = torch.prim.ListConstruct %int1_12719, %int4608_12720, %int21504_12721 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9159 = torch.aten.view %9157, %9158 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the 21504-wide projection %9159 along its last dimension:
    //   columns [0, 9216)     -> %9160, reshaped below into three 24x128 groups
    //   columns [9216, 21504) -> %9161 (12288 wide); not consumed in the visible
    //                            lines — presumably the MLP branch, as in the
    //                            preceding block; TODO confirm.
    %int-1_12722 = torch.constant.int -1
    %int0_12723 = torch.constant.int 0
    %int9216_12724 = torch.constant.int 9216
    %int1_12725 = torch.constant.int 1
    %9160 = torch.aten.slice.Tensor %9159, %int-1_12722, %int0_12723, %int9216_12724, %int1_12725 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_12726 = torch.constant.int -1
    %int9216_12727 = torch.constant.int 9216
    %int21504_12728 = torch.constant.int 21504
    %int1_12729 = torch.constant.int 1
    %9161 = torch.aten.slice.Tensor %9159, %int-1_12726, %int9216_12727, %int21504_12728, %int1_12729 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View the 9216 columns as [3, 24, 128] (9216 = 3 * 24 * 128).
    %int1_12730 = torch.constant.int 1
    %int4608_12731 = torch.constant.int 4608
    %int3_12732 = torch.constant.int 3
    %int24_12733 = torch.constant.int 24
    %int128_12734 = torch.constant.int 128
    %9162 = torch.prim.ListConstruct %int1_12730, %int4608_12731, %int3_12732, %int24_12733, %int128_12734 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9163 = torch.aten.view %9160, %9162 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2, 0, 3, 1, 4): [1,4608,3,24,128] -> [3,1,24,4608,128].
    %int2_12735 = torch.constant.int 2
    %int0_12736 = torch.constant.int 0
    %int3_12737 = torch.constant.int 3
    %int1_12738 = torch.constant.int 1
    %int4_12739 = torch.constant.int 4
    %9164 = torch.prim.ListConstruct %int2_12735, %int0_12736, %int3_12737, %int1_12738, %int4_12739 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9165 = torch.aten.permute %9163, %9164 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index the leading axis:
    //   index 0 -> %9166, scaled by query_norm.scale below (query)
    //   index 1 -> %9167, scaled by key_norm.scale below   (key)
    //   index 2 -> %9168, not consumed in the visible lines — presumably the
    //              attention value; TODO confirm.
    %int0_12740 = torch.constant.int 0
    %int0_12741 = torch.constant.int 0
    %9166 = torch.aten.select.int %9165, %int0_12740, %int0_12741 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_12742 = torch.constant.int 0
    %int1_12743 = torch.constant.int 1
    %9167 = torch.aten.select.int %9165, %int0_12742, %int1_12743 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_12744 = torch.constant.int 0
    %int2_12745 = torch.constant.int 2
    %9168 = torch.aten.select.int %9165, %int0_12744, %int2_12745 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // RMS-normalize %9166 over its last (128-wide) dimension, in f32:
    //   y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + 1e-6)
    %int6_12746 = torch.constant.int 6
    %9169 = torch.prims.convert_element_type %9166, %int6_12746 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_12747 = torch.constant.int 2
    %9170 = torch.aten.pow.Tensor_Scalar %9169, %int2_12747 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_12748 = torch.constant.int -1
    %9171 = torch.prim.ListConstruct %int-1_12748 : (!torch.int) -> !torch.list<int>
    %true_12749 = torch.constant.bool true
    %none_12750 = torch.constant.none
    %9172 = torch.aten.mean.dim %9170, %9171, %true_12749, %none_12750 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // Epsilon of ~1e-6 is added before the reciprocal square root.
    %float9.999990e-07_12751 = torch.constant.float 9.9999999999999995E-7
    %int1_12752 = torch.constant.int 1
    %9173 = torch.aten.add.Scalar %9172, %float9.999990e-07_12751, %int1_12752 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %9174 = torch.aten.rsqrt %9173 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %9175 = torch.aten.mul.Tensor %9169, %9174 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    // Narrow to f16, then apply the [128] query_norm.scale parameter.
    %int5_12753 = torch.constant.int 5
    %9176 = torch.prims.convert_element_type %9175, %int5_12753 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.20.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.20.norm.query_norm.scale : tensor<128xf16>
    %9177 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.20.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %9178 = torch.aten.mul.Tensor %9176, %9177 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Same RMS normalization as the query path, applied to %9167:
    //   y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + 1e-6), computed in f32.
    %int6_12754 = torch.constant.int 6
    %9179 = torch.prims.convert_element_type %9167, %int6_12754 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_12755 = torch.constant.int 2
    %9180 = torch.aten.pow.Tensor_Scalar %9179, %int2_12755 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_12756 = torch.constant.int -1
    %9181 = torch.prim.ListConstruct %int-1_12756 : (!torch.int) -> !torch.list<int>
    %true_12757 = torch.constant.bool true
    %none_12758 = torch.constant.none
    %9182 = torch.aten.mean.dim %9180, %9181, %true_12757, %none_12758 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_12759 = torch.constant.float 9.9999999999999995E-7
    %int1_12760 = torch.constant.int 1
    %9183 = torch.aten.add.Scalar %9182, %float9.999990e-07_12759, %int1_12760 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %9184 = torch.aten.rsqrt %9183 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %9185 = torch.aten.mul.Tensor %9179, %9184 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    // Narrow to f16 and apply the [128] key_norm.scale parameter.
    %int5_12761 = torch.constant.int 5
    %9186 = torch.prims.convert_element_type %9185, %int5_12761 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.20.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.20.norm.key_norm.scale : tensor<128xf16>
    %9187 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.20.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %9188 = torch.aten.mul.Tensor %9186, %9187 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Prepare normalized q (%9178) and k (%9188) for the pairwise products with %211 that follow:
    // upcast each to f32 and view the last axis of 128 as 64 x 1 x 2.
    %int5_12762 = torch.constant.int 5
    // These two conversions are f16 -> f16 (input and result types are identical).
    %9189 = torch.prims.convert_element_type %9178, %int5_12762 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_12763 = torch.constant.int 5
    %9190 = torch.prims.convert_element_type %9188, %int5_12763 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int6_12764 = torch.constant.int 6
    // q: f16 -> f32, then [1,24,4608,128] -> [1,24,4608,64,1,2].
    %9191 = torch.prims.convert_element_type %9189, %int6_12764 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_12765 = torch.constant.int 1
    %int24_12766 = torch.constant.int 24
    %int4608_12767 = torch.constant.int 4608
    %int64_12768 = torch.constant.int 64
    %int1_12769 = torch.constant.int 1
    %int2_12770 = torch.constant.int 2
    %9192 = torch.prim.ListConstruct %int1_12765, %int24_12766, %int4608_12767, %int64_12768, %int1_12769, %int2_12770 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9193 = torch.aten.view %9191, %9192 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    %int6_12771 = torch.constant.int 6
    // k: same upcast and view as q.
    %9194 = torch.prims.convert_element_type %9190, %int6_12771 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_12772 = torch.constant.int 1
    %int24_12773 = torch.constant.int 24
    %int4608_12774 = torch.constant.int 4608
    %int64_12775 = torch.constant.int 64
    %int1_12776 = torch.constant.int 1
    %int2_12777 = torch.constant.int 2
    %9195 = torch.prim.ListConstruct %int1_12772, %int24_12773, %int4608_12774, %int64_12775, %int1_12776, %int2_12777 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9196 = torch.aten.view %9194, %9195 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Combine q (%9193) with %211 ([1,1,4608,64,2,2], defined outside this region):
    //   %9203 = %211[..., 0] * q[..., 0] + %211[..., 1] * q[..., 1]
    // NOTE(review): %211 is presumably the precomputed rotary position-embedding table — confirm at its definition.
    %int5_12778 = torch.constant.int 5
    %int0_12779 = torch.constant.int 0
    %9197 = torch.aten.select.int %211, %int5_12778, %int0_12779 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12780 = torch.constant.int 5
    %int0_12781 = torch.constant.int 0
    %9198 = torch.aten.select.int %9193, %int5_12780, %int0_12781 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    // Broadcast: [1,1,4608,64,2] x [1,24,4608,64,1] -> [1,24,4608,64,2].
    %9199 = torch.aten.mul.Tensor %9197, %9198 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_12782 = torch.constant.int 5
    %int1_12783 = torch.constant.int 1
    %9200 = torch.aten.select.int %211, %int5_12782, %int1_12783 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12784 = torch.constant.int 5
    %int1_12785 = torch.constant.int 1
    %9201 = torch.aten.select.int %9193, %int5_12784, %int1_12785 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9202 = torch.aten.mul.Tensor %9200, %9201 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_12786 = torch.constant.int 1
    // Sum of the two partial products (the third operand of add.Tensor is the constant 1).
    %9203 = torch.aten.add.Tensor %9199, %9202, %int1_12786 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Same combination as for q, applied to k (%9196):
    //   %9210 = %211[..., 0] * k[..., 0] + %211[..., 1] * k[..., 1]
    // The two selects from %211 are re-emitted here rather than reusing %9197 / %9200.
    %int5_12787 = torch.constant.int 5
    %int0_12788 = torch.constant.int 0
    %9204 = torch.aten.select.int %211, %int5_12787, %int0_12788 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12789 = torch.constant.int 5
    %int0_12790 = torch.constant.int 0
    %9205 = torch.aten.select.int %9196, %int5_12789, %int0_12790 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9206 = torch.aten.mul.Tensor %9204, %9205 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_12791 = torch.constant.int 5
    %int1_12792 = torch.constant.int 1
    %9207 = torch.aten.select.int %211, %int5_12791, %int1_12792 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12793 = torch.constant.int 5
    %int1_12794 = torch.constant.int 1
    %9208 = torch.aten.select.int %9196, %int5_12793, %int1_12794 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9209 = torch.aten.mul.Tensor %9207, %9208 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_12795 = torch.constant.int 1
    %9210 = torch.aten.add.Tensor %9206, %9209, %int1_12795 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Collapse the trailing (64, 2) pair axes of q (%9203) and k (%9210) back into a single axis of 128
    // and cast both from f32 to f16; the results %9213 / %9216 feed the attention op below.
    %int1_12796 = torch.constant.int 1
    %int24_12797 = torch.constant.int 24
    %int4608_12798 = torch.constant.int 4608
    %int128_12799 = torch.constant.int 128
    %9211 = torch.prim.ListConstruct %int1_12796, %int24_12797, %int4608_12798, %int128_12799 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9212 = torch.aten.view %9203, %9211 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_12800 = torch.constant.int 5
    %9213 = torch.prims.convert_element_type %9212, %int5_12800 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_12801 = torch.constant.int 1
    %int24_12802 = torch.constant.int 24
    %int4608_12803 = torch.constant.int 4608
    %int128_12804 = torch.constant.int 128
    %9214 = torch.prim.ListConstruct %int1_12801, %int24_12802, %int4608_12803, %int128_12804 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9215 = torch.aten.view %9210, %9214 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_12805 = torch.constant.int 5
    %9216 = torch.prims.convert_element_type %9215, %int5_12805 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.20 attention over 24 heads x 4608 positions x 128 features, all operands f16.
    // Operands: %9213 (q), %9216 (k), %9168 (defined above this region; presumably the value tensor — confirm),
    // followed by the constants 0.0, false, none, none.
    // NOTE(review): per the PyTorch op schema the trailing operands are dropout_p, is_causal, attn_mask, scale — confirm.
    %float0.000000e00_12806 = torch.constant.float 0.000000e+00
    %false_12807 = torch.constant.bool false
    %none_12808 = torch.constant.none
    %none_12809 = torch.constant.none
    // Two results are produced; only result #0 ([1,24,4608,128], f16) is used in the visible lines.
    %9217:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%9213, %9216, %9168, %float0.000000e00_12806, %false_12807, %none_12808, %none_12809) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    %int0_12810 = torch.constant.int 0
    %int2_12811 = torch.constant.int 2
    %int1_12812 = torch.constant.int 1
    %int3_12813 = torch.constant.int 3
    %9218 = torch.prim.ListConstruct %int0_12810, %int2_12811, %int1_12812, %int3_12813 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // Swap axes 1 and 2: [1,24,4608,128] -> [1,4608,24,128].
    %9219 = torch.aten.permute %9217#0, %9218 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_12814 = torch.constant.int 1
    %int4608_12815 = torch.constant.int 4608
    %int3072_12816 = torch.constant.int 3072
    %9220 = torch.prim.ListConstruct %int1_12814, %int4608_12815, %int3072_12816 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // Merge the (24, 128) axes into 3072: [1,4608,24,128] -> [1,4608,3072].
    %9221 = torch.aten.view %9219, %9220 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Apply GELU with the "tanh" approximation string to %9161 ([1,4608,12288], defined above this region),
    // then concatenate the attention output (%9221, 3072 wide) with it along axis 2 -> [1,4608,15360].
    // NOTE(review): %9161 is presumably the MLP slice of block 20's linear1 output — confirm at its definition.
    %str_12817 = torch.constant.str "tanh"
    %9222 = torch.aten.gelu %9161, %str_12817 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %9223 = torch.prim.ListConstruct %9221, %9222 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_12818 = torch.constant.int 2
    %9224 = torch.aten.cat %9223, %int2_12818 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // single_blocks.20 linear2: %9239 = f16(f32(%9224) @ f32(linear2.weight)^T + f32(linear2.bias)),
    // mapping [1,4608,15360] -> [1,4608,3072]. The matmul and bias add are carried out in f32.
    %int4608_12819 = torch.constant.int 4608
    %int15360_12820 = torch.constant.int 15360
    %9225 = torch.prim.ListConstruct %int4608_12819, %int15360_12820 : (!torch.int, !torch.int) -> !torch.list<int>
    // Drop the leading size-1 axis so the input is 2-D for torch.aten.mm.
    %9226 = torch.aten.view %9224, %9225 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    %__auto.sampler.single_blocks.20.linear2.weight = util.global.load @__auto.sampler.single_blocks.20.linear2.weight : tensor<3072x15360xf16>
    %9227 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.20.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_12821 = torch.constant.int 0
    %int1_12822 = torch.constant.int 1
    // Weight is stored as [out=3072, in=15360]; transpose to [15360, 3072] for the right-hand side of mm.
    %9228 = torch.aten.transpose.int %9227, %int0_12821, %int1_12822 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.20.linear2.bias = util.global.load @__auto.sampler.single_blocks.20.linear2.bias : tensor<3072xf16>
    %9229 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.20.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_12823 = torch.constant.int 6
    // Upcast bias, input and weight to f32.
    %9230 = torch.prims.convert_element_type %9229, %int6_12823 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_12824 = torch.constant.int 6
    %9231 = torch.prims.convert_element_type %9226, %int6_12824 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_12825 = torch.constant.int 6
    %9232 = torch.prims.convert_element_type %9228, %int6_12825 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %9233 = torch.aten.mm %9231, %9232 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    %int1_12826 = torch.constant.int 1
    // Both operands are multiplied by the constant 1 before the add.
    // NOTE(review): this looks like an addmm alpha/beta decomposition left by the exporter — confirm.
    %9234 = torch.aten.mul.Scalar %9233, %int1_12826 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_12827 = torch.constant.int 1
    %9235 = torch.aten.mul.Scalar %9230, %int1_12827 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_12828 = torch.constant.int 1
    %9236 = torch.aten.add.Tensor %9234, %9235, %int1_12828 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_12829 = torch.constant.int 5
    // Cast the result to f16 and restore the leading size-1 axis.
    %9237 = torch.prims.convert_element_type %9236, %int5_12829 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_12830 = torch.constant.int 1
    %int4608_12831 = torch.constant.int 4608
    %int3072_12832 = torch.constant.int 3072
    %9238 = torch.prim.ListConstruct %int1_12830, %int4608_12831, %int3072_12832 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9239 = torch.aten.view %9237, %9238 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.20 output: %9241 = %9116 + %9134 * %9239.
    // %9134 ([1,1,3072]) broadcasts over the 4608 axis. Both %9134 and %9116 are defined above this region.
    // NOTE(review): presumably %9134 is block 20's gate slice and %9116 the block input (residual) — confirm.
    %9240 = torch.aten.mul.Tensor %9134, %9239 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_12833 = torch.constant.int 1
    %9241 = torch.aten.add.Tensor %9116, %9240, %int1_12833 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.21 modulation: %9253 = f16(f32(silu(%119)) @ f32(modulation.lin.weight)^T + f32(modulation.lin.bias)),
    // mapping [1,3072] -> [1,9216]. %119 is defined outside this region.
    // NOTE(review): %119 is presumably the shared conditioning vector — confirm at its definition.
    %9242 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.21.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.21.modulation.lin.weight : tensor<9216x3072xf16>
    %9243 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.21.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_12834 = torch.constant.int 0
    %int1_12835 = torch.constant.int 1
    // Weight is stored as [out=9216, in=3072]; transpose to [3072, 9216] for mm.
    %9244 = torch.aten.transpose.int %9243, %int0_12834, %int1_12835 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.21.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.21.modulation.lin.bias : tensor<9216xf16>
    %9245 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.21.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    %int6_12836 = torch.constant.int 6
    // Upcast bias, input and weight to f32 for the matmul.
    %9246 = torch.prims.convert_element_type %9245, %int6_12836 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_12837 = torch.constant.int 6
    %9247 = torch.prims.convert_element_type %9242, %int6_12837 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_12838 = torch.constant.int 6
    %9248 = torch.prims.convert_element_type %9244, %int6_12838 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %9249 = torch.aten.mm %9247, %9248 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    %int1_12839 = torch.constant.int 1
    // Product and bias are each multiplied by the constant 1 before being added.
    %9250 = torch.aten.mul.Scalar %9249, %int1_12839 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_12840 = torch.constant.int 1
    %9251 = torch.aten.mul.Scalar %9246, %int1_12840 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_12841 = torch.constant.int 1
    %9252 = torch.aten.add.Tensor %9250, %9251, %int1_12841 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_12842 = torch.constant.int 5
    %9253 = torch.prims.convert_element_type %9252, %int5_12842 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Reshape the [1,9216] modulation vector to [1,1,9216] and cut it into three 3072-wide slices along the last axis:
    //   %9257 = [0:3072], %9258 = [3072:6144], %9259 = [6144:9216]; then %9260 = %9258 + 1.
    // %9259 is not consumed within the visible lines.
    // NOTE(review): the slices presumably act as shift / scale / gate — confirm against the model definition.
    %int0_12843 = torch.constant.int 0
    %int0_12844 = torch.constant.int 0
    %int9223372036854775807_12845 = torch.constant.int 9223372036854775807
    %int1_12846 = torch.constant.int 1
    // Full-range slice on axis 0 (start 0, end INT64_MAX, step 1); the result type equals the input type.
    %9254 = torch.aten.slice.Tensor %9253, %int0_12843, %int0_12844, %int9223372036854775807_12845, %int1_12846 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    %int1_12847 = torch.constant.int 1
    // Insert a size-1 axis at position 1 so the slices broadcast over the 4608 axis later.
    %9255 = torch.aten.unsqueeze %9254, %int1_12847 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int2_12848 = torch.constant.int 2
    %int0_12849 = torch.constant.int 0
    %int9223372036854775807_12850 = torch.constant.int 9223372036854775807
    %int1_12851 = torch.constant.int 1
    // Full-range slice on axis 2; again shape-preserving.
    %9256 = torch.aten.slice.Tensor %9255, %int2_12848, %int0_12849, %int9223372036854775807_12850, %int1_12851 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int-1_12852 = torch.constant.int -1
    %int0_12853 = torch.constant.int 0
    %int3072_12854 = torch.constant.int 3072
    %int1_12855 = torch.constant.int 1
    %9257 = torch.aten.slice.Tensor %9256, %int-1_12852, %int0_12853, %int3072_12854, %int1_12855 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_12856 = torch.constant.int -1
    %int3072_12857 = torch.constant.int 3072
    %int6144_12858 = torch.constant.int 6144
    %int1_12859 = torch.constant.int 1
    %9258 = torch.aten.slice.Tensor %9256, %int-1_12856, %int3072_12857, %int6144_12858, %int1_12859 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_12860 = torch.constant.int -1
    %int6144_12861 = torch.constant.int 6144
    %int9216_12862 = torch.constant.int 9216
    %int1_12863 = torch.constant.int 1
    %9259 = torch.aten.slice.Tensor %9256, %int-1_12860, %int6144_12861, %int9216_12862, %int1_12863 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int1_12864 = torch.constant.int 1
    %int1_12865 = torch.constant.int 1
    // Add the scalar 1 to the middle slice; %9260 is the multiplier used after the normalization below.
    %9260 = torch.aten.add.Scalar %9258, %int1_12864, %int1_12865 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // single_blocks.21 input normalization and modulation of %9241 ([1,4608,3072]):
    //   var, mean = var_mean(f32(%9241), dim=2, correction=0, keepdim=true)
    //   %9267    = f16((%9241 - mean) * rsqrt(var + ~1e-6))
    //   %9269    = %9260 * %9267 + %9257
    // No learned weight or bias is applied in this normalization.
    %int6_12866 = torch.constant.int 6
    %9261 = torch.prims.convert_element_type %9241, %int6_12866 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_12867 = torch.constant.int 2
    %9262 = torch.prim.ListConstruct %int2_12867 : (!torch.int) -> !torch.list<int>
    %int0_12868 = torch.constant.int 0
    %true_12869 = torch.constant.bool true
    // result0 is the variance and result1 the mean, each [1,4608,1]; correction=0 (no Bessel correction).
    %result0_12870, %result1_12871 = torch.aten.var_mean.correction %9261, %9262, %int0_12868, %true_12869 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    %float9.999990e-07_12872 = torch.constant.float 9.9999999999999995E-7
    %int1_12873 = torch.constant.int 1
    %9263 = torch.aten.add.Scalar %result0_12870, %float9.999990e-07_12872, %int1_12873 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %9264 = torch.aten.rsqrt %9263 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    %int1_12874 = torch.constant.int 1
    // The subtraction takes the f16 tensor %9241 (not the f32 copy %9261) and an f32 mean; the result is typed f32.
    %9265 = torch.aten.sub.Tensor %9241, %result1_12871, %int1_12874 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %9266 = torch.aten.mul.Tensor %9265, %9264 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_12875 = torch.constant.int 5
    %9267 = torch.prims.convert_element_type %9266, %int5_12875 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Modulate in f16: multiply by %9260 and add %9257, both [1,1,3072] broadcast over the 4608 axis.
    %9268 = torch.aten.mul.Tensor %9260, %9267 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_12876 = torch.constant.int 1
    %9269 = torch.aten.add.Tensor %9268, %9257, %int1_12876 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.21 linear1: %9284 = f16(f32(%9269) @ f32(linear1.weight)^T + f32(linear1.bias)),
    // mapping [1,4608,3072] -> [1,4608,21504]. The matmul and bias add are carried out in f32.
    %int4608_12877 = torch.constant.int 4608
    %int3072_12878 = torch.constant.int 3072
    %9270 = torch.prim.ListConstruct %int4608_12877, %int3072_12878 : (!torch.int, !torch.int) -> !torch.list<int>
    // Drop the leading size-1 axis so the input is 2-D for torch.aten.mm.
    %9271 = torch.aten.view %9269, %9270 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    %__auto.sampler.single_blocks.21.linear1.weight = util.global.load @__auto.sampler.single_blocks.21.linear1.weight : tensor<21504x3072xf16>
    %9272 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.21.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_12879 = torch.constant.int 0
    %int1_12880 = torch.constant.int 1
    // Weight is stored as [out=21504, in=3072]; transpose to [3072, 21504] for mm.
    %9273 = torch.aten.transpose.int %9272, %int0_12879, %int1_12880 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.21.linear1.bias = util.global.load @__auto.sampler.single_blocks.21.linear1.bias : tensor<21504xf16>
    %9274 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.21.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    %int6_12881 = torch.constant.int 6
    // Upcast bias, input and weight to f32.
    %9275 = torch.prims.convert_element_type %9274, %int6_12881 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_12882 = torch.constant.int 6
    %9276 = torch.prims.convert_element_type %9271, %int6_12882 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_12883 = torch.constant.int 6
    %9277 = torch.prims.convert_element_type %9273, %int6_12883 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %9278 = torch.aten.mm %9276, %9277 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    %int1_12884 = torch.constant.int 1
    // Product and bias are each multiplied by the constant 1 before being added.
    %9279 = torch.aten.mul.Scalar %9278, %int1_12884 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_12885 = torch.constant.int 1
    %9280 = torch.aten.mul.Scalar %9275, %int1_12885 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_12886 = torch.constant.int 1
    %9281 = torch.aten.add.Tensor %9279, %9280, %int1_12886 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_12887 = torch.constant.int 5
    // Cast the result to f16 and restore the leading size-1 axis.
    %9282 = torch.prims.convert_element_type %9281, %int5_12887 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_12888 = torch.constant.int 1
    %int4608_12889 = torch.constant.int 4608
    %int21504_12890 = torch.constant.int 21504
    %9283 = torch.prim.ListConstruct %int1_12888, %int4608_12889, %int21504_12890 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9284 = torch.aten.view %9282, %9283 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the linear1 output along the last axis into %9285 = [0:9216] and %9286 = [9216:21504] (12288 wide),
    // then reshape %9285 into three [1,24,4608,128] tensors %9291 / %9292 / %9293.
    // %9286 is not consumed within the visible lines.
    // NOTE(review): %9286 is presumably the MLP branch input (cf. the gelu on the 12288-wide tensor in block 20) — confirm.
    %int-1_12891 = torch.constant.int -1
    %int0_12892 = torch.constant.int 0
    %int9216_12893 = torch.constant.int 9216
    %int1_12894 = torch.constant.int 1
    %9285 = torch.aten.slice.Tensor %9284, %int-1_12891, %int0_12892, %int9216_12893, %int1_12894 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_12895 = torch.constant.int -1
    %int9216_12896 = torch.constant.int 9216
    %int21504_12897 = torch.constant.int 21504
    %int1_12898 = torch.constant.int 1
    %9286 = torch.aten.slice.Tensor %9284, %int-1_12895, %int9216_12896, %int21504_12897, %int1_12898 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    %int1_12899 = torch.constant.int 1
    %int4608_12900 = torch.constant.int 4608
    %int3_12901 = torch.constant.int 3
    %int24_12902 = torch.constant.int 24
    %int128_12903 = torch.constant.int 128
    %9287 = torch.prim.ListConstruct %int1_12899, %int4608_12900, %int3_12901, %int24_12902, %int128_12903 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // View 9216 as 3 x 24 x 128: [1,4608,9216] -> [1,4608,3,24,128].
    %9288 = torch.aten.view %9285, %9287 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    %int2_12904 = torch.constant.int 2
    %int0_12905 = torch.constant.int 0
    %int3_12906 = torch.constant.int 3
    %int1_12907 = torch.constant.int 1
    %int4_12908 = torch.constant.int 4
    %9289 = torch.prim.ListConstruct %int2_12904, %int0_12905, %int3_12906, %int1_12907, %int4_12908 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // Permute with order (2,0,3,1,4): the size-3 axis moves to the front -> [3,1,24,4608,128].
    %9290 = torch.aten.permute %9288, %9289 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    %int0_12909 = torch.constant.int 0
    %int0_12910 = torch.constant.int 0
    // Index 0 on axis 0; normalized below with query_norm.scale.
    %9291 = torch.aten.select.int %9290, %int0_12909, %int0_12910 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_12911 = torch.constant.int 0
    %int1_12912 = torch.constant.int 1
    // Index 1 on axis 0; normalized below with key_norm.scale.
    %9292 = torch.aten.select.int %9290, %int0_12911, %int1_12912 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_12913 = torch.constant.int 0
    %int2_12914 = torch.constant.int 2
    // Index 2 on axis 0; presumably the value tensor (not consumed within the visible lines).
    %9293 = torch.aten.select.int %9290, %int0_12913, %int2_12914 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.21 query normalization (RMS-style), computed in f32:
    //   %9303 = f16(%9291 * rsqrt(mean(%9291^2, dim=-1, keepdim=true) + ~1e-6)) * query_norm.scale
    %int6_12915 = torch.constant.int 6
    %9294 = torch.prims.convert_element_type %9291, %int6_12915 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_12916 = torch.constant.int 2
    %9295 = torch.aten.pow.Tensor_Scalar %9294, %int2_12916 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_12917 = torch.constant.int -1
    %9296 = torch.prim.ListConstruct %int-1_12917 : (!torch.int) -> !torch.list<int>
    %true_12918 = torch.constant.bool true
    %none_12919 = torch.constant.none
    // Mean of squares over the last (size-128) axis, kept as size 1 for broadcasting.
    %9297 = torch.aten.mean.dim %9295, %9296, %true_12918, %none_12919 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_12920 = torch.constant.float 9.9999999999999995E-7
    %int1_12921 = torch.constant.int 1
    %9298 = torch.aten.add.Scalar %9297, %float9.999990e-07_12920, %int1_12921 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %9299 = torch.aten.rsqrt %9298 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %9300 = torch.aten.mul.Tensor %9294, %9299 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_12922 = torch.constant.int 5
    // Back to f16, then apply the learned [128] query scale.
    %9301 = torch.prims.convert_element_type %9300, %int5_12922 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.21.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.21.norm.query_norm.scale : tensor<128xf16>
    %9302 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.21.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %9303 = torch.aten.mul.Tensor %9301, %9302 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.21 key normalization (RMS-style), same op sequence as the query path but on %9292:
    //   %9313 = f16(%9292 * rsqrt(mean(%9292^2, dim=-1, keepdim=true) + ~1e-6)) * key_norm.scale
    %int6_12923 = torch.constant.int 6
    %9304 = torch.prims.convert_element_type %9292, %int6_12923 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_12924 = torch.constant.int 2
    %9305 = torch.aten.pow.Tensor_Scalar %9304, %int2_12924 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_12925 = torch.constant.int -1
    %9306 = torch.prim.ListConstruct %int-1_12925 : (!torch.int) -> !torch.list<int>
    %true_12926 = torch.constant.bool true
    %none_12927 = torch.constant.none
    // Mean of squares over the last axis, kept as size 1.
    %9307 = torch.aten.mean.dim %9305, %9306, %true_12926, %none_12927 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_12928 = torch.constant.float 9.9999999999999995E-7
    %int1_12929 = torch.constant.int 1
    %9308 = torch.aten.add.Scalar %9307, %float9.999990e-07_12928, %int1_12929 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %9309 = torch.aten.rsqrt %9308 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %9310 = torch.aten.mul.Tensor %9304, %9309 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_12930 = torch.constant.int 5
    // Back to f16, then apply the learned [128] key scale.
    %9311 = torch.prims.convert_element_type %9310, %int5_12930 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.21.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.21.norm.key_norm.scale : tensor<128xf16>
    %9312 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.21.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %9313 = torch.aten.mul.Tensor %9311, %9312 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Prepare normalized q (%9303) and k (%9313) for the pairwise products with %211 that follow:
    // upcast each to f32 and view the last axis of 128 as 64 x 1 x 2.
    %int5_12931 = torch.constant.int 5
    // These two conversions are f16 -> f16 (input and result types are identical).
    %9314 = torch.prims.convert_element_type %9303, %int5_12931 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_12932 = torch.constant.int 5
    %9315 = torch.prims.convert_element_type %9313, %int5_12932 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int6_12933 = torch.constant.int 6
    // q: f16 -> f32, then [1,24,4608,128] -> [1,24,4608,64,1,2].
    %9316 = torch.prims.convert_element_type %9314, %int6_12933 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_12934 = torch.constant.int 1
    %int24_12935 = torch.constant.int 24
    %int4608_12936 = torch.constant.int 4608
    %int64_12937 = torch.constant.int 64
    %int1_12938 = torch.constant.int 1
    %int2_12939 = torch.constant.int 2
    %9317 = torch.prim.ListConstruct %int1_12934, %int24_12935, %int4608_12936, %int64_12937, %int1_12938, %int2_12939 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9318 = torch.aten.view %9316, %9317 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    %int6_12940 = torch.constant.int 6
    // k: same upcast and view as q.
    %9319 = torch.prims.convert_element_type %9315, %int6_12940 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_12941 = torch.constant.int 1
    %int24_12942 = torch.constant.int 24
    %int4608_12943 = torch.constant.int 4608
    %int64_12944 = torch.constant.int 64
    %int1_12945 = torch.constant.int 1
    %int2_12946 = torch.constant.int 2
    %9320 = torch.prim.ListConstruct %int1_12941, %int24_12942, %int4608_12943, %int64_12944, %int1_12945, %int2_12946 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9321 = torch.aten.view %9319, %9320 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Combine q (%9318) with %211 ([1,1,4608,64,2,2], defined outside this region):
    //   %9328 = %211[..., 0] * q[..., 0] + %211[..., 1] * q[..., 1]
    // NOTE(review): %211 is presumably the precomputed rotary position-embedding table — confirm at its definition.
    %int5_12947 = torch.constant.int 5
    %int0_12948 = torch.constant.int 0
    %9322 = torch.aten.select.int %211, %int5_12947, %int0_12948 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12949 = torch.constant.int 5
    %int0_12950 = torch.constant.int 0
    %9323 = torch.aten.select.int %9318, %int5_12949, %int0_12950 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    // Broadcast: [1,1,4608,64,2] x [1,24,4608,64,1] -> [1,24,4608,64,2].
    %9324 = torch.aten.mul.Tensor %9322, %9323 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_12951 = torch.constant.int 5
    %int1_12952 = torch.constant.int 1
    %9325 = torch.aten.select.int %211, %int5_12951, %int1_12952 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12953 = torch.constant.int 5
    %int1_12954 = torch.constant.int 1
    %9326 = torch.aten.select.int %9318, %int5_12953, %int1_12954 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9327 = torch.aten.mul.Tensor %9325, %9326 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_12955 = torch.constant.int 1
    // Sum of the two partial products (the third operand of add.Tensor is the constant 1).
    %9328 = torch.aten.add.Tensor %9324, %9327, %int1_12955 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_12956 = torch.constant.int 5
    %int0_12957 = torch.constant.int 0
    %9329 = torch.aten.select.int %211, %int5_12956, %int0_12957 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12958 = torch.constant.int 5
    %int0_12959 = torch.constant.int 0
    %9330 = torch.aten.select.int %9321, %int5_12958, %int0_12959 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9331 = torch.aten.mul.Tensor %9329, %9330 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_12960 = torch.constant.int 5
    %int1_12961 = torch.constant.int 1
    %9332 = torch.aten.select.int %211, %int5_12960, %int1_12961 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12962 = torch.constant.int 5
    %int1_12963 = torch.constant.int 1
    %9333 = torch.aten.select.int %9321, %int5_12962, %int1_12963 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9334 = torch.aten.mul.Tensor %9332, %9333 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_12964 = torch.constant.int 1
    %9335 = torch.aten.add.Tensor %9331, %9334, %int1_12964 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_12965 = torch.constant.int 1
    %int24_12966 = torch.constant.int 24
    %int4608_12967 = torch.constant.int 4608
    %int128_12968 = torch.constant.int 128
    %9336 = torch.prim.ListConstruct %int1_12965, %int24_12966, %int4608_12967, %int128_12968 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9337 = torch.aten.view %9328, %9336 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_12969 = torch.constant.int 5
    %9338 = torch.prims.convert_element_type %9337, %int5_12969 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_12970 = torch.constant.int 1
    %int24_12971 = torch.constant.int 24
    %int4608_12972 = torch.constant.int 4608
    %int128_12973 = torch.constant.int 128
    %9339 = torch.prim.ListConstruct %int1_12970, %int24_12971, %int4608_12972, %int128_12973 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9340 = torch.aten.view %9335, %9339 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_12974 = torch.constant.int 5
    %9341 = torch.prims.convert_element_type %9340, %int5_12974 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.21 attention.
    // Operands are (%9338, %9341, %9293, 0.0, false, none, none); per the ATen schema of this op
    // these are (query, key, value, dropout_p, is_causal, attn_mask, scale).
    // %9338 / %9341 are the f16 results of the rotation step just above; %9293 is defined
    // outside this chunk (presumably the value slice of this block's linear1 output - TODO confirm).
    %float0.000000e00_12975 = torch.constant.float 0.000000e+00
    %false_12976 = torch.constant.bool false
    %none_12977 = torch.constant.none
    %none_12978 = torch.constant.none
    // Result #0 is the [1,24,4608,128] f16 attention output; result #1 ([1,24,4608] f32) is not
    // used anywhere in the visible chunk.
    %9342:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%9338, %9341, %9293, %float0.000000e00_12975, %false_12976, %none_12977, %none_12978) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2: [1,24,4608,128] -> [1,4608,24,128].
    %int0_12979 = torch.constant.int 0
    %int2_12980 = torch.constant.int 2
    %int1_12981 = torch.constant.int 1
    %int3_12982 = torch.constant.int 3
    %9343 = torch.prim.ListConstruct %int0_12979, %int2_12980, %int1_12981, %int3_12982 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9344 = torch.aten.permute %9342#0, %9343 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the last two dims (24 * 128 = 3072): [1,4608,24,128] -> [1,4608,3072].
    %int1_12983 = torch.constant.int 1
    %int4608_12984 = torch.constant.int 4608
    %int3072_12985 = torch.constant.int 3072
    %9345 = torch.prim.ListConstruct %int1_12983, %int4608_12984, %int3072_12985 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9346 = torch.aten.view %9344, %9345 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.21: GELU with the "tanh" approximation on %9286 ([1,4608,12288] f16).
    // %9286 is defined outside this chunk (presumably the MLP slice of this block's linear1
    // output, by analogy with the next block - TODO confirm).
    %str_12986 = torch.constant.str "tanh"
    %9347 = torch.aten.gelu %9286, %str_12986 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    // Concatenate attention output (3072) and GELU output (12288) along dim 2 -> 15360 features.
    %9348 = torch.prim.ListConstruct %9346, %9347 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_12987 = torch.constant.int 2
    %9349 = torch.aten.cat %9348, %int2_12987 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Drop the leading size-1 dim so the result can feed a 2-D matmul: [1,4608,15360] -> [4608,15360].
    %int4608_12988 = torch.constant.int 4608
    %int15360_12989 = torch.constant.int 15360
    %9350 = torch.prim.ListConstruct %int4608_12988, %int15360_12989 : (!torch.int, !torch.int) -> !torch.list<int>
    %9351 = torch.aten.view %9349, %9350 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // single_blocks.21.linear2: maps the [4608,15360] activations %9351 to 3072 features.
    // The weight is loaded as [3072,15360] and transposed to [15360,3072] for the matmul.
    %__auto.sampler.single_blocks.21.linear2.weight = util.global.load @__auto.sampler.single_blocks.21.linear2.weight : tensor<3072x15360xf16>
    %9352 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.21.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_12990 = torch.constant.int 0
    %int1_12991 = torch.constant.int 1
    %9353 = torch.aten.transpose.int %9352, %int0_12990, %int1_12991 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.21.linear2.bias = util.global.load @__auto.sampler.single_blocks.21.linear2.bias : tensor<3072xf16>
    %9354 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.21.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Bias, input and weight are converted f16 -> f32 (dtype code 6) so the matmul and the
    // bias add are carried out in f32.
    %int6_12992 = torch.constant.int 6
    %9355 = torch.prims.convert_element_type %9354, %int6_12992 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_12993 = torch.constant.int 6
    %9356 = torch.prims.convert_element_type %9351, %int6_12993 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_12994 = torch.constant.int 6
    %9357 = torch.prims.convert_element_type %9353, %int6_12994 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %9358 = torch.aten.mm %9356, %9357 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Both multiplications below use the integer constant 1, so they leave the values unchanged
    // (presumably left over from an addmm alpha/beta decomposition - TODO confirm).
    %int1_12995 = torch.constant.int 1
    %9359 = torch.aten.mul.Scalar %9358, %int1_12995 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_12996 = torch.constant.int 1
    %9360 = torch.aten.mul.Scalar %9355, %int1_12996 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Add the bias (broadcast over the 4608 rows), then convert back to f16 (dtype code 5).
    %int1_12997 = torch.constant.int 1
    %9361 = torch.aten.add.Tensor %9359, %9360, %int1_12997 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_12998 = torch.constant.int 5
    %9362 = torch.prims.convert_element_type %9361, %int5_12998 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    // Restore the leading size-1 dim: [4608,3072] -> [1,4608,3072].
    %int1_12999 = torch.constant.int 1
    %int4608_13000 = torch.constant.int 4608
    %int3072_13001 = torch.constant.int 3072
    %9363 = torch.prim.ListConstruct %int1_12999, %int4608_13000, %int3072_13001 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9364 = torch.aten.view %9362, %9363 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Scale the linear2 output by %9259 ([1,1,3072], broadcast over the 4608 positions) and add
    // the result to %9241. Both operands are defined outside this chunk (presumably this block's
    // modulation gate and its residual input - TODO confirm).
    %9365 = torch.aten.mul.Tensor %9259, %9364 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_13002 = torch.constant.int 1
    %9366 = torch.aten.add.Tensor %9241, %9365, %int1_13002 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.22 modulation: SiLU on %119 ([1,3072] f16, defined outside this chunk;
    // presumably the shared conditioning vector - TODO confirm), then a 3072 -> 9216 linear layer.
    %9367 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // The weight is loaded as [9216,3072] and transposed to [3072,9216] for the matmul.
    %__auto.sampler.single_blocks.22.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.22.modulation.lin.weight : tensor<9216x3072xf16>
    %9368 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.22.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_13003 = torch.constant.int 0
    %int1_13004 = torch.constant.int 1
    %9369 = torch.aten.transpose.int %9368, %int0_13003, %int1_13004 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.22.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.22.modulation.lin.bias : tensor<9216xf16>
    %9370 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.22.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Bias, input and weight are converted f16 -> f32 (dtype code 6) before the matmul.
    %int6_13005 = torch.constant.int 6
    %9371 = torch.prims.convert_element_type %9370, %int6_13005 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_13006 = torch.constant.int 6
    %9372 = torch.prims.convert_element_type %9367, %int6_13006 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_13007 = torch.constant.int 6
    %9373 = torch.prims.convert_element_type %9369, %int6_13007 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %9374 = torch.aten.mm %9372, %9373 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Multiplications by the integer constant 1: values are unchanged.
    %int1_13008 = torch.constant.int 1
    %9375 = torch.aten.mul.Scalar %9374, %int1_13008 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_13009 = torch.constant.int 1
    %9376 = torch.aten.mul.Scalar %9371, %int1_13009 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    // Add the bias, then convert back to f16 (dtype code 5).
    %int1_13010 = torch.constant.int 1
    %9377 = torch.aten.add.Tensor %9375, %9376, %int1_13010 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_13011 = torch.constant.int 5
    %9378 = torch.prims.convert_element_type %9377, %int5_13011 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Slice over dim 0 from 0 to INT64_MAX with step 1: the result type equals the input type,
    // i.e. the whole tensor is kept.
    %int0_13012 = torch.constant.int 0
    %int0_13013 = torch.constant.int 0
    %int9223372036854775807_13014 = torch.constant.int 9223372036854775807
    %int1_13015 = torch.constant.int 1
    %9379 = torch.aten.slice.Tensor %9378, %int0_13012, %int0_13013, %int9223372036854775807_13014, %int1_13015 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a size-1 dim at position 1: [1,9216] -> [1,1,9216].
    %int1_13016 = torch.constant.int 1
    %9380 = torch.aten.unsqueeze %9379, %int1_13016 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Another full-range slice (dim 2, 0 to INT64_MAX); shape is unchanged.
    %int2_13017 = torch.constant.int 2
    %int0_13018 = torch.constant.int 0
    %int9223372036854775807_13019 = torch.constant.int 9223372036854775807
    %int1_13020 = torch.constant.int 1
    %9381 = torch.aten.slice.Tensor %9380, %int2_13017, %int0_13018, %int9223372036854775807_13019, %int1_13020 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Split the 9216 features into three 3072-wide chunks along the last dim.
    // Chunk [0:3072) -> %9382, added after normalisation further down (the shift term).
    %int-1_13021 = torch.constant.int -1
    %int0_13022 = torch.constant.int 0
    %int3072_13023 = torch.constant.int 3072
    %int1_13024 = torch.constant.int 1
    %9382 = torch.aten.slice.Tensor %9381, %int-1_13021, %int0_13022, %int3072_13023, %int1_13024 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk [3072:6144) -> %9383, turned into the multiplicative factor %9385 = %9383 + 1 below.
    %int-1_13025 = torch.constant.int -1
    %int3072_13026 = torch.constant.int 3072
    %int6144_13027 = torch.constant.int 6144
    %int1_13028 = torch.constant.int 1
    %9383 = torch.aten.slice.Tensor %9381, %int-1_13025, %int3072_13026, %int6144_13027, %int1_13028 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk [6144:9216) -> %9384. It is not consumed inside the visible chunk (presumably the
    // gate applied after this block's linear2 - TODO confirm).
    %int-1_13029 = torch.constant.int -1
    %int6144_13030 = torch.constant.int 6144
    %int9216_13031 = torch.constant.int 9216
    %int1_13032 = torch.constant.int 1
    %9384 = torch.aten.slice.Tensor %9381, %int-1_13029, %int6144_13030, %int9216_13031, %int1_13032 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %9385 = %9383 + 1 (scalar 1 with alpha 1).
    %int1_13033 = torch.constant.int 1
    %int1_13034 = torch.constant.int 1
    %9385 = torch.aten.add.Scalar %9383, %int1_13033, %int1_13034 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // single_blocks.22 input normalisation over the feature dim (dim 2) of %9366, with no learned
    // weight or bias loaded here, followed by the scale/shift from the modulation chunks.
    // Statistics are computed on an f32 copy of the f16 input.
    %int6_13035 = torch.constant.int 6
    %9386 = torch.prims.convert_element_type %9366, %int6_13035 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    // var_mean over dim 2 with correction 0 and keepdim = true:
    // result0 = variance, result1 = mean, each [1,4608,1].
    %int2_13036 = torch.constant.int 2
    %9387 = torch.prim.ListConstruct %int2_13036 : (!torch.int) -> !torch.list<int>
    %int0_13037 = torch.constant.int 0
    %true_13038 = torch.constant.bool true
    %result0_13039, %result1_13040 = torch.aten.var_mean.correction %9386, %9387, %int0_13037, %true_13038 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // rsqrt(variance + eps); the constant is the double nearest to 1e-6.
    %float9.999990e-07_13041 = torch.constant.float 9.9999999999999995E-7
    %int1_13042 = torch.constant.int 1
    %9388 = torch.aten.add.Scalar %result0_13039, %float9.999990e-07_13041, %int1_13042 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %9389 = torch.aten.rsqrt %9388 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The mean is subtracted from the original f16 tensor %9366 (not the f32 copy %9386);
    // the op's declared result type is f32.
    %int1_13043 = torch.constant.int 1
    %9390 = torch.aten.sub.Tensor %9366, %result1_13040, %int1_13043 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %9391 = torch.aten.mul.Tensor %9390, %9389 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    // Convert the normalised values back to f16 (dtype code 5).
    %int5_13044 = torch.constant.int 5
    %9392 = torch.prims.convert_element_type %9391, %int5_13044 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Modulate: %9394 = %9385 * normalised + %9382, with both [1,1,3072] modulation tensors
    // broadcast over the 4608 positions.
    %9393 = torch.aten.mul.Tensor %9385, %9392 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_13045 = torch.constant.int 1
    %9394 = torch.aten.add.Tensor %9393, %9382, %int1_13045 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.22.linear1: 3072 -> 21504 linear layer on the modulated activations %9394.
    // First drop the leading size-1 dim for the 2-D matmul: [1,4608,3072] -> [4608,3072].
    %int4608_13046 = torch.constant.int 4608
    %int3072_13047 = torch.constant.int 3072
    %9395 = torch.prim.ListConstruct %int4608_13046, %int3072_13047 : (!torch.int, !torch.int) -> !torch.list<int>
    %9396 = torch.aten.view %9394, %9395 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // The weight is loaded as [21504,3072] and transposed to [3072,21504].
    %__auto.sampler.single_blocks.22.linear1.weight = util.global.load @__auto.sampler.single_blocks.22.linear1.weight : tensor<21504x3072xf16>
    %9397 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.22.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_13048 = torch.constant.int 0
    %int1_13049 = torch.constant.int 1
    %9398 = torch.aten.transpose.int %9397, %int0_13048, %int1_13049 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.22.linear1.bias = util.global.load @__auto.sampler.single_blocks.22.linear1.bias : tensor<21504xf16>
    %9399 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.22.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Bias, input and weight are converted f16 -> f32 (dtype code 6) before the matmul.
    %int6_13050 = torch.constant.int 6
    %9400 = torch.prims.convert_element_type %9399, %int6_13050 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_13051 = torch.constant.int 6
    %9401 = torch.prims.convert_element_type %9396, %int6_13051 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_13052 = torch.constant.int 6
    %9402 = torch.prims.convert_element_type %9398, %int6_13052 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %9403 = torch.aten.mm %9401, %9402 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Multiplications by the integer constant 1: values are unchanged.
    %int1_13053 = torch.constant.int 1
    %9404 = torch.aten.mul.Scalar %9403, %int1_13053 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_13054 = torch.constant.int 1
    %9405 = torch.aten.mul.Scalar %9400, %int1_13054 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    // Add the bias (broadcast over rows), then convert back to f16 (dtype code 5).
    %int1_13055 = torch.constant.int 1
    %9406 = torch.aten.add.Tensor %9404, %9405, %int1_13055 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_13056 = torch.constant.int 5
    %9407 = torch.prims.convert_element_type %9406, %int5_13056 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    // Restore the leading size-1 dim: [4608,21504] -> [1,4608,21504].
    %int1_13057 = torch.constant.int 1
    %int4608_13058 = torch.constant.int 4608
    %int21504_13059 = torch.constant.int 21504
    %9408 = torch.prim.ListConstruct %int1_13057, %int4608_13058, %int21504_13059 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9409 = torch.aten.view %9407, %9408 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the 21504 linear1 features along the last dim.
    // Features [0:9216) -> %9410, reshaped into three attention tensors below.
    %int-1_13060 = torch.constant.int -1
    %int0_13061 = torch.constant.int 0
    %int9216_13062 = torch.constant.int 9216
    %int1_13063 = torch.constant.int 1
    %9410 = torch.aten.slice.Tensor %9409, %int-1_13060, %int0_13061, %int9216_13062, %int1_13063 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    // Features [9216:21504) -> %9411 (12288 wide), fed to the tanh-GELU after attention.
    %int-1_13064 = torch.constant.int -1
    %int9216_13065 = torch.constant.int 9216
    %int21504_13066 = torch.constant.int 21504
    %int1_13067 = torch.constant.int 1
    %9411 = torch.aten.slice.Tensor %9409, %int-1_13064, %int9216_13065, %int21504_13066, %int1_13067 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View the 9216 features as 3 x 24 x 128: [1,4608,9216] -> [1,4608,3,24,128].
    %int1_13068 = torch.constant.int 1
    %int4608_13069 = torch.constant.int 4608
    %int3_13070 = torch.constant.int 3
    %int24_13071 = torch.constant.int 24
    %int128_13072 = torch.constant.int 128
    %9412 = torch.prim.ListConstruct %int1_13068, %int4608_13069, %int3_13070, %int24_13071, %int128_13072 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9413 = torch.aten.view %9410, %9412 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2,0,3,1,4) so the size-3 dim leads: -> [3,1,24,4608,128].
    %int2_13073 = torch.constant.int 2
    %int0_13074 = torch.constant.int 0
    %int3_13075 = torch.constant.int 3
    %int1_13076 = torch.constant.int 1
    %int4_13077 = torch.constant.int 4
    %9414 = torch.prim.ListConstruct %int2_13073, %int0_13074, %int3_13075, %int1_13076, %int4_13077 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9415 = torch.aten.permute %9413, %9414 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0 of dim 0 -> %9416: later scaled by norm.query_norm.scale (the query).
    %int0_13078 = torch.constant.int 0
    %int0_13079 = torch.constant.int 0
    %9416 = torch.aten.select.int %9415, %int0_13078, %int0_13079 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 1 of dim 0 -> %9417: later scaled by norm.key_norm.scale (the key).
    %int0_13080 = torch.constant.int 0
    %int1_13081 = torch.constant.int 1
    %9417 = torch.aten.select.int %9415, %int0_13080, %int1_13081 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 2 of dim 0 -> %9418: passed unmodified as the third operand of the attention op.
    %int0_13082 = torch.constant.int 0
    %int2_13083 = torch.constant.int 2
    %9418 = torch.aten.select.int %9415, %int0_13082, %int2_13083 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.22 query normalisation: x * rsqrt(mean(x^2 over last dim) + eps), computed
    // in f32, converted to f16, then multiplied by the learned [128] scale.
    %int6_13084 = torch.constant.int 6
    %9419 = torch.prims.convert_element_type %9416, %int6_13084 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Square element-wise (pow with integer exponent 2).
    %int2_13085 = torch.constant.int 2
    %9420 = torch.aten.pow.Tensor_Scalar %9419, %int2_13085 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean over the last dim with keepdim = true -> [1,24,4608,1].
    %int-1_13086 = torch.constant.int -1
    %9421 = torch.prim.ListConstruct %int-1_13086 : (!torch.int) -> !torch.list<int>
    %true_13087 = torch.constant.bool true
    %none_13088 = torch.constant.none
    %9422 = torch.aten.mean.dim %9420, %9421, %true_13087, %none_13088 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // Add eps (the double nearest to 1e-6) and take the reciprocal square root.
    %float9.999990e-07_13089 = torch.constant.float 9.9999999999999995E-7
    %int1_13090 = torch.constant.int 1
    %9423 = torch.aten.add.Scalar %9422, %float9.999990e-07_13089, %int1_13090 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %9424 = torch.aten.rsqrt %9423 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %9425 = torch.aten.mul.Tensor %9419, %9424 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    // Back to f16 (dtype code 5), then apply the query scale, broadcast over the last dim.
    %int5_13091 = torch.constant.int 5
    %9426 = torch.prims.convert_element_type %9425, %int5_13091 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.22.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.22.norm.query_norm.scale : tensor<128xf16>
    %9427 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.22.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %9428 = torch.aten.mul.Tensor %9426, %9427 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.22 key normalisation: the same sequence of ops applied to %9417, using
    // norm.key_norm.scale.
    %int6_13092 = torch.constant.int 6
    %9429 = torch.prims.convert_element_type %9417, %int6_13092 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_13093 = torch.constant.int 2
    %9430 = torch.aten.pow.Tensor_Scalar %9429, %int2_13093 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_13094 = torch.constant.int -1
    %9431 = torch.prim.ListConstruct %int-1_13094 : (!torch.int) -> !torch.list<int>
    %true_13095 = torch.constant.bool true
    %none_13096 = torch.constant.none
    %9432 = torch.aten.mean.dim %9430, %9431, %true_13095, %none_13096 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_13097 = torch.constant.float 9.9999999999999995E-7
    %int1_13098 = torch.constant.int 1
    %9433 = torch.aten.add.Scalar %9432, %float9.999990e-07_13097, %int1_13098 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %9434 = torch.aten.rsqrt %9433 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %9435 = torch.aten.mul.Tensor %9429, %9434 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13099 = torch.constant.int 5
    %9436 = torch.prims.convert_element_type %9435, %int5_13099 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.22.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.22.norm.key_norm.scale : tensor<128xf16>
    %9437 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.22.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %9438 = torch.aten.mul.Tensor %9436, %9437 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.22: mix each adjacent feature pair of the normalised query (%9428) and key
    // (%9438) with the 2x2 coefficients held in %211 ([1,1,4608,64,2,2] f32, defined outside this
    // chunk). NOTE(review): this looks like rotary position embedding with %211 holding the
    // per-position rotation matrices - confirm at the definition of %211.
    // These two casts are f16 -> f16 (dtype code 5 on f16 inputs), so the element type is unchanged.
    %int5_13100 = torch.constant.int 5
    %9439 = torch.prims.convert_element_type %9428, %int5_13100 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_13101 = torch.constant.int 5
    %9440 = torch.prims.convert_element_type %9438, %int5_13101 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Query in f32, viewed as 64 pairs: [1,24,4608,128] -> [1,24,4608,64,1,2].
    %int6_13102 = torch.constant.int 6
    %9441 = torch.prims.convert_element_type %9439, %int6_13102 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_13103 = torch.constant.int 1
    %int24_13104 = torch.constant.int 24
    %int4608_13105 = torch.constant.int 4608
    %int64_13106 = torch.constant.int 64
    %int1_13107 = torch.constant.int 1
    %int2_13108 = torch.constant.int 2
    %9442 = torch.prim.ListConstruct %int1_13103, %int24_13104, %int4608_13105, %int64_13106, %int1_13107, %int2_13108 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9443 = torch.aten.view %9441, %9442 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Key in f32, viewed the same way.
    %int6_13109 = torch.constant.int 6
    %9444 = torch.prims.convert_element_type %9440, %int6_13109 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_13110 = torch.constant.int 1
    %int24_13111 = torch.constant.int 24
    %int4608_13112 = torch.constant.int 4608
    %int64_13113 = torch.constant.int 64
    %int1_13114 = torch.constant.int 1
    %int2_13115 = torch.constant.int 2
    %9445 = torch.prim.ListConstruct %int1_13110, %int24_13111, %int4608_13112, %int64_13113, %int1_13114, %int2_13115 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9446 = torch.aten.view %9444, %9445 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Query: %9453 = %211[..., 0] * q[..., 0] + %211[..., 1] * q[..., 1] (index taken on dim 5).
    // %211 has size 1 on dim 1, so it broadcasts over the 24 entries of that dim.
    %int5_13116 = torch.constant.int 5
    %int0_13117 = torch.constant.int 0
    %9447 = torch.aten.select.int %211, %int5_13116, %int0_13117 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13118 = torch.constant.int 5
    %int0_13119 = torch.constant.int 0
    %9448 = torch.aten.select.int %9443, %int5_13118, %int0_13119 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9449 = torch.aten.mul.Tensor %9447, %9448 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_13120 = torch.constant.int 5
    %int1_13121 = torch.constant.int 1
    %9450 = torch.aten.select.int %211, %int5_13120, %int1_13121 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13122 = torch.constant.int 5
    %int1_13123 = torch.constant.int 1
    %9451 = torch.aten.select.int %9443, %int5_13122, %int1_13123 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9452 = torch.aten.mul.Tensor %9450, %9451 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_13124 = torch.constant.int 1
    %9453 = torch.aten.add.Tensor %9449, %9452, %int1_13124 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key: the same combination applied to %9446 -> %9460.
    %int5_13125 = torch.constant.int 5
    %int0_13126 = torch.constant.int 0
    %9454 = torch.aten.select.int %211, %int5_13125, %int0_13126 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13127 = torch.constant.int 5
    %int0_13128 = torch.constant.int 0
    %9455 = torch.aten.select.int %9446, %int5_13127, %int0_13128 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9456 = torch.aten.mul.Tensor %9454, %9455 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_13129 = torch.constant.int 5
    %int1_13130 = torch.constant.int 1
    %9457 = torch.aten.select.int %211, %int5_13129, %int1_13130 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13131 = torch.constant.int 5
    %int1_13132 = torch.constant.int 1
    %9458 = torch.aten.select.int %9446, %int5_13131, %int1_13132 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9459 = torch.aten.mul.Tensor %9457, %9458 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_13133 = torch.constant.int 1
    %9460 = torch.aten.add.Tensor %9456, %9459, %int1_13133 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Flatten the (64,2) pairs back to 128 features and convert to f16 (dtype code 5): query.
    %int1_13134 = torch.constant.int 1
    %int24_13135 = torch.constant.int 24
    %int4608_13136 = torch.constant.int 4608
    %int128_13137 = torch.constant.int 128
    %9461 = torch.prim.ListConstruct %int1_13134, %int24_13135, %int4608_13136, %int128_13137 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9462 = torch.aten.view %9453, %9461 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13138 = torch.constant.int 5
    %9463 = torch.prims.convert_element_type %9462, %int5_13138 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Same flatten and f16 conversion for the key.
    %int1_13139 = torch.constant.int 1
    %int24_13140 = torch.constant.int 24
    %int4608_13141 = torch.constant.int 4608
    %int128_13142 = torch.constant.int 128
    %9464 = torch.prim.ListConstruct %int1_13139, %int24_13140, %int4608_13141, %int128_13142 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9465 = torch.aten.view %9460, %9464 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13143 = torch.constant.int 5
    %9466 = torch.prims.convert_element_type %9465, %int5_13143 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.22 attention.
    // Operands are (%9463, %9466, %9418, 0.0, false, none, none); per the ATen schema of this op
    // these are (query, key, value, dropout_p, is_causal, attn_mask, scale).
    // %9463 / %9466 are the pair-mixed query and key; %9418 is the third slice of the linear1
    // output, used without normalisation.
    %float0.000000e00_13144 = torch.constant.float 0.000000e+00
    %false_13145 = torch.constant.bool false
    %none_13146 = torch.constant.none
    %none_13147 = torch.constant.none
    // Result #0 is the [1,24,4608,128] f16 attention output; result #1 ([1,24,4608] f32) is not
    // used anywhere in the visible chunk.
    %9467:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%9463, %9466, %9418, %float0.000000e00_13144, %false_13145, %none_13146, %none_13147) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2: [1,24,4608,128] -> [1,4608,24,128].
    %int0_13148 = torch.constant.int 0
    %int2_13149 = torch.constant.int 2
    %int1_13150 = torch.constant.int 1
    %int3_13151 = torch.constant.int 3
    %9468 = torch.prim.ListConstruct %int0_13148, %int2_13149, %int1_13150, %int3_13151 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9469 = torch.aten.permute %9467#0, %9468 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the last two dims (24 * 128 = 3072): [1,4608,24,128] -> [1,4608,3072].
    %int1_13152 = torch.constant.int 1
    %int4608_13153 = torch.constant.int 4608
    %int3072_13154 = torch.constant.int 3072
    %9470 = torch.prim.ListConstruct %int1_13152, %int4608_13153, %int3072_13154 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9471 = torch.aten.view %9469, %9470 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Apply GELU with the "tanh" approximation to %9411 ([1,4608,12288]) and
    // concatenate it after the reshaped attention output %9471 ([1,4608,3072])
    // along dim 2, giving the [1,4608,15360] input of single_blocks.22.linear2.
    %str_13155 = torch.constant.str "tanh"
    %9472 = torch.aten.gelu %9411, %str_13155 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %9473 = torch.prim.ListConstruct %9471, %9472 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_13156 = torch.constant.int 2
    %9474 = torch.aten.cat %9473, %int2_13156 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    %int4608_13157 = torch.constant.int 4608
    %int15360_13158 = torch.constant.int 15360
    %9475 = torch.prim.ListConstruct %int4608_13157, %int15360_13158 : (!torch.int, !torch.int) -> !torch.list<int>
    %9476 = torch.aten.view %9474, %9475 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    %__auto.sampler.single_blocks.22.linear2.weight = util.global.load @__auto.sampler.single_blocks.22.linear2.weight : tensor<3072x15360xf16>
    %9477 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.22.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_13159 = torch.constant.int 0
    %int1_13160 = torch.constant.int 1
    %9478 = torch.aten.transpose.int %9477, %int0_13159, %int1_13160 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.22.linear2.bias = util.global.load @__auto.sampler.single_blocks.22.linear2.bias : tensor<3072xf16>
    %9479 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.22.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_13161 = torch.constant.int 6
    %9480 = torch.prims.convert_element_type %9479, %int6_13161 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_13162 = torch.constant.int 6
    %9481 = torch.prims.convert_element_type %9476, %int6_13162 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_13163 = torch.constant.int 6
    %9482 = torch.prims.convert_element_type %9478, %int6_13163 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %9483 = torch.aten.mm %9481, %9482 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    %int1_13164 = torch.constant.int 1
    %9484 = torch.aten.mul.Scalar %9483, %int1_13164 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_13165 = torch.constant.int 1
    %9485 = torch.aten.mul.Scalar %9480, %int1_13165 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_13166 = torch.constant.int 1
    %9486 = torch.aten.add.Tensor %9484, %9485, %int1_13166 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_13167 = torch.constant.int 5
    %9487 = torch.prims.convert_element_type %9486, %int5_13167 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_13168 = torch.constant.int 1
    %int4608_13169 = torch.constant.int 4608
    %int3072_13170 = torch.constant.int 3072
    %9488 = torch.prim.ListConstruct %int1_13168, %int4608_13169, %int3072_13170 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9489 = torch.aten.view %9487, %9488 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Residual update closing single_blocks.22: the linear2 output %9489 is
    // multiplied elementwise by %9384 ([1,1,3072], broadcast over the 4608 axis)
    // and added to %9366 with alpha = 1. The sum %9491 is the hidden state that
    // single_blocks.23 below normalizes and later adds its own output to.
    %9490 = torch.aten.mul.Tensor %9384, %9489 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_13171 = torch.constant.int 1
    %9491 = torch.aten.add.Tensor %9366, %9490, %int1_13171 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // ---- single_blocks.23 starts here ----
    // Modulation: SiLU of %119 ([1,3072]) is projected by modulation.lin
    // (weight stored as [9216,3072], transposed before the matmul below).
    // %119 is defined outside this chunk.
    %9492 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.23.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.23.modulation.lin.weight : tensor<9216x3072xf16>
    %9493 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.23.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_13172 = torch.constant.int 0
    %int1_13173 = torch.constant.int 1
    %9494 = torch.aten.transpose.int %9493, %int0_13172, %int1_13173 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.23.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.23.modulation.lin.bias : tensor<9216xf16>
    %9495 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.23.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    %int6_13174 = torch.constant.int 6
    %9496 = torch.prims.convert_element_type %9495, %int6_13174 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_13175 = torch.constant.int 6
    %9497 = torch.prims.convert_element_type %9492, %int6_13175 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_13176 = torch.constant.int 6
    %9498 = torch.prims.convert_element_type %9494, %int6_13176 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %9499 = torch.aten.mm %9497, %9498 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    %int1_13177 = torch.constant.int 1
    %9500 = torch.aten.mul.Scalar %9499, %int1_13177 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_13178 = torch.constant.int 1
    %9501 = torch.aten.mul.Scalar %9496, %int1_13178 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_13179 = torch.constant.int 1
    %9502 = torch.aten.add.Tensor %9500, %9501, %int1_13179 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_13180 = torch.constant.int 5
    %9503 = torch.prims.convert_element_type %9502, %int5_13180 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    %int0_13181 = torch.constant.int 0
    %int0_13182 = torch.constant.int 0
    %int9223372036854775807_13183 = torch.constant.int 9223372036854775807
    %int1_13184 = torch.constant.int 1
    %9504 = torch.aten.slice.Tensor %9503, %int0_13181, %int0_13182, %int9223372036854775807_13183, %int1_13184 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    %int1_13185 = torch.constant.int 1
    %9505 = torch.aten.unsqueeze %9504, %int1_13185 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int2_13186 = torch.constant.int 2
    %int0_13187 = torch.constant.int 0
    %int9223372036854775807_13188 = torch.constant.int 9223372036854775807
    %int1_13189 = torch.constant.int 1
    %9506 = torch.aten.slice.Tensor %9505, %int2_13186, %int0_13187, %int9223372036854775807_13188, %int1_13189 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Split the [1,1,9216] modulation output %9506 into three [1,1,3072] slices
    // along the last axis:
    //   %9507 = [0:3072]    -> added to the normalized hidden state further down
    //   %9508 = [3072:6144] -> incremented by 1 (%9510) and used as its multiplier
    //   %9509 = [6144:9216] -> multiplies the linear2 output before the residual add
    %int-1_13190 = torch.constant.int -1
    %int0_13191 = torch.constant.int 0
    %int3072_13192 = torch.constant.int 3072
    %int1_13193 = torch.constant.int 1
    %9507 = torch.aten.slice.Tensor %9506, %int-1_13190, %int0_13191, %int3072_13192, %int1_13193 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_13194 = torch.constant.int -1
    %int3072_13195 = torch.constant.int 3072
    %int6144_13196 = torch.constant.int 6144
    %int1_13197 = torch.constant.int 1
    %9508 = torch.aten.slice.Tensor %9506, %int-1_13194, %int3072_13195, %int6144_13196, %int1_13197 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_13198 = torch.constant.int -1
    %int6144_13199 = torch.constant.int 6144
    %int9216_13200 = torch.constant.int 9216
    %int1_13201 = torch.constant.int 1
    %9509 = torch.aten.slice.Tensor %9506, %int-1_13198, %int6144_13199, %int9216_13200, %int1_13201 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %9510 = %9508 + 1 (add.Scalar with other = 1, alpha = 1).
    %int1_13202 = torch.constant.int 1
    %int1_13203 = torch.constant.int 1
    %9510 = torch.aten.add.Scalar %9508, %int1_13202, %int1_13203 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize the hidden state %9491 over dim 2 (size 3072) in f32:
    // var_mean with correction = 0 and keepdim = true yields the variance
    // (%result0_13208) and mean (%result1_13209), then
    //   %9516 = (%9491 - mean) * rsqrt(var + eps).
    // No learned weight or bias is applied in this span.
    %int6_13204 = torch.constant.int 6
    %9511 = torch.prims.convert_element_type %9491, %int6_13204 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_13205 = torch.constant.int 2
    %9512 = torch.prim.ListConstruct %int2_13205 : (!torch.int) -> !torch.list<int>
    %int0_13206 = torch.constant.int 0
    %true_13207 = torch.constant.bool true
    %result0_13208, %result1_13209 = torch.aten.var_mean.correction %9511, %9512, %int0_13206, %true_13207 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    %float9.999990e-07_13210 = torch.constant.float 9.9999999999999995E-7
    %int1_13211 = torch.constant.int 1
    %9513 = torch.aten.add.Scalar %result0_13208, %float9.999990e-07_13210, %int1_13211 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %9514 = torch.aten.rsqrt %9513 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    %int1_13212 = torch.constant.int 1
    // The subtraction takes the f16 tensor %9491 (not the f32 copy %9511) together
    // with the f32 mean; the declared result type is f32.
    %9515 = torch.aten.sub.Tensor %9491, %result1_13209, %int1_13212 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %9516 = torch.aten.mul.Tensor %9515, %9514 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_13213 = torch.constant.int 5
    %9517 = torch.prims.convert_element_type %9516, %int5_13213 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Modulate in f16: %9519 = %9510 * normalized + %9507, with both [1,1,3072]
    // modulation tensors broadcast across the 4608 axis.
    %9518 = torch.aten.mul.Tensor %9510, %9517 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_13214 = torch.constant.int 1
    %9519 = torch.aten.add.Tensor %9518, %9507, %int1_13214 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int4608_13215 = torch.constant.int 4608
    %int3072_13216 = torch.constant.int 3072
    %9520 = torch.prim.ListConstruct %int4608_13215, %int3072_13216 : (!torch.int, !torch.int) -> !torch.list<int>
    %9521 = torch.aten.view %9519, %9520 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    %__auto.sampler.single_blocks.23.linear1.weight = util.global.load @__auto.sampler.single_blocks.23.linear1.weight : tensor<21504x3072xf16>
    %9522 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.23.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_13217 = torch.constant.int 0
    %int1_13218 = torch.constant.int 1
    %9523 = torch.aten.transpose.int %9522, %int0_13217, %int1_13218 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.23.linear1.bias = util.global.load @__auto.sampler.single_blocks.23.linear1.bias : tensor<21504xf16>
    %9524 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.23.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    %int6_13219 = torch.constant.int 6
    %9525 = torch.prims.convert_element_type %9524, %int6_13219 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_13220 = torch.constant.int 6
    %9526 = torch.prims.convert_element_type %9521, %int6_13220 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_13221 = torch.constant.int 6
    %9527 = torch.prims.convert_element_type %9523, %int6_13221 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %9528 = torch.aten.mm %9526, %9527 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    %int1_13222 = torch.constant.int 1
    %9529 = torch.aten.mul.Scalar %9528, %int1_13222 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_13223 = torch.constant.int 1
    %9530 = torch.aten.mul.Scalar %9525, %int1_13223 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_13224 = torch.constant.int 1
    %9531 = torch.aten.add.Tensor %9529, %9530, %int1_13224 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_13225 = torch.constant.int 5
    %9532 = torch.prims.convert_element_type %9531, %int5_13225 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_13226 = torch.constant.int 1
    %int4608_13227 = torch.constant.int 4608
    %int21504_13228 = torch.constant.int 21504
    %9533 = torch.prim.ListConstruct %int1_13226, %int4608_13227, %int21504_13228 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9534 = torch.aten.view %9532, %9533 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the linear1 output %9534 ([1,4608,21504]) along the last axis:
    //   %9535 = [0:9216]     -> reshaped below into three [1,24,4608,128] tensors
    //   %9536 = [9216:21504] -> the 12288-wide stream that goes through GELU
    %int-1_13229 = torch.constant.int -1
    %int0_13230 = torch.constant.int 0
    %int9216_13231 = torch.constant.int 9216
    %int1_13232 = torch.constant.int 1
    %9535 = torch.aten.slice.Tensor %9534, %int-1_13229, %int0_13230, %int9216_13231, %int1_13232 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_13233 = torch.constant.int -1
    %int9216_13234 = torch.constant.int 9216
    %int21504_13235 = torch.constant.int 21504
    %int1_13236 = torch.constant.int 1
    %9536 = torch.aten.slice.Tensor %9534, %int-1_13233, %int9216_13234, %int21504_13235, %int1_13236 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View the 9216-wide slice as [1,4608,3,24,128], permute with (2,0,3,1,4) to
    // [3,1,24,4608,128], then take indices 0, 1 and 2 along dim 0:
    //   %9541 -> normalized with norm.query_norm.scale below
    //   %9542 -> normalized with norm.key_norm.scale below
    //   %9543 -> passed unchanged as the third tensor operand of the attention op
    %int1_13237 = torch.constant.int 1
    %int4608_13238 = torch.constant.int 4608
    %int3_13239 = torch.constant.int 3
    %int24_13240 = torch.constant.int 24
    %int128_13241 = torch.constant.int 128
    %9537 = torch.prim.ListConstruct %int1_13237, %int4608_13238, %int3_13239, %int24_13240, %int128_13241 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9538 = torch.aten.view %9535, %9537 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    %int2_13242 = torch.constant.int 2
    %int0_13243 = torch.constant.int 0
    %int3_13244 = torch.constant.int 3
    %int1_13245 = torch.constant.int 1
    %int4_13246 = torch.constant.int 4
    %9539 = torch.prim.ListConstruct %int2_13242, %int0_13243, %int3_13244, %int1_13245, %int4_13246 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9540 = torch.aten.permute %9538, %9539 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    %int0_13247 = torch.constant.int 0
    %int0_13248 = torch.constant.int 0
    %9541 = torch.aten.select.int %9540, %int0_13247, %int0_13248 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_13249 = torch.constant.int 0
    %int1_13250 = torch.constant.int 1
    %9542 = torch.aten.select.int %9540, %int0_13249, %int1_13250 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_13251 = torch.constant.int 0
    %int2_13252 = torch.constant.int 2
    %9543 = torch.aten.select.int %9540, %int0_13251, %int2_13252 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Root-mean-square normalization of %9541 over its last axis, computed in f32:
    //   %9550 = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps)
    // The result is cast back to f16 and scaled per channel by the [128]
    // parameter norm.query_norm.scale.
    %int6_13253 = torch.constant.int 6
    %9544 = torch.prims.convert_element_type %9541, %int6_13253 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_13254 = torch.constant.int 2
    %9545 = torch.aten.pow.Tensor_Scalar %9544, %int2_13254 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_13255 = torch.constant.int -1
    %9546 = torch.prim.ListConstruct %int-1_13255 : (!torch.int) -> !torch.list<int>
    %true_13256 = torch.constant.bool true
    %none_13257 = torch.constant.none
    %9547 = torch.aten.mean.dim %9545, %9546, %true_13256, %none_13257 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_13258 = torch.constant.float 9.9999999999999995E-7
    %int1_13259 = torch.constant.int 1
    %9548 = torch.aten.add.Scalar %9547, %float9.999990e-07_13258, %int1_13259 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %9549 = torch.aten.rsqrt %9548 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %9550 = torch.aten.mul.Tensor %9544, %9549 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13260 = torch.constant.int 5
    %9551 = torch.prims.convert_element_type %9550, %int5_13260 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.23.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.23.norm.query_norm.scale : tensor<128xf16>
    %9552 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.23.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %9553 = torch.aten.mul.Tensor %9551, %9552 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    %int6_13261 = torch.constant.int 6
    %9554 = torch.prims.convert_element_type %9542, %int6_13261 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_13262 = torch.constant.int 2
    %9555 = torch.aten.pow.Tensor_Scalar %9554, %int2_13262 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_13263 = torch.constant.int -1
    %9556 = torch.prim.ListConstruct %int-1_13263 : (!torch.int) -> !torch.list<int>
    %true_13264 = torch.constant.bool true
    %none_13265 = torch.constant.none
    %9557 = torch.aten.mean.dim %9555, %9556, %true_13264, %none_13265 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_13266 = torch.constant.float 9.9999999999999995E-7
    %int1_13267 = torch.constant.int 1
    %9558 = torch.aten.add.Scalar %9557, %float9.999990e-07_13266, %int1_13267 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %9559 = torch.aten.rsqrt %9558 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %9560 = torch.aten.mul.Tensor %9554, %9559 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13268 = torch.constant.int 5
    %9561 = torch.prims.convert_element_type %9560, %int5_13268 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.23.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.23.norm.key_norm.scale : tensor<128xf16>
    %9562 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.23.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %9563 = torch.aten.mul.Tensor %9561, %9562 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_13269 = torch.constant.int 5
    %9564 = torch.prims.convert_element_type %9553, %int5_13269 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_13270 = torch.constant.int 5
    %9565 = torch.prims.convert_element_type %9563, %int5_13270 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int6_13271 = torch.constant.int 6
    %9566 = torch.prims.convert_element_type %9564, %int6_13271 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_13272 = torch.constant.int 1
    %int24_13273 = torch.constant.int 24
    %int4608_13274 = torch.constant.int 4608
    %int64_13275 = torch.constant.int 64
    %int1_13276 = torch.constant.int 1
    %int2_13277 = torch.constant.int 2
    %9567 = torch.prim.ListConstruct %int1_13272, %int24_13273, %int4608_13274, %int64_13275, %int1_13276, %int2_13277 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9568 = torch.aten.view %9566, %9567 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    %int6_13278 = torch.constant.int 6
    %9569 = torch.prims.convert_element_type %9565, %int6_13278 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_13279 = torch.constant.int 1
    %int24_13280 = torch.constant.int 24
    %int4608_13281 = torch.constant.int 4608
    %int64_13282 = torch.constant.int 64
    %int1_13283 = torch.constant.int 1
    %int2_13284 = torch.constant.int 2
    %9570 = torch.prim.ListConstruct %int1_13279, %int24_13280, %int4608_13281, %int64_13282, %int1_13283, %int2_13284 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9571 = torch.aten.view %9569, %9570 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Combine the query-side view %9568 ([1,24,4608,64,1,2]) with %211
    // ([1,1,4608,64,2,2]) pairwise along their last axis (dim 5):
    //   %9578 = %211[..., 0] * %9568[..., 0] + %211[..., 1] * %9568[..., 1]
    // Broadcasting expands %211's size-1 axis 1 to 24 and %9568's size-1 axis 4
    // to 2, giving [1,24,4608,64,2].
    // NOTE(review): this has the form of a rotary position embedding applied to
    // the query; %211 is defined outside this chunk — confirm.
    %int5_13285 = torch.constant.int 5
    %int0_13286 = torch.constant.int 0
    %9572 = torch.aten.select.int %211, %int5_13285, %int0_13286 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13287 = torch.constant.int 5
    %int0_13288 = torch.constant.int 0
    %9573 = torch.aten.select.int %9568, %int5_13287, %int0_13288 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9574 = torch.aten.mul.Tensor %9572, %9573 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_13289 = torch.constant.int 5
    %int1_13290 = torch.constant.int 1
    %9575 = torch.aten.select.int %211, %int5_13289, %int1_13290 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13291 = torch.constant.int 5
    %int1_13292 = torch.constant.int 1
    %9576 = torch.aten.select.int %9568, %int5_13291, %int1_13292 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9577 = torch.aten.mul.Tensor %9575, %9576 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_13293 = torch.constant.int 1
    %9578 = torch.aten.add.Tensor %9574, %9577, %int1_13293 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_13294 = torch.constant.int 5
    %int0_13295 = torch.constant.int 0
    %9579 = torch.aten.select.int %211, %int5_13294, %int0_13295 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13296 = torch.constant.int 5
    %int0_13297 = torch.constant.int 0
    %9580 = torch.aten.select.int %9571, %int5_13296, %int0_13297 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9581 = torch.aten.mul.Tensor %9579, %9580 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_13298 = torch.constant.int 5
    %int1_13299 = torch.constant.int 1
    %9582 = torch.aten.select.int %211, %int5_13298, %int1_13299 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13300 = torch.constant.int 5
    %int1_13301 = torch.constant.int 1
    %9583 = torch.aten.select.int %9571, %int5_13300, %int1_13301 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9584 = torch.aten.mul.Tensor %9582, %9583 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_13302 = torch.constant.int 1
    %9585 = torch.aten.add.Tensor %9581, %9584, %int1_13302 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_13303 = torch.constant.int 1
    %int24_13304 = torch.constant.int 24
    %int4608_13305 = torch.constant.int 4608
    %int128_13306 = torch.constant.int 128
    %9586 = torch.prim.ListConstruct %int1_13303, %int24_13304, %int4608_13305, %int128_13306 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9587 = torch.aten.view %9578, %9586 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13307 = torch.constant.int 5
    %9588 = torch.prims.convert_element_type %9587, %int5_13307 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_13308 = torch.constant.int 1
    %int24_13309 = torch.constant.int 24
    %int4608_13310 = torch.constant.int 4608
    %int128_13311 = torch.constant.int 128
    %9589 = torch.prim.ListConstruct %int1_13308, %int24_13309, %int4608_13310, %int128_13311 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9590 = torch.aten.view %9585, %9589 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13312 = torch.constant.int 5
    %9591 = torch.prims.convert_element_type %9590, %int5_13312 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scalar operands for the attention call that follows: a float 0.0, a bool
    // false and two `none` values, passed after the three [1,24,4608,128] f16
    // tensors %9588, %9591 and %9543.
    // NOTE(review): presumably dropout_p, is_causal, attn_mask and scale in that
    // order — confirm against the aten op schema.
    %float0.000000e00_13313 = torch.constant.float 0.000000e+00
    %false_13314 = torch.constant.bool false
    %none_13315 = torch.constant.none
    %none_13316 = torch.constant.none
    %9592:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%9588, %9591, %9543, %float0.000000e00_13313, %false_13314, %none_13315, %none_13316) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    %int0_13317 = torch.constant.int 0
    %int2_13318 = torch.constant.int 2
    %int1_13319 = torch.constant.int 1
    %int3_13320 = torch.constant.int 3
    %9593 = torch.prim.ListConstruct %int0_13317, %int2_13318, %int1_13319, %int3_13320 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9594 = torch.aten.permute %9592#0, %9593 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_13321 = torch.constant.int 1
    %int4608_13322 = torch.constant.int 4608
    %int3072_13323 = torch.constant.int 3072
    %9595 = torch.prim.ListConstruct %int1_13321, %int4608_13322, %int3072_13323 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9596 = torch.aten.view %9594, %9595 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    %str_13324 = torch.constant.str "tanh"
    %9597 = torch.aten.gelu %9536, %str_13324 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %9598 = torch.prim.ListConstruct %9596, %9597 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_13325 = torch.constant.int 2
    %9599 = torch.aten.cat %9598, %int2_13325 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    %int4608_13326 = torch.constant.int 4608
    %int15360_13327 = torch.constant.int 15360
    %9600 = torch.prim.ListConstruct %int4608_13326, %int15360_13327 : (!torch.int, !torch.int) -> !torch.list<int>
    %9601 = torch.aten.view %9599, %9600 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    %__auto.sampler.single_blocks.23.linear2.weight = util.global.load @__auto.sampler.single_blocks.23.linear2.weight : tensor<3072x15360xf16>
    %9602 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.23.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_13328 = torch.constant.int 0
    %int1_13329 = torch.constant.int 1
    %9603 = torch.aten.transpose.int %9602, %int0_13328, %int1_13329 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.23.linear2.bias = util.global.load @__auto.sampler.single_blocks.23.linear2.bias : tensor<3072xf16>
    %9604 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.23.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_13330 = torch.constant.int 6
    %9605 = torch.prims.convert_element_type %9604, %int6_13330 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_13331 = torch.constant.int 6
    %9606 = torch.prims.convert_element_type %9601, %int6_13331 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_13332 = torch.constant.int 6
    %9607 = torch.prims.convert_element_type %9603, %int6_13332 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %9608 = torch.aten.mm %9606, %9607 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    %int1_13333 = torch.constant.int 1
    %9609 = torch.aten.mul.Scalar %9608, %int1_13333 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_13334 = torch.constant.int 1
    %9610 = torch.aten.mul.Scalar %9605, %int1_13334 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_13335 = torch.constant.int 1
    %9611 = torch.aten.add.Tensor %9609, %9610, %int1_13335 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_13336 = torch.constant.int 5
    %9612 = torch.prims.convert_element_type %9611, %int5_13336 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_13337 = torch.constant.int 1
    %int4608_13338 = torch.constant.int 4608
    %int3072_13339 = torch.constant.int 3072
    %9613 = torch.prim.ListConstruct %int1_13337, %int4608_13338, %int3072_13339 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9614 = torch.aten.view %9612, %9613 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    %9615 = torch.aten.mul.Tensor %9509, %9614 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_13340 = torch.constant.int 1
    %9616 = torch.aten.add.Tensor %9491, %9615, %int1_13340 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // ---- single_blocks.24: modulation ----
    // Computes silu(%119) @ modulation.lin.weight^T + modulation.lin.bias and
    // splits the 9216-wide result into three 3072-wide chunks:
    //   %9632 = [0:3072)     -> added after normalization further down (shift)
    //   %9633 = [3072:6144)  -> incremented by 1 into %9635, multiplied into the
    //                           normalized activations further down (scale)
    //   %9634 = [6144:9216)  -> multiplies the linear2 output before the residual
    //                           add at the end of this block (gate)
    // NOTE(review): %119 is defined outside this chunk; presumably the
    // conditioning vector shared by all blocks -- confirm.
    %9617 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.24.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.24.modulation.lin.weight : tensor<9216x3072xf16>
    %9618 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.24.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_13341 = torch.constant.int 0
    %int1_13342 = torch.constant.int 1
    // Weight is stored as [out, in]; transpose to [in, out] for the matmul.
    %9619 = torch.aten.transpose.int %9618, %int0_13341, %int1_13342 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.24.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.24.modulation.lin.bias : tensor<9216xf16>
    %9620 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.24.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Bias, activation and weight are all upcast f16 -> f32 (dtype code 6) so
    // the matmul and bias add run in f32.
    %int6_13343 = torch.constant.int 6
    %9621 = torch.prims.convert_element_type %9620, %int6_13343 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_13344 = torch.constant.int 6
    %9622 = torch.prims.convert_element_type %9617, %int6_13344 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_13345 = torch.constant.int 6
    %9623 = torch.prims.convert_element_type %9619, %int6_13345 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %9624 = torch.aten.mm %9622, %9623 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Multiplications by the integer 1 leave the values unchanged; presumably
    // the alpha/beta factors left over from an addmm decomposition -- confirm.
    %int1_13346 = torch.constant.int 1
    %9625 = torch.aten.mul.Scalar %9624, %int1_13346 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_13347 = torch.constant.int 1
    %9626 = torch.aten.mul.Scalar %9621, %int1_13347 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_13348 = torch.constant.int 1
    %9627 = torch.aten.add.Tensor %9625, %9626, %int1_13348 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    // Downcast the result back to f16 (dtype code 5).
    %int5_13349 = torch.constant.int 5
    %9628 = torch.prims.convert_element_type %9627, %int5_13349 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): shape is unchanged.
    %int0_13350 = torch.constant.int 0
    %int0_13351 = torch.constant.int 0
    %int9223372036854775807_13352 = torch.constant.int 9223372036854775807
    %int1_13353 = torch.constant.int 1
    %9629 = torch.aten.slice.Tensor %9628, %int0_13350, %int0_13351, %int9223372036854775807_13352, %int1_13353 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a size-1 dim at position 1 so the chunks broadcast against
    // [1,4608,3072] activations.
    %int1_13354 = torch.constant.int 1
    %9630 = torch.aten.unsqueeze %9629, %int1_13354 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Full-range slice on dim 2: shape is unchanged.
    %int2_13355 = torch.constant.int 2
    %int0_13356 = torch.constant.int 0
    %int9223372036854775807_13357 = torch.constant.int 9223372036854775807
    %int1_13358 = torch.constant.int 1
    %9631 = torch.aten.slice.Tensor %9630, %int2_13355, %int0_13356, %int9223372036854775807_13357, %int1_13358 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Chunk 0 of 3 along the last dim: elements [0, 3072).
    %int-1_13359 = torch.constant.int -1
    %int0_13360 = torch.constant.int 0
    %int3072_13361 = torch.constant.int 3072
    %int1_13362 = torch.constant.int 1
    %9632 = torch.aten.slice.Tensor %9631, %int-1_13359, %int0_13360, %int3072_13361, %int1_13362 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1 of 3 along the last dim: elements [3072, 6144).
    %int-1_13363 = torch.constant.int -1
    %int3072_13364 = torch.constant.int 3072
    %int6144_13365 = torch.constant.int 6144
    %int1_13366 = torch.constant.int 1
    %9633 = torch.aten.slice.Tensor %9631, %int-1_13363, %int3072_13364, %int6144_13365, %int1_13366 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2 of 3 along the last dim: elements [6144, 9216).
    %int-1_13367 = torch.constant.int -1
    %int6144_13368 = torch.constant.int 6144
    %int9216_13369 = torch.constant.int 9216
    %int1_13370 = torch.constant.int 1
    %9634 = torch.aten.slice.Tensor %9631, %int-1_13367, %int6144_13368, %int9216_13369, %int1_13370 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %9635 = %9633 + 1 (other = 1, alpha = 1).
    %int1_13371 = torch.constant.int 1
    %int1_13372 = torch.constant.int 1
    %9635 = torch.aten.add.Scalar %9633, %int1_13371, %int1_13372 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // ---- single_blocks.24: normalize and modulate the block input ----
    // Normalizes %9616 over its last dim (size 3072) with no learned affine
    // parameters: (x - mean) * rsqrt(var + eps). The statistics are computed in
    // f32, the result is cast to f16, then modulated as %9635 * normed + %9632.
    %int6_13373 = torch.constant.int 6
    %9636 = torch.prims.convert_element_type %9616, %int6_13373 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_13374 = torch.constant.int 2
    %9637 = torch.prim.ListConstruct %int2_13374 : (!torch.int) -> !torch.list<int>
    // correction = 0 (variance divides by N, not N-1), keepdim = true.
    %int0_13375 = torch.constant.int 0
    %true_13376 = torch.constant.bool true
    // result0 is the variance, result1 is the mean; both are [1,4608,1].
    %result0_13377, %result1_13378 = torch.aten.var_mean.correction %9636, %9637, %int0_13375, %true_13376 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // eps of approximately 1e-6 added to the variance before rsqrt.
    %float9.999990e-07_13379 = torch.constant.float 9.9999999999999995E-7
    %int1_13380 = torch.constant.int 1
    %9638 = torch.aten.add.Scalar %result0_13377, %float9.999990e-07_13379, %int1_13380 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %9639 = torch.aten.rsqrt %9638 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    %int1_13381 = torch.constant.int 1
    // The mean is subtracted from the original f16 tensor %9616 (not the f32
    // copy %9636); the mixed f16/f32 operands produce an f32 result.
    %9640 = torch.aten.sub.Tensor %9616, %result1_13378, %int1_13381 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %9641 = torch.aten.mul.Tensor %9640, %9639 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_13382 = torch.constant.int 5
    %9642 = torch.prims.convert_element_type %9641, %int5_13382 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Modulation: %9635 ([1,1,3072]) broadcasts over the 4608 rows.
    %9643 = torch.aten.mul.Tensor %9635, %9642 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_13383 = torch.constant.int 1
    %9644 = torch.aten.add.Tensor %9643, %9632, %int1_13383 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Drop the leading size-1 dim so the result can feed a 2-D matmul.
    %int4608_13384 = torch.constant.int 4608
    %int3072_13385 = torch.constant.int 3072
    %9645 = torch.prim.ListConstruct %int4608_13384, %int3072_13385 : (!torch.int, !torch.int) -> !torch.list<int>
    %9646 = torch.aten.view %9644, %9645 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // ---- single_blocks.24: linear1 (fused projection) ----
    // Computes %9646 @ linear1.weight^T + linear1.bias in f32, casts to f16 and
    // splits the 21504-wide output along the last dim into
    //   %9660 = [0:9216)      (reshaped into 3 x 24 x 128 further down)
    //   %9661 = [9216:21504)  (12288 wide, fed to gelu further down)
    %__auto.sampler.single_blocks.24.linear1.weight = util.global.load @__auto.sampler.single_blocks.24.linear1.weight : tensor<21504x3072xf16>
    %9647 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.24.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_13386 = torch.constant.int 0
    %int1_13387 = torch.constant.int 1
    // Weight is stored as [out, in]; transpose to [in, out] for the matmul.
    %9648 = torch.aten.transpose.int %9647, %int0_13386, %int1_13387 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.24.linear1.bias = util.global.load @__auto.sampler.single_blocks.24.linear1.bias : tensor<21504xf16>
    %9649 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.24.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Upcast bias, input and weight to f32 (dtype code 6) before the matmul.
    %int6_13388 = torch.constant.int 6
    %9650 = torch.prims.convert_element_type %9649, %int6_13388 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_13389 = torch.constant.int 6
    %9651 = torch.prims.convert_element_type %9646, %int6_13389 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_13390 = torch.constant.int 6
    %9652 = torch.prims.convert_element_type %9648, %int6_13390 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %9653 = torch.aten.mm %9651, %9652 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    %int1_13391 = torch.constant.int 1
    %9654 = torch.aten.mul.Scalar %9653, %int1_13391 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_13392 = torch.constant.int 1
    %9655 = torch.aten.mul.Scalar %9650, %int1_13392 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_13393 = torch.constant.int 1
    %9656 = torch.aten.add.Tensor %9654, %9655, %int1_13393 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    // Downcast to f16 (dtype code 5) and restore the leading size-1 dim.
    %int5_13394 = torch.constant.int 5
    %9657 = torch.prims.convert_element_type %9656, %int5_13394 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_13395 = torch.constant.int 1
    %int4608_13396 = torch.constant.int 4608
    %int21504_13397 = torch.constant.int 21504
    %9658 = torch.prim.ListConstruct %int1_13395, %int4608_13396, %int21504_13397 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9659 = torch.aten.view %9657, %9658 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // First 9216 features of the last dim.
    %int-1_13398 = torch.constant.int -1
    %int0_13399 = torch.constant.int 0
    %int9216_13400 = torch.constant.int 9216
    %int1_13401 = torch.constant.int 1
    %9660 = torch.aten.slice.Tensor %9659, %int-1_13398, %int0_13399, %int9216_13400, %int1_13401 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    // Remaining 12288 features of the last dim.
    %int-1_13402 = torch.constant.int -1
    %int9216_13403 = torch.constant.int 9216
    %int21504_13404 = torch.constant.int 21504
    %int1_13405 = torch.constant.int 1
    %9661 = torch.aten.slice.Tensor %9659, %int-1_13402, %int9216_13403, %int21504_13404, %int1_13405 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // ---- single_blocks.24: split %9660 into three [1,24,4608,128] tensors ----
    // Views the 9216 features as 3 x 24 x 128, moves the size-3 axis to the
    // front, then selects indices 0, 1 and 2 along it. Further down, %9666 is
    // scaled by query_norm.scale, %9667 by key_norm.scale, and %9668 is passed
    // unmodified as the third operand of the attention op.
    %int1_13406 = torch.constant.int 1
    %int4608_13407 = torch.constant.int 4608
    %int3_13408 = torch.constant.int 3
    %int24_13409 = torch.constant.int 24
    %int128_13410 = torch.constant.int 128
    %9662 = torch.prim.ListConstruct %int1_13406, %int4608_13407, %int3_13408, %int24_13409, %int128_13410 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9663 = torch.aten.view %9660, %9662 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permutation (2, 0, 3, 1, 4): [1,4608,3,24,128] -> [3,1,24,4608,128].
    %int2_13411 = torch.constant.int 2
    %int0_13412 = torch.constant.int 0
    %int3_13413 = torch.constant.int 3
    %int1_13414 = torch.constant.int 1
    %int4_13415 = torch.constant.int 4
    %9664 = torch.prim.ListConstruct %int2_13411, %int0_13412, %int3_13413, %int1_13414, %int4_13415 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9665 = torch.aten.permute %9663, %9664 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0 along dim 0.
    %int0_13416 = torch.constant.int 0
    %int0_13417 = torch.constant.int 0
    %9666 = torch.aten.select.int %9665, %int0_13416, %int0_13417 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 1 along dim 0.
    %int0_13418 = torch.constant.int 0
    %int1_13419 = torch.constant.int 1
    %9667 = torch.aten.select.int %9665, %int0_13418, %int1_13419 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 2 along dim 0.
    %int0_13420 = torch.constant.int 0
    %int2_13421 = torch.constant.int 2
    %9668 = torch.aten.select.int %9665, %int0_13420, %int2_13421 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- single_blocks.24: RMS normalization of %9666 and %9667 ----
    // Each tensor goes through the same sequence:
    //   x32 = f32(x); y = x32 * rsqrt(mean(x32^2, dim=-1, keepdim) + eps)
    //   out = f16(y) * scale
    // where scale is a learned [128] f16 parameter (query_norm.scale for %9666,
    // key_norm.scale for %9667) and eps is approximately 1e-6.
    %int6_13422 = torch.constant.int 6
    %9669 = torch.prims.convert_element_type %9666, %int6_13422 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_13423 = torch.constant.int 2
    %9670 = torch.aten.pow.Tensor_Scalar %9669, %int2_13423 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean of squares over the last dim (size 128), keepdim = true, dtype = none.
    %int-1_13424 = torch.constant.int -1
    %9671 = torch.prim.ListConstruct %int-1_13424 : (!torch.int) -> !torch.list<int>
    %true_13425 = torch.constant.bool true
    %none_13426 = torch.constant.none
    %9672 = torch.aten.mean.dim %9670, %9671, %true_13425, %none_13426 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_13427 = torch.constant.float 9.9999999999999995E-7
    %int1_13428 = torch.constant.int 1
    %9673 = torch.aten.add.Scalar %9672, %float9.999990e-07_13427, %int1_13428 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %9674 = torch.aten.rsqrt %9673 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %9675 = torch.aten.mul.Tensor %9669, %9674 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    // Back to f16 before applying the learned scale.
    %int5_13429 = torch.constant.int 5
    %9676 = torch.prims.convert_element_type %9675, %int5_13429 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.24.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.24.norm.query_norm.scale : tensor<128xf16>
    %9677 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.24.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    // %9678: normalized query, scaled per channel.
    %9678 = torch.aten.mul.Tensor %9676, %9677 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Same sequence for %9667 with key_norm.scale.
    %int6_13430 = torch.constant.int 6
    %9679 = torch.prims.convert_element_type %9667, %int6_13430 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_13431 = torch.constant.int 2
    %9680 = torch.aten.pow.Tensor_Scalar %9679, %int2_13431 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_13432 = torch.constant.int -1
    %9681 = torch.prim.ListConstruct %int-1_13432 : (!torch.int) -> !torch.list<int>
    %true_13433 = torch.constant.bool true
    %none_13434 = torch.constant.none
    %9682 = torch.aten.mean.dim %9680, %9681, %true_13433, %none_13434 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_13435 = torch.constant.float 9.9999999999999995E-7
    %int1_13436 = torch.constant.int 1
    %9683 = torch.aten.add.Scalar %9682, %float9.999990e-07_13435, %int1_13436 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %9684 = torch.aten.rsqrt %9683 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %9685 = torch.aten.mul.Tensor %9679, %9684 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13437 = torch.constant.int 5
    %9686 = torch.prims.convert_element_type %9685, %int5_13437 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.24.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.24.norm.key_norm.scale : tensor<128xf16>
    %9687 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.24.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    // %9688: normalized key, scaled per channel.
    %9688 = torch.aten.mul.Tensor %9686, %9687 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- single_blocks.24: apply the per-position 2x2 table %211 to q and k ----
    // For x in {%9678 (query), %9688 (key)}:
    //   x6  = view(f32(x), [1,24,4608,64,1,2])
    //   out = %211[..., 0] * x6[..., 0] + %211[..., 1] * x6[..., 1]
    //   out = f16(view(out, [1,24,4608,128]))
    // NOTE(review): %211 ([1,1,4608,64,2,2] f32) is defined outside this chunk;
    // presumably the precomputed rotary position-embedding table -- confirm.
    // These two casts are f16 -> f16 (dtype code 5), so element type is unchanged.
    %int5_13438 = torch.constant.int 5
    %9689 = torch.prims.convert_element_type %9678, %int5_13438 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_13439 = torch.constant.int 5
    %9690 = torch.prims.convert_element_type %9688, %int5_13439 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Query: upcast to f32 and view the 128 channels as 64 x 1 x 2.
    %int6_13440 = torch.constant.int 6
    %9691 = torch.prims.convert_element_type %9689, %int6_13440 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_13441 = torch.constant.int 1
    %int24_13442 = torch.constant.int 24
    %int4608_13443 = torch.constant.int 4608
    %int64_13444 = torch.constant.int 64
    %int1_13445 = torch.constant.int 1
    %int2_13446 = torch.constant.int 2
    %9692 = torch.prim.ListConstruct %int1_13441, %int24_13442, %int4608_13443, %int64_13444, %int1_13445, %int2_13446 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9693 = torch.aten.view %9691, %9692 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Key: same upcast and view.
    %int6_13447 = torch.constant.int 6
    %9694 = torch.prims.convert_element_type %9690, %int6_13447 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_13448 = torch.constant.int 1
    %int24_13449 = torch.constant.int 24
    %int4608_13450 = torch.constant.int 4608
    %int64_13451 = torch.constant.int 64
    %int1_13452 = torch.constant.int 1
    %int2_13453 = torch.constant.int 2
    %9695 = torch.prim.ListConstruct %int1_13448, %int24_13449, %int4608_13450, %int64_13451, %int1_13452, %int2_13453 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9696 = torch.aten.view %9694, %9695 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Query, term 0: %211[..., 0] * q[..., 0]. The size-1 dims broadcast
    // ([1,1,...,2] against [1,24,...,1]) to [1,24,4608,64,2].
    %int5_13454 = torch.constant.int 5
    %int0_13455 = torch.constant.int 0
    %9697 = torch.aten.select.int %211, %int5_13454, %int0_13455 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13456 = torch.constant.int 5
    %int0_13457 = torch.constant.int 0
    %9698 = torch.aten.select.int %9693, %int5_13456, %int0_13457 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9699 = torch.aten.mul.Tensor %9697, %9698 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Query, term 1: %211[..., 1] * q[..., 1].
    %int5_13458 = torch.constant.int 5
    %int1_13459 = torch.constant.int 1
    %9700 = torch.aten.select.int %211, %int5_13458, %int1_13459 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13460 = torch.constant.int 5
    %int1_13461 = torch.constant.int 1
    %9701 = torch.aten.select.int %9693, %int5_13460, %int1_13461 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9702 = torch.aten.mul.Tensor %9700, %9701 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Query result: sum of the two terms.
    %int1_13462 = torch.constant.int 1
    %9703 = torch.aten.add.Tensor %9699, %9702, %int1_13462 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key, term 0. %9704 repeats the same select on %211 as %9697.
    %int5_13463 = torch.constant.int 5
    %int0_13464 = torch.constant.int 0
    %9704 = torch.aten.select.int %211, %int5_13463, %int0_13464 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13465 = torch.constant.int 5
    %int0_13466 = torch.constant.int 0
    %9705 = torch.aten.select.int %9696, %int5_13465, %int0_13466 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9706 = torch.aten.mul.Tensor %9704, %9705 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key, term 1. %9707 repeats the same select on %211 as %9700.
    %int5_13467 = torch.constant.int 5
    %int1_13468 = torch.constant.int 1
    %9707 = torch.aten.select.int %211, %int5_13467, %int1_13468 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13469 = torch.constant.int 5
    %int1_13470 = torch.constant.int 1
    %9708 = torch.aten.select.int %9696, %int5_13469, %int1_13470 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9709 = torch.aten.mul.Tensor %9707, %9708 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key result: sum of the two terms.
    %int1_13471 = torch.constant.int 1
    %9710 = torch.aten.add.Tensor %9706, %9709, %int1_13471 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Query: merge the trailing 64 x 2 back into 128 channels and downcast to f16.
    %int1_13472 = torch.constant.int 1
    %int24_13473 = torch.constant.int 24
    %int4608_13474 = torch.constant.int 4608
    %int128_13475 = torch.constant.int 128
    %9711 = torch.prim.ListConstruct %int1_13472, %int24_13473, %int4608_13474, %int128_13475 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9712 = torch.aten.view %9703, %9711 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13476 = torch.constant.int 5
    %9713 = torch.prims.convert_element_type %9712, %int5_13476 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Key: same merge and downcast.
    %int1_13477 = torch.constant.int 1
    %int24_13478 = torch.constant.int 24
    %int4608_13479 = torch.constant.int 4608
    %int128_13480 = torch.constant.int 128
    %9714 = torch.prim.ListConstruct %int1_13477, %int24_13478, %int4608_13479, %int128_13480 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9715 = torch.aten.view %9710, %9714 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13481 = torch.constant.int 5
    %9716 = torch.prims.convert_element_type %9715, %int5_13481 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- single_blocks.24: scaled dot-product attention ----
    // Operands after (q = %9713, k = %9716, v = %9668) are 0.0, false, none,
    // none. NOTE(review): per the PyTorch schema for this op these should be
    // dropout_p, is_causal, attn_mask and scale (none = default scaling) --
    // confirm against the PyTorch version used for export.
    %float0.000000e00_13482 = torch.constant.float 0.000000e+00
    %false_13483 = torch.constant.bool false
    %none_13484 = torch.constant.none
    %none_13485 = torch.constant.none
    // Two results; only result #0 ([1,24,4608,128] f16) is consumed here. The
    // second ([1,24,4608] f32) is presumably the log-sum-exp -- confirm.
    %9717:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%9713, %9716, %9668, %float0.000000e00_13482, %false_13483, %none_13484, %none_13485) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Permutation (0, 2, 1, 3): [1,24,4608,128] -> [1,4608,24,128].
    %int0_13486 = torch.constant.int 0
    %int2_13487 = torch.constant.int 2
    %int1_13488 = torch.constant.int 1
    %int3_13489 = torch.constant.int 3
    %9718 = torch.prim.ListConstruct %int0_13486, %int2_13487, %int1_13488, %int3_13489 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9719 = torch.aten.permute %9717#0, %9718 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the 24 x 128 trailing dims into a single 3072-wide feature dim.
    %int1_13490 = torch.constant.int 1
    %int4608_13491 = torch.constant.int 4608
    %int3072_13492 = torch.constant.int 3072
    %9720 = torch.prim.ListConstruct %int1_13490, %int4608_13491, %int3072_13492 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9721 = torch.aten.view %9719, %9720 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // ---- single_blocks.24: activation, concat, linear2 and gated residual ----
    // %9741 = %9616 + %9634 * (cat(%9721, gelu_tanh(%9661)) @ linear2.weight^T
    //                          + linear2.bias)
    // where %9721 is the attention output and %9661 the 12288-wide slice of the
    // linear1 output.
    %str_13493 = torch.constant.str "tanh"
    // GELU with the "tanh" approximation on the 12288-wide branch.
    %9722 = torch.aten.gelu %9661, %str_13493 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    // Concatenate along dim 2: 3072 + 12288 = 15360 features.
    %9723 = torch.prim.ListConstruct %9721, %9722 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_13494 = torch.constant.int 2
    %9724 = torch.aten.cat %9723, %int2_13494 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Drop the leading size-1 dim for the 2-D matmul.
    %int4608_13495 = torch.constant.int 4608
    %int15360_13496 = torch.constant.int 15360
    %9725 = torch.prim.ListConstruct %int4608_13495, %int15360_13496 : (!torch.int, !torch.int) -> !torch.list<int>
    %9726 = torch.aten.view %9724, %9725 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    %__auto.sampler.single_blocks.24.linear2.weight = util.global.load @__auto.sampler.single_blocks.24.linear2.weight : tensor<3072x15360xf16>
    %9727 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.24.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_13497 = torch.constant.int 0
    %int1_13498 = torch.constant.int 1
    // Weight is stored as [out, in]; transpose to [in, out] for the matmul.
    %9728 = torch.aten.transpose.int %9727, %int0_13497, %int1_13498 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.24.linear2.bias = util.global.load @__auto.sampler.single_blocks.24.linear2.bias : tensor<3072xf16>
    %9729 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.24.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, input and weight to f32 (dtype code 6) before the matmul.
    %int6_13499 = torch.constant.int 6
    %9730 = torch.prims.convert_element_type %9729, %int6_13499 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_13500 = torch.constant.int 6
    %9731 = torch.prims.convert_element_type %9726, %int6_13500 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_13501 = torch.constant.int 6
    %9732 = torch.prims.convert_element_type %9728, %int6_13501 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %9733 = torch.aten.mm %9731, %9732 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    %int1_13502 = torch.constant.int 1
    %9734 = torch.aten.mul.Scalar %9733, %int1_13502 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_13503 = torch.constant.int 1
    %9735 = torch.aten.mul.Scalar %9730, %int1_13503 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_13504 = torch.constant.int 1
    %9736 = torch.aten.add.Tensor %9734, %9735, %int1_13504 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    // Downcast to f16 (dtype code 5) and restore the leading size-1 dim.
    %int5_13505 = torch.constant.int 5
    %9737 = torch.prims.convert_element_type %9736, %int5_13505 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_13506 = torch.constant.int 1
    %int4608_13507 = torch.constant.int 4608
    %int3072_13508 = torch.constant.int 3072
    %9738 = torch.prim.ListConstruct %int1_13506, %int4608_13507, %int3072_13508 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9739 = torch.aten.view %9737, %9738 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Gate: %9634 ([1,1,3072]) broadcasts over the 4608 rows.
    %9740 = torch.aten.mul.Tensor %9634, %9739 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    // Residual add onto the block input %9616 (alpha = 1); %9741 is the block output.
    %int1_13509 = torch.constant.int 1
    %9741 = torch.aten.add.Tensor %9616, %9740, %int1_13509 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %9742 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.25.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.25.modulation.lin.weight : tensor<9216x3072xf16>
    %9743 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.25.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_13510 = torch.constant.int 0
    %int1_13511 = torch.constant.int 1
    %9744 = torch.aten.transpose.int %9743, %int0_13510, %int1_13511 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.25.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.25.modulation.lin.bias : tensor<9216xf16>
    %9745 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.25.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    %int6_13512 = torch.constant.int 6
    %9746 = torch.prims.convert_element_type %9745, %int6_13512 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_13513 = torch.constant.int 6
    %9747 = torch.prims.convert_element_type %9742, %int6_13513 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_13514 = torch.constant.int 6
    %9748 = torch.prims.convert_element_type %9744, %int6_13514 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %9749 = torch.aten.mm %9747, %9748 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    %int1_13515 = torch.constant.int 1
    %9750 = torch.aten.mul.Scalar %9749, %int1_13515 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_13516 = torch.constant.int 1
    // Tail of a bias-add followed by a three-way split of a [1,9216] vector.
    // NOTE(review): %9746 and %9750 are defined before this excerpt; this is
    // presumably the single_blocks.25 modulation linear (it mirrors the
    // single_blocks.26 modulation further down) - confirm against the lines above.
    // Multiply the f32 [9216] operand by the integer constant 1, then add it to %9750.
    %9751 = torch.aten.mul.Scalar %9746, %int1_13516 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_13517 = torch.constant.int 1
    %9752 = torch.aten.add.Tensor %9750, %9751, %int1_13517 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    // dtype code 5: the f32 sum is converted to f16.
    %int5_13518 = torch.constant.int 5
    %9753 = torch.prims.convert_element_type %9752, %int5_13518 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Slice over dim 0 from 0 to INT64_MAX with step 1; the result type equals the input type.
    %int0_13519 = torch.constant.int 0
    %int0_13520 = torch.constant.int 0
    %int9223372036854775807_13521 = torch.constant.int 9223372036854775807
    %int1_13522 = torch.constant.int 1
    %9754 = torch.aten.slice.Tensor %9753, %int0_13519, %int0_13520, %int9223372036854775807_13521, %int1_13522 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a unit dim at position 1: [1,9216] -> [1,1,9216].
    %int1_13523 = torch.constant.int 1
    %9755 = torch.aten.unsqueeze %9754, %int1_13523 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Slice over dim 2 from 0 to INT64_MAX with step 1; the result type equals the input type.
    %int2_13524 = torch.constant.int 2
    %int0_13525 = torch.constant.int 0
    %int9223372036854775807_13526 = torch.constant.int 9223372036854775807
    %int1_13527 = torch.constant.int 1
    %9756 = torch.aten.slice.Tensor %9755, %int2_13524, %int0_13525, %int9223372036854775807_13526, %int1_13527 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Elements [0, 3072) of the last dim -> %9757 (later used as the additive term after normalization).
    %int-1_13528 = torch.constant.int -1
    %int0_13529 = torch.constant.int 0
    %int3072_13530 = torch.constant.int 3072
    %int1_13531 = torch.constant.int 1
    %9757 = torch.aten.slice.Tensor %9756, %int-1_13528, %int0_13529, %int3072_13530, %int1_13531 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Elements [3072, 6144) of the last dim -> %9758 (becomes the multiplicative term after the +1 below).
    %int-1_13532 = torch.constant.int -1
    %int3072_13533 = torch.constant.int 3072
    %int6144_13534 = torch.constant.int 6144
    %int1_13535 = torch.constant.int 1
    %9758 = torch.aten.slice.Tensor %9756, %int-1_13532, %int3072_13533, %int6144_13534, %int1_13535 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Elements [6144, 9216) of the last dim -> %9759 (later multiplied into the linear2 output).
    %int-1_13536 = torch.constant.int -1
    %int6144_13537 = torch.constant.int 6144
    %int9216_13538 = torch.constant.int 9216
    %int1_13539 = torch.constant.int 1
    %9759 = torch.aten.slice.Tensor %9756, %int-1_13536, %int6144_13537, %int9216_13538, %int1_13539 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %9760 = %9758 + 1 (scalar 1, alpha 1).
    %int1_13540 = torch.constant.int 1
    %int1_13541 = torch.constant.int 1
    %9760 = torch.aten.add.Scalar %9758, %int1_13540, %int1_13541 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %9741 over its last dim (no learned weight or bias appears here),
    // then apply %9760 * normalized + %9757 and flatten to 2-D.
    // %9741 is defined before this excerpt.
    // dtype code 6: the f16 input is converted to f32 for the statistics.
    %int6_13542 = torch.constant.int 6
    %9761 = torch.prims.convert_element_type %9741, %int6_13542 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    // var_mean over dim 2 with correction = 0 and keepdim = true; result0 is the variance, result1 the mean.
    %int2_13543 = torch.constant.int 2
    %9762 = torch.prim.ListConstruct %int2_13543 : (!torch.int) -> !torch.list<int>
    %int0_13544 = torch.constant.int 0
    %true_13545 = torch.constant.bool true
    %result0_13546, %result1_13547 = torch.aten.var_mean.correction %9761, %9762, %int0_13544, %true_13545 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // rsqrt(variance + ~1e-6).
    %float9.999990e-07_13548 = torch.constant.float 9.9999999999999995E-7
    %int1_13549 = torch.constant.int 1
    %9763 = torch.aten.add.Scalar %result0_13546, %float9.999990e-07_13548, %int1_13549 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %9764 = torch.aten.rsqrt %9763 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // Subtract the f32 mean from the original f16 tensor %9741 (not from %9761); the result is f32.
    %int1_13550 = torch.constant.int 1
    %9765 = torch.aten.sub.Tensor %9741, %result1_13547, %int1_13550 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %9766 = torch.aten.mul.Tensor %9765, %9764 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    // Back to f16 (dtype code 5) before the modulation arithmetic.
    %int5_13551 = torch.constant.int 5
    %9767 = torch.prims.convert_element_type %9766, %int5_13551 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // %9760 and %9757 are [1,1,3072] and broadcast against [1,4608,3072].
    %9768 = torch.aten.mul.Tensor %9760, %9767 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_13552 = torch.constant.int 1
    %9769 = torch.aten.add.Tensor %9768, %9757, %int1_13552 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Drop the leading unit dim: [1,4608,3072] -> [4608,3072], the shape consumed by the matmul below.
    %int4608_13553 = torch.constant.int 4608
    %int3072_13554 = torch.constant.int 3072
    %9770 = torch.prim.ListConstruct %int4608_13553, %int3072_13554 : (!torch.int, !torch.int) -> !torch.list<int>
    %9771 = torch.aten.view %9769, %9770 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // single_blocks.25.linear1: %9771 x weight^T + bias, with the matmul and bias add
    // performed in f32 and the result converted back to f16.
    // Load the [21504,3072] weight and transpose it to [3072,21504].
    %__auto.sampler.single_blocks.25.linear1.weight = util.global.load @__auto.sampler.single_blocks.25.linear1.weight : tensor<21504x3072xf16>
    %9772 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.25.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_13555 = torch.constant.int 0
    %int1_13556 = torch.constant.int 1
    %9773 = torch.aten.transpose.int %9772, %int0_13555, %int1_13556 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.25.linear1.bias = util.global.load @__auto.sampler.single_blocks.25.linear1.bias : tensor<21504xf16>
    %9774 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.25.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Convert bias, activations and transposed weight to f32 (dtype code 6).
    %int6_13557 = torch.constant.int 6
    %9775 = torch.prims.convert_element_type %9774, %int6_13557 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_13558 = torch.constant.int 6
    %9776 = torch.prims.convert_element_type %9771, %int6_13558 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_13559 = torch.constant.int 6
    %9777 = torch.prims.convert_element_type %9773, %int6_13559 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %9778 = torch.aten.mm %9776, %9777 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Both the product and the bias are multiplied by the integer constant 1 before being added.
    %int1_13560 = torch.constant.int 1
    %9779 = torch.aten.mul.Scalar %9778, %int1_13560 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_13561 = torch.constant.int 1
    %9780 = torch.aten.mul.Scalar %9775, %int1_13561 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_13562 = torch.constant.int 1
    %9781 = torch.aten.add.Tensor %9779, %9780, %int1_13562 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    // Convert to f16 (dtype code 5) and restore the leading unit dim: [4608,21504] -> [1,4608,21504].
    %int5_13563 = torch.constant.int 5
    %9782 = torch.prims.convert_element_type %9781, %int5_13563 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_13564 = torch.constant.int 1
    %int4608_13565 = torch.constant.int 4608
    %int21504_13566 = torch.constant.int 21504
    %9783 = torch.prim.ListConstruct %int1_13564, %int4608_13565, %int21504_13566 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9784 = torch.aten.view %9782, %9783 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the linear1 output along its last dim and unpack the first part into three
    // [1,24,4608,128] tensors.
    // Elements [0, 9216) -> %9785, which is reshaped into q/k/v below.
    %int-1_13567 = torch.constant.int -1
    %int0_13568 = torch.constant.int 0
    %int9216_13569 = torch.constant.int 9216
    %int1_13570 = torch.constant.int 1
    %9785 = torch.aten.slice.Tensor %9784, %int-1_13567, %int0_13568, %int9216_13569, %int1_13570 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    // Elements [9216, 21504) -> %9786 (12288 wide), consumed later by the GELU branch.
    %int-1_13571 = torch.constant.int -1
    %int9216_13572 = torch.constant.int 9216
    %int21504_13573 = torch.constant.int 21504
    %int1_13574 = torch.constant.int 1
    %9786 = torch.aten.slice.Tensor %9784, %int-1_13571, %int9216_13572, %int21504_13573, %int1_13574 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View 9216 as 3 x 24 x 128: [1,4608,9216] -> [1,4608,3,24,128].
    %int1_13575 = torch.constant.int 1
    %int4608_13576 = torch.constant.int 4608
    %int3_13577 = torch.constant.int 3
    %int24_13578 = torch.constant.int 24
    %int128_13579 = torch.constant.int 128
    %9787 = torch.prim.ListConstruct %int1_13575, %int4608_13576, %int3_13577, %int24_13578, %int128_13579 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9788 = torch.aten.view %9785, %9787 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2,0,3,1,4): [1,4608,3,24,128] -> [3,1,24,4608,128].
    %int2_13580 = torch.constant.int 2
    %int0_13581 = torch.constant.int 0
    %int3_13582 = torch.constant.int 3
    %int1_13583 = torch.constant.int 1
    %int4_13584 = torch.constant.int 4
    %9789 = torch.prim.ListConstruct %int2_13580, %int0_13581, %int3_13582, %int1_13583, %int4_13584 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9790 = torch.aten.permute %9788, %9789 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0 of dim 0 -> %9791; it is normalized with query_norm.scale and passed as the
    // first attention operand.
    %int0_13585 = torch.constant.int 0
    %int0_13586 = torch.constant.int 0
    %9791 = torch.aten.select.int %9790, %int0_13585, %int0_13586 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 1 of dim 0 -> %9792; it is normalized with key_norm.scale and passed as the
    // second attention operand.
    %int0_13587 = torch.constant.int 0
    %int1_13588 = torch.constant.int 1
    %9792 = torch.aten.select.int %9790, %int0_13587, %int1_13588 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 2 of dim 0 -> %9793; it is passed unchanged as the third attention operand.
    %int0_13589 = torch.constant.int 0
    %int2_13590 = torch.constant.int 2
    %9793 = torch.aten.select.int %9790, %int0_13589, %int2_13590 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Root-mean-square normalization of %9791 and %9792 over the last dim (128),
    // computed in f32 and then multiplied by a learned [128] f16 scale.
    // --- %9791 path: x * rsqrt(mean(x^2, dim=-1, keepdim=true) + ~1e-6) ---
    %int6_13591 = torch.constant.int 6
    %9794 = torch.prims.convert_element_type %9791, %int6_13591 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_13592 = torch.constant.int 2
    %9795 = torch.aten.pow.Tensor_Scalar %9794, %int2_13592 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_13593 = torch.constant.int -1
    %9796 = torch.prim.ListConstruct %int-1_13593 : (!torch.int) -> !torch.list<int>
    %true_13594 = torch.constant.bool true
    %none_13595 = torch.constant.none
    %9797 = torch.aten.mean.dim %9795, %9796, %true_13594, %none_13595 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_13596 = torch.constant.float 9.9999999999999995E-7
    %int1_13597 = torch.constant.int 1
    %9798 = torch.aten.add.Scalar %9797, %float9.999990e-07_13596, %int1_13597 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %9799 = torch.aten.rsqrt %9798 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %9800 = torch.aten.mul.Tensor %9794, %9799 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    // Convert back to f16 and multiply by single_blocks.25.norm.query_norm.scale.
    %int5_13598 = torch.constant.int 5
    %9801 = torch.prims.convert_element_type %9800, %int5_13598 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.25.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.25.norm.query_norm.scale : tensor<128xf16>
    %9802 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.25.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %9803 = torch.aten.mul.Tensor %9801, %9802 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // --- %9792 path: the same sequence of ops ---
    %int6_13599 = torch.constant.int 6
    %9804 = torch.prims.convert_element_type %9792, %int6_13599 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_13600 = torch.constant.int 2
    %9805 = torch.aten.pow.Tensor_Scalar %9804, %int2_13600 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_13601 = torch.constant.int -1
    %9806 = torch.prim.ListConstruct %int-1_13601 : (!torch.int) -> !torch.list<int>
    %true_13602 = torch.constant.bool true
    %none_13603 = torch.constant.none
    %9807 = torch.aten.mean.dim %9805, %9806, %true_13602, %none_13603 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_13604 = torch.constant.float 9.9999999999999995E-7
    %int1_13605 = torch.constant.int 1
    %9808 = torch.aten.add.Scalar %9807, %float9.999990e-07_13604, %int1_13605 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %9809 = torch.aten.rsqrt %9808 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %9810 = torch.aten.mul.Tensor %9804, %9809 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    // Convert back to f16 and multiply by single_blocks.25.norm.key_norm.scale.
    %int5_13606 = torch.constant.int 5
    %9811 = torch.prims.convert_element_type %9810, %int5_13606 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.25.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.25.norm.key_norm.scale : tensor<128xf16>
    %9812 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.25.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %9813 = torch.aten.mul.Tensor %9811, %9812 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Combine the normalized %9803 and %9813 with the table %211 ([1,1,4608,64,2,2], f32).
    // For each operand x viewed as [1,24,4608,64,1,2] the result is
    //   %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]
    // NOTE(review): %211 is defined before this excerpt; presumably it holds the rotary
    // position-embedding coefficients - confirm at its definition.
    // These two conversions are f16 -> f16; operand and result types are identical.
    %int5_13607 = torch.constant.int 5
    %9814 = torch.prims.convert_element_type %9803, %int5_13607 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_13608 = torch.constant.int 5
    %9815 = torch.prims.convert_element_type %9813, %int5_13608 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // First operand: convert to f32 and view 128 as 64 x 1 x 2.
    %int6_13609 = torch.constant.int 6
    %9816 = torch.prims.convert_element_type %9814, %int6_13609 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_13610 = torch.constant.int 1
    %int24_13611 = torch.constant.int 24
    %int4608_13612 = torch.constant.int 4608
    %int64_13613 = torch.constant.int 64
    %int1_13614 = torch.constant.int 1
    %int2_13615 = torch.constant.int 2
    %9817 = torch.prim.ListConstruct %int1_13610, %int24_13611, %int4608_13612, %int64_13613, %int1_13614, %int2_13615 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9818 = torch.aten.view %9816, %9817 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Second operand: the same conversion and view.
    %int6_13616 = torch.constant.int 6
    %9819 = torch.prims.convert_element_type %9815, %int6_13616 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_13617 = torch.constant.int 1
    %int24_13618 = torch.constant.int 24
    %int4608_13619 = torch.constant.int 4608
    %int64_13620 = torch.constant.int 64
    %int1_13621 = torch.constant.int 1
    %int2_13622 = torch.constant.int 2
    %9820 = torch.prim.ListConstruct %int1_13617, %int24_13618, %int4608_13619, %int64_13620, %int1_13621, %int2_13622 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9821 = torch.aten.view %9819, %9820 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // --- First operand: term for index 0 of dim 5 ---
    %int5_13623 = torch.constant.int 5
    %int0_13624 = torch.constant.int 0
    %9822 = torch.aten.select.int %211, %int5_13623, %int0_13624 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13625 = torch.constant.int 5
    %int0_13626 = torch.constant.int 0
    %9823 = torch.aten.select.int %9818, %int5_13625, %int0_13626 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    // Broadcasts dim 1 (1 -> 24) and the trailing dim (1 -> 2).
    %9824 = torch.aten.mul.Tensor %9822, %9823 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // --- First operand: term for index 1 of dim 5 ---
    %int5_13627 = torch.constant.int 5
    %int1_13628 = torch.constant.int 1
    %9825 = torch.aten.select.int %211, %int5_13627, %int1_13628 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13629 = torch.constant.int 5
    %int1_13630 = torch.constant.int 1
    %9826 = torch.aten.select.int %9818, %int5_13629, %int1_13630 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9827 = torch.aten.mul.Tensor %9825, %9826 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two terms for the first operand.
    %int1_13631 = torch.constant.int 1
    %9828 = torch.aten.add.Tensor %9824, %9827, %int1_13631 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // --- Second operand: the same two terms, selecting from %211 again ---
    %int5_13632 = torch.constant.int 5
    %int0_13633 = torch.constant.int 0
    %9829 = torch.aten.select.int %211, %int5_13632, %int0_13633 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13634 = torch.constant.int 5
    %int0_13635 = torch.constant.int 0
    %9830 = torch.aten.select.int %9821, %int5_13634, %int0_13635 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9831 = torch.aten.mul.Tensor %9829, %9830 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_13636 = torch.constant.int 5
    %int1_13637 = torch.constant.int 1
    %9832 = torch.aten.select.int %211, %int5_13636, %int1_13637 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13638 = torch.constant.int 5
    %int1_13639 = torch.constant.int 1
    %9833 = torch.aten.select.int %9821, %int5_13638, %int1_13639 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9834 = torch.aten.mul.Tensor %9832, %9833 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_13640 = torch.constant.int 1
    %9835 = torch.aten.add.Tensor %9831, %9834, %int1_13640 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Merge 64 x 2 back into 128 and convert to f16: %9838 from the first operand.
    %int1_13641 = torch.constant.int 1
    %int24_13642 = torch.constant.int 24
    %int4608_13643 = torch.constant.int 4608
    %int128_13644 = torch.constant.int 128
    %9836 = torch.prim.ListConstruct %int1_13641, %int24_13642, %int4608_13643, %int128_13644 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9837 = torch.aten.view %9828, %9836 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13645 = torch.constant.int 5
    %9838 = torch.prims.convert_element_type %9837, %int5_13645 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Same for the second operand: %9841.
    %int1_13646 = torch.constant.int 1
    %int24_13647 = torch.constant.int 24
    %int4608_13648 = torch.constant.int 4608
    %int128_13649 = torch.constant.int 128
    %9839 = torch.prim.ListConstruct %int1_13646, %int24_13647, %int4608_13648, %int128_13649 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9840 = torch.aten.view %9835, %9839 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13650 = torch.constant.int 5
    %9841 = torch.prims.convert_element_type %9840, %int5_13650 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Attention over %9838, %9841 and %9793, followed by merging the 24 x 128 dims into 3072.
    // NOTE(review): the trailing operands are presumably dropout_p = 0.0, is_causal = false,
    // attn_mask = none and scale = none - confirm against the op's signature.
    %float0.000000e00_13651 = torch.constant.float 0.000000e+00
    %false_13652 = torch.constant.bool false
    %none_13653 = torch.constant.none
    %none_13654 = torch.constant.none
    // The op is emitted as a generic torch.operator with two results; only result #0 is used
    // in the lines visible here.
    %9842:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%9838, %9841, %9793, %float0.000000e00_13651, %false_13652, %none_13653, %none_13654) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2: [1,24,4608,128] -> [1,4608,24,128].
    %int0_13655 = torch.constant.int 0
    %int2_13656 = torch.constant.int 2
    %int1_13657 = torch.constant.int 1
    %int3_13658 = torch.constant.int 3
    %9843 = torch.prim.ListConstruct %int0_13655, %int2_13656, %int1_13657, %int3_13658 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9844 = torch.aten.permute %9842#0, %9843 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Collapse the last two dims: [1,4608,24,128] -> [1,4608,3072].
    %int1_13659 = torch.constant.int 1
    %int4608_13660 = torch.constant.int 4608
    %int3072_13661 = torch.constant.int 3072
    %9845 = torch.prim.ListConstruct %int1_13659, %int4608_13660, %int3072_13661 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9846 = torch.aten.view %9844, %9845 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Apply GELU with the "tanh" approximation to the 12288-wide slice %9786, then
    // concatenate it after the attention output along dim 2 (3072 + 12288 = 15360).
    %str_13662 = torch.constant.str "tanh"
    %9847 = torch.aten.gelu %9786, %str_13662 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %9848 = torch.prim.ListConstruct %9846, %9847 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_13663 = torch.constant.int 2
    %9849 = torch.aten.cat %9848, %int2_13663 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Drop the leading unit dim: [1,4608,15360] -> [4608,15360], the shape consumed by the next matmul.
    %int4608_13664 = torch.constant.int 4608
    %int15360_13665 = torch.constant.int 15360
    %9850 = torch.prim.ListConstruct %int4608_13664, %int15360_13665 : (!torch.int, !torch.int) -> !torch.list<int>
    %9851 = torch.aten.view %9849, %9850 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // single_blocks.25.linear2: %9851 x weight^T + bias in f32, converted back to f16, then
    // multiplied by %9759 and added to the block input %9741.
    // Load the [3072,15360] weight and transpose it to [15360,3072].
    %__auto.sampler.single_blocks.25.linear2.weight = util.global.load @__auto.sampler.single_blocks.25.linear2.weight : tensor<3072x15360xf16>
    %9852 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.25.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_13666 = torch.constant.int 0
    %int1_13667 = torch.constant.int 1
    %9853 = torch.aten.transpose.int %9852, %int0_13666, %int1_13667 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.25.linear2.bias = util.global.load @__auto.sampler.single_blocks.25.linear2.bias : tensor<3072xf16>
    %9854 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.25.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Convert bias, activations and transposed weight to f32 (dtype code 6).
    %int6_13668 = torch.constant.int 6
    %9855 = torch.prims.convert_element_type %9854, %int6_13668 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_13669 = torch.constant.int 6
    %9856 = torch.prims.convert_element_type %9851, %int6_13669 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_13670 = torch.constant.int 6
    %9857 = torch.prims.convert_element_type %9853, %int6_13670 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %9858 = torch.aten.mm %9856, %9857 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Both the product and the bias are multiplied by the integer constant 1 before being added.
    %int1_13671 = torch.constant.int 1
    %9859 = torch.aten.mul.Scalar %9858, %int1_13671 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_13672 = torch.constant.int 1
    %9860 = torch.aten.mul.Scalar %9855, %int1_13672 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_13673 = torch.constant.int 1
    %9861 = torch.aten.add.Tensor %9859, %9860, %int1_13673 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    // Convert to f16 (dtype code 5) and restore the leading unit dim: [4608,3072] -> [1,4608,3072].
    %int5_13674 = torch.constant.int 5
    %9862 = torch.prims.convert_element_type %9861, %int5_13674 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_13675 = torch.constant.int 1
    %int4608_13676 = torch.constant.int 4608
    %int3072_13677 = torch.constant.int 3072
    %9863 = torch.prim.ListConstruct %int1_13675, %int4608_13676, %int3072_13677 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9864 = torch.aten.view %9862, %9863 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Multiply by %9759 (the [6144, 9216) slice of the modulation vector, broadcast over dim 1).
    %9865 = torch.aten.mul.Tensor %9759, %9864 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    // Residual add: %9866 = %9741 + %9865. %9866 is the tensor normalized by the single_blocks.26 ops below.
    %int1_13678 = torch.constant.int 1
    %9866 = torch.aten.add.Tensor %9741, %9865, %int1_13678 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.26.modulation.lin: silu(%119) x weight^T + bias in f32, converted to f16,
    // reshaped to [1,1,9216] and split into three [1,1,3072] slices.
    // NOTE(review): %119 is defined before this excerpt; presumably it is the conditioning
    // vector shared by the blocks - confirm at its definition.
    %9867 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the [9216,3072] weight and transpose it to [3072,9216].
    %__auto.sampler.single_blocks.26.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.26.modulation.lin.weight : tensor<9216x3072xf16>
    %9868 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.26.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_13679 = torch.constant.int 0
    %int1_13680 = torch.constant.int 1
    %9869 = torch.aten.transpose.int %9868, %int0_13679, %int1_13680 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.26.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.26.modulation.lin.bias : tensor<9216xf16>
    %9870 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.26.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Convert bias, activations and transposed weight to f32 (dtype code 6).
    %int6_13681 = torch.constant.int 6
    %9871 = torch.prims.convert_element_type %9870, %int6_13681 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_13682 = torch.constant.int 6
    %9872 = torch.prims.convert_element_type %9867, %int6_13682 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_13683 = torch.constant.int 6
    %9873 = torch.prims.convert_element_type %9869, %int6_13683 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %9874 = torch.aten.mm %9872, %9873 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Both the product and the bias are multiplied by the integer constant 1 before being added.
    %int1_13684 = torch.constant.int 1
    %9875 = torch.aten.mul.Scalar %9874, %int1_13684 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_13685 = torch.constant.int 1
    %9876 = torch.aten.mul.Scalar %9871, %int1_13685 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_13686 = torch.constant.int 1
    %9877 = torch.aten.add.Tensor %9875, %9876, %int1_13686 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    // Convert the sum to f16 (dtype code 5).
    %int5_13687 = torch.constant.int 5
    %9878 = torch.prims.convert_element_type %9877, %int5_13687 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Slice over dim 0 from 0 to INT64_MAX with step 1; the result type equals the input type.
    %int0_13688 = torch.constant.int 0
    %int0_13689 = torch.constant.int 0
    %int9223372036854775807_13690 = torch.constant.int 9223372036854775807
    %int1_13691 = torch.constant.int 1
    %9879 = torch.aten.slice.Tensor %9878, %int0_13688, %int0_13689, %int9223372036854775807_13690, %int1_13691 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a unit dim at position 1: [1,9216] -> [1,1,9216].
    %int1_13692 = torch.constant.int 1
    %9880 = torch.aten.unsqueeze %9879, %int1_13692 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Slice over dim 2 from 0 to INT64_MAX with step 1; the result type equals the input type.
    %int2_13693 = torch.constant.int 2
    %int0_13694 = torch.constant.int 0
    %int9223372036854775807_13695 = torch.constant.int 9223372036854775807
    %int1_13696 = torch.constant.int 1
    %9881 = torch.aten.slice.Tensor %9880, %int2_13693, %int0_13694, %int9223372036854775807_13695, %int1_13696 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Elements [0, 3072) of the last dim -> %9882 (used as the additive term after normalization).
    %int-1_13697 = torch.constant.int -1
    %int0_13698 = torch.constant.int 0
    %int3072_13699 = torch.constant.int 3072
    %int1_13700 = torch.constant.int 1
    %9882 = torch.aten.slice.Tensor %9881, %int-1_13697, %int0_13698, %int3072_13699, %int1_13700 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Elements [3072, 6144) of the last dim -> %9883 (becomes the multiplicative term after the +1 below).
    %int-1_13701 = torch.constant.int -1
    %int3072_13702 = torch.constant.int 3072
    %int6144_13703 = torch.constant.int 6144
    %int1_13704 = torch.constant.int 1
    %9883 = torch.aten.slice.Tensor %9881, %int-1_13701, %int3072_13702, %int6144_13703, %int1_13704 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Elements [6144, 9216) of the last dim -> %9884; it is not used within the lines visible here.
    %int-1_13705 = torch.constant.int -1
    %int6144_13706 = torch.constant.int 6144
    %int9216_13707 = torch.constant.int 9216
    %int1_13708 = torch.constant.int 1
    %9884 = torch.aten.slice.Tensor %9881, %int-1_13705, %int6144_13706, %int9216_13707, %int1_13708 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %9885 = %9883 + 1 (scalar 1, alpha 1).
    %int1_13709 = torch.constant.int 1
    %int1_13710 = torch.constant.int 1
    %9885 = torch.aten.add.Scalar %9883, %int1_13709, %int1_13710 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %9866 over its last dim (no learned weight or bias appears here),
    // then apply %9885 * normalized + %9882 and flatten to 2-D.
    // dtype code 6: the f16 input is converted to f32 for the statistics.
    %int6_13711 = torch.constant.int 6
    %9886 = torch.prims.convert_element_type %9866, %int6_13711 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    // var_mean over dim 2 with correction = 0 and keepdim = true; result0 is the variance, result1 the mean.
    %int2_13712 = torch.constant.int 2
    %9887 = torch.prim.ListConstruct %int2_13712 : (!torch.int) -> !torch.list<int>
    %int0_13713 = torch.constant.int 0
    %true_13714 = torch.constant.bool true
    %result0_13715, %result1_13716 = torch.aten.var_mean.correction %9886, %9887, %int0_13713, %true_13714 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // rsqrt(variance + ~1e-6).
    %float9.999990e-07_13717 = torch.constant.float 9.9999999999999995E-7
    %int1_13718 = torch.constant.int 1
    %9888 = torch.aten.add.Scalar %result0_13715, %float9.999990e-07_13717, %int1_13718 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %9889 = torch.aten.rsqrt %9888 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // Subtract the f32 mean from the original f16 tensor %9866 (not from %9886); the result is f32.
    %int1_13719 = torch.constant.int 1
    %9890 = torch.aten.sub.Tensor %9866, %result1_13716, %int1_13719 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %9891 = torch.aten.mul.Tensor %9890, %9889 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    // Back to f16 (dtype code 5) before the modulation arithmetic.
    %int5_13720 = torch.constant.int 5
    %9892 = torch.prims.convert_element_type %9891, %int5_13720 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // %9885 and %9882 are [1,1,3072] and broadcast against [1,4608,3072].
    %9893 = torch.aten.mul.Tensor %9885, %9892 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_13721 = torch.constant.int 1
    %9894 = torch.aten.add.Tensor %9893, %9882, %int1_13721 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Drop the leading unit dim: [1,4608,3072] -> [4608,3072], the shape consumed by the matmul below.
    %int4608_13722 = torch.constant.int 4608
    %int3072_13723 = torch.constant.int 3072
    %9895 = torch.prim.ListConstruct %int4608_13722, %int3072_13723 : (!torch.int, !torch.int) -> !torch.list<int>
    %9896 = torch.aten.view %9894, %9895 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    %__auto.sampler.single_blocks.26.linear1.weight = util.global.load @__auto.sampler.single_blocks.26.linear1.weight : tensor<21504x3072xf16>
    %9897 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.26.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_13724 = torch.constant.int 0
    %int1_13725 = torch.constant.int 1
    %9898 = torch.aten.transpose.int %9897, %int0_13724, %int1_13725 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.26.linear1.bias = util.global.load @__auto.sampler.single_blocks.26.linear1.bias : tensor<21504xf16>
    %9899 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.26.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    %int6_13726 = torch.constant.int 6
    %9900 = torch.prims.convert_element_type %9899, %int6_13726 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_13727 = torch.constant.int 6
    %9901 = torch.prims.convert_element_type %9896, %int6_13727 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_13728 = torch.constant.int 6
    %9902 = torch.prims.convert_element_type %9898, %int6_13728 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %9903 = torch.aten.mm %9901, %9902 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    %int1_13729 = torch.constant.int 1
    %9904 = torch.aten.mul.Scalar %9903, %int1_13729 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_13730 = torch.constant.int 1
    %9905 = torch.aten.mul.Scalar %9900, %int1_13730 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_13731 = torch.constant.int 1
    %9906 = torch.aten.add.Tensor %9904, %9905, %int1_13731 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_13732 = torch.constant.int 5
    %9907 = torch.prims.convert_element_type %9906, %int5_13732 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_13733 = torch.constant.int 1
    %int4608_13734 = torch.constant.int 4608
    %int21504_13735 = torch.constant.int 21504
    %9908 = torch.prim.ListConstruct %int1_13733, %int4608_13734, %int21504_13735 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9909 = torch.aten.view %9907, %9908 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // ---- single_blocks.26: split linear1 output and unpack q/k/v ----
    // Columns [0,9216) of the last dimension form the fused q/k/v projection.
    %int-1_13736 = torch.constant.int -1
    %int0_13737 = torch.constant.int 0
    %int9216_13738 = torch.constant.int 9216
    %int1_13739 = torch.constant.int 1
    %9910 = torch.aten.slice.Tensor %9909, %int-1_13736, %int0_13737, %int9216_13738, %int1_13739 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    // Columns [9216,21504) (12288 wide) are the branch that is later passed
    // through tanh-GELU and concatenated with the attention output.
    %int-1_13740 = torch.constant.int -1
    %int9216_13741 = torch.constant.int 9216
    %int21504_13742 = torch.constant.int 21504
    %int1_13743 = torch.constant.int 1
    %9911 = torch.aten.slice.Tensor %9909, %int-1_13740, %int9216_13741, %int21504_13742, %int1_13743 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View the 9216 fused columns as [3, 24, 128].
    %int1_13744 = torch.constant.int 1
    %int4608_13745 = torch.constant.int 4608
    %int3_13746 = torch.constant.int 3
    %int24_13747 = torch.constant.int 24
    %int128_13748 = torch.constant.int 128
    %9912 = torch.prim.ListConstruct %int1_13744, %int4608_13745, %int3_13746, %int24_13747, %int128_13748 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9913 = torch.aten.view %9910, %9912 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2,0,3,1,4): [1,4608,3,24,128] -> [3,1,24,4608,128],
    // bringing the size-3 axis to the front.
    %int2_13749 = torch.constant.int 2
    %int0_13750 = torch.constant.int 0
    %int3_13751 = torch.constant.int 3
    %int1_13752 = torch.constant.int 1
    %int4_13753 = torch.constant.int 4
    %9914 = torch.prim.ListConstruct %int2_13749, %int0_13750, %int3_13751, %int1_13752, %int4_13753 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9915 = torch.aten.permute %9913, %9914 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0 of the leading axis: later normalized with query_norm.scale (query).
    %int0_13754 = torch.constant.int 0
    %int0_13755 = torch.constant.int 0
    %9916 = torch.aten.select.int %9915, %int0_13754, %int0_13755 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 1: later normalized with key_norm.scale (key).
    %int0_13756 = torch.constant.int 0
    %int1_13757 = torch.constant.int 1
    %9917 = torch.aten.select.int %9915, %int0_13756, %int1_13757 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 2: passed unchanged as the third operand of the attention op (value).
    %int0_13758 = torch.constant.int 0
    %int2_13759 = torch.constant.int 2
    %9918 = torch.aten.select.int %9915, %int0_13758, %int2_13759 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- single_blocks.26 query norm ----
    // Computes x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps) in f32 on the
    // query tensor, then casts to f16 and multiplies by the learned scale.
    %int6_13760 = torch.constant.int 6
    %9919 = torch.prims.convert_element_type %9916, %int6_13760 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_13761 = torch.constant.int 2
    %9920 = torch.aten.pow.Tensor_Scalar %9919, %int2_13761 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean of squares over the last (128-wide) dimension, keeping it as size 1.
    %int-1_13762 = torch.constant.int -1
    %9921 = torch.prim.ListConstruct %int-1_13762 : (!torch.int) -> !torch.list<int>
    %true_13763 = torch.constant.bool true
    %none_13764 = torch.constant.none
    %9922 = torch.aten.mean.dim %9920, %9921, %true_13763, %none_13764 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // eps: the literal is the closest double to 1e-6.
    %float9.999990e-07_13765 = torch.constant.float 9.9999999999999995E-7
    %int1_13766 = torch.constant.int 1
    %9923 = torch.aten.add.Scalar %9922, %float9.999990e-07_13765, %int1_13766 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %9924 = torch.aten.rsqrt %9923 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %9925 = torch.aten.mul.Tensor %9919, %9924 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    // Back to f16 before applying the per-channel [128] scale.
    %int5_13767 = torch.constant.int 5
    %9926 = torch.prims.convert_element_type %9925, %int5_13767 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.26.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.26.norm.query_norm.scale : tensor<128xf16>
    %9927 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.26.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %9928 = torch.aten.mul.Tensor %9926, %9927 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- single_blocks.26 key norm ----
    // Same op sequence as the query norm, applied to the key tensor %9917:
    // x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps) in f32, cast to f16,
    // then multiplied by key_norm.scale.
    %int6_13768 = torch.constant.int 6
    %9929 = torch.prims.convert_element_type %9917, %int6_13768 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_13769 = torch.constant.int 2
    %9930 = torch.aten.pow.Tensor_Scalar %9929, %int2_13769 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean of squares over the last dimension, keepdim=true.
    %int-1_13770 = torch.constant.int -1
    %9931 = torch.prim.ListConstruct %int-1_13770 : (!torch.int) -> !torch.list<int>
    %true_13771 = torch.constant.bool true
    %none_13772 = torch.constant.none
    %9932 = torch.aten.mean.dim %9930, %9931, %true_13771, %none_13772 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // eps: the literal is the closest double to 1e-6.
    %float9.999990e-07_13773 = torch.constant.float 9.9999999999999995E-7
    %int1_13774 = torch.constant.int 1
    %9933 = torch.aten.add.Scalar %9932, %float9.999990e-07_13773, %int1_13774 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %9934 = torch.aten.rsqrt %9933 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %9935 = torch.aten.mul.Tensor %9929, %9934 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    // Back to f16 before applying the per-channel [128] scale.
    %int5_13775 = torch.constant.int 5
    %9936 = torch.prims.convert_element_type %9935, %int5_13775 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.26.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.26.norm.key_norm.scale : tensor<128xf16>
    %9937 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.26.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %9938 = torch.aten.mul.Tensor %9936, %9937 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- single_blocks.26: combine q and k with the table %211 ----
    // %211 is defined outside this chunk; its type is [1,1,4608,64,2,2] f32.
    // NOTE(review): presumably the rotary position-embedding table — confirm
    // against its definition.
    //
    // These two casts are f16 -> f16 (result type equals operand type), so
    // they do not change the element type.
    %int5_13776 = torch.constant.int 5
    %9939 = torch.prims.convert_element_type %9928, %int5_13776 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_13777 = torch.constant.int 5
    %9940 = torch.prims.convert_element_type %9938, %int5_13777 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Upcast q to f32 and view the 128 channels as [64,1,2].
    %int6_13778 = torch.constant.int 6
    %9941 = torch.prims.convert_element_type %9939, %int6_13778 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_13779 = torch.constant.int 1
    %int24_13780 = torch.constant.int 24
    %int4608_13781 = torch.constant.int 4608
    %int64_13782 = torch.constant.int 64
    %int1_13783 = torch.constant.int 1
    %int2_13784 = torch.constant.int 2
    %9942 = torch.prim.ListConstruct %int1_13779, %int24_13780, %int4608_13781, %int64_13782, %int1_13783, %int2_13784 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9943 = torch.aten.view %9941, %9942 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Same upcast and view for k.
    %int6_13785 = torch.constant.int 6
    %9944 = torch.prims.convert_element_type %9940, %int6_13785 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_13786 = torch.constant.int 1
    %int24_13787 = torch.constant.int 24
    %int4608_13788 = torch.constant.int 4608
    %int64_13789 = torch.constant.int 64
    %int1_13790 = torch.constant.int 1
    %int2_13791 = torch.constant.int 2
    %9945 = torch.prim.ListConstruct %int1_13786, %int24_13787, %int4608_13788, %int64_13789, %int1_13790, %int2_13791 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9946 = torch.aten.view %9944, %9945 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // q: out = %211[..., 0] * q[..., 0] + %211[..., 1] * q[..., 1]
    // (index taken on dim 5; the size-1 axis of q broadcasts against the
    // size-2 axis of the table, and the table's size-1 axis against 24).
    %int5_13792 = torch.constant.int 5
    %int0_13793 = torch.constant.int 0
    %9947 = torch.aten.select.int %211, %int5_13792, %int0_13793 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13794 = torch.constant.int 5
    %int0_13795 = torch.constant.int 0
    %9948 = torch.aten.select.int %9943, %int5_13794, %int0_13795 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9949 = torch.aten.mul.Tensor %9947, %9948 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_13796 = torch.constant.int 5
    %int1_13797 = torch.constant.int 1
    %9950 = torch.aten.select.int %211, %int5_13796, %int1_13797 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13798 = torch.constant.int 5
    %int1_13799 = torch.constant.int 1
    %9951 = torch.aten.select.int %9943, %int5_13798, %int1_13799 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9952 = torch.aten.mul.Tensor %9950, %9951 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_13800 = torch.constant.int 1
    %9953 = torch.aten.add.Tensor %9949, %9952, %int1_13800 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k: the same two-term combination, using the k view %9946.
    %int5_13801 = torch.constant.int 5
    %int0_13802 = torch.constant.int 0
    %9954 = torch.aten.select.int %211, %int5_13801, %int0_13802 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13803 = torch.constant.int 5
    %int0_13804 = torch.constant.int 0
    %9955 = torch.aten.select.int %9946, %int5_13803, %int0_13804 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9956 = torch.aten.mul.Tensor %9954, %9955 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_13805 = torch.constant.int 5
    %int1_13806 = torch.constant.int 1
    %9957 = torch.aten.select.int %211, %int5_13805, %int1_13806 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13807 = torch.constant.int 5
    %int1_13808 = torch.constant.int 1
    %9958 = torch.aten.select.int %9946, %int5_13807, %int1_13808 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9959 = torch.aten.mul.Tensor %9957, %9958 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_13809 = torch.constant.int 1
    %9960 = torch.aten.add.Tensor %9956, %9959, %int1_13809 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Collapse [64,2] back to 128 channels and cast to f16: q result.
    %int1_13810 = torch.constant.int 1
    %int24_13811 = torch.constant.int 24
    %int4608_13812 = torch.constant.int 4608
    %int128_13813 = torch.constant.int 128
    %9961 = torch.prim.ListConstruct %int1_13810, %int24_13811, %int4608_13812, %int128_13813 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9962 = torch.aten.view %9953, %9961 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13814 = torch.constant.int 5
    %9963 = torch.prims.convert_element_type %9962, %int5_13814 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Same collapse and cast for the k result.
    %int1_13815 = torch.constant.int 1
    %int24_13816 = torch.constant.int 24
    %int4608_13817 = torch.constant.int 4608
    %int128_13818 = torch.constant.int 128
    %9964 = torch.prim.ListConstruct %int1_13815, %int24_13816, %int4608_13817, %int128_13818 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9965 = torch.aten.view %9960, %9964 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13819 = torch.constant.int 5
    %9966 = torch.prims.convert_element_type %9965, %int5_13819 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- single_blocks.26 attention ----
    // Operands after q (%9963), k (%9966), v (%9918): a float 0.0, a bool
    // false, and two `none` values.
    // NOTE(review): per the PyTorch op signature these should be dropout_p,
    // is_causal, attn_mask and scale — confirm against the PyTorch schema.
    %float0.000000e00_13820 = torch.constant.float 0.000000e+00
    %false_13821 = torch.constant.bool false
    %none_13822 = torch.constant.none
    %none_13823 = torch.constant.none
    // The op yields two results; the permute below uses result #0, the
    // [1,24,4608,128] f16 attention output.
    %9967:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%9963, %9966, %9918, %float0.000000e00_13820, %false_13821, %none_13822, %none_13823) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2: [1,24,4608,128] -> [1,4608,24,128].
    %int0_13824 = torch.constant.int 0
    %int2_13825 = torch.constant.int 2
    %int1_13826 = torch.constant.int 1
    %int3_13827 = torch.constant.int 3
    %9968 = torch.prim.ListConstruct %int0_13824, %int2_13825, %int1_13826, %int3_13827 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9969 = torch.aten.permute %9967#0, %9968 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the [24,128] axes into 3072 channels.
    %int1_13828 = torch.constant.int 1
    %int4608_13829 = torch.constant.int 4608
    %int3072_13830 = torch.constant.int 3072
    %9970 = torch.prim.ListConstruct %int1_13828, %int4608_13829, %int3072_13830 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9971 = torch.aten.view %9969, %9970 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // ---- single_blocks.26: GELU branch, concat, linear2, residual ----
    // Apply GELU with the "tanh" approximation to the 12288-wide slice %9911.
    %str_13831 = torch.constant.str "tanh"
    %9972 = torch.aten.gelu %9911, %str_13831 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    // Concatenate attention output (3072) and GELU output (12288) along
    // dim 2, giving 15360 channels.
    %9973 = torch.prim.ListConstruct %9971, %9972 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_13832 = torch.constant.int 2
    %9974 = torch.aten.cat %9973, %int2_13832 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Flatten to 2-D for the matmul.
    %int4608_13833 = torch.constant.int 4608
    %int15360_13834 = torch.constant.int 15360
    %9975 = torch.prim.ListConstruct %int4608_13833, %int15360_13834 : (!torch.int, !torch.int) -> !torch.list<int>
    %9976 = torch.aten.view %9974, %9975 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // Load linear2 weight [3072,15360], transpose to [15360,3072]; load bias.
    %__auto.sampler.single_blocks.26.linear2.weight = util.global.load @__auto.sampler.single_blocks.26.linear2.weight : tensor<3072x15360xf16>
    %9977 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.26.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_13835 = torch.constant.int 0
    %int1_13836 = torch.constant.int 1
    %9978 = torch.aten.transpose.int %9977, %int0_13835, %int1_13836 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.26.linear2.bias = util.global.load @__auto.sampler.single_blocks.26.linear2.bias : tensor<3072xf16>
    %9979 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.26.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, input and weight to f32; matmul and bias add run in f32.
    %int6_13837 = torch.constant.int 6
    %9980 = torch.prims.convert_element_type %9979, %int6_13837 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_13838 = torch.constant.int 6
    %9981 = torch.prims.convert_element_type %9976, %int6_13838 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_13839 = torch.constant.int 6
    %9982 = torch.prims.convert_element_type %9978, %int6_13839 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %9983 = torch.aten.mm %9981, %9982 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    %int1_13840 = torch.constant.int 1
    %9984 = torch.aten.mul.Scalar %9983, %int1_13840 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_13841 = torch.constant.int 1
    %9985 = torch.aten.mul.Scalar %9980, %int1_13841 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_13842 = torch.constant.int 1
    %9986 = torch.aten.add.Tensor %9984, %9985, %int1_13842 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    // Cast back to f16 and restore the leading unit dimension.
    %int5_13843 = torch.constant.int 5
    %9987 = torch.prims.convert_element_type %9986, %int5_13843 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_13844 = torch.constant.int 1
    %int4608_13845 = torch.constant.int 4608
    %int3072_13846 = torch.constant.int 3072
    %9988 = torch.prim.ListConstruct %int1_13844, %int4608_13845, %int3072_13846 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9989 = torch.aten.view %9987, %9988 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Multiply by %9884 ([1,1,3072], broadcast over the 4608 axis) and add
    // to %9866. Both are defined outside this chunk.
    // NOTE(review): presumably %9884 is the modulation gate and %9866 the
    // block input (residual) — confirm against their definitions.
    %9990 = torch.aten.mul.Tensor %9884, %9989 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_13847 = torch.constant.int 1
    %9991 = torch.aten.add.Tensor %9866, %9990, %int1_13847 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // ---- single_blocks.27.modulation ----
    // SiLU on %119 ([1,3072] f16, defined outside this chunk), followed by a
    // linear layer that produces 9216 = 3 x 3072 modulation values.
    // NOTE(review): %119 is presumably the conditioning vector — confirm.
    %9992 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.27.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.27.modulation.lin.weight : tensor<9216x3072xf16>
    %9993 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.27.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_13848 = torch.constant.int 0
    %int1_13849 = torch.constant.int 1
    %9994 = torch.aten.transpose.int %9993, %int0_13848, %int1_13849 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.27.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.27.modulation.lin.bias : tensor<9216xf16>
    %9995 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.27.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast to f32, matmul, add bias (the multiplications by 1 leave the
    // values unchanged), then cast back to f16.
    %int6_13850 = torch.constant.int 6
    %9996 = torch.prims.convert_element_type %9995, %int6_13850 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_13851 = torch.constant.int 6
    %9997 = torch.prims.convert_element_type %9992, %int6_13851 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_13852 = torch.constant.int 6
    %9998 = torch.prims.convert_element_type %9994, %int6_13852 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %9999 = torch.aten.mm %9997, %9998 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    %int1_13853 = torch.constant.int 1
    %10000 = torch.aten.mul.Scalar %9999, %int1_13853 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_13854 = torch.constant.int 1
    %10001 = torch.aten.mul.Scalar %9996, %int1_13854 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_13855 = torch.constant.int 1
    %10002 = torch.aten.add.Tensor %10000, %10001, %int1_13855 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_13856 = torch.constant.int 5
    %10003 = torch.prims.convert_element_type %10002, %int5_13856 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Slice dim 0 from 0 to INT64_MAX with step 1: keeps the whole dimension
    // (the result type is unchanged).
    %int0_13857 = torch.constant.int 0
    %int0_13858 = torch.constant.int 0
    %int9223372036854775807_13859 = torch.constant.int 9223372036854775807
    %int1_13860 = torch.constant.int 1
    %10004 = torch.aten.slice.Tensor %10003, %int0_13857, %int0_13858, %int9223372036854775807_13859, %int1_13860 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a unit axis at dim 1: [1,9216] -> [1,1,9216].
    %int1_13861 = torch.constant.int 1
    %10005 = torch.aten.unsqueeze %10004, %int1_13861 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Another full-range slice (dim 2); the result type is unchanged.
    %int2_13862 = torch.constant.int 2
    %int0_13863 = torch.constant.int 0
    %int9223372036854775807_13864 = torch.constant.int 9223372036854775807
    %int1_13865 = torch.constant.int 1
    %10006 = torch.aten.slice.Tensor %10005, %int2_13862, %int0_13863, %int9223372036854775807_13864, %int1_13865 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Chunk 0, columns [0,3072): added after the normalization (shift).
    %int-1_13866 = torch.constant.int -1
    %int0_13867 = torch.constant.int 0
    %int3072_13868 = torch.constant.int 3072
    %int1_13869 = torch.constant.int 1
    %10007 = torch.aten.slice.Tensor %10006, %int-1_13866, %int0_13867, %int3072_13868, %int1_13869 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1, columns [3072,6144): becomes the multiplicative scale below.
    %int-1_13870 = torch.constant.int -1
    %int3072_13871 = torch.constant.int 3072
    %int6144_13872 = torch.constant.int 6144
    %int1_13873 = torch.constant.int 1
    %10008 = torch.aten.slice.Tensor %10006, %int-1_13870, %int3072_13871, %int6144_13872, %int1_13873 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2, columns [6144,9216): its use is not visible in this chunk.
    // NOTE(review): presumably the gate applied after linear2 — confirm.
    %int-1_13874 = torch.constant.int -1
    %int6144_13875 = torch.constant.int 6144
    %int9216_13876 = torch.constant.int 9216
    %int1_13877 = torch.constant.int 1
    %10009 = torch.aten.slice.Tensor %10006, %int-1_13874, %int6144_13875, %int9216_13876, %int1_13877 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // scale = 1 + chunk 1.
    %int1_13878 = torch.constant.int 1
    %int1_13879 = torch.constant.int 1
    %10010 = torch.aten.add.Scalar %10008, %int1_13878, %int1_13879 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // ---- single_blocks.27: normalization + modulation ----
    // Normalizes %9991 over the 3072-channel axis without learned affine
    // parameters, then applies (1 + scale) * x_norm + shift.
    // The f32 copy %10011 feeds only var_mean here.
    %int6_13880 = torch.constant.int 6
    %10011 = torch.prims.convert_element_type %9991, %int6_13880 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    // Variance (correction = 0) and mean over dim 2, keepdim=true.
    %int2_13881 = torch.constant.int 2
    %10012 = torch.prim.ListConstruct %int2_13881 : (!torch.int) -> !torch.list<int>
    %int0_13882 = torch.constant.int 0
    %true_13883 = torch.constant.bool true
    %result0_13884, %result1_13885 = torch.aten.var_mean.correction %10011, %10012, %int0_13882, %true_13883 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // rsqrt(var + eps); the eps literal is the closest double to 1e-6.
    %float9.999990e-07_13886 = torch.constant.float 9.9999999999999995E-7
    %int1_13887 = torch.constant.int 1
    %10013 = torch.aten.add.Scalar %result0_13884, %float9.999990e-07_13886, %int1_13887 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %10014 = torch.aten.rsqrt %10013 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the f16 tensor %9991 and the f32 mean; the
    // declared result type is f32.
    %int1_13888 = torch.constant.int 1
    %10015 = torch.aten.sub.Tensor %9991, %result1_13885, %int1_13888 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %10016 = torch.aten.mul.Tensor %10015, %10014 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    // Cast the normalized tensor to f16, multiply by %10010 (1 + scale) and
    // add %10007 (shift); both broadcast over the 4608 axis.
    %int5_13889 = torch.constant.int 5
    %10017 = torch.prims.convert_element_type %10016, %int5_13889 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %10018 = torch.aten.mul.Tensor %10010, %10017 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_13890 = torch.constant.int 1
    %10019 = torch.aten.add.Tensor %10018, %10007, %int1_13890 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Flatten to [4608,3072] for the linear1 matmul that follows.
    %int4608_13891 = torch.constant.int 4608
    %int3072_13892 = torch.constant.int 3072
    %10020 = torch.prim.ListConstruct %int4608_13891, %int3072_13892 : (!torch.int, !torch.int) -> !torch.list<int>
    %10021 = torch.aten.view %10019, %10020 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // ---- single_blocks.27.linear1 ----
    // Load the f16 weight [21504,3072] and transpose it to [3072,21504].
    %__auto.sampler.single_blocks.27.linear1.weight = util.global.load @__auto.sampler.single_blocks.27.linear1.weight : tensor<21504x3072xf16>
    %10022 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.27.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_13893 = torch.constant.int 0
    %int1_13894 = torch.constant.int 1
    %10023 = torch.aten.transpose.int %10022, %int0_13893, %int1_13894 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.27.linear1.bias = util.global.load @__auto.sampler.single_blocks.27.linear1.bias : tensor<21504xf16>
    %10024 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.27.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Upcast bias, input and weight to f32 (dtype code 6 produces f32 results
    // here) so the matmul and bias add run in f32.
    %int6_13895 = torch.constant.int 6
    %10025 = torch.prims.convert_element_type %10024, %int6_13895 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_13896 = torch.constant.int 6
    %10026 = torch.prims.convert_element_type %10021, %int6_13896 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_13897 = torch.constant.int 6
    %10027 = torch.prims.convert_element_type %10023, %int6_13897 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %10028 = torch.aten.mm %10026, %10027 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    %int1_13898 = torch.constant.int 1
    %10029 = torch.aten.mul.Scalar %10028, %int1_13898 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_13899 = torch.constant.int 1
    %10030 = torch.aten.mul.Scalar %10025, %int1_13899 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_13900 = torch.constant.int 1
    %10031 = torch.aten.add.Tensor %10029, %10030, %int1_13900 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    // Cast back to f16 (dtype code 5) and restore the leading unit
    // dimension: [4608,21504] -> [1,4608,21504].
    %int5_13901 = torch.constant.int 5
    %10032 = torch.prims.convert_element_type %10031, %int5_13901 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_13902 = torch.constant.int 1
    %int4608_13903 = torch.constant.int 4608
    %int21504_13904 = torch.constant.int 21504
    %10033 = torch.prim.ListConstruct %int1_13902, %int4608_13903, %int21504_13904 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10034 = torch.aten.view %10032, %10033 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // ---- single_blocks.27: split linear1 output and unpack three tensors ----
    // Columns [0,9216) of the last dimension are reshaped below into three
    // [1,24,4608,128] tensors.
    %int-1_13905 = torch.constant.int -1
    %int0_13906 = torch.constant.int 0
    %int9216_13907 = torch.constant.int 9216
    %int1_13908 = torch.constant.int 1
    %10035 = torch.aten.slice.Tensor %10034, %int-1_13905, %int0_13906, %int9216_13907, %int1_13908 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    // Columns [9216,21504) (12288 wide); the use of %10036 lies past the end
    // of this chunk.
    // NOTE(review): presumably the GELU/MLP branch, as in single_blocks.26 — confirm.
    %int-1_13909 = torch.constant.int -1
    %int9216_13910 = torch.constant.int 9216
    %int21504_13911 = torch.constant.int 21504
    %int1_13912 = torch.constant.int 1
    %10036 = torch.aten.slice.Tensor %10034, %int-1_13909, %int9216_13910, %int21504_13911, %int1_13912 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View the 9216 fused columns as [3, 24, 128].
    %int1_13913 = torch.constant.int 1
    %int4608_13914 = torch.constant.int 4608
    %int3_13915 = torch.constant.int 3
    %int24_13916 = torch.constant.int 24
    %int128_13917 = torch.constant.int 128
    %10037 = torch.prim.ListConstruct %int1_13913, %int4608_13914, %int3_13915, %int24_13916, %int128_13917 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10038 = torch.aten.view %10035, %10037 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2,0,3,1,4): [1,4608,3,24,128] -> [3,1,24,4608,128].
    %int2_13918 = torch.constant.int 2
    %int0_13919 = torch.constant.int 0
    %int3_13920 = torch.constant.int 3
    %int1_13921 = torch.constant.int 1
    %int4_13922 = torch.constant.int 4
    %10039 = torch.prim.ListConstruct %int2_13918, %int0_13919, %int3_13920, %int1_13921, %int4_13922 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10040 = torch.aten.permute %10038, %10039 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0 of the leading axis: later normalized with query_norm.scale (query).
    %int0_13923 = torch.constant.int 0
    %int0_13924 = torch.constant.int 0
    %10041 = torch.aten.select.int %10040, %int0_13923, %int0_13924 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Indices 1 and 2: their uses lie past the end of this chunk.
    // NOTE(review): presumably key and value, as in single_blocks.26 — confirm.
    %int0_13925 = torch.constant.int 0
    %int1_13926 = torch.constant.int 1
    %10042 = torch.aten.select.int %10040, %int0_13925, %int1_13926 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_13927 = torch.constant.int 0
    %int2_13928 = torch.constant.int 2
    %10043 = torch.aten.select.int %10040, %int0_13927, %int2_13928 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // RMS-style normalization of %10041 over its last axis, computed in f32:
    //   %10053 = f16( x * rsqrt(mean(x^2, dim=-1, keepdim) + eps) ) * query_norm.scale
    // Upcast to f32 (dtype code 6 yields f32 per the result type).
    %int6_13929 = torch.constant.int 6
    %10044 = torch.prims.convert_element_type %10041, %int6_13929 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Element-wise square.
    %int2_13930 = torch.constant.int 2
    %10045 = torch.aten.pow.Tensor_Scalar %10044, %int2_13930 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean of squares over the last axis, keeping it as size 1.
    %int-1_13931 = torch.constant.int -1
    %10046 = torch.prim.ListConstruct %int-1_13931 : (!torch.int) -> !torch.list<int>
    %true_13932 = torch.constant.bool true
    %none_13933 = torch.constant.none
    %10047 = torch.aten.mean.dim %10045, %10046, %true_13932, %none_13933 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // Add the epsilon (approximately 1e-6) before the reciprocal square root.
    %float9.999990e-07_13934 = torch.constant.float 9.9999999999999995E-7
    %int1_13935 = torch.constant.int 1
    %10048 = torch.aten.add.Scalar %10047, %float9.999990e-07_13934, %int1_13935 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %10049 = torch.aten.rsqrt %10048 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    // Scale the f32 input by the broadcast [.,.,.,1] factor, then cast back to f16 (dtype code 5).
    %10050 = torch.aten.mul.Tensor %10044, %10049 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13936 = torch.constant.int 5
    %10051 = torch.prims.convert_element_type %10050, %int5_13936 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Load the 128-element f16 scale parameter and multiply it in (broadcast over the last axis).
    %__auto.sampler.single_blocks.27.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.27.norm.query_norm.scale : tensor<128xf16>
    %10052 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.27.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %10053 = torch.aten.mul.Tensor %10051, %10052 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // RMS-style normalization of %10042 over its last axis, computed in f32:
    //   %10063 = f16( x * rsqrt(mean(x^2, dim=-1, keepdim) + eps) ) * key_norm.scale
    // Same op sequence as the query_norm computation directly above, applied to %10042.
    // Upcast to f32 (dtype code 6 yields f32 per the result type).
    %int6_13937 = torch.constant.int 6
    %10054 = torch.prims.convert_element_type %10042, %int6_13937 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Element-wise square.
    %int2_13938 = torch.constant.int 2
    %10055 = torch.aten.pow.Tensor_Scalar %10054, %int2_13938 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean of squares over the last axis, keeping it as size 1.
    %int-1_13939 = torch.constant.int -1
    %10056 = torch.prim.ListConstruct %int-1_13939 : (!torch.int) -> !torch.list<int>
    %true_13940 = torch.constant.bool true
    %none_13941 = torch.constant.none
    %10057 = torch.aten.mean.dim %10055, %10056, %true_13940, %none_13941 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // Add the epsilon (approximately 1e-6) before the reciprocal square root.
    %float9.999990e-07_13942 = torch.constant.float 9.9999999999999995E-7
    %int1_13943 = torch.constant.int 1
    %10058 = torch.aten.add.Scalar %10057, %float9.999990e-07_13942, %int1_13943 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %10059 = torch.aten.rsqrt %10058 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    // Scale the f32 input by the broadcast factor, then cast back to f16 (dtype code 5).
    %10060 = torch.aten.mul.Tensor %10054, %10059 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13944 = torch.constant.int 5
    %10061 = torch.prims.convert_element_type %10060, %int5_13944 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Load the 128-element f16 scale parameter and multiply it in (broadcast over the last axis).
    %__auto.sampler.single_blocks.27.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.27.norm.key_norm.scale : tensor<128xf16>
    %10062 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.27.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %10063 = torch.aten.mul.Tensor %10061, %10062 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Prepare the two normalized tensors (%10053, %10063) for the pairwise combination with %211 below:
    // each is upcast to f32 and its 128-wide last axis is viewed as [64, 1, 2].
    // The two f16 -> f16 conversions leave the type unchanged (operand and result types are identical).
    %int5_13945 = torch.constant.int 5
    %10064 = torch.prims.convert_element_type %10053, %int5_13945 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_13946 = torch.constant.int 5
    %10065 = torch.prims.convert_element_type %10063, %int5_13946 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // First tensor: f16 -> f32, then view as [1, 24, 4608, 64, 1, 2].
    %int6_13947 = torch.constant.int 6
    %10066 = torch.prims.convert_element_type %10064, %int6_13947 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_13948 = torch.constant.int 1
    %int24_13949 = torch.constant.int 24
    %int4608_13950 = torch.constant.int 4608
    %int64_13951 = torch.constant.int 64
    %int1_13952 = torch.constant.int 1
    %int2_13953 = torch.constant.int 2
    %10067 = torch.prim.ListConstruct %int1_13948, %int24_13949, %int4608_13950, %int64_13951, %int1_13952, %int2_13953 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10068 = torch.aten.view %10066, %10067 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Second tensor: f16 -> f32, then the same [1, 24, 4608, 64, 1, 2] view.
    %int6_13954 = torch.constant.int 6
    %10069 = torch.prims.convert_element_type %10065, %int6_13954 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_13955 = torch.constant.int 1
    %int24_13956 = torch.constant.int 24
    %int4608_13957 = torch.constant.int 4608
    %int64_13958 = torch.constant.int 64
    %int1_13959 = torch.constant.int 1
    %int2_13960 = torch.constant.int 2
    %10070 = torch.prim.ListConstruct %int1_13955, %int24_13956, %int4608_13957, %int64_13958, %int1_13959, %int2_13960 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10071 = torch.aten.view %10069, %10070 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Combine each paired tensor x (%10068, then %10071) with %211 ([1,1,4608,64,2,2], f32):
    //   out = %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]
    // The size-1 axes broadcast, so each product has shape [1,24,4608,64,2].
    // NOTE(review): this looks like a rotary position embedding being applied, with %211 holding
    // per-position 2x2 coefficients — %211 is defined outside this chunk, so confirm there.
    // --- first tensor (%10068) ---
    %int5_13961 = torch.constant.int 5
    %int0_13962 = torch.constant.int 0
    %10072 = torch.aten.select.int %211, %int5_13961, %int0_13962 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13963 = torch.constant.int 5
    %int0_13964 = torch.constant.int 0
    %10073 = torch.aten.select.int %10068, %int5_13963, %int0_13964 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10074 = torch.aten.mul.Tensor %10072, %10073 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_13965 = torch.constant.int 5
    %int1_13966 = torch.constant.int 1
    %10075 = torch.aten.select.int %211, %int5_13965, %int1_13966 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13967 = torch.constant.int 5
    %int1_13968 = torch.constant.int 1
    %10076 = torch.aten.select.int %10068, %int5_13967, %int1_13968 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10077 = torch.aten.mul.Tensor %10075, %10076 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two products (the trailing integer operand of the add is 1).
    %int1_13969 = torch.constant.int 1
    %10078 = torch.aten.add.Tensor %10074, %10077, %int1_13969 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // --- second tensor (%10071), same sequence ---
    %int5_13970 = torch.constant.int 5
    %int0_13971 = torch.constant.int 0
    %10079 = torch.aten.select.int %211, %int5_13970, %int0_13971 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13972 = torch.constant.int 5
    %int0_13973 = torch.constant.int 0
    %10080 = torch.aten.select.int %10071, %int5_13972, %int0_13973 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10081 = torch.aten.mul.Tensor %10079, %10080 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_13974 = torch.constant.int 5
    %int1_13975 = torch.constant.int 1
    %10082 = torch.aten.select.int %211, %int5_13974, %int1_13975 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13976 = torch.constant.int 5
    %int1_13977 = torch.constant.int 1
    %10083 = torch.aten.select.int %10071, %int5_13976, %int1_13977 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10084 = torch.aten.mul.Tensor %10082, %10083 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_13978 = torch.constant.int 1
    %10085 = torch.aten.add.Tensor %10081, %10084, %int1_13978 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Fold the [64, 2] pair axes back into a 128-wide last axis and cast to f16 (dtype code 5).
    %int1_13979 = torch.constant.int 1
    %int24_13980 = torch.constant.int 24
    %int4608_13981 = torch.constant.int 4608
    %int128_13982 = torch.constant.int 128
    %10086 = torch.prim.ListConstruct %int1_13979, %int24_13980, %int4608_13981, %int128_13982 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10087 = torch.aten.view %10078, %10086 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13983 = torch.constant.int 5
    %10088 = torch.prims.convert_element_type %10087, %int5_13983 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_13984 = torch.constant.int 1
    %int24_13985 = torch.constant.int 24
    %int4608_13986 = torch.constant.int 4608
    %int128_13987 = torch.constant.int 128
    %10089 = torch.prim.ListConstruct %int1_13984, %int24_13985, %int4608_13986, %int128_13987 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10090 = torch.aten.view %10085, %10089 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13988 = torch.constant.int 5
    %10091 = torch.prims.convert_element_type %10090, %int5_13988 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Attention over the three [1,24,4608,128] f16 operands %10088, %10091 and %10043.
    // The op is emitted as a generic torch.operator rather than a registered torch.aten op.
    // NOTE(review): the trailing operands (0.0, false, none, none) presumably map to
    // dropout_p, is_causal, attn_mask and scale of the ATen signature — confirm.
    %float0.000000e00_13989 = torch.constant.float 0.000000e+00
    %false_13990 = torch.constant.bool false
    %none_13991 = torch.constant.none
    %none_13992 = torch.constant.none
    // Result #0 is the [1,24,4608,128] f16 output; result #1 ([1,24,4608] f32) is not used in this chunk.
    %10092:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%10088, %10091, %10043, %float0.000000e00_13989, %false_13990, %none_13991, %none_13992) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap axes 1 and 2 (order 0, 2, 1, 3): [1,24,4608,128] -> [1,4608,24,128].
    %int0_13993 = torch.constant.int 0
    %int2_13994 = torch.constant.int 2
    %int1_13995 = torch.constant.int 1
    %int3_13996 = torch.constant.int 3
    %10093 = torch.prim.ListConstruct %int0_13993, %int2_13994, %int1_13995, %int3_13996 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10094 = torch.aten.permute %10092#0, %10093 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the [24, 128] axes into a single 3072-wide axis: [1,4608,3072].
    %int1_13997 = torch.constant.int 1
    %int4608_13998 = torch.constant.int 4608
    %int3072_13999 = torch.constant.int 3072
    %10095 = torch.prim.ListConstruct %int1_13997, %int4608_13998, %int3072_13999 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10096 = torch.aten.view %10094, %10095 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // GELU with the "tanh" approximation on the 12288-wide slice %10036.
    %str_14000 = torch.constant.str "tanh"
    %10097 = torch.aten.gelu %10036, %str_14000 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    // Concatenate the attention output %10096 (3072) and the GELU output (12288) along dim 2 -> 15360.
    %10098 = torch.prim.ListConstruct %10096, %10097 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_14001 = torch.constant.int 2
    %10099 = torch.aten.cat %10098, %int2_14001 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Drop the leading size-1 axis so the result is a 2-D [4608,15360] matrix for the matmul below.
    %int4608_14002 = torch.constant.int 4608
    %int15360_14003 = torch.constant.int 15360
    %10100 = torch.prim.ListConstruct %int4608_14002, %int15360_14003 : (!torch.int, !torch.int) -> !torch.list<int>
    %10101 = torch.aten.view %10099, %10100 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // Linear layer using the single_blocks.27.linear2 parameters, computed in f32:
    //   %10112 = f16( f32(%10101) @ f32(weight^T) + f32(bias) )   ([4608,15360] -> [4608,3072])
    // Load the [3072,15360] weight and transpose it to [15360,3072].
    %__auto.sampler.single_blocks.27.linear2.weight = util.global.load @__auto.sampler.single_blocks.27.linear2.weight : tensor<3072x15360xf16>
    %10102 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.27.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_14004 = torch.constant.int 0
    %int1_14005 = torch.constant.int 1
    %10103 = torch.aten.transpose.int %10102, %int0_14004, %int1_14005 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    // Load the [3072] bias.
    %__auto.sampler.single_blocks.27.linear2.bias = util.global.load @__auto.sampler.single_blocks.27.linear2.bias : tensor<3072xf16>
    %10104 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.27.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, input and weight to f32 (dtype code 6) before the matmul.
    %int6_14006 = torch.constant.int 6
    %10105 = torch.prims.convert_element_type %10104, %int6_14006 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_14007 = torch.constant.int 6
    %10106 = torch.prims.convert_element_type %10101, %int6_14007 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_14008 = torch.constant.int 6
    %10107 = torch.prims.convert_element_type %10103, %int6_14008 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %10108 = torch.aten.mm %10106, %10107 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Both operands are multiplied by the integer 1 before being added.
    // NOTE(review): presumably the alpha/beta factors left over from a decomposed addmm — confirm.
    %int1_14009 = torch.constant.int 1
    %10109 = torch.aten.mul.Scalar %10108, %int1_14009 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_14010 = torch.constant.int 1
    %10110 = torch.aten.mul.Scalar %10105, %int1_14010 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_14011 = torch.constant.int 1
    %10111 = torch.aten.add.Tensor %10109, %10110, %int1_14011 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    // Cast back to f16 (dtype code 5) and restore the leading size-1 axis.
    %int5_14012 = torch.constant.int 5
    %10112 = torch.prims.convert_element_type %10111, %int5_14012 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_14013 = torch.constant.int 1
    %int4608_14014 = torch.constant.int 4608
    %int3072_14015 = torch.constant.int 3072
    %10113 = torch.prim.ListConstruct %int1_14013, %int4608_14014, %int3072_14015 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10114 = torch.aten.view %10112, %10113 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // %10116 = %9991 + %10009 * %10114, with %10009 ([1,1,3072]) broadcast across the 4608 rows.
    // NOTE(review): %10009 and %9991 are defined above this chunk — presumably the block's
    // gate vector and its residual input; confirm at their definitions.
    %10115 = torch.aten.mul.Tensor %10009, %10114 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_14016 = torch.constant.int 1
    %10116 = torch.aten.add.Tensor %9991, %10115, %int1_14016 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Modulation vectors from the single_blocks.28.modulation.lin parameters:
    //   %10128 = f16( f32(silu(%119)) @ f32(weight^T) + f32(bias) )   ([1,3072] -> [1,9216])
    // and then a split into three [1,1,3072] chunks. %119 is defined outside this chunk.
    %10117 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the [9216,3072] weight and transpose it to [3072,9216].
    %__auto.sampler.single_blocks.28.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.28.modulation.lin.weight : tensor<9216x3072xf16>
    %10118 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.28.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_14017 = torch.constant.int 0
    %int1_14018 = torch.constant.int 1
    %10119 = torch.aten.transpose.int %10118, %int0_14017, %int1_14018 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    // Load the [9216] bias.
    %__auto.sampler.single_blocks.28.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.28.modulation.lin.bias : tensor<9216xf16>
    %10120 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.28.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, input and weight to f32 (dtype code 6) before the matmul.
    %int6_14019 = torch.constant.int 6
    %10121 = torch.prims.convert_element_type %10120, %int6_14019 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_14020 = torch.constant.int 6
    %10122 = torch.prims.convert_element_type %10117, %int6_14020 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_14021 = torch.constant.int 6
    %10123 = torch.prims.convert_element_type %10119, %int6_14021 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %10124 = torch.aten.mm %10122, %10123 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Both operands are multiplied by the integer 1 before being added.
    // NOTE(review): presumably the alpha/beta factors left over from a decomposed addmm — confirm.
    %int1_14022 = torch.constant.int 1
    %10125 = torch.aten.mul.Scalar %10124, %int1_14022 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_14023 = torch.constant.int 1
    %10126 = torch.aten.mul.Scalar %10121, %int1_14023 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_14024 = torch.constant.int 1
    %10127 = torch.aten.add.Tensor %10125, %10126, %int1_14024 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    // Cast back to f16 (dtype code 5).
    %int5_14025 = torch.constant.int 5
    %10128 = torch.prims.convert_element_type %10127, %int5_14025 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1); the type is unchanged.
    %int0_14026 = torch.constant.int 0
    %int0_14027 = torch.constant.int 0
    %int9223372036854775807_14028 = torch.constant.int 9223372036854775807
    %int1_14029 = torch.constant.int 1
    %10129 = torch.aten.slice.Tensor %10128, %int0_14026, %int0_14027, %int9223372036854775807_14028, %int1_14029 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a size-1 axis at position 1: [1,9216] -> [1,1,9216].
    %int1_14030 = torch.constant.int 1
    %10130 = torch.aten.unsqueeze %10129, %int1_14030 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Full-range slice on dim 2; the type is unchanged.
    %int2_14031 = torch.constant.int 2
    %int0_14032 = torch.constant.int 0
    %int9223372036854775807_14033 = torch.constant.int 9223372036854775807
    %int1_14034 = torch.constant.int 1
    %10131 = torch.aten.slice.Tensor %10130, %int2_14031, %int0_14032, %int9223372036854775807_14033, %int1_14034 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // %10132: last-axis elements 0..3072; added after the normalization below.
    %int-1_14035 = torch.constant.int -1
    %int0_14036 = torch.constant.int 0
    %int3072_14037 = torch.constant.int 3072
    %int1_14038 = torch.constant.int 1
    %10132 = torch.aten.slice.Tensor %10131, %int-1_14035, %int0_14036, %int3072_14037, %int1_14038 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %10133: last-axis elements 3072..6144; becomes the multiplicative factor via %10135.
    %int-1_14039 = torch.constant.int -1
    %int3072_14040 = torch.constant.int 3072
    %int6144_14041 = torch.constant.int 6144
    %int1_14042 = torch.constant.int 1
    %10133 = torch.aten.slice.Tensor %10131, %int-1_14039, %int3072_14040, %int6144_14041, %int1_14042 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %10134: last-axis elements 6144..9216; not used within this chunk.
    // NOTE(review): presumably this block's gate, analogous to %10009 in the previous block — confirm.
    %int-1_14043 = torch.constant.int -1
    %int6144_14044 = torch.constant.int 6144
    %int9216_14045 = torch.constant.int 9216
    %int1_14046 = torch.constant.int 1
    %10134 = torch.aten.slice.Tensor %10131, %int-1_14043, %int6144_14044, %int9216_14045, %int1_14046 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %10135 = %10133 + 1.
    %int1_14047 = torch.constant.int 1
    %int1_14048 = torch.constant.int 1
    %10135 = torch.aten.add.Scalar %10133, %int1_14047, %int1_14048 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %10116 over its last axis (no learned weight or bias is loaded here), then modulate:
    //   %10144 = %10135 * f16( (x - mean) * rsqrt(var + eps) ) + %10132
    // Upcast to f32 (dtype code 6) for the statistics.
    %int6_14049 = torch.constant.int 6
    %10136 = torch.prims.convert_element_type %10116, %int6_14049 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    // Variance and mean over dim 2 with correction 0 and keepdim = true -> two [1,4608,1] tensors.
    // result0 is used as the variance and result1 as the mean below.
    %int2_14050 = torch.constant.int 2
    %10137 = torch.prim.ListConstruct %int2_14050 : (!torch.int) -> !torch.list<int>
    %int0_14051 = torch.constant.int 0
    %true_14052 = torch.constant.bool true
    %result0_14053, %result1_14054 = torch.aten.var_mean.correction %10136, %10137, %int0_14051, %true_14052 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // rsqrt(var + eps), eps approximately 1e-6.
    %float9.999990e-07_14055 = torch.constant.float 9.9999999999999995E-7
    %int1_14056 = torch.constant.int 1
    %10138 = torch.aten.add.Scalar %result0_14053, %float9.999990e-07_14055, %int1_14056 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %10139 = torch.aten.rsqrt %10138 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The mean is subtracted from the f16 tensor %10116 (not the f32 copy %10136); the result is typed f32.
    %int1_14057 = torch.constant.int 1
    %10140 = torch.aten.sub.Tensor %10116, %result1_14054, %int1_14057 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %10141 = torch.aten.mul.Tensor %10140, %10139 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    // Cast back to f16 (dtype code 5).
    %int5_14058 = torch.constant.int 5
    %10142 = torch.prims.convert_element_type %10141, %int5_14058 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Apply the modulation: multiply by %10135 and add %10132 (both [1,1,3072], broadcast over rows).
    %10143 = torch.aten.mul.Tensor %10135, %10142 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_14059 = torch.constant.int 1
    %10144 = torch.aten.add.Tensor %10143, %10132, %int1_14059 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Linear layer using the single_blocks.28.linear1 parameters, computed in f32:
    //   %10157 = f16( f32(%10146) @ f32(weight^T) + f32(bias) )   ([4608,3072] -> [4608,21504])
    // Drop the leading size-1 axis of %10144 so the input is a 2-D [4608,3072] matrix.
    %int4608_14060 = torch.constant.int 4608
    %int3072_14061 = torch.constant.int 3072
    %10145 = torch.prim.ListConstruct %int4608_14060, %int3072_14061 : (!torch.int, !torch.int) -> !torch.list<int>
    %10146 = torch.aten.view %10144, %10145 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // Load the [21504,3072] weight and transpose it to [3072,21504].
    %__auto.sampler.single_blocks.28.linear1.weight = util.global.load @__auto.sampler.single_blocks.28.linear1.weight : tensor<21504x3072xf16>
    %10147 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.28.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_14062 = torch.constant.int 0
    %int1_14063 = torch.constant.int 1
    %10148 = torch.aten.transpose.int %10147, %int0_14062, %int1_14063 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    // Load the [21504] bias.
    %__auto.sampler.single_blocks.28.linear1.bias = util.global.load @__auto.sampler.single_blocks.28.linear1.bias : tensor<21504xf16>
    %10149 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.28.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Upcast bias, input and weight to f32 (dtype code 6) before the matmul.
    %int6_14064 = torch.constant.int 6
    %10150 = torch.prims.convert_element_type %10149, %int6_14064 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_14065 = torch.constant.int 6
    %10151 = torch.prims.convert_element_type %10146, %int6_14065 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_14066 = torch.constant.int 6
    %10152 = torch.prims.convert_element_type %10148, %int6_14066 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %10153 = torch.aten.mm %10151, %10152 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Both operands are multiplied by the integer 1 before being added.
    // NOTE(review): presumably the alpha/beta factors left over from a decomposed addmm — confirm.
    %int1_14067 = torch.constant.int 1
    %10154 = torch.aten.mul.Scalar %10153, %int1_14067 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_14068 = torch.constant.int 1
    %10155 = torch.aten.mul.Scalar %10150, %int1_14068 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_14069 = torch.constant.int 1
    %10156 = torch.aten.add.Tensor %10154, %10155, %int1_14069 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    // Cast back to f16 (dtype code 5) and restore the leading size-1 axis.
    %int5_14070 = torch.constant.int 5
    %10157 = torch.prims.convert_element_type %10156, %int5_14070 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_14071 = torch.constant.int 1
    %int4608_14072 = torch.constant.int 4608
    %int21504_14073 = torch.constant.int 21504
    %10158 = torch.prim.ListConstruct %int1_14071, %int4608_14072, %int21504_14073 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10159 = torch.aten.view %10157, %10158 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the [1,4608,21504] linear1 output %10159 along its last axis and unpack the first part.
    // %10160: elements 0..9216 (9216 wide).
    %int-1_14074 = torch.constant.int -1
    %int0_14075 = torch.constant.int 0
    %int9216_14076 = torch.constant.int 9216
    %int1_14077 = torch.constant.int 1
    %10160 = torch.aten.slice.Tensor %10159, %int-1_14074, %int0_14075, %int9216_14076, %int1_14077 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    // %10161: elements 9216..21504 (12288 wide); not used within this chunk.
    %int-1_14078 = torch.constant.int -1
    %int9216_14079 = torch.constant.int 9216
    %int21504_14080 = torch.constant.int 21504
    %int1_14081 = torch.constant.int 1
    %10161 = torch.aten.slice.Tensor %10159, %int-1_14078, %int9216_14079, %int21504_14080, %int1_14081 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View the 9216-wide part as [1, 4608, 3, 24, 128] (3 * 24 * 128 = 9216).
    %int1_14082 = torch.constant.int 1
    %int4608_14083 = torch.constant.int 4608
    %int3_14084 = torch.constant.int 3
    %int24_14085 = torch.constant.int 24
    %int128_14086 = torch.constant.int 128
    %10162 = torch.prim.ListConstruct %int1_14082, %int4608_14083, %int3_14084, %int24_14085, %int128_14086 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10163 = torch.aten.view %10160, %10162 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2, 0, 3, 1, 4) so the size-3 axis becomes the leading axis: [3,1,24,4608,128].
    %int2_14087 = torch.constant.int 2
    %int0_14088 = torch.constant.int 0
    %int3_14089 = torch.constant.int 3
    %int1_14090 = torch.constant.int 1
    %int4_14091 = torch.constant.int 4
    %10164 = torch.prim.ListConstruct %int2_14087, %int0_14088, %int3_14089, %int1_14090, %int4_14091 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10165 = torch.aten.permute %10163, %10164 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Take indices 0, 1 and 2 along the leading axis, giving three [1,24,4608,128] tensors.
    // NOTE(review): presumably query / key / value — %10166 and %10167 are scaled by the
    // query_norm / key_norm parameters below; %10168 is not used within this chunk.
    %int0_14092 = torch.constant.int 0
    %int0_14093 = torch.constant.int 0
    %10166 = torch.aten.select.int %10165, %int0_14092, %int0_14093 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_14094 = torch.constant.int 0
    %int1_14095 = torch.constant.int 1
    %10167 = torch.aten.select.int %10165, %int0_14094, %int1_14095 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_14096 = torch.constant.int 0
    %int2_14097 = torch.constant.int 2
    %10168 = torch.aten.select.int %10165, %int0_14096, %int2_14097 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // RMS-style normalization of %10166 over its last axis, computed in f32:
    //   %10178 = f16( x * rsqrt(mean(x^2, dim=-1, keepdim) + eps) ) * query_norm.scale
    // Upcast to f32 (dtype code 6 yields f32 per the result type).
    %int6_14098 = torch.constant.int 6
    %10169 = torch.prims.convert_element_type %10166, %int6_14098 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Element-wise square.
    %int2_14099 = torch.constant.int 2
    %10170 = torch.aten.pow.Tensor_Scalar %10169, %int2_14099 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean of squares over the last axis, keeping it as size 1.
    %int-1_14100 = torch.constant.int -1
    %10171 = torch.prim.ListConstruct %int-1_14100 : (!torch.int) -> !torch.list<int>
    %true_14101 = torch.constant.bool true
    %none_14102 = torch.constant.none
    %10172 = torch.aten.mean.dim %10170, %10171, %true_14101, %none_14102 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // Add the epsilon (approximately 1e-6) before the reciprocal square root.
    %float9.999990e-07_14103 = torch.constant.float 9.9999999999999995E-7
    %int1_14104 = torch.constant.int 1
    %10173 = torch.aten.add.Scalar %10172, %float9.999990e-07_14103, %int1_14104 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %10174 = torch.aten.rsqrt %10173 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    // Scale the f32 input by the broadcast factor, then cast back to f16 (dtype code 5).
    %10175 = torch.aten.mul.Tensor %10169, %10174 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_14105 = torch.constant.int 5
    %10176 = torch.prims.convert_element_type %10175, %int5_14105 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Load the 128-element f16 scale parameter and multiply it in (broadcast over the last axis).
    %__auto.sampler.single_blocks.28.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.28.norm.query_norm.scale : tensor<128xf16>
    %10177 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.28.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %10178 = torch.aten.mul.Tensor %10176, %10177 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // RMS-style normalization of %10167 over its last axis, computed in f32:
    //   %10188 = f16( x * rsqrt(mean(x^2, dim=-1, keepdim) + eps) ) * key_norm.scale
    // Same op sequence as the query_norm computation directly above, applied to %10167.
    // Upcast to f32 (dtype code 6 yields f32 per the result type).
    %int6_14106 = torch.constant.int 6
    %10179 = torch.prims.convert_element_type %10167, %int6_14106 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Element-wise square.
    %int2_14107 = torch.constant.int 2
    %10180 = torch.aten.pow.Tensor_Scalar %10179, %int2_14107 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean of squares over the last axis, keeping it as size 1.
    %int-1_14108 = torch.constant.int -1
    %10181 = torch.prim.ListConstruct %int-1_14108 : (!torch.int) -> !torch.list<int>
    %true_14109 = torch.constant.bool true
    %none_14110 = torch.constant.none
    %10182 = torch.aten.mean.dim %10180, %10181, %true_14109, %none_14110 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // Add the epsilon (approximately 1e-6) before the reciprocal square root.
    %float9.999990e-07_14111 = torch.constant.float 9.9999999999999995E-7
    %int1_14112 = torch.constant.int 1
    %10183 = torch.aten.add.Scalar %10182, %float9.999990e-07_14111, %int1_14112 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %10184 = torch.aten.rsqrt %10183 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    // Scale the f32 input by the broadcast factor, then cast back to f16 (dtype code 5).
    %10185 = torch.aten.mul.Tensor %10179, %10184 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_14113 = torch.constant.int 5
    %10186 = torch.prims.convert_element_type %10185, %int5_14113 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Load the 128-element f16 scale parameter and multiply it in (broadcast over the last axis).
    %__auto.sampler.single_blocks.28.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.28.norm.key_norm.scale : tensor<128xf16>
    %10187 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.28.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %10188 = torch.aten.mul.Tensor %10186, %10187 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Prepare two normalized tensors for the pairwise rotation that follows:
    // cast each to f32 and view the last axis (128) as 64 pairs, shape [1,24,4608,64,1,2].
    // %10188 is the key-norm output; %10178 is presumably the matching query-norm output — confirm.
    // The first two casts are f16 -> f16, i.e. the element type does not change.
    %int5_14114 = torch.constant.int 5
    %10189 = torch.prims.convert_element_type %10178, %int5_14114 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_14115 = torch.constant.int 5
    %10190 = torch.prims.convert_element_type %10188, %int5_14115 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // First tensor: f16 -> f32, then view as [1,24,4608,64,1,2].
    %int6_14116 = torch.constant.int 6
    %10191 = torch.prims.convert_element_type %10189, %int6_14116 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_14117 = torch.constant.int 1
    %int24_14118 = torch.constant.int 24
    %int4608_14119 = torch.constant.int 4608
    %int64_14120 = torch.constant.int 64
    %int1_14121 = torch.constant.int 1
    %int2_14122 = torch.constant.int 2
    %10192 = torch.prim.ListConstruct %int1_14117, %int24_14118, %int4608_14119, %int64_14120, %int1_14121, %int2_14122 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10193 = torch.aten.view %10191, %10192 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Second tensor (key-norm output): same cast and view.
    %int6_14123 = torch.constant.int 6
    %10194 = torch.prims.convert_element_type %10190, %int6_14123 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_14124 = torch.constant.int 1
    %int24_14125 = torch.constant.int 24
    %int4608_14126 = torch.constant.int 4608
    %int64_14127 = torch.constant.int 64
    %int1_14128 = torch.constant.int 1
    %int2_14129 = torch.constant.int 2
    %10195 = torch.prim.ListConstruct %int1_14124, %int24_14125, %int4608_14126, %int64_14127, %int1_14128, %int2_14129 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10196 = torch.aten.view %10194, %10195 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Pairwise rotation of the two reshaped tensors, using slices of %211 along dim 5:
    //   out = %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]
    // The [1,1,4608,64,2] table slices broadcast against the [1,24,4608,64,1] input
    // slices, giving [1,24,4608,64,2].
    // NOTE(review): %211 is defined outside this chunk; presumably a precomputed rotary position table — confirm.
    //
    // First tensor (%10193) -> %10203.
    %int5_14130 = torch.constant.int 5
    %int0_14131 = torch.constant.int 0
    %10197 = torch.aten.select.int %211, %int5_14130, %int0_14131 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_14132 = torch.constant.int 5
    %int0_14133 = torch.constant.int 0
    %10198 = torch.aten.select.int %10193, %int5_14132, %int0_14133 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10199 = torch.aten.mul.Tensor %10197, %10198 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_14134 = torch.constant.int 5
    %int1_14135 = torch.constant.int 1
    %10200 = torch.aten.select.int %211, %int5_14134, %int1_14135 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_14136 = torch.constant.int 5
    %int1_14137 = torch.constant.int 1
    %10201 = torch.aten.select.int %10193, %int5_14136, %int1_14137 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10202 = torch.aten.mul.Tensor %10200, %10201 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two products (alpha = 1).
    %int1_14138 = torch.constant.int 1
    %10203 = torch.aten.add.Tensor %10199, %10202, %int1_14138 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second tensor (%10196) -> %10210. The same two slices of %211 are selected again
    // (%10204 / %10207 repeat %10197 / %10200).
    %int5_14139 = torch.constant.int 5
    %int0_14140 = torch.constant.int 0
    %10204 = torch.aten.select.int %211, %int5_14139, %int0_14140 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_14141 = torch.constant.int 5
    %int0_14142 = torch.constant.int 0
    %10205 = torch.aten.select.int %10196, %int5_14141, %int0_14142 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10206 = torch.aten.mul.Tensor %10204, %10205 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_14143 = torch.constant.int 5
    %int1_14144 = torch.constant.int 1
    %10207 = torch.aten.select.int %211, %int5_14143, %int1_14144 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_14145 = torch.constant.int 5
    %int1_14146 = torch.constant.int 1
    %10208 = torch.aten.select.int %10196, %int5_14145, %int1_14146 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10209 = torch.aten.mul.Tensor %10207, %10208 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_14147 = torch.constant.int 1
    %10210 = torch.aten.add.Tensor %10206, %10209, %int1_14147 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Collapse the trailing (64, 2) pair axes of both rotated tensors back into a
    // single axis of 128 and cast the results from f32 to f16.
    // %10203 -> %10213
    %int1_14148 = torch.constant.int 1
    %int24_14149 = torch.constant.int 24
    %int4608_14150 = torch.constant.int 4608
    %int128_14151 = torch.constant.int 128
    %10211 = torch.prim.ListConstruct %int1_14148, %int24_14149, %int4608_14150, %int128_14151 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10212 = torch.aten.view %10203, %10211 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_14152 = torch.constant.int 5
    %10213 = torch.prims.convert_element_type %10212, %int5_14152 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // %10210 -> %10216
    %int1_14153 = torch.constant.int 1
    %int24_14154 = torch.constant.int 24
    %int4608_14155 = torch.constant.int 4608
    %int128_14156 = torch.constant.int 128
    %10214 = torch.prim.ListConstruct %int1_14153, %int24_14154, %int4608_14155, %int128_14156 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10215 = torch.aten.view %10210, %10214 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_14157 = torch.constant.int 5
    %10216 = torch.prims.convert_element_type %10215, %int5_14157 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled-dot-product attention over three [1,24,4608,128] f16 operands, followed by
    // merging the 24 x 128 head axes into a single 3072-wide feature axis.
    // NOTE(review): operand order presumably follows the ATen signature
    // (query, key, value, dropout_p, is_causal, attn_mask, scale) — confirm. Under that
    // reading: dropout 0.0, non-causal, no mask, default scale. %10168 is defined above this chunk.
    %float0.000000e00_14158 = torch.constant.float 0.000000e+00
    %false_14159 = torch.constant.bool false
    %none_14160 = torch.constant.none
    %none_14161 = torch.constant.none
    // The op returns two results; only result #0 ([1,24,4608,128] f16) is used in the lines below.
    %10217:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%10213, %10216, %10168, %float0.000000e00_14158, %false_14159, %none_14160, %none_14161) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Permute (0, 2, 1, 3): [1,24,4608,128] -> [1,4608,24,128].
    %int0_14162 = torch.constant.int 0
    %int2_14163 = torch.constant.int 2
    %int1_14164 = torch.constant.int 1
    %int3_14165 = torch.constant.int 3
    %10218 = torch.prim.ListConstruct %int0_14162, %int2_14163, %int1_14164, %int3_14165 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10219 = torch.aten.permute %10217#0, %10218 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // View as [1,4608,3072] (24 * 128 = 3072).
    %int1_14166 = torch.constant.int 1
    %int4608_14167 = torch.constant.int 4608
    %int3072_14168 = torch.constant.int 3072
    %10220 = torch.prim.ListConstruct %int1_14166, %int4608_14167, %int3072_14168 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10221 = torch.aten.view %10219, %10220 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Output stage of single block 28:
    //   1. tanh-approximated GELU on %10161 ([1,4608,12288]),
    //   2. concatenate the attention output (%10221, 3072 wide) with it along dim 2 -> 15360 wide,
    //   3. apply `linear2` (weight [3072,15360], bias [3072]) as an f32 matmul,
    //   4. multiply by %10134 and add the result to %10116.
    // NOTE(review): %10161, %10134 and %10116 are defined above this chunk; presumably the
    // MLP branch, the modulation gate and the block's residual input respectively — confirm.
    %str_14169 = torch.constant.str "tanh"
    %10222 = torch.aten.gelu %10161, %str_14169 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %10223 = torch.prim.ListConstruct %10221, %10222 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_14170 = torch.constant.int 2
    %10224 = torch.aten.cat %10223, %int2_14170 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Drop the leading unit axis so the input is a 2-D [4608,15360] matrix for mm.
    %int4608_14171 = torch.constant.int 4608
    %int15360_14172 = torch.constant.int 15360
    %10225 = torch.prim.ListConstruct %int4608_14171, %int15360_14172 : (!torch.int, !torch.int) -> !torch.list<int>
    %10226 = torch.aten.view %10224, %10225 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // Load the weight and transpose it to [15360,3072] so that mm computes x * W^T.
    %__auto.sampler.single_blocks.28.linear2.weight = util.global.load @__auto.sampler.single_blocks.28.linear2.weight : tensor<3072x15360xf16>
    %10227 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.28.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_14173 = torch.constant.int 0
    %int1_14174 = torch.constant.int 1
    %10228 = torch.aten.transpose.int %10227, %int0_14173, %int1_14174 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.28.linear2.bias = util.global.load @__auto.sampler.single_blocks.28.linear2.bias : tensor<3072xf16>
    %10229 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.28.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Bias, input and weight are all cast to f32 before the matmul.
    %int6_14175 = torch.constant.int 6
    %10230 = torch.prims.convert_element_type %10229, %int6_14175 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_14176 = torch.constant.int 6
    %10231 = torch.prims.convert_element_type %10226, %int6_14176 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_14177 = torch.constant.int 6
    %10232 = torch.prims.convert_element_type %10228, %int6_14177 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %10233 = torch.aten.mm %10231, %10232 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Both operands are multiplied by the integer constant 1 (values unchanged);
    // presumably the alpha/beta factors of a decomposed addmm — confirm.
    %int1_14178 = torch.constant.int 1
    %10234 = torch.aten.mul.Scalar %10233, %int1_14178 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_14179 = torch.constant.int 1
    %10235 = torch.aten.mul.Scalar %10230, %int1_14179 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // matmul result + bias (bias broadcast over rows), then back to f16.
    %int1_14180 = torch.constant.int 1
    %10236 = torch.aten.add.Tensor %10234, %10235, %int1_14180 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_14181 = torch.constant.int 5
    %10237 = torch.prims.convert_element_type %10236, %int5_14181 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    // Restore the leading unit axis: [4608,3072] -> [1,4608,3072].
    %int1_14182 = torch.constant.int 1
    %int4608_14183 = torch.constant.int 4608
    %int3072_14184 = torch.constant.int 3072
    %10238 = torch.prim.ListConstruct %int1_14182, %int4608_14183, %int3072_14184 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10239 = torch.aten.view %10237, %10238 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // %10241 = %10116 + %10134 * linear2_output ([1,1,3072] factor broadcast over the 4608 axis).
    %10240 = torch.aten.mul.Tensor %10134, %10239 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_14185 = torch.constant.int 1
    %10241 = torch.aten.add.Tensor %10116, %10240, %int1_14185 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Modulation parameters for single block 29:
    //   m = linear(silu(%119)) with `modulation.lin` weight [9216,3072] and bias [9216],
    // then m is split along the last axis into three 3072-wide pieces:
    //   %10257 = m[..., 0:3072], %10258 = m[..., 3072:6144], %10259 = m[..., 6144:9216],
    // and %10260 = %10258 + 1.
    // NOTE(review): %119 is defined outside this chunk; presumably the conditioning vector — confirm.
    %10242 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the weight and transpose it to [3072,9216] so that mm computes x * W^T.
    %__auto.sampler.single_blocks.29.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.29.modulation.lin.weight : tensor<9216x3072xf16>
    %10243 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.29.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_14186 = torch.constant.int 0
    %int1_14187 = torch.constant.int 1
    %10244 = torch.aten.transpose.int %10243, %int0_14186, %int1_14187 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.29.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.29.modulation.lin.bias : tensor<9216xf16>
    %10245 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.29.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Bias, input and weight are cast to f32; the matmul and bias add run in f32.
    %int6_14188 = torch.constant.int 6
    %10246 = torch.prims.convert_element_type %10245, %int6_14188 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_14189 = torch.constant.int 6
    %10247 = torch.prims.convert_element_type %10242, %int6_14189 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_14190 = torch.constant.int 6
    %10248 = torch.prims.convert_element_type %10244, %int6_14190 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %10249 = torch.aten.mm %10247, %10248 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Multiplications by the integer constant 1 leave the values unchanged.
    %int1_14191 = torch.constant.int 1
    %10250 = torch.aten.mul.Scalar %10249, %int1_14191 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_14192 = torch.constant.int 1
    %10251 = torch.aten.mul.Scalar %10246, %int1_14192 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_14193 = torch.constant.int 1
    %10252 = torch.aten.add.Tensor %10250, %10251, %int1_14193 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_14194 = torch.constant.int 5
    %10253 = torch.prims.convert_element_type %10252, %int5_14194 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Slice dim 0 over [0, INT64_MAX) with step 1: the result type shows the full extent is kept.
    %int0_14195 = torch.constant.int 0
    %int0_14196 = torch.constant.int 0
    %int9223372036854775807_14197 = torch.constant.int 9223372036854775807
    %int1_14198 = torch.constant.int 1
    %10254 = torch.aten.slice.Tensor %10253, %int0_14195, %int0_14196, %int9223372036854775807_14197, %int1_14198 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a unit axis at position 1: [1,9216] -> [1,1,9216].
    %int1_14199 = torch.constant.int 1
    %10255 = torch.aten.unsqueeze %10254, %int1_14199 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Slice dim 2 over [0, INT64_MAX) with step 1: again the full extent is kept.
    %int2_14200 = torch.constant.int 2
    %int0_14201 = torch.constant.int 0
    %int9223372036854775807_14202 = torch.constant.int 9223372036854775807
    %int1_14203 = torch.constant.int 1
    %10256 = torch.aten.slice.Tensor %10255, %int2_14200, %int0_14201, %int9223372036854775807_14202, %int1_14203 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Piece 0: last-axis range [0, 3072).
    %int-1_14204 = torch.constant.int -1
    %int0_14205 = torch.constant.int 0
    %int3072_14206 = torch.constant.int 3072
    %int1_14207 = torch.constant.int 1
    %10257 = torch.aten.slice.Tensor %10256, %int-1_14204, %int0_14205, %int3072_14206, %int1_14207 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Piece 1: last-axis range [3072, 6144).
    %int-1_14208 = torch.constant.int -1
    %int3072_14209 = torch.constant.int 3072
    %int6144_14210 = torch.constant.int 6144
    %int1_14211 = torch.constant.int 1
    %10258 = torch.aten.slice.Tensor %10256, %int-1_14208, %int3072_14209, %int6144_14210, %int1_14211 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Piece 2: last-axis range [6144, 9216).
    // NOTE(review): %10259 is not consumed within this chunk; presumably the gate applied after linear2 of block 29 — confirm.
    %int-1_14212 = torch.constant.int -1
    %int6144_14213 = torch.constant.int 6144
    %int9216_14214 = torch.constant.int 9216
    %int1_14215 = torch.constant.int 1
    %10259 = torch.aten.slice.Tensor %10256, %int-1_14212, %int6144_14213, %int9216_14214, %int1_14215 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %10260 = %10258 + 1 (add.Scalar with other = 1, alpha = 1).
    %int1_14216 = torch.constant.int 1
    %int1_14217 = torch.constant.int 1
    %10260 = torch.aten.add.Scalar %10258, %int1_14216, %int1_14217 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Layer normalization of %10241 over dim 2 (size 3072) without learned affine
    // parameters, followed by the modulation
    //   %10269 = %10260 * normalized + %10257
    // where normalized = (x - mean) * rsqrt(var + eps), var uses correction = 0, eps is about 1e-6.
    // The statistics are computed on an f32 copy of %10241.
    %int6_14218 = torch.constant.int 6
    %10261 = torch.prims.convert_element_type %10241, %int6_14218 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    // var_mean over dim 2, correction = 0, keepdim = true: result0 is the variance, result1 the mean.
    %int2_14219 = torch.constant.int 2
    %10262 = torch.prim.ListConstruct %int2_14219 : (!torch.int) -> !torch.list<int>
    %int0_14220 = torch.constant.int 0
    %true_14221 = torch.constant.bool true
    %result0_14222, %result1_14223 = torch.aten.var_mean.correction %10261, %10262, %int0_14220, %true_14221 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // rsqrt(var + eps)
    %float9.999990e-07_14224 = torch.constant.float 9.9999999999999995E-7
    %int1_14225 = torch.constant.int 1
    %10263 = torch.aten.add.Scalar %result0_14222, %float9.999990e-07_14224, %int1_14225 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %10264 = torch.aten.rsqrt %10263 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the f16 tensor %10241 (not the f32 copy %10261) and the f32 mean;
    // the result type is f32.
    %int1_14226 = torch.constant.int 1
    %10265 = torch.aten.sub.Tensor %10241, %result1_14223, %int1_14226 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %10266 = torch.aten.mul.Tensor %10265, %10264 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    // Back to f16 before applying the modulation terms.
    %int5_14227 = torch.constant.int 5
    %10267 = torch.prims.convert_element_type %10266, %int5_14227 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Multiply by %10260 and add %10257; both [1,1,3072] operands broadcast over the 4608 axis.
    %10268 = torch.aten.mul.Tensor %10260, %10267 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_14228 = torch.constant.int 1
    %10269 = torch.aten.add.Tensor %10268, %10257, %int1_14228 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // `linear1` of single block 29: [4608,3072] x W^T (+ bias) -> [4608,21504], computed in f32
    // and cast back to f16. The 21504-wide output is then split along the last axis into
    //   %10285 = [..., 0:9216]      (9216 = 3 * 24 * 128)
    //   %10286 = [..., 9216:21504]  (12288 wide)
    // NOTE(review): %10286 is not consumed within this chunk; presumably the MLP branch fed to GELU later — confirm.
    // Drop the leading unit axis so the input is a 2-D matrix for mm.
    %int4608_14229 = torch.constant.int 4608
    %int3072_14230 = torch.constant.int 3072
    %10270 = torch.prim.ListConstruct %int4608_14229, %int3072_14230 : (!torch.int, !torch.int) -> !torch.list<int>
    %10271 = torch.aten.view %10269, %10270 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // Load the weight and transpose it to [3072,21504].
    %__auto.sampler.single_blocks.29.linear1.weight = util.global.load @__auto.sampler.single_blocks.29.linear1.weight : tensor<21504x3072xf16>
    %10272 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.29.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_14231 = torch.constant.int 0
    %int1_14232 = torch.constant.int 1
    %10273 = torch.aten.transpose.int %10272, %int0_14231, %int1_14232 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.29.linear1.bias = util.global.load @__auto.sampler.single_blocks.29.linear1.bias : tensor<21504xf16>
    %10274 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.29.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Bias, input and weight are cast to f32 before the matmul.
    %int6_14233 = torch.constant.int 6
    %10275 = torch.prims.convert_element_type %10274, %int6_14233 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_14234 = torch.constant.int 6
    %10276 = torch.prims.convert_element_type %10271, %int6_14234 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_14235 = torch.constant.int 6
    %10277 = torch.prims.convert_element_type %10273, %int6_14235 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %10278 = torch.aten.mm %10276, %10277 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Multiplications by the integer constant 1 leave the values unchanged.
    %int1_14236 = torch.constant.int 1
    %10279 = torch.aten.mul.Scalar %10278, %int1_14236 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_14237 = torch.constant.int 1
    %10280 = torch.aten.mul.Scalar %10275, %int1_14237 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    // matmul result + bias, then back to f16.
    %int1_14238 = torch.constant.int 1
    %10281 = torch.aten.add.Tensor %10279, %10280, %int1_14238 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_14239 = torch.constant.int 5
    %10282 = torch.prims.convert_element_type %10281, %int5_14239 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    // Restore the leading unit axis: [4608,21504] -> [1,4608,21504].
    %int1_14240 = torch.constant.int 1
    %int4608_14241 = torch.constant.int 4608
    %int21504_14242 = torch.constant.int 21504
    %10283 = torch.prim.ListConstruct %int1_14240, %int4608_14241, %int21504_14242 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10284 = torch.aten.view %10282, %10283 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // First 9216 features of the last axis.
    %int-1_14243 = torch.constant.int -1
    %int0_14244 = torch.constant.int 0
    %int9216_14245 = torch.constant.int 9216
    %int1_14246 = torch.constant.int 1
    %10285 = torch.aten.slice.Tensor %10284, %int-1_14243, %int0_14244, %int9216_14245, %int1_14246 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    // Remaining 12288 features of the last axis.
    %int-1_14247 = torch.constant.int -1
    %int9216_14248 = torch.constant.int 9216
    %int21504_14249 = torch.constant.int 21504
    %int1_14250 = torch.constant.int 1
    %10286 = torch.aten.slice.Tensor %10284, %int-1_14247, %int9216_14248, %int21504_14249, %int1_14250 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // Split the 9216-wide slice %10285 into three [1,24,4608,128] tensors:
    //   view [1,4608,9216] -> [1,4608,3,24,128], permute (2,0,3,1,4) -> [3,1,24,4608,128],
    //   then select indices 0, 1 and 2 along dim 0.
    // In the lines that follow, index 0 is scaled by `query_norm.scale` and index 1 by `key_norm.scale`.
    // NOTE(review): index 2 (%10293) is not consumed within this chunk; presumably the value tensor — confirm.
    %int1_14251 = torch.constant.int 1
    %int4608_14252 = torch.constant.int 4608
    %int3_14253 = torch.constant.int 3
    %int24_14254 = torch.constant.int 24
    %int128_14255 = torch.constant.int 128
    %10287 = torch.prim.ListConstruct %int1_14251, %int4608_14252, %int3_14253, %int24_14254, %int128_14255 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10288 = torch.aten.view %10285, %10287 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permutation (2, 0, 3, 1, 4) moves the size-3 axis to the front.
    %int2_14256 = torch.constant.int 2
    %int0_14257 = torch.constant.int 0
    %int3_14258 = torch.constant.int 3
    %int1_14259 = torch.constant.int 1
    %int4_14260 = torch.constant.int 4
    %10289 = torch.prim.ListConstruct %int2_14256, %int0_14257, %int3_14258, %int1_14259, %int4_14260 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10290 = torch.aten.permute %10288, %10289 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0 along dim 0.
    %int0_14261 = torch.constant.int 0
    %int0_14262 = torch.constant.int 0
    %10291 = torch.aten.select.int %10290, %int0_14261, %int0_14262 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 1 along dim 0.
    %int0_14263 = torch.constant.int 0
    %int1_14264 = torch.constant.int 1
    %10292 = torch.aten.select.int %10290, %int0_14263, %int1_14264 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 2 along dim 0.
    %int0_14265 = torch.constant.int 0
    %int2_14266 = torch.constant.int 2
    %10293 = torch.aten.select.int %10290, %int0_14265, %int2_14266 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // RMS-style normalization of %10291 over the last axis (size 128), evaluated in f32:
    //   y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps)
    // followed by a cast to f16 and an elementwise multiply with the
    // `single_blocks.29.norm.query_norm.scale` parameter.
    %int6_14267 = torch.constant.int 6
    %10294 = torch.prims.convert_element_type %10291, %int6_14267 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // x^2
    %int2_14268 = torch.constant.int 2
    %10295 = torch.aten.pow.Tensor_Scalar %10294, %int2_14268 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // mean over dim -1 with keepdim=true -> [1,24,4608,1]
    %int-1_14269 = torch.constant.int -1
    %10296 = torch.prim.ListConstruct %int-1_14269 : (!torch.int) -> !torch.list<int>
    %true_14270 = torch.constant.bool true
    %none_14271 = torch.constant.none
    %10297 = torch.aten.mean.dim %10295, %10296, %true_14270, %none_14271 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // + eps (about 1e-6), then reciprocal square root
    %float9.999990e-07_14272 = torch.constant.float 9.9999999999999995E-7
    %int1_14273 = torch.constant.int 1
    %10298 = torch.aten.add.Scalar %10297, %float9.999990e-07_14272, %int1_14273 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %10299 = torch.aten.rsqrt %10298 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    // Scale the f32 input by the per-row factor, then cast back to f16.
    %10300 = torch.aten.mul.Tensor %10294, %10299 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_14274 = torch.constant.int 5
    %10301 = torch.prims.convert_element_type %10300, %int5_14274 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Load the learned per-channel scale and apply it.
    %__auto.sampler.single_blocks.29.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.29.norm.query_norm.scale : tensor<128xf16>
    %10302 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.29.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %10303 = torch.aten.mul.Tensor %10301, %10302 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // RMS-style normalization of %10292 over the last axis (size 128), evaluated in f32:
    //   y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps)
    // followed by a cast to f16 and an elementwise multiply with the
    // `single_blocks.29.norm.key_norm.scale` parameter.
    %int6_14275 = torch.constant.int 6
    %10304 = torch.prims.convert_element_type %10292, %int6_14275 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // x^2
    %int2_14276 = torch.constant.int 2
    %10305 = torch.aten.pow.Tensor_Scalar %10304, %int2_14276 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // mean over dim -1 with keepdim=true -> [1,24,4608,1]
    %int-1_14277 = torch.constant.int -1
    %10306 = torch.prim.ListConstruct %int-1_14277 : (!torch.int) -> !torch.list<int>
    %true_14278 = torch.constant.bool true
    %none_14279 = torch.constant.none
    %10307 = torch.aten.mean.dim %10305, %10306, %true_14278, %none_14279 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // + eps (about 1e-6), then reciprocal square root
    %float9.999990e-07_14280 = torch.constant.float 9.9999999999999995E-7
    %int1_14281 = torch.constant.int 1
    %10308 = torch.aten.add.Scalar %10307, %float9.999990e-07_14280, %int1_14281 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %10309 = torch.aten.rsqrt %10308 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    // Scale the f32 input by the per-row factor, then cast back to f16.
    %10310 = torch.aten.mul.Tensor %10304, %10309 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_14282 = torch.constant.int 5
    %10311 = torch.prims.convert_element_type %10310, %int5_14282 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Load the learned per-channel scale and apply it.
    %__auto.sampler.single_blocks.29.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.29.norm.key_norm.scale : tensor<128xf16>
    %10312 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.29.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %10313 = torch.aten.mul.Tensor %10311, %10312 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Prepare the normalized query (%10303) and key (%10313) of block 29 for the pairwise
    // rotation that follows: cast each to f32 and view the last axis (128) as 64 pairs,
    // shape [1,24,4608,64,1,2].
    // The first two casts are f16 -> f16, i.e. the element type does not change.
    %int5_14283 = torch.constant.int 5
    %10314 = torch.prims.convert_element_type %10303, %int5_14283 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_14284 = torch.constant.int 5
    %10315 = torch.prims.convert_element_type %10313, %int5_14284 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Query: f16 -> f32, then view as [1,24,4608,64,1,2].
    %int6_14285 = torch.constant.int 6
    %10316 = torch.prims.convert_element_type %10314, %int6_14285 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_14286 = torch.constant.int 1
    %int24_14287 = torch.constant.int 24
    %int4608_14288 = torch.constant.int 4608
    %int64_14289 = torch.constant.int 64
    %int1_14290 = torch.constant.int 1
    %int2_14291 = torch.constant.int 2
    %10317 = torch.prim.ListConstruct %int1_14286, %int24_14287, %int4608_14288, %int64_14289, %int1_14290, %int2_14291 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10318 = torch.aten.view %10316, %10317 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Key: same cast and view.
    %int6_14292 = torch.constant.int 6
    %10319 = torch.prims.convert_element_type %10315, %int6_14292 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_14293 = torch.constant.int 1
    %int24_14294 = torch.constant.int 24
    %int4608_14295 = torch.constant.int 4608
    %int64_14296 = torch.constant.int 64
    %int1_14297 = torch.constant.int 1
    %int2_14298 = torch.constant.int 2
    %10320 = torch.prim.ListConstruct %int1_14293, %int24_14294, %int4608_14295, %int64_14296, %int1_14297, %int2_14298 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10321 = torch.aten.view %10319, %10320 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Pairwise rotation of the reshaped query (%10318) and key (%10321), using slices of
    // %211 along dim 5:
    //   out = %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]
    // The [1,1,4608,64,2] table slices broadcast against the [1,24,4608,64,1] input
    // slices, giving [1,24,4608,64,2].
    // NOTE(review): %211 is defined outside this chunk; presumably a precomputed rotary position table — confirm.
    //
    // Query (%10318) -> %10328.
    %int5_14299 = torch.constant.int 5
    %int0_14300 = torch.constant.int 0
    %10322 = torch.aten.select.int %211, %int5_14299, %int0_14300 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_14301 = torch.constant.int 5
    %int0_14302 = torch.constant.int 0
    %10323 = torch.aten.select.int %10318, %int5_14301, %int0_14302 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10324 = torch.aten.mul.Tensor %10322, %10323 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_14303 = torch.constant.int 5
    %int1_14304 = torch.constant.int 1
    %10325 = torch.aten.select.int %211, %int5_14303, %int1_14304 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_14305 = torch.constant.int 5
    %int1_14306 = torch.constant.int 1
    %10326 = torch.aten.select.int %10318, %int5_14305, %int1_14306 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10327 = torch.aten.mul.Tensor %10325, %10326 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two products (alpha = 1).
    %int1_14307 = torch.constant.int 1
    %10328 = torch.aten.add.Tensor %10324, %10327, %int1_14307 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key (%10321) -> %10335. The same two slices of %211 are selected again
    // (%10329 / %10332 repeat %10322 / %10325).
    %int5_14308 = torch.constant.int 5
    %int0_14309 = torch.constant.int 0
    %10329 = torch.aten.select.int %211, %int5_14308, %int0_14309 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_14310 = torch.constant.int 5
    %int0_14311 = torch.constant.int 0
    %10330 = torch.aten.select.int %10321, %int5_14310, %int0_14311 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10331 = torch.aten.mul.Tensor %10329, %10330 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_14312 = torch.constant.int 5
    %int1_14313 = torch.constant.int 1
    %10332 = torch.aten.select.int %211, %int5_14312, %int1_14313 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_14314 = torch.constant.int 5
    %int1_14315 = torch.constant.int 1
    %10333 = torch.aten.select.int %10321, %int5_14314, %int1_14315 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10334 = torch.aten.mul.Tensor %10332, %10333 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_14316 = torch.constant.int 1
    %10335 = torch.aten.add.Tensor %10331, %10334, %int1_14316 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_14317 = torch.constant.int 1
    %int24_14318 = torch.constant.int 24
    %int4608_14319 = torch.constant.int 4608
    %int128_14320 = torch.constant.int 128
    %10336 = torch.prim.ListConstruct %int1_14317, %int24_14318, %int4608_14319, %int128_14320 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10337 = torch.aten.view %10328, %10336 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_14321 = torch.constant.int 5
    %10338 = torch.prims.convert_element_type %10337, %int5_14321 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_14322 = torch.constant.int 1
    %int24_14323 = torch.constant.int 24
    %int4608_14324 = torch.constant.int 4608
    %int128_14325 = torch.constant.int 128
    %10339 = torch.prim.ListConstruct %int1_14322, %int24_14323, %int4608_14324, %int128_14325 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10340 = torch.aten.view %10335, %10339 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_14326 = torch.constant.int 5
    %10341 = torch.prims.convert_element_type %10340, %int5_14326 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Attention stage whose output is later concatenated and fed to single_blocks.29.linear2.
    // The op receives %10338, %10341, %10293 in its first three operand positions, followed
    // by 0.0, false, none, none. NOTE(review): per the ATen schema these are presumably
    // (query, key, value, dropout_p, is_causal, attn_mask, scale) - confirm against the
    // PyTorch version used for export.
    %float0.000000e00_14327 = torch.constant.float 0.000000e+00
    %false_14328 = torch.constant.bool false
    %none_14329 = torch.constant.none
    %none_14330 = torch.constant.none
    // Two results: #0 is [1,24,4608,128] f16, #1 is [1,24,4608] f32. Only #0 is consumed in this section.
    %10342:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%10338, %10341, %10293, %float0.000000e00_14327, %false_14328, %none_14329, %none_14330) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2: [1,24,4608,128] -> [1,4608,24,128].
    %int0_14331 = torch.constant.int 0
    %int2_14332 = torch.constant.int 2
    %int1_14333 = torch.constant.int 1
    %int3_14334 = torch.constant.int 3
    %10343 = torch.prim.ListConstruct %int0_14331, %int2_14332, %int1_14333, %int3_14334 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10344 = torch.aten.permute %10342#0, %10343 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the trailing 24 x 128 dims into one dim of 3072: [1,4608,24,128] -> [1,4608,3072].
    %int1_14335 = torch.constant.int 1
    %int4608_14336 = torch.constant.int 4608
    %int3072_14337 = torch.constant.int 3072
    %10345 = torch.prim.ListConstruct %int1_14335, %int4608_14336, %int3072_14337 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10346 = torch.aten.view %10344, %10345 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Build the [4608,15360] input of single_blocks.29.linear2:
    // GELU with the "tanh" approximation string applied to %10286 ([1,4608,12288], defined earlier),
    // concatenated after the attention output %10346 along dim 2 (3072 + 12288 = 15360).
    %str_14338 = torch.constant.str "tanh"
    %10347 = torch.aten.gelu %10286, %str_14338 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %10348 = torch.prim.ListConstruct %10346, %10347 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_14339 = torch.constant.int 2
    %10349 = torch.aten.cat %10348, %int2_14339 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Drop the leading size-1 dim so the result can be used as the left operand of a 2-D mm.
    %int4608_14340 = torch.constant.int 4608
    %int15360_14341 = torch.constant.int 15360
    %10350 = torch.prim.ListConstruct %int4608_14340, %int15360_14341 : (!torch.int, !torch.int) -> !torch.list<int>
    %10351 = torch.aten.view %10349, %10350 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // single_blocks.29.linear2: %10351 x transpose(weight) + bias, followed by a broadcast
    // multiply with %10259 and an add onto %10241.
    // Weight is stored as [3072,15360] and transposed to [15360,3072] for the mm.
    %__auto.sampler.single_blocks.29.linear2.weight = util.global.load @__auto.sampler.single_blocks.29.linear2.weight : tensor<3072x15360xf16>
    %10352 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.29.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_14342 = torch.constant.int 0
    %int1_14343 = torch.constant.int 1
    %10353 = torch.aten.transpose.int %10352, %int0_14342, %int1_14343 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.29.linear2.bias = util.global.load @__auto.sampler.single_blocks.29.linear2.bias : tensor<3072xf16>
    %10354 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.29.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Bias, activation and weight are all converted to f32 (dtype code 6) before the matmul.
    %int6_14344 = torch.constant.int 6
    %10355 = torch.prims.convert_element_type %10354, %int6_14344 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_14345 = torch.constant.int 6
    %10356 = torch.prims.convert_element_type %10351, %int6_14345 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_14346 = torch.constant.int 6
    %10357 = torch.prims.convert_element_type %10353, %int6_14346 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %10358 = torch.aten.mm %10356, %10357 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Both the product and the bias are multiplied by the integer constant 1 before being summed.
    %int1_14347 = torch.constant.int 1
    %10359 = torch.aten.mul.Scalar %10358, %int1_14347 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_14348 = torch.constant.int 1
    %10360 = torch.aten.mul.Scalar %10355, %int1_14348 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_14349 = torch.constant.int 1
    %10361 = torch.aten.add.Tensor %10359, %10360, %int1_14349 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    // Convert back to f16 (dtype code 5) and restore the leading size-1 dim.
    %int5_14350 = torch.constant.int 5
    %10362 = torch.prims.convert_element_type %10361, %int5_14350 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_14351 = torch.constant.int 1
    %int4608_14352 = torch.constant.int 4608
    %int3072_14353 = torch.constant.int 3072
    %10363 = torch.prim.ListConstruct %int1_14351, %int4608_14352, %int3072_14353 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10364 = torch.aten.view %10362, %10363 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // %10259 ([1,1,3072]) broadcasts over the 4608 dim; the product is added to %10241 with alpha = 1.
    // NOTE(review): %10259 / %10241 are defined before this section - presumably the
    // modulation gate and the block's residual input; confirm.
    %10365 = torch.aten.mul.Tensor %10259, %10364 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_14354 = torch.constant.int 1
    %10366 = torch.aten.add.Tensor %10241, %10365, %int1_14354 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.30.modulation: silu(%119) -> linear (3072 -> 9216) -> split into three
    // [1,1,3072] tensors. %119 ([1,3072] f16) is defined before this section.
    %10367 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Weight is stored as [9216,3072] and transposed to [3072,9216] for the mm.
    %__auto.sampler.single_blocks.30.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.30.modulation.lin.weight : tensor<9216x3072xf16>
    %10368 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.30.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_14355 = torch.constant.int 0
    %int1_14356 = torch.constant.int 1
    %10369 = torch.aten.transpose.int %10368, %int0_14355, %int1_14356 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.30.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.30.modulation.lin.bias : tensor<9216xf16>
    %10370 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.30.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Matmul and bias add are carried out in f32 (dtype code 6), then converted back to f16 (code 5).
    %int6_14357 = torch.constant.int 6
    %10371 = torch.prims.convert_element_type %10370, %int6_14357 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_14358 = torch.constant.int 6
    %10372 = torch.prims.convert_element_type %10367, %int6_14358 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_14359 = torch.constant.int 6
    %10373 = torch.prims.convert_element_type %10369, %int6_14359 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %10374 = torch.aten.mm %10372, %10373 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    %int1_14360 = torch.constant.int 1
    %10375 = torch.aten.mul.Scalar %10374, %int1_14360 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_14361 = torch.constant.int 1
    %10376 = torch.aten.mul.Scalar %10371, %int1_14361 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_14362 = torch.constant.int 1
    %10377 = torch.aten.add.Tensor %10375, %10376, %int1_14362 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_14363 = torch.constant.int 5
    %10378 = torch.prims.convert_element_type %10377, %int5_14363 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Full-range slice on dim 0 (shape unchanged), then insert a size-1 dim at position 1,
    // then another full-range slice on dim 2 (shape unchanged): result is [1,1,9216].
    %int0_14364 = torch.constant.int 0
    %int0_14365 = torch.constant.int 0
    %int9223372036854775807_14366 = torch.constant.int 9223372036854775807
    %int1_14367 = torch.constant.int 1
    %10379 = torch.aten.slice.Tensor %10378, %int0_14364, %int0_14365, %int9223372036854775807_14366, %int1_14367 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    %int1_14368 = torch.constant.int 1
    %10380 = torch.aten.unsqueeze %10379, %int1_14368 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int2_14369 = torch.constant.int 2
    %int0_14370 = torch.constant.int 0
    %int9223372036854775807_14371 = torch.constant.int 9223372036854775807
    %int1_14372 = torch.constant.int 1
    %10381 = torch.aten.slice.Tensor %10380, %int2_14369, %int0_14370, %int9223372036854775807_14371, %int1_14372 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Chunk 0: last-dim range [0, 3072). Used below as the additive term after normalization.
    %int-1_14373 = torch.constant.int -1
    %int0_14374 = torch.constant.int 0
    %int3072_14375 = torch.constant.int 3072
    %int1_14376 = torch.constant.int 1
    %10382 = torch.aten.slice.Tensor %10381, %int-1_14373, %int0_14374, %int3072_14375, %int1_14376 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1: last-dim range [3072, 6144). Becomes the multiplicative term after the +1 below.
    %int-1_14377 = torch.constant.int -1
    %int3072_14378 = torch.constant.int 3072
    %int6144_14379 = torch.constant.int 6144
    %int1_14380 = torch.constant.int 1
    %10383 = torch.aten.slice.Tensor %10381, %int-1_14377, %int3072_14378, %int6144_14379, %int1_14380 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2: last-dim range [6144, 9216). Not consumed in this section.
    // NOTE(review): presumably the gate applied after this block's linear2 - confirm downstream.
    %int-1_14381 = torch.constant.int -1
    %int6144_14382 = torch.constant.int 6144
    %int9216_14383 = torch.constant.int 9216
    %int1_14384 = torch.constant.int 1
    %10384 = torch.aten.slice.Tensor %10381, %int-1_14381, %int6144_14382, %int9216_14383, %int1_14384 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %10385 = chunk 1 + 1 (scalar 1, alpha 1).
    %int1_14385 = torch.constant.int 1
    %int1_14386 = torch.constant.int 1
    %10385 = torch.aten.add.Scalar %10383, %int1_14385, %int1_14386 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %10366 over its last dim, then apply the modulation terms:
    //   %10394 = %10385 * f16((%10366 - mean) * rsqrt(var + eps)) + %10382
    // No learned weight or bias is applied in this normalization.
    %int6_14387 = torch.constant.int 6
    %10386 = torch.prims.convert_element_type %10366, %int6_14387 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    // var_mean over dim 2 with correction = 0 and keepdim = true; result0 is the variance and
    // result1 the mean, both [1,4608,1] f32.
    %int2_14388 = torch.constant.int 2
    %10387 = torch.prim.ListConstruct %int2_14388 : (!torch.int) -> !torch.list<int>
    %int0_14389 = torch.constant.int 0
    %true_14390 = torch.constant.bool true
    %result0_14391, %result1_14392 = torch.aten.var_mean.correction %10386, %10387, %int0_14389, %true_14390 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // eps (approximately 1e-6) is added to the variance before the reciprocal square root.
    %float9.999990e-07_14393 = torch.constant.float 9.9999999999999995E-7
    %int1_14394 = torch.constant.int 1
    %10388 = torch.aten.add.Scalar %result0_14391, %float9.999990e-07_14393, %int1_14394 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %10389 = torch.aten.rsqrt %10388 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the f16 value %10366 (not the f32 copy %10386); its result type is f32.
    %int1_14395 = torch.constant.int 1
    %10390 = torch.aten.sub.Tensor %10366, %result1_14392, %int1_14395 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %10391 = torch.aten.mul.Tensor %10390, %10389 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    // Back to f16 (dtype code 5) before the modulation multiply/add.
    %int5_14396 = torch.constant.int 5
    %10392 = torch.prims.convert_element_type %10391, %int5_14396 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // %10385 and %10382 are [1,1,3072] and broadcast over the 4608 dim.
    %10393 = torch.aten.mul.Tensor %10385, %10392 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_14397 = torch.constant.int 1
    %10394 = torch.aten.add.Tensor %10393, %10382, %int1_14397 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Drop the leading size-1 dim so the result can be used as the left operand of a 2-D mm.
    %int4608_14398 = torch.constant.int 4608
    %int3072_14399 = torch.constant.int 3072
    %10395 = torch.prim.ListConstruct %int4608_14398, %int3072_14399 : (!torch.int, !torch.int) -> !torch.list<int>
    %10396 = torch.aten.view %10394, %10395 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // single_blocks.30.linear1: %10396 x transpose(weight) + bias (3072 -> 21504), then split
    // along the last dim into a 9216-wide part (%10410) and a 12288-wide part (%10411).
    // Weight is stored as [21504,3072] and transposed to [3072,21504] for the mm.
    %__auto.sampler.single_blocks.30.linear1.weight = util.global.load @__auto.sampler.single_blocks.30.linear1.weight : tensor<21504x3072xf16>
    %10397 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.30.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_14400 = torch.constant.int 0
    %int1_14401 = torch.constant.int 1
    %10398 = torch.aten.transpose.int %10397, %int0_14400, %int1_14401 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.30.linear1.bias = util.global.load @__auto.sampler.single_blocks.30.linear1.bias : tensor<21504xf16>
    %10399 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.30.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Matmul and bias add are carried out in f32 (dtype code 6), then converted back to f16 (code 5).
    %int6_14402 = torch.constant.int 6
    %10400 = torch.prims.convert_element_type %10399, %int6_14402 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_14403 = torch.constant.int 6
    %10401 = torch.prims.convert_element_type %10396, %int6_14403 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_14404 = torch.constant.int 6
    %10402 = torch.prims.convert_element_type %10398, %int6_14404 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %10403 = torch.aten.mm %10401, %10402 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Both the product and the bias are multiplied by the integer constant 1 before being summed.
    %int1_14405 = torch.constant.int 1
    %10404 = torch.aten.mul.Scalar %10403, %int1_14405 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_14406 = torch.constant.int 1
    %10405 = torch.aten.mul.Scalar %10400, %int1_14406 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_14407 = torch.constant.int 1
    %10406 = torch.aten.add.Tensor %10404, %10405, %int1_14407 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_14408 = torch.constant.int 5
    %10407 = torch.prims.convert_element_type %10406, %int5_14408 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    // Restore the leading size-1 dim: [4608,21504] -> [1,4608,21504].
    %int1_14409 = torch.constant.int 1
    %int4608_14410 = torch.constant.int 4608
    %int21504_14411 = torch.constant.int 21504
    %10408 = torch.prim.ListConstruct %int1_14409, %int4608_14410, %int21504_14411 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10409 = torch.aten.view %10407, %10408 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Last-dim range [0, 9216): reshaped further down into three [1,24,4608,128] tensors.
    %int-1_14412 = torch.constant.int -1
    %int0_14413 = torch.constant.int 0
    %int9216_14414 = torch.constant.int 9216
    %int1_14415 = torch.constant.int 1
    %10410 = torch.aten.slice.Tensor %10409, %int-1_14412, %int0_14413, %int9216_14414, %int1_14415 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    // Last-dim range [9216, 21504): 12288 wide, later passed through GELU.
    %int-1_14416 = torch.constant.int -1
    %int9216_14417 = torch.constant.int 9216
    %int21504_14418 = torch.constant.int 21504
    %int1_14419 = torch.constant.int 1
    %10411 = torch.aten.slice.Tensor %10409, %int-1_14416, %int9216_14417, %int21504_14418, %int1_14419 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // Split %10410 ([1,4608,9216]) into three [1,24,4608,128] tensors.
    // View the 9216-wide dim as 3 x 24 x 128.
    %int1_14420 = torch.constant.int 1
    %int4608_14421 = torch.constant.int 4608
    %int3_14422 = torch.constant.int 3
    %int24_14423 = torch.constant.int 24
    %int128_14424 = torch.constant.int 128
    %10412 = torch.prim.ListConstruct %int1_14420, %int4608_14421, %int3_14422, %int24_14423, %int128_14424 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10413 = torch.aten.view %10410, %10412 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order [2,0,3,1,4]: [1,4608,3,24,128] -> [3,1,24,4608,128].
    %int2_14425 = torch.constant.int 2
    %int0_14426 = torch.constant.int 0
    %int3_14427 = torch.constant.int 3
    %int1_14428 = torch.constant.int 1
    %int4_14429 = torch.constant.int 4
    %10414 = torch.prim.ListConstruct %int2_14425, %int0_14426, %int3_14427, %int1_14428, %int4_14429 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10415 = torch.aten.permute %10413, %10414 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0 along dim 0; this one is later scaled by norm.query_norm.scale.
    %int0_14430 = torch.constant.int 0
    %int0_14431 = torch.constant.int 0
    %10416 = torch.aten.select.int %10415, %int0_14430, %int0_14431 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 1 along dim 0; this one is later scaled by norm.key_norm.scale.
    %int0_14432 = torch.constant.int 0
    %int1_14433 = torch.constant.int 1
    %10417 = torch.aten.select.int %10415, %int0_14432, %int1_14433 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 2 along dim 0; passed unchanged as the third operand of the attention op.
    %int0_14434 = torch.constant.int 0
    %int2_14435 = torch.constant.int 2
    %10418 = torch.aten.select.int %10415, %int0_14434, %int2_14435 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Normalize %10416 and %10417 over their last dim (128) and apply a learned per-element scale:
    //   y = f16(x * rsqrt(mean(x^2, dim=-1, keepdim) + eps)) * scale
    // computed in f32 (dtype code 6) and converted back to f16 (code 5) before the scale multiply.
    // First path: %10416 with norm.query_norm.scale -> %10428.
    %int6_14436 = torch.constant.int 6
    %10419 = torch.prims.convert_element_type %10416, %int6_14436 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_14437 = torch.constant.int 2
    %10420 = torch.aten.pow.Tensor_Scalar %10419, %int2_14437 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_14438 = torch.constant.int -1
    %10421 = torch.prim.ListConstruct %int-1_14438 : (!torch.int) -> !torch.list<int>
    %true_14439 = torch.constant.bool true
    %none_14440 = torch.constant.none
    %10422 = torch.aten.mean.dim %10420, %10421, %true_14439, %none_14440 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // eps is approximately 1e-6.
    %float9.999990e-07_14441 = torch.constant.float 9.9999999999999995E-7
    %int1_14442 = torch.constant.int 1
    %10423 = torch.aten.add.Scalar %10422, %float9.999990e-07_14441, %int1_14442 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %10424 = torch.aten.rsqrt %10423 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %10425 = torch.aten.mul.Tensor %10419, %10424 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_14443 = torch.constant.int 5
    %10426 = torch.prims.convert_element_type %10425, %int5_14443 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.30.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.30.norm.query_norm.scale : tensor<128xf16>
    %10427 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.30.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %10428 = torch.aten.mul.Tensor %10426, %10427 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Second path: the same op sequence on %10417 with norm.key_norm.scale -> %10438.
    %int6_14444 = torch.constant.int 6
    %10429 = torch.prims.convert_element_type %10417, %int6_14444 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_14445 = torch.constant.int 2
    %10430 = torch.aten.pow.Tensor_Scalar %10429, %int2_14445 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_14446 = torch.constant.int -1
    %10431 = torch.prim.ListConstruct %int-1_14446 : (!torch.int) -> !torch.list<int>
    %true_14447 = torch.constant.bool true
    %none_14448 = torch.constant.none
    %10432 = torch.aten.mean.dim %10430, %10431, %true_14447, %none_14448 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_14449 = torch.constant.float 9.9999999999999995E-7
    %int1_14450 = torch.constant.int 1
    %10433 = torch.aten.add.Scalar %10432, %float9.999990e-07_14449, %int1_14450 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %10434 = torch.aten.rsqrt %10433 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %10435 = torch.aten.mul.Tensor %10429, %10434 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_14451 = torch.constant.int 5
    %10436 = torch.prims.convert_element_type %10435, %int5_14451 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.30.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.30.norm.key_norm.scale : tensor<128xf16>
    %10437 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.30.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %10438 = torch.aten.mul.Tensor %10436, %10437 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Combine the normalized tensors %10428 and %10438 with %211 ([1,1,4608,64,2,2] f32, defined
    // before this section). For each input x viewed as [1,24,4608,64,1,2]:
    //   out = %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]     (selects are on dim 5)
    // giving [1,24,4608,64,2], which is then flattened back to [1,24,4608,128] and converted to f16.
    // NOTE(review): %211 is presumably the rotary position-embedding table - confirm at its definition.
    // These two conversions use dtype code 5 on f16 inputs; input and result types are identical.
    %int5_14452 = torch.constant.int 5
    %10439 = torch.prims.convert_element_type %10428, %int5_14452 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_14453 = torch.constant.int 5
    %10440 = torch.prims.convert_element_type %10438, %int5_14453 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // First input: convert to f32 and view the 128-wide dim as 64 x 1 x 2.
    %int6_14454 = torch.constant.int 6
    %10441 = torch.prims.convert_element_type %10439, %int6_14454 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_14455 = torch.constant.int 1
    %int24_14456 = torch.constant.int 24
    %int4608_14457 = torch.constant.int 4608
    %int64_14458 = torch.constant.int 64
    %int1_14459 = torch.constant.int 1
    %int2_14460 = torch.constant.int 2
    %10442 = torch.prim.ListConstruct %int1_14455, %int24_14456, %int4608_14457, %int64_14458, %int1_14459, %int2_14460 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10443 = torch.aten.view %10441, %10442 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Second input: same conversion and view.
    %int6_14461 = torch.constant.int 6
    %10444 = torch.prims.convert_element_type %10440, %int6_14461 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_14462 = torch.constant.int 1
    %int24_14463 = torch.constant.int 24
    %int4608_14464 = torch.constant.int 4608
    %int64_14465 = torch.constant.int 64
    %int1_14466 = torch.constant.int 1
    %int2_14467 = torch.constant.int 2
    %10445 = torch.prim.ListConstruct %int1_14462, %int24_14463, %int4608_14464, %int64_14465, %int1_14466, %int2_14467 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10446 = torch.aten.view %10444, %10445 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // First input, term 0: %211[..., 0] * x[..., 0]; the size-1 dims broadcast to [1,24,4608,64,2].
    %int5_14468 = torch.constant.int 5
    %int0_14469 = torch.constant.int 0
    %10447 = torch.aten.select.int %211, %int5_14468, %int0_14469 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_14470 = torch.constant.int 5
    %int0_14471 = torch.constant.int 0
    %10448 = torch.aten.select.int %10443, %int5_14470, %int0_14471 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10449 = torch.aten.mul.Tensor %10447, %10448 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // First input, term 1: %211[..., 1] * x[..., 1].
    %int5_14472 = torch.constant.int 5
    %int1_14473 = torch.constant.int 1
    %10450 = torch.aten.select.int %211, %int5_14472, %int1_14473 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_14474 = torch.constant.int 5
    %int1_14475 = torch.constant.int 1
    %10451 = torch.aten.select.int %10443, %int5_14474, %int1_14475 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10452 = torch.aten.mul.Tensor %10450, %10451 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two terms (alpha = 1).
    %int1_14476 = torch.constant.int 1
    %10453 = torch.aten.add.Tensor %10449, %10452, %int1_14476 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second input, term 0.
    %int5_14477 = torch.constant.int 5
    %int0_14478 = torch.constant.int 0
    %10454 = torch.aten.select.int %211, %int5_14477, %int0_14478 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_14479 = torch.constant.int 5
    %int0_14480 = torch.constant.int 0
    %10455 = torch.aten.select.int %10446, %int5_14479, %int0_14480 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10456 = torch.aten.mul.Tensor %10454, %10455 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second input, term 1.
    %int5_14481 = torch.constant.int 5
    %int1_14482 = torch.constant.int 1
    %10457 = torch.aten.select.int %211, %int5_14481, %int1_14482 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_14483 = torch.constant.int 5
    %int1_14484 = torch.constant.int 1
    %10458 = torch.aten.select.int %10446, %int5_14483, %int1_14484 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10459 = torch.aten.mul.Tensor %10457, %10458 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two terms (alpha = 1).
    %int1_14485 = torch.constant.int 1
    %10460 = torch.aten.add.Tensor %10456, %10459, %int1_14485 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Flatten the first result back to [1,24,4608,128] and convert to f16 (dtype code 5).
    %int1_14486 = torch.constant.int 1
    %int24_14487 = torch.constant.int 24
    %int4608_14488 = torch.constant.int 4608
    %int128_14489 = torch.constant.int 128
    %10461 = torch.prim.ListConstruct %int1_14486, %int24_14487, %int4608_14488, %int128_14489 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10462 = torch.aten.view %10453, %10461 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_14490 = torch.constant.int 5
    %10463 = torch.prims.convert_element_type %10462, %int5_14490 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Same flatten and conversion for the second result.
    %int1_14491 = torch.constant.int 1
    %int24_14492 = torch.constant.int 24
    %int4608_14493 = torch.constant.int 4608
    %int128_14494 = torch.constant.int 128
    %10464 = torch.prim.ListConstruct %int1_14491, %int24_14492, %int4608_14493, %int128_14494 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10465 = torch.aten.view %10460, %10464 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_14495 = torch.constant.int 5
    %10466 = torch.prims.convert_element_type %10465, %int5_14495 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Attention stage whose output is later concatenated and fed to single_blocks.30.linear2.
    // The op receives %10463, %10466, %10418 in its first three operand positions, followed
    // by 0.0, false, none, none. NOTE(review): per the ATen schema these are presumably
    // (query, key, value, dropout_p, is_causal, attn_mask, scale) - confirm against the
    // PyTorch version used for export.
    %float0.000000e00_14496 = torch.constant.float 0.000000e+00
    %false_14497 = torch.constant.bool false
    %none_14498 = torch.constant.none
    %none_14499 = torch.constant.none
    // Two results: #0 is [1,24,4608,128] f16, #1 is [1,24,4608] f32. Only #0 is consumed in this section.
    %10467:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%10463, %10466, %10418, %float0.000000e00_14496, %false_14497, %none_14498, %none_14499) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2: [1,24,4608,128] -> [1,4608,24,128].
    %int0_14500 = torch.constant.int 0
    %int2_14501 = torch.constant.int 2
    %int1_14502 = torch.constant.int 1
    %int3_14503 = torch.constant.int 3
    %10468 = torch.prim.ListConstruct %int0_14500, %int2_14501, %int1_14502, %int3_14503 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10469 = torch.aten.permute %10467#0, %10468 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the trailing 24 x 128 dims into one dim of 3072: [1,4608,24,128] -> [1,4608,3072].
    %int1_14504 = torch.constant.int 1
    %int4608_14505 = torch.constant.int 4608
    %int3072_14506 = torch.constant.int 3072
    %10470 = torch.prim.ListConstruct %int1_14504, %int4608_14505, %int3072_14506 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10471 = torch.aten.view %10469, %10470 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Build the [4608,15360] input of single_blocks.30.linear2:
    // GELU with the "tanh" approximation string applied to %10411 (the 12288-wide slice of the
    // linear1 output), concatenated after the attention output %10471 along dim 2 (3072 + 12288 = 15360).
    %str_14507 = torch.constant.str "tanh"
    %10472 = torch.aten.gelu %10411, %str_14507 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %10473 = torch.prim.ListConstruct %10471, %10472 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_14508 = torch.constant.int 2
    %10474 = torch.aten.cat %10473, %int2_14508 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Drop the leading size-1 dim so the result can be used as the left operand of a 2-D mm.
    %int4608_14509 = torch.constant.int 4608
    %int15360_14510 = torch.constant.int 15360
    %10475 = torch.prim.ListConstruct %int4608_14509, %int15360_14510 : (!torch.int, !torch.int) -> !torch.list<int>
    %10476 = torch.aten.view %10474, %10475 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    %__auto.sampler.single_blocks.30.linear2.weight = util.global.load @__auto.sampler.single_blocks.30.linear2.weight : tensor<3072x15360xf16>
    %10477 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.30.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_14511 = torch.constant.int 0
    %int1_14512 = torch.constant.int 1
    %10478 = torch.aten.transpose.int %10477, %int0_14511, %int1_14512 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.30.linear2.bias = util.global.load @__auto.sampler.single_blocks.30.linear2.bias : tensor<3072xf16>
    %10479 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.30.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_14513 = torch.constant.int 6
    %10480 = torch.prims.convert_element_type %10479, %int6_14513 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_14514 = torch.constant.int 6
    %10481 = torch.prims.convert_element_type %10476, %int6_14514 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_14515 = torch.constant.int 6
    %10482 = torch.prims.convert_element_type %10478, %int6_14515 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %10483 = torch.aten.mm %10481, %10482 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    %int1_14516 = torch.constant.int 1
    %10484 = torch.aten.mul.Scalar %10483, %int1_14516 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_14517 = torch.constant.int 1
    %10485 = torch.aten.mul.Scalar %10480, %int1_14517 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_14518 = torch.constant.int 1
    %10486 = torch.aten.add.Tensor %10484, %10485, %int1_14518 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_14519 = torch.constant.int 5
    %10487 = torch.prims.convert_element_type %10486, %int5_14519 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_14520 = torch.constant.int 1
    %int4608_14521 = torch.constant.int 4608
    %int3072_14522 = torch.constant.int 3072
    %10488 = torch.prim.ListConstruct %int1_14520, %int4608_14521, %int3072_14522 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10489 = torch.aten.view %10487, %10488 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    %10490 = torch.aten.mul.Tensor %10384, %10489 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_14523 = torch.constant.int 1
    %10491 = torch.aten.add.Tensor %10366, %10490, %int1_14523 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.31 modulation: SiLU of the [1,3072] tensor %119 followed by
    // a 3072 -> 9216 linear layer, then a split into three 3072-wide chunks.
    %10492 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the modulation weight [9216,3072] and transpose it to [3072,9216].
    %__auto.sampler.single_blocks.31.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.31.modulation.lin.weight : tensor<9216x3072xf16>
    %10493 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.31.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_14524 = torch.constant.int 0
    %int1_14525 = torch.constant.int 1
    %10494 = torch.aten.transpose.int %10493, %int0_14524, %int1_14525 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.31.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.31.modulation.lin.bias : tensor<9216xf16>
    %10495 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.31.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Cast bias, input and weight to f32 (dtype code 6) and multiply in f32.
    %int6_14526 = torch.constant.int 6
    %10496 = torch.prims.convert_element_type %10495, %int6_14526 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_14527 = torch.constant.int 6
    %10497 = torch.prims.convert_element_type %10492, %int6_14527 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_14528 = torch.constant.int 6
    %10498 = torch.prims.convert_element_type %10494, %int6_14528 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %10499 = torch.aten.mm %10497, %10498 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Multiplications by the constant 1: values are unchanged.
    %int1_14529 = torch.constant.int 1
    %10500 = torch.aten.mul.Scalar %10499, %int1_14529 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_14530 = torch.constant.int 1
    %10501 = torch.aten.mul.Scalar %10496, %int1_14530 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    // Add the bias and cast the result back to f16 (dtype code 5).
    %int1_14531 = torch.constant.int 1
    %10502 = torch.aten.add.Tensor %10500, %10501, %int1_14531 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_14532 = torch.constant.int 5
    %10503 = torch.prims.convert_element_type %10502, %int5_14532 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): shape unchanged.
    %int0_14533 = torch.constant.int 0
    %int0_14534 = torch.constant.int 0
    %int9223372036854775807_14535 = torch.constant.int 9223372036854775807
    %int1_14536 = torch.constant.int 1
    %10504 = torch.aten.slice.Tensor %10503, %int0_14533, %int0_14534, %int9223372036854775807_14535, %int1_14536 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a unit dimension at position 1: [1,9216] -> [1,1,9216].
    %int1_14537 = torch.constant.int 1
    %10505 = torch.aten.unsqueeze %10504, %int1_14537 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Full-range slice on dim 2: shape unchanged.
    %int2_14538 = torch.constant.int 2
    %int0_14539 = torch.constant.int 0
    %int9223372036854775807_14540 = torch.constant.int 9223372036854775807
    %int1_14541 = torch.constant.int 1
    %10506 = torch.aten.slice.Tensor %10505, %int2_14538, %int0_14539, %int9223372036854775807_14540, %int1_14541 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Chunk [0:3072) of the last dim; later added after the normalized multiply.
    %int-1_14542 = torch.constant.int -1
    %int0_14543 = torch.constant.int 0
    %int3072_14544 = torch.constant.int 3072
    %int1_14545 = torch.constant.int 1
    %10507 = torch.aten.slice.Tensor %10506, %int-1_14542, %int0_14543, %int3072_14544, %int1_14545 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk [3072:6144) of the last dim; incremented by 1 at the end of this section.
    %int-1_14546 = torch.constant.int -1
    %int3072_14547 = torch.constant.int 3072
    %int6144_14548 = torch.constant.int 6144
    %int1_14549 = torch.constant.int 1
    %10508 = torch.aten.slice.Tensor %10506, %int-1_14546, %int3072_14547, %int6144_14548, %int1_14549 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk [6144:9216) of the last dim; later multiplies the linear2 output.
    %int-1_14550 = torch.constant.int -1
    %int6144_14551 = torch.constant.int 6144
    %int9216_14552 = torch.constant.int 9216
    %int1_14553 = torch.constant.int 1
    %10509 = torch.aten.slice.Tensor %10506, %int-1_14550, %int6144_14551, %int9216_14552, %int1_14553 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %10510 = %10508 + 1 (scalar 1 with alpha 1).
    %int1_14554 = torch.constant.int 1
    %int1_14555 = torch.constant.int 1
    %10510 = torch.aten.add.Scalar %10508, %int1_14554, %int1_14555 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %10491 over its last dimension (no learned weight or bias is
    // applied here), then modulate the result with %10510 and %10507.
    // Statistics are computed on an f32 copy (dtype code 6).
    %int6_14556 = torch.constant.int 6
    %10511 = torch.prims.convert_element_type %10491, %int6_14556 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    // Variance (correction 0) and mean over dim 2 with keepdim = true.
    %int2_14557 = torch.constant.int 2
    %10512 = torch.prim.ListConstruct %int2_14557 : (!torch.int) -> !torch.list<int>
    %int0_14558 = torch.constant.int 0
    %true_14559 = torch.constant.bool true
    %result0_14560, %result1_14561 = torch.aten.var_mean.correction %10511, %10512, %int0_14558, %true_14559 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // rsqrt(variance + 1e-6).
    %float9.999990e-07_14562 = torch.constant.float 9.9999999999999995E-7
    %int1_14563 = torch.constant.int 1
    %10513 = torch.aten.add.Scalar %result0_14560, %float9.999990e-07_14562, %int1_14563 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %10514 = torch.aten.rsqrt %10513 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // Subtract the mean. The left operand is the f16 tensor %10491 (not the f32
    // copy %10511); the op's declared result type is f32.
    %int1_14564 = torch.constant.int 1
    %10515 = torch.aten.sub.Tensor %10491, %result1_14561, %int1_14564 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %10516 = torch.aten.mul.Tensor %10515, %10514 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    // Cast back to f16 (dtype code 5), then compute %10510 * normalized + %10507.
    %int5_14565 = torch.constant.int 5
    %10517 = torch.prims.convert_element_type %10516, %int5_14565 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %10518 = torch.aten.mul.Tensor %10510, %10517 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_14566 = torch.constant.int 1
    %10519 = torch.aten.add.Tensor %10518, %10507, %int1_14566 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.31 linear1: project 3072 -> 21504 features and split the
    // result into a 9216-wide part and a 12288-wide part.
    // Drop the leading unit dimension for the 2-D matmul.
    %int4608_14567 = torch.constant.int 4608
    %int3072_14568 = torch.constant.int 3072
    %10520 = torch.prim.ListConstruct %int4608_14567, %int3072_14568 : (!torch.int, !torch.int) -> !torch.list<int>
    %10521 = torch.aten.view %10519, %10520 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // Load the linear1 weight [21504,3072] and transpose it to [3072,21504].
    %__auto.sampler.single_blocks.31.linear1.weight = util.global.load @__auto.sampler.single_blocks.31.linear1.weight : tensor<21504x3072xf16>
    %10522 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.31.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_14569 = torch.constant.int 0
    %int1_14570 = torch.constant.int 1
    %10523 = torch.aten.transpose.int %10522, %int0_14569, %int1_14570 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.31.linear1.bias = util.global.load @__auto.sampler.single_blocks.31.linear1.bias : tensor<21504xf16>
    %10524 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.31.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Cast bias, activations and weight to f32 (dtype code 6) and multiply in f32.
    %int6_14571 = torch.constant.int 6
    %10525 = torch.prims.convert_element_type %10524, %int6_14571 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_14572 = torch.constant.int 6
    %10526 = torch.prims.convert_element_type %10521, %int6_14572 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_14573 = torch.constant.int 6
    %10527 = torch.prims.convert_element_type %10523, %int6_14573 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %10528 = torch.aten.mm %10526, %10527 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Multiplications by the constant 1: values are unchanged.
    %int1_14574 = torch.constant.int 1
    %10529 = torch.aten.mul.Scalar %10528, %int1_14574 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_14575 = torch.constant.int 1
    %10530 = torch.aten.mul.Scalar %10525, %int1_14575 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    // Add the bias and cast the result back to f16 (dtype code 5).
    %int1_14576 = torch.constant.int 1
    %10531 = torch.aten.add.Tensor %10529, %10530, %int1_14576 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_14577 = torch.constant.int 5
    %10532 = torch.prims.convert_element_type %10531, %int5_14577 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    // Restore the leading unit dimension.
    %int1_14578 = torch.constant.int 1
    %int4608_14579 = torch.constant.int 4608
    %int21504_14580 = torch.constant.int 21504
    %10533 = torch.prim.ListConstruct %int1_14578, %int4608_14579, %int21504_14580 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10534 = torch.aten.view %10532, %10533 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Columns [0:9216): reshaped into three 24 x 128 groups further below.
    %int-1_14581 = torch.constant.int -1
    %int0_14582 = torch.constant.int 0
    %int9216_14583 = torch.constant.int 9216
    %int1_14584 = torch.constant.int 1
    %10535 = torch.aten.slice.Tensor %10534, %int-1_14581, %int0_14582, %int9216_14583, %int1_14584 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    // Columns [9216:21504): fed to the GELU before linear2 further below.
    %int-1_14585 = torch.constant.int -1
    %int9216_14586 = torch.constant.int 9216
    %int21504_14587 = torch.constant.int 21504
    %int1_14588 = torch.constant.int 1
    %10536 = torch.aten.slice.Tensor %10534, %int-1_14585, %int9216_14586, %int21504_14587, %int1_14588 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // Split the 9216-wide tensor %10535 into three [1,24,4608,128] tensors.
    // View as [1,4608,3,24,128] (3 groups x 24 heads x 128 features).
    %int1_14589 = torch.constant.int 1
    %int4608_14590 = torch.constant.int 4608
    %int3_14591 = torch.constant.int 3
    %int24_14592 = torch.constant.int 24
    %int128_14593 = torch.constant.int 128
    %10537 = torch.prim.ListConstruct %int1_14589, %int4608_14590, %int3_14591, %int24_14592, %int128_14593 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10538 = torch.aten.view %10535, %10537 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2,0,3,1,4) so the group-of-3 dimension comes first:
    // [1,4608,3,24,128] -> [3,1,24,4608,128].
    %int2_14594 = torch.constant.int 2
    %int0_14595 = torch.constant.int 0
    %int3_14596 = torch.constant.int 3
    %int1_14597 = torch.constant.int 1
    %int4_14598 = torch.constant.int 4
    %10539 = torch.prim.ListConstruct %int2_14594, %int0_14595, %int3_14596, %int1_14597, %int4_14598 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10540 = torch.aten.permute %10538, %10539 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0 along dim 0; normalized below with query_norm.scale.
    %int0_14599 = torch.constant.int 0
    %int0_14600 = torch.constant.int 0
    %10541 = torch.aten.select.int %10540, %int0_14599, %int0_14600 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 1 along dim 0; normalized below with key_norm.scale.
    %int0_14601 = torch.constant.int 0
    %int1_14602 = torch.constant.int 1
    %10542 = torch.aten.select.int %10540, %int0_14601, %int1_14602 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 2 along dim 0; passed unmodified as the third attention operand.
    %int0_14603 = torch.constant.int 0
    %int2_14604 = torch.constant.int 2
    %10543 = torch.aten.select.int %10540, %int0_14603, %int2_14604 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Root-mean-square normalization of %10541 and %10542 over the last (128)
    // dimension: x * rsqrt(mean(x^2) + 1e-6), computed in f32, cast back to
    // f16 and multiplied by a learned [128] scale.
    // --- %10541 with query_norm.scale ---
    %int6_14605 = torch.constant.int 6
    %10544 = torch.prims.convert_element_type %10541, %int6_14605 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_14606 = torch.constant.int 2
    %10545 = torch.aten.pow.Tensor_Scalar %10544, %int2_14606 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean of squares over dim -1 with keepdim = true.
    %int-1_14607 = torch.constant.int -1
    %10546 = torch.prim.ListConstruct %int-1_14607 : (!torch.int) -> !torch.list<int>
    %true_14608 = torch.constant.bool true
    %none_14609 = torch.constant.none
    %10547 = torch.aten.mean.dim %10545, %10546, %true_14608, %none_14609 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_14610 = torch.constant.float 9.9999999999999995E-7
    %int1_14611 = torch.constant.int 1
    %10548 = torch.aten.add.Scalar %10547, %float9.999990e-07_14610, %int1_14611 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %10549 = torch.aten.rsqrt %10548 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %10550 = torch.aten.mul.Tensor %10544, %10549 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_14612 = torch.constant.int 5
    %10551 = torch.prims.convert_element_type %10550, %int5_14612 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.31.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.31.norm.query_norm.scale : tensor<128xf16>
    %10552 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.31.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %10553 = torch.aten.mul.Tensor %10551, %10552 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // --- %10542 with key_norm.scale (same sequence of ops) ---
    %int6_14613 = torch.constant.int 6
    %10554 = torch.prims.convert_element_type %10542, %int6_14613 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_14614 = torch.constant.int 2
    %10555 = torch.aten.pow.Tensor_Scalar %10554, %int2_14614 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_14615 = torch.constant.int -1
    %10556 = torch.prim.ListConstruct %int-1_14615 : (!torch.int) -> !torch.list<int>
    %true_14616 = torch.constant.bool true
    %none_14617 = torch.constant.none
    %10557 = torch.aten.mean.dim %10555, %10556, %true_14616, %none_14617 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_14618 = torch.constant.float 9.9999999999999995E-7
    %int1_14619 = torch.constant.int 1
    %10558 = torch.aten.add.Scalar %10557, %float9.999990e-07_14618, %int1_14619 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %10559 = torch.aten.rsqrt %10558 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %10560 = torch.aten.mul.Tensor %10554, %10559 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_14620 = torch.constant.int 5
    %10561 = torch.prims.convert_element_type %10560, %int5_14620 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.31.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.31.norm.key_norm.scale : tensor<128xf16>
    %10562 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.31.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %10563 = torch.aten.mul.Tensor %10561, %10562 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Combine the normalized tensors %10553 and %10563 pairwise with the
    // [1,1,4608,64,2,2] f32 table %211 (defined before this excerpt).
    // NOTE(review): this looks like rotary position embedding with %211 holding
    // per-position 2x2 matrices - confirm against the model source.
    // f16 -> f16 casts: source and result types are identical.
    %int5_14621 = torch.constant.int 5
    %10564 = torch.prims.convert_element_type %10553, %int5_14621 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_14622 = torch.constant.int 5
    %10565 = torch.prims.convert_element_type %10563, %int5_14622 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Cast %10564 to f32 and view the 128 features as 64 pairs: [..,64,1,2].
    %int6_14623 = torch.constant.int 6
    %10566 = torch.prims.convert_element_type %10564, %int6_14623 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_14624 = torch.constant.int 1
    %int24_14625 = torch.constant.int 24
    %int4608_14626 = torch.constant.int 4608
    %int64_14627 = torch.constant.int 64
    %int1_14628 = torch.constant.int 1
    %int2_14629 = torch.constant.int 2
    %10567 = torch.prim.ListConstruct %int1_14624, %int24_14625, %int4608_14626, %int64_14627, %int1_14628, %int2_14629 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10568 = torch.aten.view %10566, %10567 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Same cast and view for %10565.
    %int6_14630 = torch.constant.int 6
    %10569 = torch.prims.convert_element_type %10565, %int6_14630 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_14631 = torch.constant.int 1
    %int24_14632 = torch.constant.int 24
    %int4608_14633 = torch.constant.int 4608
    %int64_14634 = torch.constant.int 64
    %int1_14635 = torch.constant.int 1
    %int2_14636 = torch.constant.int 2
    %10570 = torch.prim.ListConstruct %int1_14631, %int24_14632, %int4608_14633, %int64_14634, %int1_14635, %int2_14636 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10571 = torch.aten.view %10569, %10570 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // First tensor: %211[..., 0] * %10568[..., 0] + %211[..., 1] * %10568[..., 1],
    // with the selects taken along dim 5 and broadcast over heads and pairs.
    %int5_14637 = torch.constant.int 5
    %int0_14638 = torch.constant.int 0
    %10572 = torch.aten.select.int %211, %int5_14637, %int0_14638 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_14639 = torch.constant.int 5
    %int0_14640 = torch.constant.int 0
    %10573 = torch.aten.select.int %10568, %int5_14639, %int0_14640 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10574 = torch.aten.mul.Tensor %10572, %10573 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_14641 = torch.constant.int 5
    %int1_14642 = torch.constant.int 1
    %10575 = torch.aten.select.int %211, %int5_14641, %int1_14642 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_14643 = torch.constant.int 5
    %int1_14644 = torch.constant.int 1
    %10576 = torch.aten.select.int %10568, %int5_14643, %int1_14644 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10577 = torch.aten.mul.Tensor %10575, %10576 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_14645 = torch.constant.int 1
    %10578 = torch.aten.add.Tensor %10574, %10577, %int1_14645 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second tensor: the same combination applied to %10571.
    %int5_14646 = torch.constant.int 5
    %int0_14647 = torch.constant.int 0
    %10579 = torch.aten.select.int %211, %int5_14646, %int0_14647 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_14648 = torch.constant.int 5
    %int0_14649 = torch.constant.int 0
    %10580 = torch.aten.select.int %10571, %int5_14648, %int0_14649 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10581 = torch.aten.mul.Tensor %10579, %10580 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_14650 = torch.constant.int 5
    %int1_14651 = torch.constant.int 1
    %10582 = torch.aten.select.int %211, %int5_14650, %int1_14651 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_14652 = torch.constant.int 5
    %int1_14653 = torch.constant.int 1
    %10583 = torch.aten.select.int %10571, %int5_14652, %int1_14653 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10584 = torch.aten.mul.Tensor %10582, %10583 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_14654 = torch.constant.int 1
    %10585 = torch.aten.add.Tensor %10581, %10584, %int1_14654 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Collapse the [64,2] pairs back to 128 features and cast to f16 (code 5).
    %int1_14655 = torch.constant.int 1
    %int24_14656 = torch.constant.int 24
    %int4608_14657 = torch.constant.int 4608
    %int128_14658 = torch.constant.int 128
    %10586 = torch.prim.ListConstruct %int1_14655, %int24_14656, %int4608_14657, %int128_14658 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10587 = torch.aten.view %10578, %10586 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_14659 = torch.constant.int 5
    %10588 = torch.prims.convert_element_type %10587, %int5_14659 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_14660 = torch.constant.int 1
    %int24_14661 = torch.constant.int 24
    %int4608_14662 = torch.constant.int 4608
    %int128_14663 = torch.constant.int 128
    %10589 = torch.prim.ListConstruct %int1_14660, %int24_14661, %int4608_14662, %int128_14663 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10590 = torch.aten.view %10585, %10589 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_14664 = torch.constant.int 5
    %10591 = torch.prims.convert_element_type %10590, %int5_14664 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.31 scaled dot-product attention over %10588, %10591 and
    // %10543 (all [1,24,4608,128] f16). Scalar operands are 0.0, false and two
    // `none` values; per the aten schema these are presumably dropout_p,
    // is_causal, attn_mask and scale - confirm. Only result #0 is used in the
    // visible code.
    %float0.000000e00_14665 = torch.constant.float 0.000000e+00
    %false_14666 = torch.constant.bool false
    %none_14667 = torch.constant.none
    %none_14668 = torch.constant.none
    %10592:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%10588, %10591, %10543, %float0.000000e00_14665, %false_14666, %none_14667, %none_14668) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Permute [1,24,4608,128] -> [1,4608,24,128] with order (0,2,1,3).
    %int0_14669 = torch.constant.int 0
    %int2_14670 = torch.constant.int 2
    %int1_14671 = torch.constant.int 1
    %int3_14672 = torch.constant.int 3
    %10593 = torch.prim.ListConstruct %int0_14669, %int2_14670, %int1_14671, %int3_14672 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10594 = torch.aten.permute %10592#0, %10593 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the 24 x 128 trailing dimensions into a single 3072-wide dimension.
    %int1_14673 = torch.constant.int 1
    %int4608_14674 = torch.constant.int 4608
    %int3072_14675 = torch.constant.int 3072
    %10595 = torch.prim.ListConstruct %int1_14673, %int4608_14674, %int3072_14675 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10596 = torch.aten.view %10594, %10595 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.31 output projection.
    // Apply tanh-approximated GELU to the 12288-wide tensor %10536 and
    // concatenate it after the attention output %10596 along dim 2:
    // 3072 + 12288 = 15360 features.
    %str_14676 = torch.constant.str "tanh"
    %10597 = torch.aten.gelu %10536, %str_14676 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %10598 = torch.prim.ListConstruct %10596, %10597 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_14677 = torch.constant.int 2
    %10599 = torch.aten.cat %10598, %int2_14677 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Drop the leading unit dimension so the tensor can be fed to a 2-D matmul.
    %int4608_14678 = torch.constant.int 4608
    %int15360_14679 = torch.constant.int 15360
    %10600 = torch.prim.ListConstruct %int4608_14678, %int15360_14679 : (!torch.int, !torch.int) -> !torch.list<int>
    %10601 = torch.aten.view %10599, %10600 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // Load the linear2 weight [3072,15360] and transpose it to [15360,3072].
    %__auto.sampler.single_blocks.31.linear2.weight = util.global.load @__auto.sampler.single_blocks.31.linear2.weight : tensor<3072x15360xf16>
    %10602 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.31.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_14680 = torch.constant.int 0
    %int1_14681 = torch.constant.int 1
    %10603 = torch.aten.transpose.int %10602, %int0_14680, %int1_14681 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.31.linear2.bias = util.global.load @__auto.sampler.single_blocks.31.linear2.bias : tensor<3072xf16>
    %10604 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.31.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Cast bias, activations and weight to f32 (dtype code 6) and multiply in f32.
    %int6_14682 = torch.constant.int 6
    %10605 = torch.prims.convert_element_type %10604, %int6_14682 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_14683 = torch.constant.int 6
    %10606 = torch.prims.convert_element_type %10601, %int6_14683 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_14684 = torch.constant.int 6
    %10607 = torch.prims.convert_element_type %10603, %int6_14684 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %10608 = torch.aten.mm %10606, %10607 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Multiplications by the constant 1: values are unchanged.
    %int1_14685 = torch.constant.int 1
    %10609 = torch.aten.mul.Scalar %10608, %int1_14685 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_14686 = torch.constant.int 1
    %10610 = torch.aten.mul.Scalar %10605, %int1_14686 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Add the bias (broadcast over the 4608 rows) and cast back to f16 (code 5).
    %int1_14687 = torch.constant.int 1
    %10611 = torch.aten.add.Tensor %10609, %10610, %int1_14687 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_14688 = torch.constant.int 5
    %10612 = torch.prims.convert_element_type %10611, %int5_14688 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    // Restore the leading unit dimension.
    %int1_14689 = torch.constant.int 1
    %int4608_14690 = torch.constant.int 4608
    %int3072_14691 = torch.constant.int 3072
    %10613 = torch.prim.ListConstruct %int1_14689, %int4608_14690, %int3072_14691 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10614 = torch.aten.view %10612, %10613 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Scale by %10509 (third modulation chunk, [1,1,3072]) and add the result
    // to %10491, the tensor that entered this block's normalization.
    %10615 = torch.aten.mul.Tensor %10509, %10614 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_14692 = torch.constant.int 1
    %10616 = torch.aten.add.Tensor %10491, %10615, %int1_14692 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // ---- single_blocks.32 modulation ----
    // Applies SiLU to %119 (defined outside this excerpt), feeds it through a
    // linear layer built from the single_blocks.32.modulation.lin weight/bias,
    // and splits the [1,1,9216] result into three [1,1,3072] chunks.
    %10617 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Weight is stored as [9216,3072]; transpose dims 0 and 1 to get [3072,9216] for the matmul.
    %__auto.sampler.single_blocks.32.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.32.modulation.lin.weight : tensor<9216x3072xf16>
    %10618 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.32.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_14693 = torch.constant.int 0
    %int1_14694 = torch.constant.int 1
    %10619 = torch.aten.transpose.int %10618, %int0_14693, %int1_14694 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.32.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.32.modulation.lin.bias : tensor<9216xf16>
    %10620 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.32.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // dtype code 6 produces f32 results here: bias, activation and weight are
    // all up-cast so that the matmul and the bias add are carried out in f32.
    %int6_14695 = torch.constant.int 6
    %10621 = torch.prims.convert_element_type %10620, %int6_14695 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_14696 = torch.constant.int 6
    %10622 = torch.prims.convert_element_type %10617, %int6_14696 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_14697 = torch.constant.int 6
    %10623 = torch.prims.convert_element_type %10619, %int6_14697 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %10624 = torch.aten.mm %10622, %10623 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Both the matmul result and the bias are multiplied by the integer constant 1
    // before being summed. NOTE(review): presumably leftovers of an addmm
    // decomposition with alpha = beta = 1 - not verifiable from this excerpt.
    %int1_14698 = torch.constant.int 1
    %10625 = torch.aten.mul.Scalar %10624, %int1_14698 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_14699 = torch.constant.int 1
    %10626 = torch.aten.mul.Scalar %10621, %int1_14699 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_14700 = torch.constant.int 1
    %10627 = torch.aten.add.Tensor %10625, %10626, %int1_14700 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    // dtype code 5 produces an f16 result here: cast the linear output back down.
    %int5_14701 = torch.constant.int 5
    %10628 = torch.prims.convert_element_type %10627, %int5_14701 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Slice over dim 0 with start 0, end INT64_MAX, step 1; the result type is
    // identical to the operand type ([1,9216]).
    %int0_14702 = torch.constant.int 0
    %int0_14703 = torch.constant.int 0
    %int9223372036854775807_14704 = torch.constant.int 9223372036854775807
    %int1_14705 = torch.constant.int 1
    %10629 = torch.aten.slice.Tensor %10628, %int0_14702, %int0_14703, %int9223372036854775807_14704, %int1_14705 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a unit dimension at index 1: [1,9216] -> [1,1,9216].
    %int1_14706 = torch.constant.int 1
    %10630 = torch.aten.unsqueeze %10629, %int1_14706 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Slice over dim 2 with start 0, end INT64_MAX, step 1; shape is unchanged again.
    %int2_14707 = torch.constant.int 2
    %int0_14708 = torch.constant.int 0
    %int9223372036854775807_14709 = torch.constant.int 9223372036854775807
    %int1_14710 = torch.constant.int 1
    %10631 = torch.aten.slice.Tensor %10630, %int2_14707, %int0_14708, %int9223372036854775807_14709, %int1_14710 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Chunk 1: last-dim range [0, 3072) -> %10632.
    %int-1_14711 = torch.constant.int -1
    %int0_14712 = torch.constant.int 0
    %int3072_14713 = torch.constant.int 3072
    %int1_14714 = torch.constant.int 1
    %10632 = torch.aten.slice.Tensor %10631, %int-1_14711, %int0_14712, %int3072_14713, %int1_14714 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2: last-dim range [3072, 6144) -> %10633.
    %int-1_14715 = torch.constant.int -1
    %int3072_14716 = torch.constant.int 3072
    %int6144_14717 = torch.constant.int 6144
    %int1_14718 = torch.constant.int 1
    %10633 = torch.aten.slice.Tensor %10631, %int-1_14715, %int3072_14716, %int6144_14717, %int1_14718 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 3: last-dim range [6144, 9216) -> %10634.
    %int-1_14719 = torch.constant.int -1
    %int6144_14720 = torch.constant.int 6144
    %int9216_14721 = torch.constant.int 9216
    %int1_14722 = torch.constant.int 1
    %10634 = torch.aten.slice.Tensor %10631, %int-1_14719, %int6144_14720, %int9216_14721, %int1_14722 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %10635 = chunk 2 + 1 (scalar 1, alpha 1).
    %int1_14723 = torch.constant.int 1
    %int1_14724 = torch.constant.int 1
    %10635 = torch.aten.add.Scalar %10633, %int1_14723, %int1_14724 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // ---- Normalize %10616 over its last dimension, then scale and shift ----
    // Statistics are computed in f32: %10616 (f16, [1,4608,3072]) is up-cast first.
    %int6_14725 = torch.constant.int 6
    %10636 = torch.prims.convert_element_type %10616, %int6_14725 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    // var_mean over dim 2 with correction 0 and keepdim = true; both results are [1,4608,1].
    // result0 feeds the epsilon/rsqrt path below, result1 is subtracted from the input.
    %int2_14726 = torch.constant.int 2
    %10637 = torch.prim.ListConstruct %int2_14726 : (!torch.int) -> !torch.list<int>
    %int0_14727 = torch.constant.int 0
    %true_14728 = torch.constant.bool true
    %result0_14729, %result1_14730 = torch.aten.var_mean.correction %10636, %10637, %int0_14727, %true_14728 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // Add the constant 9.9999999999999995E-7 (about 1e-6) to result0, then take rsqrt.
    %float9.999990e-07_14731 = torch.constant.float 9.9999999999999995E-7
    %int1_14732 = torch.constant.int 1
    %10638 = torch.aten.add.Scalar %result0_14729, %float9.999990e-07_14731, %int1_14732 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %10639 = torch.aten.rsqrt %10638 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the original f16 tensor %10616 (not the f32 copy %10636)
    // together with the f32 result1; the declared result type is f32.
    %int1_14733 = torch.constant.int 1
    %10640 = torch.aten.sub.Tensor %10616, %result1_14730, %int1_14733 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %10641 = torch.aten.mul.Tensor %10640, %10639 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    // Cast the normalized tensor back to f16 (dtype code 5 yields f16 here).
    %int5_14734 = torch.constant.int 5
    %10642 = torch.prims.convert_element_type %10641, %int5_14734 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Multiply by %10635 and add %10632; both are [1,1,3072] and broadcast over the 4608 axis.
    %10643 = torch.aten.mul.Tensor %10635, %10642 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_14735 = torch.constant.int 1
    %10644 = torch.aten.add.Tensor %10643, %10632, %int1_14735 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // ---- single_blocks.32.linear1: [4608,3072] x [3072,21504] + bias ----
    // Drop the leading unit dimension so the input is a 2-D matmul operand.
    %int4608_14736 = torch.constant.int 4608
    %int3072_14737 = torch.constant.int 3072
    %10645 = torch.prim.ListConstruct %int4608_14736, %int3072_14737 : (!torch.int, !torch.int) -> !torch.list<int>
    %10646 = torch.aten.view %10644, %10645 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // Weight is stored as [21504,3072]; transpose dims 0 and 1 to get [3072,21504].
    %__auto.sampler.single_blocks.32.linear1.weight = util.global.load @__auto.sampler.single_blocks.32.linear1.weight : tensor<21504x3072xf16>
    %10647 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.32.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_14738 = torch.constant.int 0
    %int1_14739 = torch.constant.int 1
    %10648 = torch.aten.transpose.int %10647, %int0_14738, %int1_14739 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.32.linear1.bias = util.global.load @__auto.sampler.single_blocks.32.linear1.bias : tensor<21504xf16>
    %10649 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.32.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Up-cast bias, activation and weight to f32 (dtype code 6 yields f32 here)
    // so the matmul and bias add are performed in f32.
    %int6_14740 = torch.constant.int 6
    %10650 = torch.prims.convert_element_type %10649, %int6_14740 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_14741 = torch.constant.int 6
    %10651 = torch.prims.convert_element_type %10646, %int6_14741 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_14742 = torch.constant.int 6
    %10652 = torch.prims.convert_element_type %10648, %int6_14742 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %10653 = torch.aten.mm %10651, %10652 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Multiplications by the integer constant 1 precede the bias add.
    // NOTE(review): presumably an addmm decomposition with alpha = beta = 1 - confirm.
    %int1_14743 = torch.constant.int 1
    %10654 = torch.aten.mul.Scalar %10653, %int1_14743 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_14744 = torch.constant.int 1
    %10655 = torch.aten.mul.Scalar %10650, %int1_14744 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_14745 = torch.constant.int 1
    %10656 = torch.aten.add.Tensor %10654, %10655, %int1_14745 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    // Cast back to f16 and restore the leading unit dimension: [1,4608,21504].
    %int5_14746 = torch.constant.int 5
    %10657 = torch.prims.convert_element_type %10656, %int5_14746 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_14747 = torch.constant.int 1
    %int4608_14748 = torch.constant.int 4608
    %int21504_14749 = torch.constant.int 21504
    %10658 = torch.prim.ListConstruct %int1_14747, %int4608_14748, %int21504_14749 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10659 = torch.aten.view %10657, %10658 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the last dimension: columns [0, 9216) -> %10660 (later reshaped to 3 x 24 x 128).
    %int-1_14750 = torch.constant.int -1
    %int0_14751 = torch.constant.int 0
    %int9216_14752 = torch.constant.int 9216
    %int1_14753 = torch.constant.int 1
    %10660 = torch.aten.slice.Tensor %10659, %int-1_14750, %int0_14751, %int9216_14752, %int1_14753 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    // Columns [9216, 21504) -> %10661, 12288 wide (later passed through GELU).
    %int-1_14754 = torch.constant.int -1
    %int9216_14755 = torch.constant.int 9216
    %int21504_14756 = torch.constant.int 21504
    %int1_14757 = torch.constant.int 1
    %10661 = torch.aten.slice.Tensor %10659, %int-1_14754, %int9216_14755, %int21504_14756, %int1_14757 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // ---- Split the 9216-wide slice into three per-head tensors ----
    // View [1,4608,9216] as [1,4608,3,24,128] (3 * 24 * 128 = 9216).
    %int1_14758 = torch.constant.int 1
    %int4608_14759 = torch.constant.int 4608
    %int3_14760 = torch.constant.int 3
    %int24_14761 = torch.constant.int 24
    %int128_14762 = torch.constant.int 128
    %10662 = torch.prim.ListConstruct %int1_14758, %int4608_14759, %int3_14760, %int24_14761, %int128_14762 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10663 = torch.aten.view %10660, %10662 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2, 0, 3, 1, 4): result is [3,1,24,4608,128].
    %int2_14763 = torch.constant.int 2
    %int0_14764 = torch.constant.int 0
    %int3_14765 = torch.constant.int 3
    %int1_14766 = torch.constant.int 1
    %int4_14767 = torch.constant.int 4
    %10664 = torch.prim.ListConstruct %int2_14763, %int0_14764, %int3_14765, %int1_14766, %int4_14767 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10665 = torch.aten.permute %10663, %10664 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0 along dim 0 -> %10666 (subsequently scaled by norm.query_norm.scale).
    %int0_14768 = torch.constant.int 0
    %int0_14769 = torch.constant.int 0
    %10666 = torch.aten.select.int %10665, %int0_14768, %int0_14769 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 1 along dim 0 -> %10667 (subsequently scaled by norm.key_norm.scale).
    %int0_14770 = torch.constant.int 0
    %int1_14771 = torch.constant.int 1
    %10667 = torch.aten.select.int %10665, %int0_14770, %int1_14771 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 2 along dim 0 -> %10668 (passed unmodified as the third tensor operand of the attention op).
    %int0_14772 = torch.constant.int 0
    %int2_14773 = torch.constant.int 2
    %10668 = torch.aten.select.int %10665, %int0_14772, %int2_14773 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- Root-mean-square normalization of %10666 and %10667 over the last (128) dim ----
    // Each tensor x is up-cast to f32 and transformed as x * rsqrt(mean(x^2, dim=-1) + eps),
    // cast back to f16 and multiplied by a learned [128] scale.
    // First tensor (%10666), scaled by single_blocks.32.norm.query_norm.scale:
    %int6_14774 = torch.constant.int 6
    %10669 = torch.prims.convert_element_type %10666, %int6_14774 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_14775 = torch.constant.int 2
    %10670 = torch.aten.pow.Tensor_Scalar %10669, %int2_14775 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean over dim -1 with keepdim = true and no dtype override: [1,24,4608,1].
    %int-1_14776 = torch.constant.int -1
    %10671 = torch.prim.ListConstruct %int-1_14776 : (!torch.int) -> !torch.list<int>
    %true_14777 = torch.constant.bool true
    %none_14778 = torch.constant.none
    %10672 = torch.aten.mean.dim %10670, %10671, %true_14777, %none_14778 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // eps = 9.9999999999999995E-7 (about 1e-6).
    %float9.999990e-07_14779 = torch.constant.float 9.9999999999999995E-7
    %int1_14780 = torch.constant.int 1
    %10673 = torch.aten.add.Scalar %10672, %float9.999990e-07_14779, %int1_14780 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %10674 = torch.aten.rsqrt %10673 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %10675 = torch.aten.mul.Tensor %10669, %10674 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_14781 = torch.constant.int 5
    %10676 = torch.prims.convert_element_type %10675, %int5_14781 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.32.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.32.norm.query_norm.scale : tensor<128xf16>
    %10677 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.32.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %10678 = torch.aten.mul.Tensor %10676, %10677 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Second tensor (%10667), same sequence, scaled by single_blocks.32.norm.key_norm.scale:
    %int6_14782 = torch.constant.int 6
    %10679 = torch.prims.convert_element_type %10667, %int6_14782 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_14783 = torch.constant.int 2
    %10680 = torch.aten.pow.Tensor_Scalar %10679, %int2_14783 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_14784 = torch.constant.int -1
    %10681 = torch.prim.ListConstruct %int-1_14784 : (!torch.int) -> !torch.list<int>
    %true_14785 = torch.constant.bool true
    %none_14786 = torch.constant.none
    %10682 = torch.aten.mean.dim %10680, %10681, %true_14785, %none_14786 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_14787 = torch.constant.float 9.9999999999999995E-7
    %int1_14788 = torch.constant.int 1
    %10683 = torch.aten.add.Scalar %10682, %float9.999990e-07_14787, %int1_14788 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %10684 = torch.aten.rsqrt %10683 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %10685 = torch.aten.mul.Tensor %10679, %10684 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_14789 = torch.constant.int 5
    %10686 = torch.prims.convert_element_type %10685, %int5_14789 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.32.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.32.norm.key_norm.scale : tensor<128xf16>
    %10687 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.32.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %10688 = torch.aten.mul.Tensor %10686, %10687 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- Pairwise 2x2 transform of the two normalized tensors using %211 ----
    // %211 has type [1,1,4608,64,2,2] f32 and is defined outside this excerpt.
    // NOTE(review): the op pattern looks like rotary position embedding application;
    // confirm against the definition of %211.
    // For each of the 64 pairs (x0, x1) along the last dimension the code computes
    //   out[i] = %211[..., i, 0] * x0 + %211[..., i, 1] * x1   for i in {0, 1}.
    //
    // These two casts have identical operand and result element types (f16 -> f16).
    %int5_14790 = torch.constant.int 5
    %10689 = torch.prims.convert_element_type %10678, %int5_14790 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_14791 = torch.constant.int 5
    %10690 = torch.prims.convert_element_type %10688, %int5_14791 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // First tensor: up-cast to f32 and view the 128 features as [64, 1, 2].
    %int6_14792 = torch.constant.int 6
    %10691 = torch.prims.convert_element_type %10689, %int6_14792 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_14793 = torch.constant.int 1
    %int24_14794 = torch.constant.int 24
    %int4608_14795 = torch.constant.int 4608
    %int64_14796 = torch.constant.int 64
    %int1_14797 = torch.constant.int 1
    %int2_14798 = torch.constant.int 2
    %10692 = torch.prim.ListConstruct %int1_14793, %int24_14794, %int4608_14795, %int64_14796, %int1_14797, %int2_14798 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10693 = torch.aten.view %10691, %10692 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Second tensor: same up-cast and view.
    %int6_14799 = torch.constant.int 6
    %10694 = torch.prims.convert_element_type %10690, %int6_14799 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_14800 = torch.constant.int 1
    %int24_14801 = torch.constant.int 24
    %int4608_14802 = torch.constant.int 4608
    %int64_14803 = torch.constant.int 64
    %int1_14804 = torch.constant.int 1
    %int2_14805 = torch.constant.int 2
    %10695 = torch.prim.ListConstruct %int1_14800, %int24_14801, %int4608_14802, %int64_14803, %int1_14804, %int2_14805 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10696 = torch.aten.view %10694, %10695 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // First tensor, term 0: %211[..., 0] * x[..., 0]; the size-1 head axis of %211
    // broadcasts against the 24 heads, and the size-1 axis of x against the size-2 axis of %211.
    %int5_14806 = torch.constant.int 5
    %int0_14807 = torch.constant.int 0
    %10697 = torch.aten.select.int %211, %int5_14806, %int0_14807 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_14808 = torch.constant.int 5
    %int0_14809 = torch.constant.int 0
    %10698 = torch.aten.select.int %10693, %int5_14808, %int0_14809 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10699 = torch.aten.mul.Tensor %10697, %10698 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // First tensor, term 1: %211[..., 1] * x[..., 1].
    %int5_14810 = torch.constant.int 5
    %int1_14811 = torch.constant.int 1
    %10700 = torch.aten.select.int %211, %int5_14810, %int1_14811 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_14812 = torch.constant.int 5
    %int1_14813 = torch.constant.int 1
    %10701 = torch.aten.select.int %10693, %int5_14812, %int1_14813 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10702 = torch.aten.mul.Tensor %10700, %10701 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two terms for the first tensor.
    %int1_14814 = torch.constant.int 1
    %10703 = torch.aten.add.Tensor %10699, %10702, %int1_14814 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second tensor, term 0 (the selects on %211 are repeated rather than reused).
    %int5_14815 = torch.constant.int 5
    %int0_14816 = torch.constant.int 0
    %10704 = torch.aten.select.int %211, %int5_14815, %int0_14816 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_14817 = torch.constant.int 5
    %int0_14818 = torch.constant.int 0
    %10705 = torch.aten.select.int %10696, %int5_14817, %int0_14818 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10706 = torch.aten.mul.Tensor %10704, %10705 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second tensor, term 1.
    %int5_14819 = torch.constant.int 5
    %int1_14820 = torch.constant.int 1
    %10707 = torch.aten.select.int %211, %int5_14819, %int1_14820 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_14821 = torch.constant.int 5
    %int1_14822 = torch.constant.int 1
    %10708 = torch.aten.select.int %10696, %int5_14821, %int1_14822 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10709 = torch.aten.mul.Tensor %10707, %10708 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two terms for the second tensor.
    %int1_14823 = torch.constant.int 1
    %10710 = torch.aten.add.Tensor %10706, %10709, %int1_14823 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Flatten [64,2] back to 128 features and cast to f16: first tensor -> %10713.
    %int1_14824 = torch.constant.int 1
    %int24_14825 = torch.constant.int 24
    %int4608_14826 = torch.constant.int 4608
    %int128_14827 = torch.constant.int 128
    %10711 = torch.prim.ListConstruct %int1_14824, %int24_14825, %int4608_14826, %int128_14827 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10712 = torch.aten.view %10703, %10711 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_14828 = torch.constant.int 5
    %10713 = torch.prims.convert_element_type %10712, %int5_14828 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Same flatten and cast for the second tensor -> %10716.
    %int1_14829 = torch.constant.int 1
    %int24_14830 = torch.constant.int 24
    %int4608_14831 = torch.constant.int 4608
    %int128_14832 = torch.constant.int 128
    %10714 = torch.prim.ListConstruct %int1_14829, %int24_14830, %int4608_14831, %int128_14832 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10715 = torch.aten.view %10710, %10714 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_14833 = torch.constant.int 5
    %10716 = torch.prims.convert_element_type %10715, %int5_14833 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- Attention, head merge, GELU branch and concatenation ----
    // The attention op is emitted as a generic torch.operator. Tensor operands are
    // %10713, %10716 and %10668; the trailing operands are 0.0, false, none, none.
    // NOTE(review): per the PyTorch signature these are presumably dropout_p,
    // is_causal, attn_mask and scale - confirm against the PyTorch op schema.
    %float0.000000e00_14834 = torch.constant.float 0.000000e+00
    %false_14835 = torch.constant.bool false
    %none_14836 = torch.constant.none
    %none_14837 = torch.constant.none
    %10717:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%10713, %10716, %10668, %float0.000000e00_14834, %false_14835, %none_14836, %none_14837) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Only result #0 is consumed in this section. Permute (0, 2, 1, 3) swaps the
    // 24 and 4608 axes: [1,24,4608,128] -> [1,4608,24,128].
    %int0_14838 = torch.constant.int 0
    %int2_14839 = torch.constant.int 2
    %int1_14840 = torch.constant.int 1
    %int3_14841 = torch.constant.int 3
    %10718 = torch.prim.ListConstruct %int0_14838, %int2_14839, %int1_14840, %int3_14841 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10719 = torch.aten.permute %10717#0, %10718 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the 24 x 128 axes into 3072 features.
    %int1_14842 = torch.constant.int 1
    %int4608_14843 = torch.constant.int 4608
    %int3072_14844 = torch.constant.int 3072
    %10720 = torch.prim.ListConstruct %int1_14842, %int4608_14843, %int3072_14844 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10721 = torch.aten.view %10719, %10720 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // GELU with the "tanh" approximation on the 12288-wide slice %10661.
    %str_14845 = torch.constant.str "tanh"
    %10722 = torch.aten.gelu %10661, %str_14845 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    // Concatenate along dim 2: 3072 + 12288 = 15360 features.
    %10723 = torch.prim.ListConstruct %10721, %10722 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_14846 = torch.constant.int 2
    %10724 = torch.aten.cat %10723, %int2_14846 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Drop the leading unit dimension so the tensor is a 2-D matmul operand.
    %int4608_14847 = torch.constant.int 4608
    %int15360_14848 = torch.constant.int 15360
    %10725 = torch.prim.ListConstruct %int4608_14847, %int15360_14848 : (!torch.int, !torch.int) -> !torch.list<int>
    %10726 = torch.aten.view %10724, %10725 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // ---- single_blocks.32.linear2: [4608,15360] x [15360,3072] + bias, gate, residual ----
    // Weight is stored as [3072,15360]; transpose dims 0 and 1 to get [15360,3072].
    %__auto.sampler.single_blocks.32.linear2.weight = util.global.load @__auto.sampler.single_blocks.32.linear2.weight : tensor<3072x15360xf16>
    %10727 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.32.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_14849 = torch.constant.int 0
    %int1_14850 = torch.constant.int 1
    %10728 = torch.aten.transpose.int %10727, %int0_14849, %int1_14850 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.32.linear2.bias = util.global.load @__auto.sampler.single_blocks.32.linear2.bias : tensor<3072xf16>
    %10729 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.32.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Up-cast bias, activation and weight to f32 (dtype code 6 yields f32 here)
    // so the matmul and bias add are performed in f32.
    %int6_14851 = torch.constant.int 6
    %10730 = torch.prims.convert_element_type %10729, %int6_14851 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_14852 = torch.constant.int 6
    %10731 = torch.prims.convert_element_type %10726, %int6_14852 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_14853 = torch.constant.int 6
    %10732 = torch.prims.convert_element_type %10728, %int6_14853 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %10733 = torch.aten.mm %10731, %10732 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Multiplications by the integer constant 1 precede the bias add.
    // NOTE(review): presumably an addmm decomposition with alpha = beta = 1 - confirm.
    %int1_14854 = torch.constant.int 1
    %10734 = torch.aten.mul.Scalar %10733, %int1_14854 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_14855 = torch.constant.int 1
    %10735 = torch.aten.mul.Scalar %10730, %int1_14855 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_14856 = torch.constant.int 1
    %10736 = torch.aten.add.Tensor %10734, %10735, %int1_14856 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    // Cast back to f16 and restore the leading unit dimension: [1,4608,3072].
    %int5_14857 = torch.constant.int 5
    %10737 = torch.prims.convert_element_type %10736, %int5_14857 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_14858 = torch.constant.int 1
    %int4608_14859 = torch.constant.int 4608
    %int3072_14860 = torch.constant.int 3072
    %10738 = torch.prim.ListConstruct %int1_14858, %int4608_14859, %int3072_14860 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10739 = torch.aten.view %10737, %10738 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Multiply by %10634 ([1,1,3072], the [6144, 9216) modulation chunk), broadcast over 4608 rows.
    %10740 = torch.aten.mul.Tensor %10634, %10739 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    // Residual add onto %10616, the tensor that was normalized at the start of this stage.
    %int1_14861 = torch.constant.int 1
    %10741 = torch.aten.add.Tensor %10616, %10740, %int1_14861 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // ---- single_blocks.33 modulation ----
    // Applies SiLU to %119 (defined outside this excerpt), feeds it through a
    // linear layer built from the single_blocks.33.modulation.lin weight/bias,
    // and splits the [1,1,9216] result into three [1,1,3072] chunks.
    %10742 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Weight is stored as [9216,3072]; transpose dims 0 and 1 to get [3072,9216] for the matmul.
    %__auto.sampler.single_blocks.33.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.33.modulation.lin.weight : tensor<9216x3072xf16>
    %10743 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.33.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_14862 = torch.constant.int 0
    %int1_14863 = torch.constant.int 1
    %10744 = torch.aten.transpose.int %10743, %int0_14862, %int1_14863 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.33.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.33.modulation.lin.bias : tensor<9216xf16>
    %10745 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.33.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // dtype code 6 produces f32 results here: bias, activation and weight are
    // all up-cast so that the matmul and the bias add are carried out in f32.
    %int6_14864 = torch.constant.int 6
    %10746 = torch.prims.convert_element_type %10745, %int6_14864 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_14865 = torch.constant.int 6
    %10747 = torch.prims.convert_element_type %10742, %int6_14865 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_14866 = torch.constant.int 6
    %10748 = torch.prims.convert_element_type %10744, %int6_14866 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %10749 = torch.aten.mm %10747, %10748 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Both the matmul result and the bias are multiplied by the integer constant 1
    // before being summed. NOTE(review): presumably leftovers of an addmm
    // decomposition with alpha = beta = 1 - not verifiable from this excerpt.
    %int1_14867 = torch.constant.int 1
    %10750 = torch.aten.mul.Scalar %10749, %int1_14867 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_14868 = torch.constant.int 1
    %10751 = torch.aten.mul.Scalar %10746, %int1_14868 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_14869 = torch.constant.int 1
    %10752 = torch.aten.add.Tensor %10750, %10751, %int1_14869 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    // dtype code 5 produces an f16 result here: cast the linear output back down.
    %int5_14870 = torch.constant.int 5
    %10753 = torch.prims.convert_element_type %10752, %int5_14870 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Slice over dim 0 with start 0, end INT64_MAX, step 1; the result type is
    // identical to the operand type ([1,9216]).
    %int0_14871 = torch.constant.int 0
    %int0_14872 = torch.constant.int 0
    %int9223372036854775807_14873 = torch.constant.int 9223372036854775807
    %int1_14874 = torch.constant.int 1
    %10754 = torch.aten.slice.Tensor %10753, %int0_14871, %int0_14872, %int9223372036854775807_14873, %int1_14874 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a unit dimension at index 1: [1,9216] -> [1,1,9216].
    %int1_14875 = torch.constant.int 1
    %10755 = torch.aten.unsqueeze %10754, %int1_14875 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Slice over dim 2 with start 0, end INT64_MAX, step 1; shape is unchanged again.
    %int2_14876 = torch.constant.int 2
    %int0_14877 = torch.constant.int 0
    %int9223372036854775807_14878 = torch.constant.int 9223372036854775807
    %int1_14879 = torch.constant.int 1
    %10756 = torch.aten.slice.Tensor %10755, %int2_14876, %int0_14877, %int9223372036854775807_14878, %int1_14879 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Chunk 1: last-dim range [0, 3072) -> %10757.
    %int-1_14880 = torch.constant.int -1
    %int0_14881 = torch.constant.int 0
    %int3072_14882 = torch.constant.int 3072
    %int1_14883 = torch.constant.int 1
    %10757 = torch.aten.slice.Tensor %10756, %int-1_14880, %int0_14881, %int3072_14882, %int1_14883 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2: last-dim range [3072, 6144) -> %10758.
    %int-1_14884 = torch.constant.int -1
    %int3072_14885 = torch.constant.int 3072
    %int6144_14886 = torch.constant.int 6144
    %int1_14887 = torch.constant.int 1
    %10758 = torch.aten.slice.Tensor %10756, %int-1_14884, %int3072_14885, %int6144_14886, %int1_14887 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 3: last-dim range [6144, 9216) -> %10759.
    %int-1_14888 = torch.constant.int -1
    %int6144_14889 = torch.constant.int 6144
    %int9216_14890 = torch.constant.int 9216
    %int1_14891 = torch.constant.int 1
    %10759 = torch.aten.slice.Tensor %10756, %int-1_14888, %int6144_14889, %int9216_14890, %int1_14891 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %10760 = chunk 2 + 1 (scalar 1, alpha 1).
    %int1_14892 = torch.constant.int 1
    %int1_14893 = torch.constant.int 1
    %10760 = torch.aten.add.Scalar %10758, %int1_14892, %int1_14893 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %10741 ([1,4608,3072] f16, defined above this chunk) over its last
    // axis with statistics computed in f32, then apply the modulation terms:
    //   %10769 = (1 + %10758) * normalize(%10741) + %10757
    // No learned weight or bias is applied in this normalization.
    %int6_14894 = torch.constant.int 6
    // f32 copy of the input, used only for the statistics.
    %10761 = torch.prims.convert_element_type %10741, %int6_14894 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_14895 = torch.constant.int 2
    %10762 = torch.prim.ListConstruct %int2_14895 : (!torch.int) -> !torch.list<int>
    %int0_14896 = torch.constant.int 0
    %true_14897 = torch.constant.bool true
    // Variance (result0) and mean (result1) over dim 2, correction = 0, keepdim = true.
    %result0_14898, %result1_14899 = torch.aten.var_mean.correction %10761, %10762, %int0_14896, %true_14897 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    %float9.999990e-07_14900 = torch.constant.float 9.9999999999999995E-7
    %int1_14901 = torch.constant.int 1
    // var + eps (eps is approximately 1e-6), then the reciprocal square root.
    %10763 = torch.aten.add.Scalar %result0_14898, %float9.999990e-07_14900, %int1_14901 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %10764 = torch.aten.rsqrt %10763 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    %int1_14902 = torch.constant.int 1
    // The f16 input minus the f32 mean; the result type is f32.
    %10765 = torch.aten.sub.Tensor %10741, %result1_14899, %int1_14902 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %10766 = torch.aten.mul.Tensor %10765, %10764 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_14903 = torch.constant.int 5
    // Back to f16 before modulation.
    %10767 = torch.prims.convert_element_type %10766, %int5_14903 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Scale by (1 + %10758), broadcast over the 4608 axis.
    %10768 = torch.aten.mul.Tensor %10760, %10767 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_14904 = torch.constant.int 1
    // Add the offset %10757 (alpha = 1).
    %10769 = torch.aten.add.Tensor %10768, %10757, %int1_14904 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int4608_14905 = torch.constant.int 4608
    %int3072_14906 = torch.constant.int 3072
    %10770 = torch.prim.ListConstruct %int4608_14905, %int3072_14906 : (!torch.int, !torch.int) -> !torch.list<int>
    // Drop the leading size-1 dim so the following matmul sees a 2-D operand.
    %10771 = torch.aten.view %10769, %10770 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // single_blocks.33.linear1: [4608,3072] x W^T ([3072,21504]) + bias.
    // Weight, bias and activations are stored in f16; all three are upcast to f32,
    // the matmul and bias add run in f32, and the result is cast back to f16.
    %__auto.sampler.single_blocks.33.linear1.weight = util.global.load @__auto.sampler.single_blocks.33.linear1.weight : tensor<21504x3072xf16>
    %10772 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.33.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_14907 = torch.constant.int 0
    %int1_14908 = torch.constant.int 1
    // The weight is stored as [out, in]; transpose it to [in, out] for mm.
    %10773 = torch.aten.transpose.int %10772, %int0_14907, %int1_14908 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.33.linear1.bias = util.global.load @__auto.sampler.single_blocks.33.linear1.bias : tensor<21504xf16>
    %10774 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.33.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    %int6_14909 = torch.constant.int 6
    %10775 = torch.prims.convert_element_type %10774, %int6_14909 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_14910 = torch.constant.int 6
    %10776 = torch.prims.convert_element_type %10771, %int6_14910 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_14911 = torch.constant.int 6
    %10777 = torch.prims.convert_element_type %10773, %int6_14911 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %10778 = torch.aten.mm %10776, %10777 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    %int1_14912 = torch.constant.int 1
    // The two multiplications by the integer 1 below leave their operands unchanged.
    // NOTE(review): they look like the alpha/beta factors of a decomposed addmm - confirm.
    %10779 = torch.aten.mul.Scalar %10778, %int1_14912 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_14913 = torch.constant.int 1
    %10780 = torch.aten.mul.Scalar %10775, %int1_14913 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_14914 = torch.constant.int 1
    // Bias add, broadcast over the 4608 rows.
    %10781 = torch.aten.add.Tensor %10779, %10780, %int1_14914 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_14915 = torch.constant.int 5
    %10782 = torch.prims.convert_element_type %10781, %int5_14915 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_14916 = torch.constant.int 1
    %int4608_14917 = torch.constant.int 4608
    %int21504_14918 = torch.constant.int 21504
    %10783 = torch.prim.ListConstruct %int1_14916, %int4608_14917, %int21504_14918 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // Restore the leading size-1 dim: [4608,21504] -> [1,4608,21504].
    %10784 = torch.aten.view %10782, %10783 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the linear1 output along its last axis:
    //   %10785 = [0, 9216)      -> reshaped below into 3 x 24 x 128
    //   %10786 = [9216, 21504)  -> 12288-wide branch that later goes through GELU
    // The 9216-wide part is then unpacked into three [1,24,4608,128] tensors, which
    // feed the attention op further down as its 1st, 2nd and 3rd operands.
    %int-1_14919 = torch.constant.int -1
    %int0_14920 = torch.constant.int 0
    %int9216_14921 = torch.constant.int 9216
    %int1_14922 = torch.constant.int 1
    %10785 = torch.aten.slice.Tensor %10784, %int-1_14919, %int0_14920, %int9216_14921, %int1_14922 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_14923 = torch.constant.int -1
    %int9216_14924 = torch.constant.int 9216
    %int21504_14925 = torch.constant.int 21504
    %int1_14926 = torch.constant.int 1
    %10786 = torch.aten.slice.Tensor %10784, %int-1_14923, %int9216_14924, %int21504_14925, %int1_14926 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    %int1_14927 = torch.constant.int 1
    %int4608_14928 = torch.constant.int 4608
    %int3_14929 = torch.constant.int 3
    %int24_14930 = torch.constant.int 24
    %int128_14931 = torch.constant.int 128
    %10787 = torch.prim.ListConstruct %int1_14927, %int4608_14928, %int3_14929, %int24_14930, %int128_14931 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // 9216 = 3 * 24 * 128: view as [1, 4608, 3, 24, 128].
    %10788 = torch.aten.view %10785, %10787 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    %int2_14932 = torch.constant.int 2
    %int0_14933 = torch.constant.int 0
    %int3_14934 = torch.constant.int 3
    %int1_14935 = torch.constant.int 1
    %int4_14936 = torch.constant.int 4
    %10789 = torch.prim.ListConstruct %int2_14932, %int0_14933, %int3_14934, %int1_14935, %int4_14936 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // Permutation (2,0,3,1,4): move the size-3 axis to the front and swap the
    // 4608 and 24 axes -> [3, 1, 24, 4608, 128].
    %10790 = torch.aten.permute %10788, %10789 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    %int0_14937 = torch.constant.int 0
    %int0_14938 = torch.constant.int 0
    // Index 0 along dim 0 (normalized and rotated below, then attention operand 1).
    %10791 = torch.aten.select.int %10790, %int0_14937, %int0_14938 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_14939 = torch.constant.int 0
    %int1_14940 = torch.constant.int 1
    // Index 1 along dim 0 (normalized and rotated below, then attention operand 2).
    %10792 = torch.aten.select.int %10790, %int0_14939, %int1_14940 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_14941 = torch.constant.int 0
    %int2_14942 = torch.constant.int 2
    // Index 2 along dim 0 (passed unmodified as attention operand 3).
    %10793 = torch.aten.select.int %10790, %int0_14941, %int2_14942 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Root-mean-square normalization of %10791 and %10792 over the last (128) axis,
    // each followed by a learned per-channel scale:
    //   y = x * rsqrt(mean(x^2, dim=-1, keepdim) + eps) * scale
    // The statistics and the division are done in f32; the scale multiply is in f16.
    // No mean subtraction is performed.
    // --- first tensor (%10791), scaled by norm.query_norm.scale ---
    %int6_14943 = torch.constant.int 6
    %10794 = torch.prims.convert_element_type %10791, %int6_14943 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_14944 = torch.constant.int 2
    %10795 = torch.aten.pow.Tensor_Scalar %10794, %int2_14944 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_14945 = torch.constant.int -1
    %10796 = torch.prim.ListConstruct %int-1_14945 : (!torch.int) -> !torch.list<int>
    %true_14946 = torch.constant.bool true
    %none_14947 = torch.constant.none
    // Mean of squares over the last axis, keepdim = true.
    %10797 = torch.aten.mean.dim %10795, %10796, %true_14946, %none_14947 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_14948 = torch.constant.float 9.9999999999999995E-7
    %int1_14949 = torch.constant.int 1
    %10798 = torch.aten.add.Scalar %10797, %float9.999990e-07_14948, %int1_14949 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %10799 = torch.aten.rsqrt %10798 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %10800 = torch.aten.mul.Tensor %10794, %10799 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_14950 = torch.constant.int 5
    %10801 = torch.prims.convert_element_type %10800, %int5_14950 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.33.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.33.norm.query_norm.scale : tensor<128xf16>
    %10802 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.33.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    // The [128] scale broadcasts over the leading dims.
    %10803 = torch.aten.mul.Tensor %10801, %10802 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // --- second tensor (%10792), scaled by norm.key_norm.scale ---
    %int6_14951 = torch.constant.int 6
    %10804 = torch.prims.convert_element_type %10792, %int6_14951 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_14952 = torch.constant.int 2
    %10805 = torch.aten.pow.Tensor_Scalar %10804, %int2_14952 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_14953 = torch.constant.int -1
    %10806 = torch.prim.ListConstruct %int-1_14953 : (!torch.int) -> !torch.list<int>
    %true_14954 = torch.constant.bool true
    %none_14955 = torch.constant.none
    %10807 = torch.aten.mean.dim %10805, %10806, %true_14954, %none_14955 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_14956 = torch.constant.float 9.9999999999999995E-7
    %int1_14957 = torch.constant.int 1
    %10808 = torch.aten.add.Scalar %10807, %float9.999990e-07_14956, %int1_14957 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %10809 = torch.aten.rsqrt %10808 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %10810 = torch.aten.mul.Tensor %10804, %10809 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_14958 = torch.constant.int 5
    %10811 = torch.prims.convert_element_type %10810, %int5_14958 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.33.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.33.norm.key_norm.scale : tensor<128xf16>
    %10812 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.33.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %10813 = torch.aten.mul.Tensor %10811, %10812 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Mix each adjacent pair of channels of the two normalized tensors (%10803 and
    // %10813) with the per-position 2x2 coefficients held in %211
    // ([1,1,4608,64,2,2] f32, defined above this chunk). With x viewed as
    // [..., 64, 1, 2] the computation is
    //   out[..., j, i] = %211[..., j, i, 0] * x[..., j, 0, 0]
    //                  + %211[..., j, i, 1] * x[..., j, 0, 1]
    // i.e. a 2x2 matrix-vector product per (position, pair), broadcast over the 24 axis.
    // NOTE(review): this matches the usual rotary position embedding formulation;
    // the contents of %211 are not visible here - confirm against its producer.
    %int5_14959 = torch.constant.int 5
    // f16 -> f16 conversions: operand and result types are identical.
    %10814 = torch.prims.convert_element_type %10803, %int5_14959 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_14960 = torch.constant.int 5
    %10815 = torch.prims.convert_element_type %10813, %int5_14960 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int6_14961 = torch.constant.int 6
    // Upcast the first tensor to f32 and view 128 channels as 64 pairs: [...,64,1,2].
    %10816 = torch.prims.convert_element_type %10814, %int6_14961 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_14962 = torch.constant.int 1
    %int24_14963 = torch.constant.int 24
    %int4608_14964 = torch.constant.int 4608
    %int64_14965 = torch.constant.int 64
    %int1_14966 = torch.constant.int 1
    %int2_14967 = torch.constant.int 2
    %10817 = torch.prim.ListConstruct %int1_14962, %int24_14963, %int4608_14964, %int64_14965, %int1_14966, %int2_14967 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10818 = torch.aten.view %10816, %10817 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    %int6_14968 = torch.constant.int 6
    // Same upcast and pair view for the second tensor.
    %10819 = torch.prims.convert_element_type %10815, %int6_14968 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_14969 = torch.constant.int 1
    %int24_14970 = torch.constant.int 24
    %int4608_14971 = torch.constant.int 4608
    %int64_14972 = torch.constant.int 64
    %int1_14973 = torch.constant.int 1
    %int2_14974 = torch.constant.int 2
    %10820 = torch.prim.ListConstruct %int1_14969, %int24_14970, %int4608_14971, %int64_14972, %int1_14973, %int2_14974 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10821 = torch.aten.view %10819, %10820 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // --- first tensor: coefficient column 0 times pair element 0 ---
    %int5_14975 = torch.constant.int 5
    %int0_14976 = torch.constant.int 0
    %10822 = torch.aten.select.int %211, %int5_14975, %int0_14976 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_14977 = torch.constant.int 5
    %int0_14978 = torch.constant.int 0
    %10823 = torch.aten.select.int %10818, %int5_14977, %int0_14978 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10824 = torch.aten.mul.Tensor %10822, %10823 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // --- first tensor: coefficient column 1 times pair element 1 ---
    %int5_14979 = torch.constant.int 5
    %int1_14980 = torch.constant.int 1
    %10825 = torch.aten.select.int %211, %int5_14979, %int1_14980 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_14981 = torch.constant.int 5
    %int1_14982 = torch.constant.int 1
    %10826 = torch.aten.select.int %10818, %int5_14981, %int1_14982 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10827 = torch.aten.mul.Tensor %10825, %10826 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_14983 = torch.constant.int 1
    // Sum of the two products: mixed first tensor, [1,24,4608,64,2].
    %10828 = torch.aten.add.Tensor %10824, %10827, %int1_14983 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // --- second tensor: the same two products; the selects on %211 repeat the
    // ones above with identical operands ---
    %int5_14984 = torch.constant.int 5
    %int0_14985 = torch.constant.int 0
    %10829 = torch.aten.select.int %211, %int5_14984, %int0_14985 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_14986 = torch.constant.int 5
    %int0_14987 = torch.constant.int 0
    %10830 = torch.aten.select.int %10821, %int5_14986, %int0_14987 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10831 = torch.aten.mul.Tensor %10829, %10830 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_14988 = torch.constant.int 5
    %int1_14989 = torch.constant.int 1
    %10832 = torch.aten.select.int %211, %int5_14988, %int1_14989 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_14990 = torch.constant.int 5
    %int1_14991 = torch.constant.int 1
    %10833 = torch.aten.select.int %10821, %int5_14990, %int1_14991 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10834 = torch.aten.mul.Tensor %10832, %10833 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_14992 = torch.constant.int 1
    %10835 = torch.aten.add.Tensor %10831, %10834, %int1_14992 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Collapse the pair axis back to 128 channels and return to f16 (first tensor).
    %int1_14993 = torch.constant.int 1
    %int24_14994 = torch.constant.int 24
    %int4608_14995 = torch.constant.int 4608
    %int128_14996 = torch.constant.int 128
    %10836 = torch.prim.ListConstruct %int1_14993, %int24_14994, %int4608_14995, %int128_14996 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10837 = torch.aten.view %10828, %10836 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_14997 = torch.constant.int 5
    %10838 = torch.prims.convert_element_type %10837, %int5_14997 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Same collapse and downcast for the second tensor.
    %int1_14998 = torch.constant.int 1
    %int24_14999 = torch.constant.int 24
    %int4608_15000 = torch.constant.int 4608
    %int128_15001 = torch.constant.int 128
    %10839 = torch.prim.ListConstruct %int1_14998, %int24_14999, %int4608_15000, %int128_15001 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10840 = torch.aten.view %10835, %10839 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_15002 = torch.constant.int 5
    %10841 = torch.prims.convert_element_type %10840, %int5_15002 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Attention over the 4608 positions with 24 heads of width 128, followed by a
    // merge of the head axis back into a 3072-wide feature axis.
    // Operands: %10838, %10841 (the two rotated tensors) and %10793, then the
    // scalars 0.0, false, none, none.
    // NOTE(review): per the PyTorch signature of this op these are presumably
    // (query, key, value, dropout_p, is_causal, attn_mask, scale) - confirm.
    %float0.000000e00_15003 = torch.constant.float 0.000000e+00
    %false_15004 = torch.constant.bool false
    %none_15005 = torch.constant.none
    %none_15006 = torch.constant.none
    // Result #0 is the [1,24,4608,128] f16 output; result #1 ([1,24,4608] f32) is
    // not used in the visible part of this function.
    %10842:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%10838, %10841, %10793, %float0.000000e00_15003, %false_15004, %none_15005, %none_15006) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    %int0_15007 = torch.constant.int 0
    %int2_15008 = torch.constant.int 2
    %int1_15009 = torch.constant.int 1
    %int3_15010 = torch.constant.int 3
    %10843 = torch.prim.ListConstruct %int0_15007, %int2_15008, %int1_15009, %int3_15010 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // Permutation (0,2,1,3): [1,24,4608,128] -> [1,4608,24,128].
    %10844 = torch.aten.permute %10842#0, %10843 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_15011 = torch.constant.int 1
    %int4608_15012 = torch.constant.int 4608
    %int3072_15013 = torch.constant.int 3072
    %10845 = torch.prim.ListConstruct %int1_15011, %int4608_15012, %int3072_15013 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // Merge 24 x 128 into 3072: [1,4608,24,128] -> [1,4608,3072].
    %10846 = torch.aten.view %10844, %10845 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Apply GELU with the "tanh" approximation to the 12288-wide slice of the
    // linear1 output (%10786), concatenate it after the attention output along the
    // last axis (3072 + 12288 = 15360), and flatten to 2-D for the next matmul.
    %str_15014 = torch.constant.str "tanh"
    %10847 = torch.aten.gelu %10786, %str_15014 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    // Order matters: attention output first, activated branch second.
    %10848 = torch.prim.ListConstruct %10846, %10847 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_15015 = torch.constant.int 2
    %10849 = torch.aten.cat %10848, %int2_15015 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    %int4608_15016 = torch.constant.int 4608
    %int15360_15017 = torch.constant.int 15360
    %10850 = torch.prim.ListConstruct %int4608_15016, %int15360_15017 : (!torch.int, !torch.int) -> !torch.list<int>
    %10851 = torch.aten.view %10849, %10850 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // single_blocks.33.linear2: [4608,15360] x W^T ([15360,3072]) + bias, computed in
    // f32 and cast back to f16. The result is multiplied by the third modulation
    // piece (%10759) and added to the block input %10741:
    //   %10866 = %10741 + %10759 * linear2(...)
    %__auto.sampler.single_blocks.33.linear2.weight = util.global.load @__auto.sampler.single_blocks.33.linear2.weight : tensor<3072x15360xf16>
    %10852 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.33.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_15018 = torch.constant.int 0
    %int1_15019 = torch.constant.int 1
    // The weight is stored as [out, in]; transpose it to [in, out] for mm.
    %10853 = torch.aten.transpose.int %10852, %int0_15018, %int1_15019 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.33.linear2.bias = util.global.load @__auto.sampler.single_blocks.33.linear2.bias : tensor<3072xf16>
    %10854 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.33.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_15020 = torch.constant.int 6
    %10855 = torch.prims.convert_element_type %10854, %int6_15020 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_15021 = torch.constant.int 6
    %10856 = torch.prims.convert_element_type %10851, %int6_15021 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_15022 = torch.constant.int 6
    %10857 = torch.prims.convert_element_type %10853, %int6_15022 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %10858 = torch.aten.mm %10856, %10857 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    %int1_15023 = torch.constant.int 1
    // Multiplications by the integer 1: values are unchanged.
    %10859 = torch.aten.mul.Scalar %10858, %int1_15023 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_15024 = torch.constant.int 1
    %10860 = torch.aten.mul.Scalar %10855, %int1_15024 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_15025 = torch.constant.int 1
    // Bias add, broadcast over the 4608 rows.
    %10861 = torch.aten.add.Tensor %10859, %10860, %int1_15025 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_15026 = torch.constant.int 5
    %10862 = torch.prims.convert_element_type %10861, %int5_15026 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_15027 = torch.constant.int 1
    %int4608_15028 = torch.constant.int 4608
    %int3072_15029 = torch.constant.int 3072
    %10863 = torch.prim.ListConstruct %int1_15027, %int4608_15028, %int3072_15029 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // Restore the leading size-1 dim: [4608,3072] -> [1,4608,3072].
    %10864 = torch.aten.view %10862, %10863 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Per-channel multiplier %10759 ([1,1,3072]) broadcast over the 4608 axis.
    %10865 = torch.aten.mul.Tensor %10759, %10864 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_15030 = torch.constant.int 1
    // Residual connection; %10866 is the input to the single_blocks.34 ops that follow.
    %10866 = torch.aten.add.Tensor %10741, %10865, %int1_15030 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.34.modulation: SiLU of %119 ([1,3072] f16, defined above this
    // chunk) followed by a 3072 -> 9216 linear layer, computed in f32 and cast
    // back to f16.
    // NOTE(review): %119 is presumably the shared conditioning vector - confirm
    // against its producer.
    %10867 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.34.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.34.modulation.lin.weight : tensor<9216x3072xf16>
    %10868 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.34.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_15031 = torch.constant.int 0
    %int1_15032 = torch.constant.int 1
    // The weight is stored as [out, in]; transpose it to [in, out] for mm.
    %10869 = torch.aten.transpose.int %10868, %int0_15031, %int1_15032 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.34.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.34.modulation.lin.bias : tensor<9216xf16>
    %10870 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.34.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    %int6_15033 = torch.constant.int 6
    %10871 = torch.prims.convert_element_type %10870, %int6_15033 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_15034 = torch.constant.int 6
    %10872 = torch.prims.convert_element_type %10867, %int6_15034 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_15035 = torch.constant.int 6
    %10873 = torch.prims.convert_element_type %10869, %int6_15035 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %10874 = torch.aten.mm %10872, %10873 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    %int1_15036 = torch.constant.int 1
    // Multiplications by the integer 1: values are unchanged.
    %10875 = torch.aten.mul.Scalar %10874, %int1_15036 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_15037 = torch.constant.int 1
    %10876 = torch.aten.mul.Scalar %10871, %int1_15037 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_15038 = torch.constant.int 1
    %10877 = torch.aten.add.Tensor %10875, %10876, %int1_15038 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_15039 = torch.constant.int 5
    %10878 = torch.prims.convert_element_type %10877, %int5_15039 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a singleton dim into the [1,9216] modulation output and cut the last
    // axis into three 3072-wide pieces:
    //   %10882 = [0,3072)     -> added after the normalized activations are scaled
    //   %10883 = [3072,6144)  -> incremented by 1 (%10885) and used as the multiplier
    //   %10884 = [6144,9216)  -> not consumed within the visible part of this function
    // NOTE(review): by analogy with the preceding block, %10884 presumably
    // multiplies this block's linear2 output later - confirm.
    %int0_15040 = torch.constant.int 0
    %int0_15041 = torch.constant.int 0
    %int9223372036854775807_15042 = torch.constant.int 9223372036854775807
    %int1_15043 = torch.constant.int 1
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1); shape is unchanged.
    %10879 = torch.aten.slice.Tensor %10878, %int0_15040, %int0_15041, %int9223372036854775807_15042, %int1_15043 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    %int1_15044 = torch.constant.int 1
    // [1,9216] -> [1,1,9216] so the pieces broadcast against [1,4608,3072] tensors.
    %10880 = torch.aten.unsqueeze %10879, %int1_15044 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int2_15045 = torch.constant.int 2
    %int0_15046 = torch.constant.int 0
    %int9223372036854775807_15047 = torch.constant.int 9223372036854775807
    %int1_15048 = torch.constant.int 1
    // Full-range slice on dim 2; shape is unchanged.
    %10881 = torch.aten.slice.Tensor %10880, %int2_15045, %int0_15046, %int9223372036854775807_15047, %int1_15048 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int-1_15049 = torch.constant.int -1
    %int0_15050 = torch.constant.int 0
    %int3072_15051 = torch.constant.int 3072
    %int1_15052 = torch.constant.int 1
    // Piece 1: last-axis range [0, 3072).
    %10882 = torch.aten.slice.Tensor %10881, %int-1_15049, %int0_15050, %int3072_15051, %int1_15052 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_15053 = torch.constant.int -1
    %int3072_15054 = torch.constant.int 3072
    %int6144_15055 = torch.constant.int 6144
    %int1_15056 = torch.constant.int 1
    // Piece 2: last-axis range [3072, 6144).
    %10883 = torch.aten.slice.Tensor %10881, %int-1_15053, %int3072_15054, %int6144_15055, %int1_15056 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_15057 = torch.constant.int -1
    %int6144_15058 = torch.constant.int 6144
    %int9216_15059 = torch.constant.int 9216
    %int1_15060 = torch.constant.int 1
    // Piece 3: last-axis range [6144, 9216).
    %10884 = torch.aten.slice.Tensor %10881, %int-1_15057, %int6144_15058, %int9216_15059, %int1_15060 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int1_15061 = torch.constant.int 1
    %int1_15062 = torch.constant.int 1
    // %10885 = %10883 + 1 (alpha = 1).
    %10885 = torch.aten.add.Scalar %10883, %int1_15061, %int1_15062 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %10866 (the output of the preceding residual add) over its last axis
    // with statistics computed in f32, then apply the modulation terms:
    //   %10894 = (1 + %10883) * normalize(%10866) + %10882
    // No learned weight or bias is applied in this normalization.
    %int6_15063 = torch.constant.int 6
    // f32 copy of the input, used only for the statistics.
    %10886 = torch.prims.convert_element_type %10866, %int6_15063 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_15064 = torch.constant.int 2
    %10887 = torch.prim.ListConstruct %int2_15064 : (!torch.int) -> !torch.list<int>
    %int0_15065 = torch.constant.int 0
    %true_15066 = torch.constant.bool true
    // Variance (result0) and mean (result1) over dim 2, correction = 0, keepdim = true.
    %result0_15067, %result1_15068 = torch.aten.var_mean.correction %10886, %10887, %int0_15065, %true_15066 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    %float9.999990e-07_15069 = torch.constant.float 9.9999999999999995E-7
    %int1_15070 = torch.constant.int 1
    // var + eps (eps is approximately 1e-6), then the reciprocal square root.
    %10888 = torch.aten.add.Scalar %result0_15067, %float9.999990e-07_15069, %int1_15070 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %10889 = torch.aten.rsqrt %10888 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    %int1_15071 = torch.constant.int 1
    // The f16 input minus the f32 mean; the result type is f32.
    %10890 = torch.aten.sub.Tensor %10866, %result1_15068, %int1_15071 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %10891 = torch.aten.mul.Tensor %10890, %10889 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_15072 = torch.constant.int 5
    // Back to f16 before modulation.
    %10892 = torch.prims.convert_element_type %10891, %int5_15072 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Scale by (1 + %10883), broadcast over the 4608 axis.
    %10893 = torch.aten.mul.Tensor %10885, %10892 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_15073 = torch.constant.int 1
    // Add the offset %10882 (alpha = 1).
    %10894 = torch.aten.add.Tensor %10893, %10882, %int1_15073 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int4608_15074 = torch.constant.int 4608
    %int3072_15075 = torch.constant.int 3072
    %10895 = torch.prim.ListConstruct %int4608_15074, %int3072_15075 : (!torch.int, !torch.int) -> !torch.list<int>
    // Drop the leading size-1 dim so the following matmul sees a 2-D operand.
    %10896 = torch.aten.view %10894, %10895 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // single_blocks.34.linear1: [4608,3072] x W^T ([3072,21504]) + bias.
    // Weight, bias and activations are stored in f16; all three are upcast to f32,
    // the matmul and bias add run in f32, and the result is cast back to f16.
    %__auto.sampler.single_blocks.34.linear1.weight = util.global.load @__auto.sampler.single_blocks.34.linear1.weight : tensor<21504x3072xf16>
    %10897 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.34.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_15076 = torch.constant.int 0
    %int1_15077 = torch.constant.int 1
    // The weight is stored as [out, in]; transpose it to [in, out] for mm.
    %10898 = torch.aten.transpose.int %10897, %int0_15076, %int1_15077 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.34.linear1.bias = util.global.load @__auto.sampler.single_blocks.34.linear1.bias : tensor<21504xf16>
    %10899 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.34.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    %int6_15078 = torch.constant.int 6
    %10900 = torch.prims.convert_element_type %10899, %int6_15078 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_15079 = torch.constant.int 6
    %10901 = torch.prims.convert_element_type %10896, %int6_15079 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_15080 = torch.constant.int 6
    %10902 = torch.prims.convert_element_type %10898, %int6_15080 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %10903 = torch.aten.mm %10901, %10902 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    %int1_15081 = torch.constant.int 1
    // Multiplications by the integer 1: values are unchanged.
    %10904 = torch.aten.mul.Scalar %10903, %int1_15081 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_15082 = torch.constant.int 1
    %10905 = torch.aten.mul.Scalar %10900, %int1_15082 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_15083 = torch.constant.int 1
    // Bias add, broadcast over the 4608 rows.
    %10906 = torch.aten.add.Tensor %10904, %10905, %int1_15083 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_15084 = torch.constant.int 5
    %10907 = torch.prims.convert_element_type %10906, %int5_15084 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_15085 = torch.constant.int 1
    %int4608_15086 = torch.constant.int 4608
    %int21504_15087 = torch.constant.int 21504
    %10908 = torch.prim.ListConstruct %int1_15085, %int4608_15086, %int21504_15087 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // Restore the leading size-1 dim: [4608,21504] -> [1,4608,21504].
    %10909 = torch.aten.view %10907, %10908 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the linear1 output along the last dim into two pieces:
    //   %10910 = [..., 0:9216]      -> reshaped below into three [24 x 128] groups
    //   %10911 = [..., 9216:21504]  -> 12288 features, consumed later by torch.aten.gelu
    %int-1_15088 = torch.constant.int -1
    %int0_15089 = torch.constant.int 0
    %int9216_15090 = torch.constant.int 9216
    %int1_15091 = torch.constant.int 1
    %10910 = torch.aten.slice.Tensor %10909, %int-1_15088, %int0_15089, %int9216_15090, %int1_15091 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_15092 = torch.constant.int -1
    %int9216_15093 = torch.constant.int 9216
    %int21504_15094 = torch.constant.int 21504
    %int1_15095 = torch.constant.int 1
    %10911 = torch.aten.slice.Tensor %10909, %int-1_15092, %int9216_15093, %int21504_15094, %int1_15095 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View the 9216 features as [3, 24, 128] per token: [1,4608,9216] -> [1,4608,3,24,128].
    %int1_15096 = torch.constant.int 1
    %int4608_15097 = torch.constant.int 4608
    %int3_15098 = torch.constant.int 3
    %int24_15099 = torch.constant.int 24
    %int128_15100 = torch.constant.int 128
    %10912 = torch.prim.ListConstruct %int1_15096, %int4608_15097, %int3_15098, %int24_15099, %int128_15100 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10913 = torch.aten.view %10910, %10912 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2,0,3,1,4): [1,4608,3,24,128] -> [3,1,24,4608,128].
    %int2_15101 = torch.constant.int 2
    %int0_15102 = torch.constant.int 0
    %int3_15103 = torch.constant.int 3
    %int1_15104 = torch.constant.int 1
    %int4_15105 = torch.constant.int 4
    %10914 = torch.prim.ListConstruct %int2_15101, %int0_15102, %int3_15103, %int1_15104, %int4_15105 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10915 = torch.aten.permute %10913, %10914 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index the leading dim of size 3. Index 0 (%10916) later gets query_norm.scale, index 1 (%10917)
    // gets key_norm.scale, and index 2 (%10918) is passed as the third operand of the attention op.
    %int0_15106 = torch.constant.int 0
    %int0_15107 = torch.constant.int 0
    %10916 = torch.aten.select.int %10915, %int0_15106, %int0_15107 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_15108 = torch.constant.int 0
    %int1_15109 = torch.constant.int 1
    %10917 = torch.aten.select.int %10915, %int0_15108, %int1_15109 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_15110 = torch.constant.int 0
    %int2_15111 = torch.constant.int 2
    %10918 = torch.aten.select.int %10915, %int0_15110, %int2_15111 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.34 query normalization (RMS form), computed in f32:
    //   y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + 1e-6)
    // followed by a cast to f16 and an element-wise multiply by the learned [128] scale.
    %int6_15112 = torch.constant.int 6
    %10919 = torch.prims.convert_element_type %10916, %int6_15112 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_15113 = torch.constant.int 2
    %10920 = torch.aten.pow.Tensor_Scalar %10919, %int2_15113 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean of squares over the last (128-wide) dim, keeping it as size 1 for broadcasting.
    %int-1_15114 = torch.constant.int -1
    %10921 = torch.prim.ListConstruct %int-1_15114 : (!torch.int) -> !torch.list<int>
    %true_15115 = torch.constant.bool true
    %none_15116 = torch.constant.none
    %10922 = torch.aten.mean.dim %10920, %10921, %true_15115, %none_15116 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // Epsilon (about 1e-6) added before the reciprocal square root.
    %float9.999990e-07_15117 = torch.constant.float 9.9999999999999995E-7
    %int1_15118 = torch.constant.int 1
    %10923 = torch.aten.add.Scalar %10922, %float9.999990e-07_15117, %int1_15118 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %10924 = torch.aten.rsqrt %10923 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %10925 = torch.aten.mul.Tensor %10919, %10924 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_15119 = torch.constant.int 5
    %10926 = torch.prims.convert_element_type %10925, %int5_15119 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Learned scale, applied in f16 and broadcast over all leading dims.
    %__auto.sampler.single_blocks.34.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.34.norm.query_norm.scale : tensor<128xf16>
    %10927 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.34.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %10928 = torch.aten.mul.Tensor %10926, %10927 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.34 key normalization (RMS form), computed in f32:
    //   y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + 1e-6)
    // followed by a cast to f16 and an element-wise multiply by the learned [128] scale.
    %int6_15120 = torch.constant.int 6
    %10929 = torch.prims.convert_element_type %10917, %int6_15120 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_15121 = torch.constant.int 2
    %10930 = torch.aten.pow.Tensor_Scalar %10929, %int2_15121 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean of squares over the last (128-wide) dim, keeping it as size 1 for broadcasting.
    %int-1_15122 = torch.constant.int -1
    %10931 = torch.prim.ListConstruct %int-1_15122 : (!torch.int) -> !torch.list<int>
    %true_15123 = torch.constant.bool true
    %none_15124 = torch.constant.none
    %10932 = torch.aten.mean.dim %10930, %10931, %true_15123, %none_15124 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // Epsilon (about 1e-6) added before the reciprocal square root.
    %float9.999990e-07_15125 = torch.constant.float 9.9999999999999995E-7
    %int1_15126 = torch.constant.int 1
    %10933 = torch.aten.add.Scalar %10932, %float9.999990e-07_15125, %int1_15126 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %10934 = torch.aten.rsqrt %10933 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %10935 = torch.aten.mul.Tensor %10929, %10934 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_15127 = torch.constant.int 5
    %10936 = torch.prims.convert_element_type %10935, %int5_15127 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Learned scale, applied in f16 and broadcast over all leading dims.
    %__auto.sampler.single_blocks.34.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.34.norm.key_norm.scale : tensor<128xf16>
    %10937 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.34.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %10938 = torch.aten.mul.Tensor %10936, %10937 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Combine the normalized query (%10928) and key (%10938) with the shared table %211
    // ([1,1,4608,64,2,2], f32). For each tensor x viewed as [1,24,4608,64,1,2]:
    //   out = %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]      (computed in f32)
    // and the [1,24,4608,64,2] result is flattened back to [1,24,4608,128] and cast to f16.
    // NOTE(review): %211 is defined outside this excerpt; it looks like the precomputed rotary
    // position-embedding (2x2 rotation per frequency) table -- confirm against its producer.
    //
    // The two f16 -> f16 conversions below do not change the element type.
    %int5_15128 = torch.constant.int 5
    %10939 = torch.prims.convert_element_type %10928, %int5_15128 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_15129 = torch.constant.int 5
    %10940 = torch.prims.convert_element_type %10938, %int5_15129 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Query: upcast to f32 and view the 128 features as 64 pairs, [..,128] -> [..,64,1,2].
    %int6_15130 = torch.constant.int 6
    %10941 = torch.prims.convert_element_type %10939, %int6_15130 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_15131 = torch.constant.int 1
    %int24_15132 = torch.constant.int 24
    %int4608_15133 = torch.constant.int 4608
    %int64_15134 = torch.constant.int 64
    %int1_15135 = torch.constant.int 1
    %int2_15136 = torch.constant.int 2
    %10942 = torch.prim.ListConstruct %int1_15131, %int24_15132, %int4608_15133, %int64_15134, %int1_15135, %int2_15136 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10943 = torch.aten.view %10941, %10942 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Key: same upcast and pair view.
    %int6_15137 = torch.constant.int 6
    %10944 = torch.prims.convert_element_type %10940, %int6_15137 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_15138 = torch.constant.int 1
    %int24_15139 = torch.constant.int 24
    %int4608_15140 = torch.constant.int 4608
    %int64_15141 = torch.constant.int 64
    %int1_15142 = torch.constant.int 1
    %int2_15143 = torch.constant.int 2
    %10945 = torch.prim.ListConstruct %int1_15138, %int24_15139, %int4608_15140, %int64_15141, %int1_15142, %int2_15143 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10946 = torch.aten.view %10944, %10945 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Query, first term: %211[..., 0] * q[..., 0]; the size-1 head dim of %211 broadcasts to 24.
    %int5_15144 = torch.constant.int 5
    %int0_15145 = torch.constant.int 0
    %10947 = torch.aten.select.int %211, %int5_15144, %int0_15145 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_15146 = torch.constant.int 5
    %int0_15147 = torch.constant.int 0
    %10948 = torch.aten.select.int %10943, %int5_15146, %int0_15147 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10949 = torch.aten.mul.Tensor %10947, %10948 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Query, second term: %211[..., 1] * q[..., 1].
    %int5_15148 = torch.constant.int 5
    %int1_15149 = torch.constant.int 1
    %10950 = torch.aten.select.int %211, %int5_15148, %int1_15149 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_15150 = torch.constant.int 5
    %int1_15151 = torch.constant.int 1
    %10951 = torch.aten.select.int %10943, %int5_15150, %int1_15151 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10952 = torch.aten.mul.Tensor %10950, %10951 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_15152 = torch.constant.int 1
    %10953 = torch.aten.add.Tensor %10949, %10952, %int1_15152 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key, first term: %211[..., 0] * k[..., 0].
    %int5_15153 = torch.constant.int 5
    %int0_15154 = torch.constant.int 0
    %10954 = torch.aten.select.int %211, %int5_15153, %int0_15154 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_15155 = torch.constant.int 5
    %int0_15156 = torch.constant.int 0
    %10955 = torch.aten.select.int %10946, %int5_15155, %int0_15156 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10956 = torch.aten.mul.Tensor %10954, %10955 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key, second term: %211[..., 1] * k[..., 1].
    %int5_15157 = torch.constant.int 5
    %int1_15158 = torch.constant.int 1
    %10957 = torch.aten.select.int %211, %int5_15157, %int1_15158 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_15159 = torch.constant.int 5
    %int1_15160 = torch.constant.int 1
    %10958 = torch.aten.select.int %10946, %int5_15159, %int1_15160 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10959 = torch.aten.mul.Tensor %10957, %10958 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_15161 = torch.constant.int 1
    %10960 = torch.aten.add.Tensor %10956, %10959, %int1_15161 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Query result: flatten the pairs back to 128 features and cast to f16.
    %int1_15162 = torch.constant.int 1
    %int24_15163 = torch.constant.int 24
    %int4608_15164 = torch.constant.int 4608
    %int128_15165 = torch.constant.int 128
    %10961 = torch.prim.ListConstruct %int1_15162, %int24_15163, %int4608_15164, %int128_15165 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10962 = torch.aten.view %10953, %10961 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_15166 = torch.constant.int 5
    %10963 = torch.prims.convert_element_type %10962, %int5_15166 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Key result: flatten the pairs back to 128 features and cast to f16.
    %int1_15167 = torch.constant.int 1
    %int24_15168 = torch.constant.int 24
    %int4608_15169 = torch.constant.int 4608
    %int128_15170 = torch.constant.int 128
    %10964 = torch.prim.ListConstruct %int1_15167, %int24_15168, %int4608_15169, %int128_15170 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10965 = torch.aten.view %10960, %10964 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_15171 = torch.constant.int 5
    %10966 = torch.prims.convert_element_type %10965, %int5_15171 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled dot-product attention over 24 heads of width 128 with q = %10963, k = %10966, v = %10918.
    // The remaining operands are 0.0, false, none, none.
    // NOTE(review): per the ATen schema of _scaled_dot_product_flash_attention_for_cpu these should be
    // dropout_p, is_causal, attn_mask and scale (none -> default scaling) -- confirm against the
    // PyTorch version used for export.
    %float0.000000e00_15172 = torch.constant.float 0.000000e+00
    %false_15173 = torch.constant.bool false
    %none_15174 = torch.constant.none
    %none_15175 = torch.constant.none
    // Two results: #0 is the [1,24,4608,128] f16 output; #1 ([1,24,4608], f32) is not used in the visible code.
    %10967:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%10963, %10966, %10918, %float0.000000e00_15172, %false_15173, %none_15174, %none_15175) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2: [1,24,4608,128] -> [1,4608,24,128].
    %int0_15176 = torch.constant.int 0
    %int2_15177 = torch.constant.int 2
    %int1_15178 = torch.constant.int 1
    %int3_15179 = torch.constant.int 3
    %10968 = torch.prim.ListConstruct %int0_15176, %int2_15177, %int1_15178, %int3_15179 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10969 = torch.aten.permute %10967#0, %10968 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the head dims: 24 * 128 -> 3072 features per token.
    %int1_15180 = torch.constant.int 1
    %int4608_15181 = torch.constant.int 4608
    %int3072_15182 = torch.constant.int 3072
    %10970 = torch.prim.ListConstruct %int1_15180, %int4608_15181, %int3072_15182 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10971 = torch.aten.view %10969, %10970 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Apply GELU (approximate = "tanh") to the 12288-wide slice %10911 of the linear1 output, then
    // concatenate it after the attention output along dim 2: 3072 + 12288 = 15360 features.
    %str_15183 = torch.constant.str "tanh"
    %10972 = torch.aten.gelu %10911, %str_15183 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %10973 = torch.prim.ListConstruct %10971, %10972 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_15184 = torch.constant.int 2
    %10974 = torch.aten.cat %10973, %int2_15184 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Drop the unit batch dim ([1,4608,15360] -> [4608,15360]) so the result can feed a 2-D torch.aten.mm.
    %int4608_15185 = torch.constant.int 4608
    %int15360_15186 = torch.constant.int 15360
    %10975 = torch.prim.ListConstruct %int4608_15185, %int15360_15186 : (!torch.int, !torch.int) -> !torch.list<int>
    %10976 = torch.aten.view %10974, %10975 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // single_blocks.34.linear2: y = x @ W^T + b, projecting 15360 -> 3072 features.
    // Weight and bias are stored as f16; the matmul and the bias add are carried out in f32
    // (dtype code 6) and the result is cast back to f16 (dtype code 5).
    %__auto.sampler.single_blocks.34.linear2.weight = util.global.load @__auto.sampler.single_blocks.34.linear2.weight : tensor<3072x15360xf16>
    %10977 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.34.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_15187 = torch.constant.int 0
    %int1_15188 = torch.constant.int 1
    // Weight is stored [out, in]; transpose to [in, out] for the mm.
    %10978 = torch.aten.transpose.int %10977, %int0_15187, %int1_15188 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.34.linear2.bias = util.global.load @__auto.sampler.single_blocks.34.linear2.bias : tensor<3072xf16>
    %10979 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.34.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, activations and weight to f32.
    %int6_15189 = torch.constant.int 6
    %10980 = torch.prims.convert_element_type %10979, %int6_15189 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_15190 = torch.constant.int 6
    %10981 = torch.prims.convert_element_type %10976, %int6_15190 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_15191 = torch.constant.int 6
    %10982 = torch.prims.convert_element_type %10978, %int6_15191 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %10983 = torch.aten.mm %10981, %10982 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Both operands are scaled by the integer constant 1, which leaves the values unchanged.
    // NOTE(review): presumably the alpha/beta factors left over from an addmm decomposition -- confirm.
    %int1_15192 = torch.constant.int 1
    %10984 = torch.aten.mul.Scalar %10983, %int1_15192 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_15193 = torch.constant.int 1
    %10985 = torch.aten.mul.Scalar %10980, %int1_15193 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_15194 = torch.constant.int 1
    // Bias add; the [3072] bias broadcasts over the 4608 rows.
    %10986 = torch.aten.add.Tensor %10984, %10985, %int1_15194 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_15195 = torch.constant.int 5
    %10987 = torch.prims.convert_element_type %10986, %int5_15195 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    // Restore the unit batch dim: [4608,3072] -> [1,4608,3072].
    %int1_15196 = torch.constant.int 1
    %int4608_15197 = torch.constant.int 4608
    %int3072_15198 = torch.constant.int 3072
    %10988 = torch.prim.ListConstruct %int1_15196, %int4608_15197, %int3072_15198 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10989 = torch.aten.view %10987, %10988 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Output of single_blocks.34: %10991 = %10866 + %10884 * linear2_out, in f16.
    // %10884 ([1,1,3072]) broadcasts over the 4608 tokens.
    // NOTE(review): %10866 and %10884 are defined above this excerpt; they look like the block's
    // residual input and the modulation gate respectively -- confirm against the producers.
    %10990 = torch.aten.mul.Tensor %10884, %10989 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_15199 = torch.constant.int 1
    %10991 = torch.aten.add.Tensor %10866, %10990, %int1_15199 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.35.modulation: SiLU on %119 ([1,3072], f16), then a linear layer
    // y = silu(%119) @ W^T + b projecting 3072 -> 9216 features. The matmul and the bias add are
    // carried out in f32 (dtype code 6) and the result is cast back to f16 (dtype code 5).
    // NOTE(review): %119 is defined outside this excerpt; presumably the conditioning vector
    // shared by all blocks -- confirm against its producer.
    %10992 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.35.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.35.modulation.lin.weight : tensor<9216x3072xf16>
    %10993 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.35.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_15200 = torch.constant.int 0
    %int1_15201 = torch.constant.int 1
    // Weight is stored [out, in]; transpose to [in, out] for the mm.
    %10994 = torch.aten.transpose.int %10993, %int0_15200, %int1_15201 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.35.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.35.modulation.lin.bias : tensor<9216xf16>
    %10995 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.35.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, activations and weight to f32.
    %int6_15202 = torch.constant.int 6
    %10996 = torch.prims.convert_element_type %10995, %int6_15202 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_15203 = torch.constant.int 6
    %10997 = torch.prims.convert_element_type %10992, %int6_15203 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_15204 = torch.constant.int 6
    %10998 = torch.prims.convert_element_type %10994, %int6_15204 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %10999 = torch.aten.mm %10997, %10998 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Both operands are scaled by the integer constant 1, which leaves the values unchanged.
    %int1_15205 = torch.constant.int 1
    %11000 = torch.aten.mul.Scalar %10999, %int1_15205 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_15206 = torch.constant.int 1
    %11001 = torch.aten.mul.Scalar %10996, %int1_15206 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_15207 = torch.constant.int 1
    %11002 = torch.aten.add.Tensor %11000, %11001, %int1_15207 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_15208 = torch.constant.int 5
    %11003 = torch.prims.convert_element_type %11002, %int5_15208 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Reshape the modulation vector to [1,1,9216] and cut it into three 3072-wide pieces:
    //   %11007 = [..., 0:3072]     -> later added to the normalized activations (shift)
    //   %11008 = [..., 3072:6144]  -> incremented by 1 (%11010), later multiplied in (scale)
    //   %11009 = [..., 6144:9216]  -> not used in the visible code
    // NOTE(review): %11009 is presumably the gate applied to this block's output -- confirm below.
    //
    // Full-range slice on dim 0 (end = INT64_MAX, step 1): the shape is unchanged.
    %int0_15209 = torch.constant.int 0
    %int0_15210 = torch.constant.int 0
    %int9223372036854775807_15211 = torch.constant.int 9223372036854775807
    %int1_15212 = torch.constant.int 1
    %11004 = torch.aten.slice.Tensor %11003, %int0_15209, %int0_15210, %int9223372036854775807_15211, %int1_15212 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a unit dim at position 1 so the vector broadcasts over the 4608 tokens.
    %int1_15213 = torch.constant.int 1
    %11005 = torch.aten.unsqueeze %11004, %int1_15213 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Full-range slice on dim 2: the shape is unchanged.
    %int2_15214 = torch.constant.int 2
    %int0_15215 = torch.constant.int 0
    %int9223372036854775807_15216 = torch.constant.int 9223372036854775807
    %int1_15217 = torch.constant.int 1
    %11006 = torch.aten.slice.Tensor %11005, %int2_15214, %int0_15215, %int9223372036854775807_15216, %int1_15217 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int-1_15218 = torch.constant.int -1
    %int0_15219 = torch.constant.int 0
    %int3072_15220 = torch.constant.int 3072
    %int1_15221 = torch.constant.int 1
    %11007 = torch.aten.slice.Tensor %11006, %int-1_15218, %int0_15219, %int3072_15220, %int1_15221 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_15222 = torch.constant.int -1
    %int3072_15223 = torch.constant.int 3072
    %int6144_15224 = torch.constant.int 6144
    %int1_15225 = torch.constant.int 1
    %11008 = torch.aten.slice.Tensor %11006, %int-1_15222, %int3072_15223, %int6144_15224, %int1_15225 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_15226 = torch.constant.int -1
    %int6144_15227 = torch.constant.int 6144
    %int9216_15228 = torch.constant.int 9216
    %int1_15229 = torch.constant.int 1
    %11009 = torch.aten.slice.Tensor %11006, %int-1_15226, %int6144_15227, %int9216_15228, %int1_15229 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %11010 = 1 + scale.
    %int1_15230 = torch.constant.int 1
    %int1_15231 = torch.constant.int 1
    %11010 = torch.aten.add.Scalar %11008, %int1_15230, %int1_15231 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // single_blocks.35 pre-norm: normalization over the feature dim with no learned affine, then modulation:
    //   n   = (x - mean(x)) * rsqrt(var(x) + 1e-6)     statistics over dim 2, in f32
    //   out = (1 + scale) * f16(n) + shift             in f16
    // where x = %10991, (1 + scale) = %11010 and shift = %11007.
    %int6_15232 = torch.constant.int 6
    %11011 = torch.prims.convert_element_type %10991, %int6_15232 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_15233 = torch.constant.int 2
    %11012 = torch.prim.ListConstruct %int2_15233 : (!torch.int) -> !torch.list<int>
    // correction = 0 (variance divides by N, not N - 1); keepdim = true.
    %int0_15234 = torch.constant.int 0
    %true_15235 = torch.constant.bool true
    // result0 is the variance, result1 is the mean.
    %result0_15236, %result1_15237 = torch.aten.var_mean.correction %11011, %11012, %int0_15234, %true_15235 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // Epsilon (about 1e-6) added to the variance before the reciprocal square root.
    %float9.999990e-07_15238 = torch.constant.float 9.9999999999999995E-7
    %int1_15239 = torch.constant.int 1
    %11013 = torch.aten.add.Scalar %result0_15236, %float9.999990e-07_15238, %int1_15239 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %11014 = torch.aten.rsqrt %11013 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    %int1_15240 = torch.constant.int 1
    // The subtraction takes the original f16 tensor %10991 (not the f32 copy %11011) and the f32
    // mean; the declared result type is f32.
    %11015 = torch.aten.sub.Tensor %10991, %result1_15237, %int1_15240 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %11016 = torch.aten.mul.Tensor %11015, %11014 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_15241 = torch.constant.int 5
    %11017 = torch.prims.convert_element_type %11016, %int5_15241 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Modulate: multiply by (1 + scale), then add shift; both broadcast from [1,1,3072].
    %11018 = torch.aten.mul.Tensor %11010, %11017 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_15242 = torch.constant.int 1
    %11019 = torch.aten.add.Tensor %11018, %11007, %int1_15242 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Drop the unit batch dim ([1,4608,3072] -> [4608,3072]) so the result can feed a 2-D torch.aten.mm.
    %int4608_15243 = torch.constant.int 4608
    %int3072_15244 = torch.constant.int 3072
    %11020 = torch.prim.ListConstruct %int4608_15243, %int3072_15244 : (!torch.int, !torch.int) -> !torch.list<int>
    %11021 = torch.aten.view %11019, %11020 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // single_blocks.35.linear1: y = x @ W^T + b, projecting 3072 -> 21504 features.
    // Weight and bias are stored as f16; the matmul and the bias add are carried out in f32
    // (dtype code 6) and the result is cast back to f16 (dtype code 5).
    %__auto.sampler.single_blocks.35.linear1.weight = util.global.load @__auto.sampler.single_blocks.35.linear1.weight : tensor<21504x3072xf16>
    %11022 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.35.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_15245 = torch.constant.int 0
    %int1_15246 = torch.constant.int 1
    // Weight is stored [out, in]; transpose to [in, out] for the mm.
    %11023 = torch.aten.transpose.int %11022, %int0_15245, %int1_15246 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.35.linear1.bias = util.global.load @__auto.sampler.single_blocks.35.linear1.bias : tensor<21504xf16>
    %11024 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.35.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Upcast bias, activations and weight to f32.
    %int6_15247 = torch.constant.int 6
    %11025 = torch.prims.convert_element_type %11024, %int6_15247 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_15248 = torch.constant.int 6
    %11026 = torch.prims.convert_element_type %11021, %int6_15248 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_15249 = torch.constant.int 6
    %11027 = torch.prims.convert_element_type %11023, %int6_15249 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %11028 = torch.aten.mm %11026, %11027 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Both operands are scaled by the integer constant 1, which leaves the values unchanged.
    // NOTE(review): presumably the alpha/beta factors left over from an addmm decomposition -- confirm.
    %int1_15250 = torch.constant.int 1
    %11029 = torch.aten.mul.Scalar %11028, %int1_15250 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_15251 = torch.constant.int 1
    %11030 = torch.aten.mul.Scalar %11025, %int1_15251 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_15252 = torch.constant.int 1
    // Bias add; the [21504] bias broadcasts over the 4608 rows.
    %11031 = torch.aten.add.Tensor %11029, %11030, %int1_15252 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_15253 = torch.constant.int 5
    %11032 = torch.prims.convert_element_type %11031, %int5_15253 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    // Restore the unit batch dim: [4608,21504] -> [1,4608,21504].
    %int1_15254 = torch.constant.int 1
    %int4608_15255 = torch.constant.int 4608
    %int21504_15256 = torch.constant.int 21504
    %11033 = torch.prim.ListConstruct %int1_15254, %int4608_15255, %int21504_15256 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11034 = torch.aten.view %11032, %11033 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the linear1 output along the last dim into two pieces:
    //   %11035 = [..., 0:9216]      -> reshaped below into three [24 x 128] groups
    //   %11036 = [..., 9216:21504]  -> 12288 features, not consumed in the visible code
    // NOTE(review): by analogy with single_blocks.34, %11036 is presumably the MLP branch fed to GELU.
    %int-1_15257 = torch.constant.int -1
    %int0_15258 = torch.constant.int 0
    %int9216_15259 = torch.constant.int 9216
    %int1_15260 = torch.constant.int 1
    %11035 = torch.aten.slice.Tensor %11034, %int-1_15257, %int0_15258, %int9216_15259, %int1_15260 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_15261 = torch.constant.int -1
    %int9216_15262 = torch.constant.int 9216
    %int21504_15263 = torch.constant.int 21504
    %int1_15264 = torch.constant.int 1
    %11036 = torch.aten.slice.Tensor %11034, %int-1_15261, %int9216_15262, %int21504_15263, %int1_15264 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View the 9216 features as [3, 24, 128] per token: [1,4608,9216] -> [1,4608,3,24,128].
    %int1_15265 = torch.constant.int 1
    %int4608_15266 = torch.constant.int 4608
    %int3_15267 = torch.constant.int 3
    %int24_15268 = torch.constant.int 24
    %int128_15269 = torch.constant.int 128
    %11037 = torch.prim.ListConstruct %int1_15265, %int4608_15266, %int3_15267, %int24_15268, %int128_15269 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11038 = torch.aten.view %11035, %11037 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2,0,3,1,4): [1,4608,3,24,128] -> [3,1,24,4608,128].
    %int2_15270 = torch.constant.int 2
    %int0_15271 = torch.constant.int 0
    %int3_15272 = torch.constant.int 3
    %int1_15273 = torch.constant.int 1
    %int4_15274 = torch.constant.int 4
    %11039 = torch.prim.ListConstruct %int2_15270, %int0_15271, %int3_15272, %int1_15273, %int4_15274 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11040 = torch.aten.permute %11038, %11039 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index the leading dim of size 3. Index 0 (%11041) later gets query_norm.scale.
    // NOTE(review): indices 1 (%11042) and 2 (%11043) are presumably key and value, matching
    // single_blocks.34 -- their consumers are outside the visible code.
    %int0_15275 = torch.constant.int 0
    %int0_15276 = torch.constant.int 0
    %11041 = torch.aten.select.int %11040, %int0_15275, %int0_15276 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_15277 = torch.constant.int 0
    %int1_15278 = torch.constant.int 1
    %11042 = torch.aten.select.int %11040, %int0_15277, %int1_15278 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_15279 = torch.constant.int 0
    %int2_15280 = torch.constant.int 2
    %11043 = torch.aten.select.int %11040, %int0_15279, %int2_15280 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.35 query RMS normalization, computed in f32:
    //   %11053 = f16(x * rsqrt(mean(x^2, dim=-1, keepdim=true) + ~1e-6)) * query_norm.scale
    // Upcast the query %11041 to f32 (dtype code 6 yields f32 here).
    %int6_15281 = torch.constant.int 6
    %11044 = torch.prims.convert_element_type %11041, %int6_15281 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Element-wise square.
    %int2_15282 = torch.constant.int 2
    %11045 = torch.aten.pow.Tensor_Scalar %11044, %int2_15282 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean of squares over the last (128-wide) dim, keeping the reduced dim.
    %int-1_15283 = torch.constant.int -1
    %11046 = torch.prim.ListConstruct %int-1_15283 : (!torch.int) -> !torch.list<int>
    %true_15284 = torch.constant.bool true
    %none_15285 = torch.constant.none
    %11047 = torch.aten.mean.dim %11045, %11046, %true_15284, %none_15285 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // Add the epsilon (~1e-6) before the reciprocal square root.
    %float9.999990e-07_15286 = torch.constant.float 9.9999999999999995E-7
    %int1_15287 = torch.constant.int 1
    %11048 = torch.aten.add.Scalar %11047, %float9.999990e-07_15286, %int1_15287 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %11049 = torch.aten.rsqrt %11048 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    // Normalize (broadcast over the last dim), then cast back to f16 (dtype code 5).
    %11050 = torch.aten.mul.Tensor %11044, %11049 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_15288 = torch.constant.int 5
    %11051 = torch.prims.convert_element_type %11050, %int5_15288 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Apply the learned per-channel scale ([128], f16).
    %__auto.sampler.single_blocks.35.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.35.norm.query_norm.scale : tensor<128xf16>
    %11052 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.35.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %11053 = torch.aten.mul.Tensor %11051, %11052 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.35 key RMS normalization; same recipe as the query path, on %11042:
    //   %11063 = f16(x * rsqrt(mean(x^2, dim=-1, keepdim=true) + ~1e-6)) * key_norm.scale
    // Upcast the key to f32.
    %int6_15289 = torch.constant.int 6
    %11054 = torch.prims.convert_element_type %11042, %int6_15289 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Element-wise square.
    %int2_15290 = torch.constant.int 2
    %11055 = torch.aten.pow.Tensor_Scalar %11054, %int2_15290 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean of squares over the last dim, keeping the reduced dim.
    %int-1_15291 = torch.constant.int -1
    %11056 = torch.prim.ListConstruct %int-1_15291 : (!torch.int) -> !torch.list<int>
    %true_15292 = torch.constant.bool true
    %none_15293 = torch.constant.none
    %11057 = torch.aten.mean.dim %11055, %11056, %true_15292, %none_15293 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // Add the epsilon (~1e-6), then take the reciprocal square root.
    %float9.999990e-07_15294 = torch.constant.float 9.9999999999999995E-7
    %int1_15295 = torch.constant.int 1
    %11058 = torch.aten.add.Scalar %11057, %float9.999990e-07_15294, %int1_15295 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %11059 = torch.aten.rsqrt %11058 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    // Normalize and cast back to f16.
    %11060 = torch.aten.mul.Tensor %11054, %11059 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_15296 = torch.constant.int 5
    %11061 = torch.prims.convert_element_type %11060, %int5_15296 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Apply the learned per-channel scale ([128], f16).
    %__auto.sampler.single_blocks.35.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.35.norm.key_norm.scale : tensor<128xf16>
    %11062 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.35.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %11063 = torch.aten.mul.Tensor %11061, %11062 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.35: prepare normalized q (%11053) and k (%11063) for the pairwise
    // 2x2 transform below. The two f16 -> f16 conversions leave the type unchanged.
    %int5_15297 = torch.constant.int 5
    %11064 = torch.prims.convert_element_type %11053, %int5_15297 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_15298 = torch.constant.int 5
    %11065 = torch.prims.convert_element_type %11063, %int5_15298 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Query: upcast to f32 and view the 128 channels as 64 pairs -> [1,24,4608,64,1,2].
    %int6_15299 = torch.constant.int 6
    %11066 = torch.prims.convert_element_type %11064, %int6_15299 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_15300 = torch.constant.int 1
    %int24_15301 = torch.constant.int 24
    %int4608_15302 = torch.constant.int 4608
    %int64_15303 = torch.constant.int 64
    %int1_15304 = torch.constant.int 1
    %int2_15305 = torch.constant.int 2
    %11067 = torch.prim.ListConstruct %int1_15300, %int24_15301, %int4608_15302, %int64_15303, %int1_15304, %int2_15305 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11068 = torch.aten.view %11066, %11067 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Key: same upcast and pair view.
    %int6_15306 = torch.constant.int 6
    %11069 = torch.prims.convert_element_type %11065, %int6_15306 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_15307 = torch.constant.int 1
    %int24_15308 = torch.constant.int 24
    %int4608_15309 = torch.constant.int 4608
    %int64_15310 = torch.constant.int 64
    %int1_15311 = torch.constant.int 1
    %int2_15312 = torch.constant.int 2
    %11070 = torch.prim.ListConstruct %int1_15307, %int24_15308, %int4608_15309, %int64_15310, %int1_15311, %int2_15312 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11071 = torch.aten.view %11069, %11070 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // single_blocks.35: apply the per-position 2x2 table %211 ([1,1,4608,64,2,2], f32,
    // defined outside this chunk) to each channel pair of q and k:
    //   out[..., i] = %211[..., i, 0] * x[..., 0] + %211[..., i, 1] * x[..., 1]
    // NOTE(review): %211 is presumably the rotary position-embedding table; confirm at its definition.
    // Query, first term: column 0 of the table times element 0 of each pair.
    %int5_15313 = torch.constant.int 5
    %int0_15314 = torch.constant.int 0
    %11072 = torch.aten.select.int %211, %int5_15313, %int0_15314 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_15315 = torch.constant.int 5
    %int0_15316 = torch.constant.int 0
    %11073 = torch.aten.select.int %11068, %int5_15315, %int0_15316 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %11074 = torch.aten.mul.Tensor %11072, %11073 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Query, second term: column 1 of the table times element 1 of each pair.
    %int5_15317 = torch.constant.int 5
    %int1_15318 = torch.constant.int 1
    %11075 = torch.aten.select.int %211, %int5_15317, %int1_15318 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_15319 = torch.constant.int 5
    %int1_15320 = torch.constant.int 1
    %11076 = torch.aten.select.int %11068, %int5_15319, %int1_15320 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %11077 = torch.aten.mul.Tensor %11075, %11076 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Query result: sum of the two terms (alpha = 1).
    %int1_15321 = torch.constant.int 1
    %11078 = torch.aten.add.Tensor %11074, %11077, %int1_15321 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key, first term.
    %int5_15322 = torch.constant.int 5
    %int0_15323 = torch.constant.int 0
    %11079 = torch.aten.select.int %211, %int5_15322, %int0_15323 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_15324 = torch.constant.int 5
    %int0_15325 = torch.constant.int 0
    %11080 = torch.aten.select.int %11071, %int5_15324, %int0_15325 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %11081 = torch.aten.mul.Tensor %11079, %11080 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key, second term.
    %int5_15326 = torch.constant.int 5
    %int1_15327 = torch.constant.int 1
    %11082 = torch.aten.select.int %211, %int5_15326, %int1_15327 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_15328 = torch.constant.int 5
    %int1_15329 = torch.constant.int 1
    %11083 = torch.aten.select.int %11071, %int5_15328, %int1_15329 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %11084 = torch.aten.mul.Tensor %11082, %11083 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key result: sum of the two terms (alpha = 1).
    %int1_15330 = torch.constant.int 1
    %11085 = torch.aten.add.Tensor %11081, %11084, %int1_15330 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // single_blocks.35: fold the 64x2 pairs back into 128 channels and return to f16.
    // Query: %11078 -> [1,24,4608,128] f32 -> f16 (%11088).
    %int1_15331 = torch.constant.int 1
    %int24_15332 = torch.constant.int 24
    %int4608_15333 = torch.constant.int 4608
    %int128_15334 = torch.constant.int 128
    %11086 = torch.prim.ListConstruct %int1_15331, %int24_15332, %int4608_15333, %int128_15334 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11087 = torch.aten.view %11078, %11086 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_15335 = torch.constant.int 5
    %11088 = torch.prims.convert_element_type %11087, %int5_15335 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Key: %11085 -> [1,24,4608,128] f32 -> f16 (%11091).
    %int1_15336 = torch.constant.int 1
    %int24_15337 = torch.constant.int 24
    %int4608_15338 = torch.constant.int 4608
    %int128_15339 = torch.constant.int 128
    %11089 = torch.prim.ListConstruct %int1_15336, %int24_15337, %int4608_15338, %int128_15339 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11090 = torch.aten.view %11085, %11089 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_15340 = torch.constant.int 5
    %11091 = torch.prims.convert_element_type %11090, %int5_15340 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.35: scaled-dot-product attention over q=%11088, k=%11091, v=%11043.
    // The trailing operands are 0.0, false, none, none.
    // NOTE(review): per the aten schema these are presumably dropout_p, is_causal,
    // attn_mask and scale; confirm against the op definition.
    %float0.000000e00_15341 = torch.constant.float 0.000000e+00
    %false_15342 = torch.constant.bool false
    %none_15343 = torch.constant.none
    %none_15344 = torch.constant.none
    // Result #0 is the attention output; result #1 ([1,24,4608], f32) is not used within this chunk.
    %11092:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%11088, %11091, %11043, %float0.000000e00_15341, %false_15342, %none_15343, %none_15344) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2: [1,24,4608,128] -> [1,4608,24,128].
    %int0_15345 = torch.constant.int 0
    %int2_15346 = torch.constant.int 2
    %int1_15347 = torch.constant.int 1
    %int3_15348 = torch.constant.int 3
    %11093 = torch.prim.ListConstruct %int0_15345, %int2_15346, %int1_15347, %int3_15348 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11094 = torch.aten.permute %11092#0, %11093 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the 24 x 128 head dims into 3072 channels.
    %int1_15349 = torch.constant.int 1
    %int4608_15350 = torch.constant.int 4608
    %int3072_15351 = torch.constant.int 3072
    %11095 = torch.prim.ListConstruct %int1_15349, %int4608_15350, %int3072_15351 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11096 = torch.aten.view %11094, %11095 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.35: tanh-approximated GELU on the 12288-wide slice %11036, then
    // concatenate [attention output (3072) | GELU output (12288)] along dim 2 -> 15360.
    %str_15352 = torch.constant.str "tanh"
    %11097 = torch.aten.gelu %11036, %str_15352 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %11098 = torch.prim.ListConstruct %11096, %11097 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_15353 = torch.constant.int 2
    %11099 = torch.aten.cat %11098, %int2_15353 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // single_blocks.35 linear2: y = x @ W^T + b, computed in f32 and stored as f16.
    // Drop the unit batch dim so the input is a 2-D matrix [4608,15360].
    %int4608_15354 = torch.constant.int 4608
    %int15360_15355 = torch.constant.int 15360
    %11100 = torch.prim.ListConstruct %int4608_15354, %int15360_15355 : (!torch.int, !torch.int) -> !torch.list<int>
    %11101 = torch.aten.view %11099, %11100 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // Load the weight ([3072,15360]) and transpose it to [15360,3072].
    %__auto.sampler.single_blocks.35.linear2.weight = util.global.load @__auto.sampler.single_blocks.35.linear2.weight : tensor<3072x15360xf16>
    %11102 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.35.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_15356 = torch.constant.int 0
    %int1_15357 = torch.constant.int 1
    %11103 = torch.aten.transpose.int %11102, %int0_15356, %int1_15357 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    // Load the bias ([3072]).
    %__auto.sampler.single_blocks.35.linear2.bias = util.global.load @__auto.sampler.single_blocks.35.linear2.bias : tensor<3072xf16>
    %11104 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.35.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, input and weight to f32 before the matmul.
    %int6_15358 = torch.constant.int 6
    %11105 = torch.prims.convert_element_type %11104, %int6_15358 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_15359 = torch.constant.int 6
    %11106 = torch.prims.convert_element_type %11101, %int6_15359 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_15360 = torch.constant.int 6
    %11107 = torch.prims.convert_element_type %11103, %int6_15360 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %11108 = torch.aten.mul.Scalar %11105, %int1_15362 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // single_blocks.35 output: scale the linear2 result by %11009 ([1,1,3072], broadcast
    // over the 4608 positions) and add it to %10991. Both operands are defined above this chunk.
    // NOTE(review): %11009 is presumably this block's modulation gate and %10991 the
    // block input (residual); confirm at their definitions.
    %11115 = torch.aten.mul.Tensor %11009, %11114 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_15368 = torch.constant.int 1
    %11116 = torch.aten.add.Tensor %10991, %11115, %int1_15368 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.36 modulation: SiLU(%119) -> linear (3072 -> 9216) -> three 3072-wide chunks.
    // NOTE(review): %119 ([1,3072], defined outside this chunk) is presumably the
    // conditioning vector shared by all blocks; confirm at its definition.
    %11117 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the modulation weight ([9216,3072]) and transpose it to [3072,9216].
    %__auto.sampler.single_blocks.36.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.36.modulation.lin.weight : tensor<9216x3072xf16>
    %11118 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.36.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_15369 = torch.constant.int 0
    %int1_15370 = torch.constant.int 1
    %11119 = torch.aten.transpose.int %11118, %int0_15369, %int1_15370 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    // Load the modulation bias ([9216]).
    %__auto.sampler.single_blocks.36.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.36.modulation.lin.bias : tensor<9216xf16>
    %11120 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.36.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, input and weight to f32 before the matmul.
    %int6_15371 = torch.constant.int 6
    %11121 = torch.prims.convert_element_type %11120, %int6_15371 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_15372 = torch.constant.int 6
    %11122 = torch.prims.convert_element_type %11117, %int6_15372 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_15373 = torch.constant.int 6
    %11123 = torch.prims.convert_element_type %11119, %int6_15373 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %11124 = torch.aten.mm %11122, %11123 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Both operands are multiplied by the integer 1, which leaves the values unchanged.
    %int1_15374 = torch.constant.int 1
    %11125 = torch.aten.mul.Scalar %11124, %int1_15374 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_15375 = torch.constant.int 1
    %11126 = torch.aten.mul.Scalar %11121, %int1_15375 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    // Add the bias and cast back to f16.
    %int1_15376 = torch.constant.int 1
    %11127 = torch.aten.add.Tensor %11125, %11126, %int1_15376 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_15377 = torch.constant.int 5
    %11128 = torch.prims.convert_element_type %11127, %int5_15377 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Full-range slice on dim 0 (shape unchanged), then insert a unit dim at position 1.
    %int0_15378 = torch.constant.int 0
    %int0_15379 = torch.constant.int 0
    %int9223372036854775807_15380 = torch.constant.int 9223372036854775807
    %int1_15381 = torch.constant.int 1
    %11129 = torch.aten.slice.Tensor %11128, %int0_15378, %int0_15379, %int9223372036854775807_15380, %int1_15381 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    %int1_15382 = torch.constant.int 1
    %11130 = torch.aten.unsqueeze %11129, %int1_15382 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Full-range slice on dim 2 (shape unchanged).
    %int2_15383 = torch.constant.int 2
    %int0_15384 = torch.constant.int 0
    %int9223372036854775807_15385 = torch.constant.int 9223372036854775807
    %int1_15386 = torch.constant.int 1
    %11131 = torch.aten.slice.Tensor %11130, %int2_15383, %int0_15384, %int9223372036854775807_15385, %int1_15386 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Chunk [0, 3072): added after the normalization below (shift).
    %int-1_15387 = torch.constant.int -1
    %int0_15388 = torch.constant.int 0
    %int3072_15389 = torch.constant.int 3072
    %int1_15390 = torch.constant.int 1
    %11132 = torch.aten.slice.Tensor %11131, %int-1_15387, %int0_15388, %int3072_15389, %int1_15390 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk [3072, 6144): becomes the multiplicative factor (1 + scale) below.
    %int-1_15391 = torch.constant.int -1
    %int3072_15392 = torch.constant.int 3072
    %int6144_15393 = torch.constant.int 6144
    %int1_15394 = torch.constant.int 1
    %11133 = torch.aten.slice.Tensor %11131, %int-1_15391, %int3072_15392, %int6144_15393, %int1_15394 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk [6144, 9216): not consumed within this chunk.
    // NOTE(review): presumably the gate applied to this block's output; confirm downstream.
    %int-1_15395 = torch.constant.int -1
    %int6144_15396 = torch.constant.int 6144
    %int9216_15397 = torch.constant.int 9216
    %int1_15398 = torch.constant.int 1
    %11134 = torch.aten.slice.Tensor %11131, %int-1_15395, %int6144_15396, %int9216_15397, %int1_15398 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %11135 = scale + 1.
    %int1_15399 = torch.constant.int 1
    %int1_15400 = torch.constant.int 1
    %11135 = torch.aten.add.Scalar %11133, %int1_15399, %int1_15400 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // single_blocks.36 pre-norm: normalize %11116 over the 3072-wide dim 2 with no learned
    // weight or bias, then modulate:
    //   %11144 = (1 + scale) * f16((x - mean) * rsqrt(var + ~1e-6)) + shift
    // Upcast the block input to f32 for the statistics.
    %int6_15401 = torch.constant.int 6
    %11136 = torch.prims.convert_element_type %11116, %int6_15401 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    // Variance (correction = 0) and mean over dim 2, keeping the reduced dim.
    %int2_15402 = torch.constant.int 2
    %11137 = torch.prim.ListConstruct %int2_15402 : (!torch.int) -> !torch.list<int>
    %int0_15403 = torch.constant.int 0
    %true_15404 = torch.constant.bool true
    %result0_15405, %result1_15406 = torch.aten.var_mean.correction %11136, %11137, %int0_15403, %true_15404 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // rsqrt(var + eps).
    %float9.999990e-07_15407 = torch.constant.float 9.9999999999999995E-7
    %int1_15408 = torch.constant.int 1
    %11138 = torch.aten.add.Scalar %result0_15405, %float9.999990e-07_15407, %int1_15408 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %11139 = torch.aten.rsqrt %11138 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // Center using the f16 input %11116 and the f32 mean; the result type is f32.
    %int1_15409 = torch.constant.int 1
    %11140 = torch.aten.sub.Tensor %11116, %result1_15406, %int1_15409 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %11141 = torch.aten.mul.Tensor %11140, %11139 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    // Cast back to f16, multiply by (1 + scale) = %11135, add shift = %11132.
    %int5_15410 = torch.constant.int 5
    %11142 = torch.prims.convert_element_type %11141, %int5_15410 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %11143 = torch.aten.mul.Tensor %11135, %11142 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_15411 = torch.constant.int 1
    %11144 = torch.aten.add.Tensor %11143, %11132, %int1_15411 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.36 linear1: y = x @ W^T + b (3072 -> 21504), computed in f32, stored as f16.
    // Drop the unit batch dim so the input is a 2-D matrix [4608,3072].
    %int4608_15412 = torch.constant.int 4608
    %int3072_15413 = torch.constant.int 3072
    %11145 = torch.prim.ListConstruct %int4608_15412, %int3072_15413 : (!torch.int, !torch.int) -> !torch.list<int>
    %11146 = torch.aten.view %11144, %11145 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // Load the weight ([21504,3072]) and transpose it to [3072,21504].
    %__auto.sampler.single_blocks.36.linear1.weight = util.global.load @__auto.sampler.single_blocks.36.linear1.weight : tensor<21504x3072xf16>
    %11147 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.36.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_15414 = torch.constant.int 0
    %int1_15415 = torch.constant.int 1
    %11148 = torch.aten.transpose.int %11147, %int0_15414, %int1_15415 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    // Load the bias ([21504]).
    %__auto.sampler.single_blocks.36.linear1.bias = util.global.load @__auto.sampler.single_blocks.36.linear1.bias : tensor<21504xf16>
    %11149 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.36.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Upcast bias, input and weight to f32 before the matmul.
    %int6_15416 = torch.constant.int 6
    %11150 = torch.prims.convert_element_type %11149, %int6_15416 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_15417 = torch.constant.int 6
    %11151 = torch.prims.convert_element_type %11146, %int6_15417 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_15418 = torch.constant.int 6
    %11152 = torch.prims.convert_element_type %11148, %int6_15418 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %11153 = torch.aten.mm %11151, %11152 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Both operands are multiplied by the integer 1, which leaves the values unchanged.
    %int1_15419 = torch.constant.int 1
    %11154 = torch.aten.mul.Scalar %11153, %int1_15419 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_15420 = torch.constant.int 1
    %11155 = torch.aten.mul.Scalar %11150, %int1_15420 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    // Add the bias, cast back to f16, and restore the unit batch dim.
    %int1_15421 = torch.constant.int 1
    %11156 = torch.aten.add.Tensor %11154, %11155, %int1_15421 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_15422 = torch.constant.int 5
    %11157 = torch.prims.convert_element_type %11156, %int5_15422 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_15423 = torch.constant.int 1
    %int4608_15424 = torch.constant.int 4608
    %int21504_15425 = torch.constant.int 21504
    %11158 = torch.prim.ListConstruct %int1_15423, %int4608_15424, %int21504_15425 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11159 = torch.aten.view %11157, %11158 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // single_blocks.36: split the fused linear1 output %11159 ([1,4608,21504]) and unpack heads.
    // Channels [0, 9216) of the last dim: reshaped below into three [1,24,4608,128] tensors.
    %int-1_15426 = torch.constant.int -1
    %int0_15427 = torch.constant.int 0
    %int9216_15428 = torch.constant.int 9216
    %int1_15429 = torch.constant.int 1
    %11160 = torch.aten.slice.Tensor %11159, %int-1_15426, %int0_15427, %int9216_15428, %int1_15429 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    // Channels [9216, 21504) of the last dim (12288 wide); not consumed within this chunk.
    // NOTE(review): presumably the MLP branch input, as in single_blocks.35; confirm downstream.
    %int-1_15430 = torch.constant.int -1
    %int9216_15431 = torch.constant.int 9216
    %int21504_15432 = torch.constant.int 21504
    %int1_15433 = torch.constant.int 1
    %11161 = torch.aten.slice.Tensor %11159, %int-1_15430, %int9216_15431, %int21504_15432, %int1_15433 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View the 9216 channels as [3,24,128].
    %int1_15434 = torch.constant.int 1
    %int4608_15435 = torch.constant.int 4608
    %int3_15436 = torch.constant.int 3
    %int24_15437 = torch.constant.int 24
    %int128_15438 = torch.constant.int 128
    %11162 = torch.prim.ListConstruct %int1_15434, %int4608_15435, %int3_15436, %int24_15437, %int128_15438 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11163 = torch.aten.view %11160, %11162 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2,0,3,1,4) so the size-3 axis leads: [3,1,24,4608,128].
    %int2_15439 = torch.constant.int 2
    %int0_15440 = torch.constant.int 0
    %int3_15441 = torch.constant.int 3
    %int1_15442 = torch.constant.int 1
    %int4_15443 = torch.constant.int 4
    %11164 = torch.prim.ListConstruct %int2_15439, %int0_15440, %int3_15441, %int1_15442, %int4_15443 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11165 = torch.aten.permute %11163, %11164 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0 of the leading axis: later scaled by query_norm.scale (query).
    %int0_15444 = torch.constant.int 0
    %int0_15445 = torch.constant.int 0
    %11166 = torch.aten.select.int %11165, %int0_15444, %int0_15445 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 1 of the leading axis: later scaled by key_norm.scale (key).
    %int0_15446 = torch.constant.int 0
    %int1_15447 = torch.constant.int 1
    %11167 = torch.aten.select.int %11165, %int0_15446, %int1_15447 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 2 of the leading axis; not consumed within this chunk.
    // NOTE(review): presumably the attention value, as in single_blocks.35; confirm downstream.
    %int0_15448 = torch.constant.int 0
    %int2_15449 = torch.constant.int 2
    %11168 = torch.aten.select.int %11165, %int0_15448, %int2_15449 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.36 query RMS normalization, computed in f32:
    //   %11178 = f16(x * rsqrt(mean(x^2, dim=-1, keepdim=true) + ~1e-6)) * query_norm.scale
    // Upcast the query %11166 to f32.
    %int6_15450 = torch.constant.int 6
    %11169 = torch.prims.convert_element_type %11166, %int6_15450 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Element-wise square.
    %int2_15451 = torch.constant.int 2
    %11170 = torch.aten.pow.Tensor_Scalar %11169, %int2_15451 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean of squares over the last dim, keeping the reduced dim.
    %int-1_15452 = torch.constant.int -1
    %11171 = torch.prim.ListConstruct %int-1_15452 : (!torch.int) -> !torch.list<int>
    %true_15453 = torch.constant.bool true
    %none_15454 = torch.constant.none
    %11172 = torch.aten.mean.dim %11170, %11171, %true_15453, %none_15454 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // Add the epsilon (~1e-6), then take the reciprocal square root.
    %float9.999990e-07_15455 = torch.constant.float 9.9999999999999995E-7
    %int1_15456 = torch.constant.int 1
    %11173 = torch.aten.add.Scalar %11172, %float9.999990e-07_15455, %int1_15456 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %11174 = torch.aten.rsqrt %11173 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    // Normalize and cast back to f16.
    %11175 = torch.aten.mul.Tensor %11169, %11174 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_15457 = torch.constant.int 5
    %11176 = torch.prims.convert_element_type %11175, %int5_15457 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Apply the learned per-channel scale ([128], f16).
    %__auto.sampler.single_blocks.36.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.36.norm.query_norm.scale : tensor<128xf16>
    %11177 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.36.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %11178 = torch.aten.mul.Tensor %11176, %11177 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.36 key RMS normalization; same recipe as the query path, on %11167:
    //   %11188 = f16(x * rsqrt(mean(x^2, dim=-1, keepdim=true) + ~1e-6)) * key_norm.scale
    // Upcast the key to f32.
    %int6_15458 = torch.constant.int 6
    %11179 = torch.prims.convert_element_type %11167, %int6_15458 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Element-wise square.
    %int2_15459 = torch.constant.int 2
    %11180 = torch.aten.pow.Tensor_Scalar %11179, %int2_15459 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean of squares over the last dim, keeping the reduced dim.
    %int-1_15460 = torch.constant.int -1
    %11181 = torch.prim.ListConstruct %int-1_15460 : (!torch.int) -> !torch.list<int>
    %true_15461 = torch.constant.bool true
    %none_15462 = torch.constant.none
    %11182 = torch.aten.mean.dim %11180, %11181, %true_15461, %none_15462 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // Add the epsilon (~1e-6), then take the reciprocal square root.
    %float9.999990e-07_15463 = torch.constant.float 9.9999999999999995E-7
    %int1_15464 = torch.constant.int 1
    %11183 = torch.aten.add.Scalar %11182, %float9.999990e-07_15463, %int1_15464 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %11184 = torch.aten.rsqrt %11183 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    // Normalize and cast back to f16.
    %11185 = torch.aten.mul.Tensor %11179, %11184 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_15465 = torch.constant.int 5
    %11186 = torch.prims.convert_element_type %11185, %int5_15465 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Apply the learned per-channel scale ([128], f16).
    %__auto.sampler.single_blocks.36.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.36.norm.key_norm.scale : tensor<128xf16>
    %11187 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.36.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %11188 = torch.aten.mul.Tensor %11186, %11187 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.36: combine the normalized tensors %11178 and %11188 with %211.
    // %211 is a [1,1,4608,64,2,2] f32 tensor defined before this excerpt
    // (presumably the rotary position-embedding coefficients -- confirm upstream).
    // For each input x viewed as [1,24,4608,64,1,2] the result is
    //   %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]
    // broadcast over the size-24 axis, then flattened back to [1,24,4608,128] and cast to f16.
    // The next two casts are f16 -> f16 (no dtype change according to the types).
    %int5_15466 = torch.constant.int 5
    %11189 = torch.prims.convert_element_type %11178, %int5_15466 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_15467 = torch.constant.int 5
    %11190 = torch.prims.convert_element_type %11188, %int5_15467 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // First input (%11189): upcast to f32 and split the last axis 128 -> 64 x 1 x 2.
    %int6_15468 = torch.constant.int 6
    %11191 = torch.prims.convert_element_type %11189, %int6_15468 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_15469 = torch.constant.int 1
    %int24_15470 = torch.constant.int 24
    %int4608_15471 = torch.constant.int 4608
    %int64_15472 = torch.constant.int 64
    %int1_15473 = torch.constant.int 1
    %int2_15474 = torch.constant.int 2
    %11192 = torch.prim.ListConstruct %int1_15469, %int24_15470, %int4608_15471, %int64_15472, %int1_15473, %int2_15474 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11193 = torch.aten.view %11191, %11192 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Second input (%11190): same upcast and reshape.
    %int6_15475 = torch.constant.int 6
    %11194 = torch.prims.convert_element_type %11190, %int6_15475 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_15476 = torch.constant.int 1
    %int24_15477 = torch.constant.int 24
    %int4608_15478 = torch.constant.int 4608
    %int64_15479 = torch.constant.int 64
    %int1_15480 = torch.constant.int 1
    %int2_15481 = torch.constant.int 2
    %11195 = torch.prim.ListConstruct %int1_15476, %int24_15477, %int4608_15478, %int64_15479, %int1_15480, %int2_15481 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11196 = torch.aten.view %11194, %11195 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // First input, term 0: index 0 of the last axis of %211 times index 0 of the last axis of %11193.
    %int5_15482 = torch.constant.int 5
    %int0_15483 = torch.constant.int 0
    %11197 = torch.aten.select.int %211, %int5_15482, %int0_15483 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_15484 = torch.constant.int 5
    %int0_15485 = torch.constant.int 0
    %11198 = torch.aten.select.int %11193, %int5_15484, %int0_15485 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %11199 = torch.aten.mul.Tensor %11197, %11198 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // First input, term 1: index 1 of both last axes.
    %int5_15486 = torch.constant.int 5
    %int1_15487 = torch.constant.int 1
    %11200 = torch.aten.select.int %211, %int5_15486, %int1_15487 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_15488 = torch.constant.int 5
    %int1_15489 = torch.constant.int 1
    %11201 = torch.aten.select.int %11193, %int5_15488, %int1_15489 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %11202 = torch.aten.mul.Tensor %11200, %11201 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two terms (the trailing int 1 is the alpha multiplier of aten.add).
    %int1_15490 = torch.constant.int 1
    %11203 = torch.aten.add.Tensor %11199, %11202, %int1_15490 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second input: the same two-term combination, using %11196.
    %int5_15491 = torch.constant.int 5
    %int0_15492 = torch.constant.int 0
    %11204 = torch.aten.select.int %211, %int5_15491, %int0_15492 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_15493 = torch.constant.int 5
    %int0_15494 = torch.constant.int 0
    %11205 = torch.aten.select.int %11196, %int5_15493, %int0_15494 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %11206 = torch.aten.mul.Tensor %11204, %11205 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_15495 = torch.constant.int 5
    %int1_15496 = torch.constant.int 1
    %11207 = torch.aten.select.int %211, %int5_15495, %int1_15496 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_15497 = torch.constant.int 5
    %int1_15498 = torch.constant.int 1
    %11208 = torch.aten.select.int %11196, %int5_15497, %int1_15498 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %11209 = torch.aten.mul.Tensor %11207, %11208 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_15499 = torch.constant.int 1
    %11210 = torch.aten.add.Tensor %11206, %11209, %int1_15499 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Flatten the first result (64 x 2 -> 128) and cast to f16.
    %int1_15500 = torch.constant.int 1
    %int24_15501 = torch.constant.int 24
    %int4608_15502 = torch.constant.int 4608
    %int128_15503 = torch.constant.int 128
    %11211 = torch.prim.ListConstruct %int1_15500, %int24_15501, %int4608_15502, %int128_15503 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11212 = torch.aten.view %11203, %11211 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_15504 = torch.constant.int 5
    %11213 = torch.prims.convert_element_type %11212, %int5_15504 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Flatten the second result and cast to f16.
    %int1_15505 = torch.constant.int 1
    %int24_15506 = torch.constant.int 24
    %int4608_15507 = torch.constant.int 4608
    %int128_15508 = torch.constant.int 128
    %11214 = torch.prim.ListConstruct %int1_15505, %int24_15506, %int4608_15507, %int128_15508 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11215 = torch.aten.view %11210, %11214 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_15509 = torch.constant.int 5
    %11216 = torch.prims.convert_element_type %11215, %int5_15509 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.36 attention over three [1,24,4608,128] f16 operands: %11213, %11216 and %11168.
    // %11168 is defined before this excerpt.
    // NOTE(review): by the PyTorch schema for this op the operands should be
    // (query, key, value, dropout_p, is_causal, attn_mask, scale); here that gives
    // dropout 0.0, non-causal, no mask and default scale -- verify against the op schema.
    %float0.000000e00_15510 = torch.constant.float 0.000000e+00
    %false_15511 = torch.constant.bool false
    %none_15512 = torch.constant.none
    %none_15513 = torch.constant.none
    // Result #0 is the f16 attention output; result #1 ([1,24,4608] f32) is not used within this excerpt.
    %11217:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%11213, %11216, %11168, %float0.000000e00_15510, %false_15511, %none_15512, %none_15513) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap axes 1 and 2: [1,24,4608,128] -> [1,4608,24,128].
    %int0_15514 = torch.constant.int 0
    %int2_15515 = torch.constant.int 2
    %int1_15516 = torch.constant.int 1
    %int3_15517 = torch.constant.int 3
    %11218 = torch.prim.ListConstruct %int0_15514, %int2_15515, %int1_15516, %int3_15517 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11219 = torch.aten.permute %11217#0, %11218 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the trailing 24 x 128 axes into 3072.
    %int1_15518 = torch.constant.int 1
    %int4608_15519 = torch.constant.int 4608
    %int3072_15520 = torch.constant.int 3072
    %11220 = torch.prim.ListConstruct %int1_15518, %int4608_15519, %int3072_15520 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11221 = torch.aten.view %11219, %11220 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.36 output projection:
    //   cat(attention output, gelu_tanh(%11161)) -> linear2 (f32 matmul + bias) -> f16 -> %11134 * y -> %11116 + ...
    // %11161, %11134 and %11116 are defined before this excerpt.
    // GELU with the "tanh" approximation on the [1,4608,12288] tensor %11161.
    %str_15521 = torch.constant.str "tanh"
    %11222 = torch.aten.gelu %11161, %str_15521 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    // Concatenate along axis 2: 3072 + 12288 = 15360 features.
    %11223 = torch.prim.ListConstruct %11221, %11222 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_15522 = torch.constant.int 2
    %11224 = torch.aten.cat %11223, %int2_15522 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Drop the leading size-1 axis so the projection can be expressed as a 2-D matmul.
    %int4608_15523 = torch.constant.int 4608
    %int15360_15524 = torch.constant.int 15360
    %11225 = torch.prim.ListConstruct %int4608_15523, %int15360_15524 : (!torch.int, !torch.int) -> !torch.list<int>
    %11226 = torch.aten.view %11224, %11225 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // linear2 weight is stored as [3072,15360]; transpose to [15360,3072] for x @ W^T.
    %__auto.sampler.single_blocks.36.linear2.weight = util.global.load @__auto.sampler.single_blocks.36.linear2.weight : tensor<3072x15360xf16>
    %11227 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.36.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_15525 = torch.constant.int 0
    %int1_15526 = torch.constant.int 1
    %11228 = torch.aten.transpose.int %11227, %int0_15525, %int1_15526 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.36.linear2.bias = util.global.load @__auto.sampler.single_blocks.36.linear2.bias : tensor<3072xf16>
    %11229 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.36.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Bias, activations and weight are all upcast to f32 before the matmul.
    %int6_15527 = torch.constant.int 6
    %11230 = torch.prims.convert_element_type %11229, %int6_15527 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_15528 = torch.constant.int 6
    %11231 = torch.prims.convert_element_type %11226, %int6_15528 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_15529 = torch.constant.int 6
    %11232 = torch.prims.convert_element_type %11228, %int6_15529 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %11233 = torch.aten.mm %11231, %11232 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Both products below multiply by the integer 1 (likely the alpha/beta factors of a decomposed addmm -- TODO confirm).
    %int1_15530 = torch.constant.int 1
    %11234 = torch.aten.mul.Scalar %11233, %int1_15530 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_15531 = torch.constant.int 1
    %11235 = torch.aten.mul.Scalar %11230, %int1_15531 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Add the bias (broadcast over rows), then cast the result back to f16.
    %int1_15532 = torch.constant.int 1
    %11236 = torch.aten.add.Tensor %11234, %11235, %int1_15532 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_15533 = torch.constant.int 5
    %11237 = torch.prims.convert_element_type %11236, %int5_15533 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    // Restore the leading size-1 axis.
    %int1_15534 = torch.constant.int 1
    %int4608_15535 = torch.constant.int 4608
    %int3072_15536 = torch.constant.int 3072
    %11238 = torch.prim.ListConstruct %int1_15534, %int4608_15535, %int3072_15536 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11239 = torch.aten.view %11237, %11238 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Scale by the [1,1,3072] tensor %11134 (presumably this block's modulation gate -- confirm upstream),
    // then add to %11116 (presumably the block input, i.e. a residual connection -- confirm upstream).
    %11240 = torch.aten.mul.Tensor %11134, %11239 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_15537 = torch.constant.int 1
    %11241 = torch.aten.add.Tensor %11116, %11240, %int1_15537 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.37 modulation: silu(%119) -> modulation.lin (f32 matmul + bias) -> f16 [1,9216],
    // reshaped to [1,1,9216] and split into three 3072-wide chunks along the last axis.
    // %119 is a [1,3072] f16 tensor defined before this excerpt (presumably the conditioning vector -- confirm upstream).
    %11242 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Weight is stored as [9216,3072]; transpose to [3072,9216] for x @ W^T.
    %__auto.sampler.single_blocks.37.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.37.modulation.lin.weight : tensor<9216x3072xf16>
    %11243 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.37.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_15538 = torch.constant.int 0
    %int1_15539 = torch.constant.int 1
    %11244 = torch.aten.transpose.int %11243, %int0_15538, %int1_15539 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.37.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.37.modulation.lin.bias : tensor<9216xf16>
    %11245 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.37.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, input and weight to f32 for the matmul.
    %int6_15540 = torch.constant.int 6
    %11246 = torch.prims.convert_element_type %11245, %int6_15540 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_15541 = torch.constant.int 6
    %11247 = torch.prims.convert_element_type %11242, %int6_15541 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_15542 = torch.constant.int 6
    %11248 = torch.prims.convert_element_type %11244, %int6_15542 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %11249 = torch.aten.mm %11247, %11248 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Multiplications by the integer 1 (likely alpha/beta of a decomposed addmm -- TODO confirm).
    %int1_15543 = torch.constant.int 1
    %11250 = torch.aten.mul.Scalar %11249, %int1_15543 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_15544 = torch.constant.int 1
    %11251 = torch.aten.mul.Scalar %11246, %int1_15544 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_15545 = torch.constant.int 1
    %11252 = torch.aten.add.Tensor %11250, %11251, %int1_15545 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_15546 = torch.constant.int 5
    %11253 = torch.prims.convert_element_type %11252, %int5_15546 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Full-range slice on axis 0 (start 0, end INT64_MAX, step 1); the shape is unchanged.
    %int0_15547 = torch.constant.int 0
    %int0_15548 = torch.constant.int 0
    %int9223372036854775807_15549 = torch.constant.int 9223372036854775807
    %int1_15550 = torch.constant.int 1
    %11254 = torch.aten.slice.Tensor %11253, %int0_15547, %int0_15548, %int9223372036854775807_15549, %int1_15550 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a size-1 axis at position 1 so the chunks broadcast against [1,4608,3072] activations.
    %int1_15551 = torch.constant.int 1
    %11255 = torch.aten.unsqueeze %11254, %int1_15551 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Full-range slice on axis 2; the shape is unchanged.
    %int2_15552 = torch.constant.int 2
    %int0_15553 = torch.constant.int 0
    %int9223372036854775807_15554 = torch.constant.int 9223372036854775807
    %int1_15555 = torch.constant.int 1
    %11256 = torch.aten.slice.Tensor %11255, %int2_15552, %int0_15553, %int9223372036854775807_15554, %int1_15555 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Chunk [0, 3072): %11257, later added as the additive term after normalization.
    %int-1_15556 = torch.constant.int -1
    %int0_15557 = torch.constant.int 0
    %int3072_15558 = torch.constant.int 3072
    %int1_15559 = torch.constant.int 1
    %11257 = torch.aten.slice.Tensor %11256, %int-1_15556, %int0_15557, %int3072_15558, %int1_15559 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk [3072, 6144): %11258, used below as (1 + chunk).
    %int-1_15560 = torch.constant.int -1
    %int3072_15561 = torch.constant.int 3072
    %int6144_15562 = torch.constant.int 6144
    %int1_15563 = torch.constant.int 1
    %11258 = torch.aten.slice.Tensor %11256, %int-1_15560, %int3072_15561, %int6144_15562, %int1_15563 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk [6144, 9216): %11259, not consumed within this excerpt (presumably the gate applied after linear2 -- confirm downstream).
    %int-1_15564 = torch.constant.int -1
    %int6144_15565 = torch.constant.int 6144
    %int9216_15566 = torch.constant.int 9216
    %int1_15567 = torch.constant.int 1
    %11259 = torch.aten.slice.Tensor %11256, %int-1_15564, %int6144_15565, %int9216_15566, %int1_15567 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %11260 = %11258 + 1 (scalar 1 with alpha 1).
    %int1_15568 = torch.constant.int 1
    %int1_15569 = torch.constant.int 1
    %11260 = torch.aten.add.Scalar %11258, %int1_15568, %int1_15569 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // single_blocks.37 pre-normalization of %11241 over the last (3072-wide) axis, with no learned affine:
    //   (x - mean) * rsqrt(var + 1e-6), cast to f16, then %11260 * normalized + %11257.
    %int6_15570 = torch.constant.int 6
    %11261 = torch.prims.convert_element_type %11241, %int6_15570 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_15571 = torch.constant.int 2
    %11262 = torch.prim.ListConstruct %int2_15571 : (!torch.int) -> !torch.list<int>
    // correction = 0 and keepdim = true; result0 feeds the rsqrt (variance) and result1 is subtracted (mean).
    %int0_15572 = torch.constant.int 0
    %true_15573 = torch.constant.bool true
    %result0_15574, %result1_15575 = torch.aten.var_mean.correction %11261, %11262, %int0_15572, %true_15573 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    %float9.999990e-07_15576 = torch.constant.float 9.9999999999999995E-7
    %int1_15577 = torch.constant.int 1
    %11263 = torch.aten.add.Scalar %result0_15574, %float9.999990e-07_15576, %int1_15577 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %11264 = torch.aten.rsqrt %11263 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the f16 tensor %11241 (not the f32 copy %11261) and the f32 mean; the result type is f32.
    %int1_15578 = torch.constant.int 1
    %11265 = torch.aten.sub.Tensor %11241, %result1_15575, %int1_15578 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %11266 = torch.aten.mul.Tensor %11265, %11264 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_15579 = torch.constant.int 5
    %11267 = torch.prims.convert_element_type %11266, %int5_15579 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Apply the modulation: multiply by %11260, then add %11257 (both [1,1,3072], broadcast over the 4608 axis).
    %11268 = torch.aten.mul.Tensor %11260, %11267 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_15580 = torch.constant.int 1
    %11269 = torch.aten.add.Tensor %11268, %11257, %int1_15580 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.37 linear1: project the modulated activations from 3072 to 21504 features (f32 matmul + bias, cast to f16),
    // then split into a 9216-wide part (reshaped into three [1,24,4608,128] tensors) and a 12288-wide part.
    // Drop the leading size-1 axis so the projection can be expressed as a 2-D matmul.
    %int4608_15581 = torch.constant.int 4608
    %int3072_15582 = torch.constant.int 3072
    %11270 = torch.prim.ListConstruct %int4608_15581, %int3072_15582 : (!torch.int, !torch.int) -> !torch.list<int>
    %11271 = torch.aten.view %11269, %11270 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // Weight is stored as [21504,3072]; transpose to [3072,21504] for x @ W^T.
    %__auto.sampler.single_blocks.37.linear1.weight = util.global.load @__auto.sampler.single_blocks.37.linear1.weight : tensor<21504x3072xf16>
    %11272 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.37.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_15583 = torch.constant.int 0
    %int1_15584 = torch.constant.int 1
    %11273 = torch.aten.transpose.int %11272, %int0_15583, %int1_15584 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.37.linear1.bias = util.global.load @__auto.sampler.single_blocks.37.linear1.bias : tensor<21504xf16>
    %11274 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.37.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Upcast bias, activations and weight to f32 for the matmul.
    %int6_15585 = torch.constant.int 6
    %11275 = torch.prims.convert_element_type %11274, %int6_15585 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_15586 = torch.constant.int 6
    %11276 = torch.prims.convert_element_type %11271, %int6_15586 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_15587 = torch.constant.int 6
    %11277 = torch.prims.convert_element_type %11273, %int6_15587 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %11278 = torch.aten.mm %11276, %11277 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Multiplications by the integer 1 (likely alpha/beta of a decomposed addmm -- TODO confirm).
    %int1_15588 = torch.constant.int 1
    %11279 = torch.aten.mul.Scalar %11278, %int1_15588 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_15589 = torch.constant.int 1
    %11280 = torch.aten.mul.Scalar %11275, %int1_15589 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_15590 = torch.constant.int 1
    %11281 = torch.aten.add.Tensor %11279, %11280, %int1_15590 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_15591 = torch.constant.int 5
    %11282 = torch.prims.convert_element_type %11281, %int5_15591 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    // Restore the leading size-1 axis.
    %int1_15592 = torch.constant.int 1
    %int4608_15593 = torch.constant.int 4608
    %int21504_15594 = torch.constant.int 21504
    %11283 = torch.prim.ListConstruct %int1_15592, %int4608_15593, %int21504_15594 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11284 = torch.aten.view %11282, %11283 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Features [0, 9216): reshaped below into three [1,24,4608,128] tensors.
    %int-1_15595 = torch.constant.int -1
    %int0_15596 = torch.constant.int 0
    %int9216_15597 = torch.constant.int 9216
    %int1_15598 = torch.constant.int 1
    %11285 = torch.aten.slice.Tensor %11284, %int-1_15595, %int0_15596, %int9216_15597, %int1_15598 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    // Features [9216, 21504): %11286, not consumed within this excerpt (presumably the MLP branch input -- confirm downstream).
    %int-1_15599 = torch.constant.int -1
    %int9216_15600 = torch.constant.int 9216
    %int21504_15601 = torch.constant.int 21504
    %int1_15602 = torch.constant.int 1
    %11286 = torch.aten.slice.Tensor %11284, %int-1_15599, %int9216_15600, %int21504_15601, %int1_15602 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View 9216 as 3 x 24 x 128.
    %int1_15603 = torch.constant.int 1
    %int4608_15604 = torch.constant.int 4608
    %int3_15605 = torch.constant.int 3
    %int24_15606 = torch.constant.int 24
    %int128_15607 = torch.constant.int 128
    %11287 = torch.prim.ListConstruct %int1_15603, %int4608_15604, %int3_15605, %int24_15606, %int128_15607 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11288 = torch.aten.view %11285, %11287 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2,0,3,1,4): [1,4608,3,24,128] -> [3,1,24,4608,128].
    %int2_15608 = torch.constant.int 2
    %int0_15609 = torch.constant.int 0
    %int3_15610 = torch.constant.int 3
    %int1_15611 = torch.constant.int 1
    %int4_15612 = torch.constant.int 4
    %11289 = torch.prim.ListConstruct %int2_15608, %int0_15609, %int3_15610, %int1_15611, %int4_15612 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11290 = torch.aten.permute %11288, %11289 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0 of the leading axis: %11291, normalized with query_norm.scale further down.
    %int0_15613 = torch.constant.int 0
    %int0_15614 = torch.constant.int 0
    %11291 = torch.aten.select.int %11290, %int0_15613, %int0_15614 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 1 of the leading axis: %11292, normalized with key_norm.scale further down.
    %int0_15615 = torch.constant.int 0
    %int1_15616 = torch.constant.int 1
    %11292 = torch.aten.select.int %11290, %int0_15615, %int1_15616 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 2 of the leading axis: %11293, not consumed within this excerpt (presumably the attention value -- confirm downstream).
    %int0_15617 = torch.constant.int 0
    %int2_15618 = torch.constant.int 2
    %11293 = torch.aten.select.int %11290, %int0_15617, %int2_15618 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.37 query/key normalization. Each input is processed in f32 as
    //   x * rsqrt(mean(x^2, last axis, keepdim) + 1e-6), cast to f16, then multiplied by a learned [128] scale.
    // Query path: %11291 with norm.query_norm.scale.
    %int6_15619 = torch.constant.int 6
    %11294 = torch.prims.convert_element_type %11291, %int6_15619 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_15620 = torch.constant.int 2
    %11295 = torch.aten.pow.Tensor_Scalar %11294, %int2_15620 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_15621 = torch.constant.int -1
    %11296 = torch.prim.ListConstruct %int-1_15621 : (!torch.int) -> !torch.list<int>
    %true_15622 = torch.constant.bool true
    %none_15623 = torch.constant.none
    %11297 = torch.aten.mean.dim %11295, %11296, %true_15622, %none_15623 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_15624 = torch.constant.float 9.9999999999999995E-7
    %int1_15625 = torch.constant.int 1
    %11298 = torch.aten.add.Scalar %11297, %float9.999990e-07_15624, %int1_15625 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %11299 = torch.aten.rsqrt %11298 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %11300 = torch.aten.mul.Tensor %11294, %11299 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_15626 = torch.constant.int 5
    %11301 = torch.prims.convert_element_type %11300, %int5_15626 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.37.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.37.norm.query_norm.scale : tensor<128xf16>
    %11302 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.37.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %11303 = torch.aten.mul.Tensor %11301, %11302 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Key path: %11292 with norm.key_norm.scale (same sequence of ops).
    %int6_15627 = torch.constant.int 6
    %11304 = torch.prims.convert_element_type %11292, %int6_15627 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_15628 = torch.constant.int 2
    %11305 = torch.aten.pow.Tensor_Scalar %11304, %int2_15628 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_15629 = torch.constant.int -1
    %11306 = torch.prim.ListConstruct %int-1_15629 : (!torch.int) -> !torch.list<int>
    %true_15630 = torch.constant.bool true
    %none_15631 = torch.constant.none
    %11307 = torch.aten.mean.dim %11305, %11306, %true_15630, %none_15631 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_15632 = torch.constant.float 9.9999999999999995E-7
    %int1_15633 = torch.constant.int 1
    %11308 = torch.aten.add.Scalar %11307, %float9.999990e-07_15632, %int1_15633 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %11309 = torch.aten.rsqrt %11308 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %11310 = torch.aten.mul.Tensor %11304, %11309 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_15634 = torch.constant.int 5
    %11311 = torch.prims.convert_element_type %11310, %int5_15634 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.37.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.37.norm.key_norm.scale : tensor<128xf16>
    %11312 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.37.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %11313 = torch.aten.mul.Tensor %11311, %11312 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.37: combine the normalized query (%11303) and key (%11313) with %211.
    // %211 is a [1,1,4608,64,2,2] f32 tensor defined before this excerpt
    // (presumably the rotary position-embedding coefficients -- confirm upstream).
    // For each input x viewed as [1,24,4608,64,1,2] the result is
    //   %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]
    // broadcast over the size-24 axis.
    // The next two casts are f16 -> f16 (no dtype change according to the types).
    %int5_15635 = torch.constant.int 5
    %11314 = torch.prims.convert_element_type %11303, %int5_15635 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_15636 = torch.constant.int 5
    %11315 = torch.prims.convert_element_type %11313, %int5_15636 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Query: upcast to f32 and split the last axis 128 -> 64 x 1 x 2.
    %int6_15637 = torch.constant.int 6
    %11316 = torch.prims.convert_element_type %11314, %int6_15637 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_15638 = torch.constant.int 1
    %int24_15639 = torch.constant.int 24
    %int4608_15640 = torch.constant.int 4608
    %int64_15641 = torch.constant.int 64
    %int1_15642 = torch.constant.int 1
    %int2_15643 = torch.constant.int 2
    %11317 = torch.prim.ListConstruct %int1_15638, %int24_15639, %int4608_15640, %int64_15641, %int1_15642, %int2_15643 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11318 = torch.aten.view %11316, %11317 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Key: same upcast and reshape.
    %int6_15644 = torch.constant.int 6
    %11319 = torch.prims.convert_element_type %11315, %int6_15644 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_15645 = torch.constant.int 1
    %int24_15646 = torch.constant.int 24
    %int4608_15647 = torch.constant.int 4608
    %int64_15648 = torch.constant.int 64
    %int1_15649 = torch.constant.int 1
    %int2_15650 = torch.constant.int 2
    %11320 = torch.prim.ListConstruct %int1_15645, %int24_15646, %int4608_15647, %int64_15648, %int1_15649, %int2_15650 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11321 = torch.aten.view %11319, %11320 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Query, term 0: index 0 of the last axis of %211 times index 0 of the last axis of %11318.
    %int5_15651 = torch.constant.int 5
    %int0_15652 = torch.constant.int 0
    %11322 = torch.aten.select.int %211, %int5_15651, %int0_15652 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_15653 = torch.constant.int 5
    %int0_15654 = torch.constant.int 0
    %11323 = torch.aten.select.int %11318, %int5_15653, %int0_15654 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %11324 = torch.aten.mul.Tensor %11322, %11323 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Query, term 1: index 1 of both last axes.
    %int5_15655 = torch.constant.int 5
    %int1_15656 = torch.constant.int 1
    %11325 = torch.aten.select.int %211, %int5_15655, %int1_15656 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_15657 = torch.constant.int 5
    %int1_15658 = torch.constant.int 1
    %11326 = torch.aten.select.int %11318, %int5_15657, %int1_15658 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %11327 = torch.aten.mul.Tensor %11325, %11326 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two query terms (the trailing int 1 is the alpha multiplier of aten.add).
    %int1_15659 = torch.constant.int 1
    %11328 = torch.aten.add.Tensor %11324, %11327, %int1_15659 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key: the same two-term combination, using %11321.
    %int5_15660 = torch.constant.int 5
    %int0_15661 = torch.constant.int 0
    %11329 = torch.aten.select.int %211, %int5_15660, %int0_15661 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_15662 = torch.constant.int 5
    %int0_15663 = torch.constant.int 0
    %11330 = torch.aten.select.int %11321, %int5_15662, %int0_15663 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %11331 = torch.aten.mul.Tensor %11329, %11330 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_15664 = torch.constant.int 5
    %int1_15665 = torch.constant.int 1
    %11332 = torch.aten.select.int %211, %int5_15664, %int1_15665 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_15666 = torch.constant.int 5
    %int1_15667 = torch.constant.int 1
    %11333 = torch.aten.select.int %11321, %int5_15666, %int1_15667 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %11334 = torch.aten.mul.Tensor %11332, %11333 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_15668 = torch.constant.int 1
    %11335 = torch.aten.add.Tensor %11331, %11334, %int1_15668 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Flatten the query result (64 x 2 -> 128) and cast to f16. The key result %11335 is consumed after this excerpt.
    %int1_15669 = torch.constant.int 1
    %int24_15670 = torch.constant.int 24
    %int4608_15671 = torch.constant.int 4608
    %int128_15672 = torch.constant.int 128
    %11336 = torch.prim.ListConstruct %int1_15669, %int24_15670, %int4608_15671, %int128_15672 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11337 = torch.aten.view %11328, %11336 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_15673 = torch.constant.int 5
    %11338 = torch.prims.convert_element_type %11337, %int5_15673 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_15674 = torch.constant.int 1
    %int24_15675 = torch.constant.int 24
    %int4608_15676 = torch.constant.int 4608
    %int128_15677 = torch.constant.int 128
    %11339 = torch.prim.ListConstruct %int1_15674, %int24_15675, %int4608_15676, %int128_15677 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11340 = torch.aten.view %11335, %11339 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_15678 = torch.constant.int 5
    %11341 = torch.prims.convert_element_type %11340, %int5_15678 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %float0.000000e00_15679 = torch.constant.float 0.000000e+00
    %false_15680 = torch.constant.bool false
    %none_15681 = torch.constant.none
    %none_15682 = torch.constant.none
    %11342:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%11338, %11341, %11293, %float0.000000e00_15679, %false_15680, %none_15681, %none_15682) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    %int0_15683 = torch.constant.int 0
    %int2_15684 = torch.constant.int 2
    %int1_15685 = torch.constant.int 1
    %int3_15686 = torch.constant.int 3
    %11343 = torch.prim.ListConstruct %int0_15683, %int2_15684, %int1_15685, %int3_15686 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11344 = torch.aten.permute %11342#0, %11343 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_15687 = torch.constant.int 1
    %int4608_15688 = torch.constant.int 4608
    %int3072_15689 = torch.constant.int 3072
    %11345 = torch.prim.ListConstruct %int1_15687, %int4608_15688, %int3072_15689 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11346 = torch.aten.view %11344, %11345 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    %str_15690 = torch.constant.str "tanh"
    %11347 = torch.aten.gelu %11286, %str_15690 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %11348 = torch.prim.ListConstruct %11346, %11347 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_15691 = torch.constant.int 2
    %11349 = torch.aten.cat %11348, %int2_15691 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    %int4608_15692 = torch.constant.int 4608
    %int15360_15693 = torch.constant.int 15360
    %11350 = torch.prim.ListConstruct %int4608_15692, %int15360_15693 : (!torch.int, !torch.int) -> !torch.list<int>
    %11351 = torch.aten.view %11349, %11350 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    %__auto.sampler.single_blocks.37.linear2.weight = util.global.load @__auto.sampler.single_blocks.37.linear2.weight : tensor<3072x15360xf16>
    %11352 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.37.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_15694 = torch.constant.int 0
    %int1_15695 = torch.constant.int 1
    %11353 = torch.aten.transpose.int %11352, %int0_15694, %int1_15695 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.37.linear2.bias = util.global.load @__auto.sampler.single_blocks.37.linear2.bias : tensor<3072xf16>
    %11354 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.37.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_15696 = torch.constant.int 6
    %11355 = torch.prims.convert_element_type %11354, %int6_15696 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_15697 = torch.constant.int 6
    %11356 = torch.prims.convert_element_type %11351, %int6_15697 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_15698 = torch.constant.int 6
    %11357 = torch.prims.convert_element_type %11353, %int6_15698 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %11358 = torch.aten.mm %11356, %11357 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    %int1_15699 = torch.constant.int 1
    %11359 = torch.aten.mul.Scalar %11358, %int1_15699 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_15700 = torch.constant.int 1
    %11360 = torch.aten.mul.Scalar %11355, %int1_15700 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_15701 = torch.constant.int 1
    %11361 = torch.aten.add.Tensor %11359, %11360, %int1_15701 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_15702 = torch.constant.int 5
    %11362 = torch.prims.convert_element_type %11361, %int5_15702 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_15703 = torch.constant.int 1
    %int4608_15704 = torch.constant.int 4608
    %int3072_15705 = torch.constant.int 3072
    %11363 = torch.prim.ListConstruct %int1_15703, %int4608_15704, %int3072_15705 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11364 = torch.aten.view %11362, %11363 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    %11365 = torch.aten.mul.Tensor %11259, %11364 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_15706 = torch.constant.int 1
    %11366 = torch.aten.add.Tensor %11241, %11365, %int1_15706 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int0_15707 = torch.constant.int 0
    %int0_15708 = torch.constant.int 0
    %int9223372036854775807_15709 = torch.constant.int 9223372036854775807
    %int1_15710 = torch.constant.int 1
    %11367 = torch.aten.slice.Tensor %11366, %int0_15707, %int0_15708, %int9223372036854775807_15709, %int1_15710 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_15711 = torch.constant.int 1
    %int512_15712 = torch.constant.int 512
    %int9223372036854775807_15713 = torch.constant.int 9223372036854775807
    %int1_15714 = torch.constant.int 1
    %11368 = torch.aten.slice.Tensor %11367, %int1_15711, %int512_15712, %int9223372036854775807_15713, %int1_15714 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %11369 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.final_layer.adaLN_modulation.1.weight = util.global.load @__auto.sampler.final_layer.adaLN_modulation.1.weight : tensor<6144x3072xf16>
    %11370 = torch_c.from_builtin_tensor %__auto.sampler.final_layer.adaLN_modulation.1.weight : tensor<6144x3072xf16> -> !torch.vtensor<[6144,3072],f16>
    %int0_15715 = torch.constant.int 0
    %int1_15716 = torch.constant.int 1
    %11371 = torch.aten.transpose.int %11370, %int0_15715, %int1_15716 : !torch.vtensor<[6144,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,6144],f16>
    %__auto.sampler.final_layer.adaLN_modulation.1.bias = util.global.load @__auto.sampler.final_layer.adaLN_modulation.1.bias : tensor<6144xf16>
    %11372 = torch_c.from_builtin_tensor %__auto.sampler.final_layer.adaLN_modulation.1.bias : tensor<6144xf16> -> !torch.vtensor<[6144],f16>
    %int6_15717 = torch.constant.int 6
    %11373 = torch.prims.convert_element_type %11372, %int6_15717 : !torch.vtensor<[6144],f16>, !torch.int -> !torch.vtensor<[6144],f32>
    %int6_15718 = torch.constant.int 6
    %11374 = torch.prims.convert_element_type %11369, %int6_15718 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_15719 = torch.constant.int 6
    %11375 = torch.prims.convert_element_type %11371, %int6_15719 : !torch.vtensor<[3072,6144],f16>, !torch.int -> !torch.vtensor<[3072,6144],f32>
    %11376 = torch.aten.mm %11374, %11375 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,6144],f32> -> !torch.vtensor<[1,6144],f32>
    %int1_15720 = torch.constant.int 1
    %11377 = torch.aten.mul.Scalar %11376, %int1_15720 : !torch.vtensor<[1,6144],f32>, !torch.int -> !torch.vtensor<[1,6144],f32>
    %int1_15721 = torch.constant.int 1
    %11378 = torch.aten.mul.Scalar %11373, %int1_15721 : !torch.vtensor<[6144],f32>, !torch.int -> !torch.vtensor<[6144],f32>
    %int1_15722 = torch.constant.int 1
    %11379 = torch.aten.add.Tensor %11377, %11378, %int1_15722 : !torch.vtensor<[1,6144],f32>, !torch.vtensor<[6144],f32>, !torch.int -> !torch.vtensor<[1,6144],f32>
    %int5_15723 = torch.constant.int 5
    %11380 = torch.prims.convert_element_type %11379, %int5_15723 : !torch.vtensor<[1,6144],f32>, !torch.int -> !torch.vtensor<[1,6144],f16>
    %int1_15724 = torch.constant.int 1
    %int0_15725 = torch.constant.int 0
    %int3072_15726 = torch.constant.int 3072
    %int1_15727 = torch.constant.int 1
    %11381 = torch.aten.slice.Tensor %11380, %int1_15724, %int0_15725, %int3072_15726, %int1_15727 : !torch.vtensor<[1,6144],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,3072],f16>
    %int1_15728 = torch.constant.int 1
    %int3072_15729 = torch.constant.int 3072
    %int6144_15730 = torch.constant.int 6144
    %int1_15731 = torch.constant.int 1
    %11382 = torch.aten.slice.Tensor %11380, %int1_15728, %int3072_15729, %int6144_15730, %int1_15731 : !torch.vtensor<[1,6144],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,3072],f16>
    %int0_15732 = torch.constant.int 0
    %int0_15733 = torch.constant.int 0
    %int9223372036854775807_15734 = torch.constant.int 9223372036854775807
    %int1_15735 = torch.constant.int 1
    %11383 = torch.aten.slice.Tensor %11382, %int0_15732, %int0_15733, %int9223372036854775807_15734, %int1_15735 : !torch.vtensor<[1,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,3072],f16>
    %int1_15736 = torch.constant.int 1
    %11384 = torch.aten.unsqueeze %11383, %int1_15736 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int2_15737 = torch.constant.int 2
    %int0_15738 = torch.constant.int 0
    %int9223372036854775807_15739 = torch.constant.int 9223372036854775807
    %int1_15740 = torch.constant.int 1
    %11385 = torch.aten.slice.Tensor %11384, %int2_15737, %int0_15738, %int9223372036854775807_15739, %int1_15740 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int1_15741 = torch.constant.int 1
    %int1_15742 = torch.constant.int 1
    %11386 = torch.aten.add.Scalar %11385, %int1_15741, %int1_15742 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int6_15743 = torch.constant.int 6
    %11387 = torch.prims.convert_element_type %11368, %int6_15743 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %int2_15744 = torch.constant.int 2
    %11388 = torch.prim.ListConstruct %int2_15744 : (!torch.int) -> !torch.list<int>
    %int0_15745 = torch.constant.int 0
    %true_15746 = torch.constant.bool true
    %result0_15747, %result1_15748 = torch.aten.var_mean.correction %11387, %11388, %int0_15745, %true_15746 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    %float9.999990e-07_15749 = torch.constant.float 9.9999999999999995E-7
    %int1_15750 = torch.constant.int 1
    %11389 = torch.aten.add.Scalar %result0_15747, %float9.999990e-07_15749, %int1_15750 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %11390 = torch.aten.rsqrt %11389 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    %int1_15751 = torch.constant.int 1
    %11391 = torch.aten.sub.Tensor %11368, %result1_15748, %int1_15751 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %11392 = torch.aten.mul.Tensor %11391, %11390 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    %int5_15752 = torch.constant.int 5
    %11393 = torch.prims.convert_element_type %11392, %int5_15752 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %11394 = torch.aten.mul.Tensor %11386, %11393 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int0_15753 = torch.constant.int 0
    %int0_15754 = torch.constant.int 0
    %int9223372036854775807_15755 = torch.constant.int 9223372036854775807
    %int1_15756 = torch.constant.int 1
    %11395 = torch.aten.slice.Tensor %11381, %int0_15753, %int0_15754, %int9223372036854775807_15755, %int1_15756 : !torch.vtensor<[1,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,3072],f16>
    %int1_15757 = torch.constant.int 1
    %11396 = torch.aten.unsqueeze %11395, %int1_15757 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int2_15758 = torch.constant.int 2
    %int0_15759 = torch.constant.int 0
    %int9223372036854775807_15760 = torch.constant.int 9223372036854775807
    %int1_15761 = torch.constant.int 1
    %11397 = torch.aten.slice.Tensor %11396, %int2_15758, %int0_15759, %int9223372036854775807_15760, %int1_15761 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int1_15762 = torch.constant.int 1
    %11398 = torch.aten.add.Tensor %11394, %11397, %int1_15762 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %int4096_15763 = torch.constant.int 4096
    %int3072_15764 = torch.constant.int 3072
    %11399 = torch.prim.ListConstruct %int4096_15763, %int3072_15764 : (!torch.int, !torch.int) -> !torch.list<int>
    %11400 = torch.aten.view %11398, %11399 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.final_layer.linear.weight = util.global.load @__auto.sampler.final_layer.linear.weight : tensor<64x3072xf16>
    %11401 = torch_c.from_builtin_tensor %__auto.sampler.final_layer.linear.weight : tensor<64x3072xf16> -> !torch.vtensor<[64,3072],f16>
    %int0_15765 = torch.constant.int 0
    %int1_15766 = torch.constant.int 1
    %11402 = torch.aten.transpose.int %11401, %int0_15765, %int1_15766 : !torch.vtensor<[64,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,64],f16>
    %__auto.sampler.final_layer.linear.bias = util.global.load @__auto.sampler.final_layer.linear.bias : tensor<64xf16>
    %11403 = torch_c.from_builtin_tensor %__auto.sampler.final_layer.linear.bias : tensor<64xf16> -> !torch.vtensor<[64],f16>
    %int6_15767 = torch.constant.int 6
    %11404 = torch.prims.convert_element_type %11403, %int6_15767 : !torch.vtensor<[64],f16>, !torch.int -> !torch.vtensor<[64],f32>
    %int6_15768 = torch.constant.int 6
    %11405 = torch.prims.convert_element_type %11400, %int6_15768 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_15769 = torch.constant.int 6
    %11406 = torch.prims.convert_element_type %11402, %int6_15769 : !torch.vtensor<[3072,64],f16>, !torch.int -> !torch.vtensor<[3072,64],f32>
    %11407 = torch.aten.mm %11405, %11406 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,64],f32> -> !torch.vtensor<[4096,64],f32>
    %int1_15770 = torch.constant.int 1
    %11408 = torch.aten.mul.Scalar %11407, %int1_15770 : !torch.vtensor<[4096,64],f32>, !torch.int -> !torch.vtensor<[4096,64],f32>
    %int1_15771 = torch.constant.int 1
    %11409 = torch.aten.mul.Scalar %11404, %int1_15771 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32>
    %int1_15772 = torch.constant.int 1
    %11410 = torch.aten.add.Tensor %11408, %11409, %int1_15772 : !torch.vtensor<[4096,64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[4096,64],f32>
    %int5_15773 = torch.constant.int 5
    %11411 = torch.prims.convert_element_type %11410, %int5_15773 : !torch.vtensor<[4096,64],f32>, !torch.int -> !torch.vtensor<[4096,64],f16>
    %int1_15774 = torch.constant.int 1
    %int4096_15775 = torch.constant.int 4096
    %int64_15776 = torch.constant.int 64
    %11412 = torch.prim.ListConstruct %int1_15774, %int4096_15775, %int64_15776 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %11413 = torch.aten.view %11411, %11412 : !torch.vtensor<[4096,64],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,64],f16>
    %int1_15777 = torch.constant.int 1
    %11414 = torch.aten.sub.Tensor %arg6, %arg5, %int1_15777 : !torch.vtensor<[1],f16>, !torch.vtensor<[1],f16>, !torch.int -> !torch.vtensor<[1],f16>
    %11415 = torch.aten.mul.Tensor %11414, %11413 : !torch.vtensor<[1],f16>, !torch.vtensor<[1,4096,64],f16> -> !torch.vtensor<[1,4096,64],f16>
    %int1_15778 = torch.constant.int 1
    %11416 = torch.aten.add.Tensor %arg0, %11415, %int1_15778 : !torch.vtensor<[1,4096,64],f16>, !torch.vtensor<[1,4096,64],f16>, !torch.int -> !torch.vtensor<[1,4096,64],f16>
    return %11416 : !torch.vtensor<[1,4096,64],f16>
  }
}