module @compiled_flux_sampler {
  // ---- sampler input projection parameters ----
  // Private, non-mutable globals whose initial values are named parameters in
  // the "model" scope. Each symbol is the parameter key with an `__auto.`
  // prefix. All tensors here are f16, and every bias has 3072 elements.
  // NOTE(review): weights look like [out_features, in_features] (leading dim
  // matches the bias length) — confirm against the exporting model.

  // img_in: weight 3072x64 plus bias.
  util.global private @__auto.sampler.img_in.weight = #stream.parameter.named<"model"::"sampler.img_in.weight"> : tensor<3072x64xf16>
  util.global private @__auto.sampler.img_in.bias = #stream.parameter.named<"model"::"sampler.img_in.bias"> : tensor<3072xf16>
  // time_in: in_layer (3072x256) followed by out_layer (3072x3072).
  util.global private @__auto.sampler.time_in.in_layer.weight = #stream.parameter.named<"model"::"sampler.time_in.in_layer.weight"> : tensor<3072x256xf16>
  util.global private @__auto.sampler.time_in.in_layer.bias = #stream.parameter.named<"model"::"sampler.time_in.in_layer.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.time_in.out_layer.weight = #stream.parameter.named<"model"::"sampler.time_in.out_layer.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.time_in.out_layer.bias = #stream.parameter.named<"model"::"sampler.time_in.out_layer.bias"> : tensor<3072xf16>
  // guidance_in: same shapes as time_in.
  util.global private @__auto.sampler.guidance_in.in_layer.weight = #stream.parameter.named<"model"::"sampler.guidance_in.in_layer.weight"> : tensor<3072x256xf16>
  util.global private @__auto.sampler.guidance_in.in_layer.bias = #stream.parameter.named<"model"::"sampler.guidance_in.in_layer.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.guidance_in.out_layer.weight = #stream.parameter.named<"model"::"sampler.guidance_in.out_layer.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.guidance_in.out_layer.bias = #stream.parameter.named<"model"::"sampler.guidance_in.out_layer.bias"> : tensor<3072xf16>
  // vector_in: in_layer (3072x768) followed by out_layer (3072x3072).
  util.global private @__auto.sampler.vector_in.in_layer.weight = #stream.parameter.named<"model"::"sampler.vector_in.in_layer.weight"> : tensor<3072x768xf16>
  util.global private @__auto.sampler.vector_in.in_layer.bias = #stream.parameter.named<"model"::"sampler.vector_in.in_layer.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.vector_in.out_layer.weight = #stream.parameter.named<"model"::"sampler.vector_in.out_layer.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.vector_in.out_layer.bias = #stream.parameter.named<"model"::"sampler.vector_in.out_layer.bias"> : tensor<3072xf16>
  // txt_in: weight 3072x4096 plus bias.
  util.global private @__auto.sampler.txt_in.weight = #stream.parameter.named<"model"::"sampler.txt_in.weight"> : tensor<3072x4096xf16>
  util.global private @__auto.sampler.txt_in.bias = #stream.parameter.named<"model"::"sampler.txt_in.bias"> : tensor<3072xf16>
  // ---- double_blocks.0 parameters ----
  // Private, non-mutable f16 globals, each initialized from the named
  // parameter "model"::"sampler.double_blocks.0.<suffix>"; the symbol is that
  // key with an `__auto.` prefix. The same set of tensors is declared twice,
  // once under the `img_` prefix and once under the `txt_` prefix.
  // NOTE(review): 18432 = 6 * 3072 and 9216 = 3 * 3072; presumably the mod and
  // qkv projections pack 6 and 3 chunks of width 3072 — confirm against the model.

  // Modulation linear layers (weight 18432x3072, bias 18432).
  util.global private @__auto.sampler.double_blocks.0.img_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.0.img_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.0.img_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.0.img_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.0.txt_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.0.txt_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.0.txt_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.0.txt_mod.lin.bias"> : tensor<18432xf16>
  // Attention qkv projections and 128-element query/key norm scales.
  util.global private @__auto.sampler.double_blocks.0.img_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.0.img_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.0.img_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.0.img_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.0.img_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.0.img_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.0.img_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.0.img_attn.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.0.txt_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.0.txt_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.0.txt_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.0.txt_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.0.txt_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.0.txt_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.0.txt_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.0.txt_attn.norm.key_norm.scale"> : tensor<128xf16>
  // img branch: attention output projection, then MLP layers 0 and 2.
  util.global private @__auto.sampler.double_blocks.0.img_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.0.img_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.0.img_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.0.img_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.0.img_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.0.img_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.0.img_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.0.img_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.0.img_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.0.img_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.0.img_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.0.img_mlp.2.bias"> : tensor<3072xf16>
  // txt branch: attention output projection, then MLP layers 0 and 2.
  util.global private @__auto.sampler.double_blocks.0.txt_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.0.txt_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.0.txt_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.0.txt_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.0.txt_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.0.txt_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.0.txt_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.0.txt_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.0.txt_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.0.txt_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.0.txt_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.0.txt_mlp.2.bias"> : tensor<3072xf16>
  // ---- double_blocks.1 parameters ----
  // Private, non-mutable f16 globals, each initialized from the named
  // parameter "model"::"sampler.double_blocks.1.<suffix>"; the symbol is that
  // key with an `__auto.` prefix. The same set of tensors is declared twice,
  // once under the `img_` prefix and once under the `txt_` prefix.

  // Modulation linear layers (weight 18432x3072, bias 18432).
  util.global private @__auto.sampler.double_blocks.1.img_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.1.img_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.1.img_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.1.img_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.1.txt_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.1.txt_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.1.txt_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.1.txt_mod.lin.bias"> : tensor<18432xf16>
  // Attention qkv projections and 128-element query/key norm scales.
  util.global private @__auto.sampler.double_blocks.1.img_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.1.img_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.1.img_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.1.img_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.1.img_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.1.img_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.1.img_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.1.img_attn.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.1.txt_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.1.txt_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.1.txt_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.1.txt_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.1.txt_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.1.txt_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.1.txt_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.1.txt_attn.norm.key_norm.scale"> : tensor<128xf16>
  // img branch: attention output projection, then MLP layers 0 and 2.
  util.global private @__auto.sampler.double_blocks.1.img_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.1.img_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.1.img_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.1.img_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.1.img_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.1.img_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.1.img_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.1.img_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.1.img_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.1.img_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.1.img_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.1.img_mlp.2.bias"> : tensor<3072xf16>
  // txt branch: attention output projection, then MLP layers 0 and 2.
  util.global private @__auto.sampler.double_blocks.1.txt_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.1.txt_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.1.txt_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.1.txt_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.1.txt_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.1.txt_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.1.txt_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.1.txt_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.1.txt_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.1.txt_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.1.txt_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.1.txt_mlp.2.bias"> : tensor<3072xf16>
  // ---- double_blocks.2 parameters ----
  // Private, non-mutable f16 globals, each initialized from the named
  // parameter "model"::"sampler.double_blocks.2.<suffix>"; the symbol is that
  // key with an `__auto.` prefix. The same set of tensors is declared twice,
  // once under the `img_` prefix and once under the `txt_` prefix.

  // Modulation linear layers (weight 18432x3072, bias 18432).
  util.global private @__auto.sampler.double_blocks.2.img_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.2.img_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.2.img_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.2.img_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.2.txt_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.2.txt_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.2.txt_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.2.txt_mod.lin.bias"> : tensor<18432xf16>
  // Attention qkv projections and 128-element query/key norm scales.
  util.global private @__auto.sampler.double_blocks.2.img_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.2.img_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.2.img_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.2.img_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.2.img_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.2.img_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.2.img_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.2.img_attn.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.2.txt_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.2.txt_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.2.txt_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.2.txt_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.2.txt_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.2.txt_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.2.txt_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.2.txt_attn.norm.key_norm.scale"> : tensor<128xf16>
  // img branch: attention output projection, then MLP layers 0 and 2.
  util.global private @__auto.sampler.double_blocks.2.img_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.2.img_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.2.img_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.2.img_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.2.img_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.2.img_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.2.img_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.2.img_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.2.img_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.2.img_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.2.img_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.2.img_mlp.2.bias"> : tensor<3072xf16>
  // txt branch: attention output projection, then MLP layers 0 and 2.
  util.global private @__auto.sampler.double_blocks.2.txt_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.2.txt_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.2.txt_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.2.txt_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.2.txt_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.2.txt_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.2.txt_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.2.txt_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.2.txt_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.2.txt_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.2.txt_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.2.txt_mlp.2.bias"> : tensor<3072xf16>
  // ---- double_blocks.3 parameters ----
  // Private, non-mutable f16 globals, each initialized from the named
  // parameter "model"::"sampler.double_blocks.3.<suffix>"; the symbol is that
  // key with an `__auto.` prefix. The same set of tensors is declared twice,
  // once under the `img_` prefix and once under the `txt_` prefix.

  // Modulation linear layers (weight 18432x3072, bias 18432).
  util.global private @__auto.sampler.double_blocks.3.img_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.3.img_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.3.img_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.3.img_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.3.txt_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.3.txt_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.3.txt_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.3.txt_mod.lin.bias"> : tensor<18432xf16>
  // Attention qkv projections and 128-element query/key norm scales.
  util.global private @__auto.sampler.double_blocks.3.img_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.3.img_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.3.img_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.3.img_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.3.img_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.3.img_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.3.img_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.3.img_attn.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.3.txt_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.3.txt_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.3.txt_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.3.txt_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.3.txt_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.3.txt_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.3.txt_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.3.txt_attn.norm.key_norm.scale"> : tensor<128xf16>
  // img branch: attention output projection, then MLP layers 0 and 2.
  util.global private @__auto.sampler.double_blocks.3.img_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.3.img_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.3.img_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.3.img_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.3.img_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.3.img_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.3.img_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.3.img_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.3.img_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.3.img_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.3.img_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.3.img_mlp.2.bias"> : tensor<3072xf16>
  // txt branch: attention output projection, then MLP layers 0 and 2.
  util.global private @__auto.sampler.double_blocks.3.txt_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.3.txt_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.3.txt_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.3.txt_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.3.txt_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.3.txt_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.3.txt_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.3.txt_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.3.txt_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.3.txt_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.3.txt_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.3.txt_mlp.2.bias"> : tensor<3072xf16>
  // ---- double_blocks.4 parameters ----
  // Private, non-mutable f16 globals, each initialized from the named
  // parameter "model"::"sampler.double_blocks.4.<suffix>"; the symbol is that
  // key with an `__auto.` prefix. The same set of tensors is declared twice,
  // once under the `img_` prefix and once under the `txt_` prefix.

  // Modulation linear layers (weight 18432x3072, bias 18432).
  util.global private @__auto.sampler.double_blocks.4.img_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.4.img_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.4.img_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.4.img_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.4.txt_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.4.txt_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.4.txt_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.4.txt_mod.lin.bias"> : tensor<18432xf16>
  // Attention qkv projections and 128-element query/key norm scales.
  util.global private @__auto.sampler.double_blocks.4.img_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.4.img_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.4.img_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.4.img_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.4.img_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.4.img_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.4.img_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.4.img_attn.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.4.txt_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.4.txt_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.4.txt_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.4.txt_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.4.txt_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.4.txt_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.4.txt_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.4.txt_attn.norm.key_norm.scale"> : tensor<128xf16>
  // img branch: attention output projection, then MLP layers 0 and 2.
  util.global private @__auto.sampler.double_blocks.4.img_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.4.img_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.4.img_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.4.img_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.4.img_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.4.img_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.4.img_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.4.img_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.4.img_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.4.img_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.4.img_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.4.img_mlp.2.bias"> : tensor<3072xf16>
  // txt branch: attention output projection, then MLP layers 0 and 2.
  util.global private @__auto.sampler.double_blocks.4.txt_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.4.txt_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.4.txt_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.4.txt_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.4.txt_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.4.txt_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.4.txt_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.4.txt_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.4.txt_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.4.txt_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.4.txt_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.4.txt_mlp.2.bias"> : tensor<3072xf16>
  // ---- double_blocks.5 parameters ----
  // Private, non-mutable f16 globals, each initialized from the named
  // parameter "model"::"sampler.double_blocks.5.<suffix>"; the symbol is that
  // key with an `__auto.` prefix. The same set of tensors is declared twice,
  // once under the `img_` prefix and once under the `txt_` prefix.

  // Modulation linear layers (weight 18432x3072, bias 18432).
  util.global private @__auto.sampler.double_blocks.5.img_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.5.img_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.5.img_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.5.img_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.5.txt_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.5.txt_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.5.txt_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.5.txt_mod.lin.bias"> : tensor<18432xf16>
  // Attention qkv projections and 128-element query/key norm scales.
  util.global private @__auto.sampler.double_blocks.5.img_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.5.img_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.5.img_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.5.img_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.5.img_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.5.img_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.5.img_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.5.img_attn.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.5.txt_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.5.txt_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.5.txt_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.5.txt_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.5.txt_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.5.txt_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.5.txt_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.5.txt_attn.norm.key_norm.scale"> : tensor<128xf16>
  // img branch: attention output projection, then MLP layers 0 and 2.
  util.global private @__auto.sampler.double_blocks.5.img_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.5.img_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.5.img_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.5.img_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.5.img_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.5.img_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.5.img_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.5.img_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.5.img_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.5.img_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.5.img_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.5.img_mlp.2.bias"> : tensor<3072xf16>
  // txt branch: attention output projection, then MLP layers 0 and 2.
  util.global private @__auto.sampler.double_blocks.5.txt_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.5.txt_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.5.txt_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.5.txt_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.5.txt_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.5.txt_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.5.txt_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.5.txt_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.5.txt_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.5.txt_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.5.txt_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.5.txt_mlp.2.bias"> : tensor<3072xf16>
  // ---- double_blocks.6 parameters ----
  // Private, non-mutable f16 globals, each initialized from the named
  // parameter "model"::"sampler.double_blocks.6.<suffix>"; the symbol is that
  // key with an `__auto.` prefix. The same set of tensors is declared twice,
  // once under the `img_` prefix and once under the `txt_` prefix.

  // Modulation linear layers (weight 18432x3072, bias 18432).
  util.global private @__auto.sampler.double_blocks.6.img_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.6.img_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.6.img_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.6.img_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.6.txt_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.6.txt_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.6.txt_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.6.txt_mod.lin.bias"> : tensor<18432xf16>
  // Attention qkv projections and 128-element query/key norm scales.
  util.global private @__auto.sampler.double_blocks.6.img_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.6.img_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.6.img_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.6.img_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.6.img_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.6.img_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.6.img_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.6.img_attn.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.6.txt_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.6.txt_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.6.txt_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.6.txt_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.6.txt_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.6.txt_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.6.txt_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.6.txt_attn.norm.key_norm.scale"> : tensor<128xf16>
  // img branch: attention output projection, then MLP layers 0 and 2.
  util.global private @__auto.sampler.double_blocks.6.img_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.6.img_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.6.img_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.6.img_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.6.img_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.6.img_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.6.img_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.6.img_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.6.img_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.6.img_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.6.img_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.6.img_mlp.2.bias"> : tensor<3072xf16>
  // txt branch: attention output projection, then MLP layers 0 and 2.
  util.global private @__auto.sampler.double_blocks.6.txt_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.6.txt_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.6.txt_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.6.txt_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.6.txt_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.6.txt_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.6.txt_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.6.txt_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.6.txt_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.6.txt_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.6.txt_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.6.txt_mlp.2.bias"> : tensor<3072xf16>
  // ---- double_blocks.7 ----
  // 24 private globals, each bound via #stream.parameter.named to the
  // "model"-scope parameter whose key is the symbol name minus `@__auto.`.
  // All tensors are f16; img_* and txt_* entries are declared with identical shapes.
  // NOTE(review): the img/txt naming presumably denotes two parallel streams of
  // the block — inferred from names only; confirm against the exporting model.
  //
  // img_mod.lin / txt_mod.lin: weight 18432x3072, bias 18432 (18432 = 6 * 3072).
  util.global private @__auto.sampler.double_blocks.7.img_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.7.img_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.7.img_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.7.img_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.7.txt_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.7.txt_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.7.txt_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.7.txt_mod.lin.bias"> : tensor<18432xf16>
  // img_attn: qkv weight 9216x3072 and bias 9216 (9216 = 3 * 3072), plus
  // 128-element query_norm / key_norm scales.
  util.global private @__auto.sampler.double_blocks.7.img_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.7.img_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.7.img_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.7.img_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.7.img_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.7.img_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.7.img_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.7.img_attn.norm.key_norm.scale"> : tensor<128xf16>
  // txt_attn: same four tensors and shapes as img_attn above.
  util.global private @__auto.sampler.double_blocks.7.txt_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.7.txt_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.7.txt_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.7.txt_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.7.txt_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.7.txt_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.7.txt_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.7.txt_attn.norm.key_norm.scale"> : tensor<128xf16>
  // img entries: attn.proj (3072x3072 weight, 3072 bias), then mlp.0
  // (12288x3072 weight, 12288 bias) and mlp.2 (3072x12288 weight, 3072 bias); 12288 = 4 * 3072.
  util.global private @__auto.sampler.double_blocks.7.img_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.7.img_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.7.img_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.7.img_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.7.img_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.7.img_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.7.img_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.7.img_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.7.img_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.7.img_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.7.img_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.7.img_mlp.2.bias"> : tensor<3072xf16>
  // txt entries: same six tensors and shapes as the img entries above.
  util.global private @__auto.sampler.double_blocks.7.txt_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.7.txt_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.7.txt_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.7.txt_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.7.txt_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.7.txt_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.7.txt_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.7.txt_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.7.txt_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.7.txt_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.7.txt_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.7.txt_mlp.2.bias"> : tensor<3072xf16>
  // ---- double_blocks.8 ----
  // 24 private globals, each bound via #stream.parameter.named to the
  // "model"-scope parameter whose key is the symbol name minus `@__auto.`.
  // All tensors are f16; img_* and txt_* entries are declared with identical shapes.
  // NOTE(review): the img/txt naming presumably denotes two parallel streams of
  // the block — inferred from names only; confirm against the exporting model.
  //
  // img_mod.lin / txt_mod.lin: weight 18432x3072, bias 18432 (18432 = 6 * 3072).
  util.global private @__auto.sampler.double_blocks.8.img_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.8.img_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.8.img_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.8.img_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.8.txt_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.8.txt_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.8.txt_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.8.txt_mod.lin.bias"> : tensor<18432xf16>
  // img_attn: qkv weight 9216x3072 and bias 9216 (9216 = 3 * 3072), plus
  // 128-element query_norm / key_norm scales.
  util.global private @__auto.sampler.double_blocks.8.img_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.8.img_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.8.img_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.8.img_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.8.img_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.8.img_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.8.img_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.8.img_attn.norm.key_norm.scale"> : tensor<128xf16>
  // txt_attn: same four tensors and shapes as img_attn above.
  util.global private @__auto.sampler.double_blocks.8.txt_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.8.txt_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.8.txt_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.8.txt_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.8.txt_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.8.txt_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.8.txt_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.8.txt_attn.norm.key_norm.scale"> : tensor<128xf16>
  // img entries: attn.proj (3072x3072 weight, 3072 bias), then mlp.0
  // (12288x3072 weight, 12288 bias) and mlp.2 (3072x12288 weight, 3072 bias); 12288 = 4 * 3072.
  util.global private @__auto.sampler.double_blocks.8.img_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.8.img_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.8.img_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.8.img_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.8.img_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.8.img_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.8.img_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.8.img_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.8.img_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.8.img_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.8.img_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.8.img_mlp.2.bias"> : tensor<3072xf16>
  // txt entries: same six tensors and shapes as the img entries above.
  util.global private @__auto.sampler.double_blocks.8.txt_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.8.txt_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.8.txt_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.8.txt_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.8.txt_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.8.txt_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.8.txt_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.8.txt_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.8.txt_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.8.txt_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.8.txt_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.8.txt_mlp.2.bias"> : tensor<3072xf16>
  // ---- double_blocks.9 ----
  // 24 private globals, each bound via #stream.parameter.named to the
  // "model"-scope parameter whose key is the symbol name minus `@__auto.`.
  // All tensors are f16; img_* and txt_* entries are declared with identical shapes.
  // NOTE(review): the img/txt naming presumably denotes two parallel streams of
  // the block — inferred from names only; confirm against the exporting model.
  //
  // img_mod.lin / txt_mod.lin: weight 18432x3072, bias 18432 (18432 = 6 * 3072).
  util.global private @__auto.sampler.double_blocks.9.img_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.9.img_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.9.img_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.9.img_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.9.txt_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.9.txt_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.9.txt_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.9.txt_mod.lin.bias"> : tensor<18432xf16>
  // img_attn: qkv weight 9216x3072 and bias 9216 (9216 = 3 * 3072), plus
  // 128-element query_norm / key_norm scales.
  util.global private @__auto.sampler.double_blocks.9.img_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.9.img_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.9.img_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.9.img_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.9.img_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.9.img_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.9.img_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.9.img_attn.norm.key_norm.scale"> : tensor<128xf16>
  // txt_attn: same four tensors and shapes as img_attn above.
  util.global private @__auto.sampler.double_blocks.9.txt_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.9.txt_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.9.txt_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.9.txt_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.9.txt_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.9.txt_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.9.txt_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.9.txt_attn.norm.key_norm.scale"> : tensor<128xf16>
  // img entries: attn.proj (3072x3072 weight, 3072 bias), then mlp.0
  // (12288x3072 weight, 12288 bias) and mlp.2 (3072x12288 weight, 3072 bias); 12288 = 4 * 3072.
  util.global private @__auto.sampler.double_blocks.9.img_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.9.img_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.9.img_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.9.img_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.9.img_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.9.img_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.9.img_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.9.img_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.9.img_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.9.img_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.9.img_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.9.img_mlp.2.bias"> : tensor<3072xf16>
  // txt entries: same six tensors and shapes as the img entries above.
  util.global private @__auto.sampler.double_blocks.9.txt_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.9.txt_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.9.txt_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.9.txt_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.9.txt_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.9.txt_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.9.txt_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.9.txt_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.9.txt_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.9.txt_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.9.txt_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.9.txt_mlp.2.bias"> : tensor<3072xf16>
  // ---- double_blocks.10 ----
  // 24 private globals, each bound via #stream.parameter.named to the
  // "model"-scope parameter whose key is the symbol name minus `@__auto.`.
  // All tensors are f16; img_* and txt_* entries are declared with identical shapes.
  // NOTE(review): the img/txt naming presumably denotes two parallel streams of
  // the block — inferred from names only; confirm against the exporting model.
  //
  // img_mod.lin / txt_mod.lin: weight 18432x3072, bias 18432 (18432 = 6 * 3072).
  util.global private @__auto.sampler.double_blocks.10.img_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.10.img_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.10.img_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.10.img_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.10.txt_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.10.txt_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.10.txt_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.10.txt_mod.lin.bias"> : tensor<18432xf16>
  // img_attn: qkv weight 9216x3072 and bias 9216 (9216 = 3 * 3072), plus
  // 128-element query_norm / key_norm scales.
  util.global private @__auto.sampler.double_blocks.10.img_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.10.img_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.10.img_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.10.img_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.10.img_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.10.img_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.10.img_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.10.img_attn.norm.key_norm.scale"> : tensor<128xf16>
  // txt_attn: same four tensors and shapes as img_attn above.
  util.global private @__auto.sampler.double_blocks.10.txt_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.10.txt_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.10.txt_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.10.txt_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.10.txt_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.10.txt_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.10.txt_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.10.txt_attn.norm.key_norm.scale"> : tensor<128xf16>
  // img entries: attn.proj (3072x3072 weight, 3072 bias), then mlp.0
  // (12288x3072 weight, 12288 bias) and mlp.2 (3072x12288 weight, 3072 bias); 12288 = 4 * 3072.
  util.global private @__auto.sampler.double_blocks.10.img_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.10.img_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.10.img_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.10.img_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.10.img_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.10.img_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.10.img_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.10.img_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.10.img_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.10.img_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.10.img_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.10.img_mlp.2.bias"> : tensor<3072xf16>
  // txt entries: same six tensors and shapes as the img entries above.
  util.global private @__auto.sampler.double_blocks.10.txt_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.10.txt_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.10.txt_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.10.txt_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.10.txt_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.10.txt_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.10.txt_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.10.txt_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.10.txt_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.10.txt_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.10.txt_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.10.txt_mlp.2.bias"> : tensor<3072xf16>
  // ---- double_blocks.11 ----
  // 24 private globals, each bound via #stream.parameter.named to the
  // "model"-scope parameter whose key is the symbol name minus `@__auto.`.
  // All tensors are f16; img_* and txt_* entries are declared with identical shapes.
  // NOTE(review): the img/txt naming presumably denotes two parallel streams of
  // the block — inferred from names only; confirm against the exporting model.
  //
  // img_mod.lin / txt_mod.lin: weight 18432x3072, bias 18432 (18432 = 6 * 3072).
  util.global private @__auto.sampler.double_blocks.11.img_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.11.img_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.11.img_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.11.img_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.11.txt_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.11.txt_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.11.txt_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.11.txt_mod.lin.bias"> : tensor<18432xf16>
  // img_attn: qkv weight 9216x3072 and bias 9216 (9216 = 3 * 3072), plus
  // 128-element query_norm / key_norm scales.
  util.global private @__auto.sampler.double_blocks.11.img_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.11.img_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.11.img_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.11.img_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.11.img_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.11.img_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.11.img_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.11.img_attn.norm.key_norm.scale"> : tensor<128xf16>
  // txt_attn: same four tensors and shapes as img_attn above.
  util.global private @__auto.sampler.double_blocks.11.txt_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.11.txt_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.11.txt_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.11.txt_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.11.txt_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.11.txt_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.11.txt_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.11.txt_attn.norm.key_norm.scale"> : tensor<128xf16>
  // img entries: attn.proj (3072x3072 weight, 3072 bias), then mlp.0
  // (12288x3072 weight, 12288 bias) and mlp.2 (3072x12288 weight, 3072 bias); 12288 = 4 * 3072.
  util.global private @__auto.sampler.double_blocks.11.img_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.11.img_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.11.img_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.11.img_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.11.img_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.11.img_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.11.img_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.11.img_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.11.img_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.11.img_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.11.img_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.11.img_mlp.2.bias"> : tensor<3072xf16>
  // txt entries: same six tensors and shapes as the img entries above.
  util.global private @__auto.sampler.double_blocks.11.txt_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.11.txt_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.11.txt_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.11.txt_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.11.txt_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.11.txt_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.11.txt_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.11.txt_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.11.txt_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.11.txt_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.11.txt_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.11.txt_mlp.2.bias"> : tensor<3072xf16>
  // ---- double_blocks.12 ----
  // 24 private globals, each bound via #stream.parameter.named to the
  // "model"-scope parameter whose key is the symbol name minus `@__auto.`.
  // All tensors are f16; img_* and txt_* entries are declared with identical shapes.
  // NOTE(review): the img/txt naming presumably denotes two parallel streams of
  // the block — inferred from names only; confirm against the exporting model.
  //
  // img_mod.lin / txt_mod.lin: weight 18432x3072, bias 18432 (18432 = 6 * 3072).
  util.global private @__auto.sampler.double_blocks.12.img_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.12.img_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.12.img_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.12.img_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.12.txt_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.12.txt_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.12.txt_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.12.txt_mod.lin.bias"> : tensor<18432xf16>
  // img_attn: qkv weight 9216x3072 and bias 9216 (9216 = 3 * 3072), plus
  // 128-element query_norm / key_norm scales.
  util.global private @__auto.sampler.double_blocks.12.img_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.12.img_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.12.img_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.12.img_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.12.img_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.12.img_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.12.img_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.12.img_attn.norm.key_norm.scale"> : tensor<128xf16>
  // txt_attn: same four tensors and shapes as img_attn above.
  util.global private @__auto.sampler.double_blocks.12.txt_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.12.txt_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.12.txt_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.12.txt_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.12.txt_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.12.txt_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.12.txt_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.12.txt_attn.norm.key_norm.scale"> : tensor<128xf16>
  // img entries: attn.proj (3072x3072 weight, 3072 bias), then mlp.0
  // (12288x3072 weight, 12288 bias) and mlp.2 (3072x12288 weight, 3072 bias); 12288 = 4 * 3072.
  util.global private @__auto.sampler.double_blocks.12.img_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.12.img_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.12.img_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.12.img_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.12.img_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.12.img_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.12.img_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.12.img_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.12.img_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.12.img_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.12.img_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.12.img_mlp.2.bias"> : tensor<3072xf16>
  // txt entries: same six tensors and shapes as the img entries above.
  util.global private @__auto.sampler.double_blocks.12.txt_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.12.txt_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.12.txt_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.12.txt_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.12.txt_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.12.txt_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.12.txt_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.12.txt_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.12.txt_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.12.txt_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.12.txt_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.12.txt_mlp.2.bias"> : tensor<3072xf16>
  // ---- double_blocks.13 ----
  // 24 private globals, each bound via #stream.parameter.named to the
  // "model"-scope parameter whose key is the symbol name minus `@__auto.`.
  // All tensors are f16; img_* and txt_* entries are declared with identical shapes.
  // NOTE(review): the img/txt naming presumably denotes two parallel streams of
  // the block — inferred from names only; confirm against the exporting model.
  //
  // img_mod.lin / txt_mod.lin: weight 18432x3072, bias 18432 (18432 = 6 * 3072).
  util.global private @__auto.sampler.double_blocks.13.img_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.13.img_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.13.img_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.13.img_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.13.txt_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.13.txt_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.13.txt_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.13.txt_mod.lin.bias"> : tensor<18432xf16>
  // img_attn: qkv weight 9216x3072 and bias 9216 (9216 = 3 * 3072), plus
  // 128-element query_norm / key_norm scales.
  util.global private @__auto.sampler.double_blocks.13.img_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.13.img_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.13.img_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.13.img_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.13.img_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.13.img_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.13.img_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.13.img_attn.norm.key_norm.scale"> : tensor<128xf16>
  // txt_attn: same four tensors and shapes as img_attn above.
  util.global private @__auto.sampler.double_blocks.13.txt_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.13.txt_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.13.txt_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.13.txt_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.13.txt_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.13.txt_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.13.txt_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.13.txt_attn.norm.key_norm.scale"> : tensor<128xf16>
  // img entries: attn.proj (3072x3072 weight, 3072 bias), then mlp.0
  // (12288x3072 weight, 12288 bias) and mlp.2 (3072x12288 weight, 3072 bias); 12288 = 4 * 3072.
  util.global private @__auto.sampler.double_blocks.13.img_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.13.img_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.13.img_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.13.img_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.13.img_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.13.img_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.13.img_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.13.img_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.13.img_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.13.img_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.13.img_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.13.img_mlp.2.bias"> : tensor<3072xf16>
  // txt entries: same six tensors and shapes as the img entries above.
  util.global private @__auto.sampler.double_blocks.13.txt_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.13.txt_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.13.txt_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.13.txt_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.13.txt_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.13.txt_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.13.txt_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.13.txt_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.13.txt_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.13.txt_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.13.txt_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.13.txt_mlp.2.bias"> : tensor<3072xf16>
  // ---- double_blocks.14 ----
  // Private globals, each bound via #stream.parameter.named to the
  // "model"-scope parameter whose key is the symbol name minus `@__auto.`.
  // All tensors are f16; img_* and txt_* entries are declared with identical shapes.
  // NOTE(review): the img/txt naming presumably denotes two parallel streams of
  // the block — inferred from names only; confirm against the exporting model.
  //
  // img_mod.lin / txt_mod.lin: weight 18432x3072, bias 18432 (18432 = 6 * 3072).
  util.global private @__auto.sampler.double_blocks.14.img_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.14.img_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.14.img_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.14.img_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.14.txt_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.14.txt_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.14.txt_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.14.txt_mod.lin.bias"> : tensor<18432xf16>
  // img_attn: qkv weight 9216x3072 and bias 9216 (9216 = 3 * 3072), plus
  // 128-element query_norm / key_norm scales.
  util.global private @__auto.sampler.double_blocks.14.img_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.14.img_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.14.img_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.14.img_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.14.img_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.14.img_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.14.img_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.14.img_attn.norm.key_norm.scale"> : tensor<128xf16>
  // txt_attn: same four tensors and shapes as img_attn above.
  util.global private @__auto.sampler.double_blocks.14.txt_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.14.txt_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.14.txt_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.14.txt_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.14.txt_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.14.txt_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.14.txt_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.14.txt_attn.norm.key_norm.scale"> : tensor<128xf16>
  // img entries: attn.proj (3072x3072 weight, 3072 bias), then mlp.0
  // (12288x3072 weight, 12288 bias) and mlp.2 (3072x12288 weight, 3072 bias); 12288 = 4 * 3072.
  util.global private @__auto.sampler.double_blocks.14.img_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.14.img_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.14.img_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.14.img_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.14.img_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.14.img_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.14.img_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.14.img_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.14.img_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.14.img_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.14.img_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.14.img_mlp.2.bias"> : tensor<3072xf16>
  // txt entries: attn.proj and mlp tensors, with the same shapes as their img counterparts above.
  util.global private @__auto.sampler.double_blocks.14.txt_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.14.txt_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.14.txt_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.14.txt_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.14.txt_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.14.txt_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.14.txt_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.14.txt_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.14.txt_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.14.txt_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.14.txt_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.14.txt_mlp.2.bias"> : tensor<3072xf16>
  // double_blocks.15: 24 private globals, each initialized from the like-named
  // entry in the "model" parameter scope. All element types are f16.
  // Modulation linears (img_mod, txt_mod): weight 18432x3072, bias 18432.
  util.global private @__auto.sampler.double_blocks.15.img_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.15.img_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.15.img_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.15.img_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.15.txt_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.15.txt_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.15.txt_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.15.txt_mod.lin.bias"> : tensor<18432xf16>
  // img_attn: qkv weight 9216x3072 / bias 9216, plus 128-element query/key norm scales.
  util.global private @__auto.sampler.double_blocks.15.img_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.15.img_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.15.img_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.15.img_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.15.img_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.15.img_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.15.img_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.15.img_attn.norm.key_norm.scale"> : tensor<128xf16>
  // txt_attn: same shapes as the img_attn qkv / norm-scale group above.
  util.global private @__auto.sampler.double_blocks.15.txt_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.15.txt_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.15.txt_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.15.txt_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.15.txt_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.15.txt_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.15.txt_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.15.txt_attn.norm.key_norm.scale"> : tensor<128xf16>
  // img side: attention output projection (3072x3072) and MLP layers 0 (12288x3072) and 2 (3072x12288), each with a bias.
  util.global private @__auto.sampler.double_blocks.15.img_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.15.img_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.15.img_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.15.img_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.15.img_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.15.img_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.15.img_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.15.img_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.15.img_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.15.img_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.15.img_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.15.img_mlp.2.bias"> : tensor<3072xf16>
  // txt side: same projection / MLP shapes as the img group above.
  util.global private @__auto.sampler.double_blocks.15.txt_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.15.txt_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.15.txt_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.15.txt_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.15.txt_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.15.txt_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.15.txt_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.15.txt_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.15.txt_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.15.txt_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.15.txt_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.15.txt_mlp.2.bias"> : tensor<3072xf16>
  // double_blocks.16: 24 private globals, each initialized from the like-named
  // entry in the "model" parameter scope. All element types are f16.
  // Modulation linears (img_mod, txt_mod): weight 18432x3072, bias 18432.
  util.global private @__auto.sampler.double_blocks.16.img_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.16.img_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.16.img_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.16.img_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.16.txt_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.16.txt_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.16.txt_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.16.txt_mod.lin.bias"> : tensor<18432xf16>
  // img_attn: qkv weight 9216x3072 / bias 9216, plus 128-element query/key norm scales.
  util.global private @__auto.sampler.double_blocks.16.img_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.16.img_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.16.img_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.16.img_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.16.img_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.16.img_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.16.img_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.16.img_attn.norm.key_norm.scale"> : tensor<128xf16>
  // txt_attn: same shapes as the img_attn qkv / norm-scale group above.
  util.global private @__auto.sampler.double_blocks.16.txt_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.16.txt_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.16.txt_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.16.txt_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.16.txt_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.16.txt_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.16.txt_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.16.txt_attn.norm.key_norm.scale"> : tensor<128xf16>
  // img side: attention output projection (3072x3072) and MLP layers 0 (12288x3072) and 2 (3072x12288), each with a bias.
  util.global private @__auto.sampler.double_blocks.16.img_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.16.img_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.16.img_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.16.img_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.16.img_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.16.img_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.16.img_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.16.img_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.16.img_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.16.img_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.16.img_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.16.img_mlp.2.bias"> : tensor<3072xf16>
  // txt side: same projection / MLP shapes as the img group above.
  util.global private @__auto.sampler.double_blocks.16.txt_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.16.txt_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.16.txt_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.16.txt_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.16.txt_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.16.txt_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.16.txt_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.16.txt_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.16.txt_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.16.txt_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.16.txt_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.16.txt_mlp.2.bias"> : tensor<3072xf16>
  // double_blocks.17: 24 private globals, each initialized from the like-named
  // entry in the "model" parameter scope. All element types are f16.
  // Modulation linears (img_mod, txt_mod): weight 18432x3072, bias 18432.
  util.global private @__auto.sampler.double_blocks.17.img_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.17.img_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.17.img_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.17.img_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.17.txt_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.17.txt_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.17.txt_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.17.txt_mod.lin.bias"> : tensor<18432xf16>
  // img_attn: qkv weight 9216x3072 / bias 9216, plus 128-element query/key norm scales.
  util.global private @__auto.sampler.double_blocks.17.img_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.17.img_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.17.img_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.17.img_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.17.img_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.17.img_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.17.img_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.17.img_attn.norm.key_norm.scale"> : tensor<128xf16>
  // txt_attn: same shapes as the img_attn qkv / norm-scale group above.
  util.global private @__auto.sampler.double_blocks.17.txt_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.17.txt_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.17.txt_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.17.txt_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.17.txt_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.17.txt_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.17.txt_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.17.txt_attn.norm.key_norm.scale"> : tensor<128xf16>
  // img side: attention output projection (3072x3072) and MLP layers 0 (12288x3072) and 2 (3072x12288), each with a bias.
  util.global private @__auto.sampler.double_blocks.17.img_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.17.img_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.17.img_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.17.img_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.17.img_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.17.img_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.17.img_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.17.img_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.17.img_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.17.img_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.17.img_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.17.img_mlp.2.bias"> : tensor<3072xf16>
  // txt side: same projection / MLP shapes as the img group above.
  util.global private @__auto.sampler.double_blocks.17.txt_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.17.txt_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.17.txt_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.17.txt_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.17.txt_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.17.txt_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.17.txt_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.17.txt_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.17.txt_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.17.txt_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.17.txt_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.17.txt_mlp.2.bias"> : tensor<3072xf16>
  // double_blocks.18: 24 private globals, each initialized from the like-named
  // entry in the "model" parameter scope. All element types are f16.
  // Modulation linears (img_mod, txt_mod): weight 18432x3072, bias 18432.
  util.global private @__auto.sampler.double_blocks.18.img_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.18.img_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.18.img_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.18.img_mod.lin.bias"> : tensor<18432xf16>
  util.global private @__auto.sampler.double_blocks.18.txt_mod.lin.weight = #stream.parameter.named<"model"::"sampler.double_blocks.18.txt_mod.lin.weight"> : tensor<18432x3072xf16>
  util.global private @__auto.sampler.double_blocks.18.txt_mod.lin.bias = #stream.parameter.named<"model"::"sampler.double_blocks.18.txt_mod.lin.bias"> : tensor<18432xf16>
  // img_attn: qkv weight 9216x3072 / bias 9216, plus 128-element query/key norm scales.
  util.global private @__auto.sampler.double_blocks.18.img_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.18.img_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.18.img_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.18.img_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.18.img_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.18.img_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.18.img_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.18.img_attn.norm.key_norm.scale"> : tensor<128xf16>
  // txt_attn: same shapes as the img_attn qkv / norm-scale group above.
  util.global private @__auto.sampler.double_blocks.18.txt_attn.qkv.weight = #stream.parameter.named<"model"::"sampler.double_blocks.18.txt_attn.qkv.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.double_blocks.18.txt_attn.qkv.bias = #stream.parameter.named<"model"::"sampler.double_blocks.18.txt_attn.qkv.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.double_blocks.18.txt_attn.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.18.txt_attn.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.double_blocks.18.txt_attn.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.double_blocks.18.txt_attn.norm.key_norm.scale"> : tensor<128xf16>
  // img side: attention output projection (3072x3072) and MLP layers 0 (12288x3072) and 2 (3072x12288), each with a bias.
  util.global private @__auto.sampler.double_blocks.18.img_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.18.img_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.18.img_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.18.img_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.18.img_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.18.img_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.18.img_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.18.img_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.18.img_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.18.img_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.18.img_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.18.img_mlp.2.bias"> : tensor<3072xf16>
  // txt side: same projection / MLP shapes as the img group above.
  util.global private @__auto.sampler.double_blocks.18.txt_attn.proj.weight = #stream.parameter.named<"model"::"sampler.double_blocks.18.txt_attn.proj.weight"> : tensor<3072x3072xf16>
  util.global private @__auto.sampler.double_blocks.18.txt_attn.proj.bias = #stream.parameter.named<"model"::"sampler.double_blocks.18.txt_attn.proj.bias"> : tensor<3072xf16>
  util.global private @__auto.sampler.double_blocks.18.txt_mlp.0.weight = #stream.parameter.named<"model"::"sampler.double_blocks.18.txt_mlp.0.weight"> : tensor<12288x3072xf16>
  util.global private @__auto.sampler.double_blocks.18.txt_mlp.0.bias = #stream.parameter.named<"model"::"sampler.double_blocks.18.txt_mlp.0.bias"> : tensor<12288xf16>
  util.global private @__auto.sampler.double_blocks.18.txt_mlp.2.weight = #stream.parameter.named<"model"::"sampler.double_blocks.18.txt_mlp.2.weight"> : tensor<3072x12288xf16>
  util.global private @__auto.sampler.double_blocks.18.txt_mlp.2.bias = #stream.parameter.named<"model"::"sampler.double_blocks.18.txt_mlp.2.bias"> : tensor<3072xf16>
  // single_blocks.0: 8 private globals, each initialized from the like-named
  // entry in the "model" parameter scope. All element types are f16.
  // Shapes: modulation.lin 9216x3072 (+9216 bias), linear1 21504x3072 (+21504 bias),
  // query/key norm scales 128 each, linear2 3072x15360 (+3072 bias).
  // NOTE(review): linear1's 21504 outputs vs linear2's 15360 inputs suggests part of
  // linear1's output is consumed elsewhere (e.g. attention); confirm against the model definition.
  util.global private @__auto.sampler.single_blocks.0.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.0.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.0.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.0.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.0.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.0.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.0.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.0.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.0.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.0.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.0.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.0.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.0.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.0.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.0.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.0.linear2.bias"> : tensor<3072xf16>
  // single_blocks.1: 8 private globals, each initialized from the like-named
  // entry in the "model" parameter scope. All element types are f16.
  // Shapes: modulation.lin 9216x3072 (+9216 bias), linear1 21504x3072 (+21504 bias),
  // query/key norm scales 128 each, linear2 3072x15360 (+3072 bias).
  util.global private @__auto.sampler.single_blocks.1.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.1.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.1.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.1.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.1.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.1.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.1.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.1.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.1.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.1.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.1.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.1.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.1.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.1.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.1.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.1.linear2.bias"> : tensor<3072xf16>
  // single_blocks.2: 8 private globals, each initialized from the like-named
  // entry in the "model" parameter scope. All element types are f16.
  // Shapes: modulation.lin 9216x3072 (+9216 bias), linear1 21504x3072 (+21504 bias),
  // query/key norm scales 128 each, linear2 3072x15360 (+3072 bias).
  util.global private @__auto.sampler.single_blocks.2.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.2.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.2.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.2.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.2.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.2.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.2.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.2.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.2.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.2.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.2.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.2.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.2.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.2.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.2.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.2.linear2.bias"> : tensor<3072xf16>
  // single_blocks.3: 8 private globals, each initialized from the like-named
  // entry in the "model" parameter scope. All element types are f16.
  // Shapes: modulation.lin 9216x3072 (+9216 bias), linear1 21504x3072 (+21504 bias),
  // query/key norm scales 128 each, linear2 3072x15360 (+3072 bias).
  util.global private @__auto.sampler.single_blocks.3.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.3.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.3.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.3.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.3.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.3.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.3.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.3.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.3.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.3.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.3.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.3.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.3.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.3.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.3.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.3.linear2.bias"> : tensor<3072xf16>
  // single_blocks.4: 8 private globals, each initialized from the like-named
  // entry in the "model" parameter scope. All element types are f16.
  // Shapes: modulation.lin 9216x3072 (+9216 bias), linear1 21504x3072 (+21504 bias),
  // query/key norm scales 128 each, linear2 3072x15360 (+3072 bias).
  util.global private @__auto.sampler.single_blocks.4.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.4.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.4.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.4.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.4.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.4.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.4.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.4.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.4.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.4.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.4.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.4.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.4.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.4.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.4.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.4.linear2.bias"> : tensor<3072xf16>
  // single_blocks.5: 8 private globals, each initialized from the like-named
  // entry in the "model" parameter scope. All element types are f16.
  // Shapes: modulation.lin 9216x3072 (+9216 bias), linear1 21504x3072 (+21504 bias),
  // query/key norm scales 128 each, linear2 3072x15360 (+3072 bias).
  util.global private @__auto.sampler.single_blocks.5.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.5.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.5.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.5.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.5.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.5.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.5.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.5.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.5.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.5.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.5.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.5.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.5.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.5.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.5.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.5.linear2.bias"> : tensor<3072xf16>
  // single_blocks.6: 8 private globals, each initialized from the like-named
  // entry in the "model" parameter scope. All element types are f16.
  // Shapes: modulation.lin 9216x3072 (+9216 bias), linear1 21504x3072 (+21504 bias),
  // query/key norm scales 128 each, linear2 3072x15360 (+3072 bias).
  util.global private @__auto.sampler.single_blocks.6.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.6.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.6.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.6.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.6.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.6.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.6.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.6.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.6.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.6.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.6.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.6.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.6.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.6.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.6.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.6.linear2.bias"> : tensor<3072xf16>
  // single_blocks.7: 8 private globals, each initialized from the like-named
  // entry in the "model" parameter scope. All element types are f16.
  // Shapes: modulation.lin 9216x3072 (+9216 bias), linear1 21504x3072 (+21504 bias),
  // query/key norm scales 128 each, linear2 3072x15360 (+3072 bias).
  util.global private @__auto.sampler.single_blocks.7.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.7.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.7.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.7.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.7.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.7.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.7.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.7.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.7.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.7.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.7.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.7.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.7.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.7.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.7.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.7.linear2.bias"> : tensor<3072xf16>
  // single_blocks.8: 8 private globals, each initialized from the like-named
  // entry in the "model" parameter scope. All element types are f16.
  // Shapes: modulation.lin 9216x3072 (+9216 bias), linear1 21504x3072 (+21504 bias),
  // query/key norm scales 128 each, linear2 3072x15360 (+3072 bias).
  util.global private @__auto.sampler.single_blocks.8.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.8.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.8.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.8.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.8.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.8.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.8.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.8.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.8.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.8.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.8.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.8.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.8.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.8.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.8.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.8.linear2.bias"> : tensor<3072xf16>
  // single_blocks.9: 8 private globals, each initialized from the like-named
  // entry in the "model" parameter scope. All element types are f16.
  // Shapes: modulation.lin 9216x3072 (+9216 bias), linear1 21504x3072 (+21504 bias),
  // query/key norm scales 128 each, linear2 3072x15360 (+3072 bias).
  util.global private @__auto.sampler.single_blocks.9.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.9.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.9.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.9.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.9.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.9.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.9.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.9.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.9.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.9.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.9.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.9.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.9.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.9.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.9.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.9.linear2.bias"> : tensor<3072xf16>
  // single_blocks.10 parameters: eight private f16 globals, each initialized from
  // the "model"-scope parameter whose key is the symbol name minus "@__auto.".
  //   modulation.lin: 9216x3072 weight + 9216 bias
  //   linear1:        21504x3072 weight + 21504 bias
  //   norm.{query,key}_norm.scale: 128 each
  //   linear2:        3072x15360 weight + 3072 bias
  util.global private @__auto.sampler.single_blocks.10.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.10.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.10.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.10.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.10.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.10.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.10.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.10.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.10.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.10.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.10.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.10.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.10.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.10.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.10.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.10.linear2.bias"> : tensor<3072xf16>
  // single_blocks.11 parameters: eight private f16 globals, each initialized from
  // the "model"-scope parameter whose key is the symbol name minus "@__auto.".
  //   modulation.lin: 9216x3072 weight + 9216 bias
  //   linear1:        21504x3072 weight + 21504 bias
  //   norm.{query,key}_norm.scale: 128 each
  //   linear2:        3072x15360 weight + 3072 bias
  util.global private @__auto.sampler.single_blocks.11.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.11.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.11.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.11.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.11.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.11.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.11.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.11.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.11.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.11.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.11.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.11.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.11.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.11.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.11.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.11.linear2.bias"> : tensor<3072xf16>
  // single_blocks.12 parameters: eight private f16 globals, each initialized from
  // the "model"-scope parameter whose key is the symbol name minus "@__auto.".
  //   modulation.lin: 9216x3072 weight + 9216 bias
  //   linear1:        21504x3072 weight + 21504 bias
  //   norm.{query,key}_norm.scale: 128 each
  //   linear2:        3072x15360 weight + 3072 bias
  util.global private @__auto.sampler.single_blocks.12.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.12.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.12.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.12.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.12.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.12.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.12.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.12.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.12.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.12.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.12.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.12.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.12.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.12.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.12.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.12.linear2.bias"> : tensor<3072xf16>
  // single_blocks.13 parameters: eight private f16 globals, each initialized from
  // the "model"-scope parameter whose key is the symbol name minus "@__auto.".
  //   modulation.lin: 9216x3072 weight + 9216 bias
  //   linear1:        21504x3072 weight + 21504 bias
  //   norm.{query,key}_norm.scale: 128 each
  //   linear2:        3072x15360 weight + 3072 bias
  util.global private @__auto.sampler.single_blocks.13.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.13.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.13.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.13.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.13.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.13.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.13.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.13.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.13.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.13.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.13.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.13.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.13.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.13.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.13.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.13.linear2.bias"> : tensor<3072xf16>
  // single_blocks.14 parameters: eight private f16 globals, each initialized from
  // the "model"-scope parameter whose key is the symbol name minus "@__auto.".
  //   modulation.lin: 9216x3072 weight + 9216 bias
  //   linear1:        21504x3072 weight + 21504 bias
  //   norm.{query,key}_norm.scale: 128 each
  //   linear2:        3072x15360 weight + 3072 bias
  util.global private @__auto.sampler.single_blocks.14.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.14.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.14.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.14.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.14.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.14.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.14.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.14.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.14.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.14.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.14.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.14.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.14.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.14.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.14.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.14.linear2.bias"> : tensor<3072xf16>
  // single_blocks.15 parameters: eight private f16 globals, each initialized from
  // the "model"-scope parameter whose key is the symbol name minus "@__auto.".
  //   modulation.lin: 9216x3072 weight + 9216 bias
  //   linear1:        21504x3072 weight + 21504 bias
  //   norm.{query,key}_norm.scale: 128 each
  //   linear2:        3072x15360 weight + 3072 bias
  util.global private @__auto.sampler.single_blocks.15.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.15.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.15.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.15.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.15.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.15.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.15.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.15.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.15.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.15.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.15.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.15.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.15.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.15.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.15.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.15.linear2.bias"> : tensor<3072xf16>
  // single_blocks.16 parameters: eight private f16 globals, each initialized from
  // the "model"-scope parameter whose key is the symbol name minus "@__auto.".
  //   modulation.lin: 9216x3072 weight + 9216 bias
  //   linear1:        21504x3072 weight + 21504 bias
  //   norm.{query,key}_norm.scale: 128 each
  //   linear2:        3072x15360 weight + 3072 bias
  util.global private @__auto.sampler.single_blocks.16.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.16.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.16.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.16.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.16.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.16.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.16.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.16.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.16.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.16.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.16.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.16.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.16.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.16.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.16.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.16.linear2.bias"> : tensor<3072xf16>
  // single_blocks.17 parameters: eight private f16 globals, each initialized from
  // the "model"-scope parameter whose key is the symbol name minus "@__auto.".
  //   modulation.lin: 9216x3072 weight + 9216 bias
  //   linear1:        21504x3072 weight + 21504 bias
  //   norm.{query,key}_norm.scale: 128 each
  //   linear2:        3072x15360 weight + 3072 bias
  util.global private @__auto.sampler.single_blocks.17.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.17.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.17.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.17.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.17.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.17.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.17.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.17.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.17.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.17.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.17.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.17.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.17.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.17.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.17.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.17.linear2.bias"> : tensor<3072xf16>
  // single_blocks.18 parameters: eight private f16 globals, each initialized from
  // the "model"-scope parameter whose key is the symbol name minus "@__auto.".
  //   modulation.lin: 9216x3072 weight + 9216 bias
  //   linear1:        21504x3072 weight + 21504 bias
  //   norm.{query,key}_norm.scale: 128 each
  //   linear2:        3072x15360 weight + 3072 bias
  util.global private @__auto.sampler.single_blocks.18.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.18.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.18.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.18.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.18.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.18.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.18.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.18.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.18.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.18.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.18.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.18.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.18.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.18.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.18.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.18.linear2.bias"> : tensor<3072xf16>
  // single_blocks.19 parameters: eight private f16 globals, each initialized from
  // the "model"-scope parameter whose key is the symbol name minus "@__auto.".
  //   modulation.lin: 9216x3072 weight + 9216 bias
  //   linear1:        21504x3072 weight + 21504 bias
  //   norm.{query,key}_norm.scale: 128 each
  //   linear2:        3072x15360 weight + 3072 bias
  util.global private @__auto.sampler.single_blocks.19.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.19.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.19.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.19.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.19.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.19.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.19.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.19.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.19.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.19.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.19.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.19.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.19.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.19.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.19.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.19.linear2.bias"> : tensor<3072xf16>
  // single_blocks.20 parameters: eight private f16 globals, each initialized from
  // the "model"-scope parameter whose key is the symbol name minus "@__auto.".
  //   modulation.lin: 9216x3072 weight + 9216 bias
  //   linear1:        21504x3072 weight + 21504 bias
  //   norm.{query,key}_norm.scale: 128 each
  //   linear2:        3072x15360 weight + 3072 bias
  util.global private @__auto.sampler.single_blocks.20.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.20.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.20.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.20.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.20.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.20.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.20.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.20.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.20.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.20.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.20.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.20.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.20.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.20.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.20.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.20.linear2.bias"> : tensor<3072xf16>
  // single_blocks.21 parameters: eight private f16 globals, each initialized from
  // the "model"-scope parameter whose key is the symbol name minus "@__auto.".
  //   modulation.lin: 9216x3072 weight + 9216 bias
  //   linear1:        21504x3072 weight + 21504 bias
  //   norm.{query,key}_norm.scale: 128 each
  //   linear2:        3072x15360 weight + 3072 bias
  util.global private @__auto.sampler.single_blocks.21.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.21.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.21.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.21.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.21.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.21.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.21.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.21.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.21.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.21.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.21.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.21.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.21.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.21.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.21.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.21.linear2.bias"> : tensor<3072xf16>
  // single_blocks.22 parameters: eight private f16 globals, each initialized from
  // the "model"-scope parameter whose key is the symbol name minus "@__auto.".
  //   modulation.lin: 9216x3072 weight + 9216 bias
  //   linear1:        21504x3072 weight + 21504 bias
  //   norm.{query,key}_norm.scale: 128 each
  //   linear2:        3072x15360 weight + 3072 bias
  util.global private @__auto.sampler.single_blocks.22.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.22.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.22.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.22.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.22.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.22.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.22.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.22.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.22.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.22.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.22.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.22.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.22.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.22.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.22.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.22.linear2.bias"> : tensor<3072xf16>
  // single_blocks.23 parameters: eight private f16 globals, each initialized from
  // the "model"-scope parameter whose key is the symbol name minus "@__auto.".
  //   modulation.lin: 9216x3072 weight + 9216 bias
  //   linear1:        21504x3072 weight + 21504 bias
  //   norm.{query,key}_norm.scale: 128 each
  //   linear2:        3072x15360 weight + 3072 bias
  util.global private @__auto.sampler.single_blocks.23.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.23.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.23.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.23.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.23.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.23.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.23.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.23.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.23.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.23.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.23.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.23.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.23.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.23.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.23.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.23.linear2.bias"> : tensor<3072xf16>
  // single_blocks.24 parameters: eight private f16 globals, each initialized from
  // the "model"-scope parameter whose key is the symbol name minus "@__auto.".
  //   modulation.lin: 9216x3072 weight + 9216 bias
  //   linear1:        21504x3072 weight + 21504 bias
  //   norm.{query,key}_norm.scale: 128 each
  //   linear2:        3072x15360 weight + 3072 bias
  util.global private @__auto.sampler.single_blocks.24.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.24.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.24.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.24.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.24.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.24.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.24.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.24.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.24.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.24.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.24.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.24.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.24.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.24.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.24.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.24.linear2.bias"> : tensor<3072xf16>
  // single_blocks.25 parameters: eight private f16 globals, each initialized from
  // the "model"-scope parameter whose key is the symbol name minus "@__auto.".
  //   modulation.lin: 9216x3072 weight + 9216 bias
  //   linear1:        21504x3072 weight + 21504 bias
  //   norm.{query,key}_norm.scale: 128 each
  //   linear2:        3072x15360 weight + 3072 bias
  util.global private @__auto.sampler.single_blocks.25.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.25.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.25.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.25.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.25.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.25.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.25.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.25.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.25.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.25.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.25.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.25.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.25.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.25.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.25.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.25.linear2.bias"> : tensor<3072xf16>
  // single_blocks.26 parameters: eight private f16 globals, each initialized from
  // the "model"-scope parameter whose key is the symbol name minus "@__auto.".
  //   modulation.lin: 9216x3072 weight + 9216 bias
  //   linear1:        21504x3072 weight + 21504 bias
  //   norm.{query,key}_norm.scale: 128 each
  //   linear2:        3072x15360 weight + 3072 bias
  util.global private @__auto.sampler.single_blocks.26.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.26.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.26.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.26.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.26.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.26.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.26.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.26.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.26.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.26.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.26.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.26.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.26.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.26.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.26.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.26.linear2.bias"> : tensor<3072xf16>
  // single_blocks.27 parameters: eight private f16 globals, each initialized from
  // the "model"-scope parameter whose key is the symbol name minus "@__auto.".
  //   modulation.lin: 9216x3072 weight + 9216 bias
  //   linear1:        21504x3072 weight + 21504 bias
  //   norm.{query,key}_norm.scale: 128 each
  //   linear2:        3072x15360 weight + 3072 bias
  util.global private @__auto.sampler.single_blocks.27.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.27.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.27.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.27.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.27.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.27.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.27.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.27.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.27.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.27.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.27.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.27.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.27.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.27.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.27.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.27.linear2.bias"> : tensor<3072xf16>
  // single_blocks.28 parameters: eight private f16 globals, each initialized from
  // the "model"-scope parameter whose key is the symbol name minus "@__auto.".
  //   modulation.lin: 9216x3072 weight + 9216 bias
  //   linear1:        21504x3072 weight + 21504 bias
  //   norm.{query,key}_norm.scale: 128 each
  //   linear2:        3072x15360 weight + 3072 bias
  util.global private @__auto.sampler.single_blocks.28.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.28.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.28.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.28.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.28.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.28.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.28.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.28.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.28.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.28.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.28.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.28.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.28.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.28.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.28.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.28.linear2.bias"> : tensor<3072xf16>
  // single_blocks.29 parameters: eight private f16 globals, each initialized from
  // the "model"-scope parameter whose key is the symbol name minus "@__auto.".
  //   modulation.lin: 9216x3072 weight + 9216 bias
  //   linear1:        21504x3072 weight + 21504 bias
  //   norm.{query,key}_norm.scale: 128 each
  //   linear2:        3072x15360 weight + 3072 bias
  util.global private @__auto.sampler.single_blocks.29.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.29.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.29.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.29.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.29.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.29.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.29.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.29.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.29.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.29.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.29.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.29.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.29.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.29.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.29.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.29.linear2.bias"> : tensor<3072xf16>
  // single_blocks.30 parameters: eight private f16 globals, each initialized from
  // the "model"-scope parameter whose key is the symbol name minus "@__auto.".
  //   modulation.lin: 9216x3072 weight + 9216 bias
  //   linear1:        21504x3072 weight + 21504 bias
  //   norm.{query,key}_norm.scale: 128 each
  //   linear2:        3072x15360 weight + 3072 bias
  util.global private @__auto.sampler.single_blocks.30.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.30.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.30.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.30.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.30.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.30.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.30.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.30.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.30.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.30.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.30.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.30.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.30.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.30.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.30.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.30.linear2.bias"> : tensor<3072xf16>
  // single_blocks.31 parameters: eight private f16 globals, each initialized from
  // the "model"-scope parameter whose key is the symbol name minus "@__auto.".
  //   modulation.lin: 9216x3072 weight + 9216 bias
  //   linear1:        21504x3072 weight + 21504 bias
  //   norm.{query,key}_norm.scale: 128 each
  //   linear2:        3072x15360 weight + 3072 bias
  util.global private @__auto.sampler.single_blocks.31.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.31.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.31.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.31.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.31.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.31.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.31.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.31.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.31.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.31.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.31.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.31.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.31.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.31.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.31.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.31.linear2.bias"> : tensor<3072xf16>
  // single_blocks.32 parameters: eight private f16 globals, each initialized from
  // the "model"-scope parameter whose key is the symbol name minus "@__auto.".
  //   modulation.lin: 9216x3072 weight + 9216 bias
  //   linear1:        21504x3072 weight + 21504 bias
  //   norm.{query,key}_norm.scale: 128 each
  //   linear2:        3072x15360 weight + 3072 bias
  util.global private @__auto.sampler.single_blocks.32.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.32.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.32.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.32.modulation.lin.bias"> : tensor<9216xf16>
  util.global private @__auto.sampler.single_blocks.32.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.32.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.32.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.32.linear1.bias"> : tensor<21504xf16>
  util.global private @__auto.sampler.single_blocks.32.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.32.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.32.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.32.norm.key_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.32.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.32.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.32.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.32.linear2.bias"> : tensor<3072xf16>
  // Parameter globals for single_blocks.33. Each global is private and
  // initialized from a named parameter in the "model" scope; the parameter key
  // is the symbol name without the "__auto." prefix. All tensors are f16.
  // modulation.lin: weight 9216x3072, bias 9216.
  util.global private @__auto.sampler.single_blocks.33.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.33.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.33.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.33.modulation.lin.bias"> : tensor<9216xf16>
  // linear1: weight 21504x3072, bias 21504.
  util.global private @__auto.sampler.single_blocks.33.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.33.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.33.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.33.linear1.bias"> : tensor<21504xf16>
  // norm.query_norm / norm.key_norm: one 128-element scale vector each.
  util.global private @__auto.sampler.single_blocks.33.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.33.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.33.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.33.norm.key_norm.scale"> : tensor<128xf16>
  // linear2: weight 3072x15360, bias 3072.
  util.global private @__auto.sampler.single_blocks.33.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.33.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.33.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.33.linear2.bias"> : tensor<3072xf16>
  // Parameter globals for single_blocks.34. Each global is private and
  // initialized from a named parameter in the "model" scope; the parameter key
  // is the symbol name without the "__auto." prefix. All tensors are f16.
  // modulation.lin: weight 9216x3072, bias 9216.
  util.global private @__auto.sampler.single_blocks.34.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.34.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.34.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.34.modulation.lin.bias"> : tensor<9216xf16>
  // linear1: weight 21504x3072, bias 21504.
  util.global private @__auto.sampler.single_blocks.34.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.34.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.34.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.34.linear1.bias"> : tensor<21504xf16>
  // norm.query_norm / norm.key_norm: one 128-element scale vector each.
  util.global private @__auto.sampler.single_blocks.34.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.34.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.34.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.34.norm.key_norm.scale"> : tensor<128xf16>
  // linear2: weight 3072x15360, bias 3072.
  util.global private @__auto.sampler.single_blocks.34.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.34.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.34.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.34.linear2.bias"> : tensor<3072xf16>
  // Parameter globals for single_blocks.35. Each global is private and
  // initialized from a named parameter in the "model" scope; the parameter key
  // is the symbol name without the "__auto." prefix. All tensors are f16.
  // modulation.lin: weight 9216x3072, bias 9216.
  util.global private @__auto.sampler.single_blocks.35.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.35.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.35.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.35.modulation.lin.bias"> : tensor<9216xf16>
  // linear1: weight 21504x3072, bias 21504.
  util.global private @__auto.sampler.single_blocks.35.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.35.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.35.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.35.linear1.bias"> : tensor<21504xf16>
  // norm.query_norm / norm.key_norm: one 128-element scale vector each.
  util.global private @__auto.sampler.single_blocks.35.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.35.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.35.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.35.norm.key_norm.scale"> : tensor<128xf16>
  // linear2: weight 3072x15360, bias 3072.
  util.global private @__auto.sampler.single_blocks.35.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.35.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.35.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.35.linear2.bias"> : tensor<3072xf16>
  // Parameter globals for single_blocks.36. Each global is private and
  // initialized from a named parameter in the "model" scope; the parameter key
  // is the symbol name without the "__auto." prefix. All tensors are f16.
  // modulation.lin: weight 9216x3072, bias 9216.
  util.global private @__auto.sampler.single_blocks.36.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.36.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.36.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.36.modulation.lin.bias"> : tensor<9216xf16>
  // linear1: weight 21504x3072, bias 21504.
  util.global private @__auto.sampler.single_blocks.36.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.36.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.36.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.36.linear1.bias"> : tensor<21504xf16>
  // norm.query_norm / norm.key_norm: one 128-element scale vector each.
  util.global private @__auto.sampler.single_blocks.36.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.36.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.36.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.36.norm.key_norm.scale"> : tensor<128xf16>
  // linear2: weight 3072x15360, bias 3072.
  util.global private @__auto.sampler.single_blocks.36.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.36.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.36.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.36.linear2.bias"> : tensor<3072xf16>
  // Parameter globals for single_blocks.37. Each global is private and
  // initialized from a named parameter in the "model" scope; the parameter key
  // is the symbol name without the "__auto." prefix. All tensors are f16.
  // modulation.lin: weight 9216x3072, bias 9216.
  util.global private @__auto.sampler.single_blocks.37.modulation.lin.weight = #stream.parameter.named<"model"::"sampler.single_blocks.37.modulation.lin.weight"> : tensor<9216x3072xf16>
  util.global private @__auto.sampler.single_blocks.37.modulation.lin.bias = #stream.parameter.named<"model"::"sampler.single_blocks.37.modulation.lin.bias"> : tensor<9216xf16>
  // linear1: weight 21504x3072, bias 21504.
  util.global private @__auto.sampler.single_blocks.37.linear1.weight = #stream.parameter.named<"model"::"sampler.single_blocks.37.linear1.weight"> : tensor<21504x3072xf16>
  util.global private @__auto.sampler.single_blocks.37.linear1.bias = #stream.parameter.named<"model"::"sampler.single_blocks.37.linear1.bias"> : tensor<21504xf16>
  // norm.query_norm / norm.key_norm: one 128-element scale vector each.
  util.global private @__auto.sampler.single_blocks.37.norm.query_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.37.norm.query_norm.scale"> : tensor<128xf16>
  util.global private @__auto.sampler.single_blocks.37.norm.key_norm.scale = #stream.parameter.named<"model"::"sampler.single_blocks.37.norm.key_norm.scale"> : tensor<128xf16>
  // linear2: weight 3072x15360, bias 3072.
  util.global private @__auto.sampler.single_blocks.37.linear2.weight = #stream.parameter.named<"model"::"sampler.single_blocks.37.linear2.weight"> : tensor<3072x15360xf16>
  util.global private @__auto.sampler.single_blocks.37.linear2.bias = #stream.parameter.named<"model"::"sampler.single_blocks.37.linear2.bias"> : tensor<3072xf16>
  // Parameter globals for final_layer. Each global is private and initialized
  // from a named parameter in the "model" scope; the parameter key is the
  // symbol name without the "__auto." prefix. All tensors are f16.
  // adaLN_modulation.1: weight 6144x3072, bias 6144.
  util.global private @__auto.sampler.final_layer.adaLN_modulation.1.weight = #stream.parameter.named<"model"::"sampler.final_layer.adaLN_modulation.1.weight"> : tensor<6144x3072xf16>
  util.global private @__auto.sampler.final_layer.adaLN_modulation.1.bias = #stream.parameter.named<"model"::"sampler.final_layer.adaLN_modulation.1.bias"> : tensor<6144xf16>
  // linear: weight 64x3072, bias 64. The 64 matches the last dimension of
  // @run_forward's declared result type, !torch.vtensor<[1,4096,64],f16>.
  util.global private @__auto.sampler.final_layer.linear.weight = #stream.parameter.named<"model"::"sampler.final_layer.linear.weight"> : tensor<64x3072xf16>
  util.global private @__auto.sampler.final_layer.linear.bias = #stream.parameter.named<"model"::"sampler.final_layer.linear.bias"> : tensor<64xf16>
  func.func @run_forward(%arg0: !torch.vtensor<[1,4096,64],f16>, %arg1: !torch.vtensor<[1,4096,3],f16>, %arg2: !torch.vtensor<[1,512,4096],f16>, %arg3: !torch.vtensor<[1,512,3],f16>, %arg4: !torch.vtensor<[1,768],f16>, %arg5: !torch.vtensor<[1],f16>, %arg6: !torch.vtensor<[1],f16>, %arg7: !torch.vtensor<[1],f16>) -> !torch.vtensor<[1,4096,64],f16> attributes {iree.reflection = {model_name = "flux_sampler"}, torch.assume_strict_symbolic_shapes} {
    %int1 = torch.constant.int 1
    %0 = torch.prim.ListConstruct %int1 : (!torch.int) -> !torch.list<int>
    %false = torch.constant.bool false
    %1 = torch.aten.expand %arg5, %0, %false : !torch.vtensor<[1],f16>, !torch.list<int>, !torch.bool -> !torch.vtensor<[1],f16>
    %int4096 = torch.constant.int 4096
    %int64 = torch.constant.int 64
    %2 = torch.prim.ListConstruct %int4096, %int64 : (!torch.int, !torch.int) -> !torch.list<int>
    %3 = torch.aten.view %arg0, %2 : !torch.vtensor<[1,4096,64],f16>, !torch.list<int> -> !torch.vtensor<[4096,64],f16>
    %__auto.sampler.img_in.weight = util.global.load @__auto.sampler.img_in.weight : tensor<3072x64xf16>
    %4 = torch_c.from_builtin_tensor %__auto.sampler.img_in.weight : tensor<3072x64xf16> -> !torch.vtensor<[3072,64],f16>
    %int0 = torch.constant.int 0
    %int1_0 = torch.constant.int 1
    %5 = torch.aten.transpose.int %4, %int0, %int1_0 : !torch.vtensor<[3072,64],f16>, !torch.int, !torch.int -> !torch.vtensor<[64,3072],f16>
    %__auto.sampler.img_in.bias = util.global.load @__auto.sampler.img_in.bias : tensor<3072xf16>
    %6 = torch_c.from_builtin_tensor %__auto.sampler.img_in.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6 = torch.constant.int 6
    %7 = torch.prims.convert_element_type %6, %int6 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_1 = torch.constant.int 6
    %8 = torch.prims.convert_element_type %3, %int6_1 : !torch.vtensor<[4096,64],f16>, !torch.int -> !torch.vtensor<[4096,64],f32>
    %int6_2 = torch.constant.int 6
    %9 = torch.prims.convert_element_type %5, %int6_2 : !torch.vtensor<[64,3072],f16>, !torch.int -> !torch.vtensor<[64,3072],f32>
    %10 = torch.aten.mm %8, %9 : !torch.vtensor<[4096,64],f32>, !torch.vtensor<[64,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    %int1_3 = torch.constant.int 1
    %11 = torch.aten.mul.Scalar %10, %int1_3 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_4 = torch.constant.int 1
    %12 = torch.aten.mul.Scalar %7, %int1_4 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_5 = torch.constant.int 1
    %13 = torch.aten.add.Tensor %11, %12, %int1_5 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5 = torch.constant.int 5
    %14 = torch.prims.convert_element_type %13, %int5 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_6 = torch.constant.int 1
    %int4096_7 = torch.constant.int 4096
    %int3072 = torch.constant.int 3072
    %15 = torch.prim.ListConstruct %int1_6, %int4096_7, %int3072 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %16 = torch.aten.view %14, %15 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    %float1.000000e03 = torch.constant.float 1.000000e+03
    %17 = torch.aten.mul.Scalar %1, %float1.000000e03 : !torch.vtensor<[1],f16>, !torch.float -> !torch.vtensor<[1],f16>
    %int0_8 = torch.constant.int 0
    %int128 = torch.constant.int 128
    %int6_9 = torch.constant.int 6
    %none = torch.constant.none
    %cpu = torch.constant.device "cpu"
    %false_10 = torch.constant.bool false
    %18 = torch.aten.arange.start %int0_8, %int128, %int6_9, %none, %cpu, %false_10 : !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],f32>
    %float-9.210340e00 = torch.constant.float -9.2103403719761836
    %19 = torch.aten.mul.Scalar %18, %float-9.210340e00 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
    %int128_11 = torch.constant.int 128
    %20 = torch.aten.div.Scalar %19, %int128_11 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
    %21 = torch.aten.exp %20 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
    %int6_12 = torch.constant.int 6
    %22 = torch.prims.convert_element_type %21, %int6_12 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
    %int0_13 = torch.constant.int 0
    %int0_14 = torch.constant.int 0
    %int9223372036854775807 = torch.constant.int 9223372036854775807
    %int1_15 = torch.constant.int 1
    %23 = torch.aten.slice.Tensor %17, %int0_13, %int0_14, %int9223372036854775807, %int1_15 : !torch.vtensor<[1],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1],f16>
    %int1_16 = torch.constant.int 1
    %24 = torch.aten.unsqueeze %23, %int1_16 : !torch.vtensor<[1],f16>, !torch.int -> !torch.vtensor<[1,1],f16>
    %int6_17 = torch.constant.int 6
    %25 = torch.prims.convert_element_type %24, %int6_17 : !torch.vtensor<[1,1],f16>, !torch.int -> !torch.vtensor<[1,1],f32>
    %int0_18 = torch.constant.int 0
    %26 = torch.aten.unsqueeze %22, %int0_18 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[1,128],f32>
    %27 = torch.aten.mul.Tensor %25, %26 : !torch.vtensor<[1,1],f32>, !torch.vtensor<[1,128],f32> -> !torch.vtensor<[1,128],f32>
    %28 = torch.aten.cos %27 : !torch.vtensor<[1,128],f32> -> !torch.vtensor<[1,128],f32>
    %29 = torch.aten.sin %27 : !torch.vtensor<[1,128],f32> -> !torch.vtensor<[1,128],f32>
    %30 = torch.prim.ListConstruct %28, %29 : (!torch.vtensor<[1,128],f32>, !torch.vtensor<[1,128],f32>) -> !torch.list<vtensor>
    %int-1 = torch.constant.int -1
    %31 = torch.aten.cat %30, %int-1 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,256],f32>
    %int5_19 = torch.constant.int 5
    %32 = torch.prims.convert_element_type %31, %int5_19 : !torch.vtensor<[1,256],f32>, !torch.int -> !torch.vtensor<[1,256],f16>
    %__auto.sampler.time_in.in_layer.weight = util.global.load @__auto.sampler.time_in.in_layer.weight : tensor<3072x256xf16>
    %33 = torch_c.from_builtin_tensor %__auto.sampler.time_in.in_layer.weight : tensor<3072x256xf16> -> !torch.vtensor<[3072,256],f16>
    %int0_20 = torch.constant.int 0
    %int1_21 = torch.constant.int 1
    %34 = torch.aten.transpose.int %33, %int0_20, %int1_21 : !torch.vtensor<[3072,256],f16>, !torch.int, !torch.int -> !torch.vtensor<[256,3072],f16>
    %__auto.sampler.time_in.in_layer.bias = util.global.load @__auto.sampler.time_in.in_layer.bias : tensor<3072xf16>
    %35 = torch_c.from_builtin_tensor %__auto.sampler.time_in.in_layer.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_22 = torch.constant.int 6
    %36 = torch.prims.convert_element_type %35, %int6_22 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_23 = torch.constant.int 6
    %37 = torch.prims.convert_element_type %32, %int6_23 : !torch.vtensor<[1,256],f16>, !torch.int -> !torch.vtensor<[1,256],f32>
    %int6_24 = torch.constant.int 6
    %38 = torch.prims.convert_element_type %34, %int6_24 : !torch.vtensor<[256,3072],f16>, !torch.int -> !torch.vtensor<[256,3072],f32>
    %39 = torch.aten.mm %37, %38 : !torch.vtensor<[1,256],f32>, !torch.vtensor<[256,3072],f32> -> !torch.vtensor<[1,3072],f32>
    %int1_25 = torch.constant.int 1
    %40 = torch.aten.mul.Scalar %39, %int1_25 : !torch.vtensor<[1,3072],f32>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int1_26 = torch.constant.int 1
    %41 = torch.aten.mul.Scalar %36, %int1_26 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_27 = torch.constant.int 1
    %42 = torch.aten.add.Tensor %40, %41, %int1_27 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int5_28 = torch.constant.int 5
    %43 = torch.prims.convert_element_type %42, %int5_28 : !torch.vtensor<[1,3072],f32>, !torch.int -> !torch.vtensor<[1,3072],f16>
    %44 = torch.aten.silu %43 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.time_in.out_layer.weight = util.global.load @__auto.sampler.time_in.out_layer.weight : tensor<3072x3072xf16>
    %45 = torch_c.from_builtin_tensor %__auto.sampler.time_in.out_layer.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_29 = torch.constant.int 0
    %int1_30 = torch.constant.int 1
    %46 = torch.aten.transpose.int %45, %int0_29, %int1_30 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.time_in.out_layer.bias = util.global.load @__auto.sampler.time_in.out_layer.bias : tensor<3072xf16>
    %47 = torch_c.from_builtin_tensor %__auto.sampler.time_in.out_layer.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_31 = torch.constant.int 6
    %48 = torch.prims.convert_element_type %47, %int6_31 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_32 = torch.constant.int 6
    %49 = torch.prims.convert_element_type %44, %int6_32 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_33 = torch.constant.int 6
    %50 = torch.prims.convert_element_type %46, %int6_33 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %51 = torch.aten.mm %49, %50 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[1,3072],f32>
    %int1_34 = torch.constant.int 1
    %52 = torch.aten.mul.Scalar %51, %int1_34 : !torch.vtensor<[1,3072],f32>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int1_35 = torch.constant.int 1
    %53 = torch.aten.mul.Scalar %48, %int1_35 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_36 = torch.constant.int 1
    %54 = torch.aten.add.Tensor %52, %53, %int1_36 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int5_37 = torch.constant.int 5
    %55 = torch.prims.convert_element_type %54, %int5_37 : !torch.vtensor<[1,3072],f32>, !torch.int -> !torch.vtensor<[1,3072],f16>
    %float1.000000e03_38 = torch.constant.float 1.000000e+03
    %56 = torch.aten.mul.Scalar %arg7, %float1.000000e03_38 : !torch.vtensor<[1],f16>, !torch.float -> !torch.vtensor<[1],f16>
    %int0_39 = torch.constant.int 0
    %int128_40 = torch.constant.int 128
    %int6_41 = torch.constant.int 6
    %none_42 = torch.constant.none
    %cpu_43 = torch.constant.device "cpu"
    %false_44 = torch.constant.bool false
    %57 = torch.aten.arange.start %int0_39, %int128_40, %int6_41, %none_42, %cpu_43, %false_44 : !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[128],f32>
    %float-9.210340e00_45 = torch.constant.float -9.2103403719761836
    %58 = torch.aten.mul.Scalar %57, %float-9.210340e00_45 : !torch.vtensor<[128],f32>, !torch.float -> !torch.vtensor<[128],f32>
    %int128_46 = torch.constant.int 128
    %59 = torch.aten.div.Scalar %58, %int128_46 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
    %60 = torch.aten.exp %59 : !torch.vtensor<[128],f32> -> !torch.vtensor<[128],f32>
    %int6_47 = torch.constant.int 6
    %61 = torch.prims.convert_element_type %60, %int6_47 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[128],f32>
    %int0_48 = torch.constant.int 0
    %int0_49 = torch.constant.int 0
    %int9223372036854775807_50 = torch.constant.int 9223372036854775807
    %int1_51 = torch.constant.int 1
    %62 = torch.aten.slice.Tensor %56, %int0_48, %int0_49, %int9223372036854775807_50, %int1_51 : !torch.vtensor<[1],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1],f16>
    %int1_52 = torch.constant.int 1
    %63 = torch.aten.unsqueeze %62, %int1_52 : !torch.vtensor<[1],f16>, !torch.int -> !torch.vtensor<[1,1],f16>
    %int6_53 = torch.constant.int 6
    %64 = torch.prims.convert_element_type %63, %int6_53 : !torch.vtensor<[1,1],f16>, !torch.int -> !torch.vtensor<[1,1],f32>
    %int0_54 = torch.constant.int 0
    %65 = torch.aten.unsqueeze %61, %int0_54 : !torch.vtensor<[128],f32>, !torch.int -> !torch.vtensor<[1,128],f32>
    %66 = torch.aten.mul.Tensor %64, %65 : !torch.vtensor<[1,1],f32>, !torch.vtensor<[1,128],f32> -> !torch.vtensor<[1,128],f32>
    %67 = torch.aten.cos %66 : !torch.vtensor<[1,128],f32> -> !torch.vtensor<[1,128],f32>
    %68 = torch.aten.sin %66 : !torch.vtensor<[1,128],f32> -> !torch.vtensor<[1,128],f32>
    %69 = torch.prim.ListConstruct %67, %68 : (!torch.vtensor<[1,128],f32>, !torch.vtensor<[1,128],f32>) -> !torch.list<vtensor>
    %int-1_55 = torch.constant.int -1
    %70 = torch.aten.cat %69, %int-1_55 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,256],f32>
    %int5_56 = torch.constant.int 5
    %71 = torch.prims.convert_element_type %70, %int5_56 : !torch.vtensor<[1,256],f32>, !torch.int -> !torch.vtensor<[1,256],f16>
    %__auto.sampler.guidance_in.in_layer.weight = util.global.load @__auto.sampler.guidance_in.in_layer.weight : tensor<3072x256xf16>
    %72 = torch_c.from_builtin_tensor %__auto.sampler.guidance_in.in_layer.weight : tensor<3072x256xf16> -> !torch.vtensor<[3072,256],f16>
    %int0_57 = torch.constant.int 0
    %int1_58 = torch.constant.int 1
    %73 = torch.aten.transpose.int %72, %int0_57, %int1_58 : !torch.vtensor<[3072,256],f16>, !torch.int, !torch.int -> !torch.vtensor<[256,3072],f16>
    %__auto.sampler.guidance_in.in_layer.bias = util.global.load @__auto.sampler.guidance_in.in_layer.bias : tensor<3072xf16>
    %74 = torch_c.from_builtin_tensor %__auto.sampler.guidance_in.in_layer.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_59 = torch.constant.int 6
    %75 = torch.prims.convert_element_type %74, %int6_59 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_60 = torch.constant.int 6
    %76 = torch.prims.convert_element_type %71, %int6_60 : !torch.vtensor<[1,256],f16>, !torch.int -> !torch.vtensor<[1,256],f32>
    %int6_61 = torch.constant.int 6
    %77 = torch.prims.convert_element_type %73, %int6_61 : !torch.vtensor<[256,3072],f16>, !torch.int -> !torch.vtensor<[256,3072],f32>
    %78 = torch.aten.mm %76, %77 : !torch.vtensor<[1,256],f32>, !torch.vtensor<[256,3072],f32> -> !torch.vtensor<[1,3072],f32>
    %int1_62 = torch.constant.int 1
    %79 = torch.aten.mul.Scalar %78, %int1_62 : !torch.vtensor<[1,3072],f32>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int1_63 = torch.constant.int 1
    %80 = torch.aten.mul.Scalar %75, %int1_63 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_64 = torch.constant.int 1
    %81 = torch.aten.add.Tensor %79, %80, %int1_64 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int5_65 = torch.constant.int 5
    %82 = torch.prims.convert_element_type %81, %int5_65 : !torch.vtensor<[1,3072],f32>, !torch.int -> !torch.vtensor<[1,3072],f16>
    %83 = torch.aten.silu %82 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.guidance_in.out_layer.weight = util.global.load @__auto.sampler.guidance_in.out_layer.weight : tensor<3072x3072xf16>
    %84 = torch_c.from_builtin_tensor %__auto.sampler.guidance_in.out_layer.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_66 = torch.constant.int 0
    %int1_67 = torch.constant.int 1
    %85 = torch.aten.transpose.int %84, %int0_66, %int1_67 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.guidance_in.out_layer.bias = util.global.load @__auto.sampler.guidance_in.out_layer.bias : tensor<3072xf16>
    %86 = torch_c.from_builtin_tensor %__auto.sampler.guidance_in.out_layer.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_68 = torch.constant.int 6
    %87 = torch.prims.convert_element_type %86, %int6_68 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_69 = torch.constant.int 6
    %88 = torch.prims.convert_element_type %83, %int6_69 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_70 = torch.constant.int 6
    %89 = torch.prims.convert_element_type %85, %int6_70 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %90 = torch.aten.mm %88, %89 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[1,3072],f32>
    %int1_71 = torch.constant.int 1
    %91 = torch.aten.mul.Scalar %90, %int1_71 : !torch.vtensor<[1,3072],f32>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int1_72 = torch.constant.int 1
    %92 = torch.aten.mul.Scalar %87, %int1_72 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_73 = torch.constant.int 1
    %93 = torch.aten.add.Tensor %91, %92, %int1_73 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int5_74 = torch.constant.int 5
    %94 = torch.prims.convert_element_type %93, %int5_74 : !torch.vtensor<[1,3072],f32>, !torch.int -> !torch.vtensor<[1,3072],f16>
    %int1_75 = torch.constant.int 1
    %95 = torch.aten.add.Tensor %55, %94, %int1_75 : !torch.vtensor<[1,3072],f16>, !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.vector_in.in_layer.weight = util.global.load @__auto.sampler.vector_in.in_layer.weight : tensor<3072x768xf16>
    %96 = torch_c.from_builtin_tensor %__auto.sampler.vector_in.in_layer.weight : tensor<3072x768xf16> -> !torch.vtensor<[3072,768],f16>
    %int0_76 = torch.constant.int 0
    %int1_77 = torch.constant.int 1
    %97 = torch.aten.transpose.int %96, %int0_76, %int1_77 : !torch.vtensor<[3072,768],f16>, !torch.int, !torch.int -> !torch.vtensor<[768,3072],f16>
    %__auto.sampler.vector_in.in_layer.bias = util.global.load @__auto.sampler.vector_in.in_layer.bias : tensor<3072xf16>
    %98 = torch_c.from_builtin_tensor %__auto.sampler.vector_in.in_layer.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_78 = torch.constant.int 6
    %99 = torch.prims.convert_element_type %98, %int6_78 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_79 = torch.constant.int 6
    %100 = torch.prims.convert_element_type %arg4, %int6_79 : !torch.vtensor<[1,768],f16>, !torch.int -> !torch.vtensor<[1,768],f32>
    %int6_80 = torch.constant.int 6
    %101 = torch.prims.convert_element_type %97, %int6_80 : !torch.vtensor<[768,3072],f16>, !torch.int -> !torch.vtensor<[768,3072],f32>
    %102 = torch.aten.mm %100, %101 : !torch.vtensor<[1,768],f32>, !torch.vtensor<[768,3072],f32> -> !torch.vtensor<[1,3072],f32>
    %int1_81 = torch.constant.int 1
    %103 = torch.aten.mul.Scalar %102, %int1_81 : !torch.vtensor<[1,3072],f32>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int1_82 = torch.constant.int 1
    %104 = torch.aten.mul.Scalar %99, %int1_82 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_83 = torch.constant.int 1
    %105 = torch.aten.add.Tensor %103, %104, %int1_83 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int5_84 = torch.constant.int 5
    %106 = torch.prims.convert_element_type %105, %int5_84 : !torch.vtensor<[1,3072],f32>, !torch.int -> !torch.vtensor<[1,3072],f16>
    %107 = torch.aten.silu %106 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.vector_in.out_layer.weight = util.global.load @__auto.sampler.vector_in.out_layer.weight : tensor<3072x3072xf16>
    %108 = torch_c.from_builtin_tensor %__auto.sampler.vector_in.out_layer.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_85 = torch.constant.int 0
    %int1_86 = torch.constant.int 1
    %109 = torch.aten.transpose.int %108, %int0_85, %int1_86 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.vector_in.out_layer.bias = util.global.load @__auto.sampler.vector_in.out_layer.bias : tensor<3072xf16>
    %110 = torch_c.from_builtin_tensor %__auto.sampler.vector_in.out_layer.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_87 = torch.constant.int 6
    %111 = torch.prims.convert_element_type %110, %int6_87 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_88 = torch.constant.int 6
    %112 = torch.prims.convert_element_type %107, %int6_88 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_89 = torch.constant.int 6
    %113 = torch.prims.convert_element_type %109, %int6_89 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %114 = torch.aten.mm %112, %113 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[1,3072],f32>
    %int1_90 = torch.constant.int 1
    %115 = torch.aten.mul.Scalar %114, %int1_90 : !torch.vtensor<[1,3072],f32>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int1_91 = torch.constant.int 1
    %116 = torch.aten.mul.Scalar %111, %int1_91 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_92 = torch.constant.int 1
    %117 = torch.aten.add.Tensor %115, %116, %int1_92 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int5_93 = torch.constant.int 5
    %118 = torch.prims.convert_element_type %117, %int5_93 : !torch.vtensor<[1,3072],f32>, !torch.int -> !torch.vtensor<[1,3072],f16>
    %int1_94 = torch.constant.int 1
    %119 = torch.aten.add.Tensor %95, %118, %int1_94 : !torch.vtensor<[1,3072],f16>, !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f16>
    // Linear projection of %arg2 with the `sampler.txt_in` parameters:
    //   [1,512,4096] f16 -> flatten to [512,4096] -> x @ W^T + b computed in f32
    //   -> cast back to f16 -> reshape to [1,512,3072] (%134).
    %int512 = torch.constant.int 512
    %int4096_95 = torch.constant.int 4096
    %120 = torch.prim.ListConstruct %int512, %int4096_95 : (!torch.int, !torch.int) -> !torch.list<int>
    // Drop the leading batch dim so a plain 2-D matmul can be used.
    %121 = torch.aten.view %arg2, %120 : !torch.vtensor<[1,512,4096],f16>, !torch.list<int> -> !torch.vtensor<[512,4096],f16>
    %__auto.sampler.txt_in.weight = util.global.load @__auto.sampler.txt_in.weight : tensor<3072x4096xf16>
    %122 = torch_c.from_builtin_tensor %__auto.sampler.txt_in.weight : tensor<3072x4096xf16> -> !torch.vtensor<[3072,4096],f16>
    %int0_96 = torch.constant.int 0
    %int1_97 = torch.constant.int 1
    // The weight is stored as [out,in] = [3072,4096]; transpose to [4096,3072] for the matmul.
    %123 = torch.aten.transpose.int %122, %int0_96, %int1_97 : !torch.vtensor<[3072,4096],f16>, !torch.int, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.txt_in.bias = util.global.load @__auto.sampler.txt_in.bias : tensor<3072xf16>
    %124 = torch_c.from_builtin_tensor %__auto.sampler.txt_in.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // dtype code 6 selects f32 (see the result types): bias, input and weight are all upcast before the matmul.
    %int6_98 = torch.constant.int 6
    %125 = torch.prims.convert_element_type %124, %int6_98 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_99 = torch.constant.int 6
    %126 = torch.prims.convert_element_type %121, %int6_99 : !torch.vtensor<[512,4096],f16>, !torch.int -> !torch.vtensor<[512,4096],f32>
    %int6_100 = torch.constant.int 6
    %127 = torch.prims.convert_element_type %123, %int6_100 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %128 = torch.aten.mm %126, %127 : !torch.vtensor<[512,4096],f32>, !torch.vtensor<[4096,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Both multiplications by the constant 1 leave the values unchanged.
    // NOTE(review): presumably the alpha/beta factors of a decomposed addmm — confirm.
    %int1_101 = torch.constant.int 1
    %129 = torch.aten.mul.Scalar %128, %int1_101 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_102 = torch.constant.int 1
    %130 = torch.aten.mul.Scalar %125, %int1_102 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_103 = torch.constant.int 1
    // Bias add, broadcasting [3072] over the 512 rows.
    %131 = torch.aten.add.Tensor %129, %130, %int1_103 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    // dtype code 5 selects f16 (see the result type).
    %int5_104 = torch.constant.int 5
    %132 = torch.prims.convert_element_type %131, %int5_104 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_105 = torch.constant.int 1
    %int512_106 = torch.constant.int 512
    %int3072_107 = torch.constant.int 3072
    %133 = torch.prim.ListConstruct %int1_105, %int512_106, %int3072_107 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // Restore the leading batch dim.
    %134 = torch.aten.view %132, %133 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Concatenate %arg3 ([1,512,3]) and %arg1 ([1,4096,3]) along dim 1 -> [1,4608,3].
    // NOTE(review): presumably per-token position ids (text first, then image) — confirm
    // against the function signature, which is outside this excerpt.
    %135 = torch.prim.ListConstruct %arg3, %arg1 : (!torch.vtensor<[1,512,3],f16>, !torch.vtensor<[1,4096,3],f16>) -> !torch.list<vtensor>
    %int1_108 = torch.constant.int 1
    %136 = torch.aten.cat %135, %int1_108 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,3],f16>
    // Build a [1,4608,8,2,2] f32 table of [[cos, -sin], [sin, cos]] blocks from column 0
    // of %136, using 8 frequencies 1 / 10000^(k/16) for k = 0, 2, ..., 14.
    // NOTE(review): this matches the usual rotary position embedding (RoPE) construction — confirm.
    %int2 = torch.constant.int 2
    %int0_109 = torch.constant.int 0
    // Take index 0 along the last dim (dim 2) of %136 -> [1,4608].
    %137 = torch.aten.select.int %136, %int2, %int0_109 : !torch.vtensor<[1,4608,3],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,4608],f16>
    %int0_110 = torch.constant.int 0
    %int16 = torch.constant.int 16
    %int2_111 = torch.constant.int 2
    // dtype code 7 selects f64 (see the result type).
    %int7 = torch.constant.int 7
    %none_112 = torch.constant.none
    %cpu_113 = torch.constant.device "cpu"
    %false_114 = torch.constant.bool false
    // arange(start=0, end=16, step=2) -> 8 values.
    %138 = torch.aten.arange.start_step %int0_110, %int16, %int2_111, %int7, %none_112, %cpu_113, %false_114 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[8],f64>
    %int16_115 = torch.constant.int 16
    %139 = torch.aten.div.Scalar %138, %int16_115 : !torch.vtensor<[8],f64>, !torch.int -> !torch.vtensor<[8],f64>
    %int10000 = torch.constant.int 10000
    // 10000 ** exponent, then reciprocal -> per-frequency multipliers.
    %140 = torch.aten.pow.Scalar %int10000, %139 : !torch.int, !torch.vtensor<[8],f64> -> !torch.vtensor<[8],f64>
    %141 = torch.aten.reciprocal %140 : !torch.vtensor<[8],f64> -> !torch.vtensor<[8],f64>
    // Multiplying by 1.0 leaves the values unchanged.
    %float1.000000e00 = torch.constant.float 1.000000e+00
    %142 = torch.aten.mul.Scalar %141, %float1.000000e00 : !torch.vtensor<[8],f64>, !torch.float -> !torch.vtensor<[8],f64>
    %int2_116 = torch.constant.int 2
    %143 = torch.aten.unsqueeze %137, %int2_116 : !torch.vtensor<[1,4608],f16>, !torch.int -> !torch.vtensor<[1,4608,1],f16>
    %int0_117 = torch.constant.int 0
    %int1_118 = torch.constant.int 1
    %int2_119 = torch.constant.int 2
    %144 = torch.prim.ListConstruct %int0_117, %int1_118, %int2_119 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // Identity permutation (0,1,2): the shape is unchanged.
    %145 = torch.aten.permute %143, %144 : !torch.vtensor<[1,4608,1],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,1],f16>
    %int1_120 = torch.constant.int 1
    // Reshape the frequencies [8] -> [8,1,1] -> permute (1,2,0) -> [1,1,8] so they broadcast against [1,4608,1].
    %146 = torch.aten.unsqueeze %142, %int1_120 : !torch.vtensor<[8],f64>, !torch.int -> !torch.vtensor<[8,1],f64>
    %int2_121 = torch.constant.int 2
    %147 = torch.aten.unsqueeze %146, %int2_121 : !torch.vtensor<[8,1],f64>, !torch.int -> !torch.vtensor<[8,1,1],f64>
    %int1_122 = torch.constant.int 1
    %int2_123 = torch.constant.int 2
    %int0_124 = torch.constant.int 0
    %148 = torch.prim.ListConstruct %int1_122, %int2_123, %int0_124 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %149 = torch.aten.permute %147, %148 : !torch.vtensor<[8,1,1],f64>, !torch.list<int> -> !torch.vtensor<[1,1,8],f64>
    // Outer product of the selected column and the frequencies; the f16 x f64 product is typed f64.
    %150 = torch.aten.mul.Tensor %145, %149 : !torch.vtensor<[1,4608,1],f16>, !torch.vtensor<[1,1,8],f64> -> !torch.vtensor<[1,4608,8],f64>
    // Stack order is (cos, -sin, sin, cos). %154 and %155 recompute the same sin/cos of %150 as %152 and %151.
    %151 = torch.aten.cos %150 : !torch.vtensor<[1,4608,8],f64> -> !torch.vtensor<[1,4608,8],f64>
    %152 = torch.aten.sin %150 : !torch.vtensor<[1,4608,8],f64> -> !torch.vtensor<[1,4608,8],f64>
    %153 = torch.aten.neg %152 : !torch.vtensor<[1,4608,8],f64> -> !torch.vtensor<[1,4608,8],f64>
    %154 = torch.aten.sin %150 : !torch.vtensor<[1,4608,8],f64> -> !torch.vtensor<[1,4608,8],f64>
    %155 = torch.aten.cos %150 : !torch.vtensor<[1,4608,8],f64> -> !torch.vtensor<[1,4608,8],f64>
    %156 = torch.prim.ListConstruct %151, %153, %154, %155 : (!torch.vtensor<[1,4608,8],f64>, !torch.vtensor<[1,4608,8],f64>, !torch.vtensor<[1,4608,8],f64>, !torch.vtensor<[1,4608,8],f64>) -> !torch.list<vtensor>
    %int-1_125 = torch.constant.int -1
    // Stack on a new trailing dim -> [..., 4], then view the 4 entries as a 2x2 block.
    %157 = torch.aten.stack %156, %int-1_125 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,8,4],f64>
    %int1_126 = torch.constant.int 1
    %int4608 = torch.constant.int 4608
    %int8 = torch.constant.int 8
    %int2_127 = torch.constant.int 2
    %int2_128 = torch.constant.int 2
    %158 = torch.prim.ListConstruct %int1_126, %int4608, %int8, %int2_127, %int2_128 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %159 = torch.aten.view %157, %158 : !torch.vtensor<[1,4608,8,4],f64>, !torch.list<int> -> !torch.vtensor<[1,4608,8,2,2],f64>
    // dtype code 6 selects f32 (see the result type): the f64 table is narrowed to f32.
    %int6_129 = torch.constant.int 6
    %160 = torch.prims.convert_element_type %159, %int6_129 : !torch.vtensor<[1,4608,8,2,2],f64>, !torch.int -> !torch.vtensor<[1,4608,8,2,2],f32>
    // Build a [1,4608,28,2,2] f32 table of [[cos, -sin], [sin, cos]] blocks from column 1
    // of %136, using 28 frequencies 1 / 10000^(k/56) for k = 0, 2, ..., 54.
    // NOTE(review): this matches the usual rotary position embedding (RoPE) construction — confirm.
    %int2_130 = torch.constant.int 2
    %int1_131 = torch.constant.int 1
    // Take index 1 along the last dim (dim 2) of %136 -> [1,4608].
    %161 = torch.aten.select.int %136, %int2_130, %int1_131 : !torch.vtensor<[1,4608,3],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,4608],f16>
    %int0_132 = torch.constant.int 0
    %int56 = torch.constant.int 56
    %int2_133 = torch.constant.int 2
    // dtype code 7 selects f64 (see the result type).
    %int7_134 = torch.constant.int 7
    %none_135 = torch.constant.none
    %cpu_136 = torch.constant.device "cpu"
    %false_137 = torch.constant.bool false
    // arange(start=0, end=56, step=2) -> 28 values.
    %162 = torch.aten.arange.start_step %int0_132, %int56, %int2_133, %int7_134, %none_135, %cpu_136, %false_137 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[28],f64>
    %int56_138 = torch.constant.int 56
    %163 = torch.aten.div.Scalar %162, %int56_138 : !torch.vtensor<[28],f64>, !torch.int -> !torch.vtensor<[28],f64>
    %int10000_139 = torch.constant.int 10000
    // 10000 ** exponent, then reciprocal -> per-frequency multipliers.
    %164 = torch.aten.pow.Scalar %int10000_139, %163 : !torch.int, !torch.vtensor<[28],f64> -> !torch.vtensor<[28],f64>
    %165 = torch.aten.reciprocal %164 : !torch.vtensor<[28],f64> -> !torch.vtensor<[28],f64>
    // Multiplying by 1.0 leaves the values unchanged.
    %float1.000000e00_140 = torch.constant.float 1.000000e+00
    %166 = torch.aten.mul.Scalar %165, %float1.000000e00_140 : !torch.vtensor<[28],f64>, !torch.float -> !torch.vtensor<[28],f64>
    %int2_141 = torch.constant.int 2
    %167 = torch.aten.unsqueeze %161, %int2_141 : !torch.vtensor<[1,4608],f16>, !torch.int -> !torch.vtensor<[1,4608,1],f16>
    %int0_142 = torch.constant.int 0
    %int1_143 = torch.constant.int 1
    %int2_144 = torch.constant.int 2
    %168 = torch.prim.ListConstruct %int0_142, %int1_143, %int2_144 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // Identity permutation (0,1,2): the shape is unchanged.
    %169 = torch.aten.permute %167, %168 : !torch.vtensor<[1,4608,1],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,1],f16>
    %int1_145 = torch.constant.int 1
    // Reshape the frequencies [28] -> [28,1,1] -> permute (1,2,0) -> [1,1,28] so they broadcast against [1,4608,1].
    %170 = torch.aten.unsqueeze %166, %int1_145 : !torch.vtensor<[28],f64>, !torch.int -> !torch.vtensor<[28,1],f64>
    %int2_146 = torch.constant.int 2
    %171 = torch.aten.unsqueeze %170, %int2_146 : !torch.vtensor<[28,1],f64>, !torch.int -> !torch.vtensor<[28,1,1],f64>
    %int1_147 = torch.constant.int 1
    %int2_148 = torch.constant.int 2
    %int0_149 = torch.constant.int 0
    %172 = torch.prim.ListConstruct %int1_147, %int2_148, %int0_149 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %173 = torch.aten.permute %171, %172 : !torch.vtensor<[28,1,1],f64>, !torch.list<int> -> !torch.vtensor<[1,1,28],f64>
    // Outer product of the selected column and the frequencies; the f16 x f64 product is typed f64.
    %174 = torch.aten.mul.Tensor %169, %173 : !torch.vtensor<[1,4608,1],f16>, !torch.vtensor<[1,1,28],f64> -> !torch.vtensor<[1,4608,28],f64>
    // Stack order is (cos, -sin, sin, cos). %178 and %179 recompute the same sin/cos of %174 as %176 and %175.
    %175 = torch.aten.cos %174 : !torch.vtensor<[1,4608,28],f64> -> !torch.vtensor<[1,4608,28],f64>
    %176 = torch.aten.sin %174 : !torch.vtensor<[1,4608,28],f64> -> !torch.vtensor<[1,4608,28],f64>
    %177 = torch.aten.neg %176 : !torch.vtensor<[1,4608,28],f64> -> !torch.vtensor<[1,4608,28],f64>
    %178 = torch.aten.sin %174 : !torch.vtensor<[1,4608,28],f64> -> !torch.vtensor<[1,4608,28],f64>
    %179 = torch.aten.cos %174 : !torch.vtensor<[1,4608,28],f64> -> !torch.vtensor<[1,4608,28],f64>
    %180 = torch.prim.ListConstruct %175, %177, %178, %179 : (!torch.vtensor<[1,4608,28],f64>, !torch.vtensor<[1,4608,28],f64>, !torch.vtensor<[1,4608,28],f64>, !torch.vtensor<[1,4608,28],f64>) -> !torch.list<vtensor>
    %int-1_150 = torch.constant.int -1
    // Stack on a new trailing dim -> [..., 4], then view the 4 entries as a 2x2 block.
    %181 = torch.aten.stack %180, %int-1_150 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,28,4],f64>
    %int1_151 = torch.constant.int 1
    %int4608_152 = torch.constant.int 4608
    %int28 = torch.constant.int 28
    %int2_153 = torch.constant.int 2
    %int2_154 = torch.constant.int 2
    %182 = torch.prim.ListConstruct %int1_151, %int4608_152, %int28, %int2_153, %int2_154 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %183 = torch.aten.view %181, %182 : !torch.vtensor<[1,4608,28,4],f64>, !torch.list<int> -> !torch.vtensor<[1,4608,28,2,2],f64>
    // dtype code 6 selects f32 (see the result type): the f64 table is narrowed to f32.
    %int6_155 = torch.constant.int 6
    %184 = torch.prims.convert_element_type %183, %int6_155 : !torch.vtensor<[1,4608,28,2,2],f64>, !torch.int -> !torch.vtensor<[1,4608,28,2,2],f32>
    // Build a [1,4608,28,2,2] f32 table of [[cos, -sin], [sin, cos]] blocks from column 2
    // of %136, using 28 frequencies 1 / 10000^(k/56) for k = 0, 2, ..., 54.
    // NOTE(review): this matches the usual rotary position embedding (RoPE) construction — confirm.
    %int2_156 = torch.constant.int 2
    %int2_157 = torch.constant.int 2
    // Take index 2 along the last dim (dim 2) of %136 -> [1,4608].
    %185 = torch.aten.select.int %136, %int2_156, %int2_157 : !torch.vtensor<[1,4608,3],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,4608],f16>
    %int0_158 = torch.constant.int 0
    %int56_159 = torch.constant.int 56
    %int2_160 = torch.constant.int 2
    // dtype code 7 selects f64 (see the result type).
    %int7_161 = torch.constant.int 7
    %none_162 = torch.constant.none
    %cpu_163 = torch.constant.device "cpu"
    %false_164 = torch.constant.bool false
    // arange(start=0, end=56, step=2) -> 28 values.
    %186 = torch.aten.arange.start_step %int0_158, %int56_159, %int2_160, %int7_161, %none_162, %cpu_163, %false_164 : !torch.int, !torch.int, !torch.int, !torch.int, !torch.none, !torch.Device, !torch.bool -> !torch.vtensor<[28],f64>
    %int56_165 = torch.constant.int 56
    %187 = torch.aten.div.Scalar %186, %int56_165 : !torch.vtensor<[28],f64>, !torch.int -> !torch.vtensor<[28],f64>
    %int10000_166 = torch.constant.int 10000
    // 10000 ** exponent, then reciprocal -> per-frequency multipliers.
    %188 = torch.aten.pow.Scalar %int10000_166, %187 : !torch.int, !torch.vtensor<[28],f64> -> !torch.vtensor<[28],f64>
    %189 = torch.aten.reciprocal %188 : !torch.vtensor<[28],f64> -> !torch.vtensor<[28],f64>
    // Multiplying by 1.0 leaves the values unchanged.
    %float1.000000e00_167 = torch.constant.float 1.000000e+00
    %190 = torch.aten.mul.Scalar %189, %float1.000000e00_167 : !torch.vtensor<[28],f64>, !torch.float -> !torch.vtensor<[28],f64>
    %int2_168 = torch.constant.int 2
    %191 = torch.aten.unsqueeze %185, %int2_168 : !torch.vtensor<[1,4608],f16>, !torch.int -> !torch.vtensor<[1,4608,1],f16>
    %int0_169 = torch.constant.int 0
    %int1_170 = torch.constant.int 1
    %int2_171 = torch.constant.int 2
    %192 = torch.prim.ListConstruct %int0_169, %int1_170, %int2_171 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // Identity permutation (0,1,2): the shape is unchanged.
    %193 = torch.aten.permute %191, %192 : !torch.vtensor<[1,4608,1],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,1],f16>
    %int1_172 = torch.constant.int 1
    // Reshape the frequencies [28] -> [28,1,1] -> permute (1,2,0) -> [1,1,28] so they broadcast against [1,4608,1].
    %194 = torch.aten.unsqueeze %190, %int1_172 : !torch.vtensor<[28],f64>, !torch.int -> !torch.vtensor<[28,1],f64>
    %int2_173 = torch.constant.int 2
    %195 = torch.aten.unsqueeze %194, %int2_173 : !torch.vtensor<[28,1],f64>, !torch.int -> !torch.vtensor<[28,1,1],f64>
    %int1_174 = torch.constant.int 1
    %int2_175 = torch.constant.int 2
    %int0_176 = torch.constant.int 0
    %196 = torch.prim.ListConstruct %int1_174, %int2_175, %int0_176 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %197 = torch.aten.permute %195, %196 : !torch.vtensor<[28,1,1],f64>, !torch.list<int> -> !torch.vtensor<[1,1,28],f64>
    // Outer product of the selected column and the frequencies; the f16 x f64 product is typed f64.
    %198 = torch.aten.mul.Tensor %193, %197 : !torch.vtensor<[1,4608,1],f16>, !torch.vtensor<[1,1,28],f64> -> !torch.vtensor<[1,4608,28],f64>
    // Stack order is (cos, -sin, sin, cos). %202 and %203 recompute the same sin/cos of %198 as %200 and %199.
    %199 = torch.aten.cos %198 : !torch.vtensor<[1,4608,28],f64> -> !torch.vtensor<[1,4608,28],f64>
    %200 = torch.aten.sin %198 : !torch.vtensor<[1,4608,28],f64> -> !torch.vtensor<[1,4608,28],f64>
    %201 = torch.aten.neg %200 : !torch.vtensor<[1,4608,28],f64> -> !torch.vtensor<[1,4608,28],f64>
    %202 = torch.aten.sin %198 : !torch.vtensor<[1,4608,28],f64> -> !torch.vtensor<[1,4608,28],f64>
    %203 = torch.aten.cos %198 : !torch.vtensor<[1,4608,28],f64> -> !torch.vtensor<[1,4608,28],f64>
    %204 = torch.prim.ListConstruct %199, %201, %202, %203 : (!torch.vtensor<[1,4608,28],f64>, !torch.vtensor<[1,4608,28],f64>, !torch.vtensor<[1,4608,28],f64>, !torch.vtensor<[1,4608,28],f64>) -> !torch.list<vtensor>
    %int-1_177 = torch.constant.int -1
    // Stack on a new trailing dim -> [..., 4], then view the 4 entries as a 2x2 block.
    %205 = torch.aten.stack %204, %int-1_177 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,28,4],f64>
    %int1_178 = torch.constant.int 1
    %int4608_179 = torch.constant.int 4608
    %int28_180 = torch.constant.int 28
    %int2_181 = torch.constant.int 2
    %int2_182 = torch.constant.int 2
    %206 = torch.prim.ListConstruct %int1_178, %int4608_179, %int28_180, %int2_181, %int2_182 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %207 = torch.aten.view %205, %206 : !torch.vtensor<[1,4608,28,4],f64>, !torch.list<int> -> !torch.vtensor<[1,4608,28,2,2],f64>
    // dtype code 6 selects f32 (see the result type): the f64 table is narrowed to f32.
    %int6_183 = torch.constant.int 6
    %208 = torch.prims.convert_element_type %207, %int6_183 : !torch.vtensor<[1,4608,28,2,2],f64>, !torch.int -> !torch.vtensor<[1,4608,28,2,2],f32>
    // Join the three per-column tables along dim -3 (8 + 28 + 28 = 64 frequency slots),
    // then insert a singleton dim at index 1 -> [1,1,4608,64,2,2].
    // NOTE(review): the new dim presumably broadcasts over attention heads — confirm at the use site.
    %209 = torch.prim.ListConstruct %160, %184, %208 : (!torch.vtensor<[1,4608,8,2,2],f32>, !torch.vtensor<[1,4608,28,2,2],f32>, !torch.vtensor<[1,4608,28,2,2],f32>) -> !torch.list<vtensor>
    %int-3 = torch.constant.int -3
    %210 = torch.aten.cat %209, %int-3 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,64,2,2],f32>
    %int1_184 = torch.constant.int 1
    %211 = torch.aten.unsqueeze %210, %int1_184 : !torch.vtensor<[1,4608,64,2,2],f32>, !torch.int -> !torch.vtensor<[1,1,4608,64,2,2],f32>
    // `double_blocks.0.img_mod.lin`: SiLU(%119) -> linear [1,3072] -> [1,18432] (f32 compute,
    // f16 result) -> unsqueeze to [1,1,18432] -> six consecutive [1,1,3072] slices %227..%232.
    // Further down in this excerpt %228 is used as a scale (1 + %228) and %227 as an additive shift.
    // NOTE(review): %229..%232 are presumably the gate and a second shift/scale/gate set — confirm at their use sites.
    %212 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.double_blocks.0.img_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.0.img_mod.lin.weight : tensor<18432x3072xf16>
    %213 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.img_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_185 = torch.constant.int 0
    %int1_186 = torch.constant.int 1
    // The weight is stored as [out,in]; transpose to [3072,18432] for the matmul.
    %214 = torch.aten.transpose.int %213, %int0_185, %int1_186 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.0.img_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.0.img_mod.lin.bias : tensor<18432xf16>
    %215 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.img_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // dtype code 6 selects f32 (see the result types): bias, input and weight are upcast before the matmul.
    %int6_187 = torch.constant.int 6
    %216 = torch.prims.convert_element_type %215, %int6_187 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_188 = torch.constant.int 6
    %217 = torch.prims.convert_element_type %212, %int6_188 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_189 = torch.constant.int 6
    %218 = torch.prims.convert_element_type %214, %int6_189 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %219 = torch.aten.mm %217, %218 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Both multiplications by the constant 1 leave the values unchanged.
    %int1_190 = torch.constant.int 1
    %220 = torch.aten.mul.Scalar %219, %int1_190 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_191 = torch.constant.int 1
    %221 = torch.aten.mul.Scalar %216, %int1_191 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_192 = torch.constant.int 1
    %222 = torch.aten.add.Tensor %220, %221, %int1_192 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    // dtype code 5 selects f16 (see the result type).
    %int5_193 = torch.constant.int 5
    %223 = torch.prims.convert_element_type %222, %int5_193 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    %int0_194 = torch.constant.int 0
    %int0_195 = torch.constant.int 0
    %int9223372036854775807_196 = torch.constant.int 9223372036854775807
    %int1_197 = torch.constant.int 1
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): the shape is unchanged.
    %224 = torch.aten.slice.Tensor %223, %int0_194, %int0_195, %int9223372036854775807_196, %int1_197 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    %int1_198 = torch.constant.int 1
    // Insert a singleton dim at index 1 so the chunks broadcast against [1,N,3072] tensors.
    %225 = torch.aten.unsqueeze %224, %int1_198 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    %int2_199 = torch.constant.int 2
    %int0_200 = torch.constant.int 0
    %int9223372036854775807_201 = torch.constant.int 9223372036854775807
    %int1_202 = torch.constant.int 1
    // Full-range slice on dim 2: the shape is unchanged.
    %226 = torch.aten.slice.Tensor %225, %int2_199, %int0_200, %int9223372036854775807_201, %int1_202 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Chunk 0: elements [0, 3072) of the last dim.
    %int-1_203 = torch.constant.int -1
    %int0_204 = torch.constant.int 0
    %int3072_205 = torch.constant.int 3072
    %int1_206 = torch.constant.int 1
    %227 = torch.aten.slice.Tensor %226, %int-1_203, %int0_204, %int3072_205, %int1_206 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1: elements [3072, 6144).
    %int-1_207 = torch.constant.int -1
    %int3072_208 = torch.constant.int 3072
    %int6144 = torch.constant.int 6144
    %int1_209 = torch.constant.int 1
    %228 = torch.aten.slice.Tensor %226, %int-1_207, %int3072_208, %int6144, %int1_209 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2: elements [6144, 9216).
    %int-1_210 = torch.constant.int -1
    %int6144_211 = torch.constant.int 6144
    %int9216 = torch.constant.int 9216
    %int1_212 = torch.constant.int 1
    %229 = torch.aten.slice.Tensor %226, %int-1_210, %int6144_211, %int9216, %int1_212 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 3: elements [9216, 12288).
    %int-1_213 = torch.constant.int -1
    %int9216_214 = torch.constant.int 9216
    %int12288 = torch.constant.int 12288
    %int1_215 = torch.constant.int 1
    %230 = torch.aten.slice.Tensor %226, %int-1_213, %int9216_214, %int12288, %int1_215 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 4: elements [12288, 15360).
    %int-1_216 = torch.constant.int -1
    %int12288_217 = torch.constant.int 12288
    %int15360 = torch.constant.int 15360
    %int1_218 = torch.constant.int 1
    %231 = torch.aten.slice.Tensor %226, %int-1_216, %int12288_217, %int15360, %int1_218 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 5: elements [15360, 18432).
    %int-1_219 = torch.constant.int -1
    %int15360_220 = torch.constant.int 15360
    %int18432 = torch.constant.int 18432
    %int1_221 = torch.constant.int 1
    %232 = torch.aten.slice.Tensor %226, %int-1_219, %int15360_220, %int18432, %int1_221 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // `double_blocks.0.txt_mod.lin`: SiLU(%119) -> linear [1,3072] -> [1,18432] (f32 compute,
    // f16 result) -> unsqueeze to [1,1,18432] -> six consecutive [1,1,3072] slices %248..%253.
    // NOTE(review): presumably two shift/scale/gate triples for the text stream — confirm at their use sites,
    // which are outside this excerpt.
    // %233 recomputes the same SiLU of %119 that %212 already holds.
    %233 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.double_blocks.0.txt_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.0.txt_mod.lin.weight : tensor<18432x3072xf16>
    %234 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.txt_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_222 = torch.constant.int 0
    %int1_223 = torch.constant.int 1
    // The weight is stored as [out,in]; transpose to [3072,18432] for the matmul.
    %235 = torch.aten.transpose.int %234, %int0_222, %int1_223 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.0.txt_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.0.txt_mod.lin.bias : tensor<18432xf16>
    %236 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.txt_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // dtype code 6 selects f32 (see the result types): bias, input and weight are upcast before the matmul.
    %int6_224 = torch.constant.int 6
    %237 = torch.prims.convert_element_type %236, %int6_224 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_225 = torch.constant.int 6
    %238 = torch.prims.convert_element_type %233, %int6_225 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_226 = torch.constant.int 6
    %239 = torch.prims.convert_element_type %235, %int6_226 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %240 = torch.aten.mm %238, %239 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Both multiplications by the constant 1 leave the values unchanged.
    %int1_227 = torch.constant.int 1
    %241 = torch.aten.mul.Scalar %240, %int1_227 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_228 = torch.constant.int 1
    %242 = torch.aten.mul.Scalar %237, %int1_228 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_229 = torch.constant.int 1
    %243 = torch.aten.add.Tensor %241, %242, %int1_229 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    // dtype code 5 selects f16 (see the result type).
    %int5_230 = torch.constant.int 5
    %244 = torch.prims.convert_element_type %243, %int5_230 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    %int0_231 = torch.constant.int 0
    %int0_232 = torch.constant.int 0
    %int9223372036854775807_233 = torch.constant.int 9223372036854775807
    %int1_234 = torch.constant.int 1
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): the shape is unchanged.
    %245 = torch.aten.slice.Tensor %244, %int0_231, %int0_232, %int9223372036854775807_233, %int1_234 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    %int1_235 = torch.constant.int 1
    // Insert a singleton dim at index 1 so the chunks broadcast against [1,N,3072] tensors.
    %246 = torch.aten.unsqueeze %245, %int1_235 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    %int2_236 = torch.constant.int 2
    %int0_237 = torch.constant.int 0
    %int9223372036854775807_238 = torch.constant.int 9223372036854775807
    %int1_239 = torch.constant.int 1
    // Full-range slice on dim 2: the shape is unchanged.
    %247 = torch.aten.slice.Tensor %246, %int2_236, %int0_237, %int9223372036854775807_238, %int1_239 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Chunk 0: elements [0, 3072) of the last dim.
    %int-1_240 = torch.constant.int -1
    %int0_241 = torch.constant.int 0
    %int3072_242 = torch.constant.int 3072
    %int1_243 = torch.constant.int 1
    %248 = torch.aten.slice.Tensor %247, %int-1_240, %int0_241, %int3072_242, %int1_243 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1: elements [3072, 6144).
    %int-1_244 = torch.constant.int -1
    %int3072_245 = torch.constant.int 3072
    %int6144_246 = torch.constant.int 6144
    %int1_247 = torch.constant.int 1
    %249 = torch.aten.slice.Tensor %247, %int-1_244, %int3072_245, %int6144_246, %int1_247 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2: elements [6144, 9216).
    %int-1_248 = torch.constant.int -1
    %int6144_249 = torch.constant.int 6144
    %int9216_250 = torch.constant.int 9216
    %int1_251 = torch.constant.int 1
    %250 = torch.aten.slice.Tensor %247, %int-1_248, %int6144_249, %int9216_250, %int1_251 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 3: elements [9216, 12288).
    %int-1_252 = torch.constant.int -1
    %int9216_253 = torch.constant.int 9216
    %int12288_254 = torch.constant.int 12288
    %int1_255 = torch.constant.int 1
    %251 = torch.aten.slice.Tensor %247, %int-1_252, %int9216_253, %int12288_254, %int1_255 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 4: elements [12288, 15360).
    %int-1_256 = torch.constant.int -1
    %int12288_257 = torch.constant.int 12288
    %int15360_258 = torch.constant.int 15360
    %int1_259 = torch.constant.int 1
    %252 = torch.aten.slice.Tensor %247, %int-1_256, %int12288_257, %int15360_258, %int1_259 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 5: elements [15360, 18432).
    %int-1_260 = torch.constant.int -1
    %int15360_261 = torch.constant.int 15360
    %int18432_262 = torch.constant.int 18432
    %int1_263 = torch.constant.int 1
    %253 = torch.aten.slice.Tensor %247, %int-1_260, %int15360_261, %int18432_262, %int1_263 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %16 over its last dim without learned scale/bias, then modulate:
    //   normed = (x - mean) * rsqrt(var + eps)   (statistics in f32, result cast to f16)
    //   %263   = (1 + %228) * normed + %227
    // NOTE(review): %16 is defined above this excerpt; presumably the projected image tokens — confirm.
    %int6_264 = torch.constant.int 6
    %254 = torch.prims.convert_element_type %16, %int6_264 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %int2_265 = torch.constant.int 2
    %255 = torch.prim.ListConstruct %int2_265 : (!torch.int) -> !torch.list<int>
    %int0_266 = torch.constant.int 0
    %true = torch.constant.bool true
    // Reduce over dim 2 with correction = 0 and keepdim = true; %result0 is the variance, %result1 the mean.
    %result0, %result1 = torch.aten.var_mean.correction %254, %255, %int0_266, %true : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // Epsilon (~1e-6) added to the variance before the reciprocal square root.
    %float9.999990e-07 = torch.constant.float 9.9999999999999995E-7
    %int1_267 = torch.constant.int 1
    %256 = torch.aten.add.Scalar %result0, %float9.999990e-07, %int1_267 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %257 = torch.aten.rsqrt %256 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    %int1_268 = torch.constant.int 1
    // The subtraction takes the original f16 %16 (not the f32 copy %254); the result is typed f32.
    %258 = torch.aten.sub.Tensor %16, %result1, %int1_268 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %259 = torch.aten.mul.Tensor %258, %257 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // dtype code 5 selects f16 (see the result type).
    %int5_269 = torch.constant.int 5
    %260 = torch.prims.convert_element_type %259, %int5_269 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %int1_270 = torch.constant.int 1
    %int1_271 = torch.constant.int 1
    // 1 + %228 (the [3072, 6144) slice of the img_mod output), broadcast over the 4096 tokens.
    %261 = torch.aten.add.Scalar %228, %int1_270, %int1_271 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %262 = torch.aten.mul.Tensor %261, %260 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_272 = torch.constant.int 1
    // Add %227 (the [0, 3072) slice of the img_mod output).
    %263 = torch.aten.add.Tensor %262, %227, %int1_272 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // `double_blocks.0.img_attn.qkv` linear on the modulated tokens %263:
    //   [1,4096,3072] -> flatten to [4096,3072] -> x @ W^T + b in f32 -> cast to f16
    //   -> reshape to [1,4096,9216] (%278).
    %int4096_273 = torch.constant.int 4096
    %int3072_274 = torch.constant.int 3072
    %264 = torch.prim.ListConstruct %int4096_273, %int3072_274 : (!torch.int, !torch.int) -> !torch.list<int>
    %265 = torch.aten.view %263, %264 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.0.img_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.0.img_attn.qkv.weight : tensor<9216x3072xf16>
    %266 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.img_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_275 = torch.constant.int 0
    %int1_276 = torch.constant.int 1
    // The weight is stored as [out,in] = [9216,3072]; transpose to [3072,9216] for the matmul.
    %267 = torch.aten.transpose.int %266, %int0_275, %int1_276 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.0.img_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.0.img_attn.qkv.bias : tensor<9216xf16>
    %268 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.img_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // dtype code 6 selects f32 (see the result types): bias, input and weight are upcast before the matmul.
    %int6_277 = torch.constant.int 6
    %269 = torch.prims.convert_element_type %268, %int6_277 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_278 = torch.constant.int 6
    %270 = torch.prims.convert_element_type %265, %int6_278 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_279 = torch.constant.int 6
    %271 = torch.prims.convert_element_type %267, %int6_279 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %272 = torch.aten.mm %270, %271 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[4096,9216],f32>
    // Both multiplications by the constant 1 leave the values unchanged.
    %int1_280 = torch.constant.int 1
    %273 = torch.aten.mul.Scalar %272, %int1_280 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int1_281 = torch.constant.int 1
    %274 = torch.aten.mul.Scalar %269, %int1_281 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_282 = torch.constant.int 1
    // Bias add, broadcasting [9216] over the 4096 rows.
    %275 = torch.aten.add.Tensor %273, %274, %int1_282 : !torch.vtensor<[4096,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    // dtype code 5 selects f16 (see the result type).
    %int5_283 = torch.constant.int 5
    %276 = torch.prims.convert_element_type %275, %int5_283 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f16>
    %int1_284 = torch.constant.int 1
    %int4096_285 = torch.constant.int 4096
    %int9216_286 = torch.constant.int 9216
    %277 = torch.prim.ListConstruct %int1_284, %int4096_285, %int9216_286 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // Restore the leading batch dim.
    %278 = torch.aten.view %276, %277 : !torch.vtensor<[4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,9216],f16>
    // Split the fused projection %278: view [1,4096,9216] as [1,4096,3,24,128], permute with
    // (2,0,3,1,4) to [3,1,24,4096,128], then select indices 0, 1, 2 along dim 0.
    // NOTE(review): presumably 3 = (query, key, value), 24 = heads, 128 = head dim, so
    // %283/%284/%285 would be q/k/v — confirm against the model definition.
    %int1_287 = torch.constant.int 1
    %int4096_288 = torch.constant.int 4096
    %int3 = torch.constant.int 3
    %int24 = torch.constant.int 24
    %int128_289 = torch.constant.int 128
    %279 = torch.prim.ListConstruct %int1_287, %int4096_288, %int3, %int24, %int128_289 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %280 = torch.aten.view %278, %279 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_290 = torch.constant.int 2
    %int0_291 = torch.constant.int 0
    %int3_292 = torch.constant.int 3
    %int1_293 = torch.constant.int 1
    %int4 = torch.constant.int 4
    %281 = torch.prim.ListConstruct %int2_290, %int0_291, %int3_292, %int1_293, %int4 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // Move the size-3 dim to the front so each part can be taken with a single select.
    %282 = torch.aten.permute %280, %281 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    %int0_294 = torch.constant.int 0
    %int0_295 = torch.constant.int 0
    %283 = torch.aten.select.int %282, %int0_294, %int0_295 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int0_296 = torch.constant.int 0
    %int1_297 = torch.constant.int 1
    %284 = torch.aten.select.int %282, %int0_296, %int1_297 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int0_298 = torch.constant.int 0
    %int2_299 = torch.constant.int 2
    %285 = torch.aten.select.int %282, %int0_298, %int2_299 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // RMS-style normalization of %283 over its last axis (size 128), computed in f32:
    //   x = f32(%283); y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + ~1e-6)
    // The result is cast back to f16 and scaled element-wise by the [128] parameter
    // double_blocks.0.img_attn.norm.query_norm.scale, giving %295.
    %int6_300 = torch.constant.int 6
    %286 = torch.prims.convert_element_type %283, %int6_300 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_301 = torch.constant.int 2
    %287 = torch.aten.pow.Tensor_Scalar %286, %int2_301 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_302 = torch.constant.int -1
    %288 = torch.prim.ListConstruct %int-1_302 : (!torch.int) -> !torch.list<int>
    %true_303 = torch.constant.bool true
    %none_304 = torch.constant.none
    %289 = torch.aten.mean.dim %287, %288, %true_303, %none_304 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    // Epsilon added to the mean square before the reciprocal square root.
    %float9.999990e-07_305 = torch.constant.float 9.9999999999999995E-7
    %int1_306 = torch.constant.int 1
    %290 = torch.aten.add.Scalar %289, %float9.999990e-07_305, %int1_306 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %291 = torch.aten.rsqrt %290 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %292 = torch.aten.mul.Tensor %286, %291 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    %int5_307 = torch.constant.int 5
    %293 = torch.prims.convert_element_type %292, %int5_307 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.0.img_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.0.img_attn.norm.query_norm.scale : tensor<128xf16>
    %294 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.img_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %295 = torch.aten.mul.Tensor %293, %294 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // RMS-style normalization of %284 over its last axis (size 128), computed in f32:
    //   x = f32(%284); y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + ~1e-6)
    // The result is cast back to f16 and scaled element-wise by the [128] parameter
    // double_blocks.0.img_attn.norm.key_norm.scale, giving %305.
    %int6_308 = torch.constant.int 6
    %296 = torch.prims.convert_element_type %284, %int6_308 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_309 = torch.constant.int 2
    %297 = torch.aten.pow.Tensor_Scalar %296, %int2_309 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_310 = torch.constant.int -1
    %298 = torch.prim.ListConstruct %int-1_310 : (!torch.int) -> !torch.list<int>
    %true_311 = torch.constant.bool true
    %none_312 = torch.constant.none
    %299 = torch.aten.mean.dim %297, %298, %true_311, %none_312 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    // Epsilon added to the mean square before the reciprocal square root.
    %float9.999990e-07_313 = torch.constant.float 9.9999999999999995E-7
    %int1_314 = torch.constant.int 1
    %300 = torch.aten.add.Scalar %299, %float9.999990e-07_313, %int1_314 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %301 = torch.aten.rsqrt %300 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %302 = torch.aten.mul.Tensor %296, %301 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    %int5_315 = torch.constant.int 5
    %303 = torch.prims.convert_element_type %302, %int5_315 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.0.img_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.0.img_attn.norm.key_norm.scale : tensor<128xf16>
    %304 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.img_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %305 = torch.aten.mul.Tensor %303, %304 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // f16 -> f16 casts of the two normalized tensors (%295, %305). The source and
    // result element types are identical, so these do not change the data.
    // NOTE(review): presumably a leftover `.to(dtype)` from the exported model - confirm.
    %int5_316 = torch.constant.int 5
    %306 = torch.prims.convert_element_type %295, %int5_316 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_317 = torch.constant.int 5
    %307 = torch.prims.convert_element_type %305, %int5_317 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Normalize the [1,512,3072] tensor %134 over its last axis, then modulate it:
    //   var, mean = var_mean(f32(%134), dim=2, correction=0, keepdim=true)
    //   n    = f16((%134 - mean) * rsqrt(var + ~1e-6))
    //   %317 = (%249 + 1) * n + %248
    // %248 and %249 are [1,1,3072] f16 values defined outside this chunk.
    // NOTE(review): presumably the shift/scale outputs of a modulation layer - confirm.
    %int6_318 = torch.constant.int 6
    %308 = torch.prims.convert_element_type %134, %int6_318 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %int2_319 = torch.constant.int 2
    %309 = torch.prim.ListConstruct %int2_319 : (!torch.int) -> !torch.list<int>
    %int0_320 = torch.constant.int 0
    %true_321 = torch.constant.bool true
    // result0 = variance, result1 = mean; correction = 0, keepdim = true.
    %result0_322, %result1_323 = torch.aten.var_mean.correction %308, %309, %int0_320, %true_321 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    %float9.999990e-07_324 = torch.constant.float 9.9999999999999995E-7
    %int1_325 = torch.constant.int 1
    %310 = torch.aten.add.Scalar %result0_322, %float9.999990e-07_324, %int1_325 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %311 = torch.aten.rsqrt %310 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    %int1_326 = torch.constant.int 1
    // The subtraction takes the original f16 %134 (not the f32 copy %308); the
    // result is typed f32 because the mean operand is f32.
    %312 = torch.aten.sub.Tensor %134, %result1_323, %int1_326 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %313 = torch.aten.mul.Tensor %312, %311 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_327 = torch.constant.int 5
    %314 = torch.prims.convert_element_type %313, %int5_327 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // (1 + %249) * normalized + %248, all in f16 with broadcasting over the 512 rows.
    %int1_328 = torch.constant.int 1
    %int1_329 = torch.constant.int 1
    %315 = torch.aten.add.Scalar %249, %int1_328, %int1_329 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %316 = torch.aten.mul.Tensor %315, %314 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_330 = torch.constant.int 1
    %317 = torch.aten.add.Tensor %316, %248, %int1_330 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Linear layer double_blocks.0.txt_attn.qkv applied to %317:
    //   %330 = f16( f32(x) @ f32(W^T) + f32(b) ),  x: [512,3072], W: [9216,3072], b: [9216]
    // Inputs, weight and bias are upcast to f32 for the matmul and bias add, and the
    // result is cast back to f16.
    %int512_331 = torch.constant.int 512
    %int3072_332 = torch.constant.int 3072
    %318 = torch.prim.ListConstruct %int512_331, %int3072_332 : (!torch.int, !torch.int) -> !torch.list<int>
    %319 = torch.aten.view %317, %318 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.0.txt_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.0.txt_attn.qkv.weight : tensor<9216x3072xf16>
    %320 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.txt_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_333 = torch.constant.int 0
    %int1_334 = torch.constant.int 1
    %321 = torch.aten.transpose.int %320, %int0_333, %int1_334 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.0.txt_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.0.txt_attn.qkv.bias : tensor<9216xf16>
    %322 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.txt_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    %int6_335 = torch.constant.int 6
    %323 = torch.prims.convert_element_type %322, %int6_335 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_336 = torch.constant.int 6
    %324 = torch.prims.convert_element_type %319, %int6_336 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_337 = torch.constant.int 6
    %325 = torch.prims.convert_element_type %321, %int6_337 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %326 = torch.aten.mm %324, %325 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[512,9216],f32>
    // Both multiplications by the integer 1 leave the values unchanged.
    // NOTE(review): these look like alpha/beta = 1 from an addmm decomposition - confirm.
    %int1_338 = torch.constant.int 1
    %327 = torch.aten.mul.Scalar %326, %int1_338 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int1_339 = torch.constant.int 1
    %328 = torch.aten.mul.Scalar %323, %int1_339 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_340 = torch.constant.int 1
    %329 = torch.aten.add.Tensor %327, %328, %int1_340 : !torch.vtensor<[512,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int5_341 = torch.constant.int 5
    %330 = torch.prims.convert_element_type %329, %int5_341 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f16>
    // Split the fused [512,9216] f16 projection (%330) into three per-head tensors.
    // Steps: [512,9216] -> [1,512,9216] -> [1,512,3,24,128] (9216 = 3 * 24 * 128),
    // then permute (2,0,3,1,4) -> [3,1,24,512,128] so the leading axis indexes the
    // three slices, each shaped [1,24,512,128].
    %int1_342 = torch.constant.int 1
    %int512_343 = torch.constant.int 512
    %int9216_344 = torch.constant.int 9216
    %331 = torch.prim.ListConstruct %int1_342, %int512_343, %int9216_344 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %332 = torch.aten.view %330, %331 : !torch.vtensor<[512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,9216],f16>
    %int1_345 = torch.constant.int 1
    %int512_346 = torch.constant.int 512
    %int3_347 = torch.constant.int 3
    %int24_348 = torch.constant.int 24
    %int128_349 = torch.constant.int 128
    %333 = torch.prim.ListConstruct %int1_345, %int512_346, %int3_347, %int24_348, %int128_349 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %334 = torch.aten.view %332, %333 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_350 = torch.constant.int 2
    %int0_351 = torch.constant.int 0
    %int3_352 = torch.constant.int 3
    %int1_353 = torch.constant.int 1
    %int4_354 = torch.constant.int 4
    %335 = torch.prim.ListConstruct %int2_350, %int0_351, %int3_352, %int1_353, %int4_354 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %336 = torch.aten.permute %334, %335 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // Index 0 of the leading axis -> %337 (later scaled by txt_attn.norm.query_norm.scale).
    %int0_355 = torch.constant.int 0
    %int0_356 = torch.constant.int 0
    %337 = torch.aten.select.int %336, %int0_355, %int0_356 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Index 1 of the leading axis -> %338 (later scaled by txt_attn.norm.key_norm.scale).
    %int0_357 = torch.constant.int 0
    %int1_358 = torch.constant.int 1
    %338 = torch.aten.select.int %336, %int0_357, %int1_358 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Index 2 of the leading axis -> %339 (used un-normalized as the third attention operand).
    %int0_359 = torch.constant.int 0
    %int2_360 = torch.constant.int 2
    %339 = torch.aten.select.int %336, %int0_359, %int2_360 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // RMS-style normalization of %337 over its last axis (size 128), computed in f32:
    //   x = f32(%337); y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + ~1e-6)
    // The result is cast back to f16 and scaled element-wise by the [128] parameter
    // double_blocks.0.txt_attn.norm.query_norm.scale, giving %349.
    %int6_361 = torch.constant.int 6
    %340 = torch.prims.convert_element_type %337, %int6_361 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_362 = torch.constant.int 2
    %341 = torch.aten.pow.Tensor_Scalar %340, %int2_362 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_363 = torch.constant.int -1
    %342 = torch.prim.ListConstruct %int-1_363 : (!torch.int) -> !torch.list<int>
    %true_364 = torch.constant.bool true
    %none_365 = torch.constant.none
    %343 = torch.aten.mean.dim %341, %342, %true_364, %none_365 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    // Epsilon added to the mean square before the reciprocal square root.
    %float9.999990e-07_366 = torch.constant.float 9.9999999999999995E-7
    %int1_367 = torch.constant.int 1
    %344 = torch.aten.add.Scalar %343, %float9.999990e-07_366, %int1_367 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %345 = torch.aten.rsqrt %344 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %346 = torch.aten.mul.Tensor %340, %345 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_368 = torch.constant.int 5
    %347 = torch.prims.convert_element_type %346, %int5_368 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.0.txt_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.0.txt_attn.norm.query_norm.scale : tensor<128xf16>
    %348 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.txt_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %349 = torch.aten.mul.Tensor %347, %348 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // RMS-style normalization of %338 over its last axis (size 128), computed in f32:
    //   x = f32(%338); y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + ~1e-6)
    // The result is cast back to f16 and scaled element-wise by the [128] parameter
    // double_blocks.0.txt_attn.norm.key_norm.scale, giving %359.
    %int6_369 = torch.constant.int 6
    %350 = torch.prims.convert_element_type %338, %int6_369 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_370 = torch.constant.int 2
    %351 = torch.aten.pow.Tensor_Scalar %350, %int2_370 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_371 = torch.constant.int -1
    %352 = torch.prim.ListConstruct %int-1_371 : (!torch.int) -> !torch.list<int>
    %true_372 = torch.constant.bool true
    %none_373 = torch.constant.none
    %353 = torch.aten.mean.dim %351, %352, %true_372, %none_373 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    // Epsilon added to the mean square before the reciprocal square root.
    %float9.999990e-07_374 = torch.constant.float 9.9999999999999995E-7
    %int1_375 = torch.constant.int 1
    %354 = torch.aten.add.Scalar %353, %float9.999990e-07_374, %int1_375 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %355 = torch.aten.rsqrt %354 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %356 = torch.aten.mul.Tensor %350, %355 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_376 = torch.constant.int 5
    %357 = torch.prims.convert_element_type %356, %int5_376 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.0.txt_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.0.txt_attn.norm.key_norm.scale : tensor<128xf16>
    %358 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.txt_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %359 = torch.aten.mul.Tensor %357, %358 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // f16 -> f16 casts of the two normalized tensors (%349, %359). The source and
    // result element types are identical, so these do not change the data.
    // NOTE(review): presumably a leftover `.to(dtype)` from the exported model - confirm.
    %int5_377 = torch.constant.int 5
    %360 = torch.prims.convert_element_type %349, %int5_377 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int5_378 = torch.constant.int 5
    %361 = torch.prims.convert_element_type %359, %int5_378 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Concatenate the 512-row and 4096-row tensors along dim 2 into joint
    // [1,24,4608,128] tensors. In each pair the 512-row tensor comes first, so it
    // occupies rows 0..511 and the 4096-row tensor occupies rows 512..4607.
    //   %363 = cat(%360, %306)   - the query-normalized pair
    //   %365 = cat(%361, %307)   - the key-normalized pair
    //   %367 = cat(%339, %285)   - the un-normalized third slices
    %362 = torch.prim.ListConstruct %360, %306 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_379 = torch.constant.int 2
    %363 = torch.aten.cat %362, %int2_379 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %364 = torch.prim.ListConstruct %361, %307 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_380 = torch.constant.int 2
    %365 = torch.aten.cat %364, %int2_380 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %366 = torch.prim.ListConstruct %339, %285 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_381 = torch.constant.int 2
    %367 = torch.aten.cat %366, %int2_381 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Upcast the concatenated %363 and %365 to f32 and view each as
    // [1,24,4608,64,1,2]: the 128-wide last axis becomes 64 groups of 2 adjacent
    // elements (the -1 in the target shape resolves to 64), with a size-1 axis
    // inserted before the pair axis.
    %int6_382 = torch.constant.int 6
    %368 = torch.prims.convert_element_type %363, %int6_382 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_383 = torch.constant.int 1
    %int24_384 = torch.constant.int 24
    %int4608_385 = torch.constant.int 4608
    %int-1_386 = torch.constant.int -1
    %int1_387 = torch.constant.int 1
    %int2_388 = torch.constant.int 2
    %369 = torch.prim.ListConstruct %int1_383, %int24_384, %int4608_385, %int-1_386, %int1_387, %int2_388 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %370 = torch.aten.view %368, %369 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    %int6_389 = torch.constant.int 6
    %371 = torch.prims.convert_element_type %365, %int6_389 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_390 = torch.constant.int 1
    %int24_391 = torch.constant.int 24
    %int4608_392 = torch.constant.int 4608
    %int-1_393 = torch.constant.int -1
    %int1_394 = torch.constant.int 1
    %int2_395 = torch.constant.int 2
    %372 = torch.prim.ListConstruct %int1_390, %int24_391, %int4608_392, %int-1_393, %int1_394, %int2_395 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %373 = torch.aten.view %371, %372 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Apply the [1,1,4608,64,2,2] f32 table %211 to the pair views %370 and %373.
    // With x the pair view and T = %211, selecting along dim 5:
    //   out = T[..., 0] * x[..., 0] + T[..., 1] * x[..., 1]
    // The size-1 axes broadcast (24 on dim 1, 2 on dim 4), giving [1,24,4608,64,2],
    // which is flattened back to [1,24,4608,128] and cast to f16.
    // %211 is defined outside this chunk.
    // NOTE(review): presumably the rotary position-embedding 2x2 matrices - confirm.
    //
    // First operand: %370 -> %380.
    %int5_396 = torch.constant.int 5
    %int0_397 = torch.constant.int 0
    %374 = torch.aten.select.int %211, %int5_396, %int0_397 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_398 = torch.constant.int 5
    %int0_399 = torch.constant.int 0
    %375 = torch.aten.select.int %370, %int5_398, %int0_399 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %376 = torch.aten.mul.Tensor %374, %375 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_400 = torch.constant.int 5
    %int1_401 = torch.constant.int 1
    %377 = torch.aten.select.int %211, %int5_400, %int1_401 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_402 = torch.constant.int 5
    %int1_403 = torch.constant.int 1
    %378 = torch.aten.select.int %370, %int5_402, %int1_403 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %379 = torch.aten.mul.Tensor %377, %378 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_404 = torch.constant.int 1
    %380 = torch.aten.add.Tensor %376, %379, %int1_404 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second operand: %373 -> %387, using the same two slices of %211.
    %int5_405 = torch.constant.int 5
    %int0_406 = torch.constant.int 0
    %381 = torch.aten.select.int %211, %int5_405, %int0_406 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_407 = torch.constant.int 5
    %int0_408 = torch.constant.int 0
    %382 = torch.aten.select.int %373, %int5_407, %int0_408 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %383 = torch.aten.mul.Tensor %381, %382 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_409 = torch.constant.int 5
    %int1_410 = torch.constant.int 1
    %384 = torch.aten.select.int %211, %int5_409, %int1_410 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_411 = torch.constant.int 5
    %int1_412 = torch.constant.int 1
    %385 = torch.aten.select.int %373, %int5_411, %int1_412 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %386 = torch.aten.mul.Tensor %384, %385 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_413 = torch.constant.int 1
    %387 = torch.aten.add.Tensor %383, %386, %int1_413 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Flatten (64,2) back to 128 and return to f16: %380 -> %390.
    %int1_414 = torch.constant.int 1
    %int24_415 = torch.constant.int 24
    %int4608_416 = torch.constant.int 4608
    %int128_417 = torch.constant.int 128
    %388 = torch.prim.ListConstruct %int1_414, %int24_415, %int4608_416, %int128_417 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %389 = torch.aten.view %380, %388 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_418 = torch.constant.int 5
    %390 = torch.prims.convert_element_type %389, %int5_418 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Flatten (64,2) back to 128 and return to f16: %387 -> %393.
    %int1_419 = torch.constant.int 1
    %int24_420 = torch.constant.int 24
    %int4608_421 = torch.constant.int 4608
    %int128_422 = torch.constant.int 128
    %391 = torch.prim.ListConstruct %int1_419, %int24_420, %int4608_421, %int128_422 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %392 = torch.aten.view %387, %391 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_423 = torch.constant.int 5
    %393 = torch.prims.convert_element_type %392, %int5_423 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Joint attention over all 4608 rows with 24 heads of width 128, through the
    // generic `torch.operator` form of
    // aten._scaled_dot_product_flash_attention_for_cpu(%390, %393, %367, ...).
    // The remaining operands are 0.0, false, none, none.
    // NOTE(review): per the aten schema these should be dropout_p, is_causal,
    // attn_mask and scale (none -> default scaling) - confirm against the PyTorch
    // version used for export.
    // Result #0 is the [1,24,4608,128] f16 attention output; result #1 is a
    // [1,24,4608] f32 tensor that is not used in the lines visible here.
    %float0.000000e00 = torch.constant.float 0.000000e+00
    %false_424 = torch.constant.bool false
    %none_425 = torch.constant.none
    %none_426 = torch.constant.none
    %394:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%390, %393, %367, %float0.000000e00, %false_424, %none_425, %none_426) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Merge the heads of the attention output: permute (0,2,1,3) turns
    // [1,24,4608,128] into [1,4608,24,128], then the (24,128) axes are flattened
    // to 3072, giving %398 with shape [1,4608,3072].
    %int0_427 = torch.constant.int 0
    %int2_428 = torch.constant.int 2
    %int1_429 = torch.constant.int 1
    %int3_430 = torch.constant.int 3
    %395 = torch.prim.ListConstruct %int0_427, %int2_428, %int1_429, %int3_430 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %396 = torch.aten.permute %394#0, %395 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_431 = torch.constant.int 1
    %int4608_432 = torch.constant.int 4608
    %int3072_433 = torch.constant.int 3072
    %397 = torch.prim.ListConstruct %int1_431, %int4608_432, %int3072_433 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %398 = torch.aten.view %396, %397 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Split the joint attention output %398 back into its two row ranges along dim 1:
    //   %400 = rows [0, 512)    -> [1,512,3072]
    //   %402 = rows [512, end)  -> [1,4096,3072]
    // The dim-0 slices (%399, %401) cover the full range [0, INT64_MAX) with step 1,
    // so they leave the shape unchanged.
    %int0_434 = torch.constant.int 0
    %int0_435 = torch.constant.int 0
    %int9223372036854775807_436 = torch.constant.int 9223372036854775807
    %int1_437 = torch.constant.int 1
    %399 = torch.aten.slice.Tensor %398, %int0_434, %int0_435, %int9223372036854775807_436, %int1_437 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_438 = torch.constant.int 1
    %int0_439 = torch.constant.int 0
    %int512_440 = torch.constant.int 512
    %int1_441 = torch.constant.int 1
    %400 = torch.aten.slice.Tensor %399, %int1_438, %int0_439, %int512_440, %int1_441 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %int0_442 = torch.constant.int 0
    %int0_443 = torch.constant.int 0
    %int9223372036854775807_444 = torch.constant.int 9223372036854775807
    %int1_445 = torch.constant.int 1
    %401 = torch.aten.slice.Tensor %398, %int0_442, %int0_443, %int9223372036854775807_444, %int1_445 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_446 = torch.constant.int 1
    %int512_447 = torch.constant.int 512
    %int9223372036854775807_448 = torch.constant.int 9223372036854775807
    %int1_449 = torch.constant.int 1
    %402 = torch.aten.slice.Tensor %401, %int1_446, %int512_447, %int9223372036854775807_448, %int1_449 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer double_blocks.0.img_attn.proj applied to the 4096-row slice %402:
    //   %415 = f16( f32(x) @ f32(W^T) + f32(b) ),  x: [4096,3072], W: [3072,3072], b: [3072]
    // Inputs, weight and bias are upcast to f32 for the matmul and bias add; the
    // result is cast back to f16 and reshaped to [1,4096,3072] (%417).
    %int4096_450 = torch.constant.int 4096
    %int3072_451 = torch.constant.int 3072
    %403 = torch.prim.ListConstruct %int4096_450, %int3072_451 : (!torch.int, !torch.int) -> !torch.list<int>
    %404 = torch.aten.view %402, %403 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.0.img_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.0.img_attn.proj.weight : tensor<3072x3072xf16>
    %405 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.img_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_452 = torch.constant.int 0
    %int1_453 = torch.constant.int 1
    %406 = torch.aten.transpose.int %405, %int0_452, %int1_453 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.0.img_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.0.img_attn.proj.bias : tensor<3072xf16>
    %407 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.img_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_454 = torch.constant.int 6
    %408 = torch.prims.convert_element_type %407, %int6_454 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_455 = torch.constant.int 6
    %409 = torch.prims.convert_element_type %404, %int6_455 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_456 = torch.constant.int 6
    %410 = torch.prims.convert_element_type %406, %int6_456 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %411 = torch.aten.mm %409, %410 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Both multiplications by the integer 1 leave the values unchanged.
    // NOTE(review): these look like alpha/beta = 1 from an addmm decomposition - confirm.
    %int1_457 = torch.constant.int 1
    %412 = torch.aten.mul.Scalar %411, %int1_457 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_458 = torch.constant.int 1
    %413 = torch.aten.mul.Scalar %408, %int1_458 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_459 = torch.constant.int 1
    %414 = torch.aten.add.Tensor %412, %413, %int1_459 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_460 = torch.constant.int 5
    %415 = torch.prims.convert_element_type %414, %int5_460 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_461 = torch.constant.int 1
    %int4096_462 = torch.constant.int 4096
    %int3072_463 = torch.constant.int 3072
    %416 = torch.prim.ListConstruct %int1_461, %int4096_462, %int3072_463 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %417 = torch.aten.view %415, %416 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Gated residual update in f16: %419 = %16 + %229 * %417.
    // %229 is a [1,1,3072] tensor broadcast over the 4096 rows; %16 is the
    // [1,4096,3072] tensor being updated. Both are defined outside this chunk.
    // NOTE(review): presumably %229 is the attention gate from the modulation
    // layer and %16 the incoming hidden state of this stream - confirm.
    %418 = torch.aten.mul.Tensor %229, %417 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_464 = torch.constant.int 1
    %419 = torch.aten.add.Tensor %16, %418, %int1_464 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Normalize the residual output %419 over its last axis, then modulate it:
    //   var, mean = var_mean(f32(%419), dim=2, correction=0, keepdim=true)
    //   n    = f16((%419 - mean) * rsqrt(var + ~1e-6))
    //   %429 = (%231 + 1) * n + %230
    // %230 and %231 are [1,1,3072] f16 values defined outside this chunk.
    // NOTE(review): presumably the shift/scale outputs of a modulation layer - confirm.
    %int1_465 = torch.constant.int 1
    %int1_466 = torch.constant.int 1
    %420 = torch.aten.add.Scalar %231, %int1_465, %int1_466 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int6_467 = torch.constant.int 6
    %421 = torch.prims.convert_element_type %419, %int6_467 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %int2_468 = torch.constant.int 2
    %422 = torch.prim.ListConstruct %int2_468 : (!torch.int) -> !torch.list<int>
    %int0_469 = torch.constant.int 0
    %true_470 = torch.constant.bool true
    // result0 = variance, result1 = mean; correction = 0, keepdim = true.
    %result0_471, %result1_472 = torch.aten.var_mean.correction %421, %422, %int0_469, %true_470 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    %float9.999990e-07_473 = torch.constant.float 9.9999999999999995E-7
    %int1_474 = torch.constant.int 1
    %423 = torch.aten.add.Scalar %result0_471, %float9.999990e-07_473, %int1_474 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %424 = torch.aten.rsqrt %423 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    %int1_475 = torch.constant.int 1
    // The subtraction takes the original f16 %419 (not the f32 copy %421); the
    // result is typed f32 because the mean operand is f32.
    %425 = torch.aten.sub.Tensor %419, %result1_472, %int1_475 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %426 = torch.aten.mul.Tensor %425, %424 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    %int5_476 = torch.constant.int 5
    %427 = torch.prims.convert_element_type %426, %int5_476 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // (1 + %231) * normalized + %230, all in f16 with broadcasting over the 4096 rows.
    %428 = torch.aten.mul.Tensor %420, %427 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_477 = torch.constant.int 1
    %429 = torch.aten.add.Tensor %428, %230, %int1_477 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer double_blocks.0.img_mlp.0 applied to %429 (3072 -> 12288):
    //   %442 = f16( f32(x) @ f32(W^T) + f32(b) ),  x: [4096,3072], W: [12288,3072], b: [12288]
    // Inputs, weight and bias are upcast to f32 for the matmul and bias add; the
    // result is cast back to f16 and reshaped to [1,4096,12288] (%444).
    %int4096_478 = torch.constant.int 4096
    %int3072_479 = torch.constant.int 3072
    %430 = torch.prim.ListConstruct %int4096_478, %int3072_479 : (!torch.int, !torch.int) -> !torch.list<int>
    %431 = torch.aten.view %429, %430 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.0.img_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.0.img_mlp.0.weight : tensor<12288x3072xf16>
    %432 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.img_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_480 = torch.constant.int 0
    %int1_481 = torch.constant.int 1
    %433 = torch.aten.transpose.int %432, %int0_480, %int1_481 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.0.img_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.0.img_mlp.0.bias : tensor<12288xf16>
    %434 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.img_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    %int6_482 = torch.constant.int 6
    %435 = torch.prims.convert_element_type %434, %int6_482 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_483 = torch.constant.int 6
    %436 = torch.prims.convert_element_type %431, %int6_483 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_484 = torch.constant.int 6
    %437 = torch.prims.convert_element_type %433, %int6_484 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %438 = torch.aten.mm %436, %437 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[4096,12288],f32>
    // Both multiplications by the integer 1 leave the values unchanged.
    // NOTE(review): these look like alpha/beta = 1 from an addmm decomposition - confirm.
    %int1_485 = torch.constant.int 1
    %439 = torch.aten.mul.Scalar %438, %int1_485 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int1_486 = torch.constant.int 1
    %440 = torch.aten.mul.Scalar %435, %int1_486 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_487 = torch.constant.int 1
    %441 = torch.aten.add.Tensor %439, %440, %int1_487 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int5_488 = torch.constant.int 5
    %442 = torch.prims.convert_element_type %441, %int5_488 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f16>
    %int1_489 = torch.constant.int 1
    %int4096_490 = torch.constant.int 4096
    %int12288_491 = torch.constant.int 12288
    %443 = torch.prim.ListConstruct %int1_489, %int4096_490, %int12288_491 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %444 = torch.aten.view %442, %443 : !torch.vtensor<[4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,12288],f16>
    %str = torch.constant.str "tanh"
    %445 = torch.aten.gelu %444, %str : !torch.vtensor<[1,4096,12288],f16>, !torch.str -> !torch.vtensor<[1,4096,12288],f16>
    %int4096_492 = torch.constant.int 4096
    %int12288_493 = torch.constant.int 12288
    %446 = torch.prim.ListConstruct %int4096_492, %int12288_493 : (!torch.int, !torch.int) -> !torch.list<int>
    %447 = torch.aten.view %445, %446 : !torch.vtensor<[1,4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[4096,12288],f16>
    %__auto.sampler.double_blocks.0.img_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.0.img_mlp.2.weight : tensor<3072x12288xf16>
    %448 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.img_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_494 = torch.constant.int 0
    %int1_495 = torch.constant.int 1
    %449 = torch.aten.transpose.int %448, %int0_494, %int1_495 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.0.img_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.0.img_mlp.2.bias : tensor<3072xf16>
    %450 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.img_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_496 = torch.constant.int 6
    %451 = torch.prims.convert_element_type %450, %int6_496 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_497 = torch.constant.int 6
    %452 = torch.prims.convert_element_type %447, %int6_497 : !torch.vtensor<[4096,12288],f16>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int6_498 = torch.constant.int 6
    %453 = torch.prims.convert_element_type %449, %int6_498 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %454 = torch.aten.mm %452, %453 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    %int1_499 = torch.constant.int 1
    %455 = torch.aten.mul.Scalar %454, %int1_499 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_500 = torch.constant.int 1
    %456 = torch.aten.mul.Scalar %451, %int1_500 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_501 = torch.constant.int 1
    %457 = torch.aten.add.Tensor %455, %456, %int1_501 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_502 = torch.constant.int 5
    %458 = torch.prims.convert_element_type %457, %int5_502 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_503 = torch.constant.int 1
    %int4096_504 = torch.constant.int 4096
    %int3072_505 = torch.constant.int 3072
    %459 = torch.prim.ListConstruct %int1_503, %int4096_504, %int3072_505 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %460 = torch.aten.view %458, %459 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    %461 = torch.aten.mul.Tensor %232, %460 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_506 = torch.constant.int 1
    %462 = torch.aten.add.Tensor %419, %461, %int1_506 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // ---- double_blocks.0.txt_attn.proj: linear (3072 -> 3072) on %400, then scaled residual add ----
    // %400 ([1,512,3072], defined above this view) is flattened to [512,3072] for the 2-D mm.
    // NOTE(review): %400 is presumably the txt slice of the joint attention output — confirm.
    %int512_507 = torch.constant.int 512
    %int3072_508 = torch.constant.int 3072
    %463 = torch.prim.ListConstruct %int512_507, %int3072_508 : (!torch.int, !torch.int) -> !torch.list<int>
    %464 = torch.aten.view %400, %463 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Transpose the weight, convert bias/input/weight to f32 (dtype code 6), mm, add bias,
    // convert the result back to f16 (dtype code 5).
    %__auto.sampler.double_blocks.0.txt_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.0.txt_attn.proj.weight : tensor<3072x3072xf16>
    %465 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.txt_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_509 = torch.constant.int 0
    %int1_510 = torch.constant.int 1
    %466 = torch.aten.transpose.int %465, %int0_509, %int1_510 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.0.txt_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.0.txt_attn.proj.bias : tensor<3072xf16>
    %467 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.txt_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_511 = torch.constant.int 6
    %468 = torch.prims.convert_element_type %467, %int6_511 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_512 = torch.constant.int 6
    %469 = torch.prims.convert_element_type %464, %int6_512 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_513 = torch.constant.int 6
    %470 = torch.prims.convert_element_type %466, %int6_513 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %471 = torch.aten.mm %469, %470 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[512,3072],f32>
    %int1_514 = torch.constant.int 1
    %472 = torch.aten.mul.Scalar %471, %int1_514 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_515 = torch.constant.int 1
    %473 = torch.aten.mul.Scalar %468, %int1_515 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_516 = torch.constant.int 1
    %474 = torch.aten.add.Tensor %472, %473, %int1_516 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_517 = torch.constant.int 5
    %475 = torch.prims.convert_element_type %474, %int5_517 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_518 = torch.constant.int 1
    %int512_519 = torch.constant.int 512
    %int3072_520 = torch.constant.int 3072
    %476 = torch.prim.ListConstruct %int1_518, %int512_519, %int3072_520 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %477 = torch.aten.view %475, %476 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Scaled residual add: %479 = %134 + %250 * %477 (%250 is [1,1,3072], broadcast over 512 rows).
    // NOTE(review): %134 is presumably the txt stream entering the block and %250 its attention gate — confirm.
    %478 = torch.aten.mul.Tensor %250, %477 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_521 = torch.constant.int 1
    %479 = torch.aten.add.Tensor %134, %478, %int1_521 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // ---- Normalize %479 over its last dimension, then apply an elementwise scale and shift ----
    // %480 = %252 + 1: the multiplier applied to the normalized tensor below (%252 is defined above this view).
    %int1_522 = torch.constant.int 1
    %int1_523 = torch.constant.int 1
    %480 = torch.aten.add.Scalar %252, %int1_522, %int1_523 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Variance and mean are computed in f32 over dim 2 with correction = 0 and keepdim = true,
    // giving [1,512,1] statistics.
    %int6_524 = torch.constant.int 6
    %481 = torch.prims.convert_element_type %479, %int6_524 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %int2_525 = torch.constant.int 2
    %482 = torch.prim.ListConstruct %int2_525 : (!torch.int) -> !torch.list<int>
    %int0_526 = torch.constant.int 0
    %true_527 = torch.constant.bool true
    %result0_528, %result1_529 = torch.aten.var_mean.correction %481, %482, %int0_526, %true_527 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(var + eps) with eps = 9.9999999999999995E-7.
    %float9.999990e-07_530 = torch.constant.float 9.9999999999999995E-7
    %int1_531 = torch.constant.int 1
    %483 = torch.aten.add.Scalar %result0_528, %float9.999990e-07_530, %int1_531 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %484 = torch.aten.rsqrt %483 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // The subtraction takes the f16 tensor %479 (not the f32 copy %481) and the f32 mean;
    // the declared result type is f32.
    %int1_532 = torch.constant.int 1
    %485 = torch.aten.sub.Tensor %479, %result1_529, %int1_532 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %486 = torch.aten.mul.Tensor %485, %484 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_533 = torch.constant.int 5
    %487 = torch.prims.convert_element_type %486, %int5_533 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // %489 = (1 + %252) * normalized + %251, in f16 (%251 is defined above this view).
    // NOTE(review): %252 / %251 are presumably the txt MLP scale / shift modulation vectors — confirm.
    %488 = torch.aten.mul.Tensor %480, %487 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_534 = torch.constant.int 1
    %489 = torch.aten.add.Tensor %488, %251, %int1_534 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // ---- double_blocks.0.txt_mlp: linear (3072 -> 12288), tanh-GELU, linear (12288 -> 3072) ----
    // Flatten the modulated tensor %489 to [512,3072] for the 2-D mm.
    %int512_535 = torch.constant.int 512
    %int3072_536 = torch.constant.int 3072
    %490 = torch.prim.ListConstruct %int512_535, %int3072_536 : (!torch.int, !torch.int) -> !torch.list<int>
    %491 = torch.aten.view %489, %490 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // First linear layer: transpose the weight, convert bias/input/weight to f32 (dtype code 6),
    // mm, add bias, convert the result back to f16 (dtype code 5).
    %__auto.sampler.double_blocks.0.txt_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.0.txt_mlp.0.weight : tensor<12288x3072xf16>
    %492 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.txt_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_537 = torch.constant.int 0
    %int1_538 = torch.constant.int 1
    %493 = torch.aten.transpose.int %492, %int0_537, %int1_538 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.0.txt_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.0.txt_mlp.0.bias : tensor<12288xf16>
    %494 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.txt_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    %int6_539 = torch.constant.int 6
    %495 = torch.prims.convert_element_type %494, %int6_539 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_540 = torch.constant.int 6
    %496 = torch.prims.convert_element_type %491, %int6_540 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_541 = torch.constant.int 6
    %497 = torch.prims.convert_element_type %493, %int6_541 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %498 = torch.aten.mm %496, %497 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[512,12288],f32>
    %int1_542 = torch.constant.int 1
    %499 = torch.aten.mul.Scalar %498, %int1_542 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int1_543 = torch.constant.int 1
    %500 = torch.aten.mul.Scalar %495, %int1_543 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_544 = torch.constant.int 1
    %501 = torch.aten.add.Tensor %499, %500, %int1_544 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int5_545 = torch.constant.int 5
    %502 = torch.prims.convert_element_type %501, %int5_545 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f16>
    %int1_546 = torch.constant.int 1
    %int512_547 = torch.constant.int 512
    %int12288_548 = torch.constant.int 12288
    %503 = torch.prim.ListConstruct %int1_546, %int512_547, %int12288_548 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %504 = torch.aten.view %502, %503 : !torch.vtensor<[512,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,512,12288],f16>
    // GELU with the "tanh" approximation mode, applied in f16, then flattened back to 2-D.
    %str_549 = torch.constant.str "tanh"
    %505 = torch.aten.gelu %504, %str_549 : !torch.vtensor<[1,512,12288],f16>, !torch.str -> !torch.vtensor<[1,512,12288],f16>
    %int512_550 = torch.constant.int 512
    %int12288_551 = torch.constant.int 12288
    %506 = torch.prim.ListConstruct %int512_550, %int12288_551 : (!torch.int, !torch.int) -> !torch.list<int>
    %507 = torch.aten.view %505, %506 : !torch.vtensor<[1,512,12288],f16>, !torch.list<int> -> !torch.vtensor<[512,12288],f16>
    // Second linear layer (12288 -> 3072), same f32-compute / f16-result pattern.
    %__auto.sampler.double_blocks.0.txt_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.0.txt_mlp.2.weight : tensor<3072x12288xf16>
    %508 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.txt_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_552 = torch.constant.int 0
    %int1_553 = torch.constant.int 1
    %509 = torch.aten.transpose.int %508, %int0_552, %int1_553 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.0.txt_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.0.txt_mlp.2.bias : tensor<3072xf16>
    %510 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.0.txt_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_554 = torch.constant.int 6
    %511 = torch.prims.convert_element_type %510, %int6_554 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_555 = torch.constant.int 6
    %512 = torch.prims.convert_element_type %507, %int6_555 : !torch.vtensor<[512,12288],f16>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int6_556 = torch.constant.int 6
    %513 = torch.prims.convert_element_type %509, %int6_556 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %514 = torch.aten.mm %512, %513 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[512,3072],f32>
    %int1_557 = torch.constant.int 1
    %515 = torch.aten.mul.Scalar %514, %int1_557 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_558 = torch.constant.int 1
    %516 = torch.aten.mul.Scalar %511, %int1_558 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_559 = torch.constant.int 1
    %517 = torch.aten.add.Tensor %515, %516, %int1_559 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_560 = torch.constant.int 5
    %518 = torch.prims.convert_element_type %517, %int5_560 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_561 = torch.constant.int 1
    %int512_562 = torch.constant.int 512
    %int3072_563 = torch.constant.int 3072
    %519 = torch.prim.ListConstruct %int1_561, %int512_562, %int3072_563 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %520 = torch.aten.view %518, %519 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Scaled residual add: %522 = %479 + %253 * %520 (%253 is [1,1,3072], defined above this view).
    // NOTE(review): %253 is presumably the txt MLP gate; %522 would then be the txt output of double_blocks.0 — confirm.
    %521 = torch.aten.mul.Tensor %253, %520 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_564 = torch.constant.int 1
    %522 = torch.aten.add.Tensor %479, %521, %int1_564 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // ---- double_blocks.1.img_mod: SiLU(%119) -> linear (3072 -> 18432) -> six 3072-wide slices ----
    // %119 ([1,3072]) is defined above this view.
    // NOTE(review): presumably the combined conditioning vector (time / guidance / pooled text) — confirm.
    %523 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Linear layer: transpose the weight, convert bias/input/weight to f32 (dtype code 6),
    // mm, add bias, convert the result back to f16 (dtype code 5).
    %__auto.sampler.double_blocks.1.img_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.1.img_mod.lin.weight : tensor<18432x3072xf16>
    %524 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.img_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_565 = torch.constant.int 0
    %int1_566 = torch.constant.int 1
    %525 = torch.aten.transpose.int %524, %int0_565, %int1_566 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.1.img_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.1.img_mod.lin.bias : tensor<18432xf16>
    %526 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.img_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    %int6_567 = torch.constant.int 6
    %527 = torch.prims.convert_element_type %526, %int6_567 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_568 = torch.constant.int 6
    %528 = torch.prims.convert_element_type %523, %int6_568 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_569 = torch.constant.int 6
    %529 = torch.prims.convert_element_type %525, %int6_569 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %530 = torch.aten.mm %528, %529 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    %int1_570 = torch.constant.int 1
    %531 = torch.aten.mul.Scalar %530, %int1_570 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_571 = torch.constant.int 1
    %532 = torch.aten.mul.Scalar %527, %int1_571 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_572 = torch.constant.int 1
    %533 = torch.aten.add.Tensor %531, %532, %int1_572 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_573 = torch.constant.int 5
    %534 = torch.prims.convert_element_type %533, %int5_573 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice over dim 0 (start 0, end INT64_MAX, step 1; shape unchanged), then insert
    // a unit dimension at index 1 and take a full-range slice over dim 2: result is [1,1,18432].
    %int0_574 = torch.constant.int 0
    %int0_575 = torch.constant.int 0
    %int9223372036854775807_576 = torch.constant.int 9223372036854775807
    %int1_577 = torch.constant.int 1
    %535 = torch.aten.slice.Tensor %534, %int0_574, %int0_575, %int9223372036854775807_576, %int1_577 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    %int1_578 = torch.constant.int 1
    %536 = torch.aten.unsqueeze %535, %int1_578 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    %int2_579 = torch.constant.int 2
    %int0_580 = torch.constant.int 0
    %int9223372036854775807_581 = torch.constant.int 9223372036854775807
    %int1_582 = torch.constant.int 1
    %537 = torch.aten.slice.Tensor %536, %int2_579, %int0_580, %int9223372036854775807_581, %int1_582 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Six consecutive [1,1,3072] slices along the last dimension (dim -1).
    // %538 = [0, 3072): added as the shift when %462 is normalized and modulated further down.
    %int-1_583 = torch.constant.int -1
    %int0_584 = torch.constant.int 0
    %int3072_585 = torch.constant.int 3072
    %int1_586 = torch.constant.int 1
    %538 = torch.aten.slice.Tensor %537, %int-1_583, %int0_584, %int3072_585, %int1_586 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %539 = [3072, 6144): used as the scale, applied as (1 + %539), in that same modulation.
    %int-1_587 = torch.constant.int -1
    %int3072_588 = torch.constant.int 3072
    %int6144_589 = torch.constant.int 6144
    %int1_590 = torch.constant.int 1
    %539 = torch.aten.slice.Tensor %537, %int-1_587, %int3072_588, %int6144_589, %int1_590 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %540..%543 are consumed outside this view.
    // NOTE(review): presumably attention gate, MLP shift, MLP scale and MLP gate, in that order — confirm.
    %int-1_591 = torch.constant.int -1
    %int6144_592 = torch.constant.int 6144
    %int9216_593 = torch.constant.int 9216
    %int1_594 = torch.constant.int 1
    %540 = torch.aten.slice.Tensor %537, %int-1_591, %int6144_592, %int9216_593, %int1_594 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_595 = torch.constant.int -1
    %int9216_596 = torch.constant.int 9216
    %int12288_597 = torch.constant.int 12288
    %int1_598 = torch.constant.int 1
    %541 = torch.aten.slice.Tensor %537, %int-1_595, %int9216_596, %int12288_597, %int1_598 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_599 = torch.constant.int -1
    %int12288_600 = torch.constant.int 12288
    %int15360_601 = torch.constant.int 15360
    %int1_602 = torch.constant.int 1
    %542 = torch.aten.slice.Tensor %537, %int-1_599, %int12288_600, %int15360_601, %int1_602 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_603 = torch.constant.int -1
    %int15360_604 = torch.constant.int 15360
    %int18432_605 = torch.constant.int 18432
    %int1_606 = torch.constant.int 1
    %543 = torch.aten.slice.Tensor %537, %int-1_603, %int15360_604, %int18432_605, %int1_606 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // ---- double_blocks.1.txt_mod: SiLU(%119) -> linear (3072 -> 18432) -> six 3072-wide slices ----
    // Same op sequence as the img_mod stage, but with the txt_mod weights. SiLU of %119 is
    // recomputed here (%544) rather than reusing the earlier result.
    %544 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Linear layer: transpose the weight, convert bias/input/weight to f32 (dtype code 6),
    // mm, add bias, convert the result back to f16 (dtype code 5).
    %__auto.sampler.double_blocks.1.txt_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.1.txt_mod.lin.weight : tensor<18432x3072xf16>
    %545 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.txt_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_607 = torch.constant.int 0
    %int1_608 = torch.constant.int 1
    %546 = torch.aten.transpose.int %545, %int0_607, %int1_608 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.1.txt_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.1.txt_mod.lin.bias : tensor<18432xf16>
    %547 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.txt_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    %int6_609 = torch.constant.int 6
    %548 = torch.prims.convert_element_type %547, %int6_609 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_610 = torch.constant.int 6
    %549 = torch.prims.convert_element_type %544, %int6_610 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_611 = torch.constant.int 6
    %550 = torch.prims.convert_element_type %546, %int6_611 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %551 = torch.aten.mm %549, %550 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    %int1_612 = torch.constant.int 1
    %552 = torch.aten.mul.Scalar %551, %int1_612 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_613 = torch.constant.int 1
    %553 = torch.aten.mul.Scalar %548, %int1_613 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_614 = torch.constant.int 1
    %554 = torch.aten.add.Tensor %552, %553, %int1_614 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_615 = torch.constant.int 5
    %555 = torch.prims.convert_element_type %554, %int5_615 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice over dim 0, unsqueeze at index 1, full-range slice over dim 2: [1,1,18432].
    %int0_616 = torch.constant.int 0
    %int0_617 = torch.constant.int 0
    %int9223372036854775807_618 = torch.constant.int 9223372036854775807
    %int1_619 = torch.constant.int 1
    %556 = torch.aten.slice.Tensor %555, %int0_616, %int0_617, %int9223372036854775807_618, %int1_619 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    %int1_620 = torch.constant.int 1
    %557 = torch.aten.unsqueeze %556, %int1_620 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    %int2_621 = torch.constant.int 2
    %int0_622 = torch.constant.int 0
    %int9223372036854775807_623 = torch.constant.int 9223372036854775807
    %int1_624 = torch.constant.int 1
    %558 = torch.aten.slice.Tensor %557, %int2_621, %int0_622, %int9223372036854775807_623, %int1_624 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Six consecutive [1,1,3072] slices along dim -1: %559 = [0,3072), %560 = [3072,6144),
    // %561 = [6144,9216), %562 = [9216,12288), %563 = [12288,15360), %564 = [15360,18432).
    // All six are consumed outside this view.
    // NOTE(review): presumably shift/scale/gate for attention followed by shift/scale/gate for the MLP — confirm.
    %int-1_625 = torch.constant.int -1
    %int0_626 = torch.constant.int 0
    %int3072_627 = torch.constant.int 3072
    %int1_628 = torch.constant.int 1
    %559 = torch.aten.slice.Tensor %558, %int-1_625, %int0_626, %int3072_627, %int1_628 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_629 = torch.constant.int -1
    %int3072_630 = torch.constant.int 3072
    %int6144_631 = torch.constant.int 6144
    %int1_632 = torch.constant.int 1
    %560 = torch.aten.slice.Tensor %558, %int-1_629, %int3072_630, %int6144_631, %int1_632 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_633 = torch.constant.int -1
    %int6144_634 = torch.constant.int 6144
    %int9216_635 = torch.constant.int 9216
    %int1_636 = torch.constant.int 1
    %561 = torch.aten.slice.Tensor %558, %int-1_633, %int6144_634, %int9216_635, %int1_636 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_637 = torch.constant.int -1
    %int9216_638 = torch.constant.int 9216
    %int12288_639 = torch.constant.int 12288
    %int1_640 = torch.constant.int 1
    %562 = torch.aten.slice.Tensor %558, %int-1_637, %int9216_638, %int12288_639, %int1_640 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_641 = torch.constant.int -1
    %int12288_642 = torch.constant.int 12288
    %int15360_643 = torch.constant.int 15360
    %int1_644 = torch.constant.int 1
    %563 = torch.aten.slice.Tensor %558, %int-1_641, %int12288_642, %int15360_643, %int1_644 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_645 = torch.constant.int -1
    %int15360_646 = torch.constant.int 15360
    %int18432_647 = torch.constant.int 18432
    %int1_648 = torch.constant.int 1
    %564 = torch.aten.slice.Tensor %558, %int-1_645, %int15360_646, %int18432_647, %int1_648 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // ---- Normalize %462 over its last dimension, then apply the img_mod scale and shift ----
    // Variance and mean are computed in f32 over dim 2 with correction = 0 and keepdim = true,
    // giving [1,4096,1] statistics.
    %int6_649 = torch.constant.int 6
    %565 = torch.prims.convert_element_type %462, %int6_649 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %int2_650 = torch.constant.int 2
    %566 = torch.prim.ListConstruct %int2_650 : (!torch.int) -> !torch.list<int>
    %int0_651 = torch.constant.int 0
    %true_652 = torch.constant.bool true
    %result0_653, %result1_654 = torch.aten.var_mean.correction %565, %566, %int0_651, %true_652 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(var + eps) with eps = 9.9999999999999995E-7.
    %float9.999990e-07_655 = torch.constant.float 9.9999999999999995E-7
    %int1_656 = torch.constant.int 1
    %567 = torch.aten.add.Scalar %result0_653, %float9.999990e-07_655, %int1_656 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %568 = torch.aten.rsqrt %567 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // The subtraction takes the f16 tensor %462 (not the f32 copy %565) and the f32 mean;
    // the declared result type is f32.
    %int1_657 = torch.constant.int 1
    %569 = torch.aten.sub.Tensor %462, %result1_654, %int1_657 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %570 = torch.aten.mul.Tensor %569, %568 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    %int5_658 = torch.constant.int 5
    %571 = torch.prims.convert_element_type %570, %int5_658 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // %574 = (1 + %539) * normalized + %538, in f16. %539 and %538 are the [3072,6144) and
    // [0,3072) slices of the double_blocks.1 img_mod output.
    %int1_659 = torch.constant.int 1
    %int1_660 = torch.constant.int 1
    %572 = torch.aten.add.Scalar %539, %int1_659, %int1_660 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %573 = torch.aten.mul.Tensor %572, %571 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_661 = torch.constant.int 1
    %574 = torch.aten.add.Tensor %573, %538, %int1_661 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %int4096_662 = torch.constant.int 4096
    %int3072_663 = torch.constant.int 3072
    %575 = torch.prim.ListConstruct %int4096_662, %int3072_663 : (!torch.int, !torch.int) -> !torch.list<int>
    %576 = torch.aten.view %574, %575 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.1.img_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.1.img_attn.qkv.weight : tensor<9216x3072xf16>
    %577 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.img_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_664 = torch.constant.int 0
    %int1_665 = torch.constant.int 1
    // Image-stream fused projection using double_blocks.1.img_attn.qkv.bias:
    // the [9216,3072] f16 weight %577 is transposed, activations %576, weight
    // and bias are upcast to f32, mm + bias is computed in f32, and the result
    // is cast back to f16 and viewed as [1,4096,9216].
    // NOTE(review): %576 and %577 are defined earlier in the function; %577 is
    // presumably the matching img_attn.qkv weight - confirm.
    %578 = torch.aten.transpose.int %577, %int0_664, %int1_665 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.1.img_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.1.img_attn.qkv.bias : tensor<9216xf16>
    %579 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.img_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Dtype code 6 yields f32 results below; code 5 yields f16 (see result types).
    %int6_666 = torch.constant.int 6
    %580 = torch.prims.convert_element_type %579, %int6_666 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_667 = torch.constant.int 6
    %581 = torch.prims.convert_element_type %576, %int6_667 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_668 = torch.constant.int 6
    %582 = torch.prims.convert_element_type %578, %int6_668 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %583 = torch.aten.mm %581, %582 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[4096,9216],f32>
    // Product and bias are each multiplied by the integer constant 1
    // (presumably the alpha/beta factors of a decomposed addmm - confirm).
    %int1_669 = torch.constant.int 1
    %584 = torch.aten.mul.Scalar %583, %int1_669 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int1_670 = torch.constant.int 1
    %585 = torch.aten.mul.Scalar %580, %int1_670 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_671 = torch.constant.int 1
    %586 = torch.aten.add.Tensor %584, %585, %int1_671 : !torch.vtensor<[4096,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int5_672 = torch.constant.int 5
    %587 = torch.prims.convert_element_type %586, %int5_672 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f16>
    // Restore the leading unit dimension: [4096,9216] -> [1,4096,9216].
    %int1_673 = torch.constant.int 1
    %int4096_674 = torch.constant.int 4096
    %int9216_675 = torch.constant.int 9216
    %588 = torch.prim.ListConstruct %int1_673, %int4096_674, %int9216_675 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %589 = torch.aten.view %587, %588 : !torch.vtensor<[4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,9216],f16>
    // Split the fused image projection %589 into three [1,24,4096,128] tensors:
    // view [1,4096,9216] as [1,4096,3,24,128], permute with order (2,0,3,1,4)
    // to [3,1,24,4096,128], then select indices 0, 1 and 2 along dim 0.
    // Downstream, %594 is normalized with query_norm.scale, %595 with
    // key_norm.scale, and %596 is used un-normalized as the third attention input.
    %int1_676 = torch.constant.int 1
    %int4096_677 = torch.constant.int 4096
    %int3_678 = torch.constant.int 3
    %int24_679 = torch.constant.int 24
    %int128_680 = torch.constant.int 128
    %590 = torch.prim.ListConstruct %int1_676, %int4096_677, %int3_678, %int24_679, %int128_680 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %591 = torch.aten.view %589, %590 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_681 = torch.constant.int 2
    %int0_682 = torch.constant.int 0
    %int3_683 = torch.constant.int 3
    %int1_684 = torch.constant.int 1
    %int4_685 = torch.constant.int 4
    %592 = torch.prim.ListConstruct %int2_681, %int0_682, %int3_683, %int1_684, %int4_685 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %593 = torch.aten.permute %591, %592 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // Index 0 along dim 0 (query).
    %int0_686 = torch.constant.int 0
    %int0_687 = torch.constant.int 0
    %594 = torch.aten.select.int %593, %int0_686, %int0_687 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Index 1 along dim 0 (key).
    %int0_688 = torch.constant.int 0
    %int1_689 = torch.constant.int 1
    %595 = torch.aten.select.int %593, %int0_688, %int1_689 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Index 2 along dim 0 (value).
    %int0_690 = torch.constant.int 0
    %int2_691 = torch.constant.int 2
    %596 = torch.aten.select.int %593, %int0_690, %int2_691 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // RMS normalization of the image query (%594) and key (%595) over the last
    // dimension (size 128). For each: upcast to f32, compute
    // x * rsqrt(mean(x^2, dim=-1, keepdim=true) + ~1e-6), cast back to f16, then
    // multiply by the learned [128] scale (query_norm.scale / key_norm.scale).
    // --- query ---
    %int6_692 = torch.constant.int 6
    %597 = torch.prims.convert_element_type %594, %int6_692 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_693 = torch.constant.int 2
    %598 = torch.aten.pow.Tensor_Scalar %597, %int2_693 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_694 = torch.constant.int -1
    %599 = torch.prim.ListConstruct %int-1_694 : (!torch.int) -> !torch.list<int>
    %true_695 = torch.constant.bool true
    %none_696 = torch.constant.none
    %600 = torch.aten.mean.dim %598, %599, %true_695, %none_696 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    // Epsilon added to the mean square before rsqrt.
    %float9.999990e-07_697 = torch.constant.float 9.9999999999999995E-7
    %int1_698 = torch.constant.int 1
    %601 = torch.aten.add.Scalar %600, %float9.999990e-07_697, %int1_698 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %602 = torch.aten.rsqrt %601 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %603 = torch.aten.mul.Tensor %597, %602 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    %int5_699 = torch.constant.int 5
    %604 = torch.prims.convert_element_type %603, %int5_699 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.1.img_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.1.img_attn.norm.query_norm.scale : tensor<128xf16>
    %605 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.img_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %606 = torch.aten.mul.Tensor %604, %605 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // --- key ---
    %int6_700 = torch.constant.int 6
    %607 = torch.prims.convert_element_type %595, %int6_700 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_701 = torch.constant.int 2
    %608 = torch.aten.pow.Tensor_Scalar %607, %int2_701 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_702 = torch.constant.int -1
    %609 = torch.prim.ListConstruct %int-1_702 : (!torch.int) -> !torch.list<int>
    %true_703 = torch.constant.bool true
    %none_704 = torch.constant.none
    %610 = torch.aten.mean.dim %608, %609, %true_703, %none_704 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_705 = torch.constant.float 9.9999999999999995E-7
    %int1_706 = torch.constant.int 1
    %611 = torch.aten.add.Scalar %610, %float9.999990e-07_705, %int1_706 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %612 = torch.aten.rsqrt %611 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %613 = torch.aten.mul.Tensor %607, %612 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    %int5_707 = torch.constant.int 5
    %614 = torch.prims.convert_element_type %613, %int5_707 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.1.img_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.1.img_attn.norm.key_norm.scale : tensor<128xf16>
    %615 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.img_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %616 = torch.aten.mul.Tensor %614, %615 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // f16 -> f16 conversions: input and result element types are identical, so
    // these two ops do not change the data type.
    %int5_708 = torch.constant.int 5
    %617 = torch.prims.convert_element_type %606, %int5_708 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_709 = torch.constant.int 5
    %618 = torch.prims.convert_element_type %616, %int5_709 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Layer normalization (no learned affine) of %522 over dim 2 followed by a
    // modulation: (1 + %560) * norm(%522) + %559, then flattened to [512,3072].
    // Variance/mean are computed in f32 with correction 0 and keepdim=true.
    // NOTE(review): %522 is presumably the text-stream hidden state and
    // %560 / %559 the txt modulation scale / shift - they are defined earlier
    // in the function; confirm.
    %int6_710 = torch.constant.int 6
    %619 = torch.prims.convert_element_type %522, %int6_710 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %int2_711 = torch.constant.int 2
    %620 = torch.prim.ListConstruct %int2_711 : (!torch.int) -> !torch.list<int>
    %int0_712 = torch.constant.int 0
    %true_713 = torch.constant.bool true
    // result0 is the variance, result1 the mean (both [1,512,1]).
    %result0_714, %result1_715 = torch.aten.var_mean.correction %619, %620, %int0_712, %true_713 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    %float9.999990e-07_716 = torch.constant.float 9.9999999999999995E-7
    %int1_717 = torch.constant.int 1
    %621 = torch.aten.add.Scalar %result0_714, %float9.999990e-07_716, %int1_717 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %622 = torch.aten.rsqrt %621 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    %int1_718 = torch.constant.int 1
    // The subtraction takes the f16 input %522 and the f32 mean; the result is f32.
    %623 = torch.aten.sub.Tensor %522, %result1_715, %int1_718 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %624 = torch.aten.mul.Tensor %623, %622 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_719 = torch.constant.int 5
    %625 = torch.prims.convert_element_type %624, %int5_719 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Modulation: (%560 + 1) * normalized + %559, broadcasting [1,1,3072] over 512 rows.
    %int1_720 = torch.constant.int 1
    %int1_721 = torch.constant.int 1
    %626 = torch.aten.add.Scalar %560, %int1_720, %int1_721 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %627 = torch.aten.mul.Tensor %626, %625 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_722 = torch.constant.int 1
    %628 = torch.aten.add.Tensor %627, %559, %int1_722 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Drop the leading unit dimension so the result can feed a 2-D mm.
    %int512_723 = torch.constant.int 512
    %int3072_724 = torch.constant.int 3072
    %629 = torch.prim.ListConstruct %int512_723, %int3072_724 : (!torch.int, !torch.int) -> !torch.list<int>
    %630 = torch.aten.view %628, %629 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Text-stream fused QKV projection (double_blocks.1.txt_attn.qkv):
    // out = %630 @ weight^T + bias. Weight, bias and activations are upcast to
    // f32 for the mm and the add; the result is cast back to f16 and viewed as
    // [1,512,9216].
    %__auto.sampler.double_blocks.1.txt_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.1.txt_attn.qkv.weight : tensor<9216x3072xf16>
    %631 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.txt_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    // Transpose the [9216,3072] weight to [3072,9216] for the mm.
    %int0_725 = torch.constant.int 0
    %int1_726 = torch.constant.int 1
    %632 = torch.aten.transpose.int %631, %int0_725, %int1_726 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.1.txt_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.1.txt_attn.qkv.bias : tensor<9216xf16>
    %633 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.txt_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    %int6_727 = torch.constant.int 6
    %634 = torch.prims.convert_element_type %633, %int6_727 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_728 = torch.constant.int 6
    %635 = torch.prims.convert_element_type %630, %int6_728 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_729 = torch.constant.int 6
    %636 = torch.prims.convert_element_type %632, %int6_729 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %637 = torch.aten.mm %635, %636 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[512,9216],f32>
    // Product and bias are each multiplied by the integer constant 1
    // (presumably the alpha/beta factors of a decomposed addmm - confirm).
    %int1_730 = torch.constant.int 1
    %638 = torch.aten.mul.Scalar %637, %int1_730 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int1_731 = torch.constant.int 1
    %639 = torch.aten.mul.Scalar %634, %int1_731 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_732 = torch.constant.int 1
    %640 = torch.aten.add.Tensor %638, %639, %int1_732 : !torch.vtensor<[512,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int5_733 = torch.constant.int 5
    %641 = torch.prims.convert_element_type %640, %int5_733 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f16>
    // Restore the leading unit dimension: [512,9216] -> [1,512,9216].
    %int1_734 = torch.constant.int 1
    %int512_735 = torch.constant.int 512
    %int9216_736 = torch.constant.int 9216
    %642 = torch.prim.ListConstruct %int1_734, %int512_735, %int9216_736 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %643 = torch.aten.view %641, %642 : !torch.vtensor<[512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,9216],f16>
    // Split the fused text projection %643 into three [1,24,512,128] tensors:
    // view [1,512,9216] as [1,512,3,24,128], permute with order (2,0,3,1,4) to
    // [3,1,24,512,128], then select indices 0, 1 and 2 along dim 0.
    // Downstream, %648 is normalized with query_norm.scale, %649 with
    // key_norm.scale, and %650 is used un-normalized as the third attention input.
    %int1_737 = torch.constant.int 1
    %int512_738 = torch.constant.int 512
    %int3_739 = torch.constant.int 3
    %int24_740 = torch.constant.int 24
    %int128_741 = torch.constant.int 128
    %644 = torch.prim.ListConstruct %int1_737, %int512_738, %int3_739, %int24_740, %int128_741 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %645 = torch.aten.view %643, %644 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_742 = torch.constant.int 2
    %int0_743 = torch.constant.int 0
    %int3_744 = torch.constant.int 3
    %int1_745 = torch.constant.int 1
    %int4_746 = torch.constant.int 4
    %646 = torch.prim.ListConstruct %int2_742, %int0_743, %int3_744, %int1_745, %int4_746 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %647 = torch.aten.permute %645, %646 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // Index 0 along dim 0 (query).
    %int0_747 = torch.constant.int 0
    %int0_748 = torch.constant.int 0
    %648 = torch.aten.select.int %647, %int0_747, %int0_748 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Index 1 along dim 0 (key).
    %int0_749 = torch.constant.int 0
    %int1_750 = torch.constant.int 1
    %649 = torch.aten.select.int %647, %int0_749, %int1_750 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Index 2 along dim 0 (value).
    %int0_751 = torch.constant.int 0
    %int2_752 = torch.constant.int 2
    %650 = torch.aten.select.int %647, %int0_751, %int2_752 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // RMS normalization of the text query (%648) and key (%649) over the last
    // dimension (size 128). For each: upcast to f32, compute
    // x * rsqrt(mean(x^2, dim=-1, keepdim=true) + ~1e-6), cast back to f16, then
    // multiply by the learned [128] scale (query_norm.scale / key_norm.scale).
    // --- query ---
    %int6_753 = torch.constant.int 6
    %651 = torch.prims.convert_element_type %648, %int6_753 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_754 = torch.constant.int 2
    %652 = torch.aten.pow.Tensor_Scalar %651, %int2_754 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_755 = torch.constant.int -1
    %653 = torch.prim.ListConstruct %int-1_755 : (!torch.int) -> !torch.list<int>
    %true_756 = torch.constant.bool true
    %none_757 = torch.constant.none
    %654 = torch.aten.mean.dim %652, %653, %true_756, %none_757 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    // Epsilon added to the mean square before rsqrt.
    %float9.999990e-07_758 = torch.constant.float 9.9999999999999995E-7
    %int1_759 = torch.constant.int 1
    %655 = torch.aten.add.Scalar %654, %float9.999990e-07_758, %int1_759 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %656 = torch.aten.rsqrt %655 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %657 = torch.aten.mul.Tensor %651, %656 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_760 = torch.constant.int 5
    %658 = torch.prims.convert_element_type %657, %int5_760 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.1.txt_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.1.txt_attn.norm.query_norm.scale : tensor<128xf16>
    %659 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.txt_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %660 = torch.aten.mul.Tensor %658, %659 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // --- key ---
    %int6_761 = torch.constant.int 6
    %661 = torch.prims.convert_element_type %649, %int6_761 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_762 = torch.constant.int 2
    %662 = torch.aten.pow.Tensor_Scalar %661, %int2_762 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_763 = torch.constant.int -1
    %663 = torch.prim.ListConstruct %int-1_763 : (!torch.int) -> !torch.list<int>
    %true_764 = torch.constant.bool true
    %none_765 = torch.constant.none
    %664 = torch.aten.mean.dim %662, %663, %true_764, %none_765 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_766 = torch.constant.float 9.9999999999999995E-7
    %int1_767 = torch.constant.int 1
    %665 = torch.aten.add.Scalar %664, %float9.999990e-07_766, %int1_767 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %666 = torch.aten.rsqrt %665 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %667 = torch.aten.mul.Tensor %661, %666 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_768 = torch.constant.int 5
    %668 = torch.prims.convert_element_type %667, %int5_768 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.1.txt_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.1.txt_attn.norm.key_norm.scale : tensor<128xf16>
    %669 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.txt_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %670 = torch.aten.mul.Tensor %668, %669 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // f16 -> f16 conversions: input and result element types are identical, so
    // these two ops do not change the data type.
    %int5_769 = torch.constant.int 5
    %671 = torch.prims.convert_element_type %660, %int5_769 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int5_770 = torch.constant.int 5
    %672 = torch.prims.convert_element_type %670, %int5_770 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Concatenate text and image tensors along dim 2, text first:
    // 512 text positions followed by 4096 image positions -> 4608 total.
    // The same ordering is used for queries, keys and values.
    // Queries: normalized txt q (%671) ++ normalized img q (%617).
    %673 = torch.prim.ListConstruct %671, %617 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_771 = torch.constant.int 2
    %674 = torch.aten.cat %673, %int2_771 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Keys: normalized txt k (%672) ++ normalized img k (%618).
    %675 = torch.prim.ListConstruct %672, %618 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_772 = torch.constant.int 2
    %676 = torch.aten.cat %675, %int2_772 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Values: txt v (%650) ++ img v (%596); neither was normalized.
    %677 = torch.prim.ListConstruct %650, %596 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_773 = torch.constant.int 2
    %678 = torch.aten.cat %677, %int2_773 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Apply the [1,1,4608,64,2,2] f32 table %211 to the concatenated queries
    // (%674) and keys (%676). Each tensor is upcast to f32 and viewed as
    // [1,24,4608,64,1,2]; the output is
    //   %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]
    // with the table broadcast over dim 1 (size 24). The result is reshaped back
    // to [1,24,4608,128] and cast to f16.
    // NOTE(review): %211 is defined earlier in the function; presumably the
    // rotary position embedding (per-position 2x2 rotation matrices) - confirm.
    // View queries as pairs: [1,24,4608,128] -> [1,24,4608,64,1,2].
    %int6_774 = torch.constant.int 6
    %679 = torch.prims.convert_element_type %674, %int6_774 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_775 = torch.constant.int 1
    %int24_776 = torch.constant.int 24
    %int4608_777 = torch.constant.int 4608
    %int-1_778 = torch.constant.int -1
    %int1_779 = torch.constant.int 1
    %int2_780 = torch.constant.int 2
    %680 = torch.prim.ListConstruct %int1_775, %int24_776, %int4608_777, %int-1_778, %int1_779, %int2_780 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %681 = torch.aten.view %679, %680 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // View keys as pairs: [1,24,4608,128] -> [1,24,4608,64,1,2].
    %int6_781 = torch.constant.int 6
    %682 = torch.prims.convert_element_type %676, %int6_781 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_782 = torch.constant.int 1
    %int24_783 = torch.constant.int 24
    %int4608_784 = torch.constant.int 4608
    %int-1_785 = torch.constant.int -1
    %int1_786 = torch.constant.int 1
    %int2_787 = torch.constant.int 2
    %683 = torch.prim.ListConstruct %int1_782, %int24_783, %int4608_784, %int-1_785, %int1_786, %int2_787 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %684 = torch.aten.view %682, %683 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Queries, first term: %211[..., 0] * q[..., 0].
    %int5_788 = torch.constant.int 5
    %int0_789 = torch.constant.int 0
    %685 = torch.aten.select.int %211, %int5_788, %int0_789 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_790 = torch.constant.int 5
    %int0_791 = torch.constant.int 0
    %686 = torch.aten.select.int %681, %int5_790, %int0_791 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %687 = torch.aten.mul.Tensor %685, %686 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Queries, second term: %211[..., 1] * q[..., 1].
    %int5_792 = torch.constant.int 5
    %int1_793 = torch.constant.int 1
    %688 = torch.aten.select.int %211, %int5_792, %int1_793 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_794 = torch.constant.int 5
    %int1_795 = torch.constant.int 1
    %689 = torch.aten.select.int %681, %int5_794, %int1_795 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %690 = torch.aten.mul.Tensor %688, %689 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_796 = torch.constant.int 1
    %691 = torch.aten.add.Tensor %687, %690, %int1_796 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Keys, first term: %211[..., 0] * k[..., 0].
    %int5_797 = torch.constant.int 5
    %int0_798 = torch.constant.int 0
    %692 = torch.aten.select.int %211, %int5_797, %int0_798 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_799 = torch.constant.int 5
    %int0_800 = torch.constant.int 0
    %693 = torch.aten.select.int %684, %int5_799, %int0_800 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %694 = torch.aten.mul.Tensor %692, %693 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Keys, second term: %211[..., 1] * k[..., 1].
    %int5_801 = torch.constant.int 5
    %int1_802 = torch.constant.int 1
    %695 = torch.aten.select.int %211, %int5_801, %int1_802 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_803 = torch.constant.int 5
    %int1_804 = torch.constant.int 1
    %696 = torch.aten.select.int %684, %int5_803, %int1_804 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %697 = torch.aten.mul.Tensor %695, %696 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_805 = torch.constant.int 1
    %698 = torch.aten.add.Tensor %694, %697, %int1_805 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Merge the (64,2) pairs back into 128 and cast queries to f16.
    %int1_806 = torch.constant.int 1
    %int24_807 = torch.constant.int 24
    %int4608_808 = torch.constant.int 4608
    %int128_809 = torch.constant.int 128
    %699 = torch.prim.ListConstruct %int1_806, %int24_807, %int4608_808, %int128_809 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %700 = torch.aten.view %691, %699 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_810 = torch.constant.int 5
    %701 = torch.prims.convert_element_type %700, %int5_810 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Same for keys.
    %int1_811 = torch.constant.int 1
    %int24_812 = torch.constant.int 24
    %int4608_813 = torch.constant.int 4608
    %int128_814 = torch.constant.int 128
    %702 = torch.prim.ListConstruct %int1_811, %int24_812, %int4608_813, %int128_814 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %703 = torch.aten.view %698, %702 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_815 = torch.constant.int 5
    %704 = torch.prims.convert_element_type %703, %int5_815 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Joint attention over the 4608 concatenated positions, followed by a head
    // merge and a split back into the first 512 and remaining 4096 positions.
    // Inputs: %701, %704 (queries / keys after the %211 rotation) and %678 (values).
    // The scalar operands are 0.0, false, none, none.
    // NOTE(review): per the ATen signature these are presumably dropout_p,
    // is_causal, attn_mask and scale - confirm against the op schema.
    %float0.000000e00_816 = torch.constant.float 0.000000e+00
    %false_817 = torch.constant.bool false
    %none_818 = torch.constant.none
    %none_819 = torch.constant.none
    // Two results are produced; only result #0 ([1,24,4608,128]) is used in the ops that follow.
    %705:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%701, %704, %678, %float0.000000e00_816, %false_817, %none_818, %none_819) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2: [1,24,4608,128] -> [1,4608,24,128].
    %int0_820 = torch.constant.int 0
    %int2_821 = torch.constant.int 2
    %int1_822 = torch.constant.int 1
    %int3_823 = torch.constant.int 3
    %706 = torch.prim.ListConstruct %int0_820, %int2_821, %int1_822, %int3_823 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %707 = torch.aten.permute %705#0, %706 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the last two dims: 24 * 128 = 3072.
    %int1_824 = torch.constant.int 1
    %int4608_825 = torch.constant.int 4608
    %int3072_826 = torch.constant.int 3072
    %708 = torch.prim.ListConstruct %int1_824, %int4608_825, %int3072_826 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %709 = torch.aten.view %707, %708 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Positions [0, 512) along dim 1: the text part (text was concatenated first).
    // The preceding dim-0 slice spans 0..INT64_MAX with step 1, i.e. the full dim.
    %int0_827 = torch.constant.int 0
    %int0_828 = torch.constant.int 0
    %int9223372036854775807_829 = torch.constant.int 9223372036854775807
    %int1_830 = torch.constant.int 1
    %710 = torch.aten.slice.Tensor %709, %int0_827, %int0_828, %int9223372036854775807_829, %int1_830 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_831 = torch.constant.int 1
    %int0_832 = torch.constant.int 0
    %int512_833 = torch.constant.int 512
    %int1_834 = torch.constant.int 1
    %711 = torch.aten.slice.Tensor %710, %int1_831, %int0_832, %int512_833, %int1_834 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Positions [512, end) along dim 1: the image part (4096 positions).
    %int0_835 = torch.constant.int 0
    %int0_836 = torch.constant.int 0
    %int9223372036854775807_837 = torch.constant.int 9223372036854775807
    %int1_838 = torch.constant.int 1
    %712 = torch.aten.slice.Tensor %709, %int0_835, %int0_836, %int9223372036854775807_837, %int1_838 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_839 = torch.constant.int 1
    %int512_840 = torch.constant.int 512
    %int9223372036854775807_841 = torch.constant.int 9223372036854775807
    %int1_842 = torch.constant.int 1
    %713 = torch.aten.slice.Tensor %712, %int1_839, %int512_840, %int9223372036854775807_841, %int1_842 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Image-stream attention output projection (double_blocks.1.img_attn.proj)
    // followed by a gated residual add: %730 = %462 + %540 * proj(%713).
    // The linear is computed in f32 (mm + bias) and cast back to f16.
    // NOTE(review): %462 and %540 are defined earlier in the function;
    // presumably the image-stream residual input and the attention gate from
    // the image modulation - confirm.
    // Flatten [1,4096,3072] -> [4096,3072] for the 2-D mm.
    %int4096_843 = torch.constant.int 4096
    %int3072_844 = torch.constant.int 3072
    %714 = torch.prim.ListConstruct %int4096_843, %int3072_844 : (!torch.int, !torch.int) -> !torch.list<int>
    %715 = torch.aten.view %713, %714 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.1.img_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.1.img_attn.proj.weight : tensor<3072x3072xf16>
    %716 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.img_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_845 = torch.constant.int 0
    %int1_846 = torch.constant.int 1
    %717 = torch.aten.transpose.int %716, %int0_845, %int1_846 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.1.img_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.1.img_attn.proj.bias : tensor<3072xf16>
    %718 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.img_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, activations and transposed weight to f32.
    %int6_847 = torch.constant.int 6
    %719 = torch.prims.convert_element_type %718, %int6_847 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_848 = torch.constant.int 6
    %720 = torch.prims.convert_element_type %715, %int6_848 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_849 = torch.constant.int 6
    %721 = torch.prims.convert_element_type %717, %int6_849 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %722 = torch.aten.mm %720, %721 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Product and bias are each multiplied by the integer constant 1
    // (presumably the alpha/beta factors of a decomposed addmm - confirm).
    %int1_850 = torch.constant.int 1
    %723 = torch.aten.mul.Scalar %722, %int1_850 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_851 = torch.constant.int 1
    %724 = torch.aten.mul.Scalar %719, %int1_851 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_852 = torch.constant.int 1
    %725 = torch.aten.add.Tensor %723, %724, %int1_852 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_853 = torch.constant.int 5
    %726 = torch.prims.convert_element_type %725, %int5_853 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_854 = torch.constant.int 1
    %int4096_855 = torch.constant.int 4096
    %int3072_856 = torch.constant.int 3072
    %727 = torch.prim.ListConstruct %int1_854, %int4096_855, %int3072_856 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %728 = torch.aten.view %726, %727 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Scale by the [1,1,3072] tensor %540 (broadcast over 4096 rows) and add to %462.
    %729 = torch.aten.mul.Tensor %540, %728 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_857 = torch.constant.int 1
    %730 = torch.aten.add.Tensor %462, %729, %int1_857 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %int1_858 = torch.constant.int 1
    %int1_859 = torch.constant.int 1
    %731 = torch.aten.add.Scalar %542, %int1_858, %int1_859 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int6_860 = torch.constant.int 6
    %732 = torch.prims.convert_element_type %730, %int6_860 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %int2_861 = torch.constant.int 2
    %733 = torch.prim.ListConstruct %int2_861 : (!torch.int) -> !torch.list<int>
    %int0_862 = torch.constant.int 0
    %true_863 = torch.constant.bool true
    %result0_864, %result1_865 = torch.aten.var_mean.correction %732, %733, %int0_862, %true_863 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    %float9.999990e-07_866 = torch.constant.float 9.9999999999999995E-7
    %int1_867 = torch.constant.int 1
    %734 = torch.aten.add.Scalar %result0_864, %float9.999990e-07_866, %int1_867 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %735 = torch.aten.rsqrt %734 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    %int1_868 = torch.constant.int 1
    %736 = torch.aten.sub.Tensor %730, %result1_865, %int1_868 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %737 = torch.aten.mul.Tensor %736, %735 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    %int5_869 = torch.constant.int 5
    %738 = torch.prims.convert_element_type %737, %int5_869 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %739 = torch.aten.mul.Tensor %731, %738 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_870 = torch.constant.int 1
    %740 = torch.aten.add.Tensor %739, %541, %int1_870 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // double_blocks.1 img_mlp: linear (3072 -> 12288), GELU with "tanh" approximation,
    // linear (12288 -> 3072), then %773 = %730 + %543 * mlp_output.
    // Each linear is expressed as: cast operands to f32, mm with the transposed weight,
    // add bias, cast back to f16.
    // Flatten %740 from [1,4096,3072] to [4096,3072] for the 2-D matmul.
    %int4096_871 = torch.constant.int 4096
    %int3072_872 = torch.constant.int 3072
    %741 = torch.prim.ListConstruct %int4096_871, %int3072_872 : (!torch.int, !torch.int) -> !torch.list<int>
    %742 = torch.aten.view %740, %741 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // First layer weight is stored as [12288,3072]; transpose to [3072,12288] for x @ W^T.
    %__auto.sampler.double_blocks.1.img_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.1.img_mlp.0.weight : tensor<12288x3072xf16>
    %743 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.img_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_873 = torch.constant.int 0
    %int1_874 = torch.constant.int 1
    %744 = torch.aten.transpose.int %743, %int0_873, %int1_874 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.1.img_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.1.img_mlp.0.bias : tensor<12288xf16>
    %745 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.img_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Bias, activation and weight are all converted to f32 before the matmul.
    %int6_875 = torch.constant.int 6
    %746 = torch.prims.convert_element_type %745, %int6_875 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_876 = torch.constant.int 6
    %747 = torch.prims.convert_element_type %742, %int6_876 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_877 = torch.constant.int 6
    %748 = torch.prims.convert_element_type %744, %int6_877 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %749 = torch.aten.mm %747, %748 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[4096,12288],f32>
    // The two mul.Scalar ops multiply by the constant 1, so they leave the values unchanged
    // (NOTE(review): presumably the alpha/beta factors of a decomposed addmm - confirm).
    %int1_878 = torch.constant.int 1
    %750 = torch.aten.mul.Scalar %749, %int1_878 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int1_879 = torch.constant.int 1
    %751 = torch.aten.mul.Scalar %746, %int1_879 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_880 = torch.constant.int 1
    %752 = torch.aten.add.Tensor %750, %751, %int1_880 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    // Cast back to f16 and restore the leading batch dimension.
    %int5_881 = torch.constant.int 5
    %753 = torch.prims.convert_element_type %752, %int5_881 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f16>
    %int1_882 = torch.constant.int 1
    %int4096_883 = torch.constant.int 4096
    %int12288_884 = torch.constant.int 12288
    %754 = torch.prim.ListConstruct %int1_882, %int4096_883, %int12288_884 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %755 = torch.aten.view %753, %754 : !torch.vtensor<[4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,12288],f16>
    // GELU with the "tanh" approximation, applied in f16.
    %str_885 = torch.constant.str "tanh"
    %756 = torch.aten.gelu %755, %str_885 : !torch.vtensor<[1,4096,12288],f16>, !torch.str -> !torch.vtensor<[1,4096,12288],f16>
    // Flatten again to [4096,12288] for the second linear layer.
    %int4096_886 = torch.constant.int 4096
    %int12288_887 = torch.constant.int 12288
    %757 = torch.prim.ListConstruct %int4096_886, %int12288_887 : (!torch.int, !torch.int) -> !torch.list<int>
    %758 = torch.aten.view %756, %757 : !torch.vtensor<[1,4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[4096,12288],f16>
    // Second layer weight is stored as [3072,12288]; transpose to [12288,3072].
    %__auto.sampler.double_blocks.1.img_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.1.img_mlp.2.weight : tensor<3072x12288xf16>
    %759 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.img_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_888 = torch.constant.int 0
    %int1_889 = torch.constant.int 1
    %760 = torch.aten.transpose.int %759, %int0_888, %int1_889 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.1.img_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.1.img_mlp.2.bias : tensor<3072xf16>
    %761 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.img_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_890 = torch.constant.int 6
    %762 = torch.prims.convert_element_type %761, %int6_890 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_891 = torch.constant.int 6
    %763 = torch.prims.convert_element_type %758, %int6_891 : !torch.vtensor<[4096,12288],f16>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int6_892 = torch.constant.int 6
    %764 = torch.prims.convert_element_type %760, %int6_892 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %765 = torch.aten.mm %763, %764 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Multiplications by the constant 1 again; values are unchanged.
    %int1_893 = torch.constant.int 1
    %766 = torch.aten.mul.Scalar %765, %int1_893 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_894 = torch.constant.int 1
    %767 = torch.aten.mul.Scalar %762, %int1_894 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_895 = torch.constant.int 1
    %768 = torch.aten.add.Tensor %766, %767, %int1_895 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_896 = torch.constant.int 5
    %769 = torch.prims.convert_element_type %768, %int5_896 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_897 = torch.constant.int 1
    %int4096_898 = torch.constant.int 4096
    %int3072_899 = torch.constant.int 3072
    %770 = torch.prim.ListConstruct %int1_897, %int4096_898, %int3072_899 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %771 = torch.aten.view %769, %770 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Scale the MLP output by %543 (defined above this excerpt; presumably a gate - confirm)
    // and add it onto %730, the tensor that fed the normalization.
    %772 = torch.aten.mul.Tensor %543, %771 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_900 = torch.constant.int 1
    %773 = torch.aten.add.Tensor %730, %772, %int1_900 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // double_blocks.1 txt_attn.proj: linear (3072 -> 3072) applied to %711, followed by
    //   %790 = %522 + %561 * proj_output
    // %711, %561 and %522 are defined above this excerpt
    // (NOTE(review): presumably the text attention output, a gate and the text residual - confirm).
    // Flatten %711 from [1,512,3072] to [512,3072] for the 2-D matmul.
    %int512_901 = torch.constant.int 512
    %int3072_902 = torch.constant.int 3072
    %774 = torch.prim.ListConstruct %int512_901, %int3072_902 : (!torch.int, !torch.int) -> !torch.list<int>
    %775 = torch.aten.view %711, %774 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Load the projection weight and transpose dims 0 and 1.
    %__auto.sampler.double_blocks.1.txt_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.1.txt_attn.proj.weight : tensor<3072x3072xf16>
    %776 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.txt_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_903 = torch.constant.int 0
    %int1_904 = torch.constant.int 1
    %777 = torch.aten.transpose.int %776, %int0_903, %int1_904 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.1.txt_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.1.txt_attn.proj.bias : tensor<3072xf16>
    %778 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.txt_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Bias, activation and weight are converted to f32 before the matmul.
    %int6_905 = torch.constant.int 6
    %779 = torch.prims.convert_element_type %778, %int6_905 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_906 = torch.constant.int 6
    %780 = torch.prims.convert_element_type %775, %int6_906 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_907 = torch.constant.int 6
    %781 = torch.prims.convert_element_type %777, %int6_907 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %782 = torch.aten.mm %780, %781 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Both mul.Scalar ops multiply by the constant 1, leaving the values unchanged.
    %int1_908 = torch.constant.int 1
    %783 = torch.aten.mul.Scalar %782, %int1_908 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_909 = torch.constant.int 1
    %784 = torch.aten.mul.Scalar %779, %int1_909 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_910 = torch.constant.int 1
    %785 = torch.aten.add.Tensor %783, %784, %int1_910 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    // Cast back to f16 and restore the leading batch dimension.
    %int5_911 = torch.constant.int 5
    %786 = torch.prims.convert_element_type %785, %int5_911 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_912 = torch.constant.int 1
    %int512_913 = torch.constant.int 512
    %int3072_914 = torch.constant.int 3072
    %787 = torch.prim.ListConstruct %int1_912, %int512_913, %int3072_914 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %788 = torch.aten.view %786, %787 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Broadcast multiply by %561, then add onto %522.
    %789 = torch.aten.mul.Tensor %561, %788 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_915 = torch.constant.int 1
    %790 = torch.aten.add.Tensor %522, %789, %int1_915 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Normalize %790 over its last dimension and apply an elementwise scale and shift:
    //   %800 = (%563 + 1) * normalize(%790) + %562
    // %563 and %562 are defined above this excerpt (presumably the scale and shift - confirm).
    %int1_916 = torch.constant.int 1
    %int1_917 = torch.constant.int 1
    %791 = torch.aten.add.Scalar %563, %int1_916, %int1_917 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Statistics are computed on an f32 copy of the f16 input.
    %int6_918 = torch.constant.int 6
    %792 = torch.prims.convert_element_type %790, %int6_918 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // var_mean over dim 2 with correction 0 and keepdim = true; both results are [1,512,1].
    %int2_919 = torch.constant.int 2
    %793 = torch.prim.ListConstruct %int2_919 : (!torch.int) -> !torch.list<int>
    %int0_920 = torch.constant.int 0
    %true_921 = torch.constant.bool true
    %result0_922, %result1_923 = torch.aten.var_mean.correction %792, %793, %int0_920, %true_921 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(result0 + ~1e-6): the first result is used as the variance term.
    %float9.999990e-07_924 = torch.constant.float 9.9999999999999995E-7
    %int1_925 = torch.constant.int 1
    %794 = torch.aten.add.Scalar %result0_922, %float9.999990e-07_924, %int1_925 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %795 = torch.aten.rsqrt %794 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // Subtract the second result (used as the mean) from the original f16 input; the result is f32.
    %int1_926 = torch.constant.int 1
    %796 = torch.aten.sub.Tensor %790, %result1_923, %int1_926 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %797 = torch.aten.mul.Tensor %796, %795 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    // Cast the normalized value back to f16 before the scale and shift.
    %int5_927 = torch.constant.int 5
    %798 = torch.prims.convert_element_type %797, %int5_927 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Broadcast multiply by %791, then broadcast add of %562.
    %799 = torch.aten.mul.Tensor %791, %798 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_928 = torch.constant.int 1
    %800 = torch.aten.add.Tensor %799, %562, %int1_928 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // double_blocks.1 txt_mlp: linear (3072 -> 12288), GELU with "tanh" approximation,
    // linear (12288 -> 3072), then %833 = %790 + %564 * mlp_output.
    // Each linear is expressed as: cast operands to f32, mm with the transposed weight,
    // add bias, cast back to f16.
    // Flatten %800 from [1,512,3072] to [512,3072] for the 2-D matmul.
    %int512_929 = torch.constant.int 512
    %int3072_930 = torch.constant.int 3072
    %801 = torch.prim.ListConstruct %int512_929, %int3072_930 : (!torch.int, !torch.int) -> !torch.list<int>
    %802 = torch.aten.view %800, %801 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // First layer weight is stored as [12288,3072]; transpose to [3072,12288] for x @ W^T.
    %__auto.sampler.double_blocks.1.txt_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.1.txt_mlp.0.weight : tensor<12288x3072xf16>
    %803 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.txt_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_931 = torch.constant.int 0
    %int1_932 = torch.constant.int 1
    %804 = torch.aten.transpose.int %803, %int0_931, %int1_932 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.1.txt_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.1.txt_mlp.0.bias : tensor<12288xf16>
    %805 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.txt_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Bias, activation and weight are all converted to f32 before the matmul.
    %int6_933 = torch.constant.int 6
    %806 = torch.prims.convert_element_type %805, %int6_933 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_934 = torch.constant.int 6
    %807 = torch.prims.convert_element_type %802, %int6_934 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_935 = torch.constant.int 6
    %808 = torch.prims.convert_element_type %804, %int6_935 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %809 = torch.aten.mm %807, %808 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[512,12288],f32>
    // The two mul.Scalar ops multiply by the constant 1, so they leave the values unchanged.
    %int1_936 = torch.constant.int 1
    %810 = torch.aten.mul.Scalar %809, %int1_936 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int1_937 = torch.constant.int 1
    %811 = torch.aten.mul.Scalar %806, %int1_937 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_938 = torch.constant.int 1
    %812 = torch.aten.add.Tensor %810, %811, %int1_938 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    // Cast back to f16 and restore the leading batch dimension.
    %int5_939 = torch.constant.int 5
    %813 = torch.prims.convert_element_type %812, %int5_939 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f16>
    %int1_940 = torch.constant.int 1
    %int512_941 = torch.constant.int 512
    %int12288_942 = torch.constant.int 12288
    %814 = torch.prim.ListConstruct %int1_940, %int512_941, %int12288_942 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %815 = torch.aten.view %813, %814 : !torch.vtensor<[512,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,512,12288],f16>
    // GELU with the "tanh" approximation, applied in f16.
    %str_943 = torch.constant.str "tanh"
    %816 = torch.aten.gelu %815, %str_943 : !torch.vtensor<[1,512,12288],f16>, !torch.str -> !torch.vtensor<[1,512,12288],f16>
    // Flatten again to [512,12288] for the second linear layer.
    %int512_944 = torch.constant.int 512
    %int12288_945 = torch.constant.int 12288
    %817 = torch.prim.ListConstruct %int512_944, %int12288_945 : (!torch.int, !torch.int) -> !torch.list<int>
    %818 = torch.aten.view %816, %817 : !torch.vtensor<[1,512,12288],f16>, !torch.list<int> -> !torch.vtensor<[512,12288],f16>
    // Second layer weight is stored as [3072,12288]; transpose to [12288,3072].
    %__auto.sampler.double_blocks.1.txt_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.1.txt_mlp.2.weight : tensor<3072x12288xf16>
    %819 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.txt_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_946 = torch.constant.int 0
    %int1_947 = torch.constant.int 1
    %820 = torch.aten.transpose.int %819, %int0_946, %int1_947 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.1.txt_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.1.txt_mlp.2.bias : tensor<3072xf16>
    %821 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.1.txt_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_948 = torch.constant.int 6
    %822 = torch.prims.convert_element_type %821, %int6_948 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_949 = torch.constant.int 6
    %823 = torch.prims.convert_element_type %818, %int6_949 : !torch.vtensor<[512,12288],f16>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int6_950 = torch.constant.int 6
    %824 = torch.prims.convert_element_type %820, %int6_950 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %825 = torch.aten.mm %823, %824 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Multiplications by the constant 1 again; values are unchanged.
    %int1_951 = torch.constant.int 1
    %826 = torch.aten.mul.Scalar %825, %int1_951 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_952 = torch.constant.int 1
    %827 = torch.aten.mul.Scalar %822, %int1_952 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_953 = torch.constant.int 1
    %828 = torch.aten.add.Tensor %826, %827, %int1_953 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_954 = torch.constant.int 5
    %829 = torch.prims.convert_element_type %828, %int5_954 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_955 = torch.constant.int 1
    %int512_956 = torch.constant.int 512
    %int3072_957 = torch.constant.int 3072
    %830 = torch.prim.ListConstruct %int1_955, %int512_956, %int3072_957 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %831 = torch.aten.view %829, %830 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Scale the MLP output by %564 (defined above this excerpt; presumably a gate - confirm)
    // and add it onto %790, the tensor that fed the normalization.
    %832 = torch.aten.mul.Tensor %564, %831 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_958 = torch.constant.int 1
    %833 = torch.aten.add.Tensor %790, %832, %int1_958 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // double_blocks.2 img_mod: SiLU of %119 (defined above this excerpt), a linear layer
    // (3072 -> 18432), then the result is split along the last dimension into six
    // [1,1,3072] tensors %849..%854.
    %834 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Weight is stored as [18432,3072]; transpose to [3072,18432] for x @ W^T.
    %__auto.sampler.double_blocks.2.img_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.2.img_mod.lin.weight : tensor<18432x3072xf16>
    %835 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.img_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_959 = torch.constant.int 0
    %int1_960 = torch.constant.int 1
    %836 = torch.aten.transpose.int %835, %int0_959, %int1_960 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.2.img_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.2.img_mod.lin.bias : tensor<18432xf16>
    %837 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.img_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Bias, activation and weight are converted to f32 before the matmul.
    %int6_961 = torch.constant.int 6
    %838 = torch.prims.convert_element_type %837, %int6_961 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_962 = torch.constant.int 6
    %839 = torch.prims.convert_element_type %834, %int6_962 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_963 = torch.constant.int 6
    %840 = torch.prims.convert_element_type %836, %int6_963 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %841 = torch.aten.mm %839, %840 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Both mul.Scalar ops multiply by the constant 1, leaving the values unchanged.
    %int1_964 = torch.constant.int 1
    %842 = torch.aten.mul.Scalar %841, %int1_964 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_965 = torch.constant.int 1
    %843 = torch.aten.mul.Scalar %838, %int1_965 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_966 = torch.constant.int 1
    %844 = torch.aten.add.Tensor %842, %843, %int1_966 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_967 = torch.constant.int 5
    %845 = torch.prims.convert_element_type %844, %int5_967 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): the shape is unchanged.
    %int0_968 = torch.constant.int 0
    %int0_969 = torch.constant.int 0
    %int9223372036854775807_970 = torch.constant.int 9223372036854775807
    %int1_971 = torch.constant.int 1
    %846 = torch.aten.slice.Tensor %845, %int0_968, %int0_969, %int9223372036854775807_970, %int1_971 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a size-1 dimension at index 1 -> [1,1,18432].
    %int1_972 = torch.constant.int 1
    %847 = torch.aten.unsqueeze %846, %int1_972 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Full-range slice on dim 2: the shape is unchanged.
    %int2_973 = torch.constant.int 2
    %int0_974 = torch.constant.int 0
    %int9223372036854775807_975 = torch.constant.int 9223372036854775807
    %int1_976 = torch.constant.int 1
    %848 = torch.aten.slice.Tensor %847, %int2_973, %int0_974, %int9223372036854775807_975, %int1_976 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Chunk 0: last-dim range [0, 3072).
    %int-1_977 = torch.constant.int -1
    %int0_978 = torch.constant.int 0
    %int3072_979 = torch.constant.int 3072
    %int1_980 = torch.constant.int 1
    %849 = torch.aten.slice.Tensor %848, %int-1_977, %int0_978, %int3072_979, %int1_980 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1: last-dim range [3072, 6144).
    %int-1_981 = torch.constant.int -1
    %int3072_982 = torch.constant.int 3072
    %int6144_983 = torch.constant.int 6144
    %int1_984 = torch.constant.int 1
    %850 = torch.aten.slice.Tensor %848, %int-1_981, %int3072_982, %int6144_983, %int1_984 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2: last-dim range [6144, 9216).
    %int-1_985 = torch.constant.int -1
    %int6144_986 = torch.constant.int 6144
    %int9216_987 = torch.constant.int 9216
    %int1_988 = torch.constant.int 1
    %851 = torch.aten.slice.Tensor %848, %int-1_985, %int6144_986, %int9216_987, %int1_988 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 3: last-dim range [9216, 12288).
    %int-1_989 = torch.constant.int -1
    %int9216_990 = torch.constant.int 9216
    %int12288_991 = torch.constant.int 12288
    %int1_992 = torch.constant.int 1
    %852 = torch.aten.slice.Tensor %848, %int-1_989, %int9216_990, %int12288_991, %int1_992 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 4: last-dim range [12288, 15360).
    %int-1_993 = torch.constant.int -1
    %int12288_994 = torch.constant.int 12288
    %int15360_995 = torch.constant.int 15360
    %int1_996 = torch.constant.int 1
    %853 = torch.aten.slice.Tensor %848, %int-1_993, %int12288_994, %int15360_995, %int1_996 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 5: last-dim range [15360, 18432).
    %int-1_997 = torch.constant.int -1
    %int15360_998 = torch.constant.int 15360
    %int18432_999 = torch.constant.int 18432
    %int1_1000 = torch.constant.int 1
    %854 = torch.aten.slice.Tensor %848, %int-1_997, %int15360_998, %int18432_999, %int1_1000 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // double_blocks.2 txt_mod: SiLU of %119 (defined above this excerpt), a linear layer
    // (3072 -> 18432), then the result is split along the last dimension into six
    // [1,1,3072] tensors %870..%875.
    // NOTE(review): this applies silu to the same operand %119 as %834 does; the exporter
    // emitted it twice rather than reusing the earlier value.
    %855 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Weight is stored as [18432,3072]; transpose to [3072,18432] for x @ W^T.
    %__auto.sampler.double_blocks.2.txt_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.2.txt_mod.lin.weight : tensor<18432x3072xf16>
    %856 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.txt_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_1001 = torch.constant.int 0
    %int1_1002 = torch.constant.int 1
    %857 = torch.aten.transpose.int %856, %int0_1001, %int1_1002 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.2.txt_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.2.txt_mod.lin.bias : tensor<18432xf16>
    %858 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.txt_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Bias, activation and weight are converted to f32 before the matmul.
    %int6_1003 = torch.constant.int 6
    %859 = torch.prims.convert_element_type %858, %int6_1003 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_1004 = torch.constant.int 6
    %860 = torch.prims.convert_element_type %855, %int6_1004 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_1005 = torch.constant.int 6
    %861 = torch.prims.convert_element_type %857, %int6_1005 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %862 = torch.aten.mm %860, %861 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Both mul.Scalar ops multiply by the constant 1, leaving the values unchanged.
    %int1_1006 = torch.constant.int 1
    %863 = torch.aten.mul.Scalar %862, %int1_1006 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_1007 = torch.constant.int 1
    %864 = torch.aten.mul.Scalar %859, %int1_1007 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_1008 = torch.constant.int 1
    %865 = torch.aten.add.Tensor %863, %864, %int1_1008 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_1009 = torch.constant.int 5
    %866 = torch.prims.convert_element_type %865, %int5_1009 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): the shape is unchanged.
    %int0_1010 = torch.constant.int 0
    %int0_1011 = torch.constant.int 0
    %int9223372036854775807_1012 = torch.constant.int 9223372036854775807
    %int1_1013 = torch.constant.int 1
    %867 = torch.aten.slice.Tensor %866, %int0_1010, %int0_1011, %int9223372036854775807_1012, %int1_1013 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a size-1 dimension at index 1 -> [1,1,18432].
    %int1_1014 = torch.constant.int 1
    %868 = torch.aten.unsqueeze %867, %int1_1014 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Full-range slice on dim 2: the shape is unchanged.
    %int2_1015 = torch.constant.int 2
    %int0_1016 = torch.constant.int 0
    %int9223372036854775807_1017 = torch.constant.int 9223372036854775807
    %int1_1018 = torch.constant.int 1
    %869 = torch.aten.slice.Tensor %868, %int2_1015, %int0_1016, %int9223372036854775807_1017, %int1_1018 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Chunk 0: last-dim range [0, 3072).
    %int-1_1019 = torch.constant.int -1
    %int0_1020 = torch.constant.int 0
    %int3072_1021 = torch.constant.int 3072
    %int1_1022 = torch.constant.int 1
    %870 = torch.aten.slice.Tensor %869, %int-1_1019, %int0_1020, %int3072_1021, %int1_1022 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1: last-dim range [3072, 6144).
    %int-1_1023 = torch.constant.int -1
    %int3072_1024 = torch.constant.int 3072
    %int6144_1025 = torch.constant.int 6144
    %int1_1026 = torch.constant.int 1
    %871 = torch.aten.slice.Tensor %869, %int-1_1023, %int3072_1024, %int6144_1025, %int1_1026 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2: last-dim range [6144, 9216).
    %int-1_1027 = torch.constant.int -1
    %int6144_1028 = torch.constant.int 6144
    %int9216_1029 = torch.constant.int 9216
    %int1_1030 = torch.constant.int 1
    %872 = torch.aten.slice.Tensor %869, %int-1_1027, %int6144_1028, %int9216_1029, %int1_1030 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 3: last-dim range [9216, 12288).
    %int-1_1031 = torch.constant.int -1
    %int9216_1032 = torch.constant.int 9216
    %int12288_1033 = torch.constant.int 12288
    %int1_1034 = torch.constant.int 1
    %873 = torch.aten.slice.Tensor %869, %int-1_1031, %int9216_1032, %int12288_1033, %int1_1034 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 4: last-dim range [12288, 15360).
    %int-1_1035 = torch.constant.int -1
    %int12288_1036 = torch.constant.int 12288
    %int15360_1037 = torch.constant.int 15360
    %int1_1038 = torch.constant.int 1
    %874 = torch.aten.slice.Tensor %869, %int-1_1035, %int12288_1036, %int15360_1037, %int1_1038 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 5: last-dim range [15360, 18432).
    %int-1_1039 = torch.constant.int -1
    %int15360_1040 = torch.constant.int 15360
    %int18432_1041 = torch.constant.int 18432
    %int1_1042 = torch.constant.int 1
    %875 = torch.aten.slice.Tensor %869, %int-1_1039, %int15360_1040, %int18432_1041, %int1_1042 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %773 over its last dimension and apply an elementwise scale and shift
    // taken from the double_blocks.2 img_mod slices:
    //   %885 = (%850 + 1) * normalize(%773) + %849
    // Statistics are computed on an f32 copy of the f16 input.
    %int6_1043 = torch.constant.int 6
    %876 = torch.prims.convert_element_type %773, %int6_1043 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // var_mean over dim 2 with correction 0 and keepdim = true; both results are [1,4096,1].
    %int2_1044 = torch.constant.int 2
    %877 = torch.prim.ListConstruct %int2_1044 : (!torch.int) -> !torch.list<int>
    %int0_1045 = torch.constant.int 0
    %true_1046 = torch.constant.bool true
    %result0_1047, %result1_1048 = torch.aten.var_mean.correction %876, %877, %int0_1045, %true_1046 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(result0 + ~1e-6): the first result is used as the variance term.
    %float9.999990e-07_1049 = torch.constant.float 9.9999999999999995E-7
    %int1_1050 = torch.constant.int 1
    %878 = torch.aten.add.Scalar %result0_1047, %float9.999990e-07_1049, %int1_1050 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %879 = torch.aten.rsqrt %878 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // Subtract the second result (used as the mean) from the original f16 input; the result is f32.
    %int1_1051 = torch.constant.int 1
    %880 = torch.aten.sub.Tensor %773, %result1_1048, %int1_1051 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %881 = torch.aten.mul.Tensor %880, %879 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Cast the normalized value back to f16 before the scale and shift.
    %int5_1052 = torch.constant.int 5
    %882 = torch.prims.convert_element_type %881, %int5_1052 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Scale factor is %850 + 1 (scalar 1, alpha 1).
    %int1_1053 = torch.constant.int 1
    %int1_1054 = torch.constant.int 1
    %883 = torch.aten.add.Scalar %850, %int1_1053, %int1_1054 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Broadcast multiply by %883, then broadcast add of %849.
    %884 = torch.aten.mul.Tensor %883, %882 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_1055 = torch.constant.int 1
    %885 = torch.aten.add.Tensor %884, %849, %int1_1055 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // double_blocks.2 img_attn.qkv: linear (3072 -> 9216) applied to %885, producing
    // %900 with shape [1,4096,9216]. The matmul and bias add run in f32; the result is
    // cast back to f16.
    // Flatten %885 from [1,4096,3072] to [4096,3072] for the 2-D matmul.
    %int4096_1056 = torch.constant.int 4096
    %int3072_1057 = torch.constant.int 3072
    %886 = torch.prim.ListConstruct %int4096_1056, %int3072_1057 : (!torch.int, !torch.int) -> !torch.list<int>
    %887 = torch.aten.view %885, %886 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Weight is stored as [9216,3072]; transpose to [3072,9216] for x @ W^T.
    %__auto.sampler.double_blocks.2.img_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.2.img_attn.qkv.weight : tensor<9216x3072xf16>
    %888 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.img_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_1058 = torch.constant.int 0
    %int1_1059 = torch.constant.int 1
    %889 = torch.aten.transpose.int %888, %int0_1058, %int1_1059 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.2.img_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.2.img_attn.qkv.bias : tensor<9216xf16>
    %890 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.img_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Bias, activation and weight are converted to f32 before the matmul.
    %int6_1060 = torch.constant.int 6
    %891 = torch.prims.convert_element_type %890, %int6_1060 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_1061 = torch.constant.int 6
    %892 = torch.prims.convert_element_type %887, %int6_1061 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_1062 = torch.constant.int 6
    %893 = torch.prims.convert_element_type %889, %int6_1062 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %894 = torch.aten.mm %892, %893 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[4096,9216],f32>
    // Both mul.Scalar ops multiply by the constant 1, leaving the values unchanged.
    %int1_1063 = torch.constant.int 1
    %895 = torch.aten.mul.Scalar %894, %int1_1063 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int1_1064 = torch.constant.int 1
    %896 = torch.aten.mul.Scalar %891, %int1_1064 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_1065 = torch.constant.int 1
    %897 = torch.aten.add.Tensor %895, %896, %int1_1065 : !torch.vtensor<[4096,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    // Cast back to f16 and restore the leading batch dimension.
    %int5_1066 = torch.constant.int 5
    %898 = torch.prims.convert_element_type %897, %int5_1066 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f16>
    %int1_1067 = torch.constant.int 1
    %int4096_1068 = torch.constant.int 4096
    %int9216_1069 = torch.constant.int 9216
    %899 = torch.prim.ListConstruct %int1_1067, %int4096_1068, %int9216_1069 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %900 = torch.aten.view %898, %899 : !torch.vtensor<[4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,9216],f16>
    // Split the fused 9216-wide projection into three [1,24,4096,128] tensors.
    // First view 9216 as 3 x 24 x 128.
    %int1_1070 = torch.constant.int 1
    %int4096_1071 = torch.constant.int 4096
    %int3_1072 = torch.constant.int 3
    %int24_1073 = torch.constant.int 24
    %int128_1074 = torch.constant.int 128
    %901 = torch.prim.ListConstruct %int1_1070, %int4096_1071, %int3_1072, %int24_1073, %int128_1074 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %902 = torch.aten.view %900, %901 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    // Permute with order (2,0,3,1,4) so the size-3 axis leads: [3,1,24,4096,128].
    %int2_1075 = torch.constant.int 2
    %int0_1076 = torch.constant.int 0
    %int3_1077 = torch.constant.int 3
    %int1_1078 = torch.constant.int 1
    %int4_1079 = torch.constant.int 4
    %903 = torch.prim.ListConstruct %int2_1075, %int0_1076, %int3_1077, %int1_1078, %int4_1079 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %904 = torch.aten.permute %902, %903 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // Index 0 on dim 0: later normalized with query_norm.scale (the query).
    %int0_1080 = torch.constant.int 0
    %int0_1081 = torch.constant.int 0
    %905 = torch.aten.select.int %904, %int0_1080, %int0_1081 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Index 1 on dim 0: later normalized with key_norm.scale (the key).
    %int0_1082 = torch.constant.int 0
    %int1_1083 = torch.constant.int 1
    %906 = torch.aten.select.int %904, %int0_1082, %int1_1083 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Index 2 on dim 0: used un-normalized as the third attention operand (the value).
    %int0_1084 = torch.constant.int 0
    %int2_1085 = torch.constant.int 2
    %907 = torch.aten.select.int %904, %int0_1084, %int2_1085 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // RMS-normalize the image query (%905) and key (%906) over the last (128) dimension:
    //   y = f16(x * rsqrt(mean(x^2, dim=-1, keepdim) + eps)) * scale
    // The statistics are computed in f32; the learned [128] scale is applied in f16.
    // --- query ---
    %int6_1086 = torch.constant.int 6
    %908 = torch.prims.convert_element_type %905, %int6_1086 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_1087 = torch.constant.int 2
    %909 = torch.aten.pow.Tensor_Scalar %908, %int2_1087 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    // Mean of squares over dim -1 with keepdim = true and no dtype override.
    %int-1_1088 = torch.constant.int -1
    %910 = torch.prim.ListConstruct %int-1_1088 : (!torch.int) -> !torch.list<int>
    %true_1089 = torch.constant.bool true
    %none_1090 = torch.constant.none
    %911 = torch.aten.mean.dim %909, %910, %true_1089, %none_1090 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    // Add the epsilon (about 1e-6) before taking the reciprocal square root.
    %float9.999990e-07_1091 = torch.constant.float 9.9999999999999995E-7
    %int1_1092 = torch.constant.int 1
    %912 = torch.aten.add.Scalar %911, %float9.999990e-07_1091, %int1_1092 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %913 = torch.aten.rsqrt %912 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %914 = torch.aten.mul.Tensor %908, %913 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    %int5_1093 = torch.constant.int 5
    %915 = torch.prims.convert_element_type %914, %int5_1093 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Apply the learned per-channel query scale.
    %__auto.sampler.double_blocks.2.img_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.2.img_attn.norm.query_norm.scale : tensor<128xf16>
    %916 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.img_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %917 = torch.aten.mul.Tensor %915, %916 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // --- key: same sequence of ops, using key_norm.scale ---
    %int6_1094 = torch.constant.int 6
    %918 = torch.prims.convert_element_type %906, %int6_1094 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_1095 = torch.constant.int 2
    %919 = torch.aten.pow.Tensor_Scalar %918, %int2_1095 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_1096 = torch.constant.int -1
    %920 = torch.prim.ListConstruct %int-1_1096 : (!torch.int) -> !torch.list<int>
    %true_1097 = torch.constant.bool true
    %none_1098 = torch.constant.none
    %921 = torch.aten.mean.dim %919, %920, %true_1097, %none_1098 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_1099 = torch.constant.float 9.9999999999999995E-7
    %int1_1100 = torch.constant.int 1
    %922 = torch.aten.add.Scalar %921, %float9.999990e-07_1099, %int1_1100 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %923 = torch.aten.rsqrt %922 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %924 = torch.aten.mul.Tensor %918, %923 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    %int5_1101 = torch.constant.int 5
    %925 = torch.prims.convert_element_type %924, %int5_1101 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.2.img_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.2.img_attn.norm.key_norm.scale : tensor<128xf16>
    %926 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.img_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %927 = torch.aten.mul.Tensor %925, %926 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // f16 -> f16 conversions: input and result types are identical, so these do not change
    // the dtype (presumably a `.to(dtype)` call recorded by the exporter — TODO confirm).
    %int5_1102 = torch.constant.int 5
    %928 = torch.prims.convert_element_type %917, %int5_1102 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_1103 = torch.constant.int 5
    %929 = torch.prims.convert_element_type %927, %int5_1103 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Normalize %833 (f16 [1,512,3072]) over its last dimension without learned affine
    // parameters, then apply a scale/shift; the result feeds double_blocks.2.txt_attn.qkv.
    //   y = (1 + %871) * f16((x - mean) * rsqrt(var + eps)) + %870
    %int6_1104 = torch.constant.int 6
    %930 = torch.prims.convert_element_type %833, %int6_1104 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // Variance (result0) and mean (result1) over dim 2, correction = 0, keepdim = true.
    %int2_1105 = torch.constant.int 2
    %931 = torch.prim.ListConstruct %int2_1105 : (!torch.int) -> !torch.list<int>
    %int0_1106 = torch.constant.int 0
    %true_1107 = torch.constant.bool true
    %result0_1108, %result1_1109 = torch.aten.var_mean.correction %930, %931, %int0_1106, %true_1107 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(var + eps), with eps about 1e-6.
    %float9.999990e-07_1110 = torch.constant.float 9.9999999999999995E-7
    %int1_1111 = torch.constant.int 1
    %932 = torch.aten.add.Scalar %result0_1108, %float9.999990e-07_1110, %int1_1111 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %933 = torch.aten.rsqrt %932 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // The mean is subtracted from the original f16 tensor (%833), not the upcast copy; the
    // result type is f32.
    %int1_1112 = torch.constant.int 1
    %934 = torch.aten.sub.Tensor %833, %result1_1109, %int1_1112 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %935 = torch.aten.mul.Tensor %934, %933 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_1113 = torch.constant.int 5
    %936 = torch.prims.convert_element_type %935, %int5_1113 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Scale/shift with [1,1,3072] tensors defined above this excerpt.
    // NOTE(review): %871 / %870 are presumably the text-stream modulation scale/shift — confirm.
    %int1_1114 = torch.constant.int 1
    %int1_1115 = torch.constant.int 1
    %937 = torch.aten.add.Scalar %871, %int1_1114, %int1_1115 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %938 = torch.aten.mul.Tensor %937, %936 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_1116 = torch.constant.int 1
    %939 = torch.aten.add.Tensor %938, %870, %int1_1116 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Drop the leading unit dimension for the 2-D matmul below.
    %int512_1117 = torch.constant.int 512
    %int3072_1118 = torch.constant.int 3072
    %940 = torch.prim.ListConstruct %int512_1117, %int3072_1118 : (!torch.int, !torch.int) -> !torch.list<int>
    %941 = torch.aten.view %939, %940 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Linear projection with the double_blocks.2.txt_attn.qkv parameters:
    // %941 [512,3072] x weight^T [3072,9216] + bias [9216], computed in f32 and stored as f16.
    %__auto.sampler.double_blocks.2.txt_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.2.txt_attn.qkv.weight : tensor<9216x3072xf16>
    %942 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.txt_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    // Transpose the [9216,3072] weight to [3072,9216].
    %int0_1119 = torch.constant.int 0
    %int1_1120 = torch.constant.int 1
    %943 = torch.aten.transpose.int %942, %int0_1119, %int1_1120 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.2.txt_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.2.txt_attn.qkv.bias : tensor<9216xf16>
    %944 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.txt_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, activations and weight to f32 before the matmul.
    %int6_1121 = torch.constant.int 6
    %945 = torch.prims.convert_element_type %944, %int6_1121 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_1122 = torch.constant.int 6
    %946 = torch.prims.convert_element_type %941, %int6_1122 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_1123 = torch.constant.int 6
    %947 = torch.prims.convert_element_type %943, %int6_1123 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %948 = torch.aten.mm %946, %947 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[512,9216],f32>
    // Both the product and the bias are multiplied by the integer 1 before being added
    // (presumably alpha/beta factors left over from an addmm decomposition — TODO confirm).
    %int1_1124 = torch.constant.int 1
    %949 = torch.aten.mul.Scalar %948, %int1_1124 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int1_1125 = torch.constant.int 1
    %950 = torch.aten.mul.Scalar %945, %int1_1125 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_1126 = torch.constant.int 1
    %951 = torch.aten.add.Tensor %949, %950, %int1_1126 : !torch.vtensor<[512,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    // Cast back to f16 and restore the leading unit dimension.
    %int5_1127 = torch.constant.int 5
    %952 = torch.prims.convert_element_type %951, %int5_1127 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f16>
    %int1_1128 = torch.constant.int 1
    %int512_1129 = torch.constant.int 512
    %int9216_1130 = torch.constant.int 9216
    %953 = torch.prim.ListConstruct %int1_1128, %int512_1129, %int9216_1130 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %954 = torch.aten.view %952, %953 : !torch.vtensor<[512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,9216],f16>
    // Split the fused 9216-wide text projection into three [1,24,512,128] tensors.
    // First view 9216 as 3 x 24 x 128.
    %int1_1131 = torch.constant.int 1
    %int512_1132 = torch.constant.int 512
    %int3_1133 = torch.constant.int 3
    %int24_1134 = torch.constant.int 24
    %int128_1135 = torch.constant.int 128
    %955 = torch.prim.ListConstruct %int1_1131, %int512_1132, %int3_1133, %int24_1134, %int128_1135 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %956 = torch.aten.view %954, %955 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    // Permute with order (2,0,3,1,4) so the size-3 axis leads: [3,1,24,512,128].
    %int2_1136 = torch.constant.int 2
    %int0_1137 = torch.constant.int 0
    %int3_1138 = torch.constant.int 3
    %int1_1139 = torch.constant.int 1
    %int4_1140 = torch.constant.int 4
    %957 = torch.prim.ListConstruct %int2_1136, %int0_1137, %int3_1138, %int1_1139, %int4_1140 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %958 = torch.aten.permute %956, %957 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // Index 0 on dim 0: later normalized with query_norm.scale (the query).
    %int0_1141 = torch.constant.int 0
    %int0_1142 = torch.constant.int 0
    %959 = torch.aten.select.int %958, %int0_1141, %int0_1142 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Index 1 on dim 0: later normalized with key_norm.scale (the key).
    %int0_1143 = torch.constant.int 0
    %int1_1144 = torch.constant.int 1
    %960 = torch.aten.select.int %958, %int0_1143, %int1_1144 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Index 2 on dim 0: concatenated un-normalized into the third attention operand (the value).
    %int0_1145 = torch.constant.int 0
    %int2_1146 = torch.constant.int 2
    %961 = torch.aten.select.int %958, %int0_1145, %int2_1146 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // RMS-normalize the text query (%959) and key (%960) over the last (128) dimension:
    //   y = f16(x * rsqrt(mean(x^2, dim=-1, keepdim) + eps)) * scale
    // The statistics are computed in f32; the learned [128] scale is applied in f16.
    // --- query ---
    %int6_1147 = torch.constant.int 6
    %962 = torch.prims.convert_element_type %959, %int6_1147 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_1148 = torch.constant.int 2
    %963 = torch.aten.pow.Tensor_Scalar %962, %int2_1148 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    // Mean of squares over dim -1 with keepdim = true and no dtype override.
    %int-1_1149 = torch.constant.int -1
    %964 = torch.prim.ListConstruct %int-1_1149 : (!torch.int) -> !torch.list<int>
    %true_1150 = torch.constant.bool true
    %none_1151 = torch.constant.none
    %965 = torch.aten.mean.dim %963, %964, %true_1150, %none_1151 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    // Add the epsilon (about 1e-6) before taking the reciprocal square root.
    %float9.999990e-07_1152 = torch.constant.float 9.9999999999999995E-7
    %int1_1153 = torch.constant.int 1
    %966 = torch.aten.add.Scalar %965, %float9.999990e-07_1152, %int1_1153 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %967 = torch.aten.rsqrt %966 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %968 = torch.aten.mul.Tensor %962, %967 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_1154 = torch.constant.int 5
    %969 = torch.prims.convert_element_type %968, %int5_1154 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Apply the learned per-channel query scale.
    %__auto.sampler.double_blocks.2.txt_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.2.txt_attn.norm.query_norm.scale : tensor<128xf16>
    %970 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.txt_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %971 = torch.aten.mul.Tensor %969, %970 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // --- key: same sequence of ops, using key_norm.scale ---
    %int6_1155 = torch.constant.int 6
    %972 = torch.prims.convert_element_type %960, %int6_1155 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_1156 = torch.constant.int 2
    %973 = torch.aten.pow.Tensor_Scalar %972, %int2_1156 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_1157 = torch.constant.int -1
    %974 = torch.prim.ListConstruct %int-1_1157 : (!torch.int) -> !torch.list<int>
    %true_1158 = torch.constant.bool true
    %none_1159 = torch.constant.none
    %975 = torch.aten.mean.dim %973, %974, %true_1158, %none_1159 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_1160 = torch.constant.float 9.9999999999999995E-7
    %int1_1161 = torch.constant.int 1
    %976 = torch.aten.add.Scalar %975, %float9.999990e-07_1160, %int1_1161 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %977 = torch.aten.rsqrt %976 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %978 = torch.aten.mul.Tensor %972, %977 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_1162 = torch.constant.int 5
    %979 = torch.prims.convert_element_type %978, %int5_1162 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.2.txt_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.2.txt_attn.norm.key_norm.scale : tensor<128xf16>
    %980 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.txt_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %981 = torch.aten.mul.Tensor %979, %980 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // f16 -> f16 conversions: input and result types are identical, so these do not change
    // the dtype (presumably a `.to(dtype)` call recorded by the exporter — TODO confirm).
    %int5_1163 = torch.constant.int 5
    %982 = torch.prims.convert_element_type %971, %int5_1163 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int5_1164 = torch.constant.int 5
    %983 = torch.prims.convert_element_type %981, %int5_1164 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Concatenate the text (512 rows) and image (4096 rows) tensors along dim 2, text first,
    // giving [1,24,4608,128] for each of the three attention operands.
    // Query: normalized text query followed by normalized image query.
    %984 = torch.prim.ListConstruct %982, %928 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_1165 = torch.constant.int 2
    %985 = torch.aten.cat %984, %int2_1165 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Key: normalized text key followed by normalized image key.
    %986 = torch.prim.ListConstruct %983, %929 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_1166 = torch.constant.int 2
    %987 = torch.aten.cat %986, %int2_1166 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Value: the un-normalized index-2 slices of the text and image projections.
    %988 = torch.prim.ListConstruct %961, %907 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_1167 = torch.constant.int 2
    %989 = torch.aten.cat %988, %int2_1167 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Mix adjacent channel pairs of the concatenated query (%985) and key (%987) with the
    // per-position 2x2 coefficients in %211 ([1,1,4608,64,2,2], f32):
    //   out[..., i] = %211[..., i, 0] * x[..., 0] + %211[..., i, 1] * x[..., 1]
    // NOTE(review): %211 is defined above this excerpt; it presumably holds rotary position
    // embedding coefficients — confirm at its definition.
    // View the query in f32 as [1,24,4608,64,1,2]; the -1 entry resolves to 64.
    %int6_1168 = torch.constant.int 6
    %990 = torch.prims.convert_element_type %985, %int6_1168 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_1169 = torch.constant.int 1
    %int24_1170 = torch.constant.int 24
    %int4608_1171 = torch.constant.int 4608
    %int-1_1172 = torch.constant.int -1
    %int1_1173 = torch.constant.int 1
    %int2_1174 = torch.constant.int 2
    %991 = torch.prim.ListConstruct %int1_1169, %int24_1170, %int4608_1171, %int-1_1172, %int1_1173, %int2_1174 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %992 = torch.aten.view %990, %991 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Same upcast and view for the key.
    %int6_1175 = torch.constant.int 6
    %993 = torch.prims.convert_element_type %987, %int6_1175 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_1176 = torch.constant.int 1
    %int24_1177 = torch.constant.int 24
    %int4608_1178 = torch.constant.int 4608
    %int-1_1179 = torch.constant.int -1
    %int1_1180 = torch.constant.int 1
    %int2_1181 = torch.constant.int 2
    %994 = torch.prim.ListConstruct %int1_1176, %int24_1177, %int4608_1178, %int-1_1179, %int1_1180, %int2_1181 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %995 = torch.aten.view %993, %994 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Query, first term: %211[..., 0] * q[..., 0]; the size-1 axis of q broadcasts against
    // the size-2 axis of %211, and %211's size-1 dim 1 broadcasts across the 24 heads.
    %int5_1182 = torch.constant.int 5
    %int0_1183 = torch.constant.int 0
    %996 = torch.aten.select.int %211, %int5_1182, %int0_1183 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_1184 = torch.constant.int 5
    %int0_1185 = torch.constant.int 0
    %997 = torch.aten.select.int %992, %int5_1184, %int0_1185 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %998 = torch.aten.mul.Tensor %996, %997 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Query, second term: %211[..., 1] * q[..., 1].
    %int5_1186 = torch.constant.int 5
    %int1_1187 = torch.constant.int 1
    %999 = torch.aten.select.int %211, %int5_1186, %int1_1187 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_1188 = torch.constant.int 5
    %int1_1189 = torch.constant.int 1
    %1000 = torch.aten.select.int %992, %int5_1188, %int1_1189 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %1001 = torch.aten.mul.Tensor %999, %1000 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two query terms.
    %int1_1190 = torch.constant.int 1
    %1002 = torch.aten.add.Tensor %998, %1001, %int1_1190 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key, first term: %211[..., 0] * k[..., 0].
    %int5_1191 = torch.constant.int 5
    %int0_1192 = torch.constant.int 0
    %1003 = torch.aten.select.int %211, %int5_1191, %int0_1192 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_1193 = torch.constant.int 5
    %int0_1194 = torch.constant.int 0
    %1004 = torch.aten.select.int %995, %int5_1193, %int0_1194 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %1005 = torch.aten.mul.Tensor %1003, %1004 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key, second term: %211[..., 1] * k[..., 1].
    %int5_1195 = torch.constant.int 5
    %int1_1196 = torch.constant.int 1
    %1006 = torch.aten.select.int %211, %int5_1195, %int1_1196 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_1197 = torch.constant.int 5
    %int1_1198 = torch.constant.int 1
    %1007 = torch.aten.select.int %995, %int5_1197, %int1_1198 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %1008 = torch.aten.mul.Tensor %1006, %1007 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two key terms.
    %int1_1199 = torch.constant.int 1
    %1009 = torch.aten.add.Tensor %1005, %1008, %int1_1199 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Collapse the (64,2) pair axes of the query back to 128 channels and cast to f16.
    %int1_1200 = torch.constant.int 1
    %int24_1201 = torch.constant.int 24
    %int4608_1202 = torch.constant.int 4608
    %int128_1203 = torch.constant.int 128
    %1010 = torch.prim.ListConstruct %int1_1200, %int24_1201, %int4608_1202, %int128_1203 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1011 = torch.aten.view %1002, %1010 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_1204 = torch.constant.int 5
    %1012 = torch.prims.convert_element_type %1011, %int5_1204 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Same collapse and cast for the key.
    %int1_1205 = torch.constant.int 1
    %int24_1206 = torch.constant.int 24
    %int4608_1207 = torch.constant.int 4608
    %int128_1208 = torch.constant.int 128
    %1013 = torch.prim.ListConstruct %int1_1205, %int24_1206, %int4608_1207, %int128_1208 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1014 = torch.aten.view %1009, %1013 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_1209 = torch.constant.int 5
    %1015 = torch.prims.convert_element_type %1014, %int5_1209 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Attention over the joint 4608-row sequence, then merge the 24 heads back into 3072 channels.
    // Operands after (%1012, %1015, %989): float 0.0, bool false, none, none.
    // NOTE(review): per the aten signature these are presumably dropout_p, is_causal,
    // attn_mask and scale — confirm against the PyTorch op schema.
    %float0.000000e00_1210 = torch.constant.float 0.000000e+00
    %false_1211 = torch.constant.bool false
    %none_1212 = torch.constant.none
    %none_1213 = torch.constant.none
    // Result #0 is the [1,24,4608,128] f16 output; result #1 ([1,24,4608], f32) is not
    // consumed anywhere in the lines visible here.
    %1016:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%1012, %1015, %989, %float0.000000e00_1210, %false_1211, %none_1212, %none_1213) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2: [1,24,4608,128] -> [1,4608,24,128].
    %int0_1214 = torch.constant.int 0
    %int2_1215 = torch.constant.int 2
    %int1_1216 = torch.constant.int 1
    %int3_1217 = torch.constant.int 3
    %1017 = torch.prim.ListConstruct %int0_1214, %int2_1215, %int1_1216, %int3_1217 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1018 = torch.aten.permute %1016#0, %1017 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Flatten (24,128) into 3072 channels.
    %int1_1218 = torch.constant.int 1
    %int4608_1219 = torch.constant.int 4608
    %int3072_1220 = torch.constant.int 3072
    %1019 = torch.prim.ListConstruct %int1_1218, %int4608_1219, %int3072_1220 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1020 = torch.aten.view %1018, %1019 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Split the attention output along dim 1 back into its text part (rows 0..512) and its
    // image part (rows 512..end), mirroring the text-first concatenation order used earlier.
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): the shape is unchanged.
    %int0_1221 = torch.constant.int 0
    %int0_1222 = torch.constant.int 0
    %int9223372036854775807_1223 = torch.constant.int 9223372036854775807
    %int1_1224 = torch.constant.int 1
    %1021 = torch.aten.slice.Tensor %1020, %int0_1221, %int0_1222, %int9223372036854775807_1223, %int1_1224 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Text part: rows [0, 512) of dim 1. %1022 is not consumed within the lines visible here.
    %int1_1225 = torch.constant.int 1
    %int0_1226 = torch.constant.int 0
    %int512_1227 = torch.constant.int 512
    %int1_1228 = torch.constant.int 1
    %1022 = torch.aten.slice.Tensor %1021, %int1_1225, %int0_1226, %int512_1227, %int1_1228 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Second full-range slice on dim 0, again leaving the shape unchanged.
    %int0_1229 = torch.constant.int 0
    %int0_1230 = torch.constant.int 0
    %int9223372036854775807_1231 = torch.constant.int 9223372036854775807
    %int1_1232 = torch.constant.int 1
    %1023 = torch.aten.slice.Tensor %1020, %int0_1229, %int0_1230, %int9223372036854775807_1231, %int1_1232 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Image part: rows [512, end) of dim 1, i.e. the remaining 4096 rows.
    %int1_1233 = torch.constant.int 1
    %int512_1234 = torch.constant.int 512
    %int9223372036854775807_1235 = torch.constant.int 9223372036854775807
    %int1_1236 = torch.constant.int 1
    %1024 = torch.aten.slice.Tensor %1023, %int1_1233, %int512_1234, %int9223372036854775807_1235, %int1_1236 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear projection of the image attention output with the double_blocks.2.img_attn.proj
    // parameters: [4096,3072] x weight^T [3072,3072] + bias [3072], computed in f32, stored as f16.
    // Drop the leading unit dimension for the 2-D matmul.
    %int4096_1237 = torch.constant.int 4096
    %int3072_1238 = torch.constant.int 3072
    %1025 = torch.prim.ListConstruct %int4096_1237, %int3072_1238 : (!torch.int, !torch.int) -> !torch.list<int>
    %1026 = torch.aten.view %1024, %1025 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Load the [3072,3072] weight and swap its two dimensions.
    %__auto.sampler.double_blocks.2.img_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.2.img_attn.proj.weight : tensor<3072x3072xf16>
    %1027 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.img_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_1239 = torch.constant.int 0
    %int1_1240 = torch.constant.int 1
    %1028 = torch.aten.transpose.int %1027, %int0_1239, %int1_1240 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.2.img_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.2.img_attn.proj.bias : tensor<3072xf16>
    %1029 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.img_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, activations and weight to f32 before the matmul.
    %int6_1241 = torch.constant.int 6
    %1030 = torch.prims.convert_element_type %1029, %int6_1241 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_1242 = torch.constant.int 6
    %1031 = torch.prims.convert_element_type %1026, %int6_1242 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_1243 = torch.constant.int 6
    %1032 = torch.prims.convert_element_type %1028, %int6_1243 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %1033 = torch.aten.mm %1031, %1032 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Both the product and the bias are multiplied by the integer 1 before being added
    // (presumably alpha/beta factors left over from an addmm decomposition — TODO confirm).
    %int1_1244 = torch.constant.int 1
    %1034 = torch.aten.mul.Scalar %1033, %int1_1244 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_1245 = torch.constant.int 1
    %1035 = torch.aten.mul.Scalar %1030, %int1_1245 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_1246 = torch.constant.int 1
    %1036 = torch.aten.add.Tensor %1034, %1035, %int1_1246 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    // Cast back to f16 and restore the leading unit dimension.
    %int5_1247 = torch.constant.int 5
    %1037 = torch.prims.convert_element_type %1036, %int5_1247 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_1248 = torch.constant.int 1
    %int4096_1249 = torch.constant.int 4096
    %int3072_1250 = torch.constant.int 3072
    %1038 = torch.prim.ListConstruct %int1_1248, %int4096_1249, %int3072_1250 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1039 = torch.aten.view %1037, %1038 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    %1040 = torch.aten.mul.Tensor %851, %1039 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_1251 = torch.constant.int 1
    %1041 = torch.aten.add.Tensor %773, %1040, %int1_1251 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Normalizes %1041 over dim 2 and applies an elementwise scale and shift:
    //   %1051 = (%853 + 1) * normalize(%1041) + %852
    // The result is then viewed as [4096,3072] (%1053).
    // %852 and %853 are defined before this point; presumably modulation shift and
    // scale vectors -- TODO confirm.
    %int1_1252 = torch.constant.int 1
    %int1_1253 = torch.constant.int 1
    %1042 = torch.aten.add.Scalar %853, %int1_1252, %int1_1253 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Variance and mean are computed on an f32 copy of %1041, over dim 2, with
    // correction 0 and keepdim = true (outputs are [1,4096,1]).
    %int6_1254 = torch.constant.int 6
    %1043 = torch.prims.convert_element_type %1041, %int6_1254 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %int2_1255 = torch.constant.int 2
    %1044 = torch.prim.ListConstruct %int2_1255 : (!torch.int) -> !torch.list<int>
    %int0_1256 = torch.constant.int 0
    %true_1257 = torch.constant.bool true
    %result0_1258, %result1_1259 = torch.aten.var_mean.correction %1043, %1044, %int0_1256, %true_1257 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(variance + ~1e-6)
    %float9.999990e-07_1260 = torch.constant.float 9.9999999999999995E-7
    %int1_1261 = torch.constant.int 1
    %1045 = torch.aten.add.Scalar %result0_1258, %float9.999990e-07_1260, %int1_1261 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %1046 = torch.aten.rsqrt %1045 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // The mean is subtracted from the f16 tensor %1041 (not from the f32 copy
    // %1043); the result type of the subtraction is f32.
    %int1_1262 = torch.constant.int 1
    %1047 = torch.aten.sub.Tensor %1041, %result1_1259, %int1_1262 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %1048 = torch.aten.mul.Tensor %1047, %1046 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Back to f16 before the scale and shift are applied.
    %int5_1263 = torch.constant.int 5
    %1049 = torch.prims.convert_element_type %1048, %int5_1263 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %1050 = torch.aten.mul.Tensor %1042, %1049 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_1264 = torch.constant.int 1
    %1051 = torch.aten.add.Tensor %1050, %852, %int1_1264 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %int4096_1265 = torch.constant.int 4096
    %int3072_1266 = torch.constant.int 3072
    %1052 = torch.prim.ListConstruct %int4096_1265, %int3072_1266 : (!torch.int, !torch.int) -> !torch.list<int>
    %1053 = torch.aten.view %1051, %1052 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Linear layer using the double_blocks.2.img_mlp.0 weight and bias, followed by
    // a tanh-approximated GELU:
    //   %1064 = f16( f32(%1053) @ f32(weight^T) + f32(bias) )   : [4096,12288]
    //   %1067 = gelu(%1064 viewed as [1,4096,12288], "tanh")
    // The activation is then viewed back as [4096,12288] (%1069).
    %__auto.sampler.double_blocks.2.img_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.2.img_mlp.0.weight : tensor<12288x3072xf16>
    %1054 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.img_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    // Transpose the [12288,3072] weight to [3072,12288] so it can be the right-hand
    // operand of the matmul.
    %int0_1267 = torch.constant.int 0
    %int1_1268 = torch.constant.int 1
    %1055 = torch.aten.transpose.int %1054, %int0_1267, %int1_1268 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.2.img_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.2.img_mlp.0.bias : tensor<12288xf16>
    %1056 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.img_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Bias, input and weight are converted to f32; the matmul and bias add run in f32.
    %int6_1269 = torch.constant.int 6
    %1057 = torch.prims.convert_element_type %1056, %int6_1269 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_1270 = torch.constant.int 6
    %1058 = torch.prims.convert_element_type %1053, %int6_1270 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_1271 = torch.constant.int 6
    %1059 = torch.prims.convert_element_type %1055, %int6_1271 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %1060 = torch.aten.mm %1058, %1059 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[4096,12288],f32>
    // Both mul.Scalar ops multiply by the integer constant 1.
    %int1_1272 = torch.constant.int 1
    %1061 = torch.aten.mul.Scalar %1060, %int1_1272 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int1_1273 = torch.constant.int 1
    %1062 = torch.aten.mul.Scalar %1057, %int1_1273 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_1274 = torch.constant.int 1
    %1063 = torch.aten.add.Tensor %1061, %1062, %int1_1274 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int5_1275 = torch.constant.int 5
    %1064 = torch.prims.convert_element_type %1063, %int5_1275 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f16>
    %int1_1276 = torch.constant.int 1
    %int4096_1277 = torch.constant.int 4096
    %int12288_1278 = torch.constant.int 12288
    %1065 = torch.prim.ListConstruct %int1_1276, %int4096_1277, %int12288_1278 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1066 = torch.aten.view %1064, %1065 : !torch.vtensor<[4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,12288],f16>
    // GELU with the "tanh" approximation, evaluated on the f16 tensor.
    %str_1279 = torch.constant.str "tanh"
    %1067 = torch.aten.gelu %1066, %str_1279 : !torch.vtensor<[1,4096,12288],f16>, !torch.str -> !torch.vtensor<[1,4096,12288],f16>
    %int4096_1280 = torch.constant.int 4096
    %int12288_1281 = torch.constant.int 12288
    %1068 = torch.prim.ListConstruct %int4096_1280, %int12288_1281 : (!torch.int, !torch.int) -> !torch.list<int>
    %1069 = torch.aten.view %1067, %1068 : !torch.vtensor<[1,4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[4096,12288],f16>
    // Linear layer using the double_blocks.2.img_mlp.2 weight and bias, mapping
    // %1069 from [4096,12288] to [4096,3072], followed by a scaled residual add:
    //   %1084 = %1041 + %854 * (linear output viewed as [1,4096,3072])
    // %854 is defined before this point; presumably a modulation gate -- TODO confirm.
    %__auto.sampler.double_blocks.2.img_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.2.img_mlp.2.weight : tensor<3072x12288xf16>
    %1070 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.img_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    // Transpose the [3072,12288] weight to [12288,3072] for the matmul.
    %int0_1282 = torch.constant.int 0
    %int1_1283 = torch.constant.int 1
    %1071 = torch.aten.transpose.int %1070, %int0_1282, %int1_1283 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.2.img_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.2.img_mlp.2.bias : tensor<3072xf16>
    %1072 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.img_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Bias, input and weight are converted to f32; the matmul and bias add run in f32.
    %int6_1284 = torch.constant.int 6
    %1073 = torch.prims.convert_element_type %1072, %int6_1284 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_1285 = torch.constant.int 6
    %1074 = torch.prims.convert_element_type %1069, %int6_1285 : !torch.vtensor<[4096,12288],f16>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int6_1286 = torch.constant.int 6
    %1075 = torch.prims.convert_element_type %1071, %int6_1286 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %1076 = torch.aten.mm %1074, %1075 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Both mul.Scalar ops multiply by the integer constant 1.
    %int1_1287 = torch.constant.int 1
    %1077 = torch.aten.mul.Scalar %1076, %int1_1287 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_1288 = torch.constant.int 1
    %1078 = torch.aten.mul.Scalar %1073, %int1_1288 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_1289 = torch.constant.int 1
    %1079 = torch.aten.add.Tensor %1077, %1078, %int1_1289 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_1290 = torch.constant.int 5
    %1080 = torch.prims.convert_element_type %1079, %int5_1290 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_1291 = torch.constant.int 1
    %int4096_1292 = torch.constant.int 4096
    %int3072_1293 = torch.constant.int 3072
    %1081 = torch.prim.ListConstruct %int1_1291, %int4096_1292, %int3072_1293 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1082 = torch.aten.view %1080, %1081 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Multiply by %854 (broadcast from [1,1,3072]) and add to %1041, all in f16.
    %1083 = torch.aten.mul.Tensor %854, %1082 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_1294 = torch.constant.int 1
    %1084 = torch.aten.add.Tensor %1041, %1083, %int1_1294 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer using the double_blocks.2.txt_attn.proj weight and bias, applied
    // to %1022 viewed as [512,3072], followed by a scaled residual add:
    //   %1101 = %833 + %872 * (linear output viewed as [1,512,3072])
    // %1022, %833 and %872 are defined before this point; presumably the text-stream
    // attention output, the residual input and a modulation gate -- TODO confirm.
    %int512_1295 = torch.constant.int 512
    %int3072_1296 = torch.constant.int 3072
    %1085 = torch.prim.ListConstruct %int512_1295, %int3072_1296 : (!torch.int, !torch.int) -> !torch.list<int>
    %1086 = torch.aten.view %1022, %1085 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.2.txt_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.2.txt_attn.proj.weight : tensor<3072x3072xf16>
    %1087 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.txt_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    // Swap dims 0 and 1 of the weight before the matmul.
    %int0_1297 = torch.constant.int 0
    %int1_1298 = torch.constant.int 1
    %1088 = torch.aten.transpose.int %1087, %int0_1297, %int1_1298 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.2.txt_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.2.txt_attn.proj.bias : tensor<3072xf16>
    %1089 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.txt_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Bias, input and weight are converted to f32; the matmul and bias add run in f32.
    %int6_1299 = torch.constant.int 6
    %1090 = torch.prims.convert_element_type %1089, %int6_1299 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_1300 = torch.constant.int 6
    %1091 = torch.prims.convert_element_type %1086, %int6_1300 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_1301 = torch.constant.int 6
    %1092 = torch.prims.convert_element_type %1088, %int6_1301 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %1093 = torch.aten.mm %1091, %1092 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Both mul.Scalar ops multiply by the integer constant 1.
    %int1_1302 = torch.constant.int 1
    %1094 = torch.aten.mul.Scalar %1093, %int1_1302 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_1303 = torch.constant.int 1
    %1095 = torch.aten.mul.Scalar %1090, %int1_1303 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_1304 = torch.constant.int 1
    %1096 = torch.aten.add.Tensor %1094, %1095, %int1_1304 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_1305 = torch.constant.int 5
    %1097 = torch.prims.convert_element_type %1096, %int5_1305 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_1306 = torch.constant.int 1
    %int512_1307 = torch.constant.int 512
    %int3072_1308 = torch.constant.int 3072
    %1098 = torch.prim.ListConstruct %int1_1306, %int512_1307, %int3072_1308 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1099 = torch.aten.view %1097, %1098 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Multiply by %872 (broadcast from [1,1,3072]) and add to %833, all in f16.
    %1100 = torch.aten.mul.Tensor %872, %1099 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_1309 = torch.constant.int 1
    %1101 = torch.aten.add.Tensor %833, %1100, %int1_1309 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Normalizes %1101 over dim 2 and applies an elementwise scale and shift:
    //   %1111 = (%874 + 1) * normalize(%1101) + %873
    // The result is then viewed as [512,3072] (%1113).
    // %873 and %874 are defined before this point; presumably modulation shift and
    // scale vectors -- TODO confirm.
    %int1_1310 = torch.constant.int 1
    %int1_1311 = torch.constant.int 1
    %1102 = torch.aten.add.Scalar %874, %int1_1310, %int1_1311 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Variance and mean are computed on an f32 copy of %1101, over dim 2, with
    // correction 0 and keepdim = true (outputs are [1,512,1]).
    %int6_1312 = torch.constant.int 6
    %1103 = torch.prims.convert_element_type %1101, %int6_1312 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %int2_1313 = torch.constant.int 2
    %1104 = torch.prim.ListConstruct %int2_1313 : (!torch.int) -> !torch.list<int>
    %int0_1314 = torch.constant.int 0
    %true_1315 = torch.constant.bool true
    %result0_1316, %result1_1317 = torch.aten.var_mean.correction %1103, %1104, %int0_1314, %true_1315 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(variance + ~1e-6)
    %float9.999990e-07_1318 = torch.constant.float 9.9999999999999995E-7
    %int1_1319 = torch.constant.int 1
    %1105 = torch.aten.add.Scalar %result0_1316, %float9.999990e-07_1318, %int1_1319 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %1106 = torch.aten.rsqrt %1105 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // The mean is subtracted from the f16 tensor %1101 (not from the f32 copy
    // %1103); the result type of the subtraction is f32.
    %int1_1320 = torch.constant.int 1
    %1107 = torch.aten.sub.Tensor %1101, %result1_1317, %int1_1320 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %1108 = torch.aten.mul.Tensor %1107, %1106 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    // Back to f16 before the scale and shift are applied.
    %int5_1321 = torch.constant.int 5
    %1109 = torch.prims.convert_element_type %1108, %int5_1321 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %1110 = torch.aten.mul.Tensor %1102, %1109 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_1322 = torch.constant.int 1
    %1111 = torch.aten.add.Tensor %1110, %873, %int1_1322 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %int512_1323 = torch.constant.int 512
    %int3072_1324 = torch.constant.int 3072
    %1112 = torch.prim.ListConstruct %int512_1323, %int3072_1324 : (!torch.int, !torch.int) -> !torch.list<int>
    %1113 = torch.aten.view %1111, %1112 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Linear layer using the double_blocks.2.txt_mlp.0 weight and bias, followed by
    // a tanh-approximated GELU:
    //   %1124 = f16( f32(%1113) @ f32(weight^T) + f32(bias) )   : [512,12288]
    //   %1127 = gelu(%1124 viewed as [1,512,12288], "tanh")
    // The activation is then viewed back as [512,12288] (%1129).
    %__auto.sampler.double_blocks.2.txt_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.2.txt_mlp.0.weight : tensor<12288x3072xf16>
    %1114 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.txt_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    // Transpose the [12288,3072] weight to [3072,12288] for the matmul.
    %int0_1325 = torch.constant.int 0
    %int1_1326 = torch.constant.int 1
    %1115 = torch.aten.transpose.int %1114, %int0_1325, %int1_1326 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.2.txt_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.2.txt_mlp.0.bias : tensor<12288xf16>
    %1116 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.txt_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Bias, input and weight are converted to f32; the matmul and bias add run in f32.
    %int6_1327 = torch.constant.int 6
    %1117 = torch.prims.convert_element_type %1116, %int6_1327 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_1328 = torch.constant.int 6
    %1118 = torch.prims.convert_element_type %1113, %int6_1328 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_1329 = torch.constant.int 6
    %1119 = torch.prims.convert_element_type %1115, %int6_1329 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %1120 = torch.aten.mm %1118, %1119 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[512,12288],f32>
    // Both mul.Scalar ops multiply by the integer constant 1.
    %int1_1330 = torch.constant.int 1
    %1121 = torch.aten.mul.Scalar %1120, %int1_1330 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int1_1331 = torch.constant.int 1
    %1122 = torch.aten.mul.Scalar %1117, %int1_1331 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_1332 = torch.constant.int 1
    %1123 = torch.aten.add.Tensor %1121, %1122, %int1_1332 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int5_1333 = torch.constant.int 5
    %1124 = torch.prims.convert_element_type %1123, %int5_1333 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f16>
    %int1_1334 = torch.constant.int 1
    %int512_1335 = torch.constant.int 512
    %int12288_1336 = torch.constant.int 12288
    %1125 = torch.prim.ListConstruct %int1_1334, %int512_1335, %int12288_1336 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1126 = torch.aten.view %1124, %1125 : !torch.vtensor<[512,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,512,12288],f16>
    // GELU with the "tanh" approximation, evaluated on the f16 tensor.
    %str_1337 = torch.constant.str "tanh"
    %1127 = torch.aten.gelu %1126, %str_1337 : !torch.vtensor<[1,512,12288],f16>, !torch.str -> !torch.vtensor<[1,512,12288],f16>
    %int512_1338 = torch.constant.int 512
    %int12288_1339 = torch.constant.int 12288
    %1128 = torch.prim.ListConstruct %int512_1338, %int12288_1339 : (!torch.int, !torch.int) -> !torch.list<int>
    %1129 = torch.aten.view %1127, %1128 : !torch.vtensor<[1,512,12288],f16>, !torch.list<int> -> !torch.vtensor<[512,12288],f16>
    // Linear layer using the double_blocks.2.txt_mlp.2 weight and bias, mapping
    // %1129 from [512,12288] to [512,3072], followed by a scaled residual add:
    //   %1144 = %1101 + %875 * (linear output viewed as [1,512,3072])
    // %875 is defined before this point; presumably a modulation gate -- TODO confirm.
    %__auto.sampler.double_blocks.2.txt_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.2.txt_mlp.2.weight : tensor<3072x12288xf16>
    %1130 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.txt_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    // Transpose the [3072,12288] weight to [12288,3072] for the matmul.
    %int0_1340 = torch.constant.int 0
    %int1_1341 = torch.constant.int 1
    %1131 = torch.aten.transpose.int %1130, %int0_1340, %int1_1341 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.2.txt_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.2.txt_mlp.2.bias : tensor<3072xf16>
    %1132 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.2.txt_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Bias, input and weight are converted to f32; the matmul and bias add run in f32.
    %int6_1342 = torch.constant.int 6
    %1133 = torch.prims.convert_element_type %1132, %int6_1342 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_1343 = torch.constant.int 6
    %1134 = torch.prims.convert_element_type %1129, %int6_1343 : !torch.vtensor<[512,12288],f16>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int6_1344 = torch.constant.int 6
    %1135 = torch.prims.convert_element_type %1131, %int6_1344 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %1136 = torch.aten.mm %1134, %1135 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Both mul.Scalar ops multiply by the integer constant 1.
    %int1_1345 = torch.constant.int 1
    %1137 = torch.aten.mul.Scalar %1136, %int1_1345 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_1346 = torch.constant.int 1
    %1138 = torch.aten.mul.Scalar %1133, %int1_1346 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_1347 = torch.constant.int 1
    %1139 = torch.aten.add.Tensor %1137, %1138, %int1_1347 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_1348 = torch.constant.int 5
    %1140 = torch.prims.convert_element_type %1139, %int5_1348 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_1349 = torch.constant.int 1
    %int512_1350 = torch.constant.int 512
    %int3072_1351 = torch.constant.int 3072
    %1141 = torch.prim.ListConstruct %int1_1349, %int512_1350, %int3072_1351 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1142 = torch.aten.view %1140, %1141 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Multiply by %875 (broadcast from [1,1,3072]) and add to %1101, all in f16.
    %1143 = torch.aten.mul.Tensor %875, %1142 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_1352 = torch.constant.int 1
    %1144 = torch.aten.add.Tensor %1101, %1143, %int1_1352 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Applies SiLU to %119 ([1,3072]), runs the result through a linear layer using
    // the double_blocks.3.img_mod.lin weight and bias (output [1,18432]), reshapes it
    // to [1,1,18432], and splits the last dimension into six contiguous 3072-wide
    // slices %1160 .. %1165.
    // %119 is defined before this point; presumably the conditioning vector -- TODO
    // confirm. Later in this function %1160 is used as an additive shift and
    // %1161 + 1 as a multiplicative scale; the consumers of %1162 .. %1165 are not
    // visible from here.
    %1145 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.double_blocks.3.img_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.3.img_mod.lin.weight : tensor<18432x3072xf16>
    %1146 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.img_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    // Transpose the [18432,3072] weight to [3072,18432] for the matmul.
    %int0_1353 = torch.constant.int 0
    %int1_1354 = torch.constant.int 1
    %1147 = torch.aten.transpose.int %1146, %int0_1353, %int1_1354 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.3.img_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.3.img_mod.lin.bias : tensor<18432xf16>
    %1148 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.img_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Bias, input and weight are converted to f32; the matmul and bias add run in f32.
    %int6_1355 = torch.constant.int 6
    %1149 = torch.prims.convert_element_type %1148, %int6_1355 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_1356 = torch.constant.int 6
    %1150 = torch.prims.convert_element_type %1145, %int6_1356 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_1357 = torch.constant.int 6
    %1151 = torch.prims.convert_element_type %1147, %int6_1357 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %1152 = torch.aten.mm %1150, %1151 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Both mul.Scalar ops multiply by the integer constant 1.
    %int1_1358 = torch.constant.int 1
    %1153 = torch.aten.mul.Scalar %1152, %int1_1358 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_1359 = torch.constant.int 1
    %1154 = torch.aten.mul.Scalar %1149, %int1_1359 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_1360 = torch.constant.int 1
    %1155 = torch.aten.add.Tensor %1153, %1154, %int1_1360 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_1361 = torch.constant.int 5
    %1156 = torch.prims.convert_element_type %1155, %int5_1361 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Slice over dim 0 from 0 to INT64_MAX with step 1 (shape stays [1,18432]), then
    // insert a unit dimension at position 1 and take the same full-range slice on dim 2.
    %int0_1362 = torch.constant.int 0
    %int0_1363 = torch.constant.int 0
    %int9223372036854775807_1364 = torch.constant.int 9223372036854775807
    %int1_1365 = torch.constant.int 1
    %1157 = torch.aten.slice.Tensor %1156, %int0_1362, %int0_1363, %int9223372036854775807_1364, %int1_1365 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    %int1_1366 = torch.constant.int 1
    %1158 = torch.aten.unsqueeze %1157, %int1_1366 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    %int2_1367 = torch.constant.int 2
    %int0_1368 = torch.constant.int 0
    %int9223372036854775807_1369 = torch.constant.int 9223372036854775807
    %int1_1370 = torch.constant.int 1
    %1159 = torch.aten.slice.Tensor %1158, %int2_1367, %int0_1368, %int9223372036854775807_1369, %int1_1370 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // %1160 = %1159[..., 0:3072]
    %int-1_1371 = torch.constant.int -1
    %int0_1372 = torch.constant.int 0
    %int3072_1373 = torch.constant.int 3072
    %int1_1374 = torch.constant.int 1
    %1160 = torch.aten.slice.Tensor %1159, %int-1_1371, %int0_1372, %int3072_1373, %int1_1374 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %1161 = %1159[..., 3072:6144]
    %int-1_1375 = torch.constant.int -1
    %int3072_1376 = torch.constant.int 3072
    %int6144_1377 = torch.constant.int 6144
    %int1_1378 = torch.constant.int 1
    %1161 = torch.aten.slice.Tensor %1159, %int-1_1375, %int3072_1376, %int6144_1377, %int1_1378 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %1162 = %1159[..., 6144:9216]
    %int-1_1379 = torch.constant.int -1
    %int6144_1380 = torch.constant.int 6144
    %int9216_1381 = torch.constant.int 9216
    %int1_1382 = torch.constant.int 1
    %1162 = torch.aten.slice.Tensor %1159, %int-1_1379, %int6144_1380, %int9216_1381, %int1_1382 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %1163 = %1159[..., 9216:12288]
    %int-1_1383 = torch.constant.int -1
    %int9216_1384 = torch.constant.int 9216
    %int12288_1385 = torch.constant.int 12288
    %int1_1386 = torch.constant.int 1
    %1163 = torch.aten.slice.Tensor %1159, %int-1_1383, %int9216_1384, %int12288_1385, %int1_1386 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %1164 = %1159[..., 12288:15360]
    %int-1_1387 = torch.constant.int -1
    %int12288_1388 = torch.constant.int 12288
    %int15360_1389 = torch.constant.int 15360
    %int1_1390 = torch.constant.int 1
    %1164 = torch.aten.slice.Tensor %1159, %int-1_1387, %int12288_1388, %int15360_1389, %int1_1390 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %1165 = %1159[..., 15360:18432]
    %int-1_1391 = torch.constant.int -1
    %int15360_1392 = torch.constant.int 15360
    %int18432_1393 = torch.constant.int 18432
    %int1_1394 = torch.constant.int 1
    %1165 = torch.aten.slice.Tensor %1159, %int-1_1391, %int15360_1392, %int18432_1393, %int1_1394 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Applies SiLU to %119 ([1,3072]), runs the result through a linear layer using
    // the double_blocks.3.txt_mod.lin weight and bias (output [1,18432]), reshapes it
    // to [1,1,18432], and splits the last dimension into six contiguous 3072-wide
    // slices %1181 .. %1186.
    // %119 is defined before this point; presumably the conditioning vector -- TODO
    // confirm. The consumers of %1181 .. %1186 are not visible from here.
    // NOTE(review): this silu has the same operand as the earlier
    // `%1145 = torch.aten.silu %119`; the value is recomputed rather than reused.
    %1166 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.double_blocks.3.txt_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.3.txt_mod.lin.weight : tensor<18432x3072xf16>
    %1167 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.txt_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    // Transpose the [18432,3072] weight to [3072,18432] for the matmul.
    %int0_1395 = torch.constant.int 0
    %int1_1396 = torch.constant.int 1
    %1168 = torch.aten.transpose.int %1167, %int0_1395, %int1_1396 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.3.txt_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.3.txt_mod.lin.bias : tensor<18432xf16>
    %1169 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.txt_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Bias, input and weight are converted to f32; the matmul and bias add run in f32.
    %int6_1397 = torch.constant.int 6
    %1170 = torch.prims.convert_element_type %1169, %int6_1397 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_1398 = torch.constant.int 6
    %1171 = torch.prims.convert_element_type %1166, %int6_1398 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_1399 = torch.constant.int 6
    %1172 = torch.prims.convert_element_type %1168, %int6_1399 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %1173 = torch.aten.mm %1171, %1172 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Both mul.Scalar ops multiply by the integer constant 1.
    %int1_1400 = torch.constant.int 1
    %1174 = torch.aten.mul.Scalar %1173, %int1_1400 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_1401 = torch.constant.int 1
    %1175 = torch.aten.mul.Scalar %1170, %int1_1401 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_1402 = torch.constant.int 1
    %1176 = torch.aten.add.Tensor %1174, %1175, %int1_1402 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_1403 = torch.constant.int 5
    %1177 = torch.prims.convert_element_type %1176, %int5_1403 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Slice over dim 0 from 0 to INT64_MAX with step 1 (shape stays [1,18432]), then
    // insert a unit dimension at position 1 and take the same full-range slice on dim 2.
    %int0_1404 = torch.constant.int 0
    %int0_1405 = torch.constant.int 0
    %int9223372036854775807_1406 = torch.constant.int 9223372036854775807
    %int1_1407 = torch.constant.int 1
    %1178 = torch.aten.slice.Tensor %1177, %int0_1404, %int0_1405, %int9223372036854775807_1406, %int1_1407 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    %int1_1408 = torch.constant.int 1
    %1179 = torch.aten.unsqueeze %1178, %int1_1408 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    %int2_1409 = torch.constant.int 2
    %int0_1410 = torch.constant.int 0
    %int9223372036854775807_1411 = torch.constant.int 9223372036854775807
    %int1_1412 = torch.constant.int 1
    %1180 = torch.aten.slice.Tensor %1179, %int2_1409, %int0_1410, %int9223372036854775807_1411, %int1_1412 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // %1181 = %1180[..., 0:3072]
    %int-1_1413 = torch.constant.int -1
    %int0_1414 = torch.constant.int 0
    %int3072_1415 = torch.constant.int 3072
    %int1_1416 = torch.constant.int 1
    %1181 = torch.aten.slice.Tensor %1180, %int-1_1413, %int0_1414, %int3072_1415, %int1_1416 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %1182 = %1180[..., 3072:6144]
    %int-1_1417 = torch.constant.int -1
    %int3072_1418 = torch.constant.int 3072
    %int6144_1419 = torch.constant.int 6144
    %int1_1420 = torch.constant.int 1
    %1182 = torch.aten.slice.Tensor %1180, %int-1_1417, %int3072_1418, %int6144_1419, %int1_1420 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %1183 = %1180[..., 6144:9216]
    %int-1_1421 = torch.constant.int -1
    %int6144_1422 = torch.constant.int 6144
    %int9216_1423 = torch.constant.int 9216
    %int1_1424 = torch.constant.int 1
    %1183 = torch.aten.slice.Tensor %1180, %int-1_1421, %int6144_1422, %int9216_1423, %int1_1424 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %1184 = %1180[..., 9216:12288]
    %int-1_1425 = torch.constant.int -1
    %int9216_1426 = torch.constant.int 9216
    %int12288_1427 = torch.constant.int 12288
    %int1_1428 = torch.constant.int 1
    %1184 = torch.aten.slice.Tensor %1180, %int-1_1425, %int9216_1426, %int12288_1427, %int1_1428 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %1185 = %1180[..., 12288:15360]
    %int-1_1429 = torch.constant.int -1
    %int12288_1430 = torch.constant.int 12288
    %int15360_1431 = torch.constant.int 15360
    %int1_1432 = torch.constant.int 1
    %1185 = torch.aten.slice.Tensor %1180, %int-1_1429, %int12288_1430, %int15360_1431, %int1_1432 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %1186 = %1180[..., 15360:18432]
    %int-1_1433 = torch.constant.int -1
    %int15360_1434 = torch.constant.int 15360
    %int18432_1435 = torch.constant.int 18432
    %int1_1436 = torch.constant.int 1
    %1186 = torch.aten.slice.Tensor %1180, %int-1_1433, %int15360_1434, %int18432_1435, %int1_1436 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %1084 ([1,4096,3072] f16) over its last dimension in f32 and
    // then apply a per-channel scale and shift:
    //   %1196 = (1 + %1161) * normalize(%1084) + %1160
    // %1161 and %1160 are [1,1,3072] f16 values defined earlier in the function.
    // Upcast to f32 (dtype code 6 yields an f32 result here).
    %int6_1437 = torch.constant.int 6
    %1187 = torch.prims.convert_element_type %1084, %int6_1437 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // Variance (result0) and mean (result1) over dim 2, correction = 0,
    // keepdim = true, giving [1,4096,1] statistics.
    %int2_1438 = torch.constant.int 2
    %1188 = torch.prim.ListConstruct %int2_1438 : (!torch.int) -> !torch.list<int>
    %int0_1439 = torch.constant.int 0
    %true_1440 = torch.constant.bool true
    %result0_1441, %result1_1442 = torch.aten.var_mean.correction %1187, %1188, %int0_1439, %true_1440 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(variance + ~1e-6).
    %float9.999990e-07_1443 = torch.constant.float 9.9999999999999995E-7
    %int1_1444 = torch.constant.int 1
    %1189 = torch.aten.add.Scalar %result0_1441, %float9.999990e-07_1443, %int1_1444 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %1190 = torch.aten.rsqrt %1189 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // Subtract the mean from the original f16 input; the result type is f32.
    %int1_1445 = torch.constant.int 1
    %1191 = torch.aten.sub.Tensor %1084, %result1_1442, %int1_1445 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %1192 = torch.aten.mul.Tensor %1191, %1190 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Back to f16 (dtype code 5 yields an f16 result here).
    %int5_1446 = torch.constant.int 5
    %1193 = torch.prims.convert_element_type %1192, %int5_1446 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Scale by (1 + %1161), broadcast over the 4096 rows.
    %int1_1447 = torch.constant.int 1
    %int1_1448 = torch.constant.int 1
    %1194 = torch.aten.add.Scalar %1161, %int1_1447, %int1_1448 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %1195 = torch.aten.mul.Tensor %1194, %1193 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    // Shift by %1160.
    %int1_1449 = torch.constant.int 1
    %1196 = torch.aten.add.Tensor %1195, %1160, %int1_1449 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer with the double_blocks.3.img_attn.qkv parameters:
    //   %1211 = f16( f32(%1196) @ f32(weight^T) + f32(bias) ), shape [1,4096,9216].
    // Drop the leading unit dimension so the input is a 2-D matrix for mm.
    %int4096_1450 = torch.constant.int 4096
    %int3072_1451 = torch.constant.int 3072
    %1197 = torch.prim.ListConstruct %int4096_1450, %int3072_1451 : (!torch.int, !torch.int) -> !torch.list<int>
    %1198 = torch.aten.view %1196, %1197 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Load the [9216,3072] weight and transpose it to [3072,9216].
    %__auto.sampler.double_blocks.3.img_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.3.img_attn.qkv.weight : tensor<9216x3072xf16>
    %1199 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.img_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_1452 = torch.constant.int 0
    %int1_1453 = torch.constant.int 1
    %1200 = torch.aten.transpose.int %1199, %int0_1452, %int1_1453 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    // Load the [9216] bias.
    %__auto.sampler.double_blocks.3.img_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.3.img_attn.qkv.bias : tensor<9216xf16>
    %1201 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.img_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, input and weight to f32 so the matmul and bias add run in f32.
    %int6_1454 = torch.constant.int 6
    %1202 = torch.prims.convert_element_type %1201, %int6_1454 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_1455 = torch.constant.int 6
    %1203 = torch.prims.convert_element_type %1198, %int6_1455 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_1456 = torch.constant.int 6
    %1204 = torch.prims.convert_element_type %1200, %int6_1456 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %1205 = torch.aten.mm %1203, %1204 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[4096,9216],f32>
    // Both multiplications below use the integer 1, so values are unchanged.
    // NOTE(review): presumably the alpha/beta factors of a decomposed addmm - confirm.
    %int1_1457 = torch.constant.int 1
    %1206 = torch.aten.mul.Scalar %1205, %int1_1457 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int1_1458 = torch.constant.int 1
    %1207 = torch.aten.mul.Scalar %1202, %int1_1458 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    // Add the bias (broadcast over rows) and cast the result back to f16.
    %int1_1459 = torch.constant.int 1
    %1208 = torch.aten.add.Tensor %1206, %1207, %int1_1459 : !torch.vtensor<[4096,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int5_1460 = torch.constant.int 5
    %1209 = torch.prims.convert_element_type %1208, %int5_1460 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f16>
    // Restore the leading unit dimension: [4096,9216] -> [1,4096,9216].
    %int1_1461 = torch.constant.int 1
    %int4096_1462 = torch.constant.int 4096
    %int9216_1463 = torch.constant.int 9216
    %1210 = torch.prim.ListConstruct %int1_1461, %int4096_1462, %int9216_1463 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1211 = torch.aten.view %1209, %1210 : !torch.vtensor<[4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,9216],f16>
    // Split the fused [1,4096,9216] projection into three [1,24,4096,128] tensors.
    // First view 9216 as 3 x 24 x 128.
    %int1_1464 = torch.constant.int 1
    %int4096_1465 = torch.constant.int 4096
    %int3_1466 = torch.constant.int 3
    %int24_1467 = torch.constant.int 24
    %int128_1468 = torch.constant.int 128
    %1212 = torch.prim.ListConstruct %int1_1464, %int4096_1465, %int3_1466, %int24_1467, %int128_1468 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1213 = torch.aten.view %1211, %1212 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    // Permute with order (2,0,3,1,4): [1,4096,3,24,128] -> [3,1,24,4096,128],
    // which moves the size-3 axis to the front.
    %int2_1469 = torch.constant.int 2
    %int0_1470 = torch.constant.int 0
    %int3_1471 = torch.constant.int 3
    %int1_1472 = torch.constant.int 1
    %int4_1473 = torch.constant.int 4
    %1214 = torch.prim.ListConstruct %int2_1469, %int0_1470, %int3_1471, %int1_1472, %int4_1473 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1215 = torch.aten.permute %1213, %1214 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // Index 0 along dim 0. This tensor is later normalized with query_norm.scale.
    %int0_1474 = torch.constant.int 0
    %int0_1475 = torch.constant.int 0
    %1216 = torch.aten.select.int %1215, %int0_1474, %int0_1475 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Index 1 along dim 0. This tensor is later normalized with key_norm.scale.
    %int0_1476 = torch.constant.int 0
    %int1_1477 = torch.constant.int 1
    %1217 = torch.aten.select.int %1215, %int0_1476, %int1_1477 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Index 2 along dim 0. This tensor later feeds the third (value) operand of
    // the attention op.
    %int0_1478 = torch.constant.int 0
    %int2_1479 = torch.constant.int 2
    %1218 = torch.aten.select.int %1215, %int0_1478, %int2_1479 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // RMS-normalize %1216 over its last (128-wide) dimension in f32, then
    // multiply by the learned query_norm.scale vector:
    //   %1228 = f16( x * rsqrt(mean(x^2, dim=-1) + ~1e-6) ) * scale
    %int6_1480 = torch.constant.int 6
    %1219 = torch.prims.convert_element_type %1216, %int6_1480 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_1481 = torch.constant.int 2
    %1220 = torch.aten.pow.Tensor_Scalar %1219, %int2_1481 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    // Mean of squares over dim -1 with keepdim = true; dtype argument is none.
    %int-1_1482 = torch.constant.int -1
    %1221 = torch.prim.ListConstruct %int-1_1482 : (!torch.int) -> !torch.list<int>
    %true_1483 = torch.constant.bool true
    %none_1484 = torch.constant.none
    %1222 = torch.aten.mean.dim %1220, %1221, %true_1483, %none_1484 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_1485 = torch.constant.float 9.9999999999999995E-7
    %int1_1486 = torch.constant.int 1
    %1223 = torch.aten.add.Scalar %1222, %float9.999990e-07_1485, %int1_1486 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %1224 = torch.aten.rsqrt %1223 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %1225 = torch.aten.mul.Tensor %1219, %1224 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Cast back to f16 before applying the f16 scale.
    %int5_1487 = torch.constant.int 5
    %1226 = torch.prims.convert_element_type %1225, %int5_1487 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.3.img_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.3.img_attn.norm.query_norm.scale : tensor<128xf16>
    %1227 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.img_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %1228 = torch.aten.mul.Tensor %1226, %1227 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // Same RMS normalization for %1217, scaled by key_norm.scale.
    %int6_1488 = torch.constant.int 6
    %1229 = torch.prims.convert_element_type %1217, %int6_1488 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_1489 = torch.constant.int 2
    %1230 = torch.aten.pow.Tensor_Scalar %1229, %int2_1489 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_1490 = torch.constant.int -1
    %1231 = torch.prim.ListConstruct %int-1_1490 : (!torch.int) -> !torch.list<int>
    %true_1491 = torch.constant.bool true
    %none_1492 = torch.constant.none
    %1232 = torch.aten.mean.dim %1230, %1231, %true_1491, %none_1492 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_1493 = torch.constant.float 9.9999999999999995E-7
    %int1_1494 = torch.constant.int 1
    %1233 = torch.aten.add.Scalar %1232, %float9.999990e-07_1493, %int1_1494 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %1234 = torch.aten.rsqrt %1233 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %1235 = torch.aten.mul.Tensor %1229, %1234 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    %int5_1495 = torch.constant.int 5
    %1236 = torch.prims.convert_element_type %1235, %int5_1495 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.3.img_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.3.img_attn.norm.key_norm.scale : tensor<128xf16>
    %1237 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.img_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %1238 = torch.aten.mul.Tensor %1236, %1237 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // The two conversions below are f16 -> f16, so the element type is unchanged.
    // NOTE(review): presumably a cast to the value tensor's dtype in the traced
    // model - confirm.
    %int5_1496 = torch.constant.int 5
    %1239 = torch.prims.convert_element_type %1228, %int5_1496 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_1497 = torch.constant.int 5
    %1240 = torch.prims.convert_element_type %1238, %int5_1497 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Normalize %1144 ([1,512,3072] f16) over its last dimension in f32 and
    // then apply a per-channel scale and shift:
    //   %1250 = (1 + %1182) * normalize(%1144) + %1181
    // %1182 and %1181 are [1,1,3072] f16 values defined earlier in the function.
    %int6_1498 = torch.constant.int 6
    %1241 = torch.prims.convert_element_type %1144, %int6_1498 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // Variance (result0) and mean (result1) over dim 2, correction = 0,
    // keepdim = true, giving [1,512,1] statistics.
    %int2_1499 = torch.constant.int 2
    %1242 = torch.prim.ListConstruct %int2_1499 : (!torch.int) -> !torch.list<int>
    %int0_1500 = torch.constant.int 0
    %true_1501 = torch.constant.bool true
    %result0_1502, %result1_1503 = torch.aten.var_mean.correction %1241, %1242, %int0_1500, %true_1501 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(variance + ~1e-6).
    %float9.999990e-07_1504 = torch.constant.float 9.9999999999999995E-7
    %int1_1505 = torch.constant.int 1
    %1243 = torch.aten.add.Scalar %result0_1502, %float9.999990e-07_1504, %int1_1505 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %1244 = torch.aten.rsqrt %1243 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // Subtract the mean from the original f16 input; the result type is f32.
    %int1_1506 = torch.constant.int 1
    %1245 = torch.aten.sub.Tensor %1144, %result1_1503, %int1_1506 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %1246 = torch.aten.mul.Tensor %1245, %1244 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_1507 = torch.constant.int 5
    %1247 = torch.prims.convert_element_type %1246, %int5_1507 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Scale by (1 + %1182), broadcast over the 512 rows.
    %int1_1508 = torch.constant.int 1
    %int1_1509 = torch.constant.int 1
    %1248 = torch.aten.add.Scalar %1182, %int1_1508, %int1_1509 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %1249 = torch.aten.mul.Tensor %1248, %1247 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    // Shift by %1181.
    %int1_1510 = torch.constant.int 1
    %1250 = torch.aten.add.Tensor %1249, %1181, %int1_1510 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Linear layer with the double_blocks.3.txt_attn.qkv parameters:
    //   %1265 = f16( f32(%1250) @ f32(weight^T) + f32(bias) ), shape [1,512,9216].
    // Drop the leading unit dimension so the input is a 2-D matrix for mm.
    %int512_1511 = torch.constant.int 512
    %int3072_1512 = torch.constant.int 3072
    %1251 = torch.prim.ListConstruct %int512_1511, %int3072_1512 : (!torch.int, !torch.int) -> !torch.list<int>
    %1252 = torch.aten.view %1250, %1251 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Load the [9216,3072] weight and transpose it to [3072,9216].
    %__auto.sampler.double_blocks.3.txt_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.3.txt_attn.qkv.weight : tensor<9216x3072xf16>
    %1253 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.txt_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_1513 = torch.constant.int 0
    %int1_1514 = torch.constant.int 1
    %1254 = torch.aten.transpose.int %1253, %int0_1513, %int1_1514 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    // Load the [9216] bias.
    %__auto.sampler.double_blocks.3.txt_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.3.txt_attn.qkv.bias : tensor<9216xf16>
    %1255 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.txt_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, input and weight to f32 so the matmul and bias add run in f32.
    %int6_1515 = torch.constant.int 6
    %1256 = torch.prims.convert_element_type %1255, %int6_1515 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_1516 = torch.constant.int 6
    %1257 = torch.prims.convert_element_type %1252, %int6_1516 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_1517 = torch.constant.int 6
    %1258 = torch.prims.convert_element_type %1254, %int6_1517 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %1259 = torch.aten.mm %1257, %1258 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[512,9216],f32>
    // Both multiplications below use the integer 1, so values are unchanged.
    // NOTE(review): presumably the alpha/beta factors of a decomposed addmm - confirm.
    %int1_1518 = torch.constant.int 1
    %1260 = torch.aten.mul.Scalar %1259, %int1_1518 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int1_1519 = torch.constant.int 1
    %1261 = torch.aten.mul.Scalar %1256, %int1_1519 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    // Add the bias (broadcast over rows) and cast the result back to f16.
    %int1_1520 = torch.constant.int 1
    %1262 = torch.aten.add.Tensor %1260, %1261, %int1_1520 : !torch.vtensor<[512,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int5_1521 = torch.constant.int 5
    %1263 = torch.prims.convert_element_type %1262, %int5_1521 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f16>
    // Restore the leading unit dimension: [512,9216] -> [1,512,9216].
    %int1_1522 = torch.constant.int 1
    %int512_1523 = torch.constant.int 512
    %int9216_1524 = torch.constant.int 9216
    %1264 = torch.prim.ListConstruct %int1_1522, %int512_1523, %int9216_1524 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1265 = torch.aten.view %1263, %1264 : !torch.vtensor<[512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,9216],f16>
    // Split the fused [1,512,9216] projection into three [1,24,512,128] tensors.
    // First view 9216 as 3 x 24 x 128.
    %int1_1525 = torch.constant.int 1
    %int512_1526 = torch.constant.int 512
    %int3_1527 = torch.constant.int 3
    %int24_1528 = torch.constant.int 24
    %int128_1529 = torch.constant.int 128
    %1266 = torch.prim.ListConstruct %int1_1525, %int512_1526, %int3_1527, %int24_1528, %int128_1529 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1267 = torch.aten.view %1265, %1266 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    // Permute with order (2,0,3,1,4): [1,512,3,24,128] -> [3,1,24,512,128],
    // which moves the size-3 axis to the front.
    %int2_1530 = torch.constant.int 2
    %int0_1531 = torch.constant.int 0
    %int3_1532 = torch.constant.int 3
    %int1_1533 = torch.constant.int 1
    %int4_1534 = torch.constant.int 4
    %1268 = torch.prim.ListConstruct %int2_1530, %int0_1531, %int3_1532, %int1_1533, %int4_1534 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1269 = torch.aten.permute %1267, %1268 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // Index 0 along dim 0. This tensor is later normalized with query_norm.scale.
    %int0_1535 = torch.constant.int 0
    %int0_1536 = torch.constant.int 0
    %1270 = torch.aten.select.int %1269, %int0_1535, %int0_1536 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Index 1 along dim 0. This tensor is later normalized with key_norm.scale.
    %int0_1537 = torch.constant.int 0
    %int1_1538 = torch.constant.int 1
    %1271 = torch.aten.select.int %1269, %int0_1537, %int1_1538 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Index 2 along dim 0. This tensor later feeds the third (value) operand of
    // the attention op.
    %int0_1539 = torch.constant.int 0
    %int2_1540 = torch.constant.int 2
    %1272 = torch.aten.select.int %1269, %int0_1539, %int2_1540 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // RMS-normalize %1270 over its last (128-wide) dimension in f32, then
    // multiply by the learned query_norm.scale vector:
    //   %1282 = f16( x * rsqrt(mean(x^2, dim=-1) + ~1e-6) ) * scale
    %int6_1541 = torch.constant.int 6
    %1273 = torch.prims.convert_element_type %1270, %int6_1541 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_1542 = torch.constant.int 2
    %1274 = torch.aten.pow.Tensor_Scalar %1273, %int2_1542 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    // Mean of squares over dim -1 with keepdim = true; dtype argument is none.
    %int-1_1543 = torch.constant.int -1
    %1275 = torch.prim.ListConstruct %int-1_1543 : (!torch.int) -> !torch.list<int>
    %true_1544 = torch.constant.bool true
    %none_1545 = torch.constant.none
    %1276 = torch.aten.mean.dim %1274, %1275, %true_1544, %none_1545 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_1546 = torch.constant.float 9.9999999999999995E-7
    %int1_1547 = torch.constant.int 1
    %1277 = torch.aten.add.Scalar %1276, %float9.999990e-07_1546, %int1_1547 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %1278 = torch.aten.rsqrt %1277 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %1279 = torch.aten.mul.Tensor %1273, %1278 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    // Cast back to f16 before applying the f16 scale.
    %int5_1548 = torch.constant.int 5
    %1280 = torch.prims.convert_element_type %1279, %int5_1548 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.3.txt_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.3.txt_attn.norm.query_norm.scale : tensor<128xf16>
    %1281 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.txt_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %1282 = torch.aten.mul.Tensor %1280, %1281 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // Same RMS normalization for %1271, scaled by key_norm.scale.
    %int6_1549 = torch.constant.int 6
    %1283 = torch.prims.convert_element_type %1271, %int6_1549 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_1550 = torch.constant.int 2
    %1284 = torch.aten.pow.Tensor_Scalar %1283, %int2_1550 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_1551 = torch.constant.int -1
    %1285 = torch.prim.ListConstruct %int-1_1551 : (!torch.int) -> !torch.list<int>
    %true_1552 = torch.constant.bool true
    %none_1553 = torch.constant.none
    %1286 = torch.aten.mean.dim %1284, %1285, %true_1552, %none_1553 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_1554 = torch.constant.float 9.9999999999999995E-7
    %int1_1555 = torch.constant.int 1
    %1287 = torch.aten.add.Scalar %1286, %float9.999990e-07_1554, %int1_1555 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %1288 = torch.aten.rsqrt %1287 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %1289 = torch.aten.mul.Tensor %1283, %1288 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_1556 = torch.constant.int 5
    %1290 = torch.prims.convert_element_type %1289, %int5_1556 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.3.txt_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.3.txt_attn.norm.key_norm.scale : tensor<128xf16>
    %1291 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.txt_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %1292 = torch.aten.mul.Tensor %1290, %1291 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // The two conversions below are f16 -> f16, so the element type is unchanged.
    // NOTE(review): presumably a cast to the value tensor's dtype in the traced
    // model - confirm.
    %int5_1557 = torch.constant.int 5
    %1293 = torch.prims.convert_element_type %1282, %int5_1557 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int5_1558 = torch.constant.int 5
    %1294 = torch.prims.convert_element_type %1292, %int5_1558 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Concatenate the 512-long (txt_attn) and 4096-long (img_attn) tensors along
    // dim 2, in that order, giving 4608 positions: rows [0, 512) come from the
    // txt tensors and rows [512, 4608) from the img tensors.
    // Normalized queries: %1293 then %1239.
    %1295 = torch.prim.ListConstruct %1293, %1239 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_1559 = torch.constant.int 2
    %1296 = torch.aten.cat %1295, %int2_1559 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Normalized keys: %1294 then %1240.
    %1297 = torch.prim.ListConstruct %1294, %1240 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_1560 = torch.constant.int 2
    %1298 = torch.aten.cat %1297, %int2_1560 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Values (not normalized): %1272 then %1218.
    %1299 = torch.prim.ListConstruct %1272, %1218 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_1561 = torch.constant.int 2
    %1300 = torch.aten.cat %1299, %int2_1561 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Combine the concatenated query (%1296) and key (%1298) tensors with %211,
    // a [1,1,4608,64,2,2] f32 tensor defined earlier in the function.
    // NOTE(review): %211 is presumably the rotary position-embedding table - confirm.
    // For x in {query, key}, viewed as [1,24,4608,64,1,2]:
    //   out = %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]
    // and out is then flattened back to [1,24,4608,128] and cast to f16.
    // Query: upcast to f32 and view the 128-wide axis as 64 x 1 x 2 (the -1 entry
    // in the view list resolves to 64).
    %int6_1562 = torch.constant.int 6
    %1301 = torch.prims.convert_element_type %1296, %int6_1562 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_1563 = torch.constant.int 1
    %int24_1564 = torch.constant.int 24
    %int4608_1565 = torch.constant.int 4608
    %int-1_1566 = torch.constant.int -1
    %int1_1567 = torch.constant.int 1
    %int2_1568 = torch.constant.int 2
    %1302 = torch.prim.ListConstruct %int1_1563, %int24_1564, %int4608_1565, %int-1_1566, %int1_1567, %int2_1568 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1303 = torch.aten.view %1301, %1302 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Key: same upcast and view.
    %int6_1569 = torch.constant.int 6
    %1304 = torch.prims.convert_element_type %1298, %int6_1569 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_1570 = torch.constant.int 1
    %int24_1571 = torch.constant.int 24
    %int4608_1572 = torch.constant.int 4608
    %int-1_1573 = torch.constant.int -1
    %int1_1574 = torch.constant.int 1
    %int2_1575 = torch.constant.int 2
    %1305 = torch.prim.ListConstruct %int1_1570, %int24_1571, %int4608_1572, %int-1_1573, %int1_1574, %int2_1575 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1306 = torch.aten.view %1304, %1305 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Query, first term: %211[..., 0] * q[..., 0]. The size-1 head axis of %211
    // and the size-1 axis of q broadcast to [1,24,4608,64,2].
    %int5_1576 = torch.constant.int 5
    %int0_1577 = torch.constant.int 0
    %1307 = torch.aten.select.int %211, %int5_1576, %int0_1577 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_1578 = torch.constant.int 5
    %int0_1579 = torch.constant.int 0
    %1308 = torch.aten.select.int %1303, %int5_1578, %int0_1579 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %1309 = torch.aten.mul.Tensor %1307, %1308 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Query, second term: %211[..., 1] * q[..., 1].
    %int5_1580 = torch.constant.int 5
    %int1_1581 = torch.constant.int 1
    %1310 = torch.aten.select.int %211, %int5_1580, %int1_1581 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_1582 = torch.constant.int 5
    %int1_1583 = torch.constant.int 1
    %1311 = torch.aten.select.int %1303, %int5_1582, %int1_1583 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %1312 = torch.aten.mul.Tensor %1310, %1311 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Query: sum of the two terms.
    %int1_1584 = torch.constant.int 1
    %1313 = torch.aten.add.Tensor %1309, %1312, %int1_1584 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key, first term: %211[..., 0] * k[..., 0].
    %int5_1585 = torch.constant.int 5
    %int0_1586 = torch.constant.int 0
    %1314 = torch.aten.select.int %211, %int5_1585, %int0_1586 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_1587 = torch.constant.int 5
    %int0_1588 = torch.constant.int 0
    %1315 = torch.aten.select.int %1306, %int5_1587, %int0_1588 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %1316 = torch.aten.mul.Tensor %1314, %1315 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key, second term: %211[..., 1] * k[..., 1].
    %int5_1589 = torch.constant.int 5
    %int1_1590 = torch.constant.int 1
    %1317 = torch.aten.select.int %211, %int5_1589, %int1_1590 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_1591 = torch.constant.int 5
    %int1_1592 = torch.constant.int 1
    %1318 = torch.aten.select.int %1306, %int5_1591, %int1_1592 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %1319 = torch.aten.mul.Tensor %1317, %1318 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key: sum of the two terms.
    %int1_1593 = torch.constant.int 1
    %1320 = torch.aten.add.Tensor %1316, %1319, %int1_1593 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Query: flatten [.., 64, 2] back to 128 and cast to f16.
    %int1_1594 = torch.constant.int 1
    %int24_1595 = torch.constant.int 24
    %int4608_1596 = torch.constant.int 4608
    %int128_1597 = torch.constant.int 128
    %1321 = torch.prim.ListConstruct %int1_1594, %int24_1595, %int4608_1596, %int128_1597 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1322 = torch.aten.view %1313, %1321 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_1598 = torch.constant.int 5
    %1323 = torch.prims.convert_element_type %1322, %int5_1598 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Key: flatten [.., 64, 2] back to 128 and cast to f16.
    %int1_1599 = torch.constant.int 1
    %int24_1600 = torch.constant.int 24
    %int4608_1601 = torch.constant.int 4608
    %int128_1602 = torch.constant.int 128
    %1324 = torch.prim.ListConstruct %int1_1599, %int24_1600, %int4608_1601, %int128_1602 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1325 = torch.aten.view %1320, %1324 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_1603 = torch.constant.int 5
    %1326 = torch.prims.convert_element_type %1325, %int5_1603 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Attention over all 4608 positions with 24 heads of width 128.
    // Operands: query %1323, key %1326, value %1300, then 0.0, false, none, none.
    // NOTE(review): per the PyTorch op signature the trailing operands are
    // dropout_p, is_causal, attn_mask and scale - confirm against the op schema.
    %float0.000000e00_1604 = torch.constant.float 0.000000e+00
    %false_1605 = torch.constant.bool false
    %none_1606 = torch.constant.none
    %none_1607 = torch.constant.none
    // Result #0 is the [1,24,4608,128] f16 output; result #1 ([1,24,4608] f32)
    // is not used by the ops that follow here.
    %1327:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%1323, %1326, %1300, %float0.000000e00_1604, %false_1605, %none_1606, %none_1607) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap the head and position axes: [1,24,4608,128] -> [1,4608,24,128].
    %int0_1608 = torch.constant.int 0
    %int2_1609 = torch.constant.int 2
    %int1_1610 = torch.constant.int 1
    %int3_1611 = torch.constant.int 3
    %1328 = torch.prim.ListConstruct %int0_1608, %int2_1609, %int1_1610, %int3_1611 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1329 = torch.aten.permute %1327#0, %1328 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the 24 heads x 128 channels into a single 3072-wide axis.
    %int1_1612 = torch.constant.int 1
    %int4608_1613 = torch.constant.int 4608
    %int3072_1614 = torch.constant.int 3072
    %1330 = torch.prim.ListConstruct %int1_1612, %int4608_1613, %int3072_1614 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1331 = torch.aten.view %1329, %1330 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Split the [1,4608,3072] attention output along dim 1 into its first 512
    // rows (%1333) and remaining 4096 rows (%1335). These line up with the 512-
    // and 4096-long inputs of the earlier concatenation along the position axis.
    // Full-range slice on dim 0 (0 .. INT64_MAX, step 1): the shape is unchanged.
    %int0_1615 = torch.constant.int 0
    %int0_1616 = torch.constant.int 0
    %int9223372036854775807_1617 = torch.constant.int 9223372036854775807
    %int1_1618 = torch.constant.int 1
    %1332 = torch.aten.slice.Tensor %1331, %int0_1615, %int0_1616, %int9223372036854775807_1617, %int1_1618 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Rows [0, 512) along dim 1.
    %int1_1619 = torch.constant.int 1
    %int0_1620 = torch.constant.int 0
    %int512_1621 = torch.constant.int 512
    %int1_1622 = torch.constant.int 1
    %1333 = torch.aten.slice.Tensor %1332, %int1_1619, %int0_1620, %int512_1621, %int1_1622 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Another full-range slice on dim 0; the shape is unchanged.
    %int0_1623 = torch.constant.int 0
    %int0_1624 = torch.constant.int 0
    %int9223372036854775807_1625 = torch.constant.int 9223372036854775807
    %int1_1626 = torch.constant.int 1
    %1334 = torch.aten.slice.Tensor %1331, %int0_1623, %int0_1624, %int9223372036854775807_1625, %int1_1626 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Rows [512, end) along dim 1; the INT64_MAX end bound gives 4096 rows.
    %int1_1627 = torch.constant.int 1
    %int512_1628 = torch.constant.int 512
    %int9223372036854775807_1629 = torch.constant.int 9223372036854775807
    %int1_1630 = torch.constant.int 1
    %1335 = torch.aten.slice.Tensor %1334, %int1_1627, %int512_1628, %int9223372036854775807_1629, %int1_1630 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // double_blocks.3.img_attn.proj: linear projection of the 4096-row slice %1335,
    // followed by an elementwise multiply with %1162 and an add onto %1084.
    // The matmul and bias add are done in f32 and the result is cast back to f16.
    // NOTE(review): %1162 ([1,1,3072]) and %1084 ([1,4096,3072]) are defined above this
    // excerpt; they look like a modulation gate and the residual stream — confirm upstream.
    // Flatten [1,4096,3072] -> [4096,3072] so a 2-D mm can be used.
    %int4096_1631 = torch.constant.int 4096
    %int3072_1632 = torch.constant.int 3072
    %1336 = torch.prim.ListConstruct %int4096_1631, %int3072_1632 : (!torch.int, !torch.int) -> !torch.list<int>
    %1337 = torch.aten.view %1335, %1336 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Load the weight and transpose dims 0 and 1 so it can be the right-hand mm operand.
    %__auto.sampler.double_blocks.3.img_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.3.img_attn.proj.weight : tensor<3072x3072xf16>
    %1338 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.img_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_1633 = torch.constant.int 0
    %int1_1634 = torch.constant.int 1
    %1339 = torch.aten.transpose.int %1338, %int0_1633, %int1_1634 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.3.img_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.3.img_attn.proj.bias : tensor<3072xf16>
    %1340 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.img_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // dtype code 6 converts to f32 here (see result types): bias, input and weight are upcast.
    %int6_1635 = torch.constant.int 6
    %1341 = torch.prims.convert_element_type %1340, %int6_1635 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_1636 = torch.constant.int 6
    %1342 = torch.prims.convert_element_type %1337, %int6_1636 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_1637 = torch.constant.int 6
    %1343 = torch.prims.convert_element_type %1339, %int6_1637 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %1344 = torch.aten.mm %1342, %1343 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Both operands are multiplied by the integer constant 1 (value-preserving);
    // presumably the alpha/beta factors of a decomposed addmm — confirm against the exporter.
    %int1_1638 = torch.constant.int 1
    %1345 = torch.aten.mul.Scalar %1344, %int1_1638 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_1639 = torch.constant.int 1
    %1346 = torch.aten.mul.Scalar %1341, %int1_1639 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Bias add; the [3072] bias broadcasts over the 4096 rows.
    %int1_1640 = torch.constant.int 1
    %1347 = torch.aten.add.Tensor %1345, %1346, %int1_1640 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    // dtype code 5 converts back to f16 here (see result type).
    %int5_1641 = torch.constant.int 5
    %1348 = torch.prims.convert_element_type %1347, %int5_1641 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    // Restore the leading batch dimension: [4096,3072] -> [1,4096,3072].
    %int1_1642 = torch.constant.int 1
    %int4096_1643 = torch.constant.int 4096
    %int3072_1644 = torch.constant.int 3072
    %1349 = torch.prim.ListConstruct %int1_1642, %int4096_1643, %int3072_1644 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1350 = torch.aten.view %1348, %1349 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // %1352 = %1084 + %1162 * proj_out (add uses alpha = 1).
    %1351 = torch.aten.mul.Tensor %1162, %1350 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_1645 = torch.constant.int 1
    %1352 = torch.aten.add.Tensor %1084, %1351, %int1_1645 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Normalize %1352 over its last dimension and apply a multiplicative/additive modulation:
    //   %1362 = (1 + %1164) * ((%1352 - mean) * rsqrt(var + eps)) + %1163
    // Statistics are computed in f32; the normalized value is cast to f16 before modulation.
    // No learned weight/bias is applied in the normalization itself.
    // NOTE(review): %1164 and %1163 ([1,1,3072]) are defined above this excerpt;
    // presumably the modulation scale and shift — confirm upstream.
    %int1_1646 = torch.constant.int 1
    %int1_1647 = torch.constant.int 1
    %1353 = torch.aten.add.Scalar %1164, %int1_1646, %int1_1647 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int6_1648 = torch.constant.int 6
    %1354 = torch.prims.convert_element_type %1352, %int6_1648 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // var_mean over dim 2 with correction = 0 and keepdim = true.
    // result0 is the variance (it gets eps + rsqrt below); result1 is the mean (it is subtracted).
    %int2_1649 = torch.constant.int 2
    %1355 = torch.prim.ListConstruct %int2_1649 : (!torch.int) -> !torch.list<int>
    %int0_1650 = torch.constant.int 0
    %true_1651 = torch.constant.bool true
    %result0_1652, %result1_1653 = torch.aten.var_mean.correction %1354, %1355, %int0_1650, %true_1651 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // eps is approximately 1e-6.
    %float9.999990e-07_1654 = torch.constant.float 9.9999999999999995E-7
    %int1_1655 = torch.constant.int 1
    %1356 = torch.aten.add.Scalar %result0_1652, %float9.999990e-07_1654, %int1_1655 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %1357 = torch.aten.rsqrt %1356 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // The subtraction takes the f16 tensor %1352 (not the f32 copy %1354) and yields f32.
    %int1_1656 = torch.constant.int 1
    %1358 = torch.aten.sub.Tensor %1352, %result1_1653, %int1_1656 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %1359 = torch.aten.mul.Tensor %1358, %1357 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    %int5_1657 = torch.constant.int 5
    %1360 = torch.prims.convert_element_type %1359, %int5_1657 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Modulation: multiply by (1 + %1164), then add %1163.
    %1361 = torch.aten.mul.Tensor %1353, %1360 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_1658 = torch.constant.int 1
    %1362 = torch.aten.add.Tensor %1361, %1163, %int1_1658 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // double_blocks.3.img_mlp.0: linear layer 3072 -> 12288 applied to %1362,
    // followed by a GELU with the "tanh" approximation. Matmul and bias add run in f32;
    // the result is cast to f16 before the activation.
    // Flatten [1,4096,3072] -> [4096,3072].
    %int4096_1659 = torch.constant.int 4096
    %int3072_1660 = torch.constant.int 3072
    %1363 = torch.prim.ListConstruct %int4096_1659, %int3072_1660 : (!torch.int, !torch.int) -> !torch.list<int>
    %1364 = torch.aten.view %1362, %1363 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Weight is stored as [12288,3072]; transpose to [3072,12288] for the mm.
    %__auto.sampler.double_blocks.3.img_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.3.img_mlp.0.weight : tensor<12288x3072xf16>
    %1365 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.img_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_1661 = torch.constant.int 0
    %int1_1662 = torch.constant.int 1
    %1366 = torch.aten.transpose.int %1365, %int0_1661, %int1_1662 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.3.img_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.3.img_mlp.0.bias : tensor<12288xf16>
    %1367 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.img_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Upcast bias, input and weight to f32 (dtype code 6).
    %int6_1663 = torch.constant.int 6
    %1368 = torch.prims.convert_element_type %1367, %int6_1663 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_1664 = torch.constant.int 6
    %1369 = torch.prims.convert_element_type %1364, %int6_1664 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_1665 = torch.constant.int 6
    %1370 = torch.prims.convert_element_type %1366, %int6_1665 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %1371 = torch.aten.mm %1369, %1370 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[4096,12288],f32>
    // Multiplications by the integer constant 1 (value-preserving).
    %int1_1666 = torch.constant.int 1
    %1372 = torch.aten.mul.Scalar %1371, %int1_1666 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int1_1667 = torch.constant.int 1
    %1373 = torch.aten.mul.Scalar %1368, %int1_1667 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    // Bias add, then cast back to f16 (dtype code 5).
    %int1_1668 = torch.constant.int 1
    %1374 = torch.aten.add.Tensor %1372, %1373, %int1_1668 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int5_1669 = torch.constant.int 5
    %1375 = torch.prims.convert_element_type %1374, %int5_1669 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f16>
    // Restore the batch dimension: [4096,12288] -> [1,4096,12288].
    %int1_1670 = torch.constant.int 1
    %int4096_1671 = torch.constant.int 4096
    %int12288_1672 = torch.constant.int 12288
    %1376 = torch.prim.ListConstruct %int1_1670, %int4096_1671, %int12288_1672 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1377 = torch.aten.view %1375, %1376 : !torch.vtensor<[4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,12288],f16>
    // GELU activation using the "tanh" approximation mode.
    %str_1673 = torch.constant.str "tanh"
    %1378 = torch.aten.gelu %1377, %str_1673 : !torch.vtensor<[1,4096,12288],f16>, !torch.str -> !torch.vtensor<[1,4096,12288],f16>
    // double_blocks.3.img_mlp.2: linear layer 12288 -> 3072 applied to the GELU output %1378,
    // then %1395 = %1352 + %1165 * mlp_out. Matmul and bias add run in f32.
    // NOTE(review): %1165 ([1,1,3072]) is defined above this excerpt; presumably the
    // MLP modulation gate — confirm upstream.
    // Flatten [1,4096,12288] -> [4096,12288].
    %int4096_1674 = torch.constant.int 4096
    %int12288_1675 = torch.constant.int 12288
    %1379 = torch.prim.ListConstruct %int4096_1674, %int12288_1675 : (!torch.int, !torch.int) -> !torch.list<int>
    %1380 = torch.aten.view %1378, %1379 : !torch.vtensor<[1,4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[4096,12288],f16>
    // Weight is stored as [3072,12288]; transpose to [12288,3072] for the mm.
    %__auto.sampler.double_blocks.3.img_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.3.img_mlp.2.weight : tensor<3072x12288xf16>
    %1381 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.img_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_1676 = torch.constant.int 0
    %int1_1677 = torch.constant.int 1
    %1382 = torch.aten.transpose.int %1381, %int0_1676, %int1_1677 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.3.img_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.3.img_mlp.2.bias : tensor<3072xf16>
    %1383 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.img_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, input and weight to f32 (dtype code 6).
    %int6_1678 = torch.constant.int 6
    %1384 = torch.prims.convert_element_type %1383, %int6_1678 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_1679 = torch.constant.int 6
    %1385 = torch.prims.convert_element_type %1380, %int6_1679 : !torch.vtensor<[4096,12288],f16>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int6_1680 = torch.constant.int 6
    %1386 = torch.prims.convert_element_type %1382, %int6_1680 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %1387 = torch.aten.mm %1385, %1386 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Multiplications by the integer constant 1 (value-preserving).
    %int1_1681 = torch.constant.int 1
    %1388 = torch.aten.mul.Scalar %1387, %int1_1681 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_1682 = torch.constant.int 1
    %1389 = torch.aten.mul.Scalar %1384, %int1_1682 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Bias add, then cast back to f16 (dtype code 5).
    %int1_1683 = torch.constant.int 1
    %1390 = torch.aten.add.Tensor %1388, %1389, %int1_1683 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_1684 = torch.constant.int 5
    %1391 = torch.prims.convert_element_type %1390, %int5_1684 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    // Restore the batch dimension: [4096,3072] -> [1,4096,3072].
    %int1_1685 = torch.constant.int 1
    %int4096_1686 = torch.constant.int 4096
    %int3072_1687 = torch.constant.int 3072
    %1392 = torch.prim.ListConstruct %int1_1685, %int4096_1686, %int3072_1687 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1393 = torch.aten.view %1391, %1392 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Multiply by %1165 and add onto %1352 (the result of the attention-projection add above).
    %1394 = torch.aten.mul.Tensor %1165, %1393 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_1688 = torch.constant.int 1
    %1395 = torch.aten.add.Tensor %1352, %1394, %int1_1688 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // double_blocks.3.txt_attn.proj: linear projection of the 512-row slice %1333,
    // then %1412 = %1144 + %1183 * proj_out. Matmul and bias add run in f32.
    // NOTE(review): %1183 ([1,1,3072]) and %1144 ([1,512,3072]) are defined above this
    // excerpt; they look like a modulation gate and the residual stream — confirm upstream.
    // Flatten [1,512,3072] -> [512,3072].
    %int512_1689 = torch.constant.int 512
    %int3072_1690 = torch.constant.int 3072
    %1396 = torch.prim.ListConstruct %int512_1689, %int3072_1690 : (!torch.int, !torch.int) -> !torch.list<int>
    %1397 = torch.aten.view %1333, %1396 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Load the weight and transpose dims 0 and 1 so it can be the right-hand mm operand.
    %__auto.sampler.double_blocks.3.txt_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.3.txt_attn.proj.weight : tensor<3072x3072xf16>
    %1398 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.txt_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_1691 = torch.constant.int 0
    %int1_1692 = torch.constant.int 1
    %1399 = torch.aten.transpose.int %1398, %int0_1691, %int1_1692 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.3.txt_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.3.txt_attn.proj.bias : tensor<3072xf16>
    %1400 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.txt_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, input and weight to f32 (dtype code 6).
    %int6_1693 = torch.constant.int 6
    %1401 = torch.prims.convert_element_type %1400, %int6_1693 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_1694 = torch.constant.int 6
    %1402 = torch.prims.convert_element_type %1397, %int6_1694 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_1695 = torch.constant.int 6
    %1403 = torch.prims.convert_element_type %1399, %int6_1695 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %1404 = torch.aten.mm %1402, %1403 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Multiplications by the integer constant 1 (value-preserving).
    %int1_1696 = torch.constant.int 1
    %1405 = torch.aten.mul.Scalar %1404, %int1_1696 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_1697 = torch.constant.int 1
    %1406 = torch.aten.mul.Scalar %1401, %int1_1697 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Bias add, then cast back to f16 (dtype code 5).
    %int1_1698 = torch.constant.int 1
    %1407 = torch.aten.add.Tensor %1405, %1406, %int1_1698 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_1699 = torch.constant.int 5
    %1408 = torch.prims.convert_element_type %1407, %int5_1699 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    // Restore the batch dimension: [512,3072] -> [1,512,3072].
    %int1_1700 = torch.constant.int 1
    %int512_1701 = torch.constant.int 512
    %int3072_1702 = torch.constant.int 3072
    %1409 = torch.prim.ListConstruct %int1_1700, %int512_1701, %int3072_1702 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1410 = torch.aten.view %1408, %1409 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // %1412 = %1144 + %1183 * proj_out (add uses alpha = 1).
    %1411 = torch.aten.mul.Tensor %1183, %1410 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_1703 = torch.constant.int 1
    %1412 = torch.aten.add.Tensor %1144, %1411, %int1_1703 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Normalize %1412 over its last dimension and apply a multiplicative/additive modulation:
    //   %1422 = (1 + %1185) * ((%1412 - mean) * rsqrt(var + eps)) + %1184
    // Statistics are computed in f32; the normalized value is cast to f16 before modulation.
    // No learned weight/bias is applied in the normalization itself.
    // NOTE(review): %1185 and %1184 ([1,1,3072]) are defined above this excerpt;
    // presumably the modulation scale and shift — confirm upstream.
    %int1_1704 = torch.constant.int 1
    %int1_1705 = torch.constant.int 1
    %1413 = torch.aten.add.Scalar %1185, %int1_1704, %int1_1705 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int6_1706 = torch.constant.int 6
    %1414 = torch.prims.convert_element_type %1412, %int6_1706 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // var_mean over dim 2 with correction = 0 and keepdim = true.
    // result0 is the variance (it gets eps + rsqrt below); result1 is the mean (it is subtracted).
    %int2_1707 = torch.constant.int 2
    %1415 = torch.prim.ListConstruct %int2_1707 : (!torch.int) -> !torch.list<int>
    %int0_1708 = torch.constant.int 0
    %true_1709 = torch.constant.bool true
    %result0_1710, %result1_1711 = torch.aten.var_mean.correction %1414, %1415, %int0_1708, %true_1709 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // eps is approximately 1e-6.
    %float9.999990e-07_1712 = torch.constant.float 9.9999999999999995E-7
    %int1_1713 = torch.constant.int 1
    %1416 = torch.aten.add.Scalar %result0_1710, %float9.999990e-07_1712, %int1_1713 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %1417 = torch.aten.rsqrt %1416 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // The subtraction takes the f16 tensor %1412 (not the f32 copy %1414) and yields f32.
    %int1_1714 = torch.constant.int 1
    %1418 = torch.aten.sub.Tensor %1412, %result1_1711, %int1_1714 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %1419 = torch.aten.mul.Tensor %1418, %1417 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_1715 = torch.constant.int 5
    %1420 = torch.prims.convert_element_type %1419, %int5_1715 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Modulation: multiply by (1 + %1185), then add %1184.
    %1421 = torch.aten.mul.Tensor %1413, %1420 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_1716 = torch.constant.int 1
    %1422 = torch.aten.add.Tensor %1421, %1184, %int1_1716 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // double_blocks.3.txt_mlp.0: linear layer 3072 -> 12288 applied to %1422,
    // followed by a GELU with the "tanh" approximation. Matmul and bias add run in f32;
    // the result is cast to f16 before the activation.
    // Flatten [1,512,3072] -> [512,3072].
    %int512_1717 = torch.constant.int 512
    %int3072_1718 = torch.constant.int 3072
    %1423 = torch.prim.ListConstruct %int512_1717, %int3072_1718 : (!torch.int, !torch.int) -> !torch.list<int>
    %1424 = torch.aten.view %1422, %1423 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Weight is stored as [12288,3072]; transpose to [3072,12288] for the mm.
    %__auto.sampler.double_blocks.3.txt_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.3.txt_mlp.0.weight : tensor<12288x3072xf16>
    %1425 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.txt_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_1719 = torch.constant.int 0
    %int1_1720 = torch.constant.int 1
    %1426 = torch.aten.transpose.int %1425, %int0_1719, %int1_1720 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.3.txt_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.3.txt_mlp.0.bias : tensor<12288xf16>
    %1427 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.txt_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Upcast bias, input and weight to f32 (dtype code 6).
    %int6_1721 = torch.constant.int 6
    %1428 = torch.prims.convert_element_type %1427, %int6_1721 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_1722 = torch.constant.int 6
    %1429 = torch.prims.convert_element_type %1424, %int6_1722 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_1723 = torch.constant.int 6
    %1430 = torch.prims.convert_element_type %1426, %int6_1723 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %1431 = torch.aten.mm %1429, %1430 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[512,12288],f32>
    // Multiplications by the integer constant 1 (value-preserving).
    %int1_1724 = torch.constant.int 1
    %1432 = torch.aten.mul.Scalar %1431, %int1_1724 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int1_1725 = torch.constant.int 1
    %1433 = torch.aten.mul.Scalar %1428, %int1_1725 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    // Bias add, then cast back to f16 (dtype code 5).
    %int1_1726 = torch.constant.int 1
    %1434 = torch.aten.add.Tensor %1432, %1433, %int1_1726 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int5_1727 = torch.constant.int 5
    %1435 = torch.prims.convert_element_type %1434, %int5_1727 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f16>
    // Restore the batch dimension: [512,12288] -> [1,512,12288].
    %int1_1728 = torch.constant.int 1
    %int512_1729 = torch.constant.int 512
    %int12288_1730 = torch.constant.int 12288
    %1436 = torch.prim.ListConstruct %int1_1728, %int512_1729, %int12288_1730 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1437 = torch.aten.view %1435, %1436 : !torch.vtensor<[512,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,512,12288],f16>
    // GELU activation using the "tanh" approximation mode.
    %str_1731 = torch.constant.str "tanh"
    %1438 = torch.aten.gelu %1437, %str_1731 : !torch.vtensor<[1,512,12288],f16>, !torch.str -> !torch.vtensor<[1,512,12288],f16>
    // double_blocks.3.txt_mlp.2: linear layer 12288 -> 3072 applied to the GELU output %1438,
    // then %1455 = %1412 + %1186 * mlp_out. Matmul and bias add run in f32.
    // NOTE(review): %1186 ([1,1,3072]) is defined above this excerpt; presumably the
    // MLP modulation gate — confirm upstream.
    // Flatten [1,512,12288] -> [512,12288].
    %int512_1732 = torch.constant.int 512
    %int12288_1733 = torch.constant.int 12288
    %1439 = torch.prim.ListConstruct %int512_1732, %int12288_1733 : (!torch.int, !torch.int) -> !torch.list<int>
    %1440 = torch.aten.view %1438, %1439 : !torch.vtensor<[1,512,12288],f16>, !torch.list<int> -> !torch.vtensor<[512,12288],f16>
    // Weight is stored as [3072,12288]; transpose to [12288,3072] for the mm.
    %__auto.sampler.double_blocks.3.txt_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.3.txt_mlp.2.weight : tensor<3072x12288xf16>
    %1441 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.txt_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_1734 = torch.constant.int 0
    %int1_1735 = torch.constant.int 1
    %1442 = torch.aten.transpose.int %1441, %int0_1734, %int1_1735 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.3.txt_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.3.txt_mlp.2.bias : tensor<3072xf16>
    %1443 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.3.txt_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, input and weight to f32 (dtype code 6).
    %int6_1736 = torch.constant.int 6
    %1444 = torch.prims.convert_element_type %1443, %int6_1736 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_1737 = torch.constant.int 6
    %1445 = torch.prims.convert_element_type %1440, %int6_1737 : !torch.vtensor<[512,12288],f16>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int6_1738 = torch.constant.int 6
    %1446 = torch.prims.convert_element_type %1442, %int6_1738 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %1447 = torch.aten.mm %1445, %1446 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Multiplications by the integer constant 1 (value-preserving).
    %int1_1739 = torch.constant.int 1
    %1448 = torch.aten.mul.Scalar %1447, %int1_1739 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_1740 = torch.constant.int 1
    %1449 = torch.aten.mul.Scalar %1444, %int1_1740 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Bias add, then cast back to f16 (dtype code 5).
    %int1_1741 = torch.constant.int 1
    %1450 = torch.aten.add.Tensor %1448, %1449, %int1_1741 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_1742 = torch.constant.int 5
    %1451 = torch.prims.convert_element_type %1450, %int5_1742 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    // Restore the batch dimension: [512,3072] -> [1,512,3072].
    %int1_1743 = torch.constant.int 1
    %int512_1744 = torch.constant.int 512
    %int3072_1745 = torch.constant.int 3072
    %1452 = torch.prim.ListConstruct %int1_1743, %int512_1744, %int3072_1745 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1453 = torch.aten.view %1451, %1452 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Multiply by %1186 and add onto %1412 (the result of the attention-projection add above).
    %1454 = torch.aten.mul.Tensor %1186, %1453 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_1746 = torch.constant.int 1
    %1455 = torch.aten.add.Tensor %1412, %1454, %int1_1746 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // double_blocks.4.img_mod.lin: SiLU of %119 followed by a linear layer 3072 -> 18432.
    // Matmul and bias add run in f32; the [1,18432] result %1467 is cast back to f16.
    // NOTE(review): %119 ([1,3072] f16) is defined above this excerpt; presumably the
    // conditioning vector shared by all blocks — confirm upstream.
    %1456 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Weight is stored as [18432,3072]; transpose to [3072,18432] for the mm.
    %__auto.sampler.double_blocks.4.img_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.4.img_mod.lin.weight : tensor<18432x3072xf16>
    %1457 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.img_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_1747 = torch.constant.int 0
    %int1_1748 = torch.constant.int 1
    %1458 = torch.aten.transpose.int %1457, %int0_1747, %int1_1748 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.4.img_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.4.img_mod.lin.bias : tensor<18432xf16>
    %1459 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.img_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Upcast bias, input and weight to f32 (dtype code 6).
    %int6_1749 = torch.constant.int 6
    %1460 = torch.prims.convert_element_type %1459, %int6_1749 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_1750 = torch.constant.int 6
    %1461 = torch.prims.convert_element_type %1456, %int6_1750 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_1751 = torch.constant.int 6
    %1462 = torch.prims.convert_element_type %1458, %int6_1751 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %1463 = torch.aten.mm %1461, %1462 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Multiplications by the integer constant 1 (value-preserving).
    %int1_1752 = torch.constant.int 1
    %1464 = torch.aten.mul.Scalar %1463, %int1_1752 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_1753 = torch.constant.int 1
    %1465 = torch.aten.mul.Scalar %1460, %int1_1753 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    // Bias add, then cast back to f16 (dtype code 5).
    %int1_1754 = torch.constant.int 1
    %1466 = torch.aten.add.Tensor %1464, %1465, %int1_1754 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_1755 = torch.constant.int 5
    %1467 = torch.prims.convert_element_type %1466, %int5_1755 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Reshape %1467 ([1,18432]) to [1,1,18432] and cut the last dimension into six
    // consecutive 3072-wide chunks:
    //   %1471 = [0, 3072)       %1472 = [3072, 6144)     %1473 = [6144, 9216)
    //   %1474 = [9216, 12288)   %1475 = [12288, 15360)   %1476 = [15360, 18432)
    // NOTE(review): by analogy with how %1162-%1165 are used in block 3, these are presumably
    // shift/scale/gate triples for the attention and MLP branches — confirm at the use sites.
    // Full-range slice on dim 0 (shape unchanged).
    %int0_1756 = torch.constant.int 0
    %int0_1757 = torch.constant.int 0
    %int9223372036854775807_1758 = torch.constant.int 9223372036854775807
    %int1_1759 = torch.constant.int 1
    %1468 = torch.aten.slice.Tensor %1467, %int0_1756, %int0_1757, %int9223372036854775807_1758, %int1_1759 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a size-1 axis at position 1 so the chunks broadcast against [1,N,3072] tensors.
    %int1_1760 = torch.constant.int 1
    %1469 = torch.aten.unsqueeze %1468, %int1_1760 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Full-range slice on dim 2 (shape unchanged).
    %int2_1761 = torch.constant.int 2
    %int0_1762 = torch.constant.int 0
    %int9223372036854775807_1763 = torch.constant.int 9223372036854775807
    %int1_1764 = torch.constant.int 1
    %1470 = torch.aten.slice.Tensor %1469, %int2_1761, %int0_1762, %int9223372036854775807_1763, %int1_1764 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Chunk 0: dim -1, [0, 3072).
    %int-1_1765 = torch.constant.int -1
    %int0_1766 = torch.constant.int 0
    %int3072_1767 = torch.constant.int 3072
    %int1_1768 = torch.constant.int 1
    %1471 = torch.aten.slice.Tensor %1470, %int-1_1765, %int0_1766, %int3072_1767, %int1_1768 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1: dim -1, [3072, 6144).
    %int-1_1769 = torch.constant.int -1
    %int3072_1770 = torch.constant.int 3072
    %int6144_1771 = torch.constant.int 6144
    %int1_1772 = torch.constant.int 1
    %1472 = torch.aten.slice.Tensor %1470, %int-1_1769, %int3072_1770, %int6144_1771, %int1_1772 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2: dim -1, [6144, 9216).
    %int-1_1773 = torch.constant.int -1
    %int6144_1774 = torch.constant.int 6144
    %int9216_1775 = torch.constant.int 9216
    %int1_1776 = torch.constant.int 1
    %1473 = torch.aten.slice.Tensor %1470, %int-1_1773, %int6144_1774, %int9216_1775, %int1_1776 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 3: dim -1, [9216, 12288).
    %int-1_1777 = torch.constant.int -1
    %int9216_1778 = torch.constant.int 9216
    %int12288_1779 = torch.constant.int 12288
    %int1_1780 = torch.constant.int 1
    %1474 = torch.aten.slice.Tensor %1470, %int-1_1777, %int9216_1778, %int12288_1779, %int1_1780 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 4: dim -1, [12288, 15360).
    %int-1_1781 = torch.constant.int -1
    %int12288_1782 = torch.constant.int 12288
    %int15360_1783 = torch.constant.int 15360
    %int1_1784 = torch.constant.int 1
    %1475 = torch.aten.slice.Tensor %1470, %int-1_1781, %int12288_1782, %int15360_1783, %int1_1784 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 5: dim -1, [15360, 18432).
    %int-1_1785 = torch.constant.int -1
    %int15360_1786 = torch.constant.int 15360
    %int18432_1787 = torch.constant.int 18432
    %int1_1788 = torch.constant.int 1
    %1476 = torch.aten.slice.Tensor %1470, %int-1_1785, %int15360_1786, %int18432_1787, %int1_1788 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // double_blocks.4.txt_mod.lin: SiLU of %119 followed by a linear layer 3072 -> 18432.
    // Matmul and bias add run in f32; the [1,18432] result %1488 is cast back to f16.
    // NOTE(review): this silu has the same operand (%119) as the one producing %1456,
    // so it recomputes the same value; left as emitted by the exporter.
    %1477 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Weight is stored as [18432,3072]; transpose to [3072,18432] for the mm.
    %__auto.sampler.double_blocks.4.txt_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.4.txt_mod.lin.weight : tensor<18432x3072xf16>
    %1478 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.txt_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_1789 = torch.constant.int 0
    %int1_1790 = torch.constant.int 1
    %1479 = torch.aten.transpose.int %1478, %int0_1789, %int1_1790 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.4.txt_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.4.txt_mod.lin.bias : tensor<18432xf16>
    %1480 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.txt_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Upcast bias, input and weight to f32 (dtype code 6).
    %int6_1791 = torch.constant.int 6
    %1481 = torch.prims.convert_element_type %1480, %int6_1791 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_1792 = torch.constant.int 6
    %1482 = torch.prims.convert_element_type %1477, %int6_1792 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_1793 = torch.constant.int 6
    %1483 = torch.prims.convert_element_type %1479, %int6_1793 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %1484 = torch.aten.mm %1482, %1483 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Multiplications by the integer constant 1 (value-preserving).
    %int1_1794 = torch.constant.int 1
    %1485 = torch.aten.mul.Scalar %1484, %int1_1794 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_1795 = torch.constant.int 1
    %1486 = torch.aten.mul.Scalar %1481, %int1_1795 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    // Bias add, then cast back to f16 (dtype code 5).
    %int1_1796 = torch.constant.int 1
    %1487 = torch.aten.add.Tensor %1485, %1486, %int1_1796 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_1797 = torch.constant.int 5
    %1488 = torch.prims.convert_element_type %1487, %int5_1797 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Reshape %1488 ([1,18432]) to [1,1,18432] and cut the last dimension into
    // consecutive 3072-wide chunks:
    //   %1492 = [0, 3072)       %1493 = [3072, 6144)     %1494 = [6144, 9216)
    //   %1495 = [9216, 12288)   %1496 = [12288, 15360)
    // The constants that follow this block start at 15360 and 18432, so a further
    // chunk is presumably taken just past this excerpt — confirm below.
    // Full-range slice on dim 0 (shape unchanged).
    %int0_1798 = torch.constant.int 0
    %int0_1799 = torch.constant.int 0
    %int9223372036854775807_1800 = torch.constant.int 9223372036854775807
    %int1_1801 = torch.constant.int 1
    %1489 = torch.aten.slice.Tensor %1488, %int0_1798, %int0_1799, %int9223372036854775807_1800, %int1_1801 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a size-1 axis at position 1 so the chunks broadcast against [1,N,3072] tensors.
    %int1_1802 = torch.constant.int 1
    %1490 = torch.aten.unsqueeze %1489, %int1_1802 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Full-range slice on dim 2 (shape unchanged).
    %int2_1803 = torch.constant.int 2
    %int0_1804 = torch.constant.int 0
    %int9223372036854775807_1805 = torch.constant.int 9223372036854775807
    %int1_1806 = torch.constant.int 1
    %1491 = torch.aten.slice.Tensor %1490, %int2_1803, %int0_1804, %int9223372036854775807_1805, %int1_1806 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Chunk 0: dim -1, [0, 3072).
    %int-1_1807 = torch.constant.int -1
    %int0_1808 = torch.constant.int 0
    %int3072_1809 = torch.constant.int 3072
    %int1_1810 = torch.constant.int 1
    %1492 = torch.aten.slice.Tensor %1491, %int-1_1807, %int0_1808, %int3072_1809, %int1_1810 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1: dim -1, [3072, 6144).
    %int-1_1811 = torch.constant.int -1
    %int3072_1812 = torch.constant.int 3072
    %int6144_1813 = torch.constant.int 6144
    %int1_1814 = torch.constant.int 1
    %1493 = torch.aten.slice.Tensor %1491, %int-1_1811, %int3072_1812, %int6144_1813, %int1_1814 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2: dim -1, [6144, 9216).
    %int-1_1815 = torch.constant.int -1
    %int6144_1816 = torch.constant.int 6144
    %int9216_1817 = torch.constant.int 9216
    %int1_1818 = torch.constant.int 1
    %1494 = torch.aten.slice.Tensor %1491, %int-1_1815, %int6144_1816, %int9216_1817, %int1_1818 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 3: dim -1, [9216, 12288).
    %int-1_1819 = torch.constant.int -1
    %int9216_1820 = torch.constant.int 9216
    %int12288_1821 = torch.constant.int 12288
    %int1_1822 = torch.constant.int 1
    %1495 = torch.aten.slice.Tensor %1491, %int-1_1819, %int9216_1820, %int12288_1821, %int1_1822 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 4: dim -1, [12288, 15360).
    %int-1_1823 = torch.constant.int -1
    %int12288_1824 = torch.constant.int 12288
    %int15360_1825 = torch.constant.int 15360
    %int1_1826 = torch.constant.int 1
    %1496 = torch.aten.slice.Tensor %1491, %int-1_1823, %int12288_1824, %int15360_1825, %int1_1826 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_1827 = torch.constant.int -1
    %int15360_1828 = torch.constant.int 15360
    %int18432_1829 = torch.constant.int 18432
    %int1_1830 = torch.constant.int 1
    %1497 = torch.aten.slice.Tensor %1491, %int-1_1827, %int15360_1828, %int18432_1829, %int1_1830 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %1395 ([1,4096,3072]) over its last axis, then apply (1 + %1472) * x + %1471 and flatten to 2-D.
    // %1395, %1471 and %1472 are defined before this excerpt; %1472/%1471 are presumably the
    // img-stream scale/shift modulation terms - confirm at their definitions.
    %int6_1831 = torch.constant.int 6
    %1498 = torch.prims.convert_element_type %1395, %int6_1831 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // Variance and mean over dim 2 in f32, with correction = 0 and keepdim = true -> [1,4096,1] each.
    %int2_1832 = torch.constant.int 2
    %1499 = torch.prim.ListConstruct %int2_1832 : (!torch.int) -> !torch.list<int>
    %int0_1833 = torch.constant.int 0
    %true_1834 = torch.constant.bool true
    %result0_1835, %result1_1836 = torch.aten.var_mean.correction %1498, %1499, %int0_1833, %true_1834 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(variance + eps), eps ~= 1e-6.
    %float9.999990e-07_1837 = torch.constant.float 9.9999999999999995E-7
    %int1_1838 = torch.constant.int 1
    %1500 = torch.aten.add.Scalar %result0_1835, %float9.999990e-07_1837, %int1_1838 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %1501 = torch.aten.rsqrt %1500 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // (x - mean) * rsqrt(...); the subtraction takes the f16 input and the f32 mean and yields f32.
    %int1_1839 = torch.constant.int 1
    %1502 = torch.aten.sub.Tensor %1395, %result1_1836, %int1_1839 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %1503 = torch.aten.mul.Tensor %1502, %1501 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Cast the normalized tensor back to f16.
    %int5_1840 = torch.constant.int 5
    %1504 = torch.prims.convert_element_type %1503, %int5_1840 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // (1 + %1472) * normalized, broadcasting [1,1,3072] over the 4096 axis.
    %int1_1841 = torch.constant.int 1
    %int1_1842 = torch.constant.int 1
    %1505 = torch.aten.add.Scalar %1472, %int1_1841, %int1_1842 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %1506 = torch.aten.mul.Tensor %1505, %1504 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    // Add %1471 with alpha = 1.
    %int1_1843 = torch.constant.int 1
    %1507 = torch.aten.add.Tensor %1506, %1471, %int1_1843 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Drop the leading unit axis: [1,4096,3072] -> [4096,3072] for the matmul that follows.
    %int4096_1844 = torch.constant.int 4096
    %int3072_1845 = torch.constant.int 3072
    %1508 = torch.prim.ListConstruct %int4096_1844, %int3072_1845 : (!torch.int, !torch.int) -> !torch.list<int>
    %1509 = torch.aten.view %1507, %1508 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Linear projection with double_blocks.4.img_attn.qkv: [4096,3072] -> [1,4096,9216], accumulated in f32.
    // Load the [9216,3072] weight and transpose it to [3072,9216].
    %__auto.sampler.double_blocks.4.img_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.4.img_attn.qkv.weight : tensor<9216x3072xf16>
    %1510 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.img_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_1846 = torch.constant.int 0
    %int1_1847 = torch.constant.int 1
    %1511 = torch.aten.transpose.int %1510, %int0_1846, %int1_1847 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    // Load the [9216] bias.
    %__auto.sampler.double_blocks.4.img_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.4.img_attn.qkv.bias : tensor<9216xf16>
    %1512 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.img_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Cast bias, activation and transposed weight to f32 before the matmul.
    %int6_1848 = torch.constant.int 6
    %1513 = torch.prims.convert_element_type %1512, %int6_1848 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_1849 = torch.constant.int 6
    %1514 = torch.prims.convert_element_type %1509, %int6_1849 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_1850 = torch.constant.int 6
    %1515 = torch.prims.convert_element_type %1511, %int6_1850 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %1516 = torch.aten.mm %1514, %1515 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[4096,9216],f32>
    // Both mul.Scalar ops multiply by the integer constant 1; then add the bias with alpha = 1.
    %int1_1851 = torch.constant.int 1
    %1517 = torch.aten.mul.Scalar %1516, %int1_1851 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int1_1852 = torch.constant.int 1
    %1518 = torch.aten.mul.Scalar %1513, %int1_1852 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_1853 = torch.constant.int 1
    %1519 = torch.aten.add.Tensor %1517, %1518, %int1_1853 : !torch.vtensor<[4096,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    // Cast back to f16 and restore the leading unit axis.
    %int5_1854 = torch.constant.int 5
    %1520 = torch.prims.convert_element_type %1519, %int5_1854 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f16>
    %int1_1855 = torch.constant.int 1
    %int4096_1856 = torch.constant.int 4096
    %int9216_1857 = torch.constant.int 9216
    %1521 = torch.prim.ListConstruct %int1_1855, %int4096_1856, %int9216_1857 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1522 = torch.aten.view %1520, %1521 : !torch.vtensor<[4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,9216],f16>
    // Split the img projection output into three [1,24,4096,128] tensors.
    // View the 9216 axis as (3, 24, 128).
    %int1_1858 = torch.constant.int 1
    %int4096_1859 = torch.constant.int 4096
    %int3_1860 = torch.constant.int 3
    %int24_1861 = torch.constant.int 24
    %int128_1862 = torch.constant.int 128
    %1523 = torch.prim.ListConstruct %int1_1858, %int4096_1859, %int3_1860, %int24_1861, %int128_1862 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1524 = torch.aten.view %1522, %1523 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    // Permute with order (2,0,3,1,4): [1,4096,3,24,128] -> [3,1,24,4096,128].
    %int2_1863 = torch.constant.int 2
    %int0_1864 = torch.constant.int 0
    %int3_1865 = torch.constant.int 3
    %int1_1866 = torch.constant.int 1
    %int4_1867 = torch.constant.int 4
    %1525 = torch.prim.ListConstruct %int2_1863, %int0_1864, %int3_1865, %int1_1866, %int4_1867 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1526 = torch.aten.permute %1524, %1525 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // Index 0 along dim 0 (normalized below with query_norm.scale).
    %int0_1868 = torch.constant.int 0
    %int0_1869 = torch.constant.int 0
    %1527 = torch.aten.select.int %1526, %int0_1868, %int0_1869 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Index 1 along dim 0 (normalized below with key_norm.scale).
    %int0_1870 = torch.constant.int 0
    %int1_1871 = torch.constant.int 1
    %1528 = torch.aten.select.int %1526, %int0_1870, %int1_1871 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Index 2 along dim 0 (later passed as the third attention operand, without normalization).
    %int0_1872 = torch.constant.int 0
    %int2_1873 = torch.constant.int 2
    %1529 = torch.aten.select.int %1526, %int0_1872, %int2_1873 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Root-mean-square normalization of %1527 over the last (128) axis, then scaling by query_norm.scale.
    // Upcast to f32 and square elementwise.
    %int6_1874 = torch.constant.int 6
    %1530 = torch.prims.convert_element_type %1527, %int6_1874 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_1875 = torch.constant.int 2
    %1531 = torch.aten.pow.Tensor_Scalar %1530, %int2_1875 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    // Mean of squares over dim -1 with keepdim = true -> [1,24,4096,1].
    %int-1_1876 = torch.constant.int -1
    %1532 = torch.prim.ListConstruct %int-1_1876 : (!torch.int) -> !torch.list<int>
    %true_1877 = torch.constant.bool true
    %none_1878 = torch.constant.none
    %1533 = torch.aten.mean.dim %1531, %1532, %true_1877, %none_1878 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    // x * rsqrt(mean(x^2) + eps), eps ~= 1e-6.
    %float9.999990e-07_1879 = torch.constant.float 9.9999999999999995E-7
    %int1_1880 = torch.constant.int 1
    %1534 = torch.aten.add.Scalar %1533, %float9.999990e-07_1879, %int1_1880 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %1535 = torch.aten.rsqrt %1534 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %1536 = torch.aten.mul.Tensor %1530, %1535 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Cast back to f16 and multiply by the [128] scale parameter.
    %int5_1881 = torch.constant.int 5
    %1537 = torch.prims.convert_element_type %1536, %int5_1881 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.4.img_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.4.img_attn.norm.query_norm.scale : tensor<128xf16>
    %1538 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.img_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %1539 = torch.aten.mul.Tensor %1537, %1538 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // Root-mean-square normalization of %1528 over the last (128) axis, then scaling by key_norm.scale.
    // Upcast to f32 and square elementwise.
    %int6_1882 = torch.constant.int 6
    %1540 = torch.prims.convert_element_type %1528, %int6_1882 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_1883 = torch.constant.int 2
    %1541 = torch.aten.pow.Tensor_Scalar %1540, %int2_1883 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    // Mean of squares over dim -1 with keepdim = true -> [1,24,4096,1].
    %int-1_1884 = torch.constant.int -1
    %1542 = torch.prim.ListConstruct %int-1_1884 : (!torch.int) -> !torch.list<int>
    %true_1885 = torch.constant.bool true
    %none_1886 = torch.constant.none
    %1543 = torch.aten.mean.dim %1541, %1542, %true_1885, %none_1886 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    // x * rsqrt(mean(x^2) + eps), eps ~= 1e-6.
    %float9.999990e-07_1887 = torch.constant.float 9.9999999999999995E-7
    %int1_1888 = torch.constant.int 1
    %1544 = torch.aten.add.Scalar %1543, %float9.999990e-07_1887, %int1_1888 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %1545 = torch.aten.rsqrt %1544 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %1546 = torch.aten.mul.Tensor %1540, %1545 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Cast back to f16 and multiply by the [128] scale parameter.
    %int5_1889 = torch.constant.int 5
    %1547 = torch.prims.convert_element_type %1546, %int5_1889 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.4.img_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.4.img_attn.norm.key_norm.scale : tensor<128xf16>
    %1548 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.img_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %1549 = torch.aten.mul.Tensor %1547, %1548 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // f16 -> f16 casts of the two normalized tensors: element type is unchanged (%1550 from %1539, %1551 from %1549).
    %int5_1890 = torch.constant.int 5
    %1550 = torch.prims.convert_element_type %1539, %int5_1890 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_1891 = torch.constant.int 5
    %1551 = torch.prims.convert_element_type %1549, %int5_1891 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Normalize %1455 ([1,512,3072]) over its last axis, then apply (1 + %1493) * x + %1492 and flatten to 2-D.
    // %1455 is defined before this excerpt; %1493 and %1492 are chunks 1 and 0 of the six-way split of %1491.
    %int6_1892 = torch.constant.int 6
    %1552 = torch.prims.convert_element_type %1455, %int6_1892 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // Variance and mean over dim 2 in f32, with correction = 0 and keepdim = true -> [1,512,1] each.
    %int2_1893 = torch.constant.int 2
    %1553 = torch.prim.ListConstruct %int2_1893 : (!torch.int) -> !torch.list<int>
    %int0_1894 = torch.constant.int 0
    %true_1895 = torch.constant.bool true
    %result0_1896, %result1_1897 = torch.aten.var_mean.correction %1552, %1553, %int0_1894, %true_1895 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(variance + eps), eps ~= 1e-6.
    %float9.999990e-07_1898 = torch.constant.float 9.9999999999999995E-7
    %int1_1899 = torch.constant.int 1
    %1554 = torch.aten.add.Scalar %result0_1896, %float9.999990e-07_1898, %int1_1899 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %1555 = torch.aten.rsqrt %1554 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // (x - mean) * rsqrt(...); the subtraction takes the f16 input and the f32 mean and yields f32.
    %int1_1900 = torch.constant.int 1
    %1556 = torch.aten.sub.Tensor %1455, %result1_1897, %int1_1900 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %1557 = torch.aten.mul.Tensor %1556, %1555 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    // Cast the normalized tensor back to f16.
    %int5_1901 = torch.constant.int 5
    %1558 = torch.prims.convert_element_type %1557, %int5_1901 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // (1 + %1493) * normalized, broadcasting [1,1,3072] over the 512 axis.
    %int1_1902 = torch.constant.int 1
    %int1_1903 = torch.constant.int 1
    %1559 = torch.aten.add.Scalar %1493, %int1_1902, %int1_1903 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %1560 = torch.aten.mul.Tensor %1559, %1558 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    // Add %1492 with alpha = 1.
    %int1_1904 = torch.constant.int 1
    %1561 = torch.aten.add.Tensor %1560, %1492, %int1_1904 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Drop the leading unit axis: [1,512,3072] -> [512,3072] for the matmul that follows.
    %int512_1905 = torch.constant.int 512
    %int3072_1906 = torch.constant.int 3072
    %1562 = torch.prim.ListConstruct %int512_1905, %int3072_1906 : (!torch.int, !torch.int) -> !torch.list<int>
    %1563 = torch.aten.view %1561, %1562 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Linear projection with double_blocks.4.txt_attn.qkv: [512,3072] -> [1,512,9216], accumulated in f32.
    // Load the [9216,3072] weight and transpose it to [3072,9216].
    %__auto.sampler.double_blocks.4.txt_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.4.txt_attn.qkv.weight : tensor<9216x3072xf16>
    %1564 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.txt_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_1907 = torch.constant.int 0
    %int1_1908 = torch.constant.int 1
    %1565 = torch.aten.transpose.int %1564, %int0_1907, %int1_1908 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    // Load the [9216] bias.
    %__auto.sampler.double_blocks.4.txt_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.4.txt_attn.qkv.bias : tensor<9216xf16>
    %1566 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.txt_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Cast bias, activation and transposed weight to f32 before the matmul.
    %int6_1909 = torch.constant.int 6
    %1567 = torch.prims.convert_element_type %1566, %int6_1909 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_1910 = torch.constant.int 6
    %1568 = torch.prims.convert_element_type %1563, %int6_1910 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_1911 = torch.constant.int 6
    %1569 = torch.prims.convert_element_type %1565, %int6_1911 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %1570 = torch.aten.mm %1568, %1569 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[512,9216],f32>
    // Both mul.Scalar ops multiply by the integer constant 1; then add the bias with alpha = 1.
    %int1_1912 = torch.constant.int 1
    %1571 = torch.aten.mul.Scalar %1570, %int1_1912 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int1_1913 = torch.constant.int 1
    %1572 = torch.aten.mul.Scalar %1567, %int1_1913 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_1914 = torch.constant.int 1
    %1573 = torch.aten.add.Tensor %1571, %1572, %int1_1914 : !torch.vtensor<[512,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    // Cast back to f16 and restore the leading unit axis.
    %int5_1915 = torch.constant.int 5
    %1574 = torch.prims.convert_element_type %1573, %int5_1915 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f16>
    %int1_1916 = torch.constant.int 1
    %int512_1917 = torch.constant.int 512
    %int9216_1918 = torch.constant.int 9216
    %1575 = torch.prim.ListConstruct %int1_1916, %int512_1917, %int9216_1918 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1576 = torch.aten.view %1574, %1575 : !torch.vtensor<[512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,9216],f16>
    // Split the txt projection output into three [1,24,512,128] tensors.
    // View the 9216 axis as (3, 24, 128).
    %int1_1919 = torch.constant.int 1
    %int512_1920 = torch.constant.int 512
    %int3_1921 = torch.constant.int 3
    %int24_1922 = torch.constant.int 24
    %int128_1923 = torch.constant.int 128
    %1577 = torch.prim.ListConstruct %int1_1919, %int512_1920, %int3_1921, %int24_1922, %int128_1923 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1578 = torch.aten.view %1576, %1577 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    // Permute with order (2,0,3,1,4): [1,512,3,24,128] -> [3,1,24,512,128].
    %int2_1924 = torch.constant.int 2
    %int0_1925 = torch.constant.int 0
    %int3_1926 = torch.constant.int 3
    %int1_1927 = torch.constant.int 1
    %int4_1928 = torch.constant.int 4
    %1579 = torch.prim.ListConstruct %int2_1924, %int0_1925, %int3_1926, %int1_1927, %int4_1928 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1580 = torch.aten.permute %1578, %1579 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // Index 0 along dim 0 (normalized below with query_norm.scale).
    %int0_1929 = torch.constant.int 0
    %int0_1930 = torch.constant.int 0
    %1581 = torch.aten.select.int %1580, %int0_1929, %int0_1930 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Index 1 along dim 0 (normalized below with key_norm.scale).
    %int0_1931 = torch.constant.int 0
    %int1_1932 = torch.constant.int 1
    %1582 = torch.aten.select.int %1580, %int0_1931, %int1_1932 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Index 2 along dim 0 (later passed as the third attention operand, without normalization).
    %int0_1933 = torch.constant.int 0
    %int2_1934 = torch.constant.int 2
    %1583 = torch.aten.select.int %1580, %int0_1933, %int2_1934 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Root-mean-square normalization of %1581 over the last (128) axis, then scaling by query_norm.scale.
    // Upcast to f32 and square elementwise.
    %int6_1935 = torch.constant.int 6
    %1584 = torch.prims.convert_element_type %1581, %int6_1935 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_1936 = torch.constant.int 2
    %1585 = torch.aten.pow.Tensor_Scalar %1584, %int2_1936 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    // Mean of squares over dim -1 with keepdim = true -> [1,24,512,1].
    %int-1_1937 = torch.constant.int -1
    %1586 = torch.prim.ListConstruct %int-1_1937 : (!torch.int) -> !torch.list<int>
    %true_1938 = torch.constant.bool true
    %none_1939 = torch.constant.none
    %1587 = torch.aten.mean.dim %1585, %1586, %true_1938, %none_1939 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    // x * rsqrt(mean(x^2) + eps), eps ~= 1e-6.
    %float9.999990e-07_1940 = torch.constant.float 9.9999999999999995E-7
    %int1_1941 = torch.constant.int 1
    %1588 = torch.aten.add.Scalar %1587, %float9.999990e-07_1940, %int1_1941 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %1589 = torch.aten.rsqrt %1588 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %1590 = torch.aten.mul.Tensor %1584, %1589 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    // Cast back to f16 and multiply by the [128] scale parameter.
    %int5_1942 = torch.constant.int 5
    %1591 = torch.prims.convert_element_type %1590, %int5_1942 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.4.txt_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.4.txt_attn.norm.query_norm.scale : tensor<128xf16>
    %1592 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.txt_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %1593 = torch.aten.mul.Tensor %1591, %1592 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // Root-mean-square normalization of %1582 over the last (128) axis, then scaling by key_norm.scale.
    // Upcast to f32 and square elementwise.
    %int6_1943 = torch.constant.int 6
    %1594 = torch.prims.convert_element_type %1582, %int6_1943 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_1944 = torch.constant.int 2
    %1595 = torch.aten.pow.Tensor_Scalar %1594, %int2_1944 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    // Mean of squares over dim -1 with keepdim = true -> [1,24,512,1].
    %int-1_1945 = torch.constant.int -1
    %1596 = torch.prim.ListConstruct %int-1_1945 : (!torch.int) -> !torch.list<int>
    %true_1946 = torch.constant.bool true
    %none_1947 = torch.constant.none
    %1597 = torch.aten.mean.dim %1595, %1596, %true_1946, %none_1947 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    // x * rsqrt(mean(x^2) + eps), eps ~= 1e-6.
    %float9.999990e-07_1948 = torch.constant.float 9.9999999999999995E-7
    %int1_1949 = torch.constant.int 1
    %1598 = torch.aten.add.Scalar %1597, %float9.999990e-07_1948, %int1_1949 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %1599 = torch.aten.rsqrt %1598 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %1600 = torch.aten.mul.Tensor %1594, %1599 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    // Cast back to f16 and multiply by the [128] scale parameter.
    %int5_1950 = torch.constant.int 5
    %1601 = torch.prims.convert_element_type %1600, %int5_1950 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.4.txt_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.4.txt_attn.norm.key_norm.scale : tensor<128xf16>
    %1602 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.txt_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %1603 = torch.aten.mul.Tensor %1601, %1602 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // f16 -> f16 casts of the two normalized tensors: element type is unchanged (%1604 from %1593, %1605 from %1603).
    %int5_1951 = torch.constant.int 5
    %1604 = torch.prims.convert_element_type %1593, %int5_1951 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int5_1952 = torch.constant.int 5
    %1605 = torch.prims.convert_element_type %1603, %int5_1952 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Concatenate the txt (512) and img (4096) tensors along dim 2, txt first, giving 4608 positions.
    // Query-normalized pair: %1604 (txt) then %1550 (img).
    %1606 = torch.prim.ListConstruct %1604, %1550 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_1953 = torch.constant.int 2
    %1607 = torch.aten.cat %1606, %int2_1953 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Key-normalized pair: %1605 (txt) then %1551 (img).
    %1608 = torch.prim.ListConstruct %1605, %1551 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_1954 = torch.constant.int 2
    %1609 = torch.aten.cat %1608, %int2_1954 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Un-normalized third slices: %1583 (txt) then %1529 (img).
    %1610 = torch.prim.ListConstruct %1583, %1529 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_1955 = torch.constant.int 2
    %1611 = torch.aten.cat %1610, %int2_1955 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Upcast the concatenated %1607 and %1609 to f32 and view the 128 axis as (64, 1, 2), i.e. 64 pairs.
    // The -1 entry in the target shape resolves to 64, as the result type shows.
    %int6_1956 = torch.constant.int 6
    %1612 = torch.prims.convert_element_type %1607, %int6_1956 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_1957 = torch.constant.int 1
    %int24_1958 = torch.constant.int 24
    %int4608_1959 = torch.constant.int 4608
    %int-1_1960 = torch.constant.int -1
    %int1_1961 = torch.constant.int 1
    %int2_1962 = torch.constant.int 2
    %1613 = torch.prim.ListConstruct %int1_1957, %int24_1958, %int4608_1959, %int-1_1960, %int1_1961, %int2_1962 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1614 = torch.aten.view %1612, %1613 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Same cast and view for %1609.
    %int6_1963 = torch.constant.int 6
    %1615 = torch.prims.convert_element_type %1609, %int6_1963 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_1964 = torch.constant.int 1
    %int24_1965 = torch.constant.int 24
    %int4608_1966 = torch.constant.int 4608
    %int-1_1967 = torch.constant.int -1
    %int1_1968 = torch.constant.int 1
    %int2_1969 = torch.constant.int 2
    %1616 = torch.prim.ListConstruct %int1_1964, %int24_1965, %int4608_1966, %int-1_1967, %int1_1968, %int2_1969 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1617 = torch.aten.view %1615, %1616 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Apply the per-position 2x2 coefficients in %211 to each pair of %1614 and %1617:
    //   out[..., i] = %211[..., i, 0] * x[..., 0] + %211[..., i, 1] * x[..., 1]
    // %211 ([1,1,4608,64,2,2]) is defined before this excerpt; presumably the rotary
    // position-embedding table - confirm at its definition. Its unit dim 1 broadcasts over the 24 axis.
    // First term for %1614: column 0 of %211 times element 0 of each pair.
    %int5_1970 = torch.constant.int 5
    %int0_1971 = torch.constant.int 0
    %1618 = torch.aten.select.int %211, %int5_1970, %int0_1971 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_1972 = torch.constant.int 5
    %int0_1973 = torch.constant.int 0
    %1619 = torch.aten.select.int %1614, %int5_1972, %int0_1973 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %1620 = torch.aten.mul.Tensor %1618, %1619 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second term for %1614: column 1 of %211 times element 1 of each pair.
    %int5_1974 = torch.constant.int 5
    %int1_1975 = torch.constant.int 1
    %1621 = torch.aten.select.int %211, %int5_1974, %int1_1975 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_1976 = torch.constant.int 5
    %int1_1977 = torch.constant.int 1
    %1622 = torch.aten.select.int %1614, %int5_1976, %int1_1977 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %1623 = torch.aten.mul.Tensor %1621, %1622 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two terms (alpha = 1).
    %int1_1978 = torch.constant.int 1
    %1624 = torch.aten.add.Tensor %1620, %1623, %int1_1978 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Same computation for %1617; the selects on %211 repeat %1618 and %1621 with identical operands.
    %int5_1979 = torch.constant.int 5
    %int0_1980 = torch.constant.int 0
    %1625 = torch.aten.select.int %211, %int5_1979, %int0_1980 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_1981 = torch.constant.int 5
    %int0_1982 = torch.constant.int 0
    %1626 = torch.aten.select.int %1617, %int5_1981, %int0_1982 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %1627 = torch.aten.mul.Tensor %1625, %1626 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_1983 = torch.constant.int 5
    %int1_1984 = torch.constant.int 1
    %1628 = torch.aten.select.int %211, %int5_1983, %int1_1984 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_1985 = torch.constant.int 5
    %int1_1986 = torch.constant.int 1
    %1629 = torch.aten.select.int %1617, %int5_1985, %int1_1986 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %1630 = torch.aten.mul.Tensor %1628, %1629 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_1987 = torch.constant.int 1
    %1631 = torch.aten.add.Tensor %1627, %1630, %int1_1987 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Merge the (64, 2) pair axes back into 128 and cast both results to f16.
    // %1624 -> %1634.
    %int1_1988 = torch.constant.int 1
    %int24_1989 = torch.constant.int 24
    %int4608_1990 = torch.constant.int 4608
    %int128_1991 = torch.constant.int 128
    %1632 = torch.prim.ListConstruct %int1_1988, %int24_1989, %int4608_1990, %int128_1991 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1633 = torch.aten.view %1624, %1632 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_1992 = torch.constant.int 5
    %1634 = torch.prims.convert_element_type %1633, %int5_1992 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // %1631 -> %1637.
    %int1_1993 = torch.constant.int 1
    %int24_1994 = torch.constant.int 24
    %int4608_1995 = torch.constant.int 4608
    %int128_1996 = torch.constant.int 128
    %1635 = torch.prim.ListConstruct %int1_1993, %int24_1994, %int4608_1995, %int128_1996 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1636 = torch.aten.view %1631, %1635 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_1997 = torch.constant.int 5
    %1637 = torch.prims.convert_element_type %1636, %int5_1997 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled dot-product attention over the 4608 joint positions, followed by a merge of the 24 x 128 axes.
    // The operands after the three tensors are 0.0, false, none, none; per the ATen signature these are
    // dropout_p, is_causal, attn_mask and scale - confirm against the PyTorch op schema.
    %float0.000000e00_1998 = torch.constant.float 0.000000e+00
    %false_1999 = torch.constant.bool false
    %none_2000 = torch.constant.none
    %none_2001 = torch.constant.none
    // Inputs: %1634, %1637 and %1611. The second result ([1,24,4608] f32) is not used within this excerpt.
    %1638:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%1634, %1637, %1611, %float0.000000e00_1998, %false_1999, %none_2000, %none_2001) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Permute with order (0,2,1,3): [1,24,4608,128] -> [1,4608,24,128].
    %int0_2002 = torch.constant.int 0
    %int2_2003 = torch.constant.int 2
    %int1_2004 = torch.constant.int 1
    %int3_2005 = torch.constant.int 3
    %1639 = torch.prim.ListConstruct %int0_2002, %int2_2003, %int1_2004, %int3_2005 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1640 = torch.aten.permute %1638#0, %1639 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Flatten (24, 128) into 3072 -> [1,4608,3072].
    %int1_2006 = torch.constant.int 1
    %int4608_2007 = torch.constant.int 4608
    %int3072_2008 = torch.constant.int 3072
    %1641 = torch.prim.ListConstruct %int1_2006, %int4608_2007, %int3072_2008 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1642 = torch.aten.view %1640, %1641 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Split the attention output %1642 along dim 1 into two token ranges:
    //   %1644 = tokens [0, 512)     -> later fed to txt_attn.proj
    //   %1646 = tokens [512, 4608)  -> later fed to img_attn.proj
    // The dim-0 slices with end = INT64_MAX select the full range and leave the shape unchanged.
    %int0_2009 = torch.constant.int 0
    %int0_2010 = torch.constant.int 0
    %int9223372036854775807_2011 = torch.constant.int 9223372036854775807
    %int1_2012 = torch.constant.int 1
    %1643 = torch.aten.slice.Tensor %1642, %int0_2009, %int0_2010, %int9223372036854775807_2011, %int1_2012 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // First 512 tokens.
    %int1_2013 = torch.constant.int 1
    %int0_2014 = torch.constant.int 0
    %int512_2015 = torch.constant.int 512
    %int1_2016 = torch.constant.int 1
    %1644 = torch.aten.slice.Tensor %1643, %int1_2013, %int0_2014, %int512_2015, %int1_2016 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %int0_2017 = torch.constant.int 0
    %int0_2018 = torch.constant.int 0
    %int9223372036854775807_2019 = torch.constant.int 9223372036854775807
    %int1_2020 = torch.constant.int 1
    %1645 = torch.aten.slice.Tensor %1642, %int0_2017, %int0_2018, %int9223372036854775807_2019, %int1_2020 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Remaining 4096 tokens (start 512, open-ended).
    %int1_2021 = torch.constant.int 1
    %int512_2022 = torch.constant.int 512
    %int9223372036854775807_2023 = torch.constant.int 9223372036854775807
    %int1_2024 = torch.constant.int 1
    %1646 = torch.aten.slice.Tensor %1645, %int1_2021, %int512_2022, %int9223372036854775807_2023, %int1_2024 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // double_blocks.4 img_attn.proj: linear projection of the 4096-token attention
    // slice %1646, followed by a gated residual add into %1395.
    // Operands are upcast to f32 (dtype code 6) for the matmul + bias add and the
    // result is cast back to f16 (dtype code 5).
    %int4096_2025 = torch.constant.int 4096
    %int3072_2026 = torch.constant.int 3072
    %1647 = torch.prim.ListConstruct %int4096_2025, %int3072_2026 : (!torch.int, !torch.int) -> !torch.list<int>
    %1648 = torch.aten.view %1646, %1647 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.4.img_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.4.img_attn.proj.weight : tensor<3072x3072xf16>
    %1649 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.img_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    // Weight is transposed before use as the right-hand matmul operand.
    %int0_2027 = torch.constant.int 0
    %int1_2028 = torch.constant.int 1
    %1650 = torch.aten.transpose.int %1649, %int0_2027, %int1_2028 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.4.img_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.4.img_attn.proj.bias : tensor<3072xf16>
    %1651 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.img_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_2029 = torch.constant.int 6
    %1652 = torch.prims.convert_element_type %1651, %int6_2029 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_2030 = torch.constant.int 6
    %1653 = torch.prims.convert_element_type %1648, %int6_2030 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_2031 = torch.constant.int 6
    %1654 = torch.prims.convert_element_type %1650, %int6_2031 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %1655 = torch.aten.mm %1653, %1654 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Multiplications by the integer 1 leave the values unchanged
    // (presumably residue of an addmm alpha/beta decomposition -- confirm).
    %int1_2032 = torch.constant.int 1
    %1656 = torch.aten.mul.Scalar %1655, %int1_2032 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_2033 = torch.constant.int 1
    %1657 = torch.aten.mul.Scalar %1652, %int1_2033 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_2034 = torch.constant.int 1
    %1658 = torch.aten.add.Tensor %1656, %1657, %int1_2034 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_2035 = torch.constant.int 5
    %1659 = torch.prims.convert_element_type %1658, %int5_2035 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    // Restore the leading batch dim of size 1.
    %int1_2036 = torch.constant.int 1
    %int4096_2037 = torch.constant.int 4096
    %int3072_2038 = torch.constant.int 3072
    %1660 = torch.prim.ListConstruct %int1_2036, %int4096_2037, %int3072_2038 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1661 = torch.aten.view %1659, %1660 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Gated residual: %1663 = %1395 + %1473 * proj. %1473 ([1,1,3072]) broadcasts over tokens.
    // NOTE(review): %1473 is presumably the gate chunk of this block's img modulation
    // and %1395 the incoming img hidden state; both are defined outside this view -- confirm.
    %1662 = torch.aten.mul.Tensor %1473, %1661 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_2039 = torch.constant.int 1
    %1663 = torch.aten.add.Tensor %1395, %1662, %int1_2039 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Normalize %1663 over its last dim and apply a scale/shift modulation:
    //   %1673 = (1 + %1475) * normalize(%1663) + %1474
    // The statistics are computed in f32; the normalized value is cast back to f16.
    // NOTE(review): %1475/%1474 are presumably the scale/shift chunks of this block's
    // img modulation; they are defined outside this view -- confirm.
    %int1_2040 = torch.constant.int 1
    %int1_2041 = torch.constant.int 1
    %1664 = torch.aten.add.Scalar %1475, %int1_2040, %int1_2041 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int6_2042 = torch.constant.int 6
    %1665 = torch.prims.convert_element_type %1663, %int6_2042 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // Variance (correction = 0) and mean over dim 2, keeping the reduced dim.
    %int2_2043 = torch.constant.int 2
    %1666 = torch.prim.ListConstruct %int2_2043 : (!torch.int) -> !torch.list<int>
    %int0_2044 = torch.constant.int 0
    %true_2045 = torch.constant.bool true
    %result0_2046, %result1_2047 = torch.aten.var_mean.correction %1665, %1666, %int0_2044, %true_2045 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(var + eps) with eps ~= 1e-6.
    %float9.999990e-07_2048 = torch.constant.float 9.9999999999999995E-7
    %int1_2049 = torch.constant.int 1
    %1667 = torch.aten.add.Scalar %result0_2046, %float9.999990e-07_2048, %int1_2049 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %1668 = torch.aten.rsqrt %1667 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // (x - mean) * rsqrt(var + eps); the f16 input minus the f32 mean yields f32.
    %int1_2050 = torch.constant.int 1
    %1669 = torch.aten.sub.Tensor %1663, %result1_2047, %int1_2050 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %1670 = torch.aten.mul.Tensor %1669, %1668 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    %int5_2051 = torch.constant.int 5
    %1671 = torch.prims.convert_element_type %1670, %int5_2051 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Apply the modulation: multiply by (1 + %1475), then add %1474.
    %1672 = torch.aten.mul.Tensor %1664, %1671 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_2052 = torch.constant.int 1
    %1673 = torch.aten.add.Tensor %1672, %1474, %int1_2052 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // double_blocks.4 img_mlp.0: linear 3072 -> 12288 on the modulated input %1673,
    // followed by tanh-approximated GELU. The matmul + bias add run in f32
    // (dtype code 6) and the result is cast back to f16 (dtype code 5).
    %int4096_2053 = torch.constant.int 4096
    %int3072_2054 = torch.constant.int 3072
    %1674 = torch.prim.ListConstruct %int4096_2053, %int3072_2054 : (!torch.int, !torch.int) -> !torch.list<int>
    %1675 = torch.aten.view %1673, %1674 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.4.img_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.4.img_mlp.0.weight : tensor<12288x3072xf16>
    %1676 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.img_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    // Transpose the [12288,3072] weight to [3072,12288] for the matmul.
    %int0_2055 = torch.constant.int 0
    %int1_2056 = torch.constant.int 1
    %1677 = torch.aten.transpose.int %1676, %int0_2055, %int1_2056 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.4.img_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.4.img_mlp.0.bias : tensor<12288xf16>
    %1678 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.img_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    %int6_2057 = torch.constant.int 6
    %1679 = torch.prims.convert_element_type %1678, %int6_2057 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_2058 = torch.constant.int 6
    %1680 = torch.prims.convert_element_type %1675, %int6_2058 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_2059 = torch.constant.int 6
    %1681 = torch.prims.convert_element_type %1677, %int6_2059 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %1682 = torch.aten.mm %1680, %1681 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[4096,12288],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    %int1_2060 = torch.constant.int 1
    %1683 = torch.aten.mul.Scalar %1682, %int1_2060 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int1_2061 = torch.constant.int 1
    %1684 = torch.aten.mul.Scalar %1679, %int1_2061 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_2062 = torch.constant.int 1
    %1685 = torch.aten.add.Tensor %1683, %1684, %int1_2062 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int5_2063 = torch.constant.int 5
    %1686 = torch.prims.convert_element_type %1685, %int5_2063 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f16>
    %int1_2064 = torch.constant.int 1
    %int4096_2065 = torch.constant.int 4096
    %int12288_2066 = torch.constant.int 12288
    %1687 = torch.prim.ListConstruct %int1_2064, %int4096_2065, %int12288_2066 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1688 = torch.aten.view %1686, %1687 : !torch.vtensor<[4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,12288],f16>
    // GELU with the "tanh" approximation, evaluated on the f16 tensor.
    %str_2067 = torch.constant.str "tanh"
    %1689 = torch.aten.gelu %1688, %str_2067 : !torch.vtensor<[1,4096,12288],f16>, !torch.str -> !torch.vtensor<[1,4096,12288],f16>
    // Flatten back to 2-D for the next matmul.
    %int4096_2068 = torch.constant.int 4096
    %int12288_2069 = torch.constant.int 12288
    %1690 = torch.prim.ListConstruct %int4096_2068, %int12288_2069 : (!torch.int, !torch.int) -> !torch.list<int>
    %1691 = torch.aten.view %1689, %1690 : !torch.vtensor<[1,4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[4096,12288],f16>
    // double_blocks.4 img_mlp.2: linear 12288 -> 3072 on the GELU output %1691,
    // then a gated residual add into %1663. The matmul + bias add run in f32 and
    // the result is cast back to f16.
    %__auto.sampler.double_blocks.4.img_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.4.img_mlp.2.weight : tensor<3072x12288xf16>
    %1692 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.img_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_2070 = torch.constant.int 0
    %int1_2071 = torch.constant.int 1
    %1693 = torch.aten.transpose.int %1692, %int0_2070, %int1_2071 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.4.img_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.4.img_mlp.2.bias : tensor<3072xf16>
    %1694 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.img_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_2072 = torch.constant.int 6
    %1695 = torch.prims.convert_element_type %1694, %int6_2072 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_2073 = torch.constant.int 6
    %1696 = torch.prims.convert_element_type %1691, %int6_2073 : !torch.vtensor<[4096,12288],f16>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int6_2074 = torch.constant.int 6
    %1697 = torch.prims.convert_element_type %1693, %int6_2074 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %1698 = torch.aten.mm %1696, %1697 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    %int1_2075 = torch.constant.int 1
    %1699 = torch.aten.mul.Scalar %1698, %int1_2075 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_2076 = torch.constant.int 1
    %1700 = torch.aten.mul.Scalar %1695, %int1_2076 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_2077 = torch.constant.int 1
    %1701 = torch.aten.add.Tensor %1699, %1700, %int1_2077 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_2078 = torch.constant.int 5
    %1702 = torch.prims.convert_element_type %1701, %int5_2078 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_2079 = torch.constant.int 1
    %int4096_2080 = torch.constant.int 4096
    %int3072_2081 = torch.constant.int 3072
    %1703 = torch.prim.ListConstruct %int1_2079, %int4096_2080, %int3072_2081 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1704 = torch.aten.view %1702, %1703 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Gated residual: %1706 = %1663 + %1476 * mlp_out.
    // NOTE(review): %1476 is presumably the second gate chunk of this block's img
    // modulation; it is defined outside this view -- confirm.
    %1705 = torch.aten.mul.Tensor %1476, %1704 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_2082 = torch.constant.int 1
    %1706 = torch.aten.add.Tensor %1663, %1705, %int1_2082 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // double_blocks.4 txt_attn.proj: linear projection of the 512-token attention
    // slice %1644, followed by a gated residual add into %1455.
    // The matmul + bias add run in f32 (dtype code 6); the result is cast back to
    // f16 (dtype code 5).
    %int512_2083 = torch.constant.int 512
    %int3072_2084 = torch.constant.int 3072
    %1707 = torch.prim.ListConstruct %int512_2083, %int3072_2084 : (!torch.int, !torch.int) -> !torch.list<int>
    %1708 = torch.aten.view %1644, %1707 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.4.txt_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.4.txt_attn.proj.weight : tensor<3072x3072xf16>
    %1709 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.txt_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    // Weight is transposed before use as the right-hand matmul operand.
    %int0_2085 = torch.constant.int 0
    %int1_2086 = torch.constant.int 1
    %1710 = torch.aten.transpose.int %1709, %int0_2085, %int1_2086 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.4.txt_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.4.txt_attn.proj.bias : tensor<3072xf16>
    %1711 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.txt_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_2087 = torch.constant.int 6
    %1712 = torch.prims.convert_element_type %1711, %int6_2087 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_2088 = torch.constant.int 6
    %1713 = torch.prims.convert_element_type %1708, %int6_2088 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_2089 = torch.constant.int 6
    %1714 = torch.prims.convert_element_type %1710, %int6_2089 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %1715 = torch.aten.mm %1713, %1714 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    %int1_2090 = torch.constant.int 1
    %1716 = torch.aten.mul.Scalar %1715, %int1_2090 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_2091 = torch.constant.int 1
    %1717 = torch.aten.mul.Scalar %1712, %int1_2091 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_2092 = torch.constant.int 1
    %1718 = torch.aten.add.Tensor %1716, %1717, %int1_2092 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_2093 = torch.constant.int 5
    %1719 = torch.prims.convert_element_type %1718, %int5_2093 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    // Restore the leading batch dim of size 1.
    %int1_2094 = torch.constant.int 1
    %int512_2095 = torch.constant.int 512
    %int3072_2096 = torch.constant.int 3072
    %1720 = torch.prim.ListConstruct %int1_2094, %int512_2095, %int3072_2096 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1721 = torch.aten.view %1719, %1720 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Gated residual: %1723 = %1455 + %1494 * proj. %1494 ([1,1,3072]) broadcasts over tokens.
    // NOTE(review): %1494 is presumably the gate chunk of this block's txt modulation
    // and %1455 the incoming txt hidden state; both are defined outside this view -- confirm.
    %1722 = torch.aten.mul.Tensor %1494, %1721 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_2097 = torch.constant.int 1
    %1723 = torch.aten.add.Tensor %1455, %1722, %int1_2097 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Normalize %1723 over its last dim and apply a scale/shift modulation:
    //   %1733 = (1 + %1496) * normalize(%1723) + %1495
    // The statistics are computed in f32; the normalized value is cast back to f16.
    // NOTE(review): %1496/%1495 are presumably the scale/shift chunks of this block's
    // txt modulation; they are defined outside this view -- confirm.
    %int1_2098 = torch.constant.int 1
    %int1_2099 = torch.constant.int 1
    %1724 = torch.aten.add.Scalar %1496, %int1_2098, %int1_2099 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int6_2100 = torch.constant.int 6
    %1725 = torch.prims.convert_element_type %1723, %int6_2100 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // Variance (correction = 0) and mean over dim 2, keeping the reduced dim.
    %int2_2101 = torch.constant.int 2
    %1726 = torch.prim.ListConstruct %int2_2101 : (!torch.int) -> !torch.list<int>
    %int0_2102 = torch.constant.int 0
    %true_2103 = torch.constant.bool true
    %result0_2104, %result1_2105 = torch.aten.var_mean.correction %1725, %1726, %int0_2102, %true_2103 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(var + eps) with eps ~= 1e-6.
    %float9.999990e-07_2106 = torch.constant.float 9.9999999999999995E-7
    %int1_2107 = torch.constant.int 1
    %1727 = torch.aten.add.Scalar %result0_2104, %float9.999990e-07_2106, %int1_2107 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %1728 = torch.aten.rsqrt %1727 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // (x - mean) * rsqrt(var + eps); the f16 input minus the f32 mean yields f32.
    %int1_2108 = torch.constant.int 1
    %1729 = torch.aten.sub.Tensor %1723, %result1_2105, %int1_2108 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %1730 = torch.aten.mul.Tensor %1729, %1728 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_2109 = torch.constant.int 5
    %1731 = torch.prims.convert_element_type %1730, %int5_2109 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Apply the modulation: multiply by (1 + %1496), then add %1495.
    %1732 = torch.aten.mul.Tensor %1724, %1731 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_2110 = torch.constant.int 1
    %1733 = torch.aten.add.Tensor %1732, %1495, %int1_2110 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // double_blocks.4 txt_mlp.0: linear 3072 -> 12288 on the modulated input %1733,
    // followed by tanh-approximated GELU. The matmul + bias add run in f32
    // (dtype code 6) and the result is cast back to f16 (dtype code 5).
    %int512_2111 = torch.constant.int 512
    %int3072_2112 = torch.constant.int 3072
    %1734 = torch.prim.ListConstruct %int512_2111, %int3072_2112 : (!torch.int, !torch.int) -> !torch.list<int>
    %1735 = torch.aten.view %1733, %1734 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.4.txt_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.4.txt_mlp.0.weight : tensor<12288x3072xf16>
    %1736 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.txt_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    // Transpose the [12288,3072] weight to [3072,12288] for the matmul.
    %int0_2113 = torch.constant.int 0
    %int1_2114 = torch.constant.int 1
    %1737 = torch.aten.transpose.int %1736, %int0_2113, %int1_2114 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.4.txt_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.4.txt_mlp.0.bias : tensor<12288xf16>
    %1738 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.txt_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    %int6_2115 = torch.constant.int 6
    %1739 = torch.prims.convert_element_type %1738, %int6_2115 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_2116 = torch.constant.int 6
    %1740 = torch.prims.convert_element_type %1735, %int6_2116 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_2117 = torch.constant.int 6
    %1741 = torch.prims.convert_element_type %1737, %int6_2117 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %1742 = torch.aten.mm %1740, %1741 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[512,12288],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    %int1_2118 = torch.constant.int 1
    %1743 = torch.aten.mul.Scalar %1742, %int1_2118 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int1_2119 = torch.constant.int 1
    %1744 = torch.aten.mul.Scalar %1739, %int1_2119 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_2120 = torch.constant.int 1
    %1745 = torch.aten.add.Tensor %1743, %1744, %int1_2120 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int5_2121 = torch.constant.int 5
    %1746 = torch.prims.convert_element_type %1745, %int5_2121 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f16>
    %int1_2122 = torch.constant.int 1
    %int512_2123 = torch.constant.int 512
    %int12288_2124 = torch.constant.int 12288
    %1747 = torch.prim.ListConstruct %int1_2122, %int512_2123, %int12288_2124 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1748 = torch.aten.view %1746, %1747 : !torch.vtensor<[512,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,512,12288],f16>
    // GELU with the "tanh" approximation, evaluated on the f16 tensor.
    %str_2125 = torch.constant.str "tanh"
    %1749 = torch.aten.gelu %1748, %str_2125 : !torch.vtensor<[1,512,12288],f16>, !torch.str -> !torch.vtensor<[1,512,12288],f16>
    // Flatten back to 2-D for the next matmul.
    %int512_2126 = torch.constant.int 512
    %int12288_2127 = torch.constant.int 12288
    %1750 = torch.prim.ListConstruct %int512_2126, %int12288_2127 : (!torch.int, !torch.int) -> !torch.list<int>
    %1751 = torch.aten.view %1749, %1750 : !torch.vtensor<[1,512,12288],f16>, !torch.list<int> -> !torch.vtensor<[512,12288],f16>
    // double_blocks.4 txt_mlp.2: linear 12288 -> 3072 on the GELU output %1751,
    // then a gated residual add into %1723. The matmul + bias add run in f32 and
    // the result is cast back to f16.
    %__auto.sampler.double_blocks.4.txt_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.4.txt_mlp.2.weight : tensor<3072x12288xf16>
    %1752 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.txt_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_2128 = torch.constant.int 0
    %int1_2129 = torch.constant.int 1
    %1753 = torch.aten.transpose.int %1752, %int0_2128, %int1_2129 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.4.txt_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.4.txt_mlp.2.bias : tensor<3072xf16>
    %1754 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.4.txt_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_2130 = torch.constant.int 6
    %1755 = torch.prims.convert_element_type %1754, %int6_2130 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_2131 = torch.constant.int 6
    %1756 = torch.prims.convert_element_type %1751, %int6_2131 : !torch.vtensor<[512,12288],f16>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int6_2132 = torch.constant.int 6
    %1757 = torch.prims.convert_element_type %1753, %int6_2132 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %1758 = torch.aten.mm %1756, %1757 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    %int1_2133 = torch.constant.int 1
    %1759 = torch.aten.mul.Scalar %1758, %int1_2133 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_2134 = torch.constant.int 1
    %1760 = torch.aten.mul.Scalar %1755, %int1_2134 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_2135 = torch.constant.int 1
    %1761 = torch.aten.add.Tensor %1759, %1760, %int1_2135 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_2136 = torch.constant.int 5
    %1762 = torch.prims.convert_element_type %1761, %int5_2136 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_2137 = torch.constant.int 1
    %int512_2138 = torch.constant.int 512
    %int3072_2139 = torch.constant.int 3072
    %1763 = torch.prim.ListConstruct %int1_2137, %int512_2138, %int3072_2139 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1764 = torch.aten.view %1762, %1763 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Gated residual: %1766 = %1723 + %1497 * mlp_out.
    // NOTE(review): %1497 is presumably the second gate chunk of this block's txt
    // modulation; it is defined outside this view -- confirm.
    %1765 = torch.aten.mul.Tensor %1497, %1764 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_2140 = torch.constant.int 1
    %1766 = torch.aten.add.Tensor %1723, %1765, %int1_2140 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // double_blocks.5 img_mod.lin: SiLU of %119 followed by a linear 3072 -> 18432.
    // The matmul + bias add run in f32 (dtype code 6); the [1,18432] result is
    // cast back to f16 (dtype code 5).
    // NOTE(review): %119 is presumably the shared conditioning vector; it is
    // defined outside this view -- confirm.
    %1767 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.double_blocks.5.img_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.5.img_mod.lin.weight : tensor<18432x3072xf16>
    %1768 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.img_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    // Transpose the [18432,3072] weight to [3072,18432] for the matmul.
    %int0_2141 = torch.constant.int 0
    %int1_2142 = torch.constant.int 1
    %1769 = torch.aten.transpose.int %1768, %int0_2141, %int1_2142 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.5.img_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.5.img_mod.lin.bias : tensor<18432xf16>
    %1770 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.img_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    %int6_2143 = torch.constant.int 6
    %1771 = torch.prims.convert_element_type %1770, %int6_2143 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_2144 = torch.constant.int 6
    %1772 = torch.prims.convert_element_type %1767, %int6_2144 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_2145 = torch.constant.int 6
    %1773 = torch.prims.convert_element_type %1769, %int6_2145 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %1774 = torch.aten.mm %1772, %1773 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    %int1_2146 = torch.constant.int 1
    %1775 = torch.aten.mul.Scalar %1774, %int1_2146 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_2147 = torch.constant.int 1
    %1776 = torch.aten.mul.Scalar %1771, %int1_2147 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_2148 = torch.constant.int 1
    %1777 = torch.aten.add.Tensor %1775, %1776, %int1_2148 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_2149 = torch.constant.int 5
    %1778 = torch.prims.convert_element_type %1777, %int5_2149 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Reshape the [1,18432] img_mod.lin output to [1,1,18432] and cut the last dim
    // into six consecutive 3072-wide chunks %1782..%1787.
    // NOTE(review): the chunks are presumably (shift, scale, gate) for two modulation
    // sets, in that order; their consumers are outside this view -- confirm.
    // Full-range slice on dim 0 (end = INT64_MAX); the shape is unchanged.
    %int0_2150 = torch.constant.int 0
    %int0_2151 = torch.constant.int 0
    %int9223372036854775807_2152 = torch.constant.int 9223372036854775807
    %int1_2153 = torch.constant.int 1
    %1779 = torch.aten.slice.Tensor %1778, %int0_2150, %int0_2151, %int9223372036854775807_2152, %int1_2153 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a size-1 dim at position 1 so the chunks broadcast against [1,N,3072] tensors.
    %int1_2154 = torch.constant.int 1
    %1780 = torch.aten.unsqueeze %1779, %int1_2154 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Full-range slice on dim 2; the shape is unchanged.
    %int2_2155 = torch.constant.int 2
    %int0_2156 = torch.constant.int 0
    %int9223372036854775807_2157 = torch.constant.int 9223372036854775807
    %int1_2158 = torch.constant.int 1
    %1781 = torch.aten.slice.Tensor %1780, %int2_2155, %int0_2156, %int9223372036854775807_2157, %int1_2158 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Chunk 0: [0, 3072) of the last dim.
    %int-1_2159 = torch.constant.int -1
    %int0_2160 = torch.constant.int 0
    %int3072_2161 = torch.constant.int 3072
    %int1_2162 = torch.constant.int 1
    %1782 = torch.aten.slice.Tensor %1781, %int-1_2159, %int0_2160, %int3072_2161, %int1_2162 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1: [3072, 6144).
    %int-1_2163 = torch.constant.int -1
    %int3072_2164 = torch.constant.int 3072
    %int6144_2165 = torch.constant.int 6144
    %int1_2166 = torch.constant.int 1
    %1783 = torch.aten.slice.Tensor %1781, %int-1_2163, %int3072_2164, %int6144_2165, %int1_2166 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2: [6144, 9216).
    %int-1_2167 = torch.constant.int -1
    %int6144_2168 = torch.constant.int 6144
    %int9216_2169 = torch.constant.int 9216
    %int1_2170 = torch.constant.int 1
    %1784 = torch.aten.slice.Tensor %1781, %int-1_2167, %int6144_2168, %int9216_2169, %int1_2170 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 3: [9216, 12288).
    %int-1_2171 = torch.constant.int -1
    %int9216_2172 = torch.constant.int 9216
    %int12288_2173 = torch.constant.int 12288
    %int1_2174 = torch.constant.int 1
    %1785 = torch.aten.slice.Tensor %1781, %int-1_2171, %int9216_2172, %int12288_2173, %int1_2174 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 4: [12288, 15360).
    %int-1_2175 = torch.constant.int -1
    %int12288_2176 = torch.constant.int 12288
    %int15360_2177 = torch.constant.int 15360
    %int1_2178 = torch.constant.int 1
    %1786 = torch.aten.slice.Tensor %1781, %int-1_2175, %int12288_2176, %int15360_2177, %int1_2178 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 5: [15360, 18432).
    %int-1_2179 = torch.constant.int -1
    %int15360_2180 = torch.constant.int 15360
    %int18432_2181 = torch.constant.int 18432
    %int1_2182 = torch.constant.int 1
    %1787 = torch.aten.slice.Tensor %1781, %int-1_2179, %int15360_2180, %int18432_2181, %int1_2182 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // double_blocks.5.txt_mod.lin: apply SiLU to %119 ([1,3072], f16), then a linear layer
    // (matmul with the transposed [18432,3072] weight plus the [18432] bias) evaluated in f32,
    // cast back to f16 and reshaped to [1,1,18432] (%1802).
    // NOTE(review): %119 is defined above this section; presumably the conditioning vector — confirm.
    %1788 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.double_blocks.5.txt_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.5.txt_mod.lin.weight : tensor<18432x3072xf16>
    %1789 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.txt_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    // Transpose the weight to [3072,18432] so it can be the right-hand operand of mm.
    %int0_2183 = torch.constant.int 0
    %int1_2184 = torch.constant.int 1
    %1790 = torch.aten.transpose.int %1789, %int0_2183, %int1_2184 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.5.txt_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.5.txt_mod.lin.bias : tensor<18432xf16>
    %1791 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.txt_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Dtype code 6 produces f32 results here (see result types): bias, activation and weight are all upcast.
    %int6_2185 = torch.constant.int 6
    %1792 = torch.prims.convert_element_type %1791, %int6_2185 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_2186 = torch.constant.int 6
    %1793 = torch.prims.convert_element_type %1788, %int6_2186 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_2187 = torch.constant.int 6
    %1794 = torch.prims.convert_element_type %1790, %int6_2187 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %1795 = torch.aten.mm %1793, %1794 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Both operands are multiplied by the integer constant 1 before the add.
    // NOTE(review): presumably left over from an addmm-style alpha/beta decomposition — confirm.
    %int1_2188 = torch.constant.int 1
    %1796 = torch.aten.mul.Scalar %1795, %int1_2188 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_2189 = torch.constant.int 1
    %1797 = torch.aten.mul.Scalar %1792, %int1_2189 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    // Bias add; the [18432] bias broadcasts against the [1,18432] product.
    %int1_2190 = torch.constant.int 1
    %1798 = torch.aten.add.Tensor %1796, %1797, %int1_2190 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    // Dtype code 5 produces an f16 result here.
    %int5_2191 = torch.constant.int 5
    %1799 = torch.prims.convert_element_type %1798, %int5_2191 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Slice on dim 0 with start 0, end 9223372036854775807 and step 1; the result shape is unchanged.
    %int0_2192 = torch.constant.int 0
    %int0_2193 = torch.constant.int 0
    %int9223372036854775807_2194 = torch.constant.int 9223372036854775807
    %int1_2195 = torch.constant.int 1
    %1800 = torch.aten.slice.Tensor %1799, %int0_2192, %int0_2193, %int9223372036854775807_2194, %int1_2195 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a size-1 dimension at position 1: [1,18432] -> [1,1,18432].
    %int1_2196 = torch.constant.int 1
    %1801 = torch.aten.unsqueeze %1800, %int1_2196 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Slice on dim 2 with start 0, end 9223372036854775807 and step 1; the result shape is again unchanged.
    %int2_2197 = torch.constant.int 2
    %int0_2198 = torch.constant.int 0
    %int9223372036854775807_2199 = torch.constant.int 9223372036854775807
    %int1_2200 = torch.constant.int 1
    %1802 = torch.aten.slice.Tensor %1801, %int2_2197, %int0_2198, %int9223372036854775807_2199, %int1_2200 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Split %1802 ([1,1,18432], f16) along its last dim (dim -1) into six consecutive 3072-wide
    // slices with step 1: %1803 = [0,3072), %1804 = [3072,6144), %1805 = [6144,9216),
    // %1806 = [9216,12288), %1807 = [12288,15360), %1808 = [15360,18432).
    // Within the visible range %1803 is used as an additive term and %1804 as a (1 + x) multiplier;
    // %1805..%1808 are not used in this section (NOTE(review): presumably consumed further below — confirm).
    %int-1_2201 = torch.constant.int -1
    %int0_2202 = torch.constant.int 0
    %int3072_2203 = torch.constant.int 3072
    %int1_2204 = torch.constant.int 1
    %1803 = torch.aten.slice.Tensor %1802, %int-1_2201, %int0_2202, %int3072_2203, %int1_2204 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_2205 = torch.constant.int -1
    %int3072_2206 = torch.constant.int 3072
    %int6144_2207 = torch.constant.int 6144
    %int1_2208 = torch.constant.int 1
    %1804 = torch.aten.slice.Tensor %1802, %int-1_2205, %int3072_2206, %int6144_2207, %int1_2208 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_2209 = torch.constant.int -1
    %int6144_2210 = torch.constant.int 6144
    %int9216_2211 = torch.constant.int 9216
    %int1_2212 = torch.constant.int 1
    %1805 = torch.aten.slice.Tensor %1802, %int-1_2209, %int6144_2210, %int9216_2211, %int1_2212 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_2213 = torch.constant.int -1
    %int9216_2214 = torch.constant.int 9216
    %int12288_2215 = torch.constant.int 12288
    %int1_2216 = torch.constant.int 1
    %1806 = torch.aten.slice.Tensor %1802, %int-1_2213, %int9216_2214, %int12288_2215, %int1_2216 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_2217 = torch.constant.int -1
    %int12288_2218 = torch.constant.int 12288
    %int15360_2219 = torch.constant.int 15360
    %int1_2220 = torch.constant.int 1
    %1807 = torch.aten.slice.Tensor %1802, %int-1_2217, %int12288_2218, %int15360_2219, %int1_2220 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_2221 = torch.constant.int -1
    %int15360_2222 = torch.constant.int 15360
    %int18432_2223 = torch.constant.int 18432
    %int1_2224 = torch.constant.int 1
    %1808 = torch.aten.slice.Tensor %1802, %int-1_2221, %int15360_2222, %int18432_2223, %int1_2224 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %1706 ([1,4096,3072]) over dim 2 and modulate the result:
    //   %1815 = f16((%1706 - mean) * rsqrt(var + eps))
    //   %1818 = (1 + %1783) * %1815 + %1782
    // No learned weight or bias is applied in the normalization itself.
    // NOTE(review): %1706 and %1782 are defined above this section; presumably the image-stream
    // activations and the first (shift) slice of %1781 — confirm.
    %int6_2225 = torch.constant.int 6
    %1809 = torch.prims.convert_element_type %1706, %int6_2225 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // Variance (%result0_2229) and mean (%result1_2230) over dim 2 with correction 0 and keepdim = true.
    %int2_2226 = torch.constant.int 2
    %1810 = torch.prim.ListConstruct %int2_2226 : (!torch.int) -> !torch.list<int>
    %int0_2227 = torch.constant.int 0
    %true_2228 = torch.constant.bool true
    %result0_2229, %result1_2230 = torch.aten.var_mean.correction %1809, %1810, %int0_2227, %true_2228 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // Epsilon (about 1e-6) is added to the variance before the reciprocal square root.
    %float9.999990e-07_2231 = torch.constant.float 9.9999999999999995E-7
    %int1_2232 = torch.constant.int 1
    %1811 = torch.aten.add.Scalar %result0_2229, %float9.999990e-07_2231, %int1_2232 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %1812 = torch.aten.rsqrt %1811 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // The subtraction takes the f16 input %1706 (not the f32 copy %1809); its declared result type is f32.
    %int1_2233 = torch.constant.int 1
    %1813 = torch.aten.sub.Tensor %1706, %result1_2230, %int1_2233 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %1814 = torch.aten.mul.Tensor %1813, %1812 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    %int5_2234 = torch.constant.int 5
    %1815 = torch.prims.convert_element_type %1814, %int5_2234 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Modulation in f16: the [1,1,3072] terms broadcast over the 4096 positions.
    %int1_2235 = torch.constant.int 1
    %int1_2236 = torch.constant.int 1
    %1816 = torch.aten.add.Scalar %1783, %int1_2235, %int1_2236 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %1817 = torch.aten.mul.Tensor %1816, %1815 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_2237 = torch.constant.int 1
    %1818 = torch.aten.add.Tensor %1817, %1782, %int1_2237 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // double_blocks.5.img_attn.qkv: linear projection of the modulated tensor %1818.
    // The input is flattened to [4096,3072], multiplied in f32 by the transposed [9216,3072] weight,
    // the [9216] bias is added, and the result is cast to f16 and viewed as [1,4096,9216] (%1833).
    %int4096_2238 = torch.constant.int 4096
    %int3072_2239 = torch.constant.int 3072
    %1819 = torch.prim.ListConstruct %int4096_2238, %int3072_2239 : (!torch.int, !torch.int) -> !torch.list<int>
    %1820 = torch.aten.view %1818, %1819 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.5.img_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.5.img_attn.qkv.weight : tensor<9216x3072xf16>
    %1821 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.img_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    // Transpose the weight to [3072,9216] so it can be the right-hand operand of mm.
    %int0_2240 = torch.constant.int 0
    %int1_2241 = torch.constant.int 1
    %1822 = torch.aten.transpose.int %1821, %int0_2240, %int1_2241 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.5.img_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.5.img_attn.qkv.bias : tensor<9216xf16>
    %1823 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.img_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, input and weight to f32 (dtype code 6) before the matmul.
    %int6_2242 = torch.constant.int 6
    %1824 = torch.prims.convert_element_type %1823, %int6_2242 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_2243 = torch.constant.int 6
    %1825 = torch.prims.convert_element_type %1820, %int6_2243 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_2244 = torch.constant.int 6
    %1826 = torch.prims.convert_element_type %1822, %int6_2244 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %1827 = torch.aten.mm %1825, %1826 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[4096,9216],f32>
    // Both operands are multiplied by the integer constant 1 before the add.
    // NOTE(review): presumably left over from an addmm-style alpha/beta decomposition — confirm.
    %int1_2245 = torch.constant.int 1
    %1828 = torch.aten.mul.Scalar %1827, %int1_2245 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int1_2246 = torch.constant.int 1
    %1829 = torch.aten.mul.Scalar %1824, %int1_2246 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_2247 = torch.constant.int 1
    %1830 = torch.aten.add.Tensor %1828, %1829, %int1_2247 : !torch.vtensor<[4096,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    // Cast back to f16 (dtype code 5) and restore the leading size-1 dimension.
    %int5_2248 = torch.constant.int 5
    %1831 = torch.prims.convert_element_type %1830, %int5_2248 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f16>
    %int1_2249 = torch.constant.int 1
    %int4096_2250 = torch.constant.int 4096
    %int9216_2251 = torch.constant.int 9216
    %1832 = torch.prim.ListConstruct %int1_2249, %int4096_2250, %int9216_2251 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1833 = torch.aten.view %1831, %1832 : !torch.vtensor<[4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,9216],f16>
    // Split the fused image projection %1833 into three per-head tensors.
    // View [1,4096,9216] as [1,4096,3,24,128], permute with (2,0,3,1,4) to [3,1,24,4096,128],
    // then select indices 0, 1 and 2 on dim 0 to obtain %1838, %1839 and %1840 ([1,24,4096,128] each).
    // Later in this section %1838 is scaled by query_norm.scale and %1839 by key_norm.scale,
    // while %1840 is concatenated without normalization (i.e. they act as q, k and v).
    %int1_2252 = torch.constant.int 1
    %int4096_2253 = torch.constant.int 4096
    %int3_2254 = torch.constant.int 3
    %int24_2255 = torch.constant.int 24
    %int128_2256 = torch.constant.int 128
    %1834 = torch.prim.ListConstruct %int1_2252, %int4096_2253, %int3_2254, %int24_2255, %int128_2256 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1835 = torch.aten.view %1833, %1834 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_2257 = torch.constant.int 2
    %int0_2258 = torch.constant.int 0
    %int3_2259 = torch.constant.int 3
    %int1_2260 = torch.constant.int 1
    %int4_2261 = torch.constant.int 4
    %1836 = torch.prim.ListConstruct %int2_2257, %int0_2258, %int3_2259, %int1_2260, %int4_2261 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1837 = torch.aten.permute %1835, %1836 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    %int0_2262 = torch.constant.int 0
    %int0_2263 = torch.constant.int 0
    %1838 = torch.aten.select.int %1837, %int0_2262, %int0_2263 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int0_2264 = torch.constant.int 0
    %int1_2265 = torch.constant.int 1
    %1839 = torch.aten.select.int %1837, %int0_2264, %int1_2265 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int0_2266 = torch.constant.int 0
    %int2_2267 = torch.constant.int 2
    %1840 = torch.aten.select.int %1837, %int0_2266, %int2_2267 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // double_blocks.5.img_attn.norm: root-mean-square style normalization of the image q (%1838)
    // and k (%1839) over the last dim (128), each followed by a learned [128] scale:
    //   y = f16(x_f32 * rsqrt(mean(x_f32^2, dim=-1, keepdim=true) + eps)) * scale
    // with eps about 1e-6. No mean is subtracted. Results: %1850 (query) and %1860 (key).
    // --- query ---
    %int6_2268 = torch.constant.int 6
    %1841 = torch.prims.convert_element_type %1838, %int6_2268 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_2269 = torch.constant.int 2
    %1842 = torch.aten.pow.Tensor_Scalar %1841, %int2_2269 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_2270 = torch.constant.int -1
    %1843 = torch.prim.ListConstruct %int-1_2270 : (!torch.int) -> !torch.list<int>
    %true_2271 = torch.constant.bool true
    %none_2272 = torch.constant.none
    %1844 = torch.aten.mean.dim %1842, %1843, %true_2271, %none_2272 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_2273 = torch.constant.float 9.9999999999999995E-7
    %int1_2274 = torch.constant.int 1
    %1845 = torch.aten.add.Scalar %1844, %float9.999990e-07_2273, %int1_2274 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %1846 = torch.aten.rsqrt %1845 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %1847 = torch.aten.mul.Tensor %1841, %1846 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    %int5_2275 = torch.constant.int 5
    %1848 = torch.prims.convert_element_type %1847, %int5_2275 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // The learned scale is applied in f16, after the cast.
    %__auto.sampler.double_blocks.5.img_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.5.img_attn.norm.query_norm.scale : tensor<128xf16>
    %1849 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.img_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %1850 = torch.aten.mul.Tensor %1848, %1849 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // --- key (same sequence of ops as the query) ---
    %int6_2276 = torch.constant.int 6
    %1851 = torch.prims.convert_element_type %1839, %int6_2276 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_2277 = torch.constant.int 2
    %1852 = torch.aten.pow.Tensor_Scalar %1851, %int2_2277 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_2278 = torch.constant.int -1
    %1853 = torch.prim.ListConstruct %int-1_2278 : (!torch.int) -> !torch.list<int>
    %true_2279 = torch.constant.bool true
    %none_2280 = torch.constant.none
    %1854 = torch.aten.mean.dim %1852, %1853, %true_2279, %none_2280 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_2281 = torch.constant.float 9.9999999999999995E-7
    %int1_2282 = torch.constant.int 1
    %1855 = torch.aten.add.Scalar %1854, %float9.999990e-07_2281, %int1_2282 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %1856 = torch.aten.rsqrt %1855 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %1857 = torch.aten.mul.Tensor %1851, %1856 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    %int5_2283 = torch.constant.int 5
    %1858 = torch.prims.convert_element_type %1857, %int5_2283 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.5.img_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.5.img_attn.norm.key_norm.scale : tensor<128xf16>
    %1859 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.img_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %1860 = torch.aten.mul.Tensor %1858, %1859 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // These two conversions take f16 inputs and declare f16 results, so the element type does not change.
    // NOTE(review): presumably a traced `.to(dtype)` that happened to match the input dtype — confirm.
    %int5_2284 = torch.constant.int 5
    %1861 = torch.prims.convert_element_type %1850, %int5_2284 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_2285 = torch.constant.int 5
    %1862 = torch.prims.convert_element_type %1860, %int5_2285 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Normalize %1766 ([1,512,3072]) over dim 2 and modulate the result with the txt_mod slices:
    //   %1869 = f16((%1766 - mean) * rsqrt(var + eps))
    //   %1872 = (1 + %1804) * %1869 + %1803
    // No learned weight or bias is applied in the normalization itself.
    // NOTE(review): %1766 is defined above this section; presumably the text-stream activations — confirm.
    %int6_2286 = torch.constant.int 6
    %1863 = torch.prims.convert_element_type %1766, %int6_2286 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // Variance (%result0_2290) and mean (%result1_2291) over dim 2 with correction 0 and keepdim = true.
    %int2_2287 = torch.constant.int 2
    %1864 = torch.prim.ListConstruct %int2_2287 : (!torch.int) -> !torch.list<int>
    %int0_2288 = torch.constant.int 0
    %true_2289 = torch.constant.bool true
    %result0_2290, %result1_2291 = torch.aten.var_mean.correction %1863, %1864, %int0_2288, %true_2289 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // Epsilon (about 1e-6) is added to the variance before the reciprocal square root.
    %float9.999990e-07_2292 = torch.constant.float 9.9999999999999995E-7
    %int1_2293 = torch.constant.int 1
    %1865 = torch.aten.add.Scalar %result0_2290, %float9.999990e-07_2292, %int1_2293 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %1866 = torch.aten.rsqrt %1865 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // The subtraction takes the f16 input %1766 (not the f32 copy %1863); its declared result type is f32.
    %int1_2294 = torch.constant.int 1
    %1867 = torch.aten.sub.Tensor %1766, %result1_2291, %int1_2294 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %1868 = torch.aten.mul.Tensor %1867, %1866 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_2295 = torch.constant.int 5
    %1869 = torch.prims.convert_element_type %1868, %int5_2295 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Modulation in f16: the [1,1,3072] terms broadcast over the 512 positions.
    %int1_2296 = torch.constant.int 1
    %int1_2297 = torch.constant.int 1
    %1870 = torch.aten.add.Scalar %1804, %int1_2296, %int1_2297 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %1871 = torch.aten.mul.Tensor %1870, %1869 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_2298 = torch.constant.int 1
    %1872 = torch.aten.add.Tensor %1871, %1803, %int1_2298 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // double_blocks.5.txt_attn.qkv: linear projection of the modulated tensor %1872.
    // The input is flattened to [512,3072], multiplied in f32 by the transposed [9216,3072] weight,
    // the [9216] bias is added, and the result is cast to f16 and viewed as [1,512,9216] (%1887).
    %int512_2299 = torch.constant.int 512
    %int3072_2300 = torch.constant.int 3072
    %1873 = torch.prim.ListConstruct %int512_2299, %int3072_2300 : (!torch.int, !torch.int) -> !torch.list<int>
    %1874 = torch.aten.view %1872, %1873 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.5.txt_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.5.txt_attn.qkv.weight : tensor<9216x3072xf16>
    %1875 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.txt_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    // Transpose the weight to [3072,9216] so it can be the right-hand operand of mm.
    %int0_2301 = torch.constant.int 0
    %int1_2302 = torch.constant.int 1
    %1876 = torch.aten.transpose.int %1875, %int0_2301, %int1_2302 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.5.txt_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.5.txt_attn.qkv.bias : tensor<9216xf16>
    %1877 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.txt_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, input and weight to f32 (dtype code 6) before the matmul.
    %int6_2303 = torch.constant.int 6
    %1878 = torch.prims.convert_element_type %1877, %int6_2303 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_2304 = torch.constant.int 6
    %1879 = torch.prims.convert_element_type %1874, %int6_2304 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_2305 = torch.constant.int 6
    %1880 = torch.prims.convert_element_type %1876, %int6_2305 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %1881 = torch.aten.mm %1879, %1880 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[512,9216],f32>
    // Both operands are multiplied by the integer constant 1 before the add.
    // NOTE(review): presumably left over from an addmm-style alpha/beta decomposition — confirm.
    %int1_2306 = torch.constant.int 1
    %1882 = torch.aten.mul.Scalar %1881, %int1_2306 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int1_2307 = torch.constant.int 1
    %1883 = torch.aten.mul.Scalar %1878, %int1_2307 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_2308 = torch.constant.int 1
    %1884 = torch.aten.add.Tensor %1882, %1883, %int1_2308 : !torch.vtensor<[512,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    // Cast back to f16 (dtype code 5) and restore the leading size-1 dimension.
    %int5_2309 = torch.constant.int 5
    %1885 = torch.prims.convert_element_type %1884, %int5_2309 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f16>
    %int1_2310 = torch.constant.int 1
    %int512_2311 = torch.constant.int 512
    %int9216_2312 = torch.constant.int 9216
    %1886 = torch.prim.ListConstruct %int1_2310, %int512_2311, %int9216_2312 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1887 = torch.aten.view %1885, %1886 : !torch.vtensor<[512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,9216],f16>
    // Split the fused text projection %1887 into three per-head tensors.
    // View [1,512,9216] as [1,512,3,24,128], permute with (2,0,3,1,4) to [3,1,24,512,128],
    // then select indices 0, 1 and 2 on dim 0 to obtain %1892, %1893 and %1894 ([1,24,512,128] each).
    // Later in this section %1892 is scaled by query_norm.scale and %1893 by key_norm.scale,
    // while %1894 is concatenated without normalization (i.e. they act as q, k and v).
    %int1_2313 = torch.constant.int 1
    %int512_2314 = torch.constant.int 512
    %int3_2315 = torch.constant.int 3
    %int24_2316 = torch.constant.int 24
    %int128_2317 = torch.constant.int 128
    %1888 = torch.prim.ListConstruct %int1_2313, %int512_2314, %int3_2315, %int24_2316, %int128_2317 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1889 = torch.aten.view %1887, %1888 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_2318 = torch.constant.int 2
    %int0_2319 = torch.constant.int 0
    %int3_2320 = torch.constant.int 3
    %int1_2321 = torch.constant.int 1
    %int4_2322 = torch.constant.int 4
    %1890 = torch.prim.ListConstruct %int2_2318, %int0_2319, %int3_2320, %int1_2321, %int4_2322 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1891 = torch.aten.permute %1889, %1890 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    %int0_2323 = torch.constant.int 0
    %int0_2324 = torch.constant.int 0
    %1892 = torch.aten.select.int %1891, %int0_2323, %int0_2324 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int0_2325 = torch.constant.int 0
    %int1_2326 = torch.constant.int 1
    %1893 = torch.aten.select.int %1891, %int0_2325, %int1_2326 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int0_2327 = torch.constant.int 0
    %int2_2328 = torch.constant.int 2
    %1894 = torch.aten.select.int %1891, %int0_2327, %int2_2328 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // double_blocks.5.txt_attn.norm: root-mean-square style normalization of the text q (%1892)
    // and k (%1893) over the last dim (128), each followed by a learned [128] scale:
    //   y = f16(x_f32 * rsqrt(mean(x_f32^2, dim=-1, keepdim=true) + eps)) * scale
    // with eps about 1e-6. No mean is subtracted. Results: %1904 (query) and %1914 (key).
    // --- query ---
    %int6_2329 = torch.constant.int 6
    %1895 = torch.prims.convert_element_type %1892, %int6_2329 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_2330 = torch.constant.int 2
    %1896 = torch.aten.pow.Tensor_Scalar %1895, %int2_2330 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_2331 = torch.constant.int -1
    %1897 = torch.prim.ListConstruct %int-1_2331 : (!torch.int) -> !torch.list<int>
    %true_2332 = torch.constant.bool true
    %none_2333 = torch.constant.none
    %1898 = torch.aten.mean.dim %1896, %1897, %true_2332, %none_2333 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_2334 = torch.constant.float 9.9999999999999995E-7
    %int1_2335 = torch.constant.int 1
    %1899 = torch.aten.add.Scalar %1898, %float9.999990e-07_2334, %int1_2335 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %1900 = torch.aten.rsqrt %1899 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %1901 = torch.aten.mul.Tensor %1895, %1900 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_2336 = torch.constant.int 5
    %1902 = torch.prims.convert_element_type %1901, %int5_2336 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // The learned scale is applied in f16, after the cast.
    %__auto.sampler.double_blocks.5.txt_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.5.txt_attn.norm.query_norm.scale : tensor<128xf16>
    %1903 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.txt_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %1904 = torch.aten.mul.Tensor %1902, %1903 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // --- key (same sequence of ops as the query) ---
    %int6_2337 = torch.constant.int 6
    %1905 = torch.prims.convert_element_type %1893, %int6_2337 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_2338 = torch.constant.int 2
    %1906 = torch.aten.pow.Tensor_Scalar %1905, %int2_2338 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_2339 = torch.constant.int -1
    %1907 = torch.prim.ListConstruct %int-1_2339 : (!torch.int) -> !torch.list<int>
    %true_2340 = torch.constant.bool true
    %none_2341 = torch.constant.none
    %1908 = torch.aten.mean.dim %1906, %1907, %true_2340, %none_2341 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_2342 = torch.constant.float 9.9999999999999995E-7
    %int1_2343 = torch.constant.int 1
    %1909 = torch.aten.add.Scalar %1908, %float9.999990e-07_2342, %int1_2343 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %1910 = torch.aten.rsqrt %1909 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %1911 = torch.aten.mul.Tensor %1905, %1910 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_2344 = torch.constant.int 5
    %1912 = torch.prims.convert_element_type %1911, %int5_2344 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.5.txt_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.5.txt_attn.norm.key_norm.scale : tensor<128xf16>
    %1913 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.txt_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %1914 = torch.aten.mul.Tensor %1912, %1913 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // These two conversions take f16 inputs and declare f16 results, so the element type does not change.
    // NOTE(review): presumably a traced `.to(dtype)` that happened to match the input dtype — confirm.
    %int5_2345 = torch.constant.int 5
    %1915 = torch.prims.convert_element_type %1904, %int5_2345 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int5_2346 = torch.constant.int 5
    %1916 = torch.prims.convert_element_type %1914, %int5_2346 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Concatenate the text and image tensors along dim 2, text first: 512 + 4096 = 4608 positions.
    //   %1918 = cat(%1915, %1861)  -- normalized queries
    //   %1920 = cat(%1916, %1862)  -- normalized keys
    //   %1922 = cat(%1894, %1840)  -- values (not normalized)
    %1917 = torch.prim.ListConstruct %1915, %1861 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_2347 = torch.constant.int 2
    %1918 = torch.aten.cat %1917, %int2_2347 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %1919 = torch.prim.ListConstruct %1916, %1862 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_2348 = torch.constant.int 2
    %1920 = torch.aten.cat %1919, %int2_2348 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %1921 = torch.prim.ListConstruct %1894, %1840 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_2349 = torch.constant.int 2
    %1922 = torch.aten.cat %1921, %int2_2349 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Upcast the concatenated q (%1918) and k (%1920) to f32 and view each as
    // [1,24,4608,-1,1,2]; the -1 resolves to 64, so the last dim of 128 is regrouped into
    // 64 pairs with a singleton dim in front of each pair (%1925 for q, %1928 for k).
    %int6_2350 = torch.constant.int 6
    %1923 = torch.prims.convert_element_type %1918, %int6_2350 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_2351 = torch.constant.int 1
    %int24_2352 = torch.constant.int 24
    %int4608_2353 = torch.constant.int 4608
    %int-1_2354 = torch.constant.int -1
    %int1_2355 = torch.constant.int 1
    %int2_2356 = torch.constant.int 2
    %1924 = torch.prim.ListConstruct %int1_2351, %int24_2352, %int4608_2353, %int-1_2354, %int1_2355, %int2_2356 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1925 = torch.aten.view %1923, %1924 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    %int6_2357 = torch.constant.int 6
    %1926 = torch.prims.convert_element_type %1920, %int6_2357 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_2358 = torch.constant.int 1
    %int24_2359 = torch.constant.int 24
    %int4608_2360 = torch.constant.int 4608
    %int-1_2361 = torch.constant.int -1
    %int1_2362 = torch.constant.int 1
    %int2_2363 = torch.constant.int 2
    %1927 = torch.prim.ListConstruct %int1_2358, %int24_2359, %int4608_2360, %int-1_2361, %int1_2362, %int2_2363 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1928 = torch.aten.view %1926, %1927 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Combine the paired query %1925 with %211 ([1,1,4608,64,2,2], f32):
    //   %1935 = %211[..., 0] * %1925[..., 0] + %211[..., 1] * %1925[..., 1]
    // where [..., i] is select on dim 5. The %211 slices ([1,1,4608,64,2]) broadcast over the
    // 24 heads and the %1925 slices ([1,24,4608,64,1]) broadcast over the trailing dim of 2.
    // NOTE(review): %211 is defined above this section; presumably the rotary position
    // embedding table (per-position 2x2 matrices) — confirm.
    %int5_2364 = torch.constant.int 5
    %int0_2365 = torch.constant.int 0
    %1929 = torch.aten.select.int %211, %int5_2364, %int0_2365 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_2366 = torch.constant.int 5
    %int0_2367 = torch.constant.int 0
    %1930 = torch.aten.select.int %1925, %int5_2366, %int0_2367 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %1931 = torch.aten.mul.Tensor %1929, %1930 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_2368 = torch.constant.int 5
    %int1_2369 = torch.constant.int 1
    %1932 = torch.aten.select.int %211, %int5_2368, %int1_2369 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_2370 = torch.constant.int 5
    %int1_2371 = torch.constant.int 1
    %1933 = torch.aten.select.int %1925, %int5_2370, %int1_2371 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %1934 = torch.aten.mul.Tensor %1932, %1933 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_2372 = torch.constant.int 1
    %1935 = torch.aten.add.Tensor %1931, %1934, %int1_2372 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Combine the paired key %1928 with %211 ([1,1,4608,64,2,2], f32), using the same ops as for the query:
    //   %1942 = %211[..., 0] * %1928[..., 0] + %211[..., 1] * %1928[..., 1]
    // where [..., i] is select on dim 5. The %211 slices are re-selected here rather than reused.
    // NOTE(review): %211 is defined above this section; presumably the rotary position
    // embedding table — confirm.
    %int5_2373 = torch.constant.int 5
    %int0_2374 = torch.constant.int 0
    %1936 = torch.aten.select.int %211, %int5_2373, %int0_2374 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_2375 = torch.constant.int 5
    %int0_2376 = torch.constant.int 0
    %1937 = torch.aten.select.int %1928, %int5_2375, %int0_2376 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %1938 = torch.aten.mul.Tensor %1936, %1937 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_2377 = torch.constant.int 5
    %int1_2378 = torch.constant.int 1
    %1939 = torch.aten.select.int %211, %int5_2377, %int1_2378 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_2379 = torch.constant.int 5
    %int1_2380 = torch.constant.int 1
    %1940 = torch.aten.select.int %1928, %int5_2379, %int1_2380 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %1941 = torch.aten.mul.Tensor %1939, %1940 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_2381 = torch.constant.int 1
    %1942 = torch.aten.add.Tensor %1938, %1941, %int1_2381 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Collapse the trailing (64, 2) axes of %1935 into a single 128-wide axis,
    // [1,24,4608,64,2] -> [1,24,4608,128], then cast f32 -> f16
    // (dtype code 5 yields an f16 result type here).
    %int1_2382 = torch.constant.int 1
    %int24_2383 = torch.constant.int 24
    %int4608_2384 = torch.constant.int 4608
    %int128_2385 = torch.constant.int 128
    %1943 = torch.prim.ListConstruct %int1_2382, %int24_2383, %int4608_2384, %int128_2385 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1944 = torch.aten.view %1935, %1943 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_2386 = torch.constant.int 5
    %1945 = torch.prims.convert_element_type %1944, %int5_2386 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Identical reshape + f16 cast for %1942.
    %int1_2387 = torch.constant.int 1
    %int24_2388 = torch.constant.int 24
    %int4608_2389 = torch.constant.int 4608
    %int128_2390 = torch.constant.int 128
    %1946 = torch.prim.ListConstruct %int1_2387, %int24_2388, %int4608_2389, %int128_2390 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1947 = torch.aten.view %1942, %1946 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_2391 = torch.constant.int 5
    %1948 = torch.prims.convert_element_type %1947, %int5_2391 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled-dot-product attention over all 4608 positions with 24 heads of
    // width 128. Tensor operands are (%1945, %1948, %1922), followed by a float
    // 0.0, a bool false and two `none` values.
    // NOTE(review): per the aten schema these should be query, key, value,
    // dropout_p, is_causal, attn_mask, scale -- confirm against the op signature.
    // Only result #0 is consumed in this excerpt; result #1 ([1,24,4608], f32)
    // is not referenced here.
    %float0.000000e00_2392 = torch.constant.float 0.000000e+00
    %false_2393 = torch.constant.bool false
    %none_2394 = torch.constant.none
    %none_2395 = torch.constant.none
    %1949:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%1945, %1948, %1922, %float0.000000e00_2392, %false_2393, %none_2394, %none_2395) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap axes 1 and 2: [1,24,4608,128] -> [1,4608,24,128].
    %int0_2396 = torch.constant.int 0
    %int2_2397 = torch.constant.int 2
    %int1_2398 = torch.constant.int 1
    %int3_2399 = torch.constant.int 3
    %1950 = torch.prim.ListConstruct %int0_2396, %int2_2397, %int1_2398, %int3_2399 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1951 = torch.aten.permute %1949#0, %1950 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the (24, 128) axes into one 3072-wide axis: -> [1,4608,3072].
    %int1_2400 = torch.constant.int 1
    %int4608_2401 = torch.constant.int 4608
    %int3072_2402 = torch.constant.int 3072
    %1952 = torch.prim.ListConstruct %int1_2400, %int4608_2401, %int3072_2402 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1953 = torch.aten.view %1951, %1952 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Split the attention output %1953 along dim 1 into two pieces:
    //   %1955 = positions [0, 512)    -> [1,512,3072]  (fed to txt_attn.proj below)
    //   %1957 = positions [512, end)  -> [1,4096,3072] (fed to img_attn.proj below)
    // 9223372036854775807 is INT64_MAX, used as the "to the end" stop value.
    // The dim-0 slices (%1954, %1956) cover the full range and leave the shape
    // unchanged.
    %int0_2403 = torch.constant.int 0
    %int0_2404 = torch.constant.int 0
    %int9223372036854775807_2405 = torch.constant.int 9223372036854775807
    %int1_2406 = torch.constant.int 1
    %1954 = torch.aten.slice.Tensor %1953, %int0_2403, %int0_2404, %int9223372036854775807_2405, %int1_2406 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_2407 = torch.constant.int 1
    %int0_2408 = torch.constant.int 0
    %int512_2409 = torch.constant.int 512
    %int1_2410 = torch.constant.int 1
    %1955 = torch.aten.slice.Tensor %1954, %int1_2407, %int0_2408, %int512_2409, %int1_2410 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %int0_2411 = torch.constant.int 0
    %int0_2412 = torch.constant.int 0
    %int9223372036854775807_2413 = torch.constant.int 9223372036854775807
    %int1_2414 = torch.constant.int 1
    %1956 = torch.aten.slice.Tensor %1953, %int0_2411, %int0_2412, %int9223372036854775807_2413, %int1_2414 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_2415 = torch.constant.int 1
    %int512_2416 = torch.constant.int 512
    %int9223372036854775807_2417 = torch.constant.int 9223372036854775807
    %int1_2418 = torch.constant.int 1
    %1957 = torch.aten.slice.Tensor %1956, %int1_2415, %int512_2416, %int9223372036854775807_2417, %int1_2418 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // double_blocks.5 img_attn.proj: linear projection of the 4096-row slice
    // %1957, followed by an elementwise scale and a residual add.
    // Drop the leading unit axis so the input is a 2-D matmul operand.
    %int4096_2419 = torch.constant.int 4096
    %int3072_2420 = torch.constant.int 3072
    %1958 = torch.prim.ListConstruct %int4096_2419, %int3072_2420 : (!torch.int, !torch.int) -> !torch.list<int>
    %1959 = torch.aten.view %1957, %1958 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Load the weight and transpose it so the matmul computes x @ W^T.
    %__auto.sampler.double_blocks.5.img_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.5.img_attn.proj.weight : tensor<3072x3072xf16>
    %1960 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.img_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_2421 = torch.constant.int 0
    %int1_2422 = torch.constant.int 1
    %1961 = torch.aten.transpose.int %1960, %int0_2421, %int1_2422 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.5.img_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.5.img_attn.proj.bias : tensor<3072xf16>
    %1962 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.img_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Bias, input and weight are all widened to f32 (dtype code 6) so the
    // matmul and bias add run in f32.
    %int6_2423 = torch.constant.int 6
    %1963 = torch.prims.convert_element_type %1962, %int6_2423 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_2424 = torch.constant.int 6
    %1964 = torch.prims.convert_element_type %1959, %int6_2424 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_2425 = torch.constant.int 6
    %1965 = torch.prims.convert_element_type %1961, %int6_2425 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %1966 = torch.aten.mm %1964, %1965 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Both products are scaled by the integer 1, i.e. left unchanged.
    // NOTE(review): looks like the alpha/beta factors of a decomposed addmm -- confirm.
    %int1_2426 = torch.constant.int 1
    %1967 = torch.aten.mul.Scalar %1966, %int1_2426 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_2427 = torch.constant.int 1
    %1968 = torch.aten.mul.Scalar %1963, %int1_2427 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_2428 = torch.constant.int 1
    %1969 = torch.aten.add.Tensor %1967, %1968, %int1_2428 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    // Narrow back to f16 and restore the leading unit axis.
    %int5_2429 = torch.constant.int 5
    %1970 = torch.prims.convert_element_type %1969, %int5_2429 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_2430 = torch.constant.int 1
    %int4096_2431 = torch.constant.int 4096
    %int3072_2432 = torch.constant.int 3072
    %1971 = torch.prim.ListConstruct %int1_2430, %int4096_2431, %int3072_2432 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1972 = torch.aten.view %1970, %1971 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Scale by %1784 ([1,1,3072], broadcast over rows) and add onto %1706.
    // NOTE(review): presumably %1784 is the attention gate from this block's
    // modulation and %1706 the incoming img stream; both are defined outside
    // this excerpt -- confirm.
    %1973 = torch.aten.mul.Tensor %1784, %1972 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_2433 = torch.constant.int 1
    %1974 = torch.aten.add.Tensor %1706, %1973, %int1_2433 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Normalise %1974 over its last axis and apply a scale/shift:
    //   %1984 = (1 + %1786) * normalise(%1974) + %1785
    // NOTE(review): presumably %1786 / %1785 are the MLP scale / shift from this
    // block's modulation; they are defined outside this excerpt -- confirm.
    %int1_2434 = torch.constant.int 1
    %int1_2435 = torch.constant.int 1
    %1975 = torch.aten.add.Scalar %1786, %int1_2434, %int1_2435 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Statistics are computed in f32.
    %int6_2436 = torch.constant.int 6
    %1976 = torch.prims.convert_element_type %1974, %int6_2436 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // Variance (correction = 0) and mean over dim 2 with keepdim = true.
    %int2_2437 = torch.constant.int 2
    %1977 = torch.prim.ListConstruct %int2_2437 : (!torch.int) -> !torch.list<int>
    %int0_2438 = torch.constant.int 0
    %true_2439 = torch.constant.bool true
    %result0_2440, %result1_2441 = torch.aten.var_mean.correction %1976, %1977, %int0_2438, %true_2439 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(var + eps) with eps ~= 1e-6.
    %float9.999990e-07_2442 = torch.constant.float 9.9999999999999995E-7
    %int1_2443 = torch.constant.int 1
    %1978 = torch.aten.add.Scalar %result0_2440, %float9.999990e-07_2442, %int1_2443 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %1979 = torch.aten.rsqrt %1978 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // The subtraction takes the f16 tensor %1974 (not the f32 copy %1976) and
    // the f32 mean; the declared result type is f32.
    %int1_2444 = torch.constant.int 1
    %1980 = torch.aten.sub.Tensor %1974, %result1_2441, %int1_2444 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %1981 = torch.aten.mul.Tensor %1980, %1979 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Back to f16, then scale and shift (both [1,1,3072], broadcast over rows).
    %int5_2445 = torch.constant.int 5
    %1982 = torch.prims.convert_element_type %1981, %int5_2445 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %1983 = torch.aten.mul.Tensor %1975, %1982 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_2446 = torch.constant.int 1
    %1984 = torch.aten.add.Tensor %1983, %1785, %int1_2446 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // double_blocks.5 img_mlp.0: linear 3072 -> 12288 on %1984, then tanh GELU.
    %int4096_2447 = torch.constant.int 4096
    %int3072_2448 = torch.constant.int 3072
    %1985 = torch.prim.ListConstruct %int4096_2447, %int3072_2448 : (!torch.int, !torch.int) -> !torch.list<int>
    %1986 = torch.aten.view %1984, %1985 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Weight is stored as [12288,3072]; transpose to [3072,12288] for x @ W^T.
    %__auto.sampler.double_blocks.5.img_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.5.img_mlp.0.weight : tensor<12288x3072xf16>
    %1987 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.img_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_2449 = torch.constant.int 0
    %int1_2450 = torch.constant.int 1
    %1988 = torch.aten.transpose.int %1987, %int0_2449, %int1_2450 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.5.img_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.5.img_mlp.0.bias : tensor<12288xf16>
    %1989 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.img_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Widen bias, input and weight to f32 for the matmul and bias add.
    %int6_2451 = torch.constant.int 6
    %1990 = torch.prims.convert_element_type %1989, %int6_2451 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_2452 = torch.constant.int 6
    %1991 = torch.prims.convert_element_type %1986, %int6_2452 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_2453 = torch.constant.int 6
    %1992 = torch.prims.convert_element_type %1988, %int6_2453 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %1993 = torch.aten.mm %1991, %1992 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[4096,12288],f32>
    // Multiplications by the integer 1 leave both operands unchanged.
    %int1_2454 = torch.constant.int 1
    %1994 = torch.aten.mul.Scalar %1993, %int1_2454 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int1_2455 = torch.constant.int 1
    %1995 = torch.aten.mul.Scalar %1990, %int1_2455 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_2456 = torch.constant.int 1
    %1996 = torch.aten.add.Tensor %1994, %1995, %int1_2456 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    // Narrow to f16 and restore the leading unit axis before the activation.
    %int5_2457 = torch.constant.int 5
    %1997 = torch.prims.convert_element_type %1996, %int5_2457 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f16>
    %int1_2458 = torch.constant.int 1
    %int4096_2459 = torch.constant.int 4096
    %int12288_2460 = torch.constant.int 12288
    %1998 = torch.prim.ListConstruct %int1_2458, %int4096_2459, %int12288_2460 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %1999 = torch.aten.view %1997, %1998 : !torch.vtensor<[4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,12288],f16>
    // GELU with approximate = "tanh", evaluated on the f16 tensor.
    %str_2461 = torch.constant.str "tanh"
    %2000 = torch.aten.gelu %1999, %str_2461 : !torch.vtensor<[1,4096,12288],f16>, !torch.str -> !torch.vtensor<[1,4096,12288],f16>
    // double_blocks.5 img_mlp.2: linear 12288 -> 3072 on the GELU output %2000,
    // then an elementwise scale and a residual add onto %1974.
    %int4096_2462 = torch.constant.int 4096
    %int12288_2463 = torch.constant.int 12288
    %2001 = torch.prim.ListConstruct %int4096_2462, %int12288_2463 : (!torch.int, !torch.int) -> !torch.list<int>
    %2002 = torch.aten.view %2000, %2001 : !torch.vtensor<[1,4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[4096,12288],f16>
    // Weight is stored as [3072,12288]; transpose to [12288,3072] for x @ W^T.
    %__auto.sampler.double_blocks.5.img_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.5.img_mlp.2.weight : tensor<3072x12288xf16>
    %2003 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.img_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_2464 = torch.constant.int 0
    %int1_2465 = torch.constant.int 1
    %2004 = torch.aten.transpose.int %2003, %int0_2464, %int1_2465 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.5.img_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.5.img_mlp.2.bias : tensor<3072xf16>
    %2005 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.img_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Widen bias, input and weight to f32 for the matmul and bias add.
    %int6_2466 = torch.constant.int 6
    %2006 = torch.prims.convert_element_type %2005, %int6_2466 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_2467 = torch.constant.int 6
    %2007 = torch.prims.convert_element_type %2002, %int6_2467 : !torch.vtensor<[4096,12288],f16>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int6_2468 = torch.constant.int 6
    %2008 = torch.prims.convert_element_type %2004, %int6_2468 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %2009 = torch.aten.mm %2007, %2008 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Multiplications by the integer 1 leave both operands unchanged.
    %int1_2469 = torch.constant.int 1
    %2010 = torch.aten.mul.Scalar %2009, %int1_2469 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_2470 = torch.constant.int 1
    %2011 = torch.aten.mul.Scalar %2006, %int1_2470 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_2471 = torch.constant.int 1
    %2012 = torch.aten.add.Tensor %2010, %2011, %int1_2471 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    // Narrow to f16 and restore the leading unit axis.
    %int5_2472 = torch.constant.int 5
    %2013 = torch.prims.convert_element_type %2012, %int5_2472 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_2473 = torch.constant.int 1
    %int4096_2474 = torch.constant.int 4096
    %int3072_2475 = torch.constant.int 3072
    %2014 = torch.prim.ListConstruct %int1_2473, %int4096_2474, %int3072_2475 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2015 = torch.aten.view %2013, %2014 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Scale by %1787 ([1,1,3072]) and add onto the post-attention value %1974.
    // NOTE(review): presumably %1787 is the MLP gate from this block's
    // modulation and %2017 the img output of double_blocks.5 -- confirm.
    %2016 = torch.aten.mul.Tensor %1787, %2015 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_2476 = torch.constant.int 1
    %2017 = torch.aten.add.Tensor %1974, %2016, %int1_2476 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // double_blocks.5 txt_attn.proj: linear projection of the 512-row slice
    // %1955 of the attention output, followed by an elementwise scale and a
    // residual add.
    %int512_2477 = torch.constant.int 512
    %int3072_2478 = torch.constant.int 3072
    %2018 = torch.prim.ListConstruct %int512_2477, %int3072_2478 : (!torch.int, !torch.int) -> !torch.list<int>
    %2019 = torch.aten.view %1955, %2018 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Load the weight and transpose it so the matmul computes x @ W^T.
    %__auto.sampler.double_blocks.5.txt_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.5.txt_attn.proj.weight : tensor<3072x3072xf16>
    %2020 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.txt_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_2479 = torch.constant.int 0
    %int1_2480 = torch.constant.int 1
    %2021 = torch.aten.transpose.int %2020, %int0_2479, %int1_2480 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.5.txt_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.5.txt_attn.proj.bias : tensor<3072xf16>
    %2022 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.txt_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Widen bias, input and weight to f32 for the matmul and bias add.
    %int6_2481 = torch.constant.int 6
    %2023 = torch.prims.convert_element_type %2022, %int6_2481 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_2482 = torch.constant.int 6
    %2024 = torch.prims.convert_element_type %2019, %int6_2482 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_2483 = torch.constant.int 6
    %2025 = torch.prims.convert_element_type %2021, %int6_2483 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %2026 = torch.aten.mm %2024, %2025 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Multiplications by the integer 1 leave both operands unchanged.
    %int1_2484 = torch.constant.int 1
    %2027 = torch.aten.mul.Scalar %2026, %int1_2484 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_2485 = torch.constant.int 1
    %2028 = torch.aten.mul.Scalar %2023, %int1_2485 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_2486 = torch.constant.int 1
    %2029 = torch.aten.add.Tensor %2027, %2028, %int1_2486 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    // Narrow to f16 and restore the leading unit axis.
    %int5_2487 = torch.constant.int 5
    %2030 = torch.prims.convert_element_type %2029, %int5_2487 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_2488 = torch.constant.int 1
    %int512_2489 = torch.constant.int 512
    %int3072_2490 = torch.constant.int 3072
    %2031 = torch.prim.ListConstruct %int1_2488, %int512_2489, %int3072_2490 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2032 = torch.aten.view %2030, %2031 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Scale by %1805 ([1,1,3072], broadcast over rows) and add onto %1766.
    // NOTE(review): presumably %1805 is the txt attention gate and %1766 the
    // incoming txt stream; both are defined outside this excerpt -- confirm.
    %2033 = torch.aten.mul.Tensor %1805, %2032 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_2491 = torch.constant.int 1
    %2034 = torch.aten.add.Tensor %1766, %2033, %int1_2491 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Normalise %2034 over its last axis and apply a scale/shift:
    //   %2044 = (1 + %1807) * normalise(%2034) + %1806
    // NOTE(review): presumably %1807 / %1806 are the txt MLP scale / shift from
    // this block's modulation; they are defined outside this excerpt -- confirm.
    %int1_2492 = torch.constant.int 1
    %int1_2493 = torch.constant.int 1
    %2035 = torch.aten.add.Scalar %1807, %int1_2492, %int1_2493 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Statistics are computed in f32.
    %int6_2494 = torch.constant.int 6
    %2036 = torch.prims.convert_element_type %2034, %int6_2494 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // Variance (correction = 0) and mean over dim 2 with keepdim = true.
    %int2_2495 = torch.constant.int 2
    %2037 = torch.prim.ListConstruct %int2_2495 : (!torch.int) -> !torch.list<int>
    %int0_2496 = torch.constant.int 0
    %true_2497 = torch.constant.bool true
    %result0_2498, %result1_2499 = torch.aten.var_mean.correction %2036, %2037, %int0_2496, %true_2497 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(var + eps) with eps ~= 1e-6.
    %float9.999990e-07_2500 = torch.constant.float 9.9999999999999995E-7
    %int1_2501 = torch.constant.int 1
    %2038 = torch.aten.add.Scalar %result0_2498, %float9.999990e-07_2500, %int1_2501 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %2039 = torch.aten.rsqrt %2038 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // The subtraction takes the f16 tensor %2034 (not the f32 copy %2036) and
    // the f32 mean; the declared result type is f32.
    %int1_2502 = torch.constant.int 1
    %2040 = torch.aten.sub.Tensor %2034, %result1_2499, %int1_2502 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %2041 = torch.aten.mul.Tensor %2040, %2039 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    // Back to f16, then scale and shift (both [1,1,3072], broadcast over rows).
    %int5_2503 = torch.constant.int 5
    %2042 = torch.prims.convert_element_type %2041, %int5_2503 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %2043 = torch.aten.mul.Tensor %2035, %2042 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_2504 = torch.constant.int 1
    %2044 = torch.aten.add.Tensor %2043, %1806, %int1_2504 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // double_blocks.5 txt_mlp.0: linear 3072 -> 12288 on %2044, then tanh GELU.
    %int512_2505 = torch.constant.int 512
    %int3072_2506 = torch.constant.int 3072
    %2045 = torch.prim.ListConstruct %int512_2505, %int3072_2506 : (!torch.int, !torch.int) -> !torch.list<int>
    %2046 = torch.aten.view %2044, %2045 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Weight is stored as [12288,3072]; transpose to [3072,12288] for x @ W^T.
    %__auto.sampler.double_blocks.5.txt_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.5.txt_mlp.0.weight : tensor<12288x3072xf16>
    %2047 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.txt_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_2507 = torch.constant.int 0
    %int1_2508 = torch.constant.int 1
    %2048 = torch.aten.transpose.int %2047, %int0_2507, %int1_2508 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.5.txt_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.5.txt_mlp.0.bias : tensor<12288xf16>
    %2049 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.txt_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Widen bias, input and weight to f32 for the matmul and bias add.
    %int6_2509 = torch.constant.int 6
    %2050 = torch.prims.convert_element_type %2049, %int6_2509 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_2510 = torch.constant.int 6
    %2051 = torch.prims.convert_element_type %2046, %int6_2510 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_2511 = torch.constant.int 6
    %2052 = torch.prims.convert_element_type %2048, %int6_2511 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %2053 = torch.aten.mm %2051, %2052 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[512,12288],f32>
    // Multiplications by the integer 1 leave both operands unchanged.
    %int1_2512 = torch.constant.int 1
    %2054 = torch.aten.mul.Scalar %2053, %int1_2512 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int1_2513 = torch.constant.int 1
    %2055 = torch.aten.mul.Scalar %2050, %int1_2513 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_2514 = torch.constant.int 1
    %2056 = torch.aten.add.Tensor %2054, %2055, %int1_2514 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    // Narrow to f16 and restore the leading unit axis before the activation.
    %int5_2515 = torch.constant.int 5
    %2057 = torch.prims.convert_element_type %2056, %int5_2515 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f16>
    %int1_2516 = torch.constant.int 1
    %int512_2517 = torch.constant.int 512
    %int12288_2518 = torch.constant.int 12288
    %2058 = torch.prim.ListConstruct %int1_2516, %int512_2517, %int12288_2518 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2059 = torch.aten.view %2057, %2058 : !torch.vtensor<[512,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,512,12288],f16>
    // GELU with approximate = "tanh", evaluated on the f16 tensor.
    %str_2519 = torch.constant.str "tanh"
    %2060 = torch.aten.gelu %2059, %str_2519 : !torch.vtensor<[1,512,12288],f16>, !torch.str -> !torch.vtensor<[1,512,12288],f16>
    // double_blocks.5 txt_mlp.2: linear 12288 -> 3072 on the GELU output %2060,
    // then an elementwise scale and a residual add onto %2034.
    %int512_2520 = torch.constant.int 512
    %int12288_2521 = torch.constant.int 12288
    %2061 = torch.prim.ListConstruct %int512_2520, %int12288_2521 : (!torch.int, !torch.int) -> !torch.list<int>
    %2062 = torch.aten.view %2060, %2061 : !torch.vtensor<[1,512,12288],f16>, !torch.list<int> -> !torch.vtensor<[512,12288],f16>
    // Weight is stored as [3072,12288]; transpose to [12288,3072] for x @ W^T.
    %__auto.sampler.double_blocks.5.txt_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.5.txt_mlp.2.weight : tensor<3072x12288xf16>
    %2063 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.txt_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_2522 = torch.constant.int 0
    %int1_2523 = torch.constant.int 1
    %2064 = torch.aten.transpose.int %2063, %int0_2522, %int1_2523 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.5.txt_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.5.txt_mlp.2.bias : tensor<3072xf16>
    %2065 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.5.txt_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Widen bias, input and weight to f32 for the matmul and bias add.
    %int6_2524 = torch.constant.int 6
    %2066 = torch.prims.convert_element_type %2065, %int6_2524 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_2525 = torch.constant.int 6
    %2067 = torch.prims.convert_element_type %2062, %int6_2525 : !torch.vtensor<[512,12288],f16>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int6_2526 = torch.constant.int 6
    %2068 = torch.prims.convert_element_type %2064, %int6_2526 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %2069 = torch.aten.mm %2067, %2068 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Multiplications by the integer 1 leave both operands unchanged.
    %int1_2527 = torch.constant.int 1
    %2070 = torch.aten.mul.Scalar %2069, %int1_2527 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_2528 = torch.constant.int 1
    %2071 = torch.aten.mul.Scalar %2066, %int1_2528 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_2529 = torch.constant.int 1
    %2072 = torch.aten.add.Tensor %2070, %2071, %int1_2529 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    // Narrow to f16 and restore the leading unit axis.
    %int5_2530 = torch.constant.int 5
    %2073 = torch.prims.convert_element_type %2072, %int5_2530 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_2531 = torch.constant.int 1
    %int512_2532 = torch.constant.int 512
    %int3072_2533 = torch.constant.int 3072
    %2074 = torch.prim.ListConstruct %int1_2531, %int512_2532, %int3072_2533 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2075 = torch.aten.view %2073, %2074 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Scale by %1808 ([1,1,3072]) and add onto the post-attention value %2034.
    // NOTE(review): presumably %1808 is the txt MLP gate and %2077 the txt
    // output of double_blocks.5 -- confirm.
    %2076 = torch.aten.mul.Tensor %1808, %2075 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_2534 = torch.constant.int 1
    %2077 = torch.aten.add.Tensor %2034, %2076, %int1_2534 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // double_blocks.6 img_mod.lin: SiLU on %119 ([1,3072]) followed by a linear
    // layer 3072 -> 18432, giving %2089 ([1,18432], f16).
    // NOTE(review): presumably %119 is the conditioning vector shared by all
    // blocks; it is defined outside this excerpt -- confirm.
    %2078 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Weight is stored as [18432,3072]; transpose to [3072,18432] for x @ W^T.
    %__auto.sampler.double_blocks.6.img_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.6.img_mod.lin.weight : tensor<18432x3072xf16>
    %2079 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.img_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_2535 = torch.constant.int 0
    %int1_2536 = torch.constant.int 1
    %2080 = torch.aten.transpose.int %2079, %int0_2535, %int1_2536 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.6.img_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.6.img_mod.lin.bias : tensor<18432xf16>
    %2081 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.img_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Widen bias, input and weight to f32 for the matmul and bias add.
    %int6_2537 = torch.constant.int 6
    %2082 = torch.prims.convert_element_type %2081, %int6_2537 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_2538 = torch.constant.int 6
    %2083 = torch.prims.convert_element_type %2078, %int6_2538 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_2539 = torch.constant.int 6
    %2084 = torch.prims.convert_element_type %2080, %int6_2539 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %2085 = torch.aten.mm %2083, %2084 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Multiplications by the integer 1 leave both operands unchanged.
    %int1_2540 = torch.constant.int 1
    %2086 = torch.aten.mul.Scalar %2085, %int1_2540 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_2541 = torch.constant.int 1
    %2087 = torch.aten.mul.Scalar %2082, %int1_2541 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_2542 = torch.constant.int 1
    %2088 = torch.aten.add.Tensor %2086, %2087, %int1_2542 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    // Narrow the result back to f16.
    %int5_2543 = torch.constant.int 5
    %2089 = torch.prims.convert_element_type %2088, %int5_2543 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Split the [1,18432] image modulation vector %2089 into six [1,1,3072] chunks.
    // First a full-range slice on dim 0 (0 .. INT64_MAX, step 1; shape unchanged),
    // then a unit axis is inserted at dim 1, then a full-range slice on dim 2.
    %int0_2544 = torch.constant.int 0
    %int0_2545 = torch.constant.int 0
    %int9223372036854775807_2546 = torch.constant.int 9223372036854775807
    %int1_2547 = torch.constant.int 1
    %2090 = torch.aten.slice.Tensor %2089, %int0_2544, %int0_2545, %int9223372036854775807_2546, %int1_2547 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    %int1_2548 = torch.constant.int 1
    %2091 = torch.aten.unsqueeze %2090, %int1_2548 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    %int2_2549 = torch.constant.int 2
    %int0_2550 = torch.constant.int 0
    %int9223372036854775807_2551 = torch.constant.int 9223372036854775807
    %int1_2552 = torch.constant.int 1
    %2092 = torch.aten.slice.Tensor %2091, %int2_2549, %int0_2550, %int9223372036854775807_2551, %int1_2552 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Chunk 0: last-dim range [0, 3072). Used in this chunk as the additive term of
    // the image pre-attention modulation.
    %int-1_2553 = torch.constant.int -1
    %int0_2554 = torch.constant.int 0
    %int3072_2555 = torch.constant.int 3072
    %int1_2556 = torch.constant.int 1
    %2093 = torch.aten.slice.Tensor %2092, %int-1_2553, %int0_2554, %int3072_2555, %int1_2556 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1: last-dim range [3072, 6144). Used in this chunk as the multiplicative
    // term (after adding 1) of the image pre-attention modulation.
    %int-1_2557 = torch.constant.int -1
    %int3072_2558 = torch.constant.int 3072
    %int6144_2559 = torch.constant.int 6144
    %int1_2560 = torch.constant.int 1
    %2094 = torch.aten.slice.Tensor %2092, %int-1_2557, %int3072_2558, %int6144_2559, %int1_2560 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunks 2-5: ranges [6144, 9216), [9216, 12288), [12288, 15360), [15360, 18432).
    // Their consumers are not visible in this chunk of the file.
    %int-1_2561 = torch.constant.int -1
    %int6144_2562 = torch.constant.int 6144
    %int9216_2563 = torch.constant.int 9216
    %int1_2564 = torch.constant.int 1
    %2095 = torch.aten.slice.Tensor %2092, %int-1_2561, %int6144_2562, %int9216_2563, %int1_2564 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_2565 = torch.constant.int -1
    %int9216_2566 = torch.constant.int 9216
    %int12288_2567 = torch.constant.int 12288
    %int1_2568 = torch.constant.int 1
    %2096 = torch.aten.slice.Tensor %2092, %int-1_2565, %int9216_2566, %int12288_2567, %int1_2568 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_2569 = torch.constant.int -1
    %int12288_2570 = torch.constant.int 12288
    %int15360_2571 = torch.constant.int 15360
    %int1_2572 = torch.constant.int 1
    %2097 = torch.aten.slice.Tensor %2092, %int-1_2569, %int12288_2570, %int15360_2571, %int1_2572 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_2573 = torch.constant.int -1
    %int15360_2574 = torch.constant.int 15360
    %int18432_2575 = torch.constant.int 18432
    %int1_2576 = torch.constant.int 1
    %2098 = torch.aten.slice.Tensor %2092, %int-1_2573, %int15360_2574, %int18432_2575, %int1_2576 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // double_blocks.6.txt_mod.lin: text modulation projection.
    // SiLU is applied to the [1,3072] f16 value %119 (defined before this chunk;
    // presumably the shared conditioning vector -- confirm).
    %2099 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the [18432,3072] weight and transpose it to [3072,18432] for the matmul.
    %__auto.sampler.double_blocks.6.txt_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.6.txt_mod.lin.weight : tensor<18432x3072xf16>
    %2100 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.txt_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_2577 = torch.constant.int 0
    %int1_2578 = torch.constant.int 1
    %2101 = torch.aten.transpose.int %2100, %int0_2577, %int1_2578 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.6.txt_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.6.txt_mod.lin.bias : tensor<18432xf16>
    %2102 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.txt_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Widen bias, input and weight to f32 (dtype code 6) and multiply in f32.
    %int6_2579 = torch.constant.int 6
    %2103 = torch.prims.convert_element_type %2102, %int6_2579 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_2580 = torch.constant.int 6
    %2104 = torch.prims.convert_element_type %2099, %int6_2580 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_2581 = torch.constant.int 6
    %2105 = torch.prims.convert_element_type %2101, %int6_2581 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %2106 = torch.aten.mm %2104, %2105 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Product and bias are each scaled by the constant 1, then added with alpha = 1.
    %int1_2582 = torch.constant.int 1
    %2107 = torch.aten.mul.Scalar %2106, %int1_2582 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_2583 = torch.constant.int 1
    %2108 = torch.aten.mul.Scalar %2103, %int1_2583 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_2584 = torch.constant.int 1
    %2109 = torch.aten.add.Tensor %2107, %2108, %int1_2584 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    // Narrow the result back to f16 (dtype code 5).
    %int5_2585 = torch.constant.int 5
    %2110 = torch.prims.convert_element_type %2109, %int5_2585 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Split the [1,18432] text modulation vector %2110 into six [1,1,3072] chunks.
    // First a full-range slice on dim 0 (0 .. INT64_MAX, step 1; shape unchanged),
    // then a unit axis is inserted at dim 1, then a full-range slice on dim 2.
    %int0_2586 = torch.constant.int 0
    %int0_2587 = torch.constant.int 0
    %int9223372036854775807_2588 = torch.constant.int 9223372036854775807
    %int1_2589 = torch.constant.int 1
    %2111 = torch.aten.slice.Tensor %2110, %int0_2586, %int0_2587, %int9223372036854775807_2588, %int1_2589 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    %int1_2590 = torch.constant.int 1
    %2112 = torch.aten.unsqueeze %2111, %int1_2590 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    %int2_2591 = torch.constant.int 2
    %int0_2592 = torch.constant.int 0
    %int9223372036854775807_2593 = torch.constant.int 9223372036854775807
    %int1_2594 = torch.constant.int 1
    %2113 = torch.aten.slice.Tensor %2112, %int2_2591, %int0_2592, %int9223372036854775807_2593, %int1_2594 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Chunk 0: last-dim range [0, 3072). Used in this chunk as the additive term of
    // the text pre-attention modulation.
    %int-1_2595 = torch.constant.int -1
    %int0_2596 = torch.constant.int 0
    %int3072_2597 = torch.constant.int 3072
    %int1_2598 = torch.constant.int 1
    %2114 = torch.aten.slice.Tensor %2113, %int-1_2595, %int0_2596, %int3072_2597, %int1_2598 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1: last-dim range [3072, 6144). Used in this chunk as the multiplicative
    // term (after adding 1) of the text pre-attention modulation.
    %int-1_2599 = torch.constant.int -1
    %int3072_2600 = torch.constant.int 3072
    %int6144_2601 = torch.constant.int 6144
    %int1_2602 = torch.constant.int 1
    %2115 = torch.aten.slice.Tensor %2113, %int-1_2599, %int3072_2600, %int6144_2601, %int1_2602 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunks 2-5: ranges [6144, 9216), [9216, 12288), [12288, 15360), [15360, 18432).
    // Their consumers are not visible in this chunk of the file.
    %int-1_2603 = torch.constant.int -1
    %int6144_2604 = torch.constant.int 6144
    %int9216_2605 = torch.constant.int 9216
    %int1_2606 = torch.constant.int 1
    %2116 = torch.aten.slice.Tensor %2113, %int-1_2603, %int6144_2604, %int9216_2605, %int1_2606 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_2607 = torch.constant.int -1
    %int9216_2608 = torch.constant.int 9216
    %int12288_2609 = torch.constant.int 12288
    %int1_2610 = torch.constant.int 1
    %2117 = torch.aten.slice.Tensor %2113, %int-1_2607, %int9216_2608, %int12288_2609, %int1_2610 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_2611 = torch.constant.int -1
    %int12288_2612 = torch.constant.int 12288
    %int15360_2613 = torch.constant.int 15360
    %int1_2614 = torch.constant.int 1
    %2118 = torch.aten.slice.Tensor %2113, %int-1_2611, %int12288_2612, %int15360_2613, %int1_2614 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_2615 = torch.constant.int -1
    %int15360_2616 = torch.constant.int 15360
    %int18432_2617 = torch.constant.int 18432
    %int1_2618 = torch.constant.int 1
    %2119 = torch.aten.slice.Tensor %2113, %int-1_2615, %int15360_2616, %int18432_2617, %int1_2618 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize the image stream %2017 ([1,4096,3072] f16, defined before this
    // chunk) over its last axis without learned affine parameters, then modulate.
    // Statistics are computed in f32: var_mean over dim 2 with correction = 0 and
    // keepdim = true returns (variance, mean), each shaped [1,4096,1].
    %int6_2619 = torch.constant.int 6
    %2120 = torch.prims.convert_element_type %2017, %int6_2619 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %int2_2620 = torch.constant.int 2
    %2121 = torch.prim.ListConstruct %int2_2620 : (!torch.int) -> !torch.list<int>
    %int0_2621 = torch.constant.int 0
    %true_2622 = torch.constant.bool true
    %result0_2623, %result1_2624 = torch.aten.var_mean.correction %2120, %2121, %int0_2621, %true_2622 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(variance + eps), with eps approximately 1e-6.
    %float9.999990e-07_2625 = torch.constant.float 9.9999999999999995E-7
    %int1_2626 = torch.constant.int 1
    %2122 = torch.aten.add.Scalar %result0_2623, %float9.999990e-07_2625, %int1_2626 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %2123 = torch.aten.rsqrt %2122 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // The mean is subtracted from the original f16 tensor %2017 (not the f32 copy
    // %2120); the declared result type is f32.
    %int1_2627 = torch.constant.int 1
    %2124 = torch.aten.sub.Tensor %2017, %result1_2624, %int1_2627 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %2125 = torch.aten.mul.Tensor %2124, %2123 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    %int5_2628 = torch.constant.int 5
    %2126 = torch.prims.convert_element_type %2125, %int5_2628 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Modulation in f16: (1 + %2094) * normalized + %2093, broadcasting the
    // [1,1,3072] modulation chunks across the 4096 positions.
    %int1_2629 = torch.constant.int 1
    %int1_2630 = torch.constant.int 1
    %2127 = torch.aten.add.Scalar %2094, %int1_2629, %int1_2630 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %2128 = torch.aten.mul.Tensor %2127, %2126 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_2631 = torch.constant.int 1
    %2129 = torch.aten.add.Tensor %2128, %2093, %int1_2631 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Drop the leading unit batch axis so the result can feed a 2-D matmul.
    %int4096_2632 = torch.constant.int 4096
    %int3072_2633 = torch.constant.int 3072
    %2130 = torch.prim.ListConstruct %int4096_2632, %int3072_2633 : (!torch.int, !torch.int) -> !torch.list<int>
    %2131 = torch.aten.view %2129, %2130 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // double_blocks.6.img_attn.qkv: fused QKV projection for the image stream.
    // Load the [9216,3072] weight and transpose it to [3072,9216]; load the bias.
    %__auto.sampler.double_blocks.6.img_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.6.img_attn.qkv.weight : tensor<9216x3072xf16>
    %2132 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.img_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_2634 = torch.constant.int 0
    %int1_2635 = torch.constant.int 1
    %2133 = torch.aten.transpose.int %2132, %int0_2634, %int1_2635 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.6.img_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.6.img_attn.qkv.bias : tensor<9216xf16>
    %2134 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.img_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Widen bias, input (%2131) and weight to f32 (dtype code 6); matmul in f32.
    %int6_2636 = torch.constant.int 6
    %2135 = torch.prims.convert_element_type %2134, %int6_2636 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_2637 = torch.constant.int 6
    %2136 = torch.prims.convert_element_type %2131, %int6_2637 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_2638 = torch.constant.int 6
    %2137 = torch.prims.convert_element_type %2133, %int6_2638 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %2138 = torch.aten.mm %2136, %2137 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[4096,9216],f32>
    // Product and bias are each scaled by the constant 1, then added with alpha = 1.
    %int1_2639 = torch.constant.int 1
    %2139 = torch.aten.mul.Scalar %2138, %int1_2639 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int1_2640 = torch.constant.int 1
    %2140 = torch.aten.mul.Scalar %2135, %int1_2640 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_2641 = torch.constant.int 1
    %2141 = torch.aten.add.Tensor %2139, %2140, %int1_2641 : !torch.vtensor<[4096,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    // Narrow to f16 (dtype code 5) and restore the leading unit batch axis.
    %int5_2642 = torch.constant.int 5
    %2142 = torch.prims.convert_element_type %2141, %int5_2642 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f16>
    %int1_2643 = torch.constant.int 1
    %int4096_2644 = torch.constant.int 4096
    %int9216_2645 = torch.constant.int 9216
    %2143 = torch.prim.ListConstruct %int1_2643, %int4096_2644, %int9216_2645 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2144 = torch.aten.view %2142, %2143 : !torch.vtensor<[4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,9216],f16>
    // Split the fused image QKV tensor: view [1,4096,9216] as [1,4096,3,24,128],
    // permute with order (2,0,3,1,4) to [3,1,24,4096,128], then select indices
    // 0, 1 and 2 along dim 0. Later ops in this chunk treat index 0 as the query
    // (query_norm.scale), index 1 as the key (key_norm.scale); index 2 is
    // concatenated without normalization (presumably the value).
    %int1_2646 = torch.constant.int 1
    %int4096_2647 = torch.constant.int 4096
    %int3_2648 = torch.constant.int 3
    %int24_2649 = torch.constant.int 24
    %int128_2650 = torch.constant.int 128
    %2145 = torch.prim.ListConstruct %int1_2646, %int4096_2647, %int3_2648, %int24_2649, %int128_2650 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2146 = torch.aten.view %2144, %2145 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_2651 = torch.constant.int 2
    %int0_2652 = torch.constant.int 0
    %int3_2653 = torch.constant.int 3
    %int1_2654 = torch.constant.int 1
    %int4_2655 = torch.constant.int 4
    %2147 = torch.prim.ListConstruct %int2_2651, %int0_2652, %int3_2653, %int1_2654, %int4_2655 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2148 = torch.aten.permute %2146, %2147 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    %int0_2656 = torch.constant.int 0
    %int0_2657 = torch.constant.int 0
    %2149 = torch.aten.select.int %2148, %int0_2656, %int0_2657 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int0_2658 = torch.constant.int 0
    %int1_2659 = torch.constant.int 1
    %2150 = torch.aten.select.int %2148, %int0_2658, %int1_2659 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int0_2660 = torch.constant.int 0
    %int2_2661 = torch.constant.int 2
    %2151 = torch.aten.select.int %2148, %int0_2660, %int2_2661 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // double_blocks.6.img_attn.norm: RMS-style normalization of the image query
    // and key over the last (128-wide) axis, each followed by a learned scale.
    // Query (%2149): in f32, x * rsqrt(mean(x^2, dim=-1, keepdim) + eps), with
    // eps approximately 1e-6; narrowed to f16, then multiplied by query_norm.scale.
    %int6_2662 = torch.constant.int 6
    %2152 = torch.prims.convert_element_type %2149, %int6_2662 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_2663 = torch.constant.int 2
    %2153 = torch.aten.pow.Tensor_Scalar %2152, %int2_2663 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_2664 = torch.constant.int -1
    %2154 = torch.prim.ListConstruct %int-1_2664 : (!torch.int) -> !torch.list<int>
    %true_2665 = torch.constant.bool true
    %none_2666 = torch.constant.none
    %2155 = torch.aten.mean.dim %2153, %2154, %true_2665, %none_2666 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_2667 = torch.constant.float 9.9999999999999995E-7
    %int1_2668 = torch.constant.int 1
    %2156 = torch.aten.add.Scalar %2155, %float9.999990e-07_2667, %int1_2668 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %2157 = torch.aten.rsqrt %2156 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %2158 = torch.aten.mul.Tensor %2152, %2157 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    %int5_2669 = torch.constant.int 5
    %2159 = torch.prims.convert_element_type %2158, %int5_2669 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.6.img_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.6.img_attn.norm.query_norm.scale : tensor<128xf16>
    %2160 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.img_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %2161 = torch.aten.mul.Tensor %2159, %2160 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // Key (%2150): same computation, scaled by key_norm.scale.
    %int6_2670 = torch.constant.int 6
    %2162 = torch.prims.convert_element_type %2150, %int6_2670 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_2671 = torch.constant.int 2
    %2163 = torch.aten.pow.Tensor_Scalar %2162, %int2_2671 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_2672 = torch.constant.int -1
    %2164 = torch.prim.ListConstruct %int-1_2672 : (!torch.int) -> !torch.list<int>
    %true_2673 = torch.constant.bool true
    %none_2674 = torch.constant.none
    %2165 = torch.aten.mean.dim %2163, %2164, %true_2673, %none_2674 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_2675 = torch.constant.float 9.9999999999999995E-7
    %int1_2676 = torch.constant.int 1
    %2166 = torch.aten.add.Scalar %2165, %float9.999990e-07_2675, %int1_2676 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %2167 = torch.aten.rsqrt %2166 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %2168 = torch.aten.mul.Tensor %2162, %2167 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    %int5_2677 = torch.constant.int 5
    %2169 = torch.prims.convert_element_type %2168, %int5_2677 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.6.img_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.6.img_attn.norm.key_norm.scale : tensor<128xf16>
    %2170 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.img_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %2171 = torch.aten.mul.Tensor %2169, %2170 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // f16 -> f16 conversions (dtype code 5 on f16 inputs): the element type does
    // not change. Presumably the traced `.to(v)` cast of q and k -- confirm.
    %int5_2678 = torch.constant.int 5
    %2172 = torch.prims.convert_element_type %2161, %int5_2678 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_2679 = torch.constant.int 5
    %2173 = torch.prims.convert_element_type %2171, %int5_2679 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Normalize the text stream %2077 ([1,512,3072] f16, defined before this
    // chunk) over its last axis without learned affine parameters, then modulate.
    // Statistics are computed in f32: var_mean over dim 2 with correction = 0 and
    // keepdim = true returns (variance, mean), each shaped [1,512,1].
    %int6_2680 = torch.constant.int 6
    %2174 = torch.prims.convert_element_type %2077, %int6_2680 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %int2_2681 = torch.constant.int 2
    %2175 = torch.prim.ListConstruct %int2_2681 : (!torch.int) -> !torch.list<int>
    %int0_2682 = torch.constant.int 0
    %true_2683 = torch.constant.bool true
    %result0_2684, %result1_2685 = torch.aten.var_mean.correction %2174, %2175, %int0_2682, %true_2683 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(variance + eps), with eps approximately 1e-6.
    %float9.999990e-07_2686 = torch.constant.float 9.9999999999999995E-7
    %int1_2687 = torch.constant.int 1
    %2176 = torch.aten.add.Scalar %result0_2684, %float9.999990e-07_2686, %int1_2687 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %2177 = torch.aten.rsqrt %2176 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // The mean is subtracted from the original f16 tensor %2077 (not the f32 copy
    // %2174); the declared result type is f32.
    %int1_2688 = torch.constant.int 1
    %2178 = torch.aten.sub.Tensor %2077, %result1_2685, %int1_2688 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %2179 = torch.aten.mul.Tensor %2178, %2177 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_2689 = torch.constant.int 5
    %2180 = torch.prims.convert_element_type %2179, %int5_2689 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Modulation in f16: (1 + %2115) * normalized + %2114, broadcasting the
    // [1,1,3072] modulation chunks across the 512 positions.
    %int1_2690 = torch.constant.int 1
    %int1_2691 = torch.constant.int 1
    %2181 = torch.aten.add.Scalar %2115, %int1_2690, %int1_2691 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %2182 = torch.aten.mul.Tensor %2181, %2180 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_2692 = torch.constant.int 1
    %2183 = torch.aten.add.Tensor %2182, %2114, %int1_2692 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Drop the leading unit batch axis so the result can feed a 2-D matmul.
    %int512_2693 = torch.constant.int 512
    %int3072_2694 = torch.constant.int 3072
    %2184 = torch.prim.ListConstruct %int512_2693, %int3072_2694 : (!torch.int, !torch.int) -> !torch.list<int>
    %2185 = torch.aten.view %2183, %2184 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // double_blocks.6.txt_attn.qkv: fused QKV projection for the text stream.
    // Load the [9216,3072] weight and transpose it to [3072,9216]; load the bias.
    %__auto.sampler.double_blocks.6.txt_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.6.txt_attn.qkv.weight : tensor<9216x3072xf16>
    %2186 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.txt_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_2695 = torch.constant.int 0
    %int1_2696 = torch.constant.int 1
    %2187 = torch.aten.transpose.int %2186, %int0_2695, %int1_2696 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.6.txt_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.6.txt_attn.qkv.bias : tensor<9216xf16>
    %2188 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.txt_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Widen bias, input (%2185) and weight to f32 (dtype code 6); matmul in f32.
    %int6_2697 = torch.constant.int 6
    %2189 = torch.prims.convert_element_type %2188, %int6_2697 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_2698 = torch.constant.int 6
    %2190 = torch.prims.convert_element_type %2185, %int6_2698 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_2699 = torch.constant.int 6
    %2191 = torch.prims.convert_element_type %2187, %int6_2699 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %2192 = torch.aten.mm %2190, %2191 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[512,9216],f32>
    // Product and bias are each scaled by the constant 1, then added with alpha = 1.
    %int1_2700 = torch.constant.int 1
    %2193 = torch.aten.mul.Scalar %2192, %int1_2700 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int1_2701 = torch.constant.int 1
    %2194 = torch.aten.mul.Scalar %2189, %int1_2701 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_2702 = torch.constant.int 1
    %2195 = torch.aten.add.Tensor %2193, %2194, %int1_2702 : !torch.vtensor<[512,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    // Narrow to f16 (dtype code 5) and restore the leading unit batch axis.
    %int5_2703 = torch.constant.int 5
    %2196 = torch.prims.convert_element_type %2195, %int5_2703 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f16>
    %int1_2704 = torch.constant.int 1
    %int512_2705 = torch.constant.int 512
    %int9216_2706 = torch.constant.int 9216
    %2197 = torch.prim.ListConstruct %int1_2704, %int512_2705, %int9216_2706 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2198 = torch.aten.view %2196, %2197 : !torch.vtensor<[512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,9216],f16>
    // Split the fused text QKV tensor: view [1,512,9216] as [1,512,3,24,128],
    // permute with order (2,0,3,1,4) to [3,1,24,512,128], then select indices
    // 0, 1 and 2 along dim 0. Later ops in this chunk treat index 0 as the query
    // (query_norm.scale), index 1 as the key (key_norm.scale); index 2 is
    // concatenated without normalization (presumably the value).
    %int1_2707 = torch.constant.int 1
    %int512_2708 = torch.constant.int 512
    %int3_2709 = torch.constant.int 3
    %int24_2710 = torch.constant.int 24
    %int128_2711 = torch.constant.int 128
    %2199 = torch.prim.ListConstruct %int1_2707, %int512_2708, %int3_2709, %int24_2710, %int128_2711 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2200 = torch.aten.view %2198, %2199 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_2712 = torch.constant.int 2
    %int0_2713 = torch.constant.int 0
    %int3_2714 = torch.constant.int 3
    %int1_2715 = torch.constant.int 1
    %int4_2716 = torch.constant.int 4
    %2201 = torch.prim.ListConstruct %int2_2712, %int0_2713, %int3_2714, %int1_2715, %int4_2716 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2202 = torch.aten.permute %2200, %2201 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    %int0_2717 = torch.constant.int 0
    %int0_2718 = torch.constant.int 0
    %2203 = torch.aten.select.int %2202, %int0_2717, %int0_2718 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int0_2719 = torch.constant.int 0
    %int1_2720 = torch.constant.int 1
    %2204 = torch.aten.select.int %2202, %int0_2719, %int1_2720 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int0_2721 = torch.constant.int 0
    %int2_2722 = torch.constant.int 2
    %2205 = torch.aten.select.int %2202, %int0_2721, %int2_2722 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // double_blocks.6.txt_attn.norm: RMS-style normalization of the text query
    // and key over the last (128-wide) axis, each followed by a learned scale.
    // Query (%2203): in f32, x * rsqrt(mean(x^2, dim=-1, keepdim) + eps), with
    // eps approximately 1e-6; narrowed to f16, then multiplied by query_norm.scale.
    %int6_2723 = torch.constant.int 6
    %2206 = torch.prims.convert_element_type %2203, %int6_2723 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_2724 = torch.constant.int 2
    %2207 = torch.aten.pow.Tensor_Scalar %2206, %int2_2724 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_2725 = torch.constant.int -1
    %2208 = torch.prim.ListConstruct %int-1_2725 : (!torch.int) -> !torch.list<int>
    %true_2726 = torch.constant.bool true
    %none_2727 = torch.constant.none
    %2209 = torch.aten.mean.dim %2207, %2208, %true_2726, %none_2727 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_2728 = torch.constant.float 9.9999999999999995E-7
    %int1_2729 = torch.constant.int 1
    %2210 = torch.aten.add.Scalar %2209, %float9.999990e-07_2728, %int1_2729 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %2211 = torch.aten.rsqrt %2210 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %2212 = torch.aten.mul.Tensor %2206, %2211 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_2730 = torch.constant.int 5
    %2213 = torch.prims.convert_element_type %2212, %int5_2730 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.6.txt_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.6.txt_attn.norm.query_norm.scale : tensor<128xf16>
    %2214 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.txt_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %2215 = torch.aten.mul.Tensor %2213, %2214 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // Key (%2204): same computation, scaled by key_norm.scale.
    %int6_2731 = torch.constant.int 6
    %2216 = torch.prims.convert_element_type %2204, %int6_2731 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_2732 = torch.constant.int 2
    %2217 = torch.aten.pow.Tensor_Scalar %2216, %int2_2732 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_2733 = torch.constant.int -1
    %2218 = torch.prim.ListConstruct %int-1_2733 : (!torch.int) -> !torch.list<int>
    %true_2734 = torch.constant.bool true
    %none_2735 = torch.constant.none
    %2219 = torch.aten.mean.dim %2217, %2218, %true_2734, %none_2735 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_2736 = torch.constant.float 9.9999999999999995E-7
    %int1_2737 = torch.constant.int 1
    %2220 = torch.aten.add.Scalar %2219, %float9.999990e-07_2736, %int1_2737 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %2221 = torch.aten.rsqrt %2220 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %2222 = torch.aten.mul.Tensor %2216, %2221 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_2738 = torch.constant.int 5
    %2223 = torch.prims.convert_element_type %2222, %int5_2738 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.6.txt_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.6.txt_attn.norm.key_norm.scale : tensor<128xf16>
    %2224 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.txt_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %2225 = torch.aten.mul.Tensor %2223, %2224 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // f16 -> f16 conversions (dtype code 5 on f16 inputs): the element type does
    // not change. Presumably the traced `.to(v)` cast of q and k -- confirm.
    %int5_2739 = torch.constant.int 5
    %2226 = torch.prims.convert_element_type %2215, %int5_2739 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int5_2740 = torch.constant.int 5
    %2227 = torch.prims.convert_element_type %2225, %int5_2740 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Join the text and image streams for attention: concatenate along dim 2 with
    // the 512 text positions first and the 4096 image positions second, giving
    // 4608 positions. %2229 joins the normalized queries, %2231 the normalized
    // keys, and %2233 the unnormalized third QKV slices (%2205, %2151).
    %2228 = torch.prim.ListConstruct %2226, %2172 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_2741 = torch.constant.int 2
    %2229 = torch.aten.cat %2228, %int2_2741 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %2230 = torch.prim.ListConstruct %2227, %2173 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_2742 = torch.constant.int 2
    %2231 = torch.aten.cat %2230, %int2_2742 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %2232 = torch.prim.ListConstruct %2205, %2151 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_2743 = torch.constant.int 2
    %2233 = torch.aten.cat %2232, %int2_2743 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Widen the concatenated query (%2229) and key (%2231) to f32 and view each
    // [1,24,4608,128] tensor as [1,24,4608,64,1,2]: the -1 entry in the shape
    // list resolves to 64, so the 128-wide axis is regrouped into 64 pairs.
    // NOTE(review): presumably preparation for rotary position embedding -- confirm.
    %int6_2744 = torch.constant.int 6
    %2234 = torch.prims.convert_element_type %2229, %int6_2744 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_2745 = torch.constant.int 1
    %int24_2746 = torch.constant.int 24
    %int4608_2747 = torch.constant.int 4608
    %int-1_2748 = torch.constant.int -1
    %int1_2749 = torch.constant.int 1
    %int2_2750 = torch.constant.int 2
    %2235 = torch.prim.ListConstruct %int1_2745, %int24_2746, %int4608_2747, %int-1_2748, %int1_2749, %int2_2750 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2236 = torch.aten.view %2234, %2235 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Same widening and regrouping for the key.
    %int6_2751 = torch.constant.int 6
    %2237 = torch.prims.convert_element_type %2231, %int6_2751 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_2752 = torch.constant.int 1
    %int24_2753 = torch.constant.int 24
    %int4608_2754 = torch.constant.int 4608
    %int-1_2755 = torch.constant.int -1
    %int1_2756 = torch.constant.int 1
    %int2_2757 = torch.constant.int 2
    %2238 = torch.prim.ListConstruct %int1_2752, %int24_2753, %int4608_2754, %int-1_2755, %int1_2756, %int2_2757 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2239 = torch.aten.view %2237, %2238 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    %int5_2758 = torch.constant.int 5
    %int0_2759 = torch.constant.int 0
    %2240 = torch.aten.select.int %211, %int5_2758, %int0_2759 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_2760 = torch.constant.int 5
    %int0_2761 = torch.constant.int 0
    %2241 = torch.aten.select.int %2236, %int5_2760, %int0_2761 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %2242 = torch.aten.mul.Tensor %2240, %2241 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_2762 = torch.constant.int 5
    %int1_2763 = torch.constant.int 1
    %2243 = torch.aten.select.int %211, %int5_2762, %int1_2763 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_2764 = torch.constant.int 5
    %int1_2765 = torch.constant.int 1
    %2244 = torch.aten.select.int %2236, %int5_2764, %int1_2765 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %2245 = torch.aten.mul.Tensor %2243, %2244 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_2766 = torch.constant.int 1
    %2246 = torch.aten.add.Tensor %2242, %2245, %int1_2766 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_2767 = torch.constant.int 5
    %int0_2768 = torch.constant.int 0
    %2247 = torch.aten.select.int %211, %int5_2767, %int0_2768 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_2769 = torch.constant.int 5
    %int0_2770 = torch.constant.int 0
    %2248 = torch.aten.select.int %2239, %int5_2769, %int0_2770 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %2249 = torch.aten.mul.Tensor %2247, %2248 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_2771 = torch.constant.int 5
    %int1_2772 = torch.constant.int 1
    %2250 = torch.aten.select.int %211, %int5_2771, %int1_2772 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_2773 = torch.constant.int 5
    %int1_2774 = torch.constant.int 1
    %2251 = torch.aten.select.int %2239, %int5_2773, %int1_2774 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %2252 = torch.aten.mul.Tensor %2250, %2251 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_2775 = torch.constant.int 1
    %2253 = torch.aten.add.Tensor %2249, %2252, %int1_2775 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_2776 = torch.constant.int 1
    %int24_2777 = torch.constant.int 24
    %int4608_2778 = torch.constant.int 4608
    %int128_2779 = torch.constant.int 128
    %2254 = torch.prim.ListConstruct %int1_2776, %int24_2777, %int4608_2778, %int128_2779 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2255 = torch.aten.view %2246, %2254 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_2780 = torch.constant.int 5
    %2256 = torch.prims.convert_element_type %2255, %int5_2780 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_2781 = torch.constant.int 1
    %int24_2782 = torch.constant.int 24
    %int4608_2783 = torch.constant.int 4608
    %int128_2784 = torch.constant.int 128
    %2257 = torch.prim.ListConstruct %int1_2781, %int24_2782, %int4608_2783, %int128_2784 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2258 = torch.aten.view %2253, %2257 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_2785 = torch.constant.int 5
    %2259 = torch.prims.convert_element_type %2258, %int5_2785 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Joint attention over all 4608 positions and 24 heads. Tensor operands are
    // %2256, %2259, %2233, followed by 0.0, false, none, none.
    // NOTE(review): per the ATen schema these should be query, key, value,
    // dropout_p, is_causal, attn_mask, scale - confirm against the op definition.
    // Only result #0 is consumed within this excerpt.
    %float0.000000e00_2786 = torch.constant.float 0.000000e+00
    %false_2787 = torch.constant.bool false
    %none_2788 = torch.constant.none
    %none_2789 = torch.constant.none
    %2260:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%2256, %2259, %2233, %float0.000000e00_2786, %false_2787, %none_2788, %none_2789) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2 ([1,24,4608,128] -> [1,4608,24,128]) so the head and
    // per-head axes are adjacent, then merge them into one 3072-wide axis.
    %int0_2790 = torch.constant.int 0
    %int2_2791 = torch.constant.int 2
    %int1_2792 = torch.constant.int 1
    %int3_2793 = torch.constant.int 3
    %2261 = torch.prim.ListConstruct %int0_2790, %int2_2791, %int1_2792, %int3_2793 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2262 = torch.aten.permute %2260#0, %2261 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_2794 = torch.constant.int 1
    %int4608_2795 = torch.constant.int 4608
    %int3072_2796 = torch.constant.int 3072
    %2263 = torch.prim.ListConstruct %int1_2794, %int4608_2795, %int3072_2796 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2264 = torch.aten.view %2262, %2263 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1); the shape is
    // unchanged.
    %int0_2797 = torch.constant.int 0
    %int0_2798 = torch.constant.int 0
    %int9223372036854775807_2799 = torch.constant.int 9223372036854775807
    %int1_2800 = torch.constant.int 1
    %2265 = torch.aten.slice.Tensor %2264, %int0_2797, %int0_2798, %int9223372036854775807_2799, %int1_2800 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Rows [0, 512) along dim 1; %2266 is later consumed by txt_attn.proj.
    %int1_2801 = torch.constant.int 1
    %int0_2802 = torch.constant.int 0
    %int512_2803 = torch.constant.int 512
    %int1_2804 = torch.constant.int 1
    %2266 = torch.aten.slice.Tensor %2265, %int1_2801, %int0_2802, %int512_2803, %int1_2804 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Another full-range slice on dim 0, then rows [512, end) along dim 1,
    // leaving the remaining 4096 rows.
    %int0_2805 = torch.constant.int 0
    %int0_2806 = torch.constant.int 0
    %int9223372036854775807_2807 = torch.constant.int 9223372036854775807
    %int1_2808 = torch.constant.int 1
    %2267 = torch.aten.slice.Tensor %2264, %int0_2805, %int0_2806, %int9223372036854775807_2807, %int1_2808 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_2809 = torch.constant.int 1
    %int512_2810 = torch.constant.int 512
    %int9223372036854775807_2811 = torch.constant.int 9223372036854775807
    %int1_2812 = torch.constant.int 1
    %2268 = torch.aten.slice.Tensor %2267, %int1_2809, %int512_2810, %int9223372036854775807_2811, %int1_2812 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Drop the unit batch dim so the 4096-row part can be fed to a 2-D matmul.
    %int4096_2813 = torch.constant.int 4096
    %int3072_2814 = torch.constant.int 3072
    %2269 = torch.prim.ListConstruct %int4096_2813, %int3072_2814 : (!torch.int, !torch.int) -> !torch.list<int>
    %2270 = torch.aten.view %2268, %2269 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Linear projection with double_blocks.6.img_attn.proj: %2270 x W^T + bias.
    // The parameters are stored as f16; the matmul and bias add run in f32 and
    // the result is narrowed back to f16.
    %__auto.sampler.double_blocks.6.img_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.6.img_attn.proj.weight : tensor<3072x3072xf16>
    %2271 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.img_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    // The weight is transposed (dims 0 and 1 swapped) before the matmul.
    %int0_2815 = torch.constant.int 0
    %int1_2816 = torch.constant.int 1
    %2272 = torch.aten.transpose.int %2271, %int0_2815, %int1_2816 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.6.img_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.6.img_attn.proj.bias : tensor<3072xf16>
    %2273 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.img_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, activations and weight to f32 (dtype code 6).
    %int6_2817 = torch.constant.int 6
    %2274 = torch.prims.convert_element_type %2273, %int6_2817 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_2818 = torch.constant.int 6
    %2275 = torch.prims.convert_element_type %2270, %int6_2818 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_2819 = torch.constant.int 6
    %2276 = torch.prims.convert_element_type %2272, %int6_2819 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %2277 = torch.aten.mm %2275, %2276 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Both scalar multiplications use the integer constant 1, so values are
    // unchanged.
    // NOTE(review): presumably alpha/beta left over from an addmm decomposition - confirm.
    %int1_2820 = torch.constant.int 1
    %2278 = torch.aten.mul.Scalar %2277, %int1_2820 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_2821 = torch.constant.int 1
    %2279 = torch.aten.mul.Scalar %2274, %int1_2821 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Add the bias (broadcast over rows), then narrow to f16 (dtype code 5).
    %int1_2822 = torch.constant.int 1
    %2280 = torch.aten.add.Tensor %2278, %2279, %int1_2822 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_2823 = torch.constant.int 5
    %2281 = torch.prims.convert_element_type %2280, %int5_2823 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    // Restore the unit batch dim.
    %int1_2824 = torch.constant.int 1
    %int4096_2825 = torch.constant.int 4096
    %int3072_2826 = torch.constant.int 3072
    %2282 = torch.prim.ListConstruct %int1_2824, %int4096_2825, %int3072_2826 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2283 = torch.aten.view %2281, %2282 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Scale the projection by %2095 ([1,1,3072], broadcast over rows) and add it
    // to %2017. Both are defined above this excerpt.
    // NOTE(review): presumably the modulation gate and the incoming image-stream
    // residual - confirm.
    %2284 = torch.aten.mul.Tensor %2095, %2283 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_2827 = torch.constant.int 1
    %2285 = torch.aten.add.Tensor %2017, %2284, %int1_2827 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Normalize %2285 over its last axis, then apply a scale and shift:
    //   %2295 = (1 + %2097) * normalize(%2285) + %2096
    // %2097 and %2096 ([1,1,3072]) are defined above this excerpt.
    // NOTE(review): presumably the modulation scale and shift - confirm.
    // No learned norm weight or bias is applied here.
    %int1_2828 = torch.constant.int 1
    %int1_2829 = torch.constant.int 1
    %2286 = torch.aten.add.Scalar %2097, %int1_2828, %int1_2829 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // The statistics are computed in f32 (dtype code 6).
    %int6_2830 = torch.constant.int 6
    %2287 = torch.prims.convert_element_type %2285, %int6_2830 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // Variance and mean over dim 2 with correction 0 and keepdim = true.
    %int2_2831 = torch.constant.int 2
    %2288 = torch.prim.ListConstruct %int2_2831 : (!torch.int) -> !torch.list<int>
    %int0_2832 = torch.constant.int 0
    %true_2833 = torch.constant.bool true
    %result0_2834, %result1_2835 = torch.aten.var_mean.correction %2287, %2288, %int0_2832, %true_2833 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(variance + epsilon), with epsilon approximately 1e-6.
    %float9.999990e-07_2836 = torch.constant.float 9.9999999999999995E-7
    %int1_2837 = torch.constant.int 1
    %2289 = torch.aten.add.Scalar %result0_2834, %float9.999990e-07_2836, %int1_2837 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %2290 = torch.aten.rsqrt %2289 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // Subtracting the f32 mean from the f16 input yields an f32 result.
    %int1_2838 = torch.constant.int 1
    %2291 = torch.aten.sub.Tensor %2285, %result1_2835, %int1_2838 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %2292 = torch.aten.mul.Tensor %2291, %2290 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Narrow to f16 (dtype code 5) before applying the scale and shift.
    %int5_2839 = torch.constant.int 5
    %2293 = torch.prims.convert_element_type %2292, %int5_2839 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %2294 = torch.aten.mul.Tensor %2286, %2293 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_2840 = torch.constant.int 1
    %2295 = torch.aten.add.Tensor %2294, %2096, %int1_2840 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Two-layer MLP on %2295: img_mlp.0 (3072 -> 12288), GELU with the "tanh"
    // approximation, then img_mlp.2 (12288 -> 3072). Each linear layer runs its
    // matmul and bias add in f32 and narrows the result to f16.
    //
    // Drop the unit batch dim for the 2-D matmul.
    %int4096_2841 = torch.constant.int 4096
    %int3072_2842 = torch.constant.int 3072
    %2296 = torch.prim.ListConstruct %int4096_2841, %int3072_2842 : (!torch.int, !torch.int) -> !torch.list<int>
    %2297 = torch.aten.view %2295, %2296 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // First linear layer: load img_mlp.0 weight and bias; transpose the weight.
    %__auto.sampler.double_blocks.6.img_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.6.img_mlp.0.weight : tensor<12288x3072xf16>
    %2298 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.img_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_2843 = torch.constant.int 0
    %int1_2844 = torch.constant.int 1
    %2299 = torch.aten.transpose.int %2298, %int0_2843, %int1_2844 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.6.img_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.6.img_mlp.0.bias : tensor<12288xf16>
    %2300 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.img_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Upcast bias, activations and weight to f32 (dtype code 6).
    %int6_2845 = torch.constant.int 6
    %2301 = torch.prims.convert_element_type %2300, %int6_2845 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_2846 = torch.constant.int 6
    %2302 = torch.prims.convert_element_type %2297, %int6_2846 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_2847 = torch.constant.int 6
    %2303 = torch.prims.convert_element_type %2299, %int6_2847 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %2304 = torch.aten.mm %2302, %2303 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[4096,12288],f32>
    // Multiplications by the integer constant 1 leave the values unchanged.
    %int1_2848 = torch.constant.int 1
    %2305 = torch.aten.mul.Scalar %2304, %int1_2848 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int1_2849 = torch.constant.int 1
    %2306 = torch.aten.mul.Scalar %2301, %int1_2849 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    // Add the bias, then narrow to f16 (dtype code 5).
    %int1_2850 = torch.constant.int 1
    %2307 = torch.aten.add.Tensor %2305, %2306, %int1_2850 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int5_2851 = torch.constant.int 5
    %2308 = torch.prims.convert_element_type %2307, %int5_2851 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f16>
    %int1_2852 = torch.constant.int 1
    %int4096_2853 = torch.constant.int 4096
    %int12288_2854 = torch.constant.int 12288
    %2309 = torch.prim.ListConstruct %int1_2852, %int4096_2853, %int12288_2854 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2310 = torch.aten.view %2308, %2309 : !torch.vtensor<[4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,12288],f16>
    // Activation: GELU with approximate = "tanh", evaluated on f16 values.
    %str_2855 = torch.constant.str "tanh"
    %2311 = torch.aten.gelu %2310, %str_2855 : !torch.vtensor<[1,4096,12288],f16>, !torch.str -> !torch.vtensor<[1,4096,12288],f16>
    %int4096_2856 = torch.constant.int 4096
    %int12288_2857 = torch.constant.int 12288
    %2312 = torch.prim.ListConstruct %int4096_2856, %int12288_2857 : (!torch.int, !torch.int) -> !torch.list<int>
    %2313 = torch.aten.view %2311, %2312 : !torch.vtensor<[1,4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[4096,12288],f16>
    // Second linear layer: load img_mlp.2 weight and bias; transpose the weight.
    %__auto.sampler.double_blocks.6.img_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.6.img_mlp.2.weight : tensor<3072x12288xf16>
    %2314 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.img_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_2858 = torch.constant.int 0
    %int1_2859 = torch.constant.int 1
    %2315 = torch.aten.transpose.int %2314, %int0_2858, %int1_2859 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.6.img_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.6.img_mlp.2.bias : tensor<3072xf16>
    %2316 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.img_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_2860 = torch.constant.int 6
    %2317 = torch.prims.convert_element_type %2316, %int6_2860 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_2861 = torch.constant.int 6
    %2318 = torch.prims.convert_element_type %2313, %int6_2861 : !torch.vtensor<[4096,12288],f16>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int6_2862 = torch.constant.int 6
    %2319 = torch.prims.convert_element_type %2315, %int6_2862 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %2320 = torch.aten.mm %2318, %2319 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    %int1_2863 = torch.constant.int 1
    %2321 = torch.aten.mul.Scalar %2320, %int1_2863 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_2864 = torch.constant.int 1
    %2322 = torch.aten.mul.Scalar %2317, %int1_2864 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_2865 = torch.constant.int 1
    %2323 = torch.aten.add.Tensor %2321, %2322, %int1_2865 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_2866 = torch.constant.int 5
    %2324 = torch.prims.convert_element_type %2323, %int5_2866 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_2867 = torch.constant.int 1
    %int4096_2868 = torch.constant.int 4096
    %int3072_2869 = torch.constant.int 3072
    %2325 = torch.prim.ListConstruct %int1_2867, %int4096_2868, %int3072_2869 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2326 = torch.aten.view %2324, %2325 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Scale the MLP output by %2098 ([1,1,3072], defined above this excerpt) and
    // add it to %2285, the input of the preceding normalization, giving %2328.
    // NOTE(review): %2098 is presumably the MLP modulation gate - confirm.
    %2327 = torch.aten.mul.Tensor %2098, %2326 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_2870 = torch.constant.int 1
    %2328 = torch.aten.add.Tensor %2285, %2327, %int1_2870 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear projection with double_blocks.6.txt_attn.proj applied to %2266,
    // the first 512 rows of the attention output: x W^T + bias, computed in f32
    // and narrowed to f16.
    //
    // Drop the unit batch dim for the 2-D matmul.
    %int512_2871 = torch.constant.int 512
    %int3072_2872 = torch.constant.int 3072
    %2329 = torch.prim.ListConstruct %int512_2871, %int3072_2872 : (!torch.int, !torch.int) -> !torch.list<int>
    %2330 = torch.aten.view %2266, %2329 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.6.txt_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.6.txt_attn.proj.weight : tensor<3072x3072xf16>
    %2331 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.txt_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    // The weight is transposed (dims 0 and 1 swapped) before the matmul.
    %int0_2873 = torch.constant.int 0
    %int1_2874 = torch.constant.int 1
    %2332 = torch.aten.transpose.int %2331, %int0_2873, %int1_2874 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.6.txt_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.6.txt_attn.proj.bias : tensor<3072xf16>
    %2333 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.txt_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, activations and weight to f32 (dtype code 6).
    %int6_2875 = torch.constant.int 6
    %2334 = torch.prims.convert_element_type %2333, %int6_2875 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_2876 = torch.constant.int 6
    %2335 = torch.prims.convert_element_type %2330, %int6_2876 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_2877 = torch.constant.int 6
    %2336 = torch.prims.convert_element_type %2332, %int6_2877 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %2337 = torch.aten.mm %2335, %2336 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Multiplications by the integer constant 1 leave the values unchanged.
    %int1_2878 = torch.constant.int 1
    %2338 = torch.aten.mul.Scalar %2337, %int1_2878 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_2879 = torch.constant.int 1
    %2339 = torch.aten.mul.Scalar %2334, %int1_2879 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Add the bias, then narrow to f16 (dtype code 5).
    %int1_2880 = torch.constant.int 1
    %2340 = torch.aten.add.Tensor %2338, %2339, %int1_2880 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_2881 = torch.constant.int 5
    %2341 = torch.prims.convert_element_type %2340, %int5_2881 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    // Restore the unit batch dim.
    %int1_2882 = torch.constant.int 1
    %int512_2883 = torch.constant.int 512
    %int3072_2884 = torch.constant.int 3072
    %2342 = torch.prim.ListConstruct %int1_2882, %int512_2883, %int3072_2884 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2343 = torch.aten.view %2341, %2342 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Scale the projection by %2116 ([1,1,3072]) and add it to %2077. Both are
    // defined above this excerpt.
    // NOTE(review): presumably the modulation gate and the incoming text-stream
    // residual - confirm.
    %2344 = torch.aten.mul.Tensor %2116, %2343 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_2885 = torch.constant.int 1
    %2345 = torch.aten.add.Tensor %2077, %2344, %int1_2885 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Normalize %2345 over its last axis, then apply a scale and shift:
    //   %2355 = (1 + %2118) * normalize(%2345) + %2117
    // %2118 and %2117 ([1,1,3072]) are defined above this excerpt.
    // NOTE(review): presumably the modulation scale and shift - confirm.
    // No learned norm weight or bias is applied here.
    %int1_2886 = torch.constant.int 1
    %int1_2887 = torch.constant.int 1
    %2346 = torch.aten.add.Scalar %2118, %int1_2886, %int1_2887 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // The statistics are computed in f32 (dtype code 6).
    %int6_2888 = torch.constant.int 6
    %2347 = torch.prims.convert_element_type %2345, %int6_2888 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // Variance and mean over dim 2 with correction 0 and keepdim = true.
    %int2_2889 = torch.constant.int 2
    %2348 = torch.prim.ListConstruct %int2_2889 : (!torch.int) -> !torch.list<int>
    %int0_2890 = torch.constant.int 0
    %true_2891 = torch.constant.bool true
    %result0_2892, %result1_2893 = torch.aten.var_mean.correction %2347, %2348, %int0_2890, %true_2891 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(variance + epsilon), with epsilon approximately 1e-6.
    %float9.999990e-07_2894 = torch.constant.float 9.9999999999999995E-7
    %int1_2895 = torch.constant.int 1
    %2349 = torch.aten.add.Scalar %result0_2892, %float9.999990e-07_2894, %int1_2895 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %2350 = torch.aten.rsqrt %2349 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // Subtracting the f32 mean from the f16 input yields an f32 result.
    %int1_2896 = torch.constant.int 1
    %2351 = torch.aten.sub.Tensor %2345, %result1_2893, %int1_2896 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %2352 = torch.aten.mul.Tensor %2351, %2350 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    // Narrow to f16 (dtype code 5) before applying the scale and shift.
    %int5_2897 = torch.constant.int 5
    %2353 = torch.prims.convert_element_type %2352, %int5_2897 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %2354 = torch.aten.mul.Tensor %2346, %2353 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_2898 = torch.constant.int 1
    %2355 = torch.aten.add.Tensor %2354, %2117, %int1_2898 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Two-layer MLP on %2355: txt_mlp.0 (3072 -> 12288), GELU with the "tanh"
    // approximation, then txt_mlp.2 (12288 -> 3072). Each linear layer runs its
    // matmul and bias add in f32 and narrows the result to f16.
    //
    // Drop the unit batch dim for the 2-D matmul.
    %int512_2899 = torch.constant.int 512
    %int3072_2900 = torch.constant.int 3072
    %2356 = torch.prim.ListConstruct %int512_2899, %int3072_2900 : (!torch.int, !torch.int) -> !torch.list<int>
    %2357 = torch.aten.view %2355, %2356 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // First linear layer: load txt_mlp.0 weight and bias; transpose the weight.
    %__auto.sampler.double_blocks.6.txt_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.6.txt_mlp.0.weight : tensor<12288x3072xf16>
    %2358 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.txt_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_2901 = torch.constant.int 0
    %int1_2902 = torch.constant.int 1
    %2359 = torch.aten.transpose.int %2358, %int0_2901, %int1_2902 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.6.txt_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.6.txt_mlp.0.bias : tensor<12288xf16>
    %2360 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.txt_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Upcast bias, activations and weight to f32 (dtype code 6).
    %int6_2903 = torch.constant.int 6
    %2361 = torch.prims.convert_element_type %2360, %int6_2903 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_2904 = torch.constant.int 6
    %2362 = torch.prims.convert_element_type %2357, %int6_2904 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_2905 = torch.constant.int 6
    %2363 = torch.prims.convert_element_type %2359, %int6_2905 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %2364 = torch.aten.mm %2362, %2363 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[512,12288],f32>
    // Multiplications by the integer constant 1 leave the values unchanged.
    %int1_2906 = torch.constant.int 1
    %2365 = torch.aten.mul.Scalar %2364, %int1_2906 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int1_2907 = torch.constant.int 1
    %2366 = torch.aten.mul.Scalar %2361, %int1_2907 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    // Add the bias, then narrow to f16 (dtype code 5).
    %int1_2908 = torch.constant.int 1
    %2367 = torch.aten.add.Tensor %2365, %2366, %int1_2908 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int5_2909 = torch.constant.int 5
    %2368 = torch.prims.convert_element_type %2367, %int5_2909 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f16>
    %int1_2910 = torch.constant.int 1
    %int512_2911 = torch.constant.int 512
    %int12288_2912 = torch.constant.int 12288
    %2369 = torch.prim.ListConstruct %int1_2910, %int512_2911, %int12288_2912 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2370 = torch.aten.view %2368, %2369 : !torch.vtensor<[512,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,512,12288],f16>
    // Activation: GELU with approximate = "tanh", evaluated on f16 values.
    %str_2913 = torch.constant.str "tanh"
    %2371 = torch.aten.gelu %2370, %str_2913 : !torch.vtensor<[1,512,12288],f16>, !torch.str -> !torch.vtensor<[1,512,12288],f16>
    %int512_2914 = torch.constant.int 512
    %int12288_2915 = torch.constant.int 12288
    %2372 = torch.prim.ListConstruct %int512_2914, %int12288_2915 : (!torch.int, !torch.int) -> !torch.list<int>
    %2373 = torch.aten.view %2371, %2372 : !torch.vtensor<[1,512,12288],f16>, !torch.list<int> -> !torch.vtensor<[512,12288],f16>
    // Second linear layer: load txt_mlp.2 weight and bias; transpose the weight.
    %__auto.sampler.double_blocks.6.txt_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.6.txt_mlp.2.weight : tensor<3072x12288xf16>
    %2374 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.txt_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_2916 = torch.constant.int 0
    %int1_2917 = torch.constant.int 1
    %2375 = torch.aten.transpose.int %2374, %int0_2916, %int1_2917 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.6.txt_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.6.txt_mlp.2.bias : tensor<3072xf16>
    %2376 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.6.txt_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_2918 = torch.constant.int 6
    %2377 = torch.prims.convert_element_type %2376, %int6_2918 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_2919 = torch.constant.int 6
    %2378 = torch.prims.convert_element_type %2373, %int6_2919 : !torch.vtensor<[512,12288],f16>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int6_2920 = torch.constant.int 6
    %2379 = torch.prims.convert_element_type %2375, %int6_2920 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %2380 = torch.aten.mm %2378, %2379 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[512,3072],f32>
    %int1_2921 = torch.constant.int 1
    %2381 = torch.aten.mul.Scalar %2380, %int1_2921 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_2922 = torch.constant.int 1
    %2382 = torch.aten.mul.Scalar %2377, %int1_2922 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_2923 = torch.constant.int 1
    %2383 = torch.aten.add.Tensor %2381, %2382, %int1_2923 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_2924 = torch.constant.int 5
    %2384 = torch.prims.convert_element_type %2383, %int5_2924 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_2925 = torch.constant.int 1
    %int512_2926 = torch.constant.int 512
    %int3072_2927 = torch.constant.int 3072
    %2385 = torch.prim.ListConstruct %int1_2925, %int512_2926, %int3072_2927 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2386 = torch.aten.view %2384, %2385 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Scale the MLP output by %2119 ([1,1,3072], defined above this excerpt) and
    // add it to %2345, the input of the preceding normalization, giving %2388.
    // NOTE(review): %2119 is presumably the MLP modulation gate - confirm.
    %2387 = torch.aten.mul.Tensor %2119, %2386 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_2928 = torch.constant.int 1
    %2388 = torch.aten.add.Tensor %2345, %2387, %int1_2928 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // double_blocks.7 img_mod: SiLU(%119) -> linear(3072 -> 18432) -> six [1,1,3072] chunks.
    // %119 is a [1,3072] f16 tensor defined earlier (presumably the conditioning
    // vector -- TODO confirm against its producer).
    %2389 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the [18432,3072] weight and transpose it to [3072,18432] for the matmul.
    %__auto.sampler.double_blocks.7.img_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.7.img_mod.lin.weight : tensor<18432x3072xf16>
    %2390 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.img_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_2929 = torch.constant.int 0
    %int1_2930 = torch.constant.int 1
    %2391 = torch.aten.transpose.int %2390, %int0_2929, %int1_2930 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.7.img_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.7.img_mod.lin.bias : tensor<18432xf16>
    %2392 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.img_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Widen bias, activation and weight to f32 (dtype code 6) so the matmul and bias
    // add run in f32.
    %int6_2931 = torch.constant.int 6
    %2393 = torch.prims.convert_element_type %2392, %int6_2931 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_2932 = torch.constant.int 6
    %2394 = torch.prims.convert_element_type %2389, %int6_2932 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_2933 = torch.constant.int 6
    %2395 = torch.prims.convert_element_type %2391, %int6_2933 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    // [1,3072] x [3072,18432] -> [1,18432]
    %2396 = torch.aten.mm %2394, %2395 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Multiplications by the integer constant 1 (numeric no-ops) precede the bias add.
    %int1_2934 = torch.constant.int 1
    %2397 = torch.aten.mul.Scalar %2396, %int1_2934 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_2935 = torch.constant.int 1
    %2398 = torch.aten.mul.Scalar %2393, %int1_2935 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_2936 = torch.constant.int 1
    %2399 = torch.aten.add.Tensor %2397, %2398, %int1_2936 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    // Narrow back to f16 (dtype code 5).
    %int5_2937 = torch.constant.int 5
    %2400 = torch.prims.convert_element_type %2399, %int5_2937 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): shape is unchanged.
    %int0_2938 = torch.constant.int 0
    %int0_2939 = torch.constant.int 0
    %int9223372036854775807_2940 = torch.constant.int 9223372036854775807
    %int1_2941 = torch.constant.int 1
    %2401 = torch.aten.slice.Tensor %2400, %int0_2938, %int0_2939, %int9223372036854775807_2940, %int1_2941 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a unit dim at position 1 so the chunks broadcast against [1,N,3072] tensors.
    %int1_2942 = torch.constant.int 1
    %2402 = torch.aten.unsqueeze %2401, %int1_2942 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Full-range slice on dim 2: shape is unchanged.
    %int2_2943 = torch.constant.int 2
    %int0_2944 = torch.constant.int 0
    %int9223372036854775807_2945 = torch.constant.int 9223372036854775807
    %int1_2946 = torch.constant.int 1
    %2403 = torch.aten.slice.Tensor %2402, %int2_2943, %int0_2944, %int9223372036854775807_2945, %int1_2946 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Split the last dim into six contiguous 3072-wide chunks.
    // Chunk 0, [0, 3072): later added to the modulated, normalized image tokens.
    %int-1_2947 = torch.constant.int -1
    %int0_2948 = torch.constant.int 0
    %int3072_2949 = torch.constant.int 3072
    %int1_2950 = torch.constant.int 1
    %2404 = torch.aten.slice.Tensor %2403, %int-1_2947, %int0_2948, %int3072_2949, %int1_2950 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1, [3072, 6144): later used as (1 + chunk) to multiply the normalized image tokens.
    %int-1_2951 = torch.constant.int -1
    %int3072_2952 = torch.constant.int 3072
    %int6144_2953 = torch.constant.int 6144
    %int1_2954 = torch.constant.int 1
    %2405 = torch.aten.slice.Tensor %2403, %int-1_2951, %int3072_2952, %int6144_2953, %int1_2954 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2, [6144, 9216): consumer is not in the nearby code (presumably a gate -- TODO confirm).
    %int-1_2955 = torch.constant.int -1
    %int6144_2956 = torch.constant.int 6144
    %int9216_2957 = torch.constant.int 9216
    %int1_2958 = torch.constant.int 1
    %2406 = torch.aten.slice.Tensor %2403, %int-1_2955, %int6144_2956, %int9216_2957, %int1_2958 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 3, [9216, 12288): consumer is not in the nearby code.
    %int-1_2959 = torch.constant.int -1
    %int9216_2960 = torch.constant.int 9216
    %int12288_2961 = torch.constant.int 12288
    %int1_2962 = torch.constant.int 1
    %2407 = torch.aten.slice.Tensor %2403, %int-1_2959, %int9216_2960, %int12288_2961, %int1_2962 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 4, [12288, 15360): consumer is not in the nearby code.
    %int-1_2963 = torch.constant.int -1
    %int12288_2964 = torch.constant.int 12288
    %int15360_2965 = torch.constant.int 15360
    %int1_2966 = torch.constant.int 1
    %2408 = torch.aten.slice.Tensor %2403, %int-1_2963, %int12288_2964, %int15360_2965, %int1_2966 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 5, [15360, 18432): consumer is not in the nearby code.
    %int-1_2967 = torch.constant.int -1
    %int15360_2968 = torch.constant.int 15360
    %int18432_2969 = torch.constant.int 18432
    %int1_2970 = torch.constant.int 1
    %2409 = torch.aten.slice.Tensor %2403, %int-1_2967, %int15360_2968, %int18432_2969, %int1_2970 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // double_blocks.7 txt_mod: SiLU(%119) -> linear(3072 -> 18432) -> six [1,1,3072] chunks.
    // The SiLU is applied to the same %119 input again; the result is a fresh SSA value.
    %2410 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the [18432,3072] weight and transpose it to [3072,18432] for the matmul.
    %__auto.sampler.double_blocks.7.txt_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.7.txt_mod.lin.weight : tensor<18432x3072xf16>
    %2411 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.txt_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_2971 = torch.constant.int 0
    %int1_2972 = torch.constant.int 1
    %2412 = torch.aten.transpose.int %2411, %int0_2971, %int1_2972 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.7.txt_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.7.txt_mod.lin.bias : tensor<18432xf16>
    %2413 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.txt_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Widen bias, activation and weight to f32 (dtype code 6) so the matmul and bias
    // add run in f32.
    %int6_2973 = torch.constant.int 6
    %2414 = torch.prims.convert_element_type %2413, %int6_2973 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_2974 = torch.constant.int 6
    %2415 = torch.prims.convert_element_type %2410, %int6_2974 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_2975 = torch.constant.int 6
    %2416 = torch.prims.convert_element_type %2412, %int6_2975 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    // [1,3072] x [3072,18432] -> [1,18432]
    %2417 = torch.aten.mm %2415, %2416 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Multiplications by the integer constant 1 (numeric no-ops) precede the bias add.
    %int1_2976 = torch.constant.int 1
    %2418 = torch.aten.mul.Scalar %2417, %int1_2976 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_2977 = torch.constant.int 1
    %2419 = torch.aten.mul.Scalar %2414, %int1_2977 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_2978 = torch.constant.int 1
    %2420 = torch.aten.add.Tensor %2418, %2419, %int1_2978 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    // Narrow back to f16 (dtype code 5).
    %int5_2979 = torch.constant.int 5
    %2421 = torch.prims.convert_element_type %2420, %int5_2979 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): shape is unchanged.
    %int0_2980 = torch.constant.int 0
    %int0_2981 = torch.constant.int 0
    %int9223372036854775807_2982 = torch.constant.int 9223372036854775807
    %int1_2983 = torch.constant.int 1
    %2422 = torch.aten.slice.Tensor %2421, %int0_2980, %int0_2981, %int9223372036854775807_2982, %int1_2983 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a unit dim at position 1 so the chunks broadcast against [1,N,3072] tensors.
    %int1_2984 = torch.constant.int 1
    %2423 = torch.aten.unsqueeze %2422, %int1_2984 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Full-range slice on dim 2: shape is unchanged.
    %int2_2985 = torch.constant.int 2
    %int0_2986 = torch.constant.int 0
    %int9223372036854775807_2987 = torch.constant.int 9223372036854775807
    %int1_2988 = torch.constant.int 1
    %2424 = torch.aten.slice.Tensor %2423, %int2_2985, %int0_2986, %int9223372036854775807_2987, %int1_2988 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Split the last dim into six contiguous 3072-wide chunks.
    // Chunk 0, [0, 3072): later added to the modulated, normalized text tokens.
    %int-1_2989 = torch.constant.int -1
    %int0_2990 = torch.constant.int 0
    %int3072_2991 = torch.constant.int 3072
    %int1_2992 = torch.constant.int 1
    %2425 = torch.aten.slice.Tensor %2424, %int-1_2989, %int0_2990, %int3072_2991, %int1_2992 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1, [3072, 6144): later used as (1 + chunk) to multiply the normalized text tokens.
    %int-1_2993 = torch.constant.int -1
    %int3072_2994 = torch.constant.int 3072
    %int6144_2995 = torch.constant.int 6144
    %int1_2996 = torch.constant.int 1
    %2426 = torch.aten.slice.Tensor %2424, %int-1_2993, %int3072_2994, %int6144_2995, %int1_2996 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2, [6144, 9216): consumer is not in the nearby code (presumably a gate -- TODO confirm).
    %int-1_2997 = torch.constant.int -1
    %int6144_2998 = torch.constant.int 6144
    %int9216_2999 = torch.constant.int 9216
    %int1_3000 = torch.constant.int 1
    %2427 = torch.aten.slice.Tensor %2424, %int-1_2997, %int6144_2998, %int9216_2999, %int1_3000 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 3, [9216, 12288): consumer is not in the nearby code.
    %int-1_3001 = torch.constant.int -1
    %int9216_3002 = torch.constant.int 9216
    %int12288_3003 = torch.constant.int 12288
    %int1_3004 = torch.constant.int 1
    %2428 = torch.aten.slice.Tensor %2424, %int-1_3001, %int9216_3002, %int12288_3003, %int1_3004 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 4, [12288, 15360): consumer is not in the nearby code.
    %int-1_3005 = torch.constant.int -1
    %int12288_3006 = torch.constant.int 12288
    %int15360_3007 = torch.constant.int 15360
    %int1_3008 = torch.constant.int 1
    %2429 = torch.aten.slice.Tensor %2424, %int-1_3005, %int12288_3006, %int15360_3007, %int1_3008 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 5, [15360, 18432): consumer is not in the nearby code.
    %int-1_3009 = torch.constant.int -1
    %int15360_3010 = torch.constant.int 15360
    %int18432_3011 = torch.constant.int 18432
    %int1_3012 = torch.constant.int 1
    %2430 = torch.aten.slice.Tensor %2424, %int-1_3009, %int15360_3010, %int18432_3011, %int1_3012 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize the [1,4096,3072] tensor %2328 (defined earlier) over its last dim
    // without learned scale/bias, then apply the img_mod modulation:
    //   out = (1 + %2405) * normalized + %2404
    // Statistics are computed on an f32 copy of the input.
    %int6_3013 = torch.constant.int 6
    %2431 = torch.prims.convert_element_type %2328, %int6_3013 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // Variance (%result0) and mean (%result1) over dim 2, correction = 0, keepdim = true.
    %int2_3014 = torch.constant.int 2
    %2432 = torch.prim.ListConstruct %int2_3014 : (!torch.int) -> !torch.list<int>
    %int0_3015 = torch.constant.int 0
    %true_3016 = torch.constant.bool true
    %result0_3017, %result1_3018 = torch.aten.var_mean.correction %2431, %2432, %int0_3015, %true_3016 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(var + eps) with eps ~= 1e-6.
    %float9.999990e-07_3019 = torch.constant.float 9.9999999999999995E-7
    %int1_3020 = torch.constant.int 1
    %2433 = torch.aten.add.Scalar %result0_3017, %float9.999990e-07_3019, %int1_3020 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %2434 = torch.aten.rsqrt %2433 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // Note: the subtraction takes the original f16 %2328 (not the f32 copy %2431) and
    // the f32 mean; the declared result type is f32.
    %int1_3021 = torch.constant.int 1
    %2435 = torch.aten.sub.Tensor %2328, %result1_3018, %int1_3021 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %2436 = torch.aten.mul.Tensor %2435, %2434 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Narrow the normalized tensor back to f16 (dtype code 5).
    %int5_3022 = torch.constant.int 5
    %2437 = torch.prims.convert_element_type %2436, %int5_3022 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // (1 + %2405): %2405 is the [3072, 6144) chunk of the img_mod projection.
    %int1_3023 = torch.constant.int 1
    %int1_3024 = torch.constant.int 1
    %2438 = torch.aten.add.Scalar %2405, %int1_3023, %int1_3024 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // The [1,1,3072] factor broadcasts over the 4096 rows.
    %2439 = torch.aten.mul.Tensor %2438, %2437 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    // Add %2404, the [0, 3072) chunk of the img_mod projection.
    %int1_3025 = torch.constant.int 1
    %2440 = torch.aten.add.Tensor %2439, %2404, %int1_3025 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Drop the leading unit dim for the following 2-D matmul: [1,4096,3072] -> [4096,3072].
    %int4096_3026 = torch.constant.int 4096
    %int3072_3027 = torch.constant.int 3072
    %2441 = torch.prim.ListConstruct %int4096_3026, %int3072_3027 : (!torch.int, !torch.int) -> !torch.list<int>
    %2442 = torch.aten.view %2440, %2441 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // double_blocks.7 img_attn.qkv: linear(3072 -> 9216) on the modulated image tokens,
    // then reshape and split into three [1,24,4096,128] tensors.
    // Load the [9216,3072] weight and transpose it to [3072,9216] for the matmul.
    %__auto.sampler.double_blocks.7.img_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.7.img_attn.qkv.weight : tensor<9216x3072xf16>
    %2443 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.img_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_3028 = torch.constant.int 0
    %int1_3029 = torch.constant.int 1
    %2444 = torch.aten.transpose.int %2443, %int0_3028, %int1_3029 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.7.img_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.7.img_attn.qkv.bias : tensor<9216xf16>
    %2445 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.img_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Widen bias, activation and weight to f32 (dtype code 6) for the matmul and bias add.
    %int6_3030 = torch.constant.int 6
    %2446 = torch.prims.convert_element_type %2445, %int6_3030 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_3031 = torch.constant.int 6
    %2447 = torch.prims.convert_element_type %2442, %int6_3031 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_3032 = torch.constant.int 6
    %2448 = torch.prims.convert_element_type %2444, %int6_3032 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    // [4096,3072] x [3072,9216] -> [4096,9216]
    %2449 = torch.aten.mm %2447, %2448 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[4096,9216],f32>
    // Multiplications by the integer constant 1 (numeric no-ops) precede the bias add.
    %int1_3033 = torch.constant.int 1
    %2450 = torch.aten.mul.Scalar %2449, %int1_3033 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int1_3034 = torch.constant.int 1
    %2451 = torch.aten.mul.Scalar %2446, %int1_3034 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_3035 = torch.constant.int 1
    %2452 = torch.aten.add.Tensor %2450, %2451, %int1_3035 : !torch.vtensor<[4096,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    // Narrow back to f16 (dtype code 5).
    %int5_3036 = torch.constant.int 5
    %2453 = torch.prims.convert_element_type %2452, %int5_3036 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f16>
    // Restore the leading unit dim: [4096,9216] -> [1,4096,9216].
    %int1_3037 = torch.constant.int 1
    %int4096_3038 = torch.constant.int 4096
    %int9216_3039 = torch.constant.int 9216
    %2454 = torch.prim.ListConstruct %int1_3037, %int4096_3038, %int9216_3039 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2455 = torch.aten.view %2453, %2454 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,9216],f16>
    // Factor 9216 as 3 x 24 x 128 (presumably q/k/v x heads x head_dim -- the first two
    // slices are normalized with query_norm / key_norm scales further below).
    %int1_3040 = torch.constant.int 1
    %int4096_3041 = torch.constant.int 4096
    %int3_3042 = torch.constant.int 3
    %int24_3043 = torch.constant.int 24
    %int128_3044 = torch.constant.int 128
    %2456 = torch.prim.ListConstruct %int1_3040, %int4096_3041, %int3_3042, %int24_3043, %int128_3044 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2457 = torch.aten.view %2455, %2456 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    // Permute with order (2,0,3,1,4): [1,4096,3,24,128] -> [3,1,24,4096,128].
    %int2_3045 = torch.constant.int 2
    %int0_3046 = torch.constant.int 0
    %int3_3047 = torch.constant.int 3
    %int1_3048 = torch.constant.int 1
    %int4_3049 = torch.constant.int 4
    %2458 = torch.prim.ListConstruct %int2_3045, %int0_3046, %int3_3047, %int1_3048, %int4_3049 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2459 = torch.aten.permute %2457, %2458 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // Index 0 along dim 0: normalized below with the query_norm scale.
    %int0_3050 = torch.constant.int 0
    %int0_3051 = torch.constant.int 0
    %2460 = torch.aten.select.int %2459, %int0_3050, %int0_3051 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Index 1 along dim 0: normalized below with the key_norm scale.
    %int0_3052 = torch.constant.int 0
    %int1_3053 = torch.constant.int 1
    %2461 = torch.aten.select.int %2459, %int0_3052, %int1_3053 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Index 2 along dim 0 (presumably the value tensor; its consumer is not in the nearby code).
    %int0_3054 = torch.constant.int 0
    %int2_3055 = torch.constant.int 2
    %2462 = torch.aten.select.int %2459, %int0_3054, %int2_3055 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // RMS-style normalization of the image-stream q (%2460) and k (%2461) over the last
    // (128-wide) dim: x * rsqrt(mean(x^2) + eps), followed by a learned [128] scale.
    // --- query ---
    // Widen to f32 (dtype code 6) for the statistics.
    %int6_3056 = torch.constant.int 6
    %2463 = torch.prims.convert_element_type %2460, %int6_3056 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    // x^2
    %int2_3057 = torch.constant.int 2
    %2464 = torch.aten.pow.Tensor_Scalar %2463, %int2_3057 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    // mean over dim -1, keepdim = true -> [1,24,4096,1]
    %int-1_3058 = torch.constant.int -1
    %2465 = torch.prim.ListConstruct %int-1_3058 : (!torch.int) -> !torch.list<int>
    %true_3059 = torch.constant.bool true
    %none_3060 = torch.constant.none
    %2466 = torch.aten.mean.dim %2464, %2465, %true_3059, %none_3060 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    // rsqrt(mean + eps) with eps ~= 1e-6
    %float9.999990e-07_3061 = torch.constant.float 9.9999999999999995E-7
    %int1_3062 = torch.constant.int 1
    %2467 = torch.aten.add.Scalar %2466, %float9.999990e-07_3061, %int1_3062 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %2468 = torch.aten.rsqrt %2467 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %2469 = torch.aten.mul.Tensor %2463, %2468 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Narrow back to f16 (dtype code 5) before applying the learned scale.
    %int5_3063 = torch.constant.int 5
    %2470 = torch.prims.convert_element_type %2469, %int5_3063 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.7.img_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.7.img_attn.norm.query_norm.scale : tensor<128xf16>
    %2471 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.img_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    // The [128] scale broadcasts over the leading dims.
    %2472 = torch.aten.mul.Tensor %2470, %2471 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // --- key --- (same sequence of ops, using the key_norm scale)
    %int6_3064 = torch.constant.int 6
    %2473 = torch.prims.convert_element_type %2461, %int6_3064 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_3065 = torch.constant.int 2
    %2474 = torch.aten.pow.Tensor_Scalar %2473, %int2_3065 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_3066 = torch.constant.int -1
    %2475 = torch.prim.ListConstruct %int-1_3066 : (!torch.int) -> !torch.list<int>
    %true_3067 = torch.constant.bool true
    %none_3068 = torch.constant.none
    %2476 = torch.aten.mean.dim %2474, %2475, %true_3067, %none_3068 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_3069 = torch.constant.float 9.9999999999999995E-7
    %int1_3070 = torch.constant.int 1
    %2477 = torch.aten.add.Scalar %2476, %float9.999990e-07_3069, %int1_3070 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %2478 = torch.aten.rsqrt %2477 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %2479 = torch.aten.mul.Tensor %2473, %2478 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    %int5_3071 = torch.constant.int 5
    %2480 = torch.prims.convert_element_type %2479, %int5_3071 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.7.img_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.7.img_attn.norm.key_norm.scale : tensor<128xf16>
    %2481 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.img_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %2482 = torch.aten.mul.Tensor %2480, %2481 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // f16 -> f16 conversions: the element type does not change. Presumably the traced
    // model casts q/k to the value dtype here -- TODO confirm. %2483 / %2484 are the
    // image-stream q / k consumed by the later concatenation.
    %int5_3072 = torch.constant.int 5
    %2483 = torch.prims.convert_element_type %2472, %int5_3072 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_3073 = torch.constant.int 5
    %2484 = torch.prims.convert_element_type %2482, %int5_3073 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Normalize the [1,512,3072] tensor %2388 over its last dim without learned
    // scale/bias, then apply the txt_mod modulation:
    //   out = (1 + %2426) * normalized + %2425
    // Statistics are computed on an f32 copy of the input.
    %int6_3074 = torch.constant.int 6
    %2485 = torch.prims.convert_element_type %2388, %int6_3074 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // Variance (%result0) and mean (%result1) over dim 2, correction = 0, keepdim = true.
    %int2_3075 = torch.constant.int 2
    %2486 = torch.prim.ListConstruct %int2_3075 : (!torch.int) -> !torch.list<int>
    %int0_3076 = torch.constant.int 0
    %true_3077 = torch.constant.bool true
    %result0_3078, %result1_3079 = torch.aten.var_mean.correction %2485, %2486, %int0_3076, %true_3077 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(var + eps) with eps ~= 1e-6.
    %float9.999990e-07_3080 = torch.constant.float 9.9999999999999995E-7
    %int1_3081 = torch.constant.int 1
    %2487 = torch.aten.add.Scalar %result0_3078, %float9.999990e-07_3080, %int1_3081 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %2488 = torch.aten.rsqrt %2487 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // Note: the subtraction takes the original f16 %2388 (not the f32 copy %2485) and
    // the f32 mean; the declared result type is f32.
    %int1_3082 = torch.constant.int 1
    %2489 = torch.aten.sub.Tensor %2388, %result1_3079, %int1_3082 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %2490 = torch.aten.mul.Tensor %2489, %2488 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    // Narrow the normalized tensor back to f16 (dtype code 5).
    %int5_3083 = torch.constant.int 5
    %2491 = torch.prims.convert_element_type %2490, %int5_3083 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // (1 + %2426): %2426 is the [3072, 6144) chunk of the txt_mod projection.
    %int1_3084 = torch.constant.int 1
    %int1_3085 = torch.constant.int 1
    %2492 = torch.aten.add.Scalar %2426, %int1_3084, %int1_3085 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // The [1,1,3072] factor broadcasts over the 512 rows.
    %2493 = torch.aten.mul.Tensor %2492, %2491 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    // Add %2425, the [0, 3072) chunk of the txt_mod projection.
    %int1_3086 = torch.constant.int 1
    %2494 = torch.aten.add.Tensor %2493, %2425, %int1_3086 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Drop the leading unit dim for the following 2-D matmul: [1,512,3072] -> [512,3072].
    %int512_3087 = torch.constant.int 512
    %int3072_3088 = torch.constant.int 3072
    %2495 = torch.prim.ListConstruct %int512_3087, %int3072_3088 : (!torch.int, !torch.int) -> !torch.list<int>
    %2496 = torch.aten.view %2494, %2495 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // double_blocks.7 txt_attn.qkv: linear(3072 -> 9216) on the modulated text tokens,
    // then reshape and split into three [1,24,512,128] tensors.
    // Load the [9216,3072] weight and transpose it to [3072,9216] for the matmul.
    %__auto.sampler.double_blocks.7.txt_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.7.txt_attn.qkv.weight : tensor<9216x3072xf16>
    %2497 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.txt_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_3089 = torch.constant.int 0
    %int1_3090 = torch.constant.int 1
    %2498 = torch.aten.transpose.int %2497, %int0_3089, %int1_3090 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.7.txt_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.7.txt_attn.qkv.bias : tensor<9216xf16>
    %2499 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.txt_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Widen bias, activation and weight to f32 (dtype code 6) for the matmul and bias add.
    %int6_3091 = torch.constant.int 6
    %2500 = torch.prims.convert_element_type %2499, %int6_3091 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_3092 = torch.constant.int 6
    %2501 = torch.prims.convert_element_type %2496, %int6_3092 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_3093 = torch.constant.int 6
    %2502 = torch.prims.convert_element_type %2498, %int6_3093 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    // [512,3072] x [3072,9216] -> [512,9216]
    %2503 = torch.aten.mm %2501, %2502 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[512,9216],f32>
    // Multiplications by the integer constant 1 (numeric no-ops) precede the bias add.
    %int1_3094 = torch.constant.int 1
    %2504 = torch.aten.mul.Scalar %2503, %int1_3094 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int1_3095 = torch.constant.int 1
    %2505 = torch.aten.mul.Scalar %2500, %int1_3095 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_3096 = torch.constant.int 1
    %2506 = torch.aten.add.Tensor %2504, %2505, %int1_3096 : !torch.vtensor<[512,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    // Narrow back to f16 (dtype code 5).
    %int5_3097 = torch.constant.int 5
    %2507 = torch.prims.convert_element_type %2506, %int5_3097 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f16>
    // Restore the leading unit dim: [512,9216] -> [1,512,9216].
    %int1_3098 = torch.constant.int 1
    %int512_3099 = torch.constant.int 512
    %int9216_3100 = torch.constant.int 9216
    %2508 = torch.prim.ListConstruct %int1_3098, %int512_3099, %int9216_3100 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2509 = torch.aten.view %2507, %2508 : !torch.vtensor<[512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,9216],f16>
    // Factor 9216 as 3 x 24 x 128 (presumably q/k/v x heads x head_dim -- the first two
    // slices are normalized with query_norm / key_norm scales further below).
    %int1_3101 = torch.constant.int 1
    %int512_3102 = torch.constant.int 512
    %int3_3103 = torch.constant.int 3
    %int24_3104 = torch.constant.int 24
    %int128_3105 = torch.constant.int 128
    %2510 = torch.prim.ListConstruct %int1_3101, %int512_3102, %int3_3103, %int24_3104, %int128_3105 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2511 = torch.aten.view %2509, %2510 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    // Permute with order (2,0,3,1,4): [1,512,3,24,128] -> [3,1,24,512,128].
    %int2_3106 = torch.constant.int 2
    %int0_3107 = torch.constant.int 0
    %int3_3108 = torch.constant.int 3
    %int1_3109 = torch.constant.int 1
    %int4_3110 = torch.constant.int 4
    %2512 = torch.prim.ListConstruct %int2_3106, %int0_3107, %int3_3108, %int1_3109, %int4_3110 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2513 = torch.aten.permute %2511, %2512 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // Index 0 along dim 0: normalized below with the query_norm scale.
    %int0_3111 = torch.constant.int 0
    %int0_3112 = torch.constant.int 0
    %2514 = torch.aten.select.int %2513, %int0_3111, %int0_3112 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Index 1 along dim 0: normalized below with the key_norm scale.
    %int0_3113 = torch.constant.int 0
    %int1_3114 = torch.constant.int 1
    %2515 = torch.aten.select.int %2513, %int0_3113, %int1_3114 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Index 2 along dim 0 (presumably the value tensor; its consumer is not in the nearby code).
    %int0_3115 = torch.constant.int 0
    %int2_3116 = torch.constant.int 2
    %2516 = torch.aten.select.int %2513, %int0_3115, %int2_3116 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int6_3117 = torch.constant.int 6
    %2517 = torch.prims.convert_element_type %2514, %int6_3117 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_3118 = torch.constant.int 2
    %2518 = torch.aten.pow.Tensor_Scalar %2517, %int2_3118 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_3119 = torch.constant.int -1
    %2519 = torch.prim.ListConstruct %int-1_3119 : (!torch.int) -> !torch.list<int>
    %true_3120 = torch.constant.bool true
    %none_3121 = torch.constant.none
    %2520 = torch.aten.mean.dim %2518, %2519, %true_3120, %none_3121 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_3122 = torch.constant.float 9.9999999999999995E-7
    %int1_3123 = torch.constant.int 1
    %2521 = torch.aten.add.Scalar %2520, %float9.999990e-07_3122, %int1_3123 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %2522 = torch.aten.rsqrt %2521 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %2523 = torch.aten.mul.Tensor %2517, %2522 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_3124 = torch.constant.int 5
    %2524 = torch.prims.convert_element_type %2523, %int5_3124 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.7.txt_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.7.txt_attn.norm.query_norm.scale : tensor<128xf16>
    %2525 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.txt_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %2526 = torch.aten.mul.Tensor %2524, %2525 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // Text-stream key normalization (RMS-style), computed in f32:
    //   %2536 = f16( x * rsqrt(mean(x^2, dim=-1, keepdim=true) + 1e-6) ) * key_norm.scale
    // where x is %2515 upcast from f16 to f32 (dtype code 6).
    %int6_3125 = torch.constant.int 6
    %2527 = torch.prims.convert_element_type %2515, %int6_3125 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_3126 = torch.constant.int 2
    %2528 = torch.aten.pow.Tensor_Scalar %2527, %int2_3126 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    // Mean of squares over the last dimension, keeping it as size 1 so it broadcasts below.
    %int-1_3127 = torch.constant.int -1
    %2529 = torch.prim.ListConstruct %int-1_3127 : (!torch.int) -> !torch.list<int>
    %true_3128 = torch.constant.bool true
    %none_3129 = torch.constant.none
    %2530 = torch.aten.mean.dim %2528, %2529, %true_3128, %none_3129 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    // Epsilon for numerical stability before the reciprocal square root.
    %float9.999990e-07_3130 = torch.constant.float 9.9999999999999995E-7
    %int1_3131 = torch.constant.int 1
    %2531 = torch.aten.add.Scalar %2530, %float9.999990e-07_3130, %int1_3131 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %2532 = torch.aten.rsqrt %2531 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %2533 = torch.aten.mul.Tensor %2527, %2532 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    // Back to f16 (dtype code 5), then apply the learned per-channel scale of shape [128].
    %int5_3132 = torch.constant.int 5
    %2534 = torch.prims.convert_element_type %2533, %int5_3132 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.7.txt_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.7.txt_attn.norm.key_norm.scale : tensor<128xf16>
    %2535 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.txt_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %2536 = torch.aten.mul.Tensor %2534, %2535 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // Build the joint attention inputs by concatenating along dim 2 (sequence): 512 + 4096 = 4608 tokens.
    // The two casts below are f16 -> f16, i.e. they do not change the element type.
    %int5_3133 = torch.constant.int 5
    %2537 = torch.prims.convert_element_type %2526, %int5_3133 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int5_3134 = torch.constant.int 5
    %2538 = torch.prims.convert_element_type %2536, %int5_3134 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Query: normalized text query first, then %2483.
    // NOTE(review): %2483 / %2484 / %2462 are presumably the image-stream q / k / v — confirm above.
    %2539 = torch.prim.ListConstruct %2537, %2483 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_3135 = torch.constant.int 2
    %2540 = torch.aten.cat %2539, %int2_3135 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Key: normalized text key first, then %2484.
    %2541 = torch.prim.ListConstruct %2538, %2484 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_3136 = torch.constant.int 2
    %2542 = torch.aten.cat %2541, %int2_3136 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Value: %2516 first, then %2462 (no normalization is applied to these here).
    %2543 = torch.prim.ListConstruct %2516, %2462 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_3137 = torch.constant.int 2
    %2544 = torch.aten.cat %2543, %int2_3137 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Apply the per-position 2x2 table %211 ([1,1,4608,64,2,2], f32) to the concatenated query and key.
    // Each 128-wide head vector is viewed as 64 pairs; for every pair (x0, x1) the result is
    //   out[i] = %211[..., i, 0] * x0 + %211[..., i, 1] * x1,   i in {0, 1}
    // and %211 broadcasts over the 24 heads (its dim 1 is size 1).
    // NOTE(review): %211 is presumably the rotary position-embedding table — confirm at its definition.

    // Query: upcast to f32 and view as [1,24,4608,64,1,2] (the -1 entry resolves to 64).
    %int6_3138 = torch.constant.int 6
    %2545 = torch.prims.convert_element_type %2540, %int6_3138 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_3139 = torch.constant.int 1
    %int24_3140 = torch.constant.int 24
    %int4608_3141 = torch.constant.int 4608
    %int-1_3142 = torch.constant.int -1
    %int1_3143 = torch.constant.int 1
    %int2_3144 = torch.constant.int 2
    %2546 = torch.prim.ListConstruct %int1_3139, %int24_3140, %int4608_3141, %int-1_3142, %int1_3143, %int2_3144 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2547 = torch.aten.view %2545, %2546 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Key: same upcast and 6-D view.
    %int6_3145 = torch.constant.int 6
    %2548 = torch.prims.convert_element_type %2542, %int6_3145 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_3146 = torch.constant.int 1
    %int24_3147 = torch.constant.int 24
    %int4608_3148 = torch.constant.int 4608
    %int-1_3149 = torch.constant.int -1
    %int1_3150 = torch.constant.int 1
    %int2_3151 = torch.constant.int 2
    %2549 = torch.prim.ListConstruct %int1_3146, %int24_3147, %int4608_3148, %int-1_3149, %int1_3150, %int2_3151 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2550 = torch.aten.view %2548, %2549 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Query, first term: column 0 of the table times element 0 of each pair.
    %int5_3152 = torch.constant.int 5
    %int0_3153 = torch.constant.int 0
    %2551 = torch.aten.select.int %211, %int5_3152, %int0_3153 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_3154 = torch.constant.int 5
    %int0_3155 = torch.constant.int 0
    %2552 = torch.aten.select.int %2547, %int5_3154, %int0_3155 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %2553 = torch.aten.mul.Tensor %2551, %2552 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Query, second term: column 1 of the table times element 1 of each pair.
    %int5_3156 = torch.constant.int 5
    %int1_3157 = torch.constant.int 1
    %2554 = torch.aten.select.int %211, %int5_3156, %int1_3157 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_3158 = torch.constant.int 5
    %int1_3159 = torch.constant.int 1
    %2555 = torch.aten.select.int %2547, %int5_3158, %int1_3159 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %2556 = torch.aten.mul.Tensor %2554, %2555 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Query: sum of the two terms (alpha = 1).
    %int1_3160 = torch.constant.int 1
    %2557 = torch.aten.add.Tensor %2553, %2556, %int1_3160 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key, first term: column 0 of the table times element 0 of each pair.
    %int5_3161 = torch.constant.int 5
    %int0_3162 = torch.constant.int 0
    %2558 = torch.aten.select.int %211, %int5_3161, %int0_3162 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_3163 = torch.constant.int 5
    %int0_3164 = torch.constant.int 0
    %2559 = torch.aten.select.int %2550, %int5_3163, %int0_3164 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %2560 = torch.aten.mul.Tensor %2558, %2559 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key, second term: column 1 of the table times element 1 of each pair.
    %int5_3165 = torch.constant.int 5
    %int1_3166 = torch.constant.int 1
    %2561 = torch.aten.select.int %211, %int5_3165, %int1_3166 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_3167 = torch.constant.int 5
    %int1_3168 = torch.constant.int 1
    %2562 = torch.aten.select.int %2550, %int5_3167, %int1_3168 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %2563 = torch.aten.mul.Tensor %2561, %2562 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key: sum of the two terms (alpha = 1).
    %int1_3169 = torch.constant.int 1
    %2564 = torch.aten.add.Tensor %2560, %2563, %int1_3169 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Flatten the query pairs back to 128 channels per head and cast to f16.
    %int1_3170 = torch.constant.int 1
    %int24_3171 = torch.constant.int 24
    %int4608_3172 = torch.constant.int 4608
    %int128_3173 = torch.constant.int 128
    %2565 = torch.prim.ListConstruct %int1_3170, %int24_3171, %int4608_3172, %int128_3173 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2566 = torch.aten.view %2557, %2565 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_3174 = torch.constant.int 5
    %2567 = torch.prims.convert_element_type %2566, %int5_3174 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Same flatten and f16 cast for the key.
    %int1_3175 = torch.constant.int 1
    %int24_3176 = torch.constant.int 24
    %int4608_3177 = torch.constant.int 4608
    %int128_3178 = torch.constant.int 128
    %2568 = torch.prim.ListConstruct %int1_3175, %int24_3176, %int4608_3177, %int128_3178 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2569 = torch.aten.view %2564, %2568 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_3179 = torch.constant.int 5
    %2570 = torch.prims.convert_element_type %2569, %int5_3179 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Joint attention over all 4608 tokens, using the rotated query %2567, rotated key %2570 and value %2544.
    // The trailing operands are 0.0, false, none, none.
    // NOTE(review): per the PyTorch op schema these are presumably dropout_p, is_causal, attn_mask, scale — confirm.
    // Only result #0 (the attention output) is used in the code below; result #1 is a [1,24,4608] f32 tensor.
    %float0.000000e00_3180 = torch.constant.float 0.000000e+00
    %false_3181 = torch.constant.bool false
    %none_3182 = torch.constant.none
    %none_3183 = torch.constant.none
    %2571:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%2567, %2570, %2544, %float0.000000e00_3180, %false_3181, %none_3182, %none_3183) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Move heads next to channels: [1,24,4608,128] -> [1,4608,24,128].
    %int0_3184 = torch.constant.int 0
    %int2_3185 = torch.constant.int 2
    %int1_3186 = torch.constant.int 1
    %int3_3187 = torch.constant.int 3
    %2572 = torch.prim.ListConstruct %int0_3184, %int2_3185, %int1_3186, %int3_3187 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2573 = torch.aten.permute %2571#0, %2572 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge heads: 24 * 128 = 3072 channels per token.
    %int1_3188 = torch.constant.int 1
    %int4608_3189 = torch.constant.int 4608
    %int3072_3190 = torch.constant.int 3072
    %2574 = torch.prim.ListConstruct %int1_3188, %int4608_3189, %int3072_3190 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2575 = torch.aten.view %2573, %2574 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1); the shape is unchanged.
    %int0_3191 = torch.constant.int 0
    %int0_3192 = torch.constant.int 0
    %int9223372036854775807_3193 = torch.constant.int 9223372036854775807
    %int1_3194 = torch.constant.int 1
    %2576 = torch.aten.slice.Tensor %2575, %int0_3191, %int0_3192, %int9223372036854775807_3193, %int1_3194 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Tokens [0, 512) on dim 1: the part later fed to txt_attn.proj.
    %int1_3195 = torch.constant.int 1
    %int0_3196 = torch.constant.int 0
    %int512_3197 = torch.constant.int 512
    %int1_3198 = torch.constant.int 1
    %2577 = torch.aten.slice.Tensor %2576, %int1_3195, %int0_3196, %int512_3197, %int1_3198 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Another full-range slice on dim 0, again without changing the shape.
    %int0_3199 = torch.constant.int 0
    %int0_3200 = torch.constant.int 0
    %int9223372036854775807_3201 = torch.constant.int 9223372036854775807
    %int1_3202 = torch.constant.int 1
    %2578 = torch.aten.slice.Tensor %2575, %int0_3199, %int0_3200, %int9223372036854775807_3201, %int1_3202 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Tokens [512, end) on dim 1 (4096 tokens): the part fed to img_attn.proj below.
    %int1_3203 = torch.constant.int 1
    %int512_3204 = torch.constant.int 512
    %int9223372036854775807_3205 = torch.constant.int 9223372036854775807
    %int1_3206 = torch.constant.int 1
    %2579 = torch.aten.slice.Tensor %2578, %int1_3203, %int512_3204, %int9223372036854775807_3205, %int1_3206 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Image-stream attention output projection followed by a gated residual add:
    //   %2596 = %2328 + %2406 * f16( f32(attn_img) @ f32(W^T) + f32(b) )
    // NOTE(review): %2406 ([1,1,3072]) is presumably a modulation gate and %2328 the incoming
    // image hidden state — both are defined above this point; confirm there.

    // Drop the batch dim so the projection is a plain 2-D matmul.
    %int4096_3207 = torch.constant.int 4096
    %int3072_3208 = torch.constant.int 3072
    %2580 = torch.prim.ListConstruct %int4096_3207, %int3072_3208 : (!torch.int, !torch.int) -> !torch.list<int>
    %2581 = torch.aten.view %2579, %2580 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Load the weight and transpose dims 0 and 1 so it can be the right-hand operand of mm.
    %__auto.sampler.double_blocks.7.img_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.7.img_attn.proj.weight : tensor<3072x3072xf16>
    %2582 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.img_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_3209 = torch.constant.int 0
    %int1_3210 = torch.constant.int 1
    %2583 = torch.aten.transpose.int %2582, %int0_3209, %int1_3210 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.7.img_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.7.img_attn.proj.bias : tensor<3072xf16>
    %2584 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.img_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, activations and weight to f32 (dtype code 6) so the matmul and bias add run in f32.
    %int6_3211 = torch.constant.int 6
    %2585 = torch.prims.convert_element_type %2584, %int6_3211 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_3212 = torch.constant.int 6
    %2586 = torch.prims.convert_element_type %2581, %int6_3212 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_3213 = torch.constant.int 6
    %2587 = torch.prims.convert_element_type %2583, %int6_3213 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %2588 = torch.aten.mm %2586, %2587 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Both operands are multiplied by the integer 1 before the add, so the values are unchanged.
    %int1_3214 = torch.constant.int 1
    %2589 = torch.aten.mul.Scalar %2588, %int1_3214 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_3215 = torch.constant.int 1
    %2590 = torch.aten.mul.Scalar %2585, %int1_3215 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_3216 = torch.constant.int 1
    %2591 = torch.aten.add.Tensor %2589, %2590, %int1_3216 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    // Back to f16 (dtype code 5) and restore the batch dim.
    %int5_3217 = torch.constant.int 5
    %2592 = torch.prims.convert_element_type %2591, %int5_3217 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_3218 = torch.constant.int 1
    %int4096_3219 = torch.constant.int 4096
    %int3072_3220 = torch.constant.int 3072
    %2593 = torch.prim.ListConstruct %int1_3218, %int4096_3219, %int3072_3220 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2594 = torch.aten.view %2592, %2593 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Scale by %2406 (broadcast over the 4096 tokens) and add to %2328.
    %2595 = torch.aten.mul.Tensor %2406, %2594 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_3221 = torch.constant.int 1
    %2596 = torch.aten.add.Tensor %2328, %2595, %int1_3221 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Image-stream modulated normalization ahead of the MLP:
    //   %2606 = (1 + %2408) * f16( (x - mean) * rsqrt(var + 1e-6) ) + %2407,   x = %2596
    // Mean and variance are taken over dim 2 (the 3072 channels) in f32; no learned affine
    // weights are loaded here.
    // NOTE(review): %2408 / %2407 ([1,1,3072]) are presumably modulation scale / shift — confirm above.
    %int1_3222 = torch.constant.int 1
    %int1_3223 = torch.constant.int 1
    %2597 = torch.aten.add.Scalar %2408, %int1_3222, %int1_3223 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int6_3224 = torch.constant.int 6
    %2598 = torch.prims.convert_element_type %2596, %int6_3224 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // var_mean with correction = 0 and keepdim = true: result0 is the variance, result1 the mean.
    %int2_3225 = torch.constant.int 2
    %2599 = torch.prim.ListConstruct %int2_3225 : (!torch.int) -> !torch.list<int>
    %int0_3226 = torch.constant.int 0
    %true_3227 = torch.constant.bool true
    %result0_3228, %result1_3229 = torch.aten.var_mean.correction %2598, %2599, %int0_3226, %true_3227 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    %float9.999990e-07_3230 = torch.constant.float 9.9999999999999995E-7
    %int1_3231 = torch.constant.int 1
    %2600 = torch.aten.add.Scalar %result0_3228, %float9.999990e-07_3230, %int1_3231 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %2601 = torch.aten.rsqrt %2600 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // The subtraction takes the f16 tensor %2596 and the f32 mean and yields an f32 result.
    %int1_3232 = torch.constant.int 1
    %2602 = torch.aten.sub.Tensor %2596, %result1_3229, %int1_3232 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %2603 = torch.aten.mul.Tensor %2602, %2601 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    %int5_3233 = torch.constant.int 5
    %2604 = torch.prims.convert_element_type %2603, %int5_3233 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Apply (1 + %2408) as a multiplier and %2407 as an additive term, both broadcast over tokens.
    %2605 = torch.aten.mul.Tensor %2597, %2604 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_3234 = torch.constant.int 1
    %2606 = torch.aten.add.Tensor %2605, %2407, %int1_3234 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Image-stream MLP with gated residual:
    //   h      = gelu_tanh( linear(img_mlp.0)(%2606) )     3072 -> 12288
    //   y      = linear(img_mlp.2)(h)                      12288 -> 3072
    //   %2639  = %2596 + %2409 * y
    // Each linear upcasts to f32 for mm + bias and casts the result back to f16.
    // NOTE(review): %2409 ([1,1,3072]) is presumably the MLP modulation gate — confirm above.

    // Flatten to 2-D for the first matmul.
    %int4096_3235 = torch.constant.int 4096
    %int3072_3236 = torch.constant.int 3072
    %2607 = torch.prim.ListConstruct %int4096_3235, %int3072_3236 : (!torch.int, !torch.int) -> !torch.list<int>
    %2608 = torch.aten.view %2606, %2607 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // img_mlp.0: weight [12288,3072] transposed to [3072,12288], bias [12288].
    %__auto.sampler.double_blocks.7.img_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.7.img_mlp.0.weight : tensor<12288x3072xf16>
    %2609 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.img_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_3237 = torch.constant.int 0
    %int1_3238 = torch.constant.int 1
    %2610 = torch.aten.transpose.int %2609, %int0_3237, %int1_3238 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.7.img_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.7.img_mlp.0.bias : tensor<12288xf16>
    %2611 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.img_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // f32 upcasts (dtype code 6), matmul, then bias add; the multiplies by 1 leave values unchanged.
    %int6_3239 = torch.constant.int 6
    %2612 = torch.prims.convert_element_type %2611, %int6_3239 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_3240 = torch.constant.int 6
    %2613 = torch.prims.convert_element_type %2608, %int6_3240 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_3241 = torch.constant.int 6
    %2614 = torch.prims.convert_element_type %2610, %int6_3241 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %2615 = torch.aten.mm %2613, %2614 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[4096,12288],f32>
    %int1_3242 = torch.constant.int 1
    %2616 = torch.aten.mul.Scalar %2615, %int1_3242 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int1_3243 = torch.constant.int 1
    %2617 = torch.aten.mul.Scalar %2612, %int1_3243 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_3244 = torch.constant.int 1
    %2618 = torch.aten.add.Tensor %2616, %2617, %int1_3244 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    // Back to f16 and restore the batch dim before the activation.
    %int5_3245 = torch.constant.int 5
    %2619 = torch.prims.convert_element_type %2618, %int5_3245 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f16>
    %int1_3246 = torch.constant.int 1
    %int4096_3247 = torch.constant.int 4096
    %int12288_3248 = torch.constant.int 12288
    %2620 = torch.prim.ListConstruct %int1_3246, %int4096_3247, %int12288_3248 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2621 = torch.aten.view %2619, %2620 : !torch.vtensor<[4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,12288],f16>
    // GELU with the "tanh" approximation, evaluated on the f16 tensor.
    %str_3249 = torch.constant.str "tanh"
    %2622 = torch.aten.gelu %2621, %str_3249 : !torch.vtensor<[1,4096,12288],f16>, !torch.str -> !torch.vtensor<[1,4096,12288],f16>
    // Flatten again for the second matmul.
    %int4096_3250 = torch.constant.int 4096
    %int12288_3251 = torch.constant.int 12288
    %2623 = torch.prim.ListConstruct %int4096_3250, %int12288_3251 : (!torch.int, !torch.int) -> !torch.list<int>
    %2624 = torch.aten.view %2622, %2623 : !torch.vtensor<[1,4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[4096,12288],f16>
    // img_mlp.2: weight [3072,12288] transposed to [12288,3072], bias [3072].
    %__auto.sampler.double_blocks.7.img_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.7.img_mlp.2.weight : tensor<3072x12288xf16>
    %2625 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.img_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_3252 = torch.constant.int 0
    %int1_3253 = torch.constant.int 1
    %2626 = torch.aten.transpose.int %2625, %int0_3252, %int1_3253 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.7.img_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.7.img_mlp.2.bias : tensor<3072xf16>
    %2627 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.img_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // f32 upcasts, matmul, bias add, as for the first linear.
    %int6_3254 = torch.constant.int 6
    %2628 = torch.prims.convert_element_type %2627, %int6_3254 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_3255 = torch.constant.int 6
    %2629 = torch.prims.convert_element_type %2624, %int6_3255 : !torch.vtensor<[4096,12288],f16>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int6_3256 = torch.constant.int 6
    %2630 = torch.prims.convert_element_type %2626, %int6_3256 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %2631 = torch.aten.mm %2629, %2630 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    %int1_3257 = torch.constant.int 1
    %2632 = torch.aten.mul.Scalar %2631, %int1_3257 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_3258 = torch.constant.int 1
    %2633 = torch.aten.mul.Scalar %2628, %int1_3258 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_3259 = torch.constant.int 1
    %2634 = torch.aten.add.Tensor %2632, %2633, %int1_3259 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    // Back to f16 and restore the batch dim.
    %int5_3260 = torch.constant.int 5
    %2635 = torch.prims.convert_element_type %2634, %int5_3260 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_3261 = torch.constant.int 1
    %int4096_3262 = torch.constant.int 4096
    %int3072_3263 = torch.constant.int 3072
    %2636 = torch.prim.ListConstruct %int1_3261, %int4096_3262, %int3072_3263 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2637 = torch.aten.view %2635, %2636 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Scale by %2409 (broadcast over tokens) and add to the post-attention image state %2596.
    %2638 = torch.aten.mul.Tensor %2409, %2637 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_3264 = torch.constant.int 1
    %2639 = torch.aten.add.Tensor %2596, %2638, %int1_3264 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Text-stream attention output projection followed by a gated residual add:
    //   %2656 = %2388 + %2427 * f16( f32(attn_txt) @ f32(W^T) + f32(b) )
    // where attn_txt is %2577, the first 512 tokens of the joint attention output.
    // NOTE(review): %2427 ([1,1,3072]) is presumably a modulation gate and %2388 the incoming
    // text hidden state — both are defined above this point; confirm there.

    // Drop the batch dim so the projection is a plain 2-D matmul.
    %int512_3265 = torch.constant.int 512
    %int3072_3266 = torch.constant.int 3072
    %2640 = torch.prim.ListConstruct %int512_3265, %int3072_3266 : (!torch.int, !torch.int) -> !torch.list<int>
    %2641 = torch.aten.view %2577, %2640 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Load the weight and transpose dims 0 and 1 so it can be the right-hand operand of mm.
    %__auto.sampler.double_blocks.7.txt_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.7.txt_attn.proj.weight : tensor<3072x3072xf16>
    %2642 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.txt_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_3267 = torch.constant.int 0
    %int1_3268 = torch.constant.int 1
    %2643 = torch.aten.transpose.int %2642, %int0_3267, %int1_3268 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.7.txt_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.7.txt_attn.proj.bias : tensor<3072xf16>
    %2644 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.txt_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, activations and weight to f32 (dtype code 6) so the matmul and bias add run in f32.
    %int6_3269 = torch.constant.int 6
    %2645 = torch.prims.convert_element_type %2644, %int6_3269 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_3270 = torch.constant.int 6
    %2646 = torch.prims.convert_element_type %2641, %int6_3270 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_3271 = torch.constant.int 6
    %2647 = torch.prims.convert_element_type %2643, %int6_3271 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %2648 = torch.aten.mm %2646, %2647 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Both operands are multiplied by the integer 1 before the add, so the values are unchanged.
    %int1_3272 = torch.constant.int 1
    %2649 = torch.aten.mul.Scalar %2648, %int1_3272 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_3273 = torch.constant.int 1
    %2650 = torch.aten.mul.Scalar %2645, %int1_3273 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_3274 = torch.constant.int 1
    %2651 = torch.aten.add.Tensor %2649, %2650, %int1_3274 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    // Back to f16 (dtype code 5) and restore the batch dim.
    %int5_3275 = torch.constant.int 5
    %2652 = torch.prims.convert_element_type %2651, %int5_3275 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_3276 = torch.constant.int 1
    %int512_3277 = torch.constant.int 512
    %int3072_3278 = torch.constant.int 3072
    %2653 = torch.prim.ListConstruct %int1_3276, %int512_3277, %int3072_3278 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2654 = torch.aten.view %2652, %2653 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Scale by %2427 (broadcast over the 512 tokens) and add to %2388.
    %2655 = torch.aten.mul.Tensor %2427, %2654 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_3279 = torch.constant.int 1
    %2656 = torch.aten.add.Tensor %2388, %2655, %int1_3279 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Text-stream modulated normalization ahead of the MLP:
    //   %2666 = (1 + %2429) * f16( (x - mean) * rsqrt(var + 1e-6) ) + %2428,   x = %2656
    // Mean and variance are taken over dim 2 (the 3072 channels) in f32; no learned affine
    // weights are loaded here.
    // NOTE(review): %2429 / %2428 ([1,1,3072]) are presumably modulation scale / shift — confirm above.
    %int1_3280 = torch.constant.int 1
    %int1_3281 = torch.constant.int 1
    %2657 = torch.aten.add.Scalar %2429, %int1_3280, %int1_3281 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int6_3282 = torch.constant.int 6
    %2658 = torch.prims.convert_element_type %2656, %int6_3282 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // var_mean with correction = 0 and keepdim = true: result0 is the variance, result1 the mean.
    %int2_3283 = torch.constant.int 2
    %2659 = torch.prim.ListConstruct %int2_3283 : (!torch.int) -> !torch.list<int>
    %int0_3284 = torch.constant.int 0
    %true_3285 = torch.constant.bool true
    %result0_3286, %result1_3287 = torch.aten.var_mean.correction %2658, %2659, %int0_3284, %true_3285 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    %float9.999990e-07_3288 = torch.constant.float 9.9999999999999995E-7
    %int1_3289 = torch.constant.int 1
    %2660 = torch.aten.add.Scalar %result0_3286, %float9.999990e-07_3288, %int1_3289 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %2661 = torch.aten.rsqrt %2660 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // The subtraction takes the f16 tensor %2656 and the f32 mean and yields an f32 result.
    %int1_3290 = torch.constant.int 1
    %2662 = torch.aten.sub.Tensor %2656, %result1_3287, %int1_3290 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %2663 = torch.aten.mul.Tensor %2662, %2661 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_3291 = torch.constant.int 5
    %2664 = torch.prims.convert_element_type %2663, %int5_3291 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Apply (1 + %2429) as a multiplier and %2428 as an additive term, both broadcast over tokens.
    %2665 = torch.aten.mul.Tensor %2657, %2664 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_3292 = torch.constant.int 1
    %2666 = torch.aten.add.Tensor %2665, %2428, %int1_3292 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %int512_3293 = torch.constant.int 512
    %int3072_3294 = torch.constant.int 3072
    %2667 = torch.prim.ListConstruct %int512_3293, %int3072_3294 : (!torch.int, !torch.int) -> !torch.list<int>
    %2668 = torch.aten.view %2666, %2667 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.7.txt_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.7.txt_mlp.0.weight : tensor<12288x3072xf16>
    %2669 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.txt_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_3295 = torch.constant.int 0
    %int1_3296 = torch.constant.int 1
    %2670 = torch.aten.transpose.int %2669, %int0_3295, %int1_3296 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.7.txt_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.7.txt_mlp.0.bias : tensor<12288xf16>
    %2671 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.txt_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    %int6_3297 = torch.constant.int 6
    %2672 = torch.prims.convert_element_type %2671, %int6_3297 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_3298 = torch.constant.int 6
    %2673 = torch.prims.convert_element_type %2668, %int6_3298 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_3299 = torch.constant.int 6
    %2674 = torch.prims.convert_element_type %2670, %int6_3299 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %2675 = torch.aten.mm %2673, %2674 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[512,12288],f32>
    %int1_3300 = torch.constant.int 1
    %2676 = torch.aten.mul.Scalar %2675, %int1_3300 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int1_3301 = torch.constant.int 1
    %2677 = torch.aten.mul.Scalar %2672, %int1_3301 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_3302 = torch.constant.int 1
    %2678 = torch.aten.add.Tensor %2676, %2677, %int1_3302 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int5_3303 = torch.constant.int 5
    %2679 = torch.prims.convert_element_type %2678, %int5_3303 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f16>
    // Activation step: view the f16 [512,12288] tensor %2679 as [1,512,12288],
    // apply GELU with the "tanh" approximation string, then view the result
    // back as [512,12288] so it can feed the 2-D matmul that follows.
    %int1_3304 = torch.constant.int 1
    %int512_3305 = torch.constant.int 512
    %int12288_3306 = torch.constant.int 12288
    %2680 = torch.prim.ListConstruct %int1_3304, %int512_3305, %int12288_3306 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2681 = torch.aten.view %2679, %2680 : !torch.vtensor<[512,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,512,12288],f16>
    %str_3307 = torch.constant.str "tanh"
    %2682 = torch.aten.gelu %2681, %str_3307 : !torch.vtensor<[1,512,12288],f16>, !torch.str -> !torch.vtensor<[1,512,12288],f16>
    %int512_3308 = torch.constant.int 512
    %int12288_3309 = torch.constant.int 12288
    %2683 = torch.prim.ListConstruct %int512_3308, %int12288_3309 : (!torch.int, !torch.int) -> !torch.list<int>
    %2684 = torch.aten.view %2682, %2683 : !torch.vtensor<[1,512,12288],f16>, !torch.list<int> -> !torch.vtensor<[512,12288],f16>
    // Linear layer with the double_blocks.7.txt_mlp.2 parameters:
    //   %2695 = f16( f32(%2684) @ f32(W^T) + f32(b) ),  W: [3072,12288], b: [3072].
    // The result is then multiplied by %2430 and added to %2656.
    // Load the f16 weight and transpose it to [12288,3072] for the matmul.
    %__auto.sampler.double_blocks.7.txt_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.7.txt_mlp.2.weight : tensor<3072x12288xf16>
    %2685 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.txt_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_3310 = torch.constant.int 0
    %int1_3311 = torch.constant.int 1
    %2686 = torch.aten.transpose.int %2685, %int0_3310, %int1_3311 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.7.txt_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.7.txt_mlp.2.bias : tensor<3072xf16>
    %2687 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.7.txt_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, activation and weight to f32 (dtype code 6 yields f32 results
    // here) so the matmul and bias add are carried out in f32.
    %int6_3312 = torch.constant.int 6
    %2688 = torch.prims.convert_element_type %2687, %int6_3312 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_3313 = torch.constant.int 6
    %2689 = torch.prims.convert_element_type %2684, %int6_3313 : !torch.vtensor<[512,12288],f16>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int6_3314 = torch.constant.int 6
    %2690 = torch.prims.convert_element_type %2686, %int6_3314 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %2691 = torch.aten.mm %2689, %2690 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Both operands are scaled by the integer constant 1 before the add, which
    // leaves the values unchanged.
    // NOTE(review): presumably leftover alpha/beta factors from an addmm
    // decomposition - confirm against the exporter.
    %int1_3315 = torch.constant.int 1
    %2692 = torch.aten.mul.Scalar %2691, %int1_3315 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_3316 = torch.constant.int 1
    %2693 = torch.aten.mul.Scalar %2688, %int1_3316 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Bias add: the [3072] bias broadcasts over the 512 rows.
    %int1_3317 = torch.constant.int 1
    %2694 = torch.aten.add.Tensor %2692, %2693, %int1_3317 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    // Cast the result back to f16 (dtype code 5 yields an f16 result here).
    %int5_3318 = torch.constant.int 5
    %2695 = torch.prims.convert_element_type %2694, %int5_3318 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    // Restore the leading dimension of size 1: [512,3072] -> [1,512,3072].
    %int1_3319 = torch.constant.int 1
    %int512_3320 = torch.constant.int 512
    %int3072_3321 = torch.constant.int 3072
    %2696 = torch.prim.ListConstruct %int1_3319, %int512_3320, %int3072_3321 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2697 = torch.aten.view %2695, %2696 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Elementwise multiply by the [1,1,3072] tensor %2430 (broadcast over the
    // 512 positions), then add the product to %2656 with alpha = 1.
    // NOTE(review): %2430 and %2656 are defined above this excerpt; presumably
    // the txt MLP gate and the txt residual stream of block 7 - confirm.
    %2698 = torch.aten.mul.Tensor %2430, %2697 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_3322 = torch.constant.int 1
    %2699 = torch.aten.add.Tensor %2656, %2698, %int1_3322 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // double_blocks.8.img_mod.lin: SiLU(%119) -> linear layer producing
    // [1,18432] -> six contiguous [1,1,3072] slices (%2715 .. %2720).
    // NOTE(review): %119 is defined above this excerpt; presumably the
    // conditioning vector shared by all blocks - confirm.
    %2700 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the f16 weight [18432,3072] and transpose it to [3072,18432].
    %__auto.sampler.double_blocks.8.img_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.8.img_mod.lin.weight : tensor<18432x3072xf16>
    %2701 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.img_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_3323 = torch.constant.int 0
    %int1_3324 = torch.constant.int 1
    %2702 = torch.aten.transpose.int %2701, %int0_3323, %int1_3324 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.8.img_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.8.img_mod.lin.bias : tensor<18432xf16>
    %2703 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.img_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Upcast bias, input and weight to f32 so the matmul and bias add run in f32.
    %int6_3325 = torch.constant.int 6
    %2704 = torch.prims.convert_element_type %2703, %int6_3325 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_3326 = torch.constant.int 6
    %2705 = torch.prims.convert_element_type %2700, %int6_3326 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_3327 = torch.constant.int 6
    %2706 = torch.prims.convert_element_type %2702, %int6_3327 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %2707 = torch.aten.mm %2705, %2706 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Scaling by the integer constant 1 leaves both operands unchanged.
    %int1_3328 = torch.constant.int 1
    %2708 = torch.aten.mul.Scalar %2707, %int1_3328 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_3329 = torch.constant.int 1
    %2709 = torch.aten.mul.Scalar %2704, %int1_3329 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    // Bias add, then cast the result back to f16.
    %int1_3330 = torch.constant.int 1
    %2710 = torch.aten.add.Tensor %2708, %2709, %int1_3330 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_3331 = torch.constant.int 5
    %2711 = torch.prims.convert_element_type %2710, %int5_3331 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Slice on dim 0 with start 0, end INT64_MAX, step 1: the result type is
    // unchanged ([1,18432]).
    %int0_3332 = torch.constant.int 0
    %int0_3333 = torch.constant.int 0
    %int9223372036854775807_3334 = torch.constant.int 9223372036854775807
    %int1_3335 = torch.constant.int 1
    %2712 = torch.aten.slice.Tensor %2711, %int0_3332, %int0_3333, %int9223372036854775807_3334, %int1_3335 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a size-1 dimension at index 1: [1,18432] -> [1,1,18432].
    %int1_3336 = torch.constant.int 1
    %2713 = torch.aten.unsqueeze %2712, %int1_3336 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Slice on dim 2 with start 0, end INT64_MAX, step 1: shape again unchanged.
    %int2_3337 = torch.constant.int 2
    %int0_3338 = torch.constant.int 0
    %int9223372036854775807_3339 = torch.constant.int 9223372036854775807
    %int1_3340 = torch.constant.int 1
    %2714 = torch.aten.slice.Tensor %2713, %int2_3337, %int0_3338, %int9223372036854775807_3339, %int1_3340 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Split the last dimension into six consecutive chunks of width 3072.
    // Chunk [0, 3072): later added to the normalised tensor (see %2751).
    %int-1_3341 = torch.constant.int -1
    %int0_3342 = torch.constant.int 0
    %int3072_3343 = torch.constant.int 3072
    %int1_3344 = torch.constant.int 1
    %2715 = torch.aten.slice.Tensor %2714, %int-1_3341, %int0_3342, %int3072_3343, %int1_3344 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk [3072, 6144): later incremented by 1 and used as a multiplier (see %2749).
    %int-1_3345 = torch.constant.int -1
    %int3072_3346 = torch.constant.int 3072
    %int6144_3347 = torch.constant.int 6144
    %int1_3348 = torch.constant.int 1
    %2716 = torch.aten.slice.Tensor %2714, %int-1_3345, %int3072_3346, %int6144_3347, %int1_3348 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunks [6144, 9216), [9216, 12288), [12288, 15360) and [15360, 18432)
    // are not consumed within this excerpt.
    // NOTE(review): presumably the attention gate and the MLP shift/scale/gate
    // of this block - confirm against the uses further down.
    %int-1_3349 = torch.constant.int -1
    %int6144_3350 = torch.constant.int 6144
    %int9216_3351 = torch.constant.int 9216
    %int1_3352 = torch.constant.int 1
    %2717 = torch.aten.slice.Tensor %2714, %int-1_3349, %int6144_3350, %int9216_3351, %int1_3352 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_3353 = torch.constant.int -1
    %int9216_3354 = torch.constant.int 9216
    %int12288_3355 = torch.constant.int 12288
    %int1_3356 = torch.constant.int 1
    %2718 = torch.aten.slice.Tensor %2714, %int-1_3353, %int9216_3354, %int12288_3355, %int1_3356 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_3357 = torch.constant.int -1
    %int12288_3358 = torch.constant.int 12288
    %int15360_3359 = torch.constant.int 15360
    %int1_3360 = torch.constant.int 1
    %2719 = torch.aten.slice.Tensor %2714, %int-1_3357, %int12288_3358, %int15360_3359, %int1_3360 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_3361 = torch.constant.int -1
    %int15360_3362 = torch.constant.int 15360
    %int18432_3363 = torch.constant.int 18432
    %int1_3364 = torch.constant.int 1
    %2720 = torch.aten.slice.Tensor %2714, %int-1_3361, %int15360_3362, %int18432_3363, %int1_3364 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // double_blocks.8.txt_mod.lin: SiLU(%119) -> linear layer producing
    // [1,18432] -> six contiguous [1,1,3072] slices (%2736 .. %2741).
    // Same op sequence as the img_mod branch, with the txt_mod parameters.
    // The SiLU is applied to the same input %119 as in the img_mod branch.
    %2721 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the f16 weight [18432,3072] and transpose it to [3072,18432].
    %__auto.sampler.double_blocks.8.txt_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.8.txt_mod.lin.weight : tensor<18432x3072xf16>
    %2722 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.txt_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_3365 = torch.constant.int 0
    %int1_3366 = torch.constant.int 1
    %2723 = torch.aten.transpose.int %2722, %int0_3365, %int1_3366 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.8.txt_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.8.txt_mod.lin.bias : tensor<18432xf16>
    %2724 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.txt_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Upcast bias, input and weight to f32 so the matmul and bias add run in f32.
    %int6_3367 = torch.constant.int 6
    %2725 = torch.prims.convert_element_type %2724, %int6_3367 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_3368 = torch.constant.int 6
    %2726 = torch.prims.convert_element_type %2721, %int6_3368 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_3369 = torch.constant.int 6
    %2727 = torch.prims.convert_element_type %2723, %int6_3369 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %2728 = torch.aten.mm %2726, %2727 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Scaling by the integer constant 1 leaves both operands unchanged.
    %int1_3370 = torch.constant.int 1
    %2729 = torch.aten.mul.Scalar %2728, %int1_3370 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_3371 = torch.constant.int 1
    %2730 = torch.aten.mul.Scalar %2725, %int1_3371 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    // Bias add, then cast the result back to f16.
    %int1_3372 = torch.constant.int 1
    %2731 = torch.aten.add.Tensor %2729, %2730, %int1_3372 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_3373 = torch.constant.int 5
    %2732 = torch.prims.convert_element_type %2731, %int5_3373 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Slice on dim 0 with start 0, end INT64_MAX, step 1: shape unchanged.
    %int0_3374 = torch.constant.int 0
    %int0_3375 = torch.constant.int 0
    %int9223372036854775807_3376 = torch.constant.int 9223372036854775807
    %int1_3377 = torch.constant.int 1
    %2733 = torch.aten.slice.Tensor %2732, %int0_3374, %int0_3375, %int9223372036854775807_3376, %int1_3377 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a size-1 dimension at index 1: [1,18432] -> [1,1,18432].
    %int1_3378 = torch.constant.int 1
    %2734 = torch.aten.unsqueeze %2733, %int1_3378 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Slice on dim 2 with start 0, end INT64_MAX, step 1: shape again unchanged.
    %int2_3379 = torch.constant.int 2
    %int0_3380 = torch.constant.int 0
    %int9223372036854775807_3381 = torch.constant.int 9223372036854775807
    %int1_3382 = torch.constant.int 1
    %2735 = torch.aten.slice.Tensor %2734, %int2_3379, %int0_3380, %int9223372036854775807_3381, %int1_3382 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Split the last dimension into six consecutive chunks of width 3072.
    // Chunk [0, 3072): later added to the normalised txt tensor (see %2805).
    %int-1_3383 = torch.constant.int -1
    %int0_3384 = torch.constant.int 0
    %int3072_3385 = torch.constant.int 3072
    %int1_3386 = torch.constant.int 1
    %2736 = torch.aten.slice.Tensor %2735, %int-1_3383, %int0_3384, %int3072_3385, %int1_3386 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk [3072, 6144): later incremented by 1 and used as a multiplier (see %2803).
    %int-1_3387 = torch.constant.int -1
    %int3072_3388 = torch.constant.int 3072
    %int6144_3389 = torch.constant.int 6144
    %int1_3390 = torch.constant.int 1
    %2737 = torch.aten.slice.Tensor %2735, %int-1_3387, %int3072_3388, %int6144_3389, %int1_3390 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunks [6144, 9216), [9216, 12288), [12288, 15360) and [15360, 18432)
    // are not consumed within this excerpt.
    // NOTE(review): presumably the attention gate and the MLP shift/scale/gate
    // of the txt stream - confirm against the uses further down.
    %int-1_3391 = torch.constant.int -1
    %int6144_3392 = torch.constant.int 6144
    %int9216_3393 = torch.constant.int 9216
    %int1_3394 = torch.constant.int 1
    %2738 = torch.aten.slice.Tensor %2735, %int-1_3391, %int6144_3392, %int9216_3393, %int1_3394 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_3395 = torch.constant.int -1
    %int9216_3396 = torch.constant.int 9216
    %int12288_3397 = torch.constant.int 12288
    %int1_3398 = torch.constant.int 1
    %2739 = torch.aten.slice.Tensor %2735, %int-1_3395, %int9216_3396, %int12288_3397, %int1_3398 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_3399 = torch.constant.int -1
    %int12288_3400 = torch.constant.int 12288
    %int15360_3401 = torch.constant.int 15360
    %int1_3402 = torch.constant.int 1
    %2740 = torch.aten.slice.Tensor %2735, %int-1_3399, %int12288_3400, %int15360_3401, %int1_3402 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_3403 = torch.constant.int -1
    %int15360_3404 = torch.constant.int 15360
    %int18432_3405 = torch.constant.int 18432
    %int1_3406 = torch.constant.int 1
    %2741 = torch.aten.slice.Tensor %2735, %int-1_3403, %int15360_3404, %int18432_3405, %int1_3406 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalise %2639 over its last dimension and apply the affine modulation:
    //   %2751 = (1 + %2716) * f16((%2639 - mean) * rsqrt(var + 1e-6)) + %2715
    // NOTE(review): %2639 is defined above this excerpt; presumably the img
    // residual stream entering block 8 - confirm.
    // Statistics are computed on an f32 copy of the input.
    %int6_3407 = torch.constant.int 6
    %2742 = torch.prims.convert_element_type %2639, %int6_3407 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // Variance and mean over dim 2, correction 0, keepdim = true; both results
    // have shape [1,4096,1].
    %int2_3408 = torch.constant.int 2
    %2743 = torch.prim.ListConstruct %int2_3408 : (!torch.int) -> !torch.list<int>
    %int0_3409 = torch.constant.int 0
    %true_3410 = torch.constant.bool true
    %result0_3411, %result1_3412 = torch.aten.var_mean.correction %2742, %2743, %int0_3409, %true_3410 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(var + eps) with eps = 9.9999999999999995E-7.
    %float9.999990e-07_3413 = torch.constant.float 9.9999999999999995E-7
    %int1_3414 = torch.constant.int 1
    %2744 = torch.aten.add.Scalar %result0_3411, %float9.999990e-07_3413, %int1_3414 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %2745 = torch.aten.rsqrt %2744 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // The subtraction takes the original f16 tensor and the f32 mean; the
    // result type is f32.
    %int1_3415 = torch.constant.int 1
    %2746 = torch.aten.sub.Tensor %2639, %result1_3412, %int1_3415 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %2747 = torch.aten.mul.Tensor %2746, %2745 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Cast the normalised tensor back to f16 before the modulation.
    %int5_3416 = torch.constant.int 5
    %2748 = torch.prims.convert_element_type %2747, %int5_3416 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // (1 + %2716) * normalised + %2715; the [1,1,3072] operands broadcast over
    // the 4096 positions.
    %int1_3417 = torch.constant.int 1
    %int1_3418 = torch.constant.int 1
    %2749 = torch.aten.add.Scalar %2716, %int1_3417, %int1_3418 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %2750 = torch.aten.mul.Tensor %2749, %2748 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_3419 = torch.constant.int 1
    %2751 = torch.aten.add.Tensor %2750, %2715, %int1_3419 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer with the double_blocks.8.img_attn.qkv parameters:
    //   %2766 = f16( f32(%2751) @ f32(W^T) + f32(b) ),  W: [9216,3072], b: [9216].
    // Flatten the modulated input to 2-D for the matmul: [1,4096,3072] -> [4096,3072].
    %int4096_3420 = torch.constant.int 4096
    %int3072_3421 = torch.constant.int 3072
    %2752 = torch.prim.ListConstruct %int4096_3420, %int3072_3421 : (!torch.int, !torch.int) -> !torch.list<int>
    %2753 = torch.aten.view %2751, %2752 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Load the f16 weight [9216,3072] and transpose it to [3072,9216].
    %__auto.sampler.double_blocks.8.img_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.8.img_attn.qkv.weight : tensor<9216x3072xf16>
    %2754 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.img_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_3422 = torch.constant.int 0
    %int1_3423 = torch.constant.int 1
    %2755 = torch.aten.transpose.int %2754, %int0_3422, %int1_3423 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.8.img_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.8.img_attn.qkv.bias : tensor<9216xf16>
    %2756 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.img_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, input and weight to f32 so the matmul and bias add run in f32.
    %int6_3424 = torch.constant.int 6
    %2757 = torch.prims.convert_element_type %2756, %int6_3424 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_3425 = torch.constant.int 6
    %2758 = torch.prims.convert_element_type %2753, %int6_3425 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_3426 = torch.constant.int 6
    %2759 = torch.prims.convert_element_type %2755, %int6_3426 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %2760 = torch.aten.mm %2758, %2759 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[4096,9216],f32>
    // Scaling by the integer constant 1 leaves both operands unchanged.
    %int1_3427 = torch.constant.int 1
    %2761 = torch.aten.mul.Scalar %2760, %int1_3427 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int1_3428 = torch.constant.int 1
    %2762 = torch.aten.mul.Scalar %2757, %int1_3428 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    // Bias add, then cast the result back to f16.
    %int1_3429 = torch.constant.int 1
    %2763 = torch.aten.add.Tensor %2761, %2762, %int1_3429 : !torch.vtensor<[4096,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int5_3430 = torch.constant.int 5
    %2764 = torch.prims.convert_element_type %2763, %int5_3430 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f16>
    // Restore the leading dimension of size 1: [4096,9216] -> [1,4096,9216].
    %int1_3431 = torch.constant.int 1
    %int4096_3432 = torch.constant.int 4096
    %int9216_3433 = torch.constant.int 9216
    %2765 = torch.prim.ListConstruct %int1_3431, %int4096_3432, %int9216_3433 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2766 = torch.aten.view %2764, %2765 : !torch.vtensor<[4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,9216],f16>
    // Split the fused projection %2766 into three [1,24,4096,128] tensors.
    // View the 9216-wide last dimension as (3, 24, 128).
    // NOTE(review): presumably (q/k/v, heads, head_dim) - confirm against the
    // attention op that consumes these tensors.
    %int1_3434 = torch.constant.int 1
    %int4096_3435 = torch.constant.int 4096
    %int3_3436 = torch.constant.int 3
    %int24_3437 = torch.constant.int 24
    %int128_3438 = torch.constant.int 128
    %2767 = torch.prim.ListConstruct %int1_3434, %int4096_3435, %int3_3436, %int24_3437, %int128_3438 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2768 = torch.aten.view %2766, %2767 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    // Permute with order (2, 0, 3, 1, 4): [1,4096,3,24,128] -> [3,1,24,4096,128].
    %int2_3439 = torch.constant.int 2
    %int0_3440 = torch.constant.int 0
    %int3_3441 = torch.constant.int 3
    %int1_3442 = torch.constant.int 1
    %int4_3443 = torch.constant.int 4
    %2769 = torch.prim.ListConstruct %int2_3439, %int0_3440, %int3_3441, %int1_3442, %int4_3443 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2770 = torch.aten.permute %2768, %2769 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // Select indices 0, 1 and 2 along dim 0. %2771 and %2772 are normalised
    // below with the query_norm and key_norm scales respectively; %2773 is not
    // consumed within this excerpt.
    %int0_3444 = torch.constant.int 0
    %int0_3445 = torch.constant.int 0
    %2771 = torch.aten.select.int %2770, %int0_3444, %int0_3445 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int0_3446 = torch.constant.int 0
    %int1_3447 = torch.constant.int 1
    %2772 = torch.aten.select.int %2770, %int0_3446, %int1_3447 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int0_3448 = torch.constant.int 0
    %int2_3449 = torch.constant.int 2
    %2773 = torch.aten.select.int %2770, %int0_3448, %int2_3449 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Root-mean-square normalisation of %2771 over its last dimension, then a
    // multiply by the img_attn.norm.query_norm.scale parameter:
    //   %2783 = f16( x * rsqrt(mean(x^2, dim=-1) + 1e-6) ) * scale,  x = f32(%2771)
    %int6_3450 = torch.constant.int 6
    %2774 = torch.prims.convert_element_type %2771, %int6_3450 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    // Square elementwise, then take the mean over dim -1 with keepdim = true
    // (result shape [1,24,4096,1]); the dtype argument is none.
    %int2_3451 = torch.constant.int 2
    %2775 = torch.aten.pow.Tensor_Scalar %2774, %int2_3451 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_3452 = torch.constant.int -1
    %2776 = torch.prim.ListConstruct %int-1_3452 : (!torch.int) -> !torch.list<int>
    %true_3453 = torch.constant.bool true
    %none_3454 = torch.constant.none
    %2777 = torch.aten.mean.dim %2775, %2776, %true_3453, %none_3454 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    // rsqrt(mean + eps) with eps = 9.9999999999999995E-7, broadcast-multiplied
    // onto the f32 input.
    %float9.999990e-07_3455 = torch.constant.float 9.9999999999999995E-7
    %int1_3456 = torch.constant.int 1
    %2778 = torch.aten.add.Scalar %2777, %float9.999990e-07_3455, %int1_3456 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %2779 = torch.aten.rsqrt %2778 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %2780 = torch.aten.mul.Tensor %2774, %2779 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Cast back to f16, then multiply by the [128] f16 scale (broadcast over
    // the leading dimensions).
    %int5_3457 = torch.constant.int 5
    %2781 = torch.prims.convert_element_type %2780, %int5_3457 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.8.img_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.8.img_attn.norm.query_norm.scale : tensor<128xf16>
    %2782 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.img_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %2783 = torch.aten.mul.Tensor %2781, %2782 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // Root-mean-square normalisation of %2772 over its last dimension, then a
    // multiply by the img_attn.norm.key_norm.scale parameter:
    //   %2793 = f16( x * rsqrt(mean(x^2, dim=-1) + 1e-6) ) * scale,  x = f32(%2772)
    %int6_3458 = torch.constant.int 6
    %2784 = torch.prims.convert_element_type %2772, %int6_3458 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    // Square elementwise, then take the mean over dim -1 with keepdim = true
    // (result shape [1,24,4096,1]); the dtype argument is none.
    %int2_3459 = torch.constant.int 2
    %2785 = torch.aten.pow.Tensor_Scalar %2784, %int2_3459 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_3460 = torch.constant.int -1
    %2786 = torch.prim.ListConstruct %int-1_3460 : (!torch.int) -> !torch.list<int>
    %true_3461 = torch.constant.bool true
    %none_3462 = torch.constant.none
    %2787 = torch.aten.mean.dim %2785, %2786, %true_3461, %none_3462 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    // rsqrt(mean + eps) with eps = 9.9999999999999995E-7, broadcast-multiplied
    // onto the f32 input.
    %float9.999990e-07_3463 = torch.constant.float 9.9999999999999995E-7
    %int1_3464 = torch.constant.int 1
    %2788 = torch.aten.add.Scalar %2787, %float9.999990e-07_3463, %int1_3464 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %2789 = torch.aten.rsqrt %2788 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %2790 = torch.aten.mul.Tensor %2784, %2789 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Cast back to f16, then multiply by the [128] f16 scale (broadcast over
    // the leading dimensions).
    %int5_3465 = torch.constant.int 5
    %2791 = torch.prims.convert_element_type %2790, %int5_3465 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.8.img_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.8.img_attn.norm.key_norm.scale : tensor<128xf16>
    %2792 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.img_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %2793 = torch.aten.mul.Tensor %2791, %2792 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // Element-type conversions of the two normalised tensors %2783 and %2793.
    // Operand and result types are both f16 here, so the element type does not
    // change.
    // NOTE(review): presumably the exported form of a `.to(dtype)` call in the
    // source model - confirm.
    %int5_3466 = torch.constant.int 5
    %2794 = torch.prims.convert_element_type %2783, %int5_3466 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_3467 = torch.constant.int 5
    %2795 = torch.prims.convert_element_type %2793, %int5_3467 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Normalise %2699 over its last dimension and apply the affine modulation:
    //   %2805 = (1 + %2737) * f16((%2699 - mean) * rsqrt(var + 1e-6)) + %2736
    // %2699 is the [1,512,3072] result of the residual add that closes the
    // double_blocks.7 txt_mlp.2 stage.
    // Statistics are computed on an f32 copy of the input.
    %int6_3468 = torch.constant.int 6
    %2796 = torch.prims.convert_element_type %2699, %int6_3468 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // Variance and mean over dim 2, correction 0, keepdim = true; both results
    // have shape [1,512,1].
    %int2_3469 = torch.constant.int 2
    %2797 = torch.prim.ListConstruct %int2_3469 : (!torch.int) -> !torch.list<int>
    %int0_3470 = torch.constant.int 0
    %true_3471 = torch.constant.bool true
    %result0_3472, %result1_3473 = torch.aten.var_mean.correction %2796, %2797, %int0_3470, %true_3471 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(var + eps) with eps = 9.9999999999999995E-7.
    %float9.999990e-07_3474 = torch.constant.float 9.9999999999999995E-7
    %int1_3475 = torch.constant.int 1
    %2798 = torch.aten.add.Scalar %result0_3472, %float9.999990e-07_3474, %int1_3475 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %2799 = torch.aten.rsqrt %2798 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // The subtraction takes the original f16 tensor and the f32 mean; the
    // result type is f32.
    %int1_3476 = torch.constant.int 1
    %2800 = torch.aten.sub.Tensor %2699, %result1_3473, %int1_3476 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %2801 = torch.aten.mul.Tensor %2800, %2799 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    // Cast the normalised tensor back to f16 before the modulation.
    %int5_3477 = torch.constant.int 5
    %2802 = torch.prims.convert_element_type %2801, %int5_3477 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // (1 + %2737) * normalised + %2736; the [1,1,3072] operands broadcast over
    // the 512 positions.
    %int1_3478 = torch.constant.int 1
    %int1_3479 = torch.constant.int 1
    %2803 = torch.aten.add.Scalar %2737, %int1_3478, %int1_3479 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %2804 = torch.aten.mul.Tensor %2803, %2802 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_3480 = torch.constant.int 1
    %2805 = torch.aten.add.Tensor %2804, %2736, %int1_3480 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Linear layer with the double_blocks.8.txt_attn.qkv parameters:
    //   %2820 = f16( f32(%2805) @ f32(W^T) + f32(b) ),  W: [9216,3072], b: [9216].
    // Flatten the modulated input to 2-D for the matmul: [1,512,3072] -> [512,3072].
    %int512_3481 = torch.constant.int 512
    %int3072_3482 = torch.constant.int 3072
    %2806 = torch.prim.ListConstruct %int512_3481, %int3072_3482 : (!torch.int, !torch.int) -> !torch.list<int>
    %2807 = torch.aten.view %2805, %2806 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Load the f16 weight [9216,3072] and transpose it to [3072,9216].
    %__auto.sampler.double_blocks.8.txt_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.8.txt_attn.qkv.weight : tensor<9216x3072xf16>
    %2808 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.txt_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_3483 = torch.constant.int 0
    %int1_3484 = torch.constant.int 1
    %2809 = torch.aten.transpose.int %2808, %int0_3483, %int1_3484 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.8.txt_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.8.txt_attn.qkv.bias : tensor<9216xf16>
    %2810 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.txt_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, input and weight to f32 so the matmul and bias add run in f32.
    %int6_3485 = torch.constant.int 6
    %2811 = torch.prims.convert_element_type %2810, %int6_3485 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_3486 = torch.constant.int 6
    %2812 = torch.prims.convert_element_type %2807, %int6_3486 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_3487 = torch.constant.int 6
    %2813 = torch.prims.convert_element_type %2809, %int6_3487 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %2814 = torch.aten.mm %2812, %2813 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[512,9216],f32>
    // Scaling by the integer constant 1 leaves both operands unchanged.
    %int1_3488 = torch.constant.int 1
    %2815 = torch.aten.mul.Scalar %2814, %int1_3488 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int1_3489 = torch.constant.int 1
    %2816 = torch.aten.mul.Scalar %2811, %int1_3489 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    // Bias add, then cast the result back to f16.
    %int1_3490 = torch.constant.int 1
    %2817 = torch.aten.add.Tensor %2815, %2816, %int1_3490 : !torch.vtensor<[512,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int5_3491 = torch.constant.int 5
    %2818 = torch.prims.convert_element_type %2817, %int5_3491 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f16>
    // Restore the leading dimension of size 1: [512,9216] -> [1,512,9216].
    %int1_3492 = torch.constant.int 1
    %int512_3493 = torch.constant.int 512
    %int9216_3494 = torch.constant.int 9216
    %2819 = torch.prim.ListConstruct %int1_3492, %int512_3493, %int9216_3494 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2820 = torch.aten.view %2818, %2819 : !torch.vtensor<[512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,9216],f16>
    // Split the fused projection %2820 into three [1,24,512,128] tensors.
    // View the 9216-wide last dimension as (3, 24, 128).
    // NOTE(review): presumably (q/k/v, heads, head_dim) - confirm against the
    // attention op that consumes these tensors.
    %int1_3495 = torch.constant.int 1
    %int512_3496 = torch.constant.int 512
    %int3_3497 = torch.constant.int 3
    %int24_3498 = torch.constant.int 24
    %int128_3499 = torch.constant.int 128
    %2821 = torch.prim.ListConstruct %int1_3495, %int512_3496, %int3_3497, %int24_3498, %int128_3499 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2822 = torch.aten.view %2820, %2821 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    // Permute with order (2, 0, 3, 1, 4): [1,512,3,24,128] -> [3,1,24,512,128].
    %int2_3500 = torch.constant.int 2
    %int0_3501 = torch.constant.int 0
    %int3_3502 = torch.constant.int 3
    %int1_3503 = torch.constant.int 1
    %int4_3504 = torch.constant.int 4
    %2823 = torch.prim.ListConstruct %int2_3500, %int0_3501, %int3_3502, %int1_3503, %int4_3504 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2824 = torch.aten.permute %2822, %2823 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // Select indices 0, 1 and 2 along dim 0. %2825 is normalised right after
    // this group; %2826 and %2827 are not consumed within this excerpt.
    %int0_3505 = torch.constant.int 0
    %int0_3506 = torch.constant.int 0
    %2825 = torch.aten.select.int %2824, %int0_3505, %int0_3506 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int0_3507 = torch.constant.int 0
    %int1_3508 = torch.constant.int 1
    %2826 = torch.aten.select.int %2824, %int0_3507, %int1_3508 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int0_3509 = torch.constant.int 0
    %int2_3510 = torch.constant.int 2
    %2827 = torch.aten.select.int %2824, %int0_3509, %int2_3510 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // RMS-style normalisation of %2825 over its last dimension, computed in
    // f32 (dtype code 6): x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps),
    // with eps = 9.9999999999999995E-7. The result is cast back to f16
    // (dtype code 5) and multiplied element-wise by the [128] parameter
    // double_blocks.8.txt_attn.norm.query_norm.scale.
    %int6_3511 = torch.constant.int 6
    %2828 = torch.prims.convert_element_type %2825, %int6_3511 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_3512 = torch.constant.int 2
    %2829 = torch.aten.pow.Tensor_Scalar %2828, %int2_3512 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_3513 = torch.constant.int -1
    %2830 = torch.prim.ListConstruct %int-1_3513 : (!torch.int) -> !torch.list<int>
    %true_3514 = torch.constant.bool true
    %none_3515 = torch.constant.none
    %2831 = torch.aten.mean.dim %2829, %2830, %true_3514, %none_3515 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_3516 = torch.constant.float 9.9999999999999995E-7
    %int1_3517 = torch.constant.int 1
    %2832 = torch.aten.add.Scalar %2831, %float9.999990e-07_3516, %int1_3517 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %2833 = torch.aten.rsqrt %2832 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    // The [1,24,512,1] factor broadcasts across the 128-wide last dimension.
    %2834 = torch.aten.mul.Tensor %2828, %2833 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_3518 = torch.constant.int 5
    %2835 = torch.prims.convert_element_type %2834, %int5_3518 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.8.txt_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.8.txt_attn.norm.query_norm.scale : tensor<128xf16>
    %2836 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.txt_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %2837 = torch.aten.mul.Tensor %2835, %2836 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // Same RMS-style normalisation as applied to %2825, here applied to %2826:
    // computed in f32 over the last dimension with eps = 9.9999999999999995E-7,
    // cast back to f16, then multiplied by the [128] parameter
    // double_blocks.8.txt_attn.norm.key_norm.scale.
    %int6_3519 = torch.constant.int 6
    %2838 = torch.prims.convert_element_type %2826, %int6_3519 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_3520 = torch.constant.int 2
    %2839 = torch.aten.pow.Tensor_Scalar %2838, %int2_3520 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_3521 = torch.constant.int -1
    %2840 = torch.prim.ListConstruct %int-1_3521 : (!torch.int) -> !torch.list<int>
    %true_3522 = torch.constant.bool true
    %none_3523 = torch.constant.none
    %2841 = torch.aten.mean.dim %2839, %2840, %true_3522, %none_3523 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_3524 = torch.constant.float 9.9999999999999995E-7
    %int1_3525 = torch.constant.int 1
    %2842 = torch.aten.add.Scalar %2841, %float9.999990e-07_3524, %int1_3525 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %2843 = torch.aten.rsqrt %2842 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %2844 = torch.aten.mul.Tensor %2838, %2843 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_3526 = torch.constant.int 5
    %2845 = torch.prims.convert_element_type %2844, %int5_3526 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.8.txt_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.8.txt_attn.norm.key_norm.scale : tensor<128xf16>
    %2846 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.txt_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %2847 = torch.aten.mul.Tensor %2845, %2846 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // Build the joint 4608-row sequences by concatenating along dim 2, with
    // the 512-row tensor first and the 4096-row tensor second.
    // The two casts below go f16 -> f16 (dtype code 5), so the element type
    // is unchanged by them.
    // NOTE(review): %2794, %2795 and %2773 are defined outside this chunk;
    // presumably they are the img-stream counterparts of %2848/%2849/%2827 -
    // confirm against the earlier part of the block.
    %int5_3527 = torch.constant.int 5
    %2848 = torch.prims.convert_element_type %2837, %int5_3527 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int5_3528 = torch.constant.int 5
    %2849 = torch.prims.convert_element_type %2847, %int5_3528 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %2850 = torch.prim.ListConstruct %2848, %2794 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_3529 = torch.constant.int 2
    %2851 = torch.aten.cat %2850, %int2_3529 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %2852 = torch.prim.ListConstruct %2849, %2795 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_3530 = torch.constant.int 2
    %2853 = torch.aten.cat %2852, %int2_3530 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // %2827 is concatenated without any normalisation step in between.
    %2854 = torch.prim.ListConstruct %2827, %2773 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_3531 = torch.constant.int 2
    %2855 = torch.aten.cat %2854, %int2_3531 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Apply the per-position 2x2 table %211 to the concatenated tensors %2851
    // and %2853, working in f32. Each 128-wide row is viewed as 64 pairs
    // ([...,64,1,2]); for every pair (x0, x1) the result is
    //   out[..., i] = %211[..., i, 0] * x0 + %211[..., i, 1] * x1,  i in {0,1}.
    // NOTE(review): %211 ([1,1,4608,64,2,2], f32) is defined outside this
    // chunk; presumably it is the rotary position-embedding table - confirm.
    %int6_3532 = torch.constant.int 6
    %2856 = torch.prims.convert_element_type %2851, %int6_3532 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_3533 = torch.constant.int 1
    %int24_3534 = torch.constant.int 24
    %int4608_3535 = torch.constant.int 4608
    %int-1_3536 = torch.constant.int -1
    %int1_3537 = torch.constant.int 1
    %int2_3538 = torch.constant.int 2
    %2857 = torch.prim.ListConstruct %int1_3533, %int24_3534, %int4608_3535, %int-1_3536, %int1_3537, %int2_3538 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // The -1 entry in the view shape resolves to 64 (128 / (1 * 2)).
    %2858 = torch.aten.view %2856, %2857 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    %int6_3539 = torch.constant.int 6
    %2859 = torch.prims.convert_element_type %2853, %int6_3539 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_3540 = torch.constant.int 1
    %int24_3541 = torch.constant.int 24
    %int4608_3542 = torch.constant.int 4608
    %int-1_3543 = torch.constant.int -1
    %int1_3544 = torch.constant.int 1
    %int2_3545 = torch.constant.int 2
    %2860 = torch.prim.ListConstruct %int1_3540, %int24_3541, %int4608_3542, %int-1_3543, %int1_3544, %int2_3545 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2861 = torch.aten.view %2859, %2860 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // First tensor (%2858): column 0 of the table times element 0 of each pair.
    %int5_3546 = torch.constant.int 5
    %int0_3547 = torch.constant.int 0
    %2862 = torch.aten.select.int %211, %int5_3546, %int0_3547 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_3548 = torch.constant.int 5
    %int0_3549 = torch.constant.int 0
    %2863 = torch.aten.select.int %2858, %int5_3548, %int0_3549 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %2864 = torch.aten.mul.Tensor %2862, %2863 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Column 1 of the table times element 1 of each pair, then sum both terms.
    %int5_3550 = torch.constant.int 5
    %int1_3551 = torch.constant.int 1
    %2865 = torch.aten.select.int %211, %int5_3550, %int1_3551 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_3552 = torch.constant.int 5
    %int1_3553 = torch.constant.int 1
    %2866 = torch.aten.select.int %2858, %int5_3552, %int1_3553 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %2867 = torch.aten.mul.Tensor %2865, %2866 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_3554 = torch.constant.int 1
    %2868 = torch.aten.add.Tensor %2864, %2867, %int1_3554 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second tensor (%2861): identical sequence of selects, multiplies and add.
    %int5_3555 = torch.constant.int 5
    %int0_3556 = torch.constant.int 0
    %2869 = torch.aten.select.int %211, %int5_3555, %int0_3556 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_3557 = torch.constant.int 5
    %int0_3558 = torch.constant.int 0
    %2870 = torch.aten.select.int %2861, %int5_3557, %int0_3558 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %2871 = torch.aten.mul.Tensor %2869, %2870 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_3559 = torch.constant.int 5
    %int1_3560 = torch.constant.int 1
    %2872 = torch.aten.select.int %211, %int5_3559, %int1_3560 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_3561 = torch.constant.int 5
    %int1_3562 = torch.constant.int 1
    %2873 = torch.aten.select.int %2861, %int5_3561, %int1_3562 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %2874 = torch.aten.mul.Tensor %2872, %2873 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_3563 = torch.constant.int 1
    %2875 = torch.aten.add.Tensor %2871, %2874, %int1_3563 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Flatten the (64, 2) pair axes back to 128 and cast both results to f16.
    %int1_3564 = torch.constant.int 1
    %int24_3565 = torch.constant.int 24
    %int4608_3566 = torch.constant.int 4608
    %int128_3567 = torch.constant.int 128
    %2876 = torch.prim.ListConstruct %int1_3564, %int24_3565, %int4608_3566, %int128_3567 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2877 = torch.aten.view %2868, %2876 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_3568 = torch.constant.int 5
    %2878 = torch.prims.convert_element_type %2877, %int5_3568 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_3569 = torch.constant.int 1
    %int24_3570 = torch.constant.int 24
    %int4608_3571 = torch.constant.int 4608
    %int128_3572 = torch.constant.int 128
    %2879 = torch.prim.ListConstruct %int1_3569, %int24_3570, %int4608_3571, %int128_3572 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2880 = torch.aten.view %2875, %2879 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_3573 = torch.constant.int 5
    %2881 = torch.prims.convert_element_type %2880, %int5_3573 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Attention over the joint 4608-row sequence with 24 heads of width 128.
    // Tensor operands are %2878, %2881 and %2855; the scalar operands are
    // 0.0, false, none, none.
    // NOTE(review): per the PyTorch op schema these should map to
    // (query, key, value, dropout_p, is_causal, attn_mask, scale) - confirm.
    // The op returns two results; only result #0 is consumed in this chunk.
    %float0.000000e00_3574 = torch.constant.float 0.000000e+00
    %false_3575 = torch.constant.bool false
    %none_3576 = torch.constant.none
    %none_3577 = torch.constant.none
    %2882:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%2878, %2881, %2855, %float0.000000e00_3574, %false_3575, %none_3576, %none_3577) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2 ([1,24,4608,128] -> [1,4608,24,128]) and merge the
    // 24 x 128 trailing axes into a single 3072-wide axis.
    %int0_3578 = torch.constant.int 0
    %int2_3579 = torch.constant.int 2
    %int1_3580 = torch.constant.int 1
    %int3_3581 = torch.constant.int 3
    %2883 = torch.prim.ListConstruct %int0_3578, %int2_3579, %int1_3580, %int3_3581 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2884 = torch.aten.permute %2882#0, %2883 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_3582 = torch.constant.int 1
    %int4608_3583 = torch.constant.int 4608
    %int3072_3584 = torch.constant.int 3072
    %2885 = torch.prim.ListConstruct %int1_3582, %int4608_3583, %int3072_3584 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2886 = torch.aten.view %2884, %2885 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Split the attention output %2886 along dim 1 into rows [0, 512) (%2888)
    // and rows [512, end) (%2890, 4096 rows). %2888 is later fed to
    // txt_attn.proj and %2890 to img_attn.proj.
    // The dim-0 slices use start 0, end 9223372036854775807 (INT64 max) and
    // step 1, so they keep the full extent of that dimension.
    %int0_3585 = torch.constant.int 0
    %int0_3586 = torch.constant.int 0
    %int9223372036854775807_3587 = torch.constant.int 9223372036854775807
    %int1_3588 = torch.constant.int 1
    %2887 = torch.aten.slice.Tensor %2886, %int0_3585, %int0_3586, %int9223372036854775807_3587, %int1_3588 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_3589 = torch.constant.int 1
    %int0_3590 = torch.constant.int 0
    %int512_3591 = torch.constant.int 512
    %int1_3592 = torch.constant.int 1
    %2888 = torch.aten.slice.Tensor %2887, %int1_3589, %int0_3590, %int512_3591, %int1_3592 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %int0_3593 = torch.constant.int 0
    %int0_3594 = torch.constant.int 0
    %int9223372036854775807_3595 = torch.constant.int 9223372036854775807
    %int1_3596 = torch.constant.int 1
    %2889 = torch.aten.slice.Tensor %2886, %int0_3593, %int0_3594, %int9223372036854775807_3595, %int1_3596 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_3597 = torch.constant.int 1
    %int512_3598 = torch.constant.int 512
    %int9223372036854775807_3599 = torch.constant.int 9223372036854775807
    %int1_3600 = torch.constant.int 1
    %2890 = torch.aten.slice.Tensor %2889, %int1_3597, %int512_3598, %int9223372036854775807_3599, %int1_3600 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear projection of the 4096-row attention slice %2890 using
    // double_blocks.8.img_attn.proj (weight transposed, bias added). The
    // matmul and bias add run in f32 (dtype code 6) and the result is cast
    // back to f16 (dtype code 5).
    %int4096_3601 = torch.constant.int 4096
    %int3072_3602 = torch.constant.int 3072
    %2891 = torch.prim.ListConstruct %int4096_3601, %int3072_3602 : (!torch.int, !torch.int) -> !torch.list<int>
    %2892 = torch.aten.view %2890, %2891 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.8.img_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.8.img_attn.proj.weight : tensor<3072x3072xf16>
    %2893 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.img_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_3603 = torch.constant.int 0
    %int1_3604 = torch.constant.int 1
    %2894 = torch.aten.transpose.int %2893, %int0_3603, %int1_3604 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.8.img_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.8.img_attn.proj.bias : tensor<3072xf16>
    %2895 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.img_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_3605 = torch.constant.int 6
    %2896 = torch.prims.convert_element_type %2895, %int6_3605 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_3606 = torch.constant.int 6
    %2897 = torch.prims.convert_element_type %2892, %int6_3606 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_3607 = torch.constant.int 6
    %2898 = torch.prims.convert_element_type %2894, %int6_3607 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %2899 = torch.aten.mm %2897, %2898 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Both mul.Scalar ops below multiply by the integer constant 1.
    %int1_3608 = torch.constant.int 1
    %2900 = torch.aten.mul.Scalar %2899, %int1_3608 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_3609 = torch.constant.int 1
    %2901 = torch.aten.mul.Scalar %2896, %int1_3609 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_3610 = torch.constant.int 1
    %2902 = torch.aten.add.Tensor %2900, %2901, %int1_3610 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_3611 = torch.constant.int 5
    %2903 = torch.prims.convert_element_type %2902, %int5_3611 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_3612 = torch.constant.int 1
    %int4096_3613 = torch.constant.int 4096
    %int3072_3614 = torch.constant.int 3072
    %2904 = torch.prim.ListConstruct %int1_3612, %int4096_3613, %int3072_3614 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2905 = torch.aten.view %2903, %2904 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Scale the projection by the broadcast [1,1,3072] tensor %2717 and add
    // it to %2639.
    // NOTE(review): %2717 and %2639 are defined outside this chunk; presumably
    // the img modulation gate and the img stream entering this block - confirm.
    %2906 = torch.aten.mul.Tensor %2717, %2905 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_3615 = torch.constant.int 1
    %2907 = torch.aten.add.Tensor %2639, %2906, %int1_3615 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Normalise %2907 over dim 2 with no learned affine parameters, then apply
    // (1 + %2719) * normalised + %2718.
    // Statistics come from var_mean with correction 0 and keepdim=true on an
    // f32 copy of %2907; eps is 9.9999999999999995E-7.
    // NOTE(review): %2719 and %2718 are defined outside this chunk; presumably
    // the img modulation scale and shift - confirm.
    %int1_3616 = torch.constant.int 1
    %int1_3617 = torch.constant.int 1
    %2908 = torch.aten.add.Scalar %2719, %int1_3616, %int1_3617 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int6_3618 = torch.constant.int 6
    %2909 = torch.prims.convert_element_type %2907, %int6_3618 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %int2_3619 = torch.constant.int 2
    %2910 = torch.prim.ListConstruct %int2_3619 : (!torch.int) -> !torch.list<int>
    %int0_3620 = torch.constant.int 0
    %true_3621 = torch.constant.bool true
    // %result0_3622 is the variance and %result1_3623 the mean.
    %result0_3622, %result1_3623 = torch.aten.var_mean.correction %2909, %2910, %int0_3620, %true_3621 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    %float9.999990e-07_3624 = torch.constant.float 9.9999999999999995E-7
    %int1_3625 = torch.constant.int 1
    %2911 = torch.aten.add.Scalar %result0_3622, %float9.999990e-07_3624, %int1_3625 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %2912 = torch.aten.rsqrt %2911 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    %int1_3626 = torch.constant.int 1
    // The subtraction takes the f16 tensor %2907 (not the f32 copy %2909) and
    // the f32 mean; the declared result type is f32.
    %2913 = torch.aten.sub.Tensor %2907, %result1_3623, %int1_3626 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %2914 = torch.aten.mul.Tensor %2913, %2912 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    %int5_3627 = torch.constant.int 5
    %2915 = torch.prims.convert_element_type %2914, %int5_3627 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %2916 = torch.aten.mul.Tensor %2908, %2915 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_3628 = torch.constant.int 1
    %2917 = torch.aten.add.Tensor %2916, %2718, %int1_3628 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Two-layer MLP on %2917 followed by a scaled residual add:
    //   img_mlp.0 linear (3072 -> 12288), tanh-approximated GELU,
    //   img_mlp.2 linear (12288 -> 3072), multiply by %2720, add to %2907.
    // Each linear casts its operands to f32 (dtype code 6), does mm plus bias,
    // and casts the result back to f16 (dtype code 5).
    %int4096_3629 = torch.constant.int 4096
    %int3072_3630 = torch.constant.int 3072
    %2918 = torch.prim.ListConstruct %int4096_3629, %int3072_3630 : (!torch.int, !torch.int) -> !torch.list<int>
    %2919 = torch.aten.view %2917, %2918 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.8.img_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.8.img_mlp.0.weight : tensor<12288x3072xf16>
    %2920 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.img_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_3631 = torch.constant.int 0
    %int1_3632 = torch.constant.int 1
    %2921 = torch.aten.transpose.int %2920, %int0_3631, %int1_3632 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.8.img_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.8.img_mlp.0.bias : tensor<12288xf16>
    %2922 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.img_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    %int6_3633 = torch.constant.int 6
    %2923 = torch.prims.convert_element_type %2922, %int6_3633 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_3634 = torch.constant.int 6
    %2924 = torch.prims.convert_element_type %2919, %int6_3634 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_3635 = torch.constant.int 6
    %2925 = torch.prims.convert_element_type %2921, %int6_3635 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %2926 = torch.aten.mm %2924, %2925 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[4096,12288],f32>
    %int1_3636 = torch.constant.int 1
    %2927 = torch.aten.mul.Scalar %2926, %int1_3636 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int1_3637 = torch.constant.int 1
    %2928 = torch.aten.mul.Scalar %2923, %int1_3637 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_3638 = torch.constant.int 1
    %2929 = torch.aten.add.Tensor %2927, %2928, %int1_3638 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int5_3639 = torch.constant.int 5
    %2930 = torch.prims.convert_element_type %2929, %int5_3639 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f16>
    %int1_3640 = torch.constant.int 1
    %int4096_3641 = torch.constant.int 4096
    %int12288_3642 = torch.constant.int 12288
    %2931 = torch.prim.ListConstruct %int1_3640, %int4096_3641, %int12288_3642 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2932 = torch.aten.view %2930, %2931 : !torch.vtensor<[4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,12288],f16>
    // GELU with approximate = "tanh", evaluated on the f16 tensor.
    %str_3643 = torch.constant.str "tanh"
    %2933 = torch.aten.gelu %2932, %str_3643 : !torch.vtensor<[1,4096,12288],f16>, !torch.str -> !torch.vtensor<[1,4096,12288],f16>
    %int4096_3644 = torch.constant.int 4096
    %int12288_3645 = torch.constant.int 12288
    %2934 = torch.prim.ListConstruct %int4096_3644, %int12288_3645 : (!torch.int, !torch.int) -> !torch.list<int>
    %2935 = torch.aten.view %2933, %2934 : !torch.vtensor<[1,4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[4096,12288],f16>
    %__auto.sampler.double_blocks.8.img_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.8.img_mlp.2.weight : tensor<3072x12288xf16>
    %2936 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.img_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_3646 = torch.constant.int 0
    %int1_3647 = torch.constant.int 1
    %2937 = torch.aten.transpose.int %2936, %int0_3646, %int1_3647 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.8.img_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.8.img_mlp.2.bias : tensor<3072xf16>
    %2938 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.img_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_3648 = torch.constant.int 6
    %2939 = torch.prims.convert_element_type %2938, %int6_3648 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_3649 = torch.constant.int 6
    %2940 = torch.prims.convert_element_type %2935, %int6_3649 : !torch.vtensor<[4096,12288],f16>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int6_3650 = torch.constant.int 6
    %2941 = torch.prims.convert_element_type %2937, %int6_3650 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %2942 = torch.aten.mm %2940, %2941 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    %int1_3651 = torch.constant.int 1
    %2943 = torch.aten.mul.Scalar %2942, %int1_3651 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_3652 = torch.constant.int 1
    %2944 = torch.aten.mul.Scalar %2939, %int1_3652 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_3653 = torch.constant.int 1
    %2945 = torch.aten.add.Tensor %2943, %2944, %int1_3653 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_3654 = torch.constant.int 5
    %2946 = torch.prims.convert_element_type %2945, %int5_3654 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_3655 = torch.constant.int 1
    %int4096_3656 = torch.constant.int 4096
    %int3072_3657 = torch.constant.int 3072
    %2947 = torch.prim.ListConstruct %int1_3655, %int4096_3656, %int3072_3657 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2948 = torch.aten.view %2946, %2947 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Scale the MLP output by the broadcast [1,1,3072] tensor %2720 and add
    // it to %2907, producing %2950.
    // NOTE(review): %2720 is defined outside this chunk; presumably the img
    // MLP modulation gate - confirm.
    %2949 = torch.aten.mul.Tensor %2720, %2948 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_3658 = torch.constant.int 1
    %2950 = torch.aten.add.Tensor %2907, %2949, %int1_3658 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear projection of the 512-row attention slice %2888 using
    // double_blocks.8.txt_attn.proj (weight transposed, bias added). The
    // matmul and bias add run in f32 (dtype code 6) and the result is cast
    // back to f16 (dtype code 5).
    %int512_3659 = torch.constant.int 512
    %int3072_3660 = torch.constant.int 3072
    %2951 = torch.prim.ListConstruct %int512_3659, %int3072_3660 : (!torch.int, !torch.int) -> !torch.list<int>
    %2952 = torch.aten.view %2888, %2951 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.8.txt_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.8.txt_attn.proj.weight : tensor<3072x3072xf16>
    %2953 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.txt_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_3661 = torch.constant.int 0
    %int1_3662 = torch.constant.int 1
    %2954 = torch.aten.transpose.int %2953, %int0_3661, %int1_3662 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.8.txt_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.8.txt_attn.proj.bias : tensor<3072xf16>
    %2955 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.txt_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_3663 = torch.constant.int 6
    %2956 = torch.prims.convert_element_type %2955, %int6_3663 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_3664 = torch.constant.int 6
    %2957 = torch.prims.convert_element_type %2952, %int6_3664 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_3665 = torch.constant.int 6
    %2958 = torch.prims.convert_element_type %2954, %int6_3665 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %2959 = torch.aten.mm %2957, %2958 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Both mul.Scalar ops below multiply by the integer constant 1.
    %int1_3666 = torch.constant.int 1
    %2960 = torch.aten.mul.Scalar %2959, %int1_3666 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_3667 = torch.constant.int 1
    %2961 = torch.aten.mul.Scalar %2956, %int1_3667 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_3668 = torch.constant.int 1
    %2962 = torch.aten.add.Tensor %2960, %2961, %int1_3668 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_3669 = torch.constant.int 5
    %2963 = torch.prims.convert_element_type %2962, %int5_3669 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_3670 = torch.constant.int 1
    %int512_3671 = torch.constant.int 512
    %int3072_3672 = torch.constant.int 3072
    %2964 = torch.prim.ListConstruct %int1_3670, %int512_3671, %int3072_3672 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2965 = torch.aten.view %2963, %2964 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Scale the projection by the broadcast [1,1,3072] tensor %2738 and add
    // it to %2699.
    // NOTE(review): %2738 and %2699 are defined outside this chunk; presumably
    // the txt modulation gate and the txt stream entering this block - confirm.
    %2966 = torch.aten.mul.Tensor %2738, %2965 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_3673 = torch.constant.int 1
    %2967 = torch.aten.add.Tensor %2699, %2966, %int1_3673 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Normalise %2967 over dim 2 with no learned affine parameters, then apply
    // (1 + %2740) * normalised + %2739.
    // Statistics come from var_mean with correction 0 and keepdim=true on an
    // f32 copy of %2967; eps is 9.9999999999999995E-7.
    // NOTE(review): %2740 and %2739 are defined outside this chunk; presumably
    // the txt modulation scale and shift - confirm.
    %int1_3674 = torch.constant.int 1
    %int1_3675 = torch.constant.int 1
    %2968 = torch.aten.add.Scalar %2740, %int1_3674, %int1_3675 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int6_3676 = torch.constant.int 6
    %2969 = torch.prims.convert_element_type %2967, %int6_3676 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %int2_3677 = torch.constant.int 2
    %2970 = torch.prim.ListConstruct %int2_3677 : (!torch.int) -> !torch.list<int>
    %int0_3678 = torch.constant.int 0
    %true_3679 = torch.constant.bool true
    // %result0_3680 is the variance and %result1_3681 the mean.
    %result0_3680, %result1_3681 = torch.aten.var_mean.correction %2969, %2970, %int0_3678, %true_3679 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    %float9.999990e-07_3682 = torch.constant.float 9.9999999999999995E-7
    %int1_3683 = torch.constant.int 1
    %2971 = torch.aten.add.Scalar %result0_3680, %float9.999990e-07_3682, %int1_3683 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %2972 = torch.aten.rsqrt %2971 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    %int1_3684 = torch.constant.int 1
    // The subtraction takes the f16 tensor %2967 (not the f32 copy %2969) and
    // the f32 mean; the declared result type is f32.
    %2973 = torch.aten.sub.Tensor %2967, %result1_3681, %int1_3684 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %2974 = torch.aten.mul.Tensor %2973, %2972 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_3685 = torch.constant.int 5
    %2975 = torch.prims.convert_element_type %2974, %int5_3685 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %2976 = torch.aten.mul.Tensor %2968, %2975 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_3686 = torch.constant.int 1
    %2977 = torch.aten.add.Tensor %2976, %2739, %int1_3686 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %int512_3687 = torch.constant.int 512
    %int3072_3688 = torch.constant.int 3072
    %2978 = torch.prim.ListConstruct %int512_3687, %int3072_3688 : (!torch.int, !torch.int) -> !torch.list<int>
    %2979 = torch.aten.view %2977, %2978 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Linear layer double_blocks.8.txt_mlp.0 (3072 -> 12288) applied to %2979, followed by tanh-GELU.
    // Load the [12288,3072] weight and transpose it to [3072,12288] so it can be the right matmul operand.
    %__auto.sampler.double_blocks.8.txt_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.8.txt_mlp.0.weight : tensor<12288x3072xf16>
    %2980 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.txt_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_3689 = torch.constant.int 0
    %int1_3690 = torch.constant.int 1
    %2981 = torch.aten.transpose.int %2980, %int0_3689, %int1_3690 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.8.txt_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.8.txt_mlp.0.bias : tensor<12288xf16>
    %2982 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.txt_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Bias, activations and weight are all upcast to f32 (dtype code 6) so the matmul and bias add run in f32.
    %int6_3691 = torch.constant.int 6
    %2983 = torch.prims.convert_element_type %2982, %int6_3691 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_3692 = torch.constant.int 6
    %2984 = torch.prims.convert_element_type %2979, %int6_3692 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_3693 = torch.constant.int 6
    %2985 = torch.prims.convert_element_type %2981, %int6_3693 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %2986 = torch.aten.mm %2984, %2985 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[512,12288],f32>
    // Both the product and the bias are multiplied by the integer constant 1, which leaves the values unchanged.
    // NOTE(review): looks like leftover alpha/beta factors from an addmm-style decomposition - unconfirmed.
    %int1_3694 = torch.constant.int 1
    %2987 = torch.aten.mul.Scalar %2986, %int1_3694 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int1_3695 = torch.constant.int 1
    %2988 = torch.aten.mul.Scalar %2983, %int1_3695 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    // Add the bias (broadcast across the 512 rows), then cast the result back to f16 (dtype code 5).
    %int1_3696 = torch.constant.int 1
    %2989 = torch.aten.add.Tensor %2987, %2988, %int1_3696 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int5_3697 = torch.constant.int 5
    %2990 = torch.prims.convert_element_type %2989, %int5_3697 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f16>
    // Restore the leading unit dim: [512,12288] -> [1,512,12288].
    %int1_3698 = torch.constant.int 1
    %int512_3699 = torch.constant.int 512
    %int12288_3700 = torch.constant.int 12288
    %2991 = torch.prim.ListConstruct %int1_3698, %int512_3699, %int12288_3700 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %2992 = torch.aten.view %2990, %2991 : !torch.vtensor<[512,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,512,12288],f16>
    // GELU activation using the "tanh" approximation mode, evaluated on the f16 tensor.
    %str_3701 = torch.constant.str "tanh"
    %2993 = torch.aten.gelu %2992, %str_3701 : !torch.vtensor<[1,512,12288],f16>, !torch.str -> !torch.vtensor<[1,512,12288],f16>
    // Flatten again to [512,12288] as input to the second MLP matmul.
    %int512_3702 = torch.constant.int 512
    %int12288_3703 = torch.constant.int 12288
    %2994 = torch.prim.ListConstruct %int512_3702, %int12288_3703 : (!torch.int, !torch.int) -> !torch.list<int>
    %2995 = torch.aten.view %2993, %2994 : !torch.vtensor<[1,512,12288],f16>, !torch.list<int> -> !torch.vtensor<[512,12288],f16>
    // Linear layer double_blocks.8.txt_mlp.2 (12288 -> 3072) applied to %2995, then a scaled residual add.
    // Load the [3072,12288] weight and transpose it to [12288,3072] for use as the right matmul operand.
    %__auto.sampler.double_blocks.8.txt_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.8.txt_mlp.2.weight : tensor<3072x12288xf16>
    %2996 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.txt_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_3704 = torch.constant.int 0
    %int1_3705 = torch.constant.int 1
    %2997 = torch.aten.transpose.int %2996, %int0_3704, %int1_3705 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.8.txt_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.8.txt_mlp.2.bias : tensor<3072xf16>
    %2998 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.8.txt_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, activations and weight to f32 (dtype code 6) before the matmul.
    %int6_3706 = torch.constant.int 6
    %2999 = torch.prims.convert_element_type %2998, %int6_3706 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_3707 = torch.constant.int 6
    %3000 = torch.prims.convert_element_type %2995, %int6_3707 : !torch.vtensor<[512,12288],f16>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int6_3708 = torch.constant.int 6
    %3001 = torch.prims.convert_element_type %2997, %int6_3708 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %3002 = torch.aten.mm %3000, %3001 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Multiplications by the integer constant 1 leave the product and the bias unchanged.
    %int1_3709 = torch.constant.int 1
    %3003 = torch.aten.mul.Scalar %3002, %int1_3709 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_3710 = torch.constant.int 1
    %3004 = torch.aten.mul.Scalar %2999, %int1_3710 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Add the bias, cast back to f16 (dtype code 5) and restore the [1,512,3072] shape.
    %int1_3711 = torch.constant.int 1
    %3005 = torch.aten.add.Tensor %3003, %3004, %int1_3711 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_3712 = torch.constant.int 5
    %3006 = torch.prims.convert_element_type %3005, %int5_3712 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_3713 = torch.constant.int 1
    %int512_3714 = torch.constant.int 512
    %int3072_3715 = torch.constant.int 3072
    %3007 = torch.prim.ListConstruct %int1_3713, %int512_3714, %int3072_3715 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3008 = torch.aten.view %3006, %3007 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Scale the MLP output by %2741 ([1,1,3072], defined above this chunk) and add it to %2967,
    // the same tensor that was normalized at the start of this MLP branch.
    // NOTE(review): %2741 is presumably the txt-stream gate from block 8's modulation - confirm upstream.
    %3009 = torch.aten.mul.Tensor %2741, %3008 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_3716 = torch.constant.int 1
    %3010 = torch.aten.add.Tensor %2967, %3009, %int1_3716 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // double_blocks.9.img_mod.lin: SiLU of %119 ([1,3072], defined above this chunk), a 3072 -> 18432
    // linear projection, then a split of the result into six [1,1,3072] chunks.
    %3011 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the [18432,3072] weight and transpose it to [3072,18432] for use as the right matmul operand.
    %__auto.sampler.double_blocks.9.img_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.9.img_mod.lin.weight : tensor<18432x3072xf16>
    %3012 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.img_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_3717 = torch.constant.int 0
    %int1_3718 = torch.constant.int 1
    %3013 = torch.aten.transpose.int %3012, %int0_3717, %int1_3718 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.9.img_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.9.img_mod.lin.bias : tensor<18432xf16>
    %3014 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.img_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Upcast bias, activation and weight to f32 (dtype code 6) before the matmul.
    %int6_3719 = torch.constant.int 6
    %3015 = torch.prims.convert_element_type %3014, %int6_3719 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_3720 = torch.constant.int 6
    %3016 = torch.prims.convert_element_type %3011, %int6_3720 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_3721 = torch.constant.int 6
    %3017 = torch.prims.convert_element_type %3013, %int6_3721 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %3018 = torch.aten.mm %3016, %3017 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Multiplications by the integer constant 1 leave the product and the bias unchanged.
    %int1_3722 = torch.constant.int 1
    %3019 = torch.aten.mul.Scalar %3018, %int1_3722 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_3723 = torch.constant.int 1
    %3020 = torch.aten.mul.Scalar %3015, %int1_3723 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    // Add the bias and cast the result back to f16 (dtype code 5).
    %int1_3724 = torch.constant.int 1
    %3021 = torch.aten.add.Tensor %3019, %3020, %int1_3724 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_3725 = torch.constant.int 5
    %3022 = torch.prims.convert_element_type %3021, %int5_3725 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): the result type equals the input type.
    %int0_3726 = torch.constant.int 0
    %int0_3727 = torch.constant.int 0
    %int9223372036854775807_3728 = torch.constant.int 9223372036854775807
    %int1_3729 = torch.constant.int 1
    %3023 = torch.aten.slice.Tensor %3022, %int0_3726, %int0_3727, %int9223372036854775807_3728, %int1_3729 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a unit dim at position 1: [1,18432] -> [1,1,18432].
    %int1_3730 = torch.constant.int 1
    %3024 = torch.aten.unsqueeze %3023, %int1_3730 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Full-range slice on dim 2; again the shape is unchanged.
    %int2_3731 = torch.constant.int 2
    %int0_3732 = torch.constant.int 0
    %int9223372036854775807_3733 = torch.constant.int 9223372036854775807
    %int1_3734 = torch.constant.int 1
    %3025 = torch.aten.slice.Tensor %3024, %int2_3731, %int0_3732, %int9223372036854775807_3733, %int1_3734 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Six consecutive 3072-wide slices along the last dim (-1) of %3025.
    // Chunk 0, [0, 3072): %3026, added as the shift after the img normalization further down in this function.
    %int-1_3735 = torch.constant.int -1
    %int0_3736 = torch.constant.int 0
    %int3072_3737 = torch.constant.int 3072
    %int1_3738 = torch.constant.int 1
    %3026 = torch.aten.slice.Tensor %3025, %int-1_3735, %int0_3736, %int3072_3737, %int1_3738 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1, [3072, 6144): %3027, used as (1 + %3027) to scale the normalized img tensor further down.
    %int-1_3739 = torch.constant.int -1
    %int3072_3740 = torch.constant.int 3072
    %int6144_3741 = torch.constant.int 6144
    %int1_3742 = torch.constant.int 1
    %3027 = torch.aten.slice.Tensor %3025, %int-1_3739, %int3072_3740, %int6144_3741, %int1_3742 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2, [6144, 9216): %3028. Not consumed within this chunk of the file.
    %int-1_3743 = torch.constant.int -1
    %int6144_3744 = torch.constant.int 6144
    %int9216_3745 = torch.constant.int 9216
    %int1_3746 = torch.constant.int 1
    %3028 = torch.aten.slice.Tensor %3025, %int-1_3743, %int6144_3744, %int9216_3745, %int1_3746 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 3, [9216, 12288): %3029. Not consumed within this chunk of the file.
    %int-1_3747 = torch.constant.int -1
    %int9216_3748 = torch.constant.int 9216
    %int12288_3749 = torch.constant.int 12288
    %int1_3750 = torch.constant.int 1
    %3029 = torch.aten.slice.Tensor %3025, %int-1_3747, %int9216_3748, %int12288_3749, %int1_3750 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 4, [12288, 15360): %3030. Not consumed within this chunk of the file.
    %int-1_3751 = torch.constant.int -1
    %int12288_3752 = torch.constant.int 12288
    %int15360_3753 = torch.constant.int 15360
    %int1_3754 = torch.constant.int 1
    %3030 = torch.aten.slice.Tensor %3025, %int-1_3751, %int12288_3752, %int15360_3753, %int1_3754 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 5, [15360, 18432): %3031. Not consumed within this chunk of the file.
    %int-1_3755 = torch.constant.int -1
    %int15360_3756 = torch.constant.int 15360
    %int18432_3757 = torch.constant.int 18432
    %int1_3758 = torch.constant.int 1
    %3031 = torch.aten.slice.Tensor %3025, %int-1_3755, %int15360_3756, %int18432_3757, %int1_3758 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // double_blocks.9.txt_mod.lin: SiLU of %119, a 3072 -> 18432 linear projection, then a split of the
    // result into six [1,1,3072] chunks. The SiLU of %119 is recomputed here rather than reusing an
    // earlier result; %119 itself is defined above this chunk.
    %3032 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the [18432,3072] weight and transpose it to [3072,18432] for use as the right matmul operand.
    %__auto.sampler.double_blocks.9.txt_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.9.txt_mod.lin.weight : tensor<18432x3072xf16>
    %3033 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.txt_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_3759 = torch.constant.int 0
    %int1_3760 = torch.constant.int 1
    %3034 = torch.aten.transpose.int %3033, %int0_3759, %int1_3760 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.9.txt_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.9.txt_mod.lin.bias : tensor<18432xf16>
    %3035 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.txt_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Upcast bias, activation and weight to f32 (dtype code 6) before the matmul.
    %int6_3761 = torch.constant.int 6
    %3036 = torch.prims.convert_element_type %3035, %int6_3761 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_3762 = torch.constant.int 6
    %3037 = torch.prims.convert_element_type %3032, %int6_3762 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_3763 = torch.constant.int 6
    %3038 = torch.prims.convert_element_type %3034, %int6_3763 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %3039 = torch.aten.mm %3037, %3038 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Multiplications by the integer constant 1 leave the product and the bias unchanged.
    %int1_3764 = torch.constant.int 1
    %3040 = torch.aten.mul.Scalar %3039, %int1_3764 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_3765 = torch.constant.int 1
    %3041 = torch.aten.mul.Scalar %3036, %int1_3765 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    // Add the bias and cast the result back to f16 (dtype code 5).
    %int1_3766 = torch.constant.int 1
    %3042 = torch.aten.add.Tensor %3040, %3041, %int1_3766 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_3767 = torch.constant.int 5
    %3043 = torch.prims.convert_element_type %3042, %int5_3767 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): the result type equals the input type.
    %int0_3768 = torch.constant.int 0
    %int0_3769 = torch.constant.int 0
    %int9223372036854775807_3770 = torch.constant.int 9223372036854775807
    %int1_3771 = torch.constant.int 1
    %3044 = torch.aten.slice.Tensor %3043, %int0_3768, %int0_3769, %int9223372036854775807_3770, %int1_3771 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a unit dim at position 1: [1,18432] -> [1,1,18432].
    %int1_3772 = torch.constant.int 1
    %3045 = torch.aten.unsqueeze %3044, %int1_3772 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Full-range slice on dim 2; again the shape is unchanged.
    %int2_3773 = torch.constant.int 2
    %int0_3774 = torch.constant.int 0
    %int9223372036854775807_3775 = torch.constant.int 9223372036854775807
    %int1_3776 = torch.constant.int 1
    %3046 = torch.aten.slice.Tensor %3045, %int2_3773, %int0_3774, %int9223372036854775807_3775, %int1_3776 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Six consecutive 3072-wide slices along the last dim (-1) of %3046.
    // Chunk 0, [0, 3072): %3047, added as the shift after the txt normalization further down in this function.
    %int-1_3777 = torch.constant.int -1
    %int0_3778 = torch.constant.int 0
    %int3072_3779 = torch.constant.int 3072
    %int1_3780 = torch.constant.int 1
    %3047 = torch.aten.slice.Tensor %3046, %int-1_3777, %int0_3778, %int3072_3779, %int1_3780 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1, [3072, 6144): %3048, used as (1 + %3048) to scale the normalized txt tensor further down.
    %int-1_3781 = torch.constant.int -1
    %int3072_3782 = torch.constant.int 3072
    %int6144_3783 = torch.constant.int 6144
    %int1_3784 = torch.constant.int 1
    %3048 = torch.aten.slice.Tensor %3046, %int-1_3781, %int3072_3782, %int6144_3783, %int1_3784 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2, [6144, 9216): %3049. Not consumed within this chunk of the file.
    %int-1_3785 = torch.constant.int -1
    %int6144_3786 = torch.constant.int 6144
    %int9216_3787 = torch.constant.int 9216
    %int1_3788 = torch.constant.int 1
    %3049 = torch.aten.slice.Tensor %3046, %int-1_3785, %int6144_3786, %int9216_3787, %int1_3788 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 3, [9216, 12288): %3050. Not consumed within this chunk of the file.
    %int-1_3789 = torch.constant.int -1
    %int9216_3790 = torch.constant.int 9216
    %int12288_3791 = torch.constant.int 12288
    %int1_3792 = torch.constant.int 1
    %3050 = torch.aten.slice.Tensor %3046, %int-1_3789, %int9216_3790, %int12288_3791, %int1_3792 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 4, [12288, 15360): %3051. Not consumed within this chunk of the file.
    %int-1_3793 = torch.constant.int -1
    %int12288_3794 = torch.constant.int 12288
    %int15360_3795 = torch.constant.int 15360
    %int1_3796 = torch.constant.int 1
    %3051 = torch.aten.slice.Tensor %3046, %int-1_3793, %int12288_3794, %int15360_3795, %int1_3796 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 5, [15360, 18432): %3052. Not consumed within this chunk of the file.
    %int-1_3797 = torch.constant.int -1
    %int15360_3798 = torch.constant.int 15360
    %int18432_3799 = torch.constant.int 18432
    %int1_3800 = torch.constant.int 1
    %3052 = torch.aten.slice.Tensor %3046, %int-1_3797, %int15360_3798, %int18432_3799, %int1_3800 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // LayerNorm-style normalization of %2950 ([1,4096,3072], defined above this chunk) over its last
    // axis in f32, then modulation as (1 + %3027) * normalized + %3026 and a flatten to 2-D.
    // Upcast to f32 (dtype code 6).
    %int6_3801 = torch.constant.int 6
    %3053 = torch.prims.convert_element_type %2950, %int6_3801 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %int2_3802 = torch.constant.int 2
    %3054 = torch.prim.ListConstruct %int2_3802 : (!torch.int) -> !torch.list<int>
    %int0_3803 = torch.constant.int 0
    %true_3804 = torch.constant.bool true
    // Reduce over dim 2 with correction 0 and keepdim = true. %result0_3805 is consumed as the
    // variance (eps added, then rsqrt) and %result1_3806 as the mean (subtracted below).
    %result0_3805, %result1_3806 = torch.aten.var_mean.correction %3053, %3054, %int0_3803, %true_3804 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // eps of roughly 1e-6 is added to the variance before the reciprocal square root.
    %float9.999990e-07_3807 = torch.constant.float 9.9999999999999995E-7
    %int1_3808 = torch.constant.int 1
    %3055 = torch.aten.add.Scalar %result0_3805, %float9.999990e-07_3807, %int1_3808 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %3056 = torch.aten.rsqrt %3055 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    %int1_3809 = torch.constant.int 1
    // (x - mean) * rsqrt(var + eps). The f16 input minus the f32 mean yields an f32 result.
    %3057 = torch.aten.sub.Tensor %2950, %result1_3806, %int1_3809 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %3058 = torch.aten.mul.Tensor %3057, %3056 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Cast the normalized tensor back to f16 (dtype code 5).
    %int5_3810 = torch.constant.int 5
    %3059 = torch.prims.convert_element_type %3058, %int5_3810 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Modulate: scale by (1 + %3027), then add %3026. Both come from the img_mod.lin output split.
    %int1_3811 = torch.constant.int 1
    %int1_3812 = torch.constant.int 1
    %3060 = torch.aten.add.Scalar %3027, %int1_3811, %int1_3812 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %3061 = torch.aten.mul.Tensor %3060, %3059 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_3813 = torch.constant.int 1
    %3062 = torch.aten.add.Tensor %3061, %3026, %int1_3813 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Drop the leading unit dim: [1,4096,3072] -> [4096,3072], the left operand of the QKV matmul below.
    %int4096_3814 = torch.constant.int 4096
    %int3072_3815 = torch.constant.int 3072
    %3063 = torch.prim.ListConstruct %int4096_3814, %int3072_3815 : (!torch.int, !torch.int) -> !torch.list<int>
    %3064 = torch.aten.view %3062, %3063 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Linear layer double_blocks.9.img_attn.qkv (3072 -> 9216) applied to %3064, then a reshape and
    // permute that exposes three [1,24,4096,128] tensors selected along the leading axis.
    // Load the [9216,3072] weight and transpose it to [3072,9216] for use as the right matmul operand.
    %__auto.sampler.double_blocks.9.img_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.9.img_attn.qkv.weight : tensor<9216x3072xf16>
    %3065 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.img_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_3816 = torch.constant.int 0
    %int1_3817 = torch.constant.int 1
    %3066 = torch.aten.transpose.int %3065, %int0_3816, %int1_3817 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.9.img_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.9.img_attn.qkv.bias : tensor<9216xf16>
    %3067 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.img_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, activations and weight to f32 (dtype code 6) before the matmul.
    %int6_3818 = torch.constant.int 6
    %3068 = torch.prims.convert_element_type %3067, %int6_3818 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_3819 = torch.constant.int 6
    %3069 = torch.prims.convert_element_type %3064, %int6_3819 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_3820 = torch.constant.int 6
    %3070 = torch.prims.convert_element_type %3066, %int6_3820 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %3071 = torch.aten.mm %3069, %3070 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[4096,9216],f32>
    // Multiplications by the integer constant 1 leave the product and the bias unchanged.
    %int1_3821 = torch.constant.int 1
    %3072 = torch.aten.mul.Scalar %3071, %int1_3821 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int1_3822 = torch.constant.int 1
    %3073 = torch.aten.mul.Scalar %3068, %int1_3822 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    // Add the bias and cast the result back to f16 (dtype code 5).
    %int1_3823 = torch.constant.int 1
    %3074 = torch.aten.add.Tensor %3072, %3073, %int1_3823 : !torch.vtensor<[4096,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int5_3824 = torch.constant.int 5
    %3075 = torch.prims.convert_element_type %3074, %int5_3824 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f16>
    // Restore the leading unit dim: [4096,9216] -> [1,4096,9216].
    %int1_3825 = torch.constant.int 1
    %int4096_3826 = torch.constant.int 4096
    %int9216_3827 = torch.constant.int 9216
    %3076 = torch.prim.ListConstruct %int1_3825, %int4096_3826, %int9216_3827 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3077 = torch.aten.view %3075, %3076 : !torch.vtensor<[4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,9216],f16>
    // Split the 9216 axis as 3 x 24 x 128: [1,4096,9216] -> [1,4096,3,24,128].
    %int1_3828 = torch.constant.int 1
    %int4096_3829 = torch.constant.int 4096
    %int3_3830 = torch.constant.int 3
    %int24_3831 = torch.constant.int 24
    %int128_3832 = torch.constant.int 128
    %3078 = torch.prim.ListConstruct %int1_3828, %int4096_3829, %int3_3830, %int24_3831, %int128_3832 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3079 = torch.aten.view %3077, %3078 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    // Permute with order (2, 0, 3, 1, 4): [1,4096,3,24,128] -> [3,1,24,4096,128].
    %int2_3833 = torch.constant.int 2
    %int0_3834 = torch.constant.int 0
    %int3_3835 = torch.constant.int 3
    %int1_3836 = torch.constant.int 1
    %int4_3837 = torch.constant.int 4
    %3080 = torch.prim.ListConstruct %int2_3833, %int0_3834, %int3_3835, %int1_3836, %int4_3837 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3081 = torch.aten.permute %3079, %3080 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // Index 0 along dim 0: %3082, later scaled by the query_norm parameter.
    %int0_3838 = torch.constant.int 0
    %int0_3839 = torch.constant.int 0
    %3082 = torch.aten.select.int %3081, %int0_3838, %int0_3839 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Index 1 along dim 0: %3083, later scaled by the key_norm parameter.
    %int0_3840 = torch.constant.int 0
    %int1_3841 = torch.constant.int 1
    %3083 = torch.aten.select.int %3081, %int0_3840, %int1_3841 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Index 2 along dim 0: %3084. NOTE(review): presumably the attention values - not consumed in this chunk.
    %int0_3842 = torch.constant.int 0
    %int2_3843 = torch.constant.int 2
    %3084 = torch.aten.select.int %3081, %int0_3842, %int2_3843 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // RMS-style normalization of %3082 and %3083 over their last axis (size 128), each followed by a
    // multiply with a [128] parameter (query_norm.scale and key_norm.scale respectively).
    // --- %3082 path: x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps), computed in f32 ---
    %int6_3844 = torch.constant.int 6
    %3085 = torch.prims.convert_element_type %3082, %int6_3844 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_3845 = torch.constant.int 2
    %3086 = torch.aten.pow.Tensor_Scalar %3085, %int2_3845 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_3846 = torch.constant.int -1
    %3087 = torch.prim.ListConstruct %int-1_3846 : (!torch.int) -> !torch.list<int>
    %true_3847 = torch.constant.bool true
    %none_3848 = torch.constant.none
    // Mean of the squares over the last dim with keepdim = true and no dtype override (none).
    %3088 = torch.aten.mean.dim %3086, %3087, %true_3847, %none_3848 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    // eps of roughly 1e-6 is added before the reciprocal square root.
    %float9.999990e-07_3849 = torch.constant.float 9.9999999999999995E-7
    %int1_3850 = torch.constant.int 1
    %3089 = torch.aten.add.Scalar %3088, %float9.999990e-07_3849, %int1_3850 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %3090 = torch.aten.rsqrt %3089 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %3091 = torch.aten.mul.Tensor %3085, %3090 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Cast back to f16 (dtype code 5), then multiply by the [128] query_norm.scale parameter.
    %int5_3851 = torch.constant.int 5
    %3092 = torch.prims.convert_element_type %3091, %int5_3851 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.9.img_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.9.img_attn.norm.query_norm.scale : tensor<128xf16>
    %3093 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.img_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %3094 = torch.aten.mul.Tensor %3092, %3093 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // --- %3083 path: the same computation, scaled by key_norm.scale ---
    %int6_3852 = torch.constant.int 6
    %3095 = torch.prims.convert_element_type %3083, %int6_3852 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_3853 = torch.constant.int 2
    %3096 = torch.aten.pow.Tensor_Scalar %3095, %int2_3853 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_3854 = torch.constant.int -1
    %3097 = torch.prim.ListConstruct %int-1_3854 : (!torch.int) -> !torch.list<int>
    %true_3855 = torch.constant.bool true
    %none_3856 = torch.constant.none
    %3098 = torch.aten.mean.dim %3096, %3097, %true_3855, %none_3856 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_3857 = torch.constant.float 9.9999999999999995E-7
    %int1_3858 = torch.constant.int 1
    %3099 = torch.aten.add.Scalar %3098, %float9.999990e-07_3857, %int1_3858 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %3100 = torch.aten.rsqrt %3099 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %3101 = torch.aten.mul.Tensor %3095, %3100 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    %int5_3859 = torch.constant.int 5
    %3102 = torch.prims.convert_element_type %3101, %int5_3859 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.9.img_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.9.img_attn.norm.key_norm.scale : tensor<128xf16>
    %3103 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.img_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %3104 = torch.aten.mul.Tensor %3102, %3103 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // Both conversions below target dtype code 5 (f16) on tensors that are already f16, so the
    // element type is unchanged. NOTE(review): likely a `.to(dtype)` call kept by the exporter.
    %int5_3860 = torch.constant.int 5
    %3105 = torch.prims.convert_element_type %3094, %int5_3860 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_3861 = torch.constant.int 5
    %3106 = torch.prims.convert_element_type %3104, %int5_3861 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // LayerNorm-style normalization of %3010 ([1,512,3072]) over its last axis in f32, then modulation
    // as (1 + %3048) * normalized + %3047 and a flatten to 2-D.
    // Upcast to f32 (dtype code 6).
    %int6_3862 = torch.constant.int 6
    %3107 = torch.prims.convert_element_type %3010, %int6_3862 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %int2_3863 = torch.constant.int 2
    %3108 = torch.prim.ListConstruct %int2_3863 : (!torch.int) -> !torch.list<int>
    %int0_3864 = torch.constant.int 0
    %true_3865 = torch.constant.bool true
    // Reduce over dim 2 with correction 0 and keepdim = true. %result0_3866 is consumed as the
    // variance (eps added, then rsqrt) and %result1_3867 as the mean (subtracted below).
    %result0_3866, %result1_3867 = torch.aten.var_mean.correction %3107, %3108, %int0_3864, %true_3865 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // eps of roughly 1e-6 is added to the variance before the reciprocal square root.
    %float9.999990e-07_3868 = torch.constant.float 9.9999999999999995E-7
    %int1_3869 = torch.constant.int 1
    %3109 = torch.aten.add.Scalar %result0_3866, %float9.999990e-07_3868, %int1_3869 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %3110 = torch.aten.rsqrt %3109 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    %int1_3870 = torch.constant.int 1
    // (x - mean) * rsqrt(var + eps). The f16 input minus the f32 mean yields an f32 result.
    %3111 = torch.aten.sub.Tensor %3010, %result1_3867, %int1_3870 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %3112 = torch.aten.mul.Tensor %3111, %3110 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    // Cast the normalized tensor back to f16 (dtype code 5).
    %int5_3871 = torch.constant.int 5
    %3113 = torch.prims.convert_element_type %3112, %int5_3871 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Modulate: scale by (1 + %3048), then add %3047. Both come from the txt_mod.lin output split.
    %int1_3872 = torch.constant.int 1
    %int1_3873 = torch.constant.int 1
    %3114 = torch.aten.add.Scalar %3048, %int1_3872, %int1_3873 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %3115 = torch.aten.mul.Tensor %3114, %3113 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_3874 = torch.constant.int 1
    %3116 = torch.aten.add.Tensor %3115, %3047, %int1_3874 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Drop the leading unit dim: [1,512,3072] -> [512,3072], the left operand of the QKV matmul below.
    %int512_3875 = torch.constant.int 512
    %int3072_3876 = torch.constant.int 3072
    %3117 = torch.prim.ListConstruct %int512_3875, %int3072_3876 : (!torch.int, !torch.int) -> !torch.list<int>
    %3118 = torch.aten.view %3116, %3117 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Linear layer double_blocks.9.txt_attn.qkv applied to %3118:
    //   %3131 = view(f16(f32(%3118) @ f32(W^T) + f32(bias)), [1,512,9216])
    // Weight is stored [9216,3072] and transposed to [3072,9216] for the matmul.
    %__auto.sampler.double_blocks.9.txt_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.9.txt_attn.qkv.weight : tensor<9216x3072xf16>
    %3119 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.txt_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_3877 = torch.constant.int 0
    %int1_3878 = torch.constant.int 1
    %3120 = torch.aten.transpose.int %3119, %int0_3877, %int1_3878 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.9.txt_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.9.txt_attn.qkv.bias : tensor<9216xf16>
    %3121 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.txt_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Bias, activation and weight are all widened to f32 so the matmul and bias add run in f32.
    %int6_3879 = torch.constant.int 6
    %3122 = torch.prims.convert_element_type %3121, %int6_3879 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_3880 = torch.constant.int 6
    %3123 = torch.prims.convert_element_type %3118, %int6_3880 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_3881 = torch.constant.int 6
    %3124 = torch.prims.convert_element_type %3120, %int6_3881 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %3125 = torch.aten.mm %3123, %3124 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[512,9216],f32>
    // Both products below multiply by the integer constant 1, so values are unchanged.
    // NOTE(review): looks like the alpha/beta factors of a decomposed addmm -- confirm.
    %int1_3882 = torch.constant.int 1
    %3126 = torch.aten.mul.Scalar %3125, %int1_3882 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int1_3883 = torch.constant.int 1
    %3127 = torch.aten.mul.Scalar %3122, %int1_3883 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_3884 = torch.constant.int 1
    // Bias [9216] is broadcast across the 512 rows.
    %3128 = torch.aten.add.Tensor %3126, %3127, %int1_3884 : !torch.vtensor<[512,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    // Narrow back to f16 and restore the leading unit dim.
    %int5_3885 = torch.constant.int 5
    %3129 = torch.prims.convert_element_type %3128, %int5_3885 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f16>
    %int1_3886 = torch.constant.int 1
    %int512_3887 = torch.constant.int 512
    %int9216_3888 = torch.constant.int 9216
    %3130 = torch.prim.ListConstruct %int1_3886, %int512_3887, %int9216_3888 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3131 = torch.aten.view %3129, %3130 : !torch.vtensor<[512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,9216],f16>
    // Split the fused 9216-wide projection into three [1,24,512,128] tensors.
    // Step 1: view [1,512,9216] as [1,512,3,24,128] (9216 = 3 * 24 * 128).
    %int1_3889 = torch.constant.int 1
    %int512_3890 = torch.constant.int 512
    %int3_3891 = torch.constant.int 3
    %int24_3892 = torch.constant.int 24
    %int128_3893 = torch.constant.int 128
    %3132 = torch.prim.ListConstruct %int1_3889, %int512_3890, %int3_3891, %int24_3892, %int128_3893 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3133 = torch.aten.view %3131, %3132 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    // Step 2: permute with order (2,0,3,1,4) so the size-3 axis leads: [3,1,24,512,128].
    %int2_3894 = torch.constant.int 2
    %int0_3895 = torch.constant.int 0
    %int3_3896 = torch.constant.int 3
    %int1_3897 = torch.constant.int 1
    %int4_3898 = torch.constant.int 4
    %3134 = torch.prim.ListConstruct %int2_3894, %int0_3895, %int3_3896, %int1_3897, %int4_3898 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3135 = torch.aten.permute %3133, %3134 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // Step 3: index the leading axis. In the visible code, %3136 (index 0) feeds the
    // query_norm path, %3137 (index 1) the key_norm path, and %3138 (index 2) is used
    // as the value operand of the attention call.
    %int0_3899 = torch.constant.int 0
    %int0_3900 = torch.constant.int 0
    %3136 = torch.aten.select.int %3135, %int0_3899, %int0_3900 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int0_3901 = torch.constant.int 0
    %int1_3902 = torch.constant.int 1
    %3137 = torch.aten.select.int %3135, %int0_3901, %int1_3902 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int0_3903 = torch.constant.int 0
    %int2_3904 = torch.constant.int 2
    %3138 = torch.aten.select.int %3135, %int0_3903, %int2_3904 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Root-mean-square normalization of %3136 over its last axis (size 128), computed in f32:
    //   %3145 = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps), eps ~= 1e-6
    // followed by a cast to f16 and an elementwise multiply by the learned
    // [128] query_norm.scale parameter.
    %int6_3905 = torch.constant.int 6
    %3139 = torch.prims.convert_element_type %3136, %int6_3905 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_3906 = torch.constant.int 2
    %3140 = torch.aten.pow.Tensor_Scalar %3139, %int2_3906 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_3907 = torch.constant.int -1
    %3141 = torch.prim.ListConstruct %int-1_3907 : (!torch.int) -> !torch.list<int>
    // keepdim = true, so the mean has shape [1,24,512,1] and broadcasts back against x.
    %true_3908 = torch.constant.bool true
    %none_3909 = torch.constant.none
    %3142 = torch.aten.mean.dim %3140, %3141, %true_3908, %none_3909 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_3910 = torch.constant.float 9.9999999999999995E-7
    %int1_3911 = torch.constant.int 1
    %3143 = torch.aten.add.Scalar %3142, %float9.999990e-07_3910, %int1_3911 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %3144 = torch.aten.rsqrt %3143 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %3145 = torch.aten.mul.Tensor %3139, %3144 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_3912 = torch.constant.int 5
    %3146 = torch.prims.convert_element_type %3145, %int5_3912 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // The scale is applied in f16, after the normalized value has been narrowed.
    %__auto.sampler.double_blocks.9.txt_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.9.txt_attn.norm.query_norm.scale : tensor<128xf16>
    %3147 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.txt_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %3148 = torch.aten.mul.Tensor %3146, %3147 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // Root-mean-square normalization of %3137 over its last axis (size 128), computed in f32:
    //   %3155 = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps), eps ~= 1e-6
    // followed by a cast to f16 and an elementwise multiply by the learned
    // [128] key_norm.scale parameter.
    %int6_3913 = torch.constant.int 6
    %3149 = torch.prims.convert_element_type %3137, %int6_3913 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_3914 = torch.constant.int 2
    %3150 = torch.aten.pow.Tensor_Scalar %3149, %int2_3914 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_3915 = torch.constant.int -1
    %3151 = torch.prim.ListConstruct %int-1_3915 : (!torch.int) -> !torch.list<int>
    // keepdim = true, so the mean has shape [1,24,512,1] and broadcasts back against x.
    %true_3916 = torch.constant.bool true
    %none_3917 = torch.constant.none
    %3152 = torch.aten.mean.dim %3150, %3151, %true_3916, %none_3917 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_3918 = torch.constant.float 9.9999999999999995E-7
    %int1_3919 = torch.constant.int 1
    %3153 = torch.aten.add.Scalar %3152, %float9.999990e-07_3918, %int1_3919 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %3154 = torch.aten.rsqrt %3153 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %3155 = torch.aten.mul.Tensor %3149, %3154 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_3920 = torch.constant.int 5
    %3156 = torch.prims.convert_element_type %3155, %int5_3920 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // The scale is applied in f16, after the normalized value has been narrowed.
    %__auto.sampler.double_blocks.9.txt_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.9.txt_attn.norm.key_norm.scale : tensor<128xf16>
    %3157 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.txt_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %3158 = torch.aten.mul.Tensor %3156, %3157 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // These two casts are f16 -> f16 (source and result types are identical), so they
    // leave the values unchanged.
    %int5_3921 = torch.constant.int 5
    %3159 = torch.prims.convert_element_type %3148, %int5_3921 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int5_3922 = torch.constant.int 5
    %3160 = torch.prims.convert_element_type %3158, %int5_3922 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Concatenate along dim 2, with the 512-long tensor first and the 4096-long tensor
    // second, giving 4608 positions. The same ordering is used for all three cats.
    // NOTE(review): %3105, %3106 and %3084 are defined before this excerpt; presumably
    // the image-stream query, key and value of the same block -- confirm.
    %3161 = torch.prim.ListConstruct %3159, %3105 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_3923 = torch.constant.int 2
    %3162 = torch.aten.cat %3161, %int2_3923 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %3163 = torch.prim.ListConstruct %3160, %3106 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_3924 = torch.constant.int 2
    %3164 = torch.aten.cat %3163, %int2_3924 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // %3138 goes into this cat directly; no normalization is applied to it in the visible code.
    %3165 = torch.prim.ListConstruct %3138, %3084 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_3925 = torch.constant.int 2
    %3166 = torch.aten.cat %3165, %int2_3925 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Apply a per-position 2x2 linear map from %211 ([1,1,4608,64,2,2], f32) to each
    // consecutive pair of channels of %3162 and %3164. For an input pair (x0, x1) and
    // output index i in {0,1}:
    //   out[i] = %211[..., i, 0] * x0 + %211[..., i, 1] * x1
    // The size-1 axis 1 of %211 broadcasts over the 24 entries of axis 1 of the inputs.
    // NOTE(review): %211 is defined before this excerpt; presumably the rotary
    // position-embedding table (cos/sin rotation matrices) -- confirm against its producer.
    //
    // View both inputs in f32 as [1,24,4608,64,1,2]; the -1 in the shape list resolves
    // to 64 (128 = 64 * 1 * 2).
    %int6_3926 = torch.constant.int 6
    %3167 = torch.prims.convert_element_type %3162, %int6_3926 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_3927 = torch.constant.int 1
    %int24_3928 = torch.constant.int 24
    %int4608_3929 = torch.constant.int 4608
    %int-1_3930 = torch.constant.int -1
    %int1_3931 = torch.constant.int 1
    %int2_3932 = torch.constant.int 2
    %3168 = torch.prim.ListConstruct %int1_3927, %int24_3928, %int4608_3929, %int-1_3930, %int1_3931, %int2_3932 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3169 = torch.aten.view %3167, %3168 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    %int6_3933 = torch.constant.int 6
    %3170 = torch.prims.convert_element_type %3164, %int6_3933 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_3934 = torch.constant.int 1
    %int24_3935 = torch.constant.int 24
    %int4608_3936 = torch.constant.int 4608
    %int-1_3937 = torch.constant.int -1
    %int1_3938 = torch.constant.int 1
    %int2_3939 = torch.constant.int 2
    %3171 = torch.prim.ListConstruct %int1_3934, %int24_3935, %int4608_3936, %int-1_3937, %int1_3938, %int2_3939 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3172 = torch.aten.view %3170, %3171 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // --- First input (%3169): column 0 of the 2x2 map times x0 ...
    %int5_3940 = torch.constant.int 5
    %int0_3941 = torch.constant.int 0
    %3173 = torch.aten.select.int %211, %int5_3940, %int0_3941 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_3942 = torch.constant.int 5
    %int0_3943 = torch.constant.int 0
    %3174 = torch.aten.select.int %3169, %int5_3942, %int0_3943 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %3175 = torch.aten.mul.Tensor %3173, %3174 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // ... plus column 1 times x1.
    %int5_3944 = torch.constant.int 5
    %int1_3945 = torch.constant.int 1
    %3176 = torch.aten.select.int %211, %int5_3944, %int1_3945 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_3946 = torch.constant.int 5
    %int1_3947 = torch.constant.int 1
    %3177 = torch.aten.select.int %3169, %int5_3946, %int1_3947 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %3178 = torch.aten.mul.Tensor %3176, %3177 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_3948 = torch.constant.int 1
    %3179 = torch.aten.add.Tensor %3175, %3178, %int1_3948 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // --- Second input (%3172): the same two-term sum with the same %211 columns.
    %int5_3949 = torch.constant.int 5
    %int0_3950 = torch.constant.int 0
    %3180 = torch.aten.select.int %211, %int5_3949, %int0_3950 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_3951 = torch.constant.int 5
    %int0_3952 = torch.constant.int 0
    %3181 = torch.aten.select.int %3172, %int5_3951, %int0_3952 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %3182 = torch.aten.mul.Tensor %3180, %3181 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_3953 = torch.constant.int 5
    %int1_3954 = torch.constant.int 1
    %3183 = torch.aten.select.int %211, %int5_3953, %int1_3954 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_3955 = torch.constant.int 5
    %int1_3956 = torch.constant.int 1
    %3184 = torch.aten.select.int %3172, %int5_3955, %int1_3956 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %3185 = torch.aten.mul.Tensor %3183, %3184 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_3957 = torch.constant.int 1
    %3186 = torch.aten.add.Tensor %3182, %3185, %int1_3957 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Collapse the (64, 2) pair axes back to 128 channels and narrow to f16.
    %int1_3958 = torch.constant.int 1
    %int24_3959 = torch.constant.int 24
    %int4608_3960 = torch.constant.int 4608
    %int128_3961 = torch.constant.int 128
    %3187 = torch.prim.ListConstruct %int1_3958, %int24_3959, %int4608_3960, %int128_3961 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3188 = torch.aten.view %3179, %3187 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_3962 = torch.constant.int 5
    %3189 = torch.prims.convert_element_type %3188, %int5_3962 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_3963 = torch.constant.int 1
    %int24_3964 = torch.constant.int 24
    %int4608_3965 = torch.constant.int 4608
    %int128_3966 = torch.constant.int 128
    %3190 = torch.prim.ListConstruct %int1_3963, %int24_3964, %int4608_3965, %int128_3966 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3191 = torch.aten.view %3186, %3190 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_3967 = torch.constant.int 5
    %3192 = torch.prims.convert_element_type %3191, %int5_3967 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Attention over the 4608 concatenated positions, with operands (%3189, %3192, %3166)
    // passed as query, key and value. The op is carried as a generic torch.operator.
    // NOTE(review): by the PyTorch signature of this op the trailing arguments should be
    // dropout_p = 0.0, is_causal = false, attn_mask = none, scale = none -- confirm
    // against the PyTorch version used for export.
    %float0.000000e00_3968 = torch.constant.float 0.000000e+00
    %false_3969 = torch.constant.bool false
    %none_3970 = torch.constant.none
    %none_3971 = torch.constant.none
    // Two results: #0 is the [1,24,4608,128] f16 output; #1 is a [1,24,4608] f32 tensor
    // that is not consumed anywhere in the visible lines.
    %3193:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%3189, %3192, %3166, %float0.000000e00_3968, %false_3969, %none_3970, %none_3971) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap axes 1 and 2 (permute order 0,2,1,3): [1,24,4608,128] -> [1,4608,24,128].
    %int0_3972 = torch.constant.int 0
    %int2_3973 = torch.constant.int 2
    %int1_3974 = torch.constant.int 1
    %int3_3975 = torch.constant.int 3
    %3194 = torch.prim.ListConstruct %int0_3972, %int2_3973, %int1_3974, %int3_3975 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3195 = torch.aten.permute %3193#0, %3194 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the trailing (24, 128) axes into 3072 channels: [1,4608,3072].
    %int1_3976 = torch.constant.int 1
    %int4608_3977 = torch.constant.int 4608
    %int3072_3978 = torch.constant.int 3072
    %3196 = torch.prim.ListConstruct %int1_3976, %int4608_3977, %int3072_3978 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3197 = torch.aten.view %3195, %3196 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Split %3197 along dim 1 into its first 512 positions (%3199) and the remaining
    // 4096 positions (%3201).
    // The dim-0 slices use start 0, end INT64_MAX, step 1, so they select the whole
    // axis; their result type equals the input type.
    %int0_3979 = torch.constant.int 0
    %int0_3980 = torch.constant.int 0
    %int9223372036854775807_3981 = torch.constant.int 9223372036854775807
    %int1_3982 = torch.constant.int 1
    %3198 = torch.aten.slice.Tensor %3197, %int0_3979, %int0_3980, %int9223372036854775807_3981, %int1_3982 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Positions [0, 512) of dim 1.
    %int1_3983 = torch.constant.int 1
    %int0_3984 = torch.constant.int 0
    %int512_3985 = torch.constant.int 512
    %int1_3986 = torch.constant.int 1
    %3199 = torch.aten.slice.Tensor %3198, %int1_3983, %int0_3984, %int512_3985, %int1_3986 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %int0_3987 = torch.constant.int 0
    %int0_3988 = torch.constant.int 0
    %int9223372036854775807_3989 = torch.constant.int 9223372036854775807
    %int1_3990 = torch.constant.int 1
    %3200 = torch.aten.slice.Tensor %3197, %int0_3987, %int0_3988, %int9223372036854775807_3989, %int1_3990 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Positions [512, end) of dim 1, i.e. the remaining 4096.
    %int1_3991 = torch.constant.int 1
    %int512_3992 = torch.constant.int 512
    %int9223372036854775807_3993 = torch.constant.int 9223372036854775807
    %int1_3994 = torch.constant.int 1
    %3201 = torch.aten.slice.Tensor %3200, %int1_3991, %int512_3992, %int9223372036854775807_3993, %int1_3994 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Flatten the 4096-position part to 2-D for the following matmul.
    %int4096_3995 = torch.constant.int 4096
    %int3072_3996 = torch.constant.int 3072
    %3202 = torch.prim.ListConstruct %int4096_3995, %int3072_3996 : (!torch.int, !torch.int) -> !torch.list<int>
    %3203 = torch.aten.view %3201, %3202 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Linear layer double_blocks.9.img_attn.proj applied to %3203:
    //   %3216 = view(f16(f32(%3203) @ f32(W^T) + f32(bias)), [1,4096,3072])
    // Weight is stored [3072,3072] and transposed before the matmul.
    %__auto.sampler.double_blocks.9.img_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.9.img_attn.proj.weight : tensor<3072x3072xf16>
    %3204 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.img_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_3997 = torch.constant.int 0
    %int1_3998 = torch.constant.int 1
    %3205 = torch.aten.transpose.int %3204, %int0_3997, %int1_3998 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.9.img_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.9.img_attn.proj.bias : tensor<3072xf16>
    %3206 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.img_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Bias, activation and weight are all widened to f32 so the matmul and bias add run in f32.
    %int6_3999 = torch.constant.int 6
    %3207 = torch.prims.convert_element_type %3206, %int6_3999 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_4000 = torch.constant.int 6
    %3208 = torch.prims.convert_element_type %3203, %int6_4000 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_4001 = torch.constant.int 6
    %3209 = torch.prims.convert_element_type %3205, %int6_4001 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %3210 = torch.aten.mm %3208, %3209 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Both products below multiply by the integer constant 1, so values are unchanged.
    // NOTE(review): looks like the alpha/beta factors of a decomposed addmm -- confirm.
    %int1_4002 = torch.constant.int 1
    %3211 = torch.aten.mul.Scalar %3210, %int1_4002 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_4003 = torch.constant.int 1
    %3212 = torch.aten.mul.Scalar %3207, %int1_4003 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_4004 = torch.constant.int 1
    // Bias [3072] is broadcast across the 4096 rows.
    %3213 = torch.aten.add.Tensor %3211, %3212, %int1_4004 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    // Narrow back to f16 and restore the leading unit dim.
    %int5_4005 = torch.constant.int 5
    %3214 = torch.prims.convert_element_type %3213, %int5_4005 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_4006 = torch.constant.int 1
    %int4096_4007 = torch.constant.int 4096
    %int3072_4008 = torch.constant.int 3072
    %3215 = torch.prim.ListConstruct %int1_4006, %int4096_4007, %int3072_4008 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3216 = torch.aten.view %3214, %3215 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Scaled residual update: %3218 = %2950 + %3028 * %3216, with the [1,1,3072] factor
    // broadcast over the 4096 positions.
    // NOTE(review): %2950 and %3028..%3030 are defined before this excerpt; presumably the
    // image-stream residual and its modulation gate/shift/scale -- confirm.
    %3217 = torch.aten.mul.Tensor %3028, %3216 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_4009 = torch.constant.int 1
    %3218 = torch.aten.add.Tensor %2950, %3217, %int1_4009 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // %3219 = %3030 + 1; it multiplies the normalized tensor further down.
    %int1_4010 = torch.constant.int 1
    %int1_4011 = torch.constant.int 1
    %3219 = torch.aten.add.Scalar %3030, %int1_4010, %int1_4011 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %3218 over dim 2 (the 3072 channels) in f32, with no learned affine:
    //   %3225 = (x - mean) * rsqrt(var + eps), eps ~= 1e-6
    // var_mean is called with correction = 0 and keepdim = true;
    // result0 is the variance and result1 the mean.
    %int6_4012 = torch.constant.int 6
    %3220 = torch.prims.convert_element_type %3218, %int6_4012 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %int2_4013 = torch.constant.int 2
    %3221 = torch.prim.ListConstruct %int2_4013 : (!torch.int) -> !torch.list<int>
    %int0_4014 = torch.constant.int 0
    %true_4015 = torch.constant.bool true
    %result0_4016, %result1_4017 = torch.aten.var_mean.correction %3220, %3221, %int0_4014, %true_4015 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    %float9.999990e-07_4018 = torch.constant.float 9.9999999999999995E-7
    %int1_4019 = torch.constant.int 1
    %3222 = torch.aten.add.Scalar %result0_4016, %float9.999990e-07_4018, %int1_4019 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %3223 = torch.aten.rsqrt %3222 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    %int1_4020 = torch.constant.int 1
    // The subtraction takes the f16 tensor %3218 (not the f32 copy %3220) as its
    // left operand; the declared result type is f32.
    %3224 = torch.aten.sub.Tensor %3218, %result1_4017, %int1_4020 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %3225 = torch.aten.mul.Tensor %3224, %3223 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    %int5_4021 = torch.constant.int 5
    %3226 = torch.prims.convert_element_type %3225, %int5_4021 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Elementwise affine on the normalized tensor: %3228 = (%3030 + 1) * %3226 + %3029.
    %3227 = torch.aten.mul.Tensor %3219, %3226 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_4022 = torch.constant.int 1
    %3228 = torch.aten.add.Tensor %3227, %3029, %int1_4022 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Flatten to 2-D for the following matmul.
    %int4096_4023 = torch.constant.int 4096
    %int3072_4024 = torch.constant.int 3072
    %3229 = torch.prim.ListConstruct %int4096_4023, %int3072_4024 : (!torch.int, !torch.int) -> !torch.list<int>
    %3230 = torch.aten.view %3228, %3229 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Linear layer double_blocks.9.img_mlp.0 (3072 -> 12288) applied to %3230, followed
    // by a tanh-approximated GELU:
    //   %3243 = view(f16(f32(%3230) @ f32(W^T) + f32(bias)), [1,4096,12288])
    //   %3246 = view(gelu(%3243, "tanh"), [4096,12288])
    %__auto.sampler.double_blocks.9.img_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.9.img_mlp.0.weight : tensor<12288x3072xf16>
    %3231 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.img_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_4025 = torch.constant.int 0
    %int1_4026 = torch.constant.int 1
    %3232 = torch.aten.transpose.int %3231, %int0_4025, %int1_4026 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.9.img_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.9.img_mlp.0.bias : tensor<12288xf16>
    %3233 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.img_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Bias, activation and weight are all widened to f32 so the matmul and bias add run in f32.
    %int6_4027 = torch.constant.int 6
    %3234 = torch.prims.convert_element_type %3233, %int6_4027 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_4028 = torch.constant.int 6
    %3235 = torch.prims.convert_element_type %3230, %int6_4028 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_4029 = torch.constant.int 6
    %3236 = torch.prims.convert_element_type %3232, %int6_4029 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %3237 = torch.aten.mm %3235, %3236 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[4096,12288],f32>
    // Both products below multiply by the integer constant 1, so values are unchanged.
    %int1_4030 = torch.constant.int 1
    %3238 = torch.aten.mul.Scalar %3237, %int1_4030 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int1_4031 = torch.constant.int 1
    %3239 = torch.aten.mul.Scalar %3234, %int1_4031 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_4032 = torch.constant.int 1
    %3240 = torch.aten.add.Tensor %3238, %3239, %int1_4032 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int5_4033 = torch.constant.int 5
    %3241 = torch.prims.convert_element_type %3240, %int5_4033 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f16>
    %int1_4034 = torch.constant.int 1
    %int4096_4035 = torch.constant.int 4096
    %int12288_4036 = torch.constant.int 12288
    %3242 = torch.prim.ListConstruct %int1_4034, %int4096_4035, %int12288_4036 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3243 = torch.aten.view %3241, %3242 : !torch.vtensor<[4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,12288],f16>
    // GELU with approximate = "tanh", evaluated on the f16 tensor.
    %str_4037 = torch.constant.str "tanh"
    %3244 = torch.aten.gelu %3243, %str_4037 : !torch.vtensor<[1,4096,12288],f16>, !torch.str -> !torch.vtensor<[1,4096,12288],f16>
    // Flatten to 2-D for the following matmul.
    %int4096_4038 = torch.constant.int 4096
    %int12288_4039 = torch.constant.int 12288
    %3245 = torch.prim.ListConstruct %int4096_4038, %int12288_4039 : (!torch.int, !torch.int) -> !torch.list<int>
    %3246 = torch.aten.view %3244, %3245 : !torch.vtensor<[1,4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[4096,12288],f16>
    // Linear layer double_blocks.9.img_mlp.2 (12288 -> 3072) applied to %3246, then a
    // scaled residual update onto %3218:
    //   %3259 = view(f16(f32(%3246) @ f32(W^T) + f32(bias)), [1,4096,3072])
    //   %3261 = %3218 + %3031 * %3259
    %__auto.sampler.double_blocks.9.img_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.9.img_mlp.2.weight : tensor<3072x12288xf16>
    %3247 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.img_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_4040 = torch.constant.int 0
    %int1_4041 = torch.constant.int 1
    %3248 = torch.aten.transpose.int %3247, %int0_4040, %int1_4041 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.9.img_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.9.img_mlp.2.bias : tensor<3072xf16>
    %3249 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.img_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Bias, activation and weight are all widened to f32 so the matmul and bias add run in f32.
    %int6_4042 = torch.constant.int 6
    %3250 = torch.prims.convert_element_type %3249, %int6_4042 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_4043 = torch.constant.int 6
    %3251 = torch.prims.convert_element_type %3246, %int6_4043 : !torch.vtensor<[4096,12288],f16>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int6_4044 = torch.constant.int 6
    %3252 = torch.prims.convert_element_type %3248, %int6_4044 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %3253 = torch.aten.mm %3251, %3252 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Both products below multiply by the integer constant 1, so values are unchanged.
    %int1_4045 = torch.constant.int 1
    %3254 = torch.aten.mul.Scalar %3253, %int1_4045 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_4046 = torch.constant.int 1
    %3255 = torch.aten.mul.Scalar %3250, %int1_4046 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_4047 = torch.constant.int 1
    %3256 = torch.aten.add.Tensor %3254, %3255, %int1_4047 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_4048 = torch.constant.int 5
    %3257 = torch.prims.convert_element_type %3256, %int5_4048 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_4049 = torch.constant.int 1
    %int4096_4050 = torch.constant.int 4096
    %int3072_4051 = torch.constant.int 3072
    %3258 = torch.prim.ListConstruct %int1_4049, %int4096_4050, %int3072_4051 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3259 = torch.aten.view %3257, %3258 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // NOTE(review): %3031 is a [1,1,3072] tensor defined before this excerpt; presumably
    // the gate for the image-stream MLP residual -- confirm.
    %3260 = torch.aten.mul.Tensor %3031, %3259 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_4052 = torch.constant.int 1
    %3261 = torch.aten.add.Tensor %3218, %3260, %int1_4052 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer double_blocks.9.txt_attn.proj applied to %3199 (the [1,512,3072] slice):
    //   %3276 = view(f16(f32(view(%3199, [512,3072])) @ f32(W^T) + f32(bias)), [1,512,3072])
    %int512_4053 = torch.constant.int 512
    %int3072_4054 = torch.constant.int 3072
    %3262 = torch.prim.ListConstruct %int512_4053, %int3072_4054 : (!torch.int, !torch.int) -> !torch.list<int>
    %3263 = torch.aten.view %3199, %3262 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Weight is stored [3072,3072] and transposed before the matmul.
    %__auto.sampler.double_blocks.9.txt_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.9.txt_attn.proj.weight : tensor<3072x3072xf16>
    %3264 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.txt_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_4055 = torch.constant.int 0
    %int1_4056 = torch.constant.int 1
    %3265 = torch.aten.transpose.int %3264, %int0_4055, %int1_4056 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.9.txt_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.9.txt_attn.proj.bias : tensor<3072xf16>
    %3266 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.txt_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Bias, activation and weight are all widened to f32 so the matmul and bias add run in f32.
    %int6_4057 = torch.constant.int 6
    %3267 = torch.prims.convert_element_type %3266, %int6_4057 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_4058 = torch.constant.int 6
    %3268 = torch.prims.convert_element_type %3263, %int6_4058 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_4059 = torch.constant.int 6
    %3269 = torch.prims.convert_element_type %3265, %int6_4059 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %3270 = torch.aten.mm %3268, %3269 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Both products below multiply by the integer constant 1, so values are unchanged.
    %int1_4060 = torch.constant.int 1
    %3271 = torch.aten.mul.Scalar %3270, %int1_4060 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_4061 = torch.constant.int 1
    %3272 = torch.aten.mul.Scalar %3267, %int1_4061 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_4062 = torch.constant.int 1
    %3273 = torch.aten.add.Tensor %3271, %3272, %int1_4062 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    // Narrow back to f16 and restore the leading unit dim.
    %int5_4063 = torch.constant.int 5
    %3274 = torch.prims.convert_element_type %3273, %int5_4063 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_4064 = torch.constant.int 1
    %int512_4065 = torch.constant.int 512
    %int3072_4066 = torch.constant.int 3072
    %3275 = torch.prim.ListConstruct %int1_4064, %int512_4065, %int3072_4066 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3276 = torch.aten.view %3274, %3275 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    %3277 = torch.aten.mul.Tensor %3049, %3276 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_4067 = torch.constant.int 1
    %3278 = torch.aten.add.Tensor %3010, %3277, %int1_4067 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Normalize %3278 over its last dimension, then apply a scale and shift.
    // %3279 = %3051 + 1 is the multiplicative factor used after normalization.
    // NOTE(review): %3051 and %3050 are defined before this chunk; presumably the modulation
    // scale and shift -- confirm against their definitions.
    %int1_4068 = torch.constant.int 1
    %int1_4069 = torch.constant.int 1
    %3279 = torch.aten.add.Scalar %3051, %int1_4068, %int1_4069 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Statistics are computed on an f32 copy of the input (dtype code 6).
    %int6_4070 = torch.constant.int 6
    %3280 = torch.prims.convert_element_type %3278, %int6_4070 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // Variance (result0) and mean (result1) over dim 2, correction = 0, keepdim = true.
    %int2_4071 = torch.constant.int 2
    %3281 = torch.prim.ListConstruct %int2_4071 : (!torch.int) -> !torch.list<int>
    %int0_4072 = torch.constant.int 0
    %true_4073 = torch.constant.bool true
    %result0_4074, %result1_4075 = torch.aten.var_mean.correction %3280, %3281, %int0_4072, %true_4073 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(var + ~1e-6).
    %float9.999990e-07_4076 = torch.constant.float 9.9999999999999995E-7
    %int1_4077 = torch.constant.int 1
    %3282 = torch.aten.add.Scalar %result0_4074, %float9.999990e-07_4076, %int1_4077 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %3283 = torch.aten.rsqrt %3282 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    %int1_4078 = torch.constant.int 1
    // (x - mean) * rsqrt(var + eps); the f16 input minus the f32 mean yields an f32 result.
    %3284 = torch.aten.sub.Tensor %3278, %result1_4075, %int1_4078 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %3285 = torch.aten.mul.Tensor %3284, %3283 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    // Back to f16 (dtype code 5), then scale by %3279 and add %3050.
    %int5_4079 = torch.constant.int 5
    %3286 = torch.prims.convert_element_type %3285, %int5_4079 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %3287 = torch.aten.mul.Tensor %3279, %3286 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_4080 = torch.constant.int 1
    %3288 = torch.aten.add.Tensor %3287, %3050, %int1_4080 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Drop the leading unit dim ([1,512,3072] -> [512,3072]) for the following 2-D matmul.
    %int512_4081 = torch.constant.int 512
    %int3072_4082 = torch.constant.int 3072
    %3289 = torch.prim.ListConstruct %int512_4081, %int3072_4082 : (!torch.int, !torch.int) -> !torch.list<int>
    %3290 = torch.aten.view %3288, %3289 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // double_blocks.9 txt_mlp.0: linear layer 3072 -> 12288 applied to %3290, followed by a
    // tanh-approximated GELU.
    %__auto.sampler.double_blocks.9.txt_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.9.txt_mlp.0.weight : tensor<12288x3072xf16>
    %3291 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.txt_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_4083 = torch.constant.int 0
    %int1_4084 = torch.constant.int 1
    // Weight is stored [12288,3072]; transpose to [3072,12288] for the mm.
    %3292 = torch.aten.transpose.int %3291, %int0_4083, %int1_4084 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.9.txt_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.9.txt_mlp.0.bias : tensor<12288xf16>
    %3293 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.txt_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Upcast bias, activations and weight to f32 (dtype code 6) before the matmul.
    %int6_4085 = torch.constant.int 6
    %3294 = torch.prims.convert_element_type %3293, %int6_4085 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_4086 = torch.constant.int 6
    %3295 = torch.prims.convert_element_type %3290, %int6_4086 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_4087 = torch.constant.int 6
    %3296 = torch.prims.convert_element_type %3292, %int6_4087 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %3297 = torch.aten.mm %3295, %3296 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[512,12288],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    %int1_4088 = torch.constant.int 1
    %3298 = torch.aten.mul.Scalar %3297, %int1_4088 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int1_4089 = torch.constant.int 1
    %3299 = torch.aten.mul.Scalar %3294, %int1_4089 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_4090 = torch.constant.int 1
    // Bias add in f32, then cast back to f16 (dtype code 5).
    %3300 = torch.aten.add.Tensor %3298, %3299, %int1_4090 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int5_4091 = torch.constant.int 5
    %3301 = torch.prims.convert_element_type %3300, %int5_4091 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f16>
    // Reshape to [1,512,12288] before the activation.
    %int1_4092 = torch.constant.int 1
    %int512_4093 = torch.constant.int 512
    %int12288_4094 = torch.constant.int 12288
    %3302 = torch.prim.ListConstruct %int1_4092, %int512_4093, %int12288_4094 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3303 = torch.aten.view %3301, %3302 : !torch.vtensor<[512,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,512,12288],f16>
    // GELU with approximate = "tanh", evaluated on the f16 tensor.
    %str_4095 = torch.constant.str "tanh"
    %3304 = torch.aten.gelu %3303, %str_4095 : !torch.vtensor<[1,512,12288],f16>, !torch.str -> !torch.vtensor<[1,512,12288],f16>
    // double_blocks.9 txt_mlp.2: linear layer 12288 -> 3072 applied to the GELU output %3304,
    // followed by a scaled residual add that produces %3321.
    // Flatten [1,512,12288] -> [512,12288] for the 2-D matmul.
    %int512_4096 = torch.constant.int 512
    %int12288_4097 = torch.constant.int 12288
    %3305 = torch.prim.ListConstruct %int512_4096, %int12288_4097 : (!torch.int, !torch.int) -> !torch.list<int>
    %3306 = torch.aten.view %3304, %3305 : !torch.vtensor<[1,512,12288],f16>, !torch.list<int> -> !torch.vtensor<[512,12288],f16>
    %__auto.sampler.double_blocks.9.txt_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.9.txt_mlp.2.weight : tensor<3072x12288xf16>
    %3307 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.txt_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_4098 = torch.constant.int 0
    %int1_4099 = torch.constant.int 1
    // Weight is stored [3072,12288]; transpose to [12288,3072] for the mm.
    %3308 = torch.aten.transpose.int %3307, %int0_4098, %int1_4099 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.9.txt_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.9.txt_mlp.2.bias : tensor<3072xf16>
    %3309 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.9.txt_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, activations and weight to f32 (dtype code 6) before the matmul.
    %int6_4100 = torch.constant.int 6
    %3310 = torch.prims.convert_element_type %3309, %int6_4100 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_4101 = torch.constant.int 6
    %3311 = torch.prims.convert_element_type %3306, %int6_4101 : !torch.vtensor<[512,12288],f16>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int6_4102 = torch.constant.int 6
    %3312 = torch.prims.convert_element_type %3308, %int6_4102 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %3313 = torch.aten.mm %3311, %3312 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    %int1_4103 = torch.constant.int 1
    %3314 = torch.aten.mul.Scalar %3313, %int1_4103 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_4104 = torch.constant.int 1
    %3315 = torch.aten.mul.Scalar %3310, %int1_4104 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_4105 = torch.constant.int 1
    // Bias add in f32, then cast back to f16 (dtype code 5).
    %3316 = torch.aten.add.Tensor %3314, %3315, %int1_4105 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_4106 = torch.constant.int 5
    %3317 = torch.prims.convert_element_type %3316, %int5_4106 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    // Restore the leading unit dimension: [512,3072] -> [1,512,3072].
    %int1_4107 = torch.constant.int 1
    %int512_4108 = torch.constant.int 512
    %int3072_4109 = torch.constant.int 3072
    %3318 = torch.prim.ListConstruct %int1_4107, %int512_4108, %int3072_4109 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3319 = torch.aten.view %3317, %3318 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Scale by %3052 ([1,1,3072], broadcast over the 512 dim) and add onto %3278.
    // NOTE(review): %3052 is defined before this chunk; presumably the MLP modulation gate -- confirm.
    %3320 = torch.aten.mul.Tensor %3052, %3319 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_4110 = torch.constant.int 1
    %3321 = torch.aten.add.Tensor %3278, %3320, %int1_4110 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // double_blocks.10 img_mod.lin: silu(%119) -> linear 3072 -> 18432 -> six 3072-wide slices
    // (%3337 .. %3342) along the last dimension.
    // NOTE(review): %119 is defined before this chunk; presumably the conditioning vector -- confirm.
    %3322 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.double_blocks.10.img_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.10.img_mod.lin.weight : tensor<18432x3072xf16>
    %3323 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.img_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_4111 = torch.constant.int 0
    %int1_4112 = torch.constant.int 1
    // Weight is stored [18432,3072]; transpose to [3072,18432] for the mm.
    %3324 = torch.aten.transpose.int %3323, %int0_4111, %int1_4112 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.10.img_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.10.img_mod.lin.bias : tensor<18432xf16>
    %3325 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.img_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Upcast bias, activations and weight to f32 (dtype code 6) before the matmul.
    %int6_4113 = torch.constant.int 6
    %3326 = torch.prims.convert_element_type %3325, %int6_4113 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_4114 = torch.constant.int 6
    %3327 = torch.prims.convert_element_type %3322, %int6_4114 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_4115 = torch.constant.int 6
    %3328 = torch.prims.convert_element_type %3324, %int6_4115 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %3329 = torch.aten.mm %3327, %3328 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    %int1_4116 = torch.constant.int 1
    %3330 = torch.aten.mul.Scalar %3329, %int1_4116 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_4117 = torch.constant.int 1
    %3331 = torch.aten.mul.Scalar %3326, %int1_4117 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_4118 = torch.constant.int 1
    // Bias add in f32, then cast back to f16 (dtype code 5).
    %3332 = torch.aten.add.Tensor %3330, %3331, %int1_4118 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_4119 = torch.constant.int 5
    %3333 = torch.prims.convert_element_type %3332, %int5_4119 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): the shape is unchanged.
    %int0_4120 = torch.constant.int 0
    %int0_4121 = torch.constant.int 0
    %int9223372036854775807_4122 = torch.constant.int 9223372036854775807
    %int1_4123 = torch.constant.int 1
    %3334 = torch.aten.slice.Tensor %3333, %int0_4120, %int0_4121, %int9223372036854775807_4122, %int1_4123 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a unit dim at position 1: [1,18432] -> [1,1,18432].
    %int1_4124 = torch.constant.int 1
    %3335 = torch.aten.unsqueeze %3334, %int1_4124 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Full-range slice on dim 2: the shape is unchanged.
    %int2_4125 = torch.constant.int 2
    %int0_4126 = torch.constant.int 0
    %int9223372036854775807_4127 = torch.constant.int 9223372036854775807
    %int1_4128 = torch.constant.int 1
    %3336 = torch.aten.slice.Tensor %3335, %int2_4125, %int0_4126, %int9223372036854775807_4127, %int1_4128 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Slice [0, 3072) of the last dim -> %3337 (added as a shift after the img normalization below).
    %int-1_4129 = torch.constant.int -1
    %int0_4130 = torch.constant.int 0
    %int3072_4131 = torch.constant.int 3072
    %int1_4132 = torch.constant.int 1
    %3337 = torch.aten.slice.Tensor %3336, %int-1_4129, %int0_4130, %int3072_4131, %int1_4132 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [3072, 6144) -> %3338 (used as 1 + %3338 scale after the img normalization below).
    %int-1_4133 = torch.constant.int -1
    %int3072_4134 = torch.constant.int 3072
    %int6144_4135 = torch.constant.int 6144
    %int1_4136 = torch.constant.int 1
    %3338 = torch.aten.slice.Tensor %3336, %int-1_4133, %int3072_4134, %int6144_4135, %int1_4136 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [6144, 9216) -> %3339 (consumer is outside this chunk).
    %int-1_4137 = torch.constant.int -1
    %int6144_4138 = torch.constant.int 6144
    %int9216_4139 = torch.constant.int 9216
    %int1_4140 = torch.constant.int 1
    %3339 = torch.aten.slice.Tensor %3336, %int-1_4137, %int6144_4138, %int9216_4139, %int1_4140 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [9216, 12288) -> %3340 (consumer is outside this chunk).
    %int-1_4141 = torch.constant.int -1
    %int9216_4142 = torch.constant.int 9216
    %int12288_4143 = torch.constant.int 12288
    %int1_4144 = torch.constant.int 1
    %3340 = torch.aten.slice.Tensor %3336, %int-1_4141, %int9216_4142, %int12288_4143, %int1_4144 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [12288, 15360) -> %3341 (consumer is outside this chunk).
    %int-1_4145 = torch.constant.int -1
    %int12288_4146 = torch.constant.int 12288
    %int15360_4147 = torch.constant.int 15360
    %int1_4148 = torch.constant.int 1
    %3341 = torch.aten.slice.Tensor %3336, %int-1_4145, %int12288_4146, %int15360_4147, %int1_4148 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [15360, 18432) -> %3342 (consumer is outside this chunk).
    %int-1_4149 = torch.constant.int -1
    %int15360_4150 = torch.constant.int 15360
    %int18432_4151 = torch.constant.int 18432
    %int1_4152 = torch.constant.int 1
    %3342 = torch.aten.slice.Tensor %3336, %int-1_4149, %int15360_4150, %int18432_4151, %int1_4152 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // double_blocks.10 txt_mod.lin: silu(%119) -> linear 3072 -> 18432 -> six 3072-wide slices
    // (%3358 .. %3363) along the last dimension.
    // %3343 recomputes the same silu(%119) already produced as %3322 for img_mod.
    %3343 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.double_blocks.10.txt_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.10.txt_mod.lin.weight : tensor<18432x3072xf16>
    %3344 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.txt_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_4153 = torch.constant.int 0
    %int1_4154 = torch.constant.int 1
    // Weight is stored [18432,3072]; transpose to [3072,18432] for the mm.
    %3345 = torch.aten.transpose.int %3344, %int0_4153, %int1_4154 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.10.txt_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.10.txt_mod.lin.bias : tensor<18432xf16>
    %3346 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.txt_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Upcast bias, activations and weight to f32 (dtype code 6) before the matmul.
    %int6_4155 = torch.constant.int 6
    %3347 = torch.prims.convert_element_type %3346, %int6_4155 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_4156 = torch.constant.int 6
    %3348 = torch.prims.convert_element_type %3343, %int6_4156 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_4157 = torch.constant.int 6
    %3349 = torch.prims.convert_element_type %3345, %int6_4157 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %3350 = torch.aten.mm %3348, %3349 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    %int1_4158 = torch.constant.int 1
    %3351 = torch.aten.mul.Scalar %3350, %int1_4158 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_4159 = torch.constant.int 1
    %3352 = torch.aten.mul.Scalar %3347, %int1_4159 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_4160 = torch.constant.int 1
    // Bias add in f32, then cast back to f16 (dtype code 5).
    %3353 = torch.aten.add.Tensor %3351, %3352, %int1_4160 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_4161 = torch.constant.int 5
    %3354 = torch.prims.convert_element_type %3353, %int5_4161 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): the shape is unchanged.
    %int0_4162 = torch.constant.int 0
    %int0_4163 = torch.constant.int 0
    %int9223372036854775807_4164 = torch.constant.int 9223372036854775807
    %int1_4165 = torch.constant.int 1
    %3355 = torch.aten.slice.Tensor %3354, %int0_4162, %int0_4163, %int9223372036854775807_4164, %int1_4165 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a unit dim at position 1: [1,18432] -> [1,1,18432].
    %int1_4166 = torch.constant.int 1
    %3356 = torch.aten.unsqueeze %3355, %int1_4166 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Full-range slice on dim 2: the shape is unchanged.
    %int2_4167 = torch.constant.int 2
    %int0_4168 = torch.constant.int 0
    %int9223372036854775807_4169 = torch.constant.int 9223372036854775807
    %int1_4170 = torch.constant.int 1
    %3357 = torch.aten.slice.Tensor %3356, %int2_4167, %int0_4168, %int9223372036854775807_4169, %int1_4170 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Slice [0, 3072) of the last dim -> %3358 (added as a shift after the txt normalization below).
    %int-1_4171 = torch.constant.int -1
    %int0_4172 = torch.constant.int 0
    %int3072_4173 = torch.constant.int 3072
    %int1_4174 = torch.constant.int 1
    %3358 = torch.aten.slice.Tensor %3357, %int-1_4171, %int0_4172, %int3072_4173, %int1_4174 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [3072, 6144) -> %3359 (used as 1 + %3359 scale after the txt normalization below).
    %int-1_4175 = torch.constant.int -1
    %int3072_4176 = torch.constant.int 3072
    %int6144_4177 = torch.constant.int 6144
    %int1_4178 = torch.constant.int 1
    %3359 = torch.aten.slice.Tensor %3357, %int-1_4175, %int3072_4176, %int6144_4177, %int1_4178 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [6144, 9216) -> %3360 (consumer is outside this chunk).
    %int-1_4179 = torch.constant.int -1
    %int6144_4180 = torch.constant.int 6144
    %int9216_4181 = torch.constant.int 9216
    %int1_4182 = torch.constant.int 1
    %3360 = torch.aten.slice.Tensor %3357, %int-1_4179, %int6144_4180, %int9216_4181, %int1_4182 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [9216, 12288) -> %3361 (consumer is outside this chunk).
    %int-1_4183 = torch.constant.int -1
    %int9216_4184 = torch.constant.int 9216
    %int12288_4185 = torch.constant.int 12288
    %int1_4186 = torch.constant.int 1
    %3361 = torch.aten.slice.Tensor %3357, %int-1_4183, %int9216_4184, %int12288_4185, %int1_4186 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [12288, 15360) -> %3362 (consumer is outside this chunk).
    %int-1_4187 = torch.constant.int -1
    %int12288_4188 = torch.constant.int 12288
    %int15360_4189 = torch.constant.int 15360
    %int1_4190 = torch.constant.int 1
    %3362 = torch.aten.slice.Tensor %3357, %int-1_4187, %int12288_4188, %int15360_4189, %int1_4190 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [15360, 18432) -> %3363 (consumer is outside this chunk).
    %int-1_4191 = torch.constant.int -1
    %int15360_4192 = torch.constant.int 15360
    %int18432_4193 = torch.constant.int 18432
    %int1_4194 = torch.constant.int 1
    %3363 = torch.aten.slice.Tensor %3357, %int-1_4191, %int15360_4192, %int18432_4193, %int1_4194 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %3261 ([1,4096,3072]) over its last dimension, then apply the img_mod
    // scale (1 + %3338) and shift (%3337).
    // NOTE(review): %3261 is defined before this chunk; presumably the img stream output of
    // double_blocks.9 -- confirm.
    // Statistics are computed on an f32 copy of the input (dtype code 6).
    %int6_4195 = torch.constant.int 6
    %3364 = torch.prims.convert_element_type %3261, %int6_4195 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // Variance (result0) and mean (result1) over dim 2, correction = 0, keepdim = true.
    %int2_4196 = torch.constant.int 2
    %3365 = torch.prim.ListConstruct %int2_4196 : (!torch.int) -> !torch.list<int>
    %int0_4197 = torch.constant.int 0
    %true_4198 = torch.constant.bool true
    %result0_4199, %result1_4200 = torch.aten.var_mean.correction %3364, %3365, %int0_4197, %true_4198 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(var + ~1e-6).
    %float9.999990e-07_4201 = torch.constant.float 9.9999999999999995E-7
    %int1_4202 = torch.constant.int 1
    %3366 = torch.aten.add.Scalar %result0_4199, %float9.999990e-07_4201, %int1_4202 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %3367 = torch.aten.rsqrt %3366 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    %int1_4203 = torch.constant.int 1
    // (x - mean) * rsqrt(var + eps); the f16 input minus the f32 mean yields an f32 result.
    %3368 = torch.aten.sub.Tensor %3261, %result1_4200, %int1_4203 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %3369 = torch.aten.mul.Tensor %3368, %3367 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Back to f16 (dtype code 5).
    %int5_4204 = torch.constant.int 5
    %3370 = torch.prims.convert_element_type %3369, %int5_4204 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // (1 + %3338) * normalized + %3337, both modulation tensors broadcasting over the 4096 dim.
    %int1_4205 = torch.constant.int 1
    %int1_4206 = torch.constant.int 1
    %3371 = torch.aten.add.Scalar %3338, %int1_4205, %int1_4206 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %3372 = torch.aten.mul.Tensor %3371, %3370 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_4207 = torch.constant.int 1
    %3373 = torch.aten.add.Tensor %3372, %3337, %int1_4207 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Drop the leading unit dim ([1,4096,3072] -> [4096,3072]) for the following 2-D matmul.
    %int4096_4208 = torch.constant.int 4096
    %int3072_4209 = torch.constant.int 3072
    %3374 = torch.prim.ListConstruct %int4096_4208, %int3072_4209 : (!torch.int, !torch.int) -> !torch.list<int>
    %3375 = torch.aten.view %3373, %3374 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // double_blocks.10 img_attn.qkv: linear 3072 -> 9216 applied to %3375, then reshaped and
    // split into three [1,24,4096,128] tensors (%3393, %3394, %3395).
    %__auto.sampler.double_blocks.10.img_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.10.img_attn.qkv.weight : tensor<9216x3072xf16>
    %3376 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.img_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_4210 = torch.constant.int 0
    %int1_4211 = torch.constant.int 1
    // Weight is stored [9216,3072]; transpose to [3072,9216] for the mm.
    %3377 = torch.aten.transpose.int %3376, %int0_4210, %int1_4211 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.10.img_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.10.img_attn.qkv.bias : tensor<9216xf16>
    %3378 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.img_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, activations and weight to f32 (dtype code 6) before the matmul.
    %int6_4212 = torch.constant.int 6
    %3379 = torch.prims.convert_element_type %3378, %int6_4212 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_4213 = torch.constant.int 6
    %3380 = torch.prims.convert_element_type %3375, %int6_4213 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_4214 = torch.constant.int 6
    %3381 = torch.prims.convert_element_type %3377, %int6_4214 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %3382 = torch.aten.mm %3380, %3381 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[4096,9216],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    %int1_4215 = torch.constant.int 1
    %3383 = torch.aten.mul.Scalar %3382, %int1_4215 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int1_4216 = torch.constant.int 1
    %3384 = torch.aten.mul.Scalar %3379, %int1_4216 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_4217 = torch.constant.int 1
    // Bias add in f32, then cast back to f16 (dtype code 5).
    %3385 = torch.aten.add.Tensor %3383, %3384, %int1_4217 : !torch.vtensor<[4096,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int5_4218 = torch.constant.int 5
    %3386 = torch.prims.convert_element_type %3385, %int5_4218 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f16>
    // Restore the leading unit dimension: [4096,9216] -> [1,4096,9216].
    %int1_4219 = torch.constant.int 1
    %int4096_4220 = torch.constant.int 4096
    %int9216_4221 = torch.constant.int 9216
    %3387 = torch.prim.ListConstruct %int1_4219, %int4096_4220, %int9216_4221 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3388 = torch.aten.view %3386, %3387 : !torch.vtensor<[4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,9216],f16>
    // Split the 9216 features as 3 x 24 x 128
    // (presumably {q,k,v} x heads x head_dim -- TODO confirm).
    %int1_4222 = torch.constant.int 1
    %int4096_4223 = torch.constant.int 4096
    %int3_4224 = torch.constant.int 3
    %int24_4225 = torch.constant.int 24
    %int128_4226 = torch.constant.int 128
    %3389 = torch.prim.ListConstruct %int1_4222, %int4096_4223, %int3_4224, %int24_4225, %int128_4226 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3390 = torch.aten.view %3388, %3389 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    // Permute with order (2, 0, 3, 1, 4): [1,4096,3,24,128] -> [3,1,24,4096,128].
    %int2_4227 = torch.constant.int 2
    %int0_4228 = torch.constant.int 0
    %int3_4229 = torch.constant.int 3
    %int1_4230 = torch.constant.int 1
    %int4_4231 = torch.constant.int 4
    %3391 = torch.prim.ListConstruct %int2_4227, %int0_4228, %int3_4229, %int1_4230, %int4_4231 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3392 = torch.aten.permute %3390, %3391 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // Index 0 along dim 0 -> %3393 (later scaled by query_norm.scale).
    %int0_4232 = torch.constant.int 0
    %int0_4233 = torch.constant.int 0
    %3393 = torch.aten.select.int %3392, %int0_4232, %int0_4233 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Index 1 along dim 0 -> %3394 (later scaled by key_norm.scale).
    %int0_4234 = torch.constant.int 0
    %int1_4235 = torch.constant.int 1
    %3394 = torch.aten.select.int %3392, %int0_4234, %int1_4235 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Index 2 along dim 0 -> %3395 (presumably the attention values; consumer is outside this chunk).
    %int0_4236 = torch.constant.int 0
    %int2_4237 = torch.constant.int 2
    %3395 = torch.aten.select.int %3392, %int0_4236, %int2_4237 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // double_blocks.10 img_attn.norm: mean-of-squares normalization over the last dim,
    // x * rsqrt(mean(x^2) + ~1e-6), applied to %3393 with query_norm.scale and to %3394 with
    // key_norm.scale. The normalization runs in f32; the learned scale is applied in f16.
    // --- query path (%3393) ---
    %int6_4238 = torch.constant.int 6
    %3396 = torch.prims.convert_element_type %3393, %int6_4238 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    // x^2, then mean over dim -1 with keepdim = true (dtype argument is none).
    %int2_4239 = torch.constant.int 2
    %3397 = torch.aten.pow.Tensor_Scalar %3396, %int2_4239 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_4240 = torch.constant.int -1
    %3398 = torch.prim.ListConstruct %int-1_4240 : (!torch.int) -> !torch.list<int>
    %true_4241 = torch.constant.bool true
    %none_4242 = torch.constant.none
    %3399 = torch.aten.mean.dim %3397, %3398, %true_4241, %none_4242 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    // rsqrt(mean + ~1e-6), multiplied back onto the f32 input.
    %float9.999990e-07_4243 = torch.constant.float 9.9999999999999995E-7
    %int1_4244 = torch.constant.int 1
    %3400 = torch.aten.add.Scalar %3399, %float9.999990e-07_4243, %int1_4244 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %3401 = torch.aten.rsqrt %3400 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %3402 = torch.aten.mul.Tensor %3396, %3401 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Back to f16 (dtype code 5), then multiply by the [128] query_norm scale (broadcast on the last dim).
    %int5_4245 = torch.constant.int 5
    %3403 = torch.prims.convert_element_type %3402, %int5_4245 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.10.img_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.10.img_attn.norm.query_norm.scale : tensor<128xf16>
    %3404 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.img_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %3405 = torch.aten.mul.Tensor %3403, %3404 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // --- key path (%3394): same sequence with key_norm.scale ---
    %int6_4246 = torch.constant.int 6
    %3406 = torch.prims.convert_element_type %3394, %int6_4246 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_4247 = torch.constant.int 2
    %3407 = torch.aten.pow.Tensor_Scalar %3406, %int2_4247 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_4248 = torch.constant.int -1
    %3408 = torch.prim.ListConstruct %int-1_4248 : (!torch.int) -> !torch.list<int>
    %true_4249 = torch.constant.bool true
    %none_4250 = torch.constant.none
    %3409 = torch.aten.mean.dim %3407, %3408, %true_4249, %none_4250 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_4251 = torch.constant.float 9.9999999999999995E-7
    %int1_4252 = torch.constant.int 1
    %3410 = torch.aten.add.Scalar %3409, %float9.999990e-07_4251, %int1_4252 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %3411 = torch.aten.rsqrt %3410 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %3412 = torch.aten.mul.Tensor %3406, %3411 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    %int5_4253 = torch.constant.int 5
    %3413 = torch.prims.convert_element_type %3412, %int5_4253 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.10.img_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.10.img_attn.norm.key_norm.scale : tensor<128xf16>
    %3414 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.img_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %3415 = torch.aten.mul.Tensor %3413, %3414 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // f16 -> f16 conversions: source and result element types are identical, so the values
    // are unchanged (presumably a ".to(dtype)" kept by the exporter -- TODO confirm).
    %int5_4254 = torch.constant.int 5
    %3416 = torch.prims.convert_element_type %3405, %int5_4254 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_4255 = torch.constant.int 5
    %3417 = torch.prims.convert_element_type %3415, %int5_4255 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Normalize the text stream %3321 ([1,512,3072] f16) over dim 2.
    // Statistics are computed in f32 (dtype code 6): var_mean with correction 0,
    // keeping the reduced dim, so both results are [1,512,1].
    %int6_4256 = torch.constant.int 6
    %3418 = torch.prims.convert_element_type %3321, %int6_4256 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %int2_4257 = torch.constant.int 2
    %3419 = torch.prim.ListConstruct %int2_4257 : (!torch.int) -> !torch.list<int>
    %int0_4258 = torch.constant.int 0
    %true_4259 = torch.constant.bool true
    %result0_4260, %result1_4261 = torch.aten.var_mean.correction %3418, %3419, %int0_4258, %true_4259 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(var + ~1e-6)
    %float9.999990e-07_4262 = torch.constant.float 9.9999999999999995E-7
    %int1_4263 = torch.constant.int 1
    %3420 = torch.aten.add.Scalar %result0_4260, %float9.999990e-07_4262, %int1_4263 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %3421 = torch.aten.rsqrt %3420 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // (x - mean) * rsqrt(...). The subtraction takes the f16 %3321 directly (not
    // the f32 copy %3418); the declared result type is f32.
    %int1_4264 = torch.constant.int 1
    %3422 = torch.aten.sub.Tensor %3321, %result1_4261, %int1_4264 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %3423 = torch.aten.mul.Tensor %3422, %3421 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_4265 = torch.constant.int 5
    %3424 = torch.prims.convert_element_type %3423, %int5_4265 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Affine step: (%3359 + 1) * normalized + %3358, broadcasting the [1,1,3072]
    // operands over the 512 rows. %3358/%3359 are defined before this excerpt.
    // NOTE(review): presumably the modulation scale/shift for the txt stream — confirm.
    %int1_4266 = torch.constant.int 1
    %int1_4267 = torch.constant.int 1
    %3425 = torch.aten.add.Scalar %3359, %int1_4266, %int1_4267 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %3426 = torch.aten.mul.Tensor %3425, %3424 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_4268 = torch.constant.int 1
    %3427 = torch.aten.add.Tensor %3426, %3358, %int1_4268 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Linear layer txt_attn.qkv: [512,3072] x [3072,9216] + bias[9216].
    // Drop the leading unit dim so the input is a 2-D matrix for aten.mm.
    %int512_4269 = torch.constant.int 512
    %int3072_4270 = torch.constant.int 3072
    %3428 = torch.prim.ListConstruct %int512_4269, %int3072_4270 : (!torch.int, !torch.int) -> !torch.list<int>
    %3429 = torch.aten.view %3427, %3428 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // The weight is stored as [9216,3072]; transpose dims 0 and 1 to get [3072,9216].
    %__auto.sampler.double_blocks.10.txt_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.10.txt_attn.qkv.weight : tensor<9216x3072xf16>
    %3430 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.txt_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_4271 = torch.constant.int 0
    %int1_4272 = torch.constant.int 1
    %3431 = torch.aten.transpose.int %3430, %int0_4271, %int1_4272 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.10.txt_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.10.txt_attn.qkv.bias : tensor<9216xf16>
    %3432 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.txt_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, activations and weight to f32 (dtype code 6) before the matmul.
    %int6_4273 = torch.constant.int 6
    %3433 = torch.prims.convert_element_type %3432, %int6_4273 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_4274 = torch.constant.int 6
    %3434 = torch.prims.convert_element_type %3429, %int6_4274 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_4275 = torch.constant.int 6
    %3435 = torch.prims.convert_element_type %3431, %int6_4275 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %3436 = torch.aten.mm %3434, %3435 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[512,9216],f32>
    // Both operands are multiplied by the integer constant 1 before the bias add.
    // NOTE(review): looks like a decomposed addmm with alpha = beta = 1 — confirm.
    %int1_4276 = torch.constant.int 1
    %3437 = torch.aten.mul.Scalar %3436, %int1_4276 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int1_4277 = torch.constant.int 1
    %3438 = torch.aten.mul.Scalar %3433, %int1_4277 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_4278 = torch.constant.int 1
    %3439 = torch.aten.add.Tensor %3437, %3438, %int1_4278 : !torch.vtensor<[512,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    // Back to f16 (dtype code 5) and restore the leading unit dim.
    %int5_4279 = torch.constant.int 5
    %3440 = torch.prims.convert_element_type %3439, %int5_4279 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f16>
    %int1_4280 = torch.constant.int 1
    %int512_4281 = torch.constant.int 512
    %int9216_4282 = torch.constant.int 9216
    %3441 = torch.prim.ListConstruct %int1_4280, %int512_4281, %int9216_4282 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3442 = torch.aten.view %3440, %3441 : !torch.vtensor<[512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,9216],f16>
    // Split the fused projection: view [1,512,9216] as [1,512,3,24,128]
    // (3 * 24 * 128 = 9216).
    %int1_4283 = torch.constant.int 1
    %int512_4284 = torch.constant.int 512
    %int3_4285 = torch.constant.int 3
    %int24_4286 = torch.constant.int 24
    %int128_4287 = torch.constant.int 128
    %3443 = torch.prim.ListConstruct %int1_4283, %int512_4284, %int3_4285, %int24_4286, %int128_4287 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3444 = torch.aten.view %3442, %3443 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    // Permute with order (2,0,3,1,4) so the size-3 axis leads: [3,1,24,512,128].
    %int2_4288 = torch.constant.int 2
    %int0_4289 = torch.constant.int 0
    %int3_4290 = torch.constant.int 3
    %int1_4291 = torch.constant.int 1
    %int4_4292 = torch.constant.int 4
    %3445 = torch.prim.ListConstruct %int2_4288, %int0_4289, %int3_4290, %int1_4291, %int4_4292 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3446 = torch.aten.permute %3444, %3445 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // Take indices 0, 1, 2 along dim 0. Further down, %3447 is scaled by
    // query_norm.scale, %3448 by key_norm.scale, and %3449 is concatenated into
    // the third tensor operand of the attention op.
    %int0_4293 = torch.constant.int 0
    %int0_4294 = torch.constant.int 0
    %3447 = torch.aten.select.int %3446, %int0_4293, %int0_4294 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int0_4295 = torch.constant.int 0
    %int1_4296 = torch.constant.int 1
    %3448 = torch.aten.select.int %3446, %int0_4295, %int1_4296 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int0_4297 = torch.constant.int 0
    %int2_4298 = torch.constant.int 2
    %3449 = torch.aten.select.int %3446, %int0_4297, %int2_4298 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Root-mean-square normalization of %3447 over the last dim, done in f32:
    // x * rsqrt(mean(x^2, dim=-1, keepdim) + ~1e-6).
    %int6_4299 = torch.constant.int 6
    %3450 = torch.prims.convert_element_type %3447, %int6_4299 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_4300 = torch.constant.int 2
    %3451 = torch.aten.pow.Tensor_Scalar %3450, %int2_4300 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_4301 = torch.constant.int -1
    %3452 = torch.prim.ListConstruct %int-1_4301 : (!torch.int) -> !torch.list<int>
    %true_4302 = torch.constant.bool true
    %none_4303 = torch.constant.none
    %3453 = torch.aten.mean.dim %3451, %3452, %true_4302, %none_4303 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_4304 = torch.constant.float 9.9999999999999995E-7
    %int1_4305 = torch.constant.int 1
    %3454 = torch.aten.add.Scalar %3453, %float9.999990e-07_4304, %int1_4305 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %3455 = torch.aten.rsqrt %3454 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %3456 = torch.aten.mul.Tensor %3450, %3455 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    // Cast back to f16 (dtype code 5), then apply the [128] query_norm.scale
    // parameter, broadcast over the leading dims.
    %int5_4306 = torch.constant.int 5
    %3457 = torch.prims.convert_element_type %3456, %int5_4306 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.10.txt_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.10.txt_attn.norm.query_norm.scale : tensor<128xf16>
    %3458 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.txt_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %3459 = torch.aten.mul.Tensor %3457, %3458 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // Same root-mean-square normalization as for the query, applied to %3448:
    // x * rsqrt(mean(x^2, dim=-1, keepdim) + ~1e-6), computed in f32.
    %int6_4307 = torch.constant.int 6
    %3460 = torch.prims.convert_element_type %3448, %int6_4307 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_4308 = torch.constant.int 2
    %3461 = torch.aten.pow.Tensor_Scalar %3460, %int2_4308 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_4309 = torch.constant.int -1
    %3462 = torch.prim.ListConstruct %int-1_4309 : (!torch.int) -> !torch.list<int>
    %true_4310 = torch.constant.bool true
    %none_4311 = torch.constant.none
    %3463 = torch.aten.mean.dim %3461, %3462, %true_4310, %none_4311 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_4312 = torch.constant.float 9.9999999999999995E-7
    %int1_4313 = torch.constant.int 1
    %3464 = torch.aten.add.Scalar %3463, %float9.999990e-07_4312, %int1_4313 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %3465 = torch.aten.rsqrt %3464 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %3466 = torch.aten.mul.Tensor %3460, %3465 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    // Cast back to f16 (dtype code 5), then apply the [128] key_norm.scale parameter.
    %int5_4314 = torch.constant.int 5
    %3467 = torch.prims.convert_element_type %3466, %int5_4314 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.10.txt_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.10.txt_attn.norm.key_norm.scale : tensor<128xf16>
    %3468 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.txt_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %3469 = torch.aten.mul.Tensor %3467, %3468 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // f16 -> f16 casts of the normalized txt tensors: element type is unchanged.
    %int5_4315 = torch.constant.int 5
    %3470 = torch.prims.convert_element_type %3459, %int5_4315 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int5_4316 = torch.constant.int 5
    %3471 = torch.prims.convert_element_type %3469, %int5_4316 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Concatenate along dim 2, txt (512) first and img (4096) second, giving 4608.
    // Three pairs: (%3470, %3416), (%3471, %3417) and (%3449, %3395).
    // %3395 is defined before this excerpt.
    // NOTE(review): presumably the img value tensor — confirm upstream.
    %3472 = torch.prim.ListConstruct %3470, %3416 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_4317 = torch.constant.int 2
    %3473 = torch.aten.cat %3472, %int2_4317 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %3474 = torch.prim.ListConstruct %3471, %3417 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_4318 = torch.constant.int 2
    %3475 = torch.aten.cat %3474, %int2_4318 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %3476 = torch.prim.ListConstruct %3449, %3395 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_4319 = torch.constant.int 2
    %3477 = torch.aten.cat %3476, %int2_4319 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Combine the concatenated tensors %3473 and %3475 with the table %211
    // ([1,1,4608,64,2,2] f32, defined before this excerpt). For each input x:
    //   out = %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]
    // where x is viewed as [1,24,4608,64,1,2], i.e. the 128 channels as 64 pairs.
    // NOTE(review): this matches a rotary position embedding applied to q and k;
    // confirm against the definition of %211.
    //
    // Upcast %3473 to f32 and view it as pairs; the -1 entry resolves to 64.
    %int6_4320 = torch.constant.int 6
    %3478 = torch.prims.convert_element_type %3473, %int6_4320 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_4321 = torch.constant.int 1
    %int24_4322 = torch.constant.int 24
    %int4608_4323 = torch.constant.int 4608
    %int-1_4324 = torch.constant.int -1
    %int1_4325 = torch.constant.int 1
    %int2_4326 = torch.constant.int 2
    %3479 = torch.prim.ListConstruct %int1_4321, %int24_4322, %int4608_4323, %int-1_4324, %int1_4325, %int2_4326 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3480 = torch.aten.view %3478, %3479 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Same upcast and view for %3475.
    %int6_4327 = torch.constant.int 6
    %3481 = torch.prims.convert_element_type %3475, %int6_4327 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_4328 = torch.constant.int 1
    %int24_4329 = torch.constant.int 24
    %int4608_4330 = torch.constant.int 4608
    %int-1_4331 = torch.constant.int -1
    %int1_4332 = torch.constant.int 1
    %int2_4333 = torch.constant.int 2
    %3482 = torch.prim.ListConstruct %int1_4328, %int24_4329, %int4608_4330, %int-1_4331, %int1_4332, %int2_4333 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3483 = torch.aten.view %3481, %3482 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // First tensor (%3480): term for index 0 of the last dim. The [..,64,1]
    // slice broadcasts against the [..,64,2] table slice, and the size-1 head
    // dim of %211 broadcasts over the 24 heads.
    %int5_4334 = torch.constant.int 5
    %int0_4335 = torch.constant.int 0
    %3484 = torch.aten.select.int %211, %int5_4334, %int0_4335 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_4336 = torch.constant.int 5
    %int0_4337 = torch.constant.int 0
    %3485 = torch.aten.select.int %3480, %int5_4336, %int0_4337 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %3486 = torch.aten.mul.Tensor %3484, %3485 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Term for index 1 of the last dim, then sum the two terms.
    %int5_4338 = torch.constant.int 5
    %int1_4339 = torch.constant.int 1
    %3487 = torch.aten.select.int %211, %int5_4338, %int1_4339 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_4340 = torch.constant.int 5
    %int1_4341 = torch.constant.int 1
    %3488 = torch.aten.select.int %3480, %int5_4340, %int1_4341 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %3489 = torch.aten.mul.Tensor %3487, %3488 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_4342 = torch.constant.int 1
    %3490 = torch.aten.add.Tensor %3486, %3489, %int1_4342 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second tensor (%3483): identical two-term combination with the same table.
    %int5_4343 = torch.constant.int 5
    %int0_4344 = torch.constant.int 0
    %3491 = torch.aten.select.int %211, %int5_4343, %int0_4344 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_4345 = torch.constant.int 5
    %int0_4346 = torch.constant.int 0
    %3492 = torch.aten.select.int %3483, %int5_4345, %int0_4346 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %3493 = torch.aten.mul.Tensor %3491, %3492 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_4347 = torch.constant.int 5
    %int1_4348 = torch.constant.int 1
    %3494 = torch.aten.select.int %211, %int5_4347, %int1_4348 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_4349 = torch.constant.int 5
    %int1_4350 = torch.constant.int 1
    %3495 = torch.aten.select.int %3483, %int5_4349, %int1_4350 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %3496 = torch.aten.mul.Tensor %3494, %3495 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_4351 = torch.constant.int 1
    %3497 = torch.aten.add.Tensor %3493, %3496, %int1_4351 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Fold the (64,2) pairs back into 128 channels and cast to f16 (dtype code 5).
    %int1_4352 = torch.constant.int 1
    %int24_4353 = torch.constant.int 24
    %int4608_4354 = torch.constant.int 4608
    %int128_4355 = torch.constant.int 128
    %3498 = torch.prim.ListConstruct %int1_4352, %int24_4353, %int4608_4354, %int128_4355 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3499 = torch.aten.view %3490, %3498 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_4356 = torch.constant.int 5
    %3500 = torch.prims.convert_element_type %3499, %int5_4356 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_4357 = torch.constant.int 1
    %int24_4358 = torch.constant.int 24
    %int4608_4359 = torch.constant.int 4608
    %int128_4360 = torch.constant.int 128
    %3501 = torch.prim.ListConstruct %int1_4357, %int24_4358, %int4608_4359, %int128_4360 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3502 = torch.aten.view %3497, %3501 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_4361 = torch.constant.int 5
    %3503 = torch.prims.convert_element_type %3502, %int5_4361 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Joint attention over the 4608 concatenated positions with 24 heads of 128.
    // Tensor operands in order: %3500, %3503, %3477. The op is emitted as a
    // generic torch.operator.
    // NOTE(review): the trailing float 0.0 / bool false / none / none are presumably
    // dropout_p, is_causal, attn_mask and scale — confirm against the ATen schema.
    %float0.000000e00_4362 = torch.constant.float 0.000000e+00
    %false_4363 = torch.constant.bool false
    %none_4364 = torch.constant.none
    %none_4365 = torch.constant.none
    %3504:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%3500, %3503, %3477, %float0.000000e00_4362, %false_4363, %none_4364, %none_4365) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Only result #0 is consumed here. Swap dims 1 and 2 to get [1,4608,24,128],
    // then merge the heads into one 3072-wide feature dim (24 * 128 = 3072).
    %int0_4366 = torch.constant.int 0
    %int2_4367 = torch.constant.int 2
    %int1_4368 = torch.constant.int 1
    %int3_4369 = torch.constant.int 3
    %3505 = torch.prim.ListConstruct %int0_4366, %int2_4367, %int1_4368, %int3_4369 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3506 = torch.aten.permute %3504#0, %3505 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_4370 = torch.constant.int 1
    %int4608_4371 = torch.constant.int 4608
    %int3072_4372 = torch.constant.int 3072
    %3507 = torch.prim.ListConstruct %int1_4370, %int4608_4371, %int3072_4372 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3508 = torch.aten.view %3506, %3507 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Split the attention output back into its txt and img parts along dim 1.
    // Each dim-0 slice runs from 0 to INT64_MAX with step 1, so it keeps the
    // whole dim (the result shape equals the input shape).
    %int0_4373 = torch.constant.int 0
    %int0_4374 = torch.constant.int 0
    %int9223372036854775807_4375 = torch.constant.int 9223372036854775807
    %int1_4376 = torch.constant.int 1
    %3509 = torch.aten.slice.Tensor %3508, %int0_4373, %int0_4374, %int9223372036854775807_4375, %int1_4376 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Positions [0, 512) along dim 1: the txt part, matching the concat order above.
    %int1_4377 = torch.constant.int 1
    %int0_4378 = torch.constant.int 0
    %int512_4379 = torch.constant.int 512
    %int1_4380 = torch.constant.int 1
    %3510 = torch.aten.slice.Tensor %3509, %int1_4377, %int0_4378, %int512_4379, %int1_4380 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %int0_4381 = torch.constant.int 0
    %int0_4382 = torch.constant.int 0
    %int9223372036854775807_4383 = torch.constant.int 9223372036854775807
    %int1_4384 = torch.constant.int 1
    %3511 = torch.aten.slice.Tensor %3508, %int0_4381, %int0_4382, %int9223372036854775807_4383, %int1_4384 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Positions [512, end) along dim 1: the img part (4096 rows).
    %int1_4385 = torch.constant.int 1
    %int512_4386 = torch.constant.int 512
    %int9223372036854775807_4387 = torch.constant.int 9223372036854775807
    %int1_4388 = torch.constant.int 1
    %3512 = torch.aten.slice.Tensor %3511, %int1_4385, %int512_4386, %int9223372036854775807_4387, %int1_4388 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer img_attn.proj on the img slice of the attention output:
    // [4096,3072] x [3072,3072] + bias[3072].
    // Drop the leading unit dim so the input is a 2-D matrix for aten.mm.
    %int4096_4389 = torch.constant.int 4096
    %int3072_4390 = torch.constant.int 3072
    %3513 = torch.prim.ListConstruct %int4096_4389, %int3072_4390 : (!torch.int, !torch.int) -> !torch.list<int>
    %3514 = torch.aten.view %3512, %3513 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Load the weight and transpose dims 0 and 1.
    %__auto.sampler.double_blocks.10.img_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.10.img_attn.proj.weight : tensor<3072x3072xf16>
    %3515 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.img_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_4391 = torch.constant.int 0
    %int1_4392 = torch.constant.int 1
    %3516 = torch.aten.transpose.int %3515, %int0_4391, %int1_4392 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.10.img_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.10.img_attn.proj.bias : tensor<3072xf16>
    %3517 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.img_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, activations and weight to f32 (dtype code 6) before the matmul.
    %int6_4393 = torch.constant.int 6
    %3518 = torch.prims.convert_element_type %3517, %int6_4393 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_4394 = torch.constant.int 6
    %3519 = torch.prims.convert_element_type %3514, %int6_4394 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_4395 = torch.constant.int 6
    %3520 = torch.prims.convert_element_type %3516, %int6_4395 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %3521 = torch.aten.mm %3519, %3520 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Both operands are multiplied by the integer constant 1 before the bias add.
    // NOTE(review): looks like a decomposed addmm with alpha = beta = 1 — confirm.
    %int1_4396 = torch.constant.int 1
    %3522 = torch.aten.mul.Scalar %3521, %int1_4396 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_4397 = torch.constant.int 1
    %3523 = torch.aten.mul.Scalar %3518, %int1_4397 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_4398 = torch.constant.int 1
    %3524 = torch.aten.add.Tensor %3522, %3523, %int1_4398 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    // Back to f16 (dtype code 5) and restore the leading unit dim.
    %int5_4399 = torch.constant.int 5
    %3525 = torch.prims.convert_element_type %3524, %int5_4399 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_4400 = torch.constant.int 1
    %int4096_4401 = torch.constant.int 4096
    %int3072_4402 = torch.constant.int 3072
    %3526 = torch.prim.ListConstruct %int1_4400, %int4096_4401, %int3072_4402 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3527 = torch.aten.view %3525, %3526 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Residual update of the img stream: %3529 = %3261 + %3339 * proj_out.
    // %3339 ([1,1,3072]) broadcasts over the 4096 rows. %3261 and %3339 are
    // defined before this excerpt.
    // NOTE(review): presumably the block input and the attention gate — confirm.
    %3528 = torch.aten.mul.Tensor %3339, %3527 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_4403 = torch.constant.int 1
    %3529 = torch.aten.add.Tensor %3261, %3528, %int1_4403 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Normalize the updated img stream %3529 over dim 2, then apply
    // (%3341 + 1) * normalized + %3340. %3340/%3341 are [1,1,3072] tensors
    // defined before this excerpt.
    // NOTE(review): presumably the MLP modulation shift/scale — confirm.
    %int1_4404 = torch.constant.int 1
    %int1_4405 = torch.constant.int 1
    %3530 = torch.aten.add.Scalar %3341, %int1_4404, %int1_4405 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Statistics in f32 (dtype code 6): var_mean over dim 2 with correction 0,
    // keeping the reduced dim, so both results are [1,4096,1].
    %int6_4406 = torch.constant.int 6
    %3531 = torch.prims.convert_element_type %3529, %int6_4406 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %int2_4407 = torch.constant.int 2
    %3532 = torch.prim.ListConstruct %int2_4407 : (!torch.int) -> !torch.list<int>
    %int0_4408 = torch.constant.int 0
    %true_4409 = torch.constant.bool true
    %result0_4410, %result1_4411 = torch.aten.var_mean.correction %3531, %3532, %int0_4408, %true_4409 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(var + ~1e-6)
    %float9.999990e-07_4412 = torch.constant.float 9.9999999999999995E-7
    %int1_4413 = torch.constant.int 1
    %3533 = torch.aten.add.Scalar %result0_4410, %float9.999990e-07_4412, %int1_4413 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %3534 = torch.aten.rsqrt %3533 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // (x - mean) * rsqrt(...). The subtraction takes the f16 %3529 directly (not
    // the f32 copy %3531); the declared result type is f32.
    %int1_4414 = torch.constant.int 1
    %3535 = torch.aten.sub.Tensor %3529, %result1_4411, %int1_4414 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %3536 = torch.aten.mul.Tensor %3535, %3534 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    %int5_4415 = torch.constant.int 5
    %3537 = torch.prims.convert_element_type %3536, %int5_4415 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Scale by (%3341 + 1), then add %3340.
    %3538 = torch.aten.mul.Tensor %3530, %3537 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_4416 = torch.constant.int 1
    %3539 = torch.aten.add.Tensor %3538, %3340, %int1_4416 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer img_mlp.0, expanding 3072 -> 12288 features:
    // [4096,3072] x [3072,12288] + bias[12288].
    // Drop the leading unit dim so the input is a 2-D matrix for aten.mm.
    %int4096_4417 = torch.constant.int 4096
    %int3072_4418 = torch.constant.int 3072
    %3540 = torch.prim.ListConstruct %int4096_4417, %int3072_4418 : (!torch.int, !torch.int) -> !torch.list<int>
    %3541 = torch.aten.view %3539, %3540 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // The weight is stored as [12288,3072]; transpose dims 0 and 1 to get [3072,12288].
    %__auto.sampler.double_blocks.10.img_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.10.img_mlp.0.weight : tensor<12288x3072xf16>
    %3542 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.img_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_4419 = torch.constant.int 0
    %int1_4420 = torch.constant.int 1
    %3543 = torch.aten.transpose.int %3542, %int0_4419, %int1_4420 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.10.img_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.10.img_mlp.0.bias : tensor<12288xf16>
    %3544 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.img_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Upcast bias, activations and weight to f32 (dtype code 6) before the matmul.
    %int6_4421 = torch.constant.int 6
    %3545 = torch.prims.convert_element_type %3544, %int6_4421 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_4422 = torch.constant.int 6
    %3546 = torch.prims.convert_element_type %3541, %int6_4422 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_4423 = torch.constant.int 6
    %3547 = torch.prims.convert_element_type %3543, %int6_4423 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %3548 = torch.aten.mm %3546, %3547 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[4096,12288],f32>
    // Both operands are multiplied by the integer constant 1 before the bias add.
    // NOTE(review): looks like a decomposed addmm with alpha = beta = 1 — confirm.
    %int1_4424 = torch.constant.int 1
    %3549 = torch.aten.mul.Scalar %3548, %int1_4424 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int1_4425 = torch.constant.int 1
    %3550 = torch.aten.mul.Scalar %3545, %int1_4425 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_4426 = torch.constant.int 1
    %3551 = torch.aten.add.Tensor %3549, %3550, %int1_4426 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    // Back to f16 (dtype code 5) and restore the leading unit dim.
    %int5_4427 = torch.constant.int 5
    %3552 = torch.prims.convert_element_type %3551, %int5_4427 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f16>
    %int1_4428 = torch.constant.int 1
    %int4096_4429 = torch.constant.int 4096
    %int12288_4430 = torch.constant.int 12288
    %3553 = torch.prim.ListConstruct %int1_4428, %int4096_4429, %int12288_4430 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3554 = torch.aten.view %3552, %3553 : !torch.vtensor<[4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,12288],f16>
    // GELU activation on the f16 tensor, with the approximate mode string "tanh".
    %str_4431 = torch.constant.str "tanh"
    %3555 = torch.aten.gelu %3554, %str_4431 : !torch.vtensor<[1,4096,12288],f16>, !torch.str -> !torch.vtensor<[1,4096,12288],f16>
    // Linear layer img_mlp.2, projecting 12288 -> 3072 features:
    // [4096,12288] x [12288,3072] + bias[3072].
    // Drop the leading unit dim so the input is a 2-D matrix for aten.mm.
    %int4096_4432 = torch.constant.int 4096
    %int12288_4433 = torch.constant.int 12288
    %3556 = torch.prim.ListConstruct %int4096_4432, %int12288_4433 : (!torch.int, !torch.int) -> !torch.list<int>
    %3557 = torch.aten.view %3555, %3556 : !torch.vtensor<[1,4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[4096,12288],f16>
    // The weight is stored as [3072,12288]; transpose dims 0 and 1 to get [12288,3072].
    %__auto.sampler.double_blocks.10.img_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.10.img_mlp.2.weight : tensor<3072x12288xf16>
    %3558 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.img_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_4434 = torch.constant.int 0
    %int1_4435 = torch.constant.int 1
    %3559 = torch.aten.transpose.int %3558, %int0_4434, %int1_4435 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.10.img_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.10.img_mlp.2.bias : tensor<3072xf16>
    %3560 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.img_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, activations and weight to f32 (dtype code 6) before the matmul.
    %int6_4436 = torch.constant.int 6
    %3561 = torch.prims.convert_element_type %3560, %int6_4436 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_4437 = torch.constant.int 6
    %3562 = torch.prims.convert_element_type %3557, %int6_4437 : !torch.vtensor<[4096,12288],f16>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int6_4438 = torch.constant.int 6
    %3563 = torch.prims.convert_element_type %3559, %int6_4438 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %3564 = torch.aten.mm %3562, %3563 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Both operands are multiplied by the integer constant 1 before the bias add.
    // NOTE(review): looks like a decomposed addmm with alpha = beta = 1 — confirm.
    %int1_4439 = torch.constant.int 1
    %3565 = torch.aten.mul.Scalar %3564, %int1_4439 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_4440 = torch.constant.int 1
    %3566 = torch.aten.mul.Scalar %3561, %int1_4440 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_4441 = torch.constant.int 1
    %3567 = torch.aten.add.Tensor %3565, %3566, %int1_4441 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    // Back to f16 (dtype code 5) and restore the leading unit dim.
    %int5_4442 = torch.constant.int 5
    %3568 = torch.prims.convert_element_type %3567, %int5_4442 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_4443 = torch.constant.int 1
    %int4096_4444 = torch.constant.int 4096
    %int3072_4445 = torch.constant.int 3072
    %3569 = torch.prim.ListConstruct %int1_4443, %int4096_4444, %int3072_4445 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3570 = torch.aten.view %3568, %3569 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    %3571 = torch.aten.mul.Tensor %3342, %3570 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_4446 = torch.constant.int 1
    %3572 = torch.aten.add.Tensor %3529, %3571, %int1_4446 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // double_blocks.10 txt_attn.proj linear layer applied to %3510
    // ([1,512,3072], defined before this chunk), followed by a scaled
    // residual add that produces %3589.
    // Flatten to 2-D for the matmul: [1,512,3072] -> [512,3072].
    %int512_4447 = torch.constant.int 512
    %int3072_4448 = torch.constant.int 3072
    %3573 = torch.prim.ListConstruct %int512_4447, %int3072_4448 : (!torch.int, !torch.int) -> !torch.list<int>
    %3574 = torch.aten.view %3510, %3573 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.10.txt_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.10.txt_attn.proj.weight : tensor<3072x3072xf16>
    %3575 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.txt_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    // Transpose the weight (dims 0 and 1) so it can be the right-hand matmul operand.
    %int0_4449 = torch.constant.int 0
    %int1_4450 = torch.constant.int 1
    %3576 = torch.aten.transpose.int %3575, %int0_4449, %int1_4450 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.10.txt_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.10.txt_attn.proj.bias : tensor<3072xf16>
    %3577 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.txt_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Widen bias, input and weight to f32 (dtype code 6) for the matmul and bias add.
    %int6_4451 = torch.constant.int 6
    %3578 = torch.prims.convert_element_type %3577, %int6_4451 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_4452 = torch.constant.int 6
    %3579 = torch.prims.convert_element_type %3574, %int6_4452 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_4453 = torch.constant.int 6
    %3580 = torch.prims.convert_element_type %3576, %int6_4453 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %3581 = torch.aten.mm %3579, %3580 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Multiplications by the constant 1 leave the values unchanged.
    %int1_4454 = torch.constant.int 1
    %3582 = torch.aten.mul.Scalar %3581, %int1_4454 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_4455 = torch.constant.int 1
    %3583 = torch.aten.mul.Scalar %3578, %int1_4455 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_4456 = torch.constant.int 1
    %3584 = torch.aten.add.Tensor %3582, %3583, %int1_4456 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    // Narrow the result back to f16 (dtype code 5).
    %int5_4457 = torch.constant.int 5
    %3585 = torch.prims.convert_element_type %3584, %int5_4457 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    // Restore the leading size-1 dimension: [512,3072] -> [1,512,3072].
    %int1_4458 = torch.constant.int 1
    %int512_4459 = torch.constant.int 512
    %int3072_4460 = torch.constant.int 3072
    %3586 = torch.prim.ListConstruct %int1_4458, %int512_4459, %int3072_4460 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3587 = torch.aten.view %3585, %3586 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Scale by %3360 ([1,1,3072], broadcast over the 512 rows) and add to
    // %3321. Both come from before this chunk; presumably the modulation
    // gate and the residual stream -- verify upstream.
    %3588 = torch.aten.mul.Tensor %3360, %3587 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_4461 = torch.constant.int 1
    %3589 = torch.aten.add.Tensor %3321, %3588, %int1_4461 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Normalize %3589 over its last dimension without learned parameters,
    // then apply an elementwise scale and shift:
    //   %3599 = (1 + %3362) * normalize(%3589) + %3361
    // %3362 and %3361 ([1,1,3072]) are defined before this chunk; presumably
    // the modulation scale and shift -- verify upstream.
    %int1_4462 = torch.constant.int 1
    %int1_4463 = torch.constant.int 1
    %3590 = torch.aten.add.Scalar %3362, %int1_4462, %int1_4463 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Statistics are computed on an f32 copy of the input.
    %int6_4464 = torch.constant.int 6
    %3591 = torch.prims.convert_element_type %3589, %int6_4464 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // var_mean over dim 2 with correction 0 and keepdim = true;
    // result0 is used as the variance and result1 as the mean below.
    %int2_4465 = torch.constant.int 2
    %3592 = torch.prim.ListConstruct %int2_4465 : (!torch.int) -> !torch.list<int>
    %int0_4466 = torch.constant.int 0
    %true_4467 = torch.constant.bool true
    %result0_4468, %result1_4469 = torch.aten.var_mean.correction %3591, %3592, %int0_4466, %true_4467 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(variance + epsilon), epsilon = 9.9999999999999995E-7.
    %float9.999990e-07_4470 = torch.constant.float 9.9999999999999995E-7
    %int1_4471 = torch.constant.int 1
    %3593 = torch.aten.add.Scalar %result0_4468, %float9.999990e-07_4470, %int1_4471 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %3594 = torch.aten.rsqrt %3593 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // The subtraction takes the original f16 tensor %3589 (not the f32 copy
    // %3591) together with the f32 mean; the result type is f32.
    %int1_4472 = torch.constant.int 1
    %3595 = torch.aten.sub.Tensor %3589, %result1_4469, %int1_4472 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %3596 = torch.aten.mul.Tensor %3595, %3594 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    // Narrow the normalized tensor back to f16 before the scale and shift.
    %int5_4473 = torch.constant.int 5
    %3597 = torch.prims.convert_element_type %3596, %int5_4473 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %3598 = torch.aten.mul.Tensor %3590, %3597 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_4474 = torch.constant.int 1
    %3599 = torch.aten.add.Tensor %3598, %3361, %int1_4474 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // double_blocks.10 txt_mlp.0 linear layer (3072 -> 12288 features) applied
    // to %3599, followed by GELU with the "tanh" approximation.
    // Flatten to 2-D for the matmul: [1,512,3072] -> [512,3072].
    %int512_4475 = torch.constant.int 512
    %int3072_4476 = torch.constant.int 3072
    %3600 = torch.prim.ListConstruct %int512_4475, %int3072_4476 : (!torch.int, !torch.int) -> !torch.list<int>
    %3601 = torch.aten.view %3599, %3600 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.10.txt_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.10.txt_mlp.0.weight : tensor<12288x3072xf16>
    %3602 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.txt_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    // Transpose the weight: [12288,3072] -> [3072,12288].
    %int0_4477 = torch.constant.int 0
    %int1_4478 = torch.constant.int 1
    %3603 = torch.aten.transpose.int %3602, %int0_4477, %int1_4478 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.10.txt_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.10.txt_mlp.0.bias : tensor<12288xf16>
    %3604 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.txt_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Widen bias, input and weight to f32 (dtype code 6) for the matmul and bias add.
    %int6_4479 = torch.constant.int 6
    %3605 = torch.prims.convert_element_type %3604, %int6_4479 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_4480 = torch.constant.int 6
    %3606 = torch.prims.convert_element_type %3601, %int6_4480 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_4481 = torch.constant.int 6
    %3607 = torch.prims.convert_element_type %3603, %int6_4481 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %3608 = torch.aten.mm %3606, %3607 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[512,12288],f32>
    // Multiplications by the constant 1 leave the values unchanged.
    %int1_4482 = torch.constant.int 1
    %3609 = torch.aten.mul.Scalar %3608, %int1_4482 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int1_4483 = torch.constant.int 1
    %3610 = torch.aten.mul.Scalar %3605, %int1_4483 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_4484 = torch.constant.int 1
    %3611 = torch.aten.add.Tensor %3609, %3610, %int1_4484 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    // Narrow the result back to f16 (dtype code 5).
    %int5_4485 = torch.constant.int 5
    %3612 = torch.prims.convert_element_type %3611, %int5_4485 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f16>
    // Restore the leading size-1 dimension: [512,12288] -> [1,512,12288].
    %int1_4486 = torch.constant.int 1
    %int512_4487 = torch.constant.int 512
    %int12288_4488 = torch.constant.int 12288
    %3613 = torch.prim.ListConstruct %int1_4486, %int512_4487, %int12288_4488 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3614 = torch.aten.view %3612, %3613 : !torch.vtensor<[512,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,512,12288],f16>
    // Activation: GELU with approximate = "tanh", evaluated on the f16 tensor.
    %str_4489 = torch.constant.str "tanh"
    %3615 = torch.aten.gelu %3614, %str_4489 : !torch.vtensor<[1,512,12288],f16>, !torch.str -> !torch.vtensor<[1,512,12288],f16>
    // double_blocks.10 txt_mlp.2 linear layer (12288 -> 3072 features) applied
    // to the GELU output %3615, followed by a scaled residual add onto %3589
    // that produces %3632.
    // Flatten to 2-D for the matmul: [1,512,12288] -> [512,12288].
    %int512_4490 = torch.constant.int 512
    %int12288_4491 = torch.constant.int 12288
    %3616 = torch.prim.ListConstruct %int512_4490, %int12288_4491 : (!torch.int, !torch.int) -> !torch.list<int>
    %3617 = torch.aten.view %3615, %3616 : !torch.vtensor<[1,512,12288],f16>, !torch.list<int> -> !torch.vtensor<[512,12288],f16>
    %__auto.sampler.double_blocks.10.txt_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.10.txt_mlp.2.weight : tensor<3072x12288xf16>
    %3618 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.txt_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    // Transpose the weight: [3072,12288] -> [12288,3072].
    %int0_4492 = torch.constant.int 0
    %int1_4493 = torch.constant.int 1
    %3619 = torch.aten.transpose.int %3618, %int0_4492, %int1_4493 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.10.txt_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.10.txt_mlp.2.bias : tensor<3072xf16>
    %3620 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.10.txt_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Widen bias, input and weight to f32 (dtype code 6) for the matmul and bias add.
    %int6_4494 = torch.constant.int 6
    %3621 = torch.prims.convert_element_type %3620, %int6_4494 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_4495 = torch.constant.int 6
    %3622 = torch.prims.convert_element_type %3617, %int6_4495 : !torch.vtensor<[512,12288],f16>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int6_4496 = torch.constant.int 6
    %3623 = torch.prims.convert_element_type %3619, %int6_4496 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %3624 = torch.aten.mm %3622, %3623 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Multiplications by the constant 1 leave the values unchanged.
    %int1_4497 = torch.constant.int 1
    %3625 = torch.aten.mul.Scalar %3624, %int1_4497 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_4498 = torch.constant.int 1
    %3626 = torch.aten.mul.Scalar %3621, %int1_4498 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_4499 = torch.constant.int 1
    %3627 = torch.aten.add.Tensor %3625, %3626, %int1_4499 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    // Narrow the result back to f16 (dtype code 5).
    %int5_4500 = torch.constant.int 5
    %3628 = torch.prims.convert_element_type %3627, %int5_4500 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    // Restore the leading size-1 dimension: [512,3072] -> [1,512,3072].
    %int1_4501 = torch.constant.int 1
    %int512_4502 = torch.constant.int 512
    %int3072_4503 = torch.constant.int 3072
    %3629 = torch.prim.ListConstruct %int1_4501, %int512_4502, %int3072_4503 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3630 = torch.aten.view %3628, %3629 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Scale by %3363 ([1,1,3072], defined before this chunk; presumably the
    // MLP modulation gate -- verify upstream) and add onto %3589.
    %3631 = torch.aten.mul.Tensor %3363, %3630 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_4504 = torch.constant.int 1
    %3632 = torch.aten.add.Tensor %3589, %3631, %int1_4504 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // double_blocks.11 img_mod: SiLU of %119 ([1,3072], defined before this
    // chunk; presumably the conditioning vector -- verify upstream), a linear
    // layer to 18432 features, then six consecutive 3072-wide slices of the
    // last dimension (%3648 .. %3653).
    %3633 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.double_blocks.11.img_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.11.img_mod.lin.weight : tensor<18432x3072xf16>
    %3634 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.img_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    // Transpose the weight: [18432,3072] -> [3072,18432].
    %int0_4505 = torch.constant.int 0
    %int1_4506 = torch.constant.int 1
    %3635 = torch.aten.transpose.int %3634, %int0_4505, %int1_4506 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.11.img_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.11.img_mod.lin.bias : tensor<18432xf16>
    %3636 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.img_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Widen bias, input and weight to f32 (dtype code 6) for the matmul and bias add.
    %int6_4507 = torch.constant.int 6
    %3637 = torch.prims.convert_element_type %3636, %int6_4507 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_4508 = torch.constant.int 6
    %3638 = torch.prims.convert_element_type %3633, %int6_4508 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_4509 = torch.constant.int 6
    %3639 = torch.prims.convert_element_type %3635, %int6_4509 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %3640 = torch.aten.mm %3638, %3639 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Multiplications by the constant 1 leave the values unchanged.
    %int1_4510 = torch.constant.int 1
    %3641 = torch.aten.mul.Scalar %3640, %int1_4510 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_4511 = torch.constant.int 1
    %3642 = torch.aten.mul.Scalar %3637, %int1_4511 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_4512 = torch.constant.int 1
    %3643 = torch.aten.add.Tensor %3641, %3642, %int1_4512 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    // Narrow the result back to f16 (dtype code 5).
    %int5_4513 = torch.constant.int 5
    %3644 = torch.prims.convert_element_type %3643, %int5_4513 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice of dim 0 (start 0, end INT64_MAX, step 1); the result
    // type is unchanged. Then insert a size-1 dimension at position 1.
    %int0_4514 = torch.constant.int 0
    %int0_4515 = torch.constant.int 0
    %int9223372036854775807_4516 = torch.constant.int 9223372036854775807
    %int1_4517 = torch.constant.int 1
    %3645 = torch.aten.slice.Tensor %3644, %int0_4514, %int0_4515, %int9223372036854775807_4516, %int1_4517 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    %int1_4518 = torch.constant.int 1
    %3646 = torch.aten.unsqueeze %3645, %int1_4518 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Full-range slice of dim 2; the result type is unchanged.
    %int2_4519 = torch.constant.int 2
    %int0_4520 = torch.constant.int 0
    %int9223372036854775807_4521 = torch.constant.int 9223372036854775807
    %int1_4522 = torch.constant.int 1
    %3647 = torch.aten.slice.Tensor %3646, %int2_4519, %int0_4520, %int9223372036854775807_4521, %int1_4522 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Slice [0, 3072) of the last dim -> %3648 (added as a shift at L9348).
    %int-1_4523 = torch.constant.int -1
    %int0_4524 = torch.constant.int 0
    %int3072_4525 = torch.constant.int 3072
    %int1_4526 = torch.constant.int 1
    %3648 = torch.aten.slice.Tensor %3647, %int-1_4523, %int0_4524, %int3072_4525, %int1_4526 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [3072, 6144) -> %3649 (used as 1 + scale at L9345).
    %int-1_4527 = torch.constant.int -1
    %int3072_4528 = torch.constant.int 3072
    %int6144_4529 = torch.constant.int 6144
    %int1_4530 = torch.constant.int 1
    %3649 = torch.aten.slice.Tensor %3647, %int-1_4527, %int3072_4528, %int6144_4529, %int1_4530 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [6144, 9216) -> %3650 (not used within this chunk).
    %int-1_4531 = torch.constant.int -1
    %int6144_4532 = torch.constant.int 6144
    %int9216_4533 = torch.constant.int 9216
    %int1_4534 = torch.constant.int 1
    %3650 = torch.aten.slice.Tensor %3647, %int-1_4531, %int6144_4532, %int9216_4533, %int1_4534 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [9216, 12288) -> %3651 (not used within this chunk).
    %int-1_4535 = torch.constant.int -1
    %int9216_4536 = torch.constant.int 9216
    %int12288_4537 = torch.constant.int 12288
    %int1_4538 = torch.constant.int 1
    %3651 = torch.aten.slice.Tensor %3647, %int-1_4535, %int9216_4536, %int12288_4537, %int1_4538 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [12288, 15360) -> %3652 (not used within this chunk).
    %int-1_4539 = torch.constant.int -1
    %int12288_4540 = torch.constant.int 12288
    %int15360_4541 = torch.constant.int 15360
    %int1_4542 = torch.constant.int 1
    %3652 = torch.aten.slice.Tensor %3647, %int-1_4539, %int12288_4540, %int15360_4541, %int1_4542 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [15360, 18432) -> %3653 (not used within this chunk).
    %int-1_4543 = torch.constant.int -1
    %int15360_4544 = torch.constant.int 15360
    %int18432_4545 = torch.constant.int 18432
    %int1_4546 = torch.constant.int 1
    %3653 = torch.aten.slice.Tensor %3647, %int-1_4543, %int15360_4544, %int18432_4545, %int1_4546 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // double_blocks.11 txt_mod: same structure as the img_mod sequence at
    // L9197-9261 but with the txt_mod.lin parameters. SiLU of %119 is
    // recomputed here (same operand as %3633), followed by a linear layer to
    // 18432 features and six consecutive 3072-wide slices (%3669 .. %3674).
    // None of the six slices is consumed within this chunk.
    %3654 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.double_blocks.11.txt_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.11.txt_mod.lin.weight : tensor<18432x3072xf16>
    %3655 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.txt_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    // Transpose the weight: [18432,3072] -> [3072,18432].
    %int0_4547 = torch.constant.int 0
    %int1_4548 = torch.constant.int 1
    %3656 = torch.aten.transpose.int %3655, %int0_4547, %int1_4548 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.11.txt_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.11.txt_mod.lin.bias : tensor<18432xf16>
    %3657 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.txt_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Widen bias, input and weight to f32 (dtype code 6) for the matmul and bias add.
    %int6_4549 = torch.constant.int 6
    %3658 = torch.prims.convert_element_type %3657, %int6_4549 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_4550 = torch.constant.int 6
    %3659 = torch.prims.convert_element_type %3654, %int6_4550 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_4551 = torch.constant.int 6
    %3660 = torch.prims.convert_element_type %3656, %int6_4551 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %3661 = torch.aten.mm %3659, %3660 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Multiplications by the constant 1 leave the values unchanged.
    %int1_4552 = torch.constant.int 1
    %3662 = torch.aten.mul.Scalar %3661, %int1_4552 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_4553 = torch.constant.int 1
    %3663 = torch.aten.mul.Scalar %3658, %int1_4553 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_4554 = torch.constant.int 1
    %3664 = torch.aten.add.Tensor %3662, %3663, %int1_4554 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    // Narrow the result back to f16 (dtype code 5).
    %int5_4555 = torch.constant.int 5
    %3665 = torch.prims.convert_element_type %3664, %int5_4555 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice of dim 0 (result type unchanged), then insert a
    // size-1 dimension at position 1.
    %int0_4556 = torch.constant.int 0
    %int0_4557 = torch.constant.int 0
    %int9223372036854775807_4558 = torch.constant.int 9223372036854775807
    %int1_4559 = torch.constant.int 1
    %3666 = torch.aten.slice.Tensor %3665, %int0_4556, %int0_4557, %int9223372036854775807_4558, %int1_4559 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    %int1_4560 = torch.constant.int 1
    %3667 = torch.aten.unsqueeze %3666, %int1_4560 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Full-range slice of dim 2; the result type is unchanged.
    %int2_4561 = torch.constant.int 2
    %int0_4562 = torch.constant.int 0
    %int9223372036854775807_4563 = torch.constant.int 9223372036854775807
    %int1_4564 = torch.constant.int 1
    %3668 = torch.aten.slice.Tensor %3667, %int2_4561, %int0_4562, %int9223372036854775807_4563, %int1_4564 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Slice [0, 3072) of the last dim -> %3669.
    %int-1_4565 = torch.constant.int -1
    %int0_4566 = torch.constant.int 0
    %int3072_4567 = torch.constant.int 3072
    %int1_4568 = torch.constant.int 1
    %3669 = torch.aten.slice.Tensor %3668, %int-1_4565, %int0_4566, %int3072_4567, %int1_4568 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [3072, 6144) -> %3670.
    %int-1_4569 = torch.constant.int -1
    %int3072_4570 = torch.constant.int 3072
    %int6144_4571 = torch.constant.int 6144
    %int1_4572 = torch.constant.int 1
    %3670 = torch.aten.slice.Tensor %3668, %int-1_4569, %int3072_4570, %int6144_4571, %int1_4572 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [6144, 9216) -> %3671.
    %int-1_4573 = torch.constant.int -1
    %int6144_4574 = torch.constant.int 6144
    %int9216_4575 = torch.constant.int 9216
    %int1_4576 = torch.constant.int 1
    %3671 = torch.aten.slice.Tensor %3668, %int-1_4573, %int6144_4574, %int9216_4575, %int1_4576 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [9216, 12288) -> %3672.
    %int-1_4577 = torch.constant.int -1
    %int9216_4578 = torch.constant.int 9216
    %int12288_4579 = torch.constant.int 12288
    %int1_4580 = torch.constant.int 1
    %3672 = torch.aten.slice.Tensor %3668, %int-1_4577, %int9216_4578, %int12288_4579, %int1_4580 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [12288, 15360) -> %3673.
    %int-1_4581 = torch.constant.int -1
    %int12288_4582 = torch.constant.int 12288
    %int15360_4583 = torch.constant.int 15360
    %int1_4584 = torch.constant.int 1
    %3673 = torch.aten.slice.Tensor %3668, %int-1_4581, %int12288_4582, %int15360_4583, %int1_4584 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice [15360, 18432) -> %3674.
    %int-1_4585 = torch.constant.int -1
    %int15360_4586 = torch.constant.int 15360
    %int18432_4587 = torch.constant.int 18432
    %int1_4588 = torch.constant.int 1
    %3674 = torch.aten.slice.Tensor %3668, %int-1_4585, %int15360_4586, %int18432_4587, %int1_4588 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %3572 ([1,4096,3072]) over its last dimension without learned
    // parameters, then apply the img_mod slices as scale and shift:
    //   %3684 = (1 + %3649) * normalize(%3572) + %3648
    // Statistics are computed on an f32 copy of the input.
    %int6_4589 = torch.constant.int 6
    %3675 = torch.prims.convert_element_type %3572, %int6_4589 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // var_mean over dim 2 with correction 0 and keepdim = true;
    // result0 is used as the variance and result1 as the mean below.
    %int2_4590 = torch.constant.int 2
    %3676 = torch.prim.ListConstruct %int2_4590 : (!torch.int) -> !torch.list<int>
    %int0_4591 = torch.constant.int 0
    %true_4592 = torch.constant.bool true
    %result0_4593, %result1_4594 = torch.aten.var_mean.correction %3675, %3676, %int0_4591, %true_4592 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(variance + epsilon), epsilon = 9.9999999999999995E-7.
    %float9.999990e-07_4595 = torch.constant.float 9.9999999999999995E-7
    %int1_4596 = torch.constant.int 1
    %3677 = torch.aten.add.Scalar %result0_4593, %float9.999990e-07_4595, %int1_4596 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %3678 = torch.aten.rsqrt %3677 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // The subtraction takes the original f16 tensor %3572 (not the f32 copy
    // %3675) together with the f32 mean; the result type is f32.
    %int1_4597 = torch.constant.int 1
    %3679 = torch.aten.sub.Tensor %3572, %result1_4594, %int1_4597 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %3680 = torch.aten.mul.Tensor %3679, %3678 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Narrow the normalized tensor back to f16 before the scale and shift.
    %int5_4598 = torch.constant.int 5
    %3681 = torch.prims.convert_element_type %3680, %int5_4598 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Scale factor is 1 + %3649, broadcast over the 4096 rows.
    %int1_4599 = torch.constant.int 1
    %int1_4600 = torch.constant.int 1
    %3682 = torch.aten.add.Scalar %3649, %int1_4599, %int1_4600 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %3683 = torch.aten.mul.Tensor %3682, %3681 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    // Shift by %3648.
    %int1_4601 = torch.constant.int 1
    %3684 = torch.aten.add.Tensor %3683, %3648, %int1_4601 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // double_blocks.11 img_attn.qkv linear layer (3072 -> 9216 features)
    // applied to %3684, then reshaped to [1,4096,3,24,128], permuted to
    // [3,1,24,4096,128] and split along dim 0 into %3704, %3705, %3706.
    // Flatten to 2-D for the matmul: [1,4096,3072] -> [4096,3072].
    %int4096_4602 = torch.constant.int 4096
    %int3072_4603 = torch.constant.int 3072
    %3685 = torch.prim.ListConstruct %int4096_4602, %int3072_4603 : (!torch.int, !torch.int) -> !torch.list<int>
    %3686 = torch.aten.view %3684, %3685 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.11.img_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.11.img_attn.qkv.weight : tensor<9216x3072xf16>
    %3687 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.img_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    // Transpose the weight: [9216,3072] -> [3072,9216].
    %int0_4604 = torch.constant.int 0
    %int1_4605 = torch.constant.int 1
    %3688 = torch.aten.transpose.int %3687, %int0_4604, %int1_4605 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.11.img_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.11.img_attn.qkv.bias : tensor<9216xf16>
    %3689 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.img_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Widen bias, input and weight to f32 (dtype code 6) for the matmul and bias add.
    %int6_4606 = torch.constant.int 6
    %3690 = torch.prims.convert_element_type %3689, %int6_4606 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_4607 = torch.constant.int 6
    %3691 = torch.prims.convert_element_type %3686, %int6_4607 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_4608 = torch.constant.int 6
    %3692 = torch.prims.convert_element_type %3688, %int6_4608 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %3693 = torch.aten.mm %3691, %3692 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[4096,9216],f32>
    // Multiplications by the constant 1 leave the values unchanged.
    %int1_4609 = torch.constant.int 1
    %3694 = torch.aten.mul.Scalar %3693, %int1_4609 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int1_4610 = torch.constant.int 1
    %3695 = torch.aten.mul.Scalar %3690, %int1_4610 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_4611 = torch.constant.int 1
    %3696 = torch.aten.add.Tensor %3694, %3695, %int1_4611 : !torch.vtensor<[4096,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    // Narrow the result back to f16 (dtype code 5).
    %int5_4612 = torch.constant.int 5
    %3697 = torch.prims.convert_element_type %3696, %int5_4612 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f16>
    // Restore the leading size-1 dimension: [4096,9216] -> [1,4096,9216].
    %int1_4613 = torch.constant.int 1
    %int4096_4614 = torch.constant.int 4096
    %int9216_4615 = torch.constant.int 9216
    %3698 = torch.prim.ListConstruct %int1_4613, %int4096_4614, %int9216_4615 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3699 = torch.aten.view %3697, %3698 : !torch.vtensor<[4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,9216],f16>
    // Split the 9216 features into 3 x 24 x 128 (presumably
    // {q,k,v} x heads x head_dim -- confirm against the model definition).
    %int1_4616 = torch.constant.int 1
    %int4096_4617 = torch.constant.int 4096
    %int3_4618 = torch.constant.int 3
    %int24_4619 = torch.constant.int 24
    %int128_4620 = torch.constant.int 128
    %3700 = torch.prim.ListConstruct %int1_4616, %int4096_4617, %int3_4618, %int24_4619, %int128_4620 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3701 = torch.aten.view %3699, %3700 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    // Permute with order (2, 0, 3, 1, 4): [1,4096,3,24,128] -> [3,1,24,4096,128].
    %int2_4621 = torch.constant.int 2
    %int0_4622 = torch.constant.int 0
    %int3_4623 = torch.constant.int 3
    %int1_4624 = torch.constant.int 1
    %int4_4625 = torch.constant.int 4
    %3702 = torch.prim.ListConstruct %int2_4621, %int0_4622, %int3_4623, %int1_4624, %int4_4625 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3703 = torch.aten.permute %3701, %3702 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // Index 0 along dim 0 -> %3704 (scaled by query_norm.scale at L9421).
    %int0_4626 = torch.constant.int 0
    %int0_4627 = torch.constant.int 0
    %3704 = torch.aten.select.int %3703, %int0_4626, %int0_4627 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Index 1 along dim 0 -> %3705 (normalized starting at L9423).
    %int0_4628 = torch.constant.int 0
    %int1_4629 = torch.constant.int 1
    %3705 = torch.aten.select.int %3703, %int0_4628, %int1_4629 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Index 2 along dim 0 -> %3706 (not used within this chunk; presumably
    // the attention values).
    %int0_4630 = torch.constant.int 0
    %int2_4631 = torch.constant.int 2
    %3706 = torch.aten.select.int %3703, %int0_4630, %int2_4631 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // RMS-style normalization of %3704 over its last dimension (size 128),
    // computed in f32, then multiplied by the query_norm.scale parameter:
    //   %3716 = f16(x * rsqrt(mean(x^2, dim=-1) + eps)) * scale
    %int6_4632 = torch.constant.int 6
    %3707 = torch.prims.convert_element_type %3704, %int6_4632 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    // Elementwise square.
    %int2_4633 = torch.constant.int 2
    %3708 = torch.aten.pow.Tensor_Scalar %3707, %int2_4633 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    // Mean over dim -1 with keepdim = true and no dtype override (none).
    %int-1_4634 = torch.constant.int -1
    %3709 = torch.prim.ListConstruct %int-1_4634 : (!torch.int) -> !torch.list<int>
    %true_4635 = torch.constant.bool true
    %none_4636 = torch.constant.none
    %3710 = torch.aten.mean.dim %3708, %3709, %true_4635, %none_4636 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    // rsqrt(mean + epsilon), epsilon = 9.9999999999999995E-7.
    %float9.999990e-07_4637 = torch.constant.float 9.9999999999999995E-7
    %int1_4638 = torch.constant.int 1
    %3711 = torch.aten.add.Scalar %3710, %float9.999990e-07_4637, %int1_4638 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %3712 = torch.aten.rsqrt %3711 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %3713 = torch.aten.mul.Tensor %3707, %3712 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Narrow to f16 before applying the f16 scale parameter.
    %int5_4639 = torch.constant.int 5
    %3714 = torch.prims.convert_element_type %3713, %int5_4639 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.11.img_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.11.img_attn.norm.query_norm.scale : tensor<128xf16>
    %3715 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.img_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    // The [128] scale broadcasts over the last dimension.
    %3716 = torch.aten.mul.Tensor %3714, %3715 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // Same normalisation recipe applied to %3705:
    // x * rsqrt(mean(x^2, dim=-1, keepdim=true) + ~1e-6) in f32, cast to f16,
    // then scaled by double_blocks.11.img_attn.norm.key_norm.scale.
    %int6_4640 = torch.constant.int 6
    %3717 = torch.prims.convert_element_type %3705, %int6_4640 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_4641 = torch.constant.int 2
    %3718 = torch.aten.pow.Tensor_Scalar %3717, %int2_4641 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_4642 = torch.constant.int -1
    %3719 = torch.prim.ListConstruct %int-1_4642 : (!torch.int) -> !torch.list<int>
    %true_4643 = torch.constant.bool true
    %none_4644 = torch.constant.none
    %3720 = torch.aten.mean.dim %3718, %3719, %true_4643, %none_4644 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_4645 = torch.constant.float 9.9999999999999995E-7
    %int1_4646 = torch.constant.int 1
    %3721 = torch.aten.add.Scalar %3720, %float9.999990e-07_4645, %int1_4646 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %3722 = torch.aten.rsqrt %3721 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %3723 = torch.aten.mul.Tensor %3717, %3722 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    %int5_4647 = torch.constant.int 5
    %3724 = torch.prims.convert_element_type %3723, %int5_4647 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.11.img_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.11.img_attn.norm.key_norm.scale : tensor<128xf16>
    %3725 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.img_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %3726 = torch.aten.mul.Tensor %3724, %3725 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // The two casts below go from f16 to f16 (operand and result types match),
    // so they do not change the element type of the normalised tensors %3716 / %3726.
    %int5_4648 = torch.constant.int 5
    %3727 = torch.prims.convert_element_type %3716, %int5_4648 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_4649 = torch.constant.int 5
    %3728 = torch.prims.convert_element_type %3726, %int5_4649 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Affine-free layer normalisation of %3632 ([1,512,3072]) over dim 2,
    // followed by a scale-and-shift:
    //   normed = (x - mean) * rsqrt(var + ~1e-6)      (f32, then cast to f16)
    //   %3738  = (%3670 + 1) * normed + %3669
    // %3632, %3669 and %3670 are defined before the visible lines; %3670 / %3669
    // are presumably the text-stream modulation scale / shift -- confirm upstream.
    %int6_4650 = torch.constant.int 6
    %3729 = torch.prims.convert_element_type %3632, %int6_4650 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %int2_4651 = torch.constant.int 2
    %3730 = torch.prim.ListConstruct %int2_4651 : (!torch.int) -> !torch.list<int>
    %int0_4652 = torch.constant.int 0
    %true_4653 = torch.constant.bool true
    // Variance (correction = 0) and mean over dim 2 with the reduced dim kept.
    // result0 feeds the rsqrt (variance); result1 is subtracted from the input (mean).
    %result0_4654, %result1_4655 = torch.aten.var_mean.correction %3729, %3730, %int0_4652, %true_4653 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    %float9.999990e-07_4656 = torch.constant.float 9.9999999999999995E-7
    %int1_4657 = torch.constant.int 1
    %3731 = torch.aten.add.Scalar %result0_4654, %float9.999990e-07_4656, %int1_4657 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %3732 = torch.aten.rsqrt %3731 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    %int1_4658 = torch.constant.int 1
    // Note: the subtraction takes the original f16 tensor %3632 (not the f32 copy
    // %3729) together with the f32 mean and produces an f32 result.
    %3733 = torch.aten.sub.Tensor %3632, %result1_4655, %int1_4658 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %3734 = torch.aten.mul.Tensor %3733, %3732 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_4659 = torch.constant.int 5
    %3735 = torch.prims.convert_element_type %3734, %int5_4659 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %int1_4660 = torch.constant.int 1
    %int1_4661 = torch.constant.int 1
    // (%3670 + 1) broadcast over the 512 rows, then the additive term %3669.
    %3736 = torch.aten.add.Scalar %3670, %int1_4660, %int1_4661 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %3737 = torch.aten.mul.Tensor %3736, %3735 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_4662 = torch.constant.int 1
    %3738 = torch.aten.add.Tensor %3737, %3669, %int1_4662 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Linear layer double_blocks.11.txt_attn.qkv applied to %3738:
    // flatten to [512,3072], multiply by the transposed [9216,3072] weight in f32,
    // add the bias in f32, cast the result to f16 and restore the leading batch
    // dim -> [1,512,9216].
    %int512_4663 = torch.constant.int 512
    %int3072_4664 = torch.constant.int 3072
    %3739 = torch.prim.ListConstruct %int512_4663, %int3072_4664 : (!torch.int, !torch.int) -> !torch.list<int>
    %3740 = torch.aten.view %3738, %3739 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.11.txt_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.11.txt_attn.qkv.weight : tensor<9216x3072xf16>
    %3741 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.txt_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_4665 = torch.constant.int 0
    %int1_4666 = torch.constant.int 1
    // Weight is stored as [out, in]; transpose to [in, out] for the matmul.
    %3742 = torch.aten.transpose.int %3741, %int0_4665, %int1_4666 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.11.txt_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.11.txt_attn.qkv.bias : tensor<9216xf16>
    %3743 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.txt_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Bias, activation and weight are all widened to f32 before the matmul.
    %int6_4667 = torch.constant.int 6
    %3744 = torch.prims.convert_element_type %3743, %int6_4667 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_4668 = torch.constant.int 6
    %3745 = torch.prims.convert_element_type %3740, %int6_4668 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_4669 = torch.constant.int 6
    %3746 = torch.prims.convert_element_type %3742, %int6_4669 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %3747 = torch.aten.mm %3745, %3746 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[512,9216],f32>
    // Both scalar multiplications use the constant 1, i.e. they leave the values
    // unchanged.
    %int1_4670 = torch.constant.int 1
    %3748 = torch.aten.mul.Scalar %3747, %int1_4670 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int1_4671 = torch.constant.int 1
    %3749 = torch.aten.mul.Scalar %3744, %int1_4671 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_4672 = torch.constant.int 1
    %3750 = torch.aten.add.Tensor %3748, %3749, %int1_4672 : !torch.vtensor<[512,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int5_4673 = torch.constant.int 5
    %3751 = torch.prims.convert_element_type %3750, %int5_4673 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f16>
    %int1_4674 = torch.constant.int 1
    %int512_4675 = torch.constant.int 512
    %int9216_4676 = torch.constant.int 9216
    %3752 = torch.prim.ListConstruct %int1_4674, %int512_4675, %int9216_4676 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3753 = torch.aten.view %3751, %3752 : !torch.vtensor<[512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,9216],f16>
    // Split the fused 9216-wide projection into 3 x 24 x 128, move the "3" axis
    // to the front with permutation [2,0,3,1,4] -> [3,1,24,512,128], then pick
    // slices 0, 1 and 2 along dim 0. Downstream, %3758 and %3759 are normalised
    // with the txt_attn query_norm / key_norm scales and %3760 is concatenated
    // into the third attention operand, so they act as text q, k and v.
    %int1_4677 = torch.constant.int 1
    %int512_4678 = torch.constant.int 512
    %int3_4679 = torch.constant.int 3
    %int24_4680 = torch.constant.int 24
    %int128_4681 = torch.constant.int 128
    %3754 = torch.prim.ListConstruct %int1_4677, %int512_4678, %int3_4679, %int24_4680, %int128_4681 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3755 = torch.aten.view %3753, %3754 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_4682 = torch.constant.int 2
    %int0_4683 = torch.constant.int 0
    %int3_4684 = torch.constant.int 3
    %int1_4685 = torch.constant.int 1
    %int4_4686 = torch.constant.int 4
    %3756 = torch.prim.ListConstruct %int2_4682, %int0_4683, %int3_4684, %int1_4685, %int4_4686 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3757 = torch.aten.permute %3755, %3756 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    %int0_4687 = torch.constant.int 0
    %int0_4688 = torch.constant.int 0
    %3758 = torch.aten.select.int %3757, %int0_4687, %int0_4688 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int0_4689 = torch.constant.int 0
    %int1_4690 = torch.constant.int 1
    %3759 = torch.aten.select.int %3757, %int0_4689, %int1_4690 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int0_4691 = torch.constant.int 0
    %int2_4692 = torch.constant.int 2
    %3760 = torch.aten.select.int %3757, %int0_4691, %int2_4692 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Root-mean-square style normalisation of %3758 over the last (128) dim in f32:
    // x * rsqrt(mean(x^2, dim=-1, keepdim=true) + ~1e-6), cast back to f16 and
    // multiplied by double_blocks.11.txt_attn.norm.query_norm.scale.
    %int6_4693 = torch.constant.int 6
    %3761 = torch.prims.convert_element_type %3758, %int6_4693 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_4694 = torch.constant.int 2
    %3762 = torch.aten.pow.Tensor_Scalar %3761, %int2_4694 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_4695 = torch.constant.int -1
    %3763 = torch.prim.ListConstruct %int-1_4695 : (!torch.int) -> !torch.list<int>
    %true_4696 = torch.constant.bool true
    %none_4697 = torch.constant.none
    %3764 = torch.aten.mean.dim %3762, %3763, %true_4696, %none_4697 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_4698 = torch.constant.float 9.9999999999999995E-7
    %int1_4699 = torch.constant.int 1
    %3765 = torch.aten.add.Scalar %3764, %float9.999990e-07_4698, %int1_4699 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %3766 = torch.aten.rsqrt %3765 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %3767 = torch.aten.mul.Tensor %3761, %3766 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_4700 = torch.constant.int 5
    %3768 = torch.prims.convert_element_type %3767, %int5_4700 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.11.txt_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.11.txt_attn.norm.query_norm.scale : tensor<128xf16>
    %3769 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.txt_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %3770 = torch.aten.mul.Tensor %3768, %3769 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // Same normalisation recipe applied to %3759, scaled by
    // double_blocks.11.txt_attn.norm.key_norm.scale.
    %int6_4701 = torch.constant.int 6
    %3771 = torch.prims.convert_element_type %3759, %int6_4701 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_4702 = torch.constant.int 2
    %3772 = torch.aten.pow.Tensor_Scalar %3771, %int2_4702 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_4703 = torch.constant.int -1
    %3773 = torch.prim.ListConstruct %int-1_4703 : (!torch.int) -> !torch.list<int>
    %true_4704 = torch.constant.bool true
    %none_4705 = torch.constant.none
    %3774 = torch.aten.mean.dim %3772, %3773, %true_4704, %none_4705 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_4706 = torch.constant.float 9.9999999999999995E-7
    %int1_4707 = torch.constant.int 1
    %3775 = torch.aten.add.Scalar %3774, %float9.999990e-07_4706, %int1_4707 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %3776 = torch.aten.rsqrt %3775 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %3777 = torch.aten.mul.Tensor %3771, %3776 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_4708 = torch.constant.int 5
    %3778 = torch.prims.convert_element_type %3777, %int5_4708 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.11.txt_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.11.txt_attn.norm.key_norm.scale : tensor<128xf16>
    %3779 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.txt_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %3780 = torch.aten.mul.Tensor %3778, %3779 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // f16 -> f16 casts: operand and result element types are identical.
    %int5_4709 = torch.constant.int 5
    %3781 = torch.prims.convert_element_type %3770, %int5_4709 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int5_4710 = torch.constant.int 5
    %3782 = torch.prims.convert_element_type %3780, %int5_4710 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Concatenate the 512-row text tensors with the 4096-row image tensors along
    // dim 2 (text first), giving 4608 rows each:
    //   %3784 = cat(normalised txt q %3781, normalised img q %3727)
    //   %3786 = cat(normalised txt k %3782, normalised img k %3728)
    //   %3788 = cat(txt slice %3760,        img slice %3706)
    %3783 = torch.prim.ListConstruct %3781, %3727 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_4711 = torch.constant.int 2
    %3784 = torch.aten.cat %3783, %int2_4711 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %3785 = torch.prim.ListConstruct %3782, %3728 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_4712 = torch.constant.int 2
    %3786 = torch.aten.cat %3785, %int2_4712 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %3787 = torch.prim.ListConstruct %3760, %3706 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_4713 = torch.constant.int 2
    %3788 = torch.aten.cat %3787, %int2_4713 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Widen the concatenated q (%3784) and k (%3786) to f32 and view the last
    // dim of 128 as 64 x 1 x 2 (the -1 in the shape list resolves to 64), so that
    // each consecutive pair of channels sits in the trailing dim of size 2.
    %int6_4714 = torch.constant.int 6
    %3789 = torch.prims.convert_element_type %3784, %int6_4714 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_4715 = torch.constant.int 1
    %int24_4716 = torch.constant.int 24
    %int4608_4717 = torch.constant.int 4608
    %int-1_4718 = torch.constant.int -1
    %int1_4719 = torch.constant.int 1
    %int2_4720 = torch.constant.int 2
    %3790 = torch.prim.ListConstruct %int1_4715, %int24_4716, %int4608_4717, %int-1_4718, %int1_4719, %int2_4720 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3791 = torch.aten.view %3789, %3790 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    %int6_4721 = torch.constant.int 6
    %3792 = torch.prims.convert_element_type %3786, %int6_4721 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_4722 = torch.constant.int 1
    %int24_4723 = torch.constant.int 24
    %int4608_4724 = torch.constant.int 4608
    %int-1_4725 = torch.constant.int -1
    %int1_4726 = torch.constant.int 1
    %int2_4727 = torch.constant.int 2
    %3793 = torch.prim.ListConstruct %int1_4722, %int24_4723, %int4608_4724, %int-1_4725, %int1_4726, %int2_4727 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3794 = torch.aten.view %3792, %3793 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Combine each channel pair of q (%3791) and k (%3794) with the
    // [1,1,4608,64,2,2] tensor %211 (defined before the visible lines):
    //   out[..., i] = %211[..., i, 0] * x[..., 0, 0] + %211[..., i, 1] * x[..., 0, 1]
    // i.e. a per-position, per-pair 2x2 matrix applied to a 2-vector, broadcast
    // across the 24 heads. NOTE(review): this looks like a rotary position
    // embedding with %211 holding the rotation matrices -- confirm where %211
    // is built.
    //
    // --- q: first column of %211 times first element of each pair ---
    %int5_4728 = torch.constant.int 5
    %int0_4729 = torch.constant.int 0
    %3795 = torch.aten.select.int %211, %int5_4728, %int0_4729 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_4730 = torch.constant.int 5
    %int0_4731 = torch.constant.int 0
    %3796 = torch.aten.select.int %3791, %int5_4730, %int0_4731 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %3797 = torch.aten.mul.Tensor %3795, %3796 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // --- q: second column of %211 times second element of each pair ---
    %int5_4732 = torch.constant.int 5
    %int1_4733 = torch.constant.int 1
    %3798 = torch.aten.select.int %211, %int5_4732, %int1_4733 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_4734 = torch.constant.int 5
    %int1_4735 = torch.constant.int 1
    %3799 = torch.aten.select.int %3791, %int5_4734, %int1_4735 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %3800 = torch.aten.mul.Tensor %3798, %3799 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_4736 = torch.constant.int 1
    // Sum of the two partial products -> transformed q pairs.
    %3801 = torch.aten.add.Tensor %3797, %3800, %int1_4736 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // --- k: identical sequence using %3794 ---
    %int5_4737 = torch.constant.int 5
    %int0_4738 = torch.constant.int 0
    %3802 = torch.aten.select.int %211, %int5_4737, %int0_4738 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_4739 = torch.constant.int 5
    %int0_4740 = torch.constant.int 0
    %3803 = torch.aten.select.int %3794, %int5_4739, %int0_4740 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %3804 = torch.aten.mul.Tensor %3802, %3803 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_4741 = torch.constant.int 5
    %int1_4742 = torch.constant.int 1
    %3805 = torch.aten.select.int %211, %int5_4741, %int1_4742 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_4743 = torch.constant.int 5
    %int1_4744 = torch.constant.int 1
    %3806 = torch.aten.select.int %3794, %int5_4743, %int1_4744 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %3807 = torch.aten.mul.Tensor %3805, %3806 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_4745 = torch.constant.int 1
    %3808 = torch.aten.add.Tensor %3804, %3807, %int1_4745 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Collapse the trailing 64 x 2 pair layout back to 128 channels and narrow
    // to f16: %3811 is the transformed q, %3814 the transformed k.
    %int1_4746 = torch.constant.int 1
    %int24_4747 = torch.constant.int 24
    %int4608_4748 = torch.constant.int 4608
    %int128_4749 = torch.constant.int 128
    %3809 = torch.prim.ListConstruct %int1_4746, %int24_4747, %int4608_4748, %int128_4749 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3810 = torch.aten.view %3801, %3809 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_4750 = torch.constant.int 5
    %3811 = torch.prims.convert_element_type %3810, %int5_4750 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_4751 = torch.constant.int 1
    %int24_4752 = torch.constant.int 24
    %int4608_4753 = torch.constant.int 4608
    %int128_4754 = torch.constant.int 128
    %3812 = torch.prim.ListConstruct %int1_4751, %int24_4752, %int4608_4753, %int128_4754 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3813 = torch.aten.view %3808, %3812 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_4755 = torch.constant.int 5
    %3814 = torch.prims.convert_element_type %3813, %int5_4755 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Attention over the joint 4608-row sequence with 24 heads of width 128,
    // expressed as a generic torch.operator wrapping
    // aten._scaled_dot_product_flash_attention_for_cpu.
    // Tensor operands: %3811, %3814, %3788. The scalar operands are 0.0, false
    // and two `none`s -- presumably dropout_p, is_causal, attn_mask and scale in
    // the ATen signature (TODO confirm against the op schema).
    // Result #0 is the [1,24,4608,128] f16 output; result #1 ([1,24,4608], f32)
    // is not used within the visible lines.
    %float0.000000e00_4756 = torch.constant.float 0.000000e+00
    %false_4757 = torch.constant.bool false
    %none_4758 = torch.constant.none
    %none_4759 = torch.constant.none
    %3815:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%3811, %3814, %3788, %float0.000000e00_4756, %false_4757, %none_4758, %none_4759) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap the head and sequence dims with permutation [0,2,1,3]
    // ([1,24,4608,128] -> [1,4608,24,128]) and fold 24 x 128 into a single
    // 3072-wide feature dim.
    %int0_4760 = torch.constant.int 0
    %int2_4761 = torch.constant.int 2
    %int1_4762 = torch.constant.int 1
    %int3_4763 = torch.constant.int 3
    %3816 = torch.prim.ListConstruct %int0_4760, %int2_4761, %int1_4762, %int3_4763 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3817 = torch.aten.permute %3815#0, %3816 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_4764 = torch.constant.int 1
    %int4608_4765 = torch.constant.int 4608
    %int3072_4766 = torch.constant.int 3072
    %3818 = torch.prim.ListConstruct %int1_4764, %int4608_4765, %int3072_4766 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3819 = torch.aten.view %3817, %3818 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Split the joint attention output along dim 1, mirroring the earlier
    // text-first concatenation:
    //   %3821 = rows [0, 512)    -> [1,512,3072]  (not consumed in the visible lines)
    //   %3823 = rows [512, end)  -> [1,4096,3072]
    // The dim-0 slices use start 0 and end INT64_MAX with step 1, so they keep
    // the whole (size-1) batch dim.
    %int0_4767 = torch.constant.int 0
    %int0_4768 = torch.constant.int 0
    %int9223372036854775807_4769 = torch.constant.int 9223372036854775807
    %int1_4770 = torch.constant.int 1
    %3820 = torch.aten.slice.Tensor %3819, %int0_4767, %int0_4768, %int9223372036854775807_4769, %int1_4770 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_4771 = torch.constant.int 1
    %int0_4772 = torch.constant.int 0
    %int512_4773 = torch.constant.int 512
    %int1_4774 = torch.constant.int 1
    %3821 = torch.aten.slice.Tensor %3820, %int1_4771, %int0_4772, %int512_4773, %int1_4774 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %int0_4775 = torch.constant.int 0
    %int0_4776 = torch.constant.int 0
    %int9223372036854775807_4777 = torch.constant.int 9223372036854775807
    %int1_4778 = torch.constant.int 1
    %3822 = torch.aten.slice.Tensor %3819, %int0_4775, %int0_4776, %int9223372036854775807_4777, %int1_4778 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_4779 = torch.constant.int 1
    %int512_4780 = torch.constant.int 512
    %int9223372036854775807_4781 = torch.constant.int 9223372036854775807
    %int1_4782 = torch.constant.int 1
    %3823 = torch.aten.slice.Tensor %3822, %int1_4779, %int512_4780, %int9223372036854775807_4781, %int1_4782 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer double_blocks.11.img_attn.proj applied to the 4096-row slice
    // %3823: flatten to [4096,3072], matmul with the transposed [3072,3072]
    // weight in f32, add the bias in f32, cast to f16 and view as [1,4096,3072].
    %int4096_4783 = torch.constant.int 4096
    %int3072_4784 = torch.constant.int 3072
    %3824 = torch.prim.ListConstruct %int4096_4783, %int3072_4784 : (!torch.int, !torch.int) -> !torch.list<int>
    %3825 = torch.aten.view %3823, %3824 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.11.img_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.11.img_attn.proj.weight : tensor<3072x3072xf16>
    %3826 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.img_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_4785 = torch.constant.int 0
    %int1_4786 = torch.constant.int 1
    %3827 = torch.aten.transpose.int %3826, %int0_4785, %int1_4786 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.11.img_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.11.img_attn.proj.bias : tensor<3072xf16>
    %3828 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.img_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Bias, activation and weight are widened to f32 for the matmul and bias add.
    %int6_4787 = torch.constant.int 6
    %3829 = torch.prims.convert_element_type %3828, %int6_4787 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_4788 = torch.constant.int 6
    %3830 = torch.prims.convert_element_type %3825, %int6_4788 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_4789 = torch.constant.int 6
    %3831 = torch.prims.convert_element_type %3827, %int6_4789 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %3832 = torch.aten.mm %3830, %3831 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Scalar multiplications by the constant 1 leave the values unchanged.
    %int1_4790 = torch.constant.int 1
    %3833 = torch.aten.mul.Scalar %3832, %int1_4790 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_4791 = torch.constant.int 1
    %3834 = torch.aten.mul.Scalar %3829, %int1_4791 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_4792 = torch.constant.int 1
    %3835 = torch.aten.add.Tensor %3833, %3834, %int1_4792 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_4793 = torch.constant.int 5
    %3836 = torch.prims.convert_element_type %3835, %int5_4793 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_4794 = torch.constant.int 1
    %int4096_4795 = torch.constant.int 4096
    %int3072_4796 = torch.constant.int 3072
    %3837 = torch.prim.ListConstruct %int1_4794, %int4096_4795, %int3072_4796 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3838 = torch.aten.view %3836, %3837 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // 1) Scale the projected attention output by %3650 ([1,1,3072]) and add it
    //    to %3572:  %3840 = %3572 + %3650 * %3838.
    // 2) Affine-free layer normalisation of %3840 over dim 2 (f32 statistics,
    //    epsilon ~1e-6), cast to f16.
    // 3) Scale-and-shift:  %3850 = (%3652 + 1) * normed + %3651.
    // %3572, %3650, %3651 and %3652 are defined before the visible lines;
    // presumably the image residual stream and its gate / shift / scale
    // modulation vectors -- confirm upstream.
    %3839 = torch.aten.mul.Tensor %3650, %3838 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_4797 = torch.constant.int 1
    %3840 = torch.aten.add.Tensor %3572, %3839, %int1_4797 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %int1_4798 = torch.constant.int 1
    %int1_4799 = torch.constant.int 1
    %3841 = torch.aten.add.Scalar %3652, %int1_4798, %int1_4799 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int6_4800 = torch.constant.int 6
    %3842 = torch.prims.convert_element_type %3840, %int6_4800 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %int2_4801 = torch.constant.int 2
    %3843 = torch.prim.ListConstruct %int2_4801 : (!torch.int) -> !torch.list<int>
    %int0_4802 = torch.constant.int 0
    %true_4803 = torch.constant.bool true
    // Variance (correction = 0) and mean over dim 2, reduced dim kept.
    %result0_4804, %result1_4805 = torch.aten.var_mean.correction %3842, %3843, %int0_4802, %true_4803 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    %float9.999990e-07_4806 = torch.constant.float 9.9999999999999995E-7
    %int1_4807 = torch.constant.int 1
    %3844 = torch.aten.add.Scalar %result0_4804, %float9.999990e-07_4806, %int1_4807 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %3845 = torch.aten.rsqrt %3844 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    %int1_4808 = torch.constant.int 1
    // The subtraction takes the f16 tensor %3840 and the f32 mean; the result is f32.
    %3846 = torch.aten.sub.Tensor %3840, %result1_4805, %int1_4808 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %3847 = torch.aten.mul.Tensor %3846, %3845 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    %int5_4809 = torch.constant.int 5
    %3848 = torch.prims.convert_element_type %3847, %int5_4809 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %3849 = torch.aten.mul.Tensor %3841, %3848 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_4810 = torch.constant.int 1
    %3850 = torch.aten.add.Tensor %3849, %3651, %int1_4810 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer double_blocks.11.img_mlp.0 (3072 -> 12288) applied to %3850:
    // flatten to [4096,3072], matmul with the transposed [12288,3072] weight in
    // f32, add the bias in f32, cast to f16 and view as [1,4096,12288].
    %int4096_4811 = torch.constant.int 4096
    %int3072_4812 = torch.constant.int 3072
    %3851 = torch.prim.ListConstruct %int4096_4811, %int3072_4812 : (!torch.int, !torch.int) -> !torch.list<int>
    %3852 = torch.aten.view %3850, %3851 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.11.img_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.11.img_mlp.0.weight : tensor<12288x3072xf16>
    %3853 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.img_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_4813 = torch.constant.int 0
    %int1_4814 = torch.constant.int 1
    %3854 = torch.aten.transpose.int %3853, %int0_4813, %int1_4814 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.11.img_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.11.img_mlp.0.bias : tensor<12288xf16>
    %3855 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.img_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    %int6_4815 = torch.constant.int 6
    %3856 = torch.prims.convert_element_type %3855, %int6_4815 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_4816 = torch.constant.int 6
    %3857 = torch.prims.convert_element_type %3852, %int6_4816 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_4817 = torch.constant.int 6
    %3858 = torch.prims.convert_element_type %3854, %int6_4817 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %3859 = torch.aten.mm %3857, %3858 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[4096,12288],f32>
    // Scalar multiplications by the constant 1 leave the values unchanged.
    %int1_4818 = torch.constant.int 1
    %3860 = torch.aten.mul.Scalar %3859, %int1_4818 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int1_4819 = torch.constant.int 1
    %3861 = torch.aten.mul.Scalar %3856, %int1_4819 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_4820 = torch.constant.int 1
    %3862 = torch.aten.add.Tensor %3860, %3861, %int1_4820 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int5_4821 = torch.constant.int 5
    %3863 = torch.prims.convert_element_type %3862, %int5_4821 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f16>
    %int1_4822 = torch.constant.int 1
    %int4096_4823 = torch.constant.int 4096
    %int12288_4824 = torch.constant.int 12288
    %3864 = torch.prim.ListConstruct %int1_4822, %int4096_4823, %int12288_4824 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3865 = torch.aten.view %3863, %3864 : !torch.vtensor<[4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,12288],f16>
    %str_4825 = torch.constant.str "tanh"
    %3866 = torch.aten.gelu %3865, %str_4825 : !torch.vtensor<[1,4096,12288],f16>, !torch.str -> !torch.vtensor<[1,4096,12288],f16>
    // ---- double_blocks.11 img_mlp.2: linear 12288 -> 3072, gated residual add ----
    // Flatten the GELU output to 2-D ([4096,12288]) for the matmul.
    %int4096_4826 = torch.constant.int 4096
    %int12288_4827 = torch.constant.int 12288
    %3867 = torch.prim.ListConstruct %int4096_4826, %int12288_4827 : (!torch.int, !torch.int) -> !torch.list<int>
    %3868 = torch.aten.view %3866, %3867 : !torch.vtensor<[1,4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[4096,12288],f16>
    // Load the [3072,12288] weight and transpose it to [12288,3072] for mm.
    %__auto.sampler.double_blocks.11.img_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.11.img_mlp.2.weight : tensor<3072x12288xf16>
    %3869 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.img_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_4828 = torch.constant.int 0
    %int1_4829 = torch.constant.int 1
    %3870 = torch.aten.transpose.int %3869, %int0_4828, %int1_4829 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.11.img_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.11.img_mlp.2.bias : tensor<3072xf16>
    %3871 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.img_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Convert bias, activations and weight to f32 (dtype code 6) before mm.
    %int6_4830 = torch.constant.int 6
    %3872 = torch.prims.convert_element_type %3871, %int6_4830 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_4831 = torch.constant.int 6
    %3873 = torch.prims.convert_element_type %3868, %int6_4831 : !torch.vtensor<[4096,12288],f16>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int6_4832 = torch.constant.int 6
    %3874 = torch.prims.convert_element_type %3870, %int6_4832 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %3875 = torch.aten.mm %3873, %3874 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Multiplies by the constant 1 leave the values unchanged.
    %int1_4833 = torch.constant.int 1
    %3876 = torch.aten.mul.Scalar %3875, %int1_4833 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_4834 = torch.constant.int 1
    %3877 = torch.aten.mul.Scalar %3872, %int1_4834 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Bias add (broadcast over rows), then back to f16 (dtype code 5).
    %int1_4835 = torch.constant.int 1
    %3878 = torch.aten.add.Tensor %3876, %3877, %int1_4835 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_4836 = torch.constant.int 5
    %3879 = torch.prims.convert_element_type %3878, %int5_4836 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_4837 = torch.constant.int 1
    %int4096_4838 = torch.constant.int 4096
    %int3072_4839 = torch.constant.int 3072
    %3880 = torch.prim.ListConstruct %int1_4837, %int4096_4838, %int3072_4839 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3881 = torch.aten.view %3879, %3880 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Scale the MLP output by the per-channel factor %3653 ([1,1,3072], broadcast
    // over the 4096 positions) and add it to %3840.
    // NOTE(review): %3653 and %3840 are defined above this chunk; presumably the
    // image MLP gate and the image residual stream of block 11 - confirm.
    %3882 = torch.aten.mul.Tensor %3653, %3881 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_4840 = torch.constant.int 1
    %3883 = torch.aten.add.Tensor %3840, %3882, %int1_4840 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // ---- double_blocks.11 txt_attn.proj: linear 3072 -> 3072, gated residual add ----
    // Flatten %3821 from [1,512,3072] to [512,3072] for the matmul.
    // NOTE(review): %3821 is defined above this chunk; presumably the text part
    // of the attention output - confirm.
    %int512_4841 = torch.constant.int 512
    %int3072_4842 = torch.constant.int 3072
    %3884 = torch.prim.ListConstruct %int512_4841, %int3072_4842 : (!torch.int, !torch.int) -> !torch.list<int>
    %3885 = torch.aten.view %3821, %3884 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Load the projection weight and swap its two dimensions for mm.
    %__auto.sampler.double_blocks.11.txt_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.11.txt_attn.proj.weight : tensor<3072x3072xf16>
    %3886 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.txt_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_4843 = torch.constant.int 0
    %int1_4844 = torch.constant.int 1
    %3887 = torch.aten.transpose.int %3886, %int0_4843, %int1_4844 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.11.txt_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.11.txt_attn.proj.bias : tensor<3072xf16>
    %3888 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.txt_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Convert bias, activations and weight to f32 (dtype code 6) before mm.
    %int6_4845 = torch.constant.int 6
    %3889 = torch.prims.convert_element_type %3888, %int6_4845 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_4846 = torch.constant.int 6
    %3890 = torch.prims.convert_element_type %3885, %int6_4846 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_4847 = torch.constant.int 6
    %3891 = torch.prims.convert_element_type %3887, %int6_4847 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %3892 = torch.aten.mm %3890, %3891 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Multiplies by the constant 1 leave the values unchanged.
    %int1_4848 = torch.constant.int 1
    %3893 = torch.aten.mul.Scalar %3892, %int1_4848 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_4849 = torch.constant.int 1
    %3894 = torch.aten.mul.Scalar %3889, %int1_4849 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Bias add (broadcast over rows), then back to f16 (dtype code 5).
    %int1_4850 = torch.constant.int 1
    %3895 = torch.aten.add.Tensor %3893, %3894, %int1_4850 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_4851 = torch.constant.int 5
    %3896 = torch.prims.convert_element_type %3895, %int5_4851 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_4852 = torch.constant.int 1
    %int512_4853 = torch.constant.int 512
    %int3072_4854 = torch.constant.int 3072
    %3897 = torch.prim.ListConstruct %int1_4852, %int512_4853, %int3072_4854 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3898 = torch.aten.view %3896, %3897 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Scale the projection by the per-channel factor %3671 ([1,1,3072]) and add
    // it to %3632.
    // NOTE(review): %3671 and %3632 are defined above this chunk; presumably the
    // text attention gate and the text residual stream entering block 11 - confirm.
    %3899 = torch.aten.mul.Tensor %3671, %3898 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_4855 = torch.constant.int 1
    %3900 = torch.aten.add.Tensor %3632, %3899, %int1_4855 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // ---- double_blocks.11 txt: normalize %3900 over dim 2, then scale and shift ----
    // Computes (1 + %3673) * normalize(%3900) + %3672.
    // NOTE(review): %3673 and %3672 ([1,1,3072]) are defined above this chunk;
    // presumably the text MLP scale and shift from block 11's modulation - confirm.
    %int1_4856 = torch.constant.int 1
    %int1_4857 = torch.constant.int 1
    %3901 = torch.aten.add.Scalar %3673, %int1_4856, %int1_4857 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Statistics are computed in f32 (dtype code 6).
    %int6_4858 = torch.constant.int 6
    %3902 = torch.prims.convert_element_type %3900, %int6_4858 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // Variance (%result0) and mean (%result1) over dim 2, correction = 0,
    // keepdim = true, giving [1,512,1] outputs.
    %int2_4859 = torch.constant.int 2
    %3903 = torch.prim.ListConstruct %int2_4859 : (!torch.int) -> !torch.list<int>
    %int0_4860 = torch.constant.int 0
    %true_4861 = torch.constant.bool true
    %result0_4862, %result1_4863 = torch.aten.var_mean.correction %3902, %3903, %int0_4860, %true_4861 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(variance + eps), with eps given by the float constant below (about 1e-6).
    %float9.999990e-07_4864 = torch.constant.float 9.9999999999999995E-7
    %int1_4865 = torch.constant.int 1
    %3904 = torch.aten.add.Scalar %result0_4862, %float9.999990e-07_4864, %int1_4865 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %3905 = torch.aten.rsqrt %3904 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // Center with the f32 mean: the f16 operand %3900 yields an f32 result here.
    %int1_4866 = torch.constant.int 1
    %3906 = torch.aten.sub.Tensor %3900, %result1_4863, %int1_4866 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %3907 = torch.aten.mul.Tensor %3906, %3905 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    // Back to f16 (dtype code 5); the scale and shift are applied in f16.
    %int5_4867 = torch.constant.int 5
    %3908 = torch.prims.convert_element_type %3907, %int5_4867 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %3909 = torch.aten.mul.Tensor %3901, %3908 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_4868 = torch.constant.int 1
    %3910 = torch.aten.add.Tensor %3909, %3672, %int1_4868 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // ---- double_blocks.11 txt_mlp.0: linear 3072 -> 12288, then tanh-GELU ----
    // Flatten the modulated activations %3910 to [512,3072] for the matmul.
    %int512_4869 = torch.constant.int 512
    %int3072_4870 = torch.constant.int 3072
    %3911 = torch.prim.ListConstruct %int512_4869, %int3072_4870 : (!torch.int, !torch.int) -> !torch.list<int>
    %3912 = torch.aten.view %3910, %3911 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Load the [12288,3072] weight and transpose it to [3072,12288] for mm.
    %__auto.sampler.double_blocks.11.txt_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.11.txt_mlp.0.weight : tensor<12288x3072xf16>
    %3913 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.txt_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_4871 = torch.constant.int 0
    %int1_4872 = torch.constant.int 1
    %3914 = torch.aten.transpose.int %3913, %int0_4871, %int1_4872 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.11.txt_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.11.txt_mlp.0.bias : tensor<12288xf16>
    %3915 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.txt_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Convert bias, activations and weight to f32 (dtype code 6) before mm.
    %int6_4873 = torch.constant.int 6
    %3916 = torch.prims.convert_element_type %3915, %int6_4873 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_4874 = torch.constant.int 6
    %3917 = torch.prims.convert_element_type %3912, %int6_4874 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_4875 = torch.constant.int 6
    %3918 = torch.prims.convert_element_type %3914, %int6_4875 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %3919 = torch.aten.mm %3917, %3918 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[512,12288],f32>
    // Multiplies by the constant 1 leave the values unchanged.
    %int1_4876 = torch.constant.int 1
    %3920 = torch.aten.mul.Scalar %3919, %int1_4876 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int1_4877 = torch.constant.int 1
    %3921 = torch.aten.mul.Scalar %3916, %int1_4877 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    // Bias add (broadcast over rows), then back to f16 (dtype code 5).
    %int1_4878 = torch.constant.int 1
    %3922 = torch.aten.add.Tensor %3920, %3921, %int1_4878 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int5_4879 = torch.constant.int 5
    %3923 = torch.prims.convert_element_type %3922, %int5_4879 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f16>
    %int1_4880 = torch.constant.int 1
    %int512_4881 = torch.constant.int 512
    %int12288_4882 = torch.constant.int 12288
    %3924 = torch.prim.ListConstruct %int1_4880, %int512_4881, %int12288_4882 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3925 = torch.aten.view %3923, %3924 : !torch.vtensor<[512,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,512,12288],f16>
    // GELU with the "tanh" approximation, applied in f16.
    %str_4883 = torch.constant.str "tanh"
    %3926 = torch.aten.gelu %3925, %str_4883 : !torch.vtensor<[1,512,12288],f16>, !torch.str -> !torch.vtensor<[1,512,12288],f16>
    // ---- double_blocks.11 txt_mlp.2: linear 12288 -> 3072, gated residual add ----
    // Flatten the GELU output to [512,12288] for the matmul.
    %int512_4884 = torch.constant.int 512
    %int12288_4885 = torch.constant.int 12288
    %3927 = torch.prim.ListConstruct %int512_4884, %int12288_4885 : (!torch.int, !torch.int) -> !torch.list<int>
    %3928 = torch.aten.view %3926, %3927 : !torch.vtensor<[1,512,12288],f16>, !torch.list<int> -> !torch.vtensor<[512,12288],f16>
    // Load the [3072,12288] weight and transpose it to [12288,3072] for mm.
    %__auto.sampler.double_blocks.11.txt_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.11.txt_mlp.2.weight : tensor<3072x12288xf16>
    %3929 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.txt_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_4886 = torch.constant.int 0
    %int1_4887 = torch.constant.int 1
    %3930 = torch.aten.transpose.int %3929, %int0_4886, %int1_4887 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.11.txt_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.11.txt_mlp.2.bias : tensor<3072xf16>
    %3931 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.11.txt_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Convert bias, activations and weight to f32 (dtype code 6) before mm.
    %int6_4888 = torch.constant.int 6
    %3932 = torch.prims.convert_element_type %3931, %int6_4888 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_4889 = torch.constant.int 6
    %3933 = torch.prims.convert_element_type %3928, %int6_4889 : !torch.vtensor<[512,12288],f16>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int6_4890 = torch.constant.int 6
    %3934 = torch.prims.convert_element_type %3930, %int6_4890 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %3935 = torch.aten.mm %3933, %3934 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Multiplies by the constant 1 leave the values unchanged.
    %int1_4891 = torch.constant.int 1
    %3936 = torch.aten.mul.Scalar %3935, %int1_4891 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_4892 = torch.constant.int 1
    %3937 = torch.aten.mul.Scalar %3932, %int1_4892 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Bias add (broadcast over rows), then back to f16 (dtype code 5).
    %int1_4893 = torch.constant.int 1
    %3938 = torch.aten.add.Tensor %3936, %3937, %int1_4893 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_4894 = torch.constant.int 5
    %3939 = torch.prims.convert_element_type %3938, %int5_4894 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_4895 = torch.constant.int 1
    %int512_4896 = torch.constant.int 512
    %int3072_4897 = torch.constant.int 3072
    %3940 = torch.prim.ListConstruct %int1_4895, %int512_4896, %int3072_4897 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %3941 = torch.aten.view %3939, %3940 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Scale the MLP output by the per-channel factor %3674 ([1,1,3072]) and add
    // it to %3900 (the post-attention sum computed earlier in this chunk).
    // NOTE(review): %3674 is defined above this chunk; presumably the text MLP
    // gate of block 11 - confirm.
    %3942 = torch.aten.mul.Tensor %3674, %3941 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_4898 = torch.constant.int 1
    %3943 = torch.aten.add.Tensor %3900, %3942, %int1_4898 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // ---- double_blocks.12 img_mod.lin: SiLU, linear 3072 -> 18432, six-way split ----
    // NOTE(review): %119 ([1,3072]) is defined above this chunk; presumably the
    // combined conditioning vector - confirm.
    %3944 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the [18432,3072] weight and transpose it to [3072,18432] for mm.
    %__auto.sampler.double_blocks.12.img_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.12.img_mod.lin.weight : tensor<18432x3072xf16>
    %3945 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.img_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_4899 = torch.constant.int 0
    %int1_4900 = torch.constant.int 1
    %3946 = torch.aten.transpose.int %3945, %int0_4899, %int1_4900 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.12.img_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.12.img_mod.lin.bias : tensor<18432xf16>
    %3947 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.img_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Convert bias, activations and weight to f32 (dtype code 6) before mm.
    %int6_4901 = torch.constant.int 6
    %3948 = torch.prims.convert_element_type %3947, %int6_4901 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_4902 = torch.constant.int 6
    %3949 = torch.prims.convert_element_type %3944, %int6_4902 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_4903 = torch.constant.int 6
    %3950 = torch.prims.convert_element_type %3946, %int6_4903 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %3951 = torch.aten.mm %3949, %3950 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Multiplies by the constant 1 leave the values unchanged.
    %int1_4904 = torch.constant.int 1
    %3952 = torch.aten.mul.Scalar %3951, %int1_4904 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_4905 = torch.constant.int 1
    %3953 = torch.aten.mul.Scalar %3948, %int1_4905 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    // Bias add, then back to f16 (dtype code 5).
    %int1_4906 = torch.constant.int 1
    %3954 = torch.aten.add.Tensor %3952, %3953, %int1_4906 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_4907 = torch.constant.int 5
    %3955 = torch.prims.convert_element_type %3954, %int5_4907 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): shape is
    // unchanged. Then insert a size-1 dim at position 1 -> [1,1,18432].
    %int0_4908 = torch.constant.int 0
    %int0_4909 = torch.constant.int 0
    %int9223372036854775807_4910 = torch.constant.int 9223372036854775807
    %int1_4911 = torch.constant.int 1
    %3956 = torch.aten.slice.Tensor %3955, %int0_4908, %int0_4909, %int9223372036854775807_4910, %int1_4911 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    %int1_4912 = torch.constant.int 1
    %3957 = torch.aten.unsqueeze %3956, %int1_4912 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Full-range slice on dim 2: shape is unchanged.
    %int2_4913 = torch.constant.int 2
    %int0_4914 = torch.constant.int 0
    %int9223372036854775807_4915 = torch.constant.int 9223372036854775807
    %int1_4916 = torch.constant.int 1
    %3958 = torch.aten.slice.Tensor %3957, %int2_4913, %int0_4914, %int9223372036854775807_4915, %int1_4916 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Split the last dim into six contiguous 3072-wide pieces.
    // Piece [0, 3072): later added after normalization (L10073), i.e. used as a shift.
    %int-1_4917 = torch.constant.int -1
    %int0_4918 = torch.constant.int 0
    %int3072_4919 = torch.constant.int 3072
    %int1_4920 = torch.constant.int 1
    %3959 = torch.aten.slice.Tensor %3958, %int-1_4917, %int0_4918, %int3072_4919, %int1_4920 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Piece [3072, 6144): later incremented by 1 and multiplied in (L10070-L10071),
    // i.e. used as a scale.
    %int-1_4921 = torch.constant.int -1
    %int3072_4922 = torch.constant.int 3072
    %int6144_4923 = torch.constant.int 6144
    %int1_4924 = torch.constant.int 1
    %3960 = torch.aten.slice.Tensor %3958, %int-1_4921, %int3072_4922, %int6144_4923, %int1_4924 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Pieces [6144, 9216), [9216, 12288), [12288, 15360) and [15360, 18432) are
    // not used within this chunk.
    // NOTE(review): presumably gate1, shift2, scale2, gate2 in that order - confirm
    // against their uses below this chunk.
    %int-1_4925 = torch.constant.int -1
    %int6144_4926 = torch.constant.int 6144
    %int9216_4927 = torch.constant.int 9216
    %int1_4928 = torch.constant.int 1
    %3961 = torch.aten.slice.Tensor %3958, %int-1_4925, %int6144_4926, %int9216_4927, %int1_4928 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_4929 = torch.constant.int -1
    %int9216_4930 = torch.constant.int 9216
    %int12288_4931 = torch.constant.int 12288
    %int1_4932 = torch.constant.int 1
    %3962 = torch.aten.slice.Tensor %3958, %int-1_4929, %int9216_4930, %int12288_4931, %int1_4932 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_4933 = torch.constant.int -1
    %int12288_4934 = torch.constant.int 12288
    %int15360_4935 = torch.constant.int 15360
    %int1_4936 = torch.constant.int 1
    %3963 = torch.aten.slice.Tensor %3958, %int-1_4933, %int12288_4934, %int15360_4935, %int1_4936 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_4937 = torch.constant.int -1
    %int15360_4938 = torch.constant.int 15360
    %int18432_4939 = torch.constant.int 18432
    %int1_4940 = torch.constant.int 1
    %3964 = torch.aten.slice.Tensor %3958, %int-1_4937, %int15360_4938, %int18432_4939, %int1_4940 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // ---- double_blocks.12 txt_mod.lin: SiLU, linear 3072 -> 18432, six-way split ----
    // Same op sequence as the img_mod.lin stage, using the txt_mod parameters.
    // SiLU of %119 is computed again here rather than reusing %3944.
    %3965 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the [18432,3072] weight and transpose it to [3072,18432] for mm.
    %__auto.sampler.double_blocks.12.txt_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.12.txt_mod.lin.weight : tensor<18432x3072xf16>
    %3966 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.txt_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_4941 = torch.constant.int 0
    %int1_4942 = torch.constant.int 1
    %3967 = torch.aten.transpose.int %3966, %int0_4941, %int1_4942 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.12.txt_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.12.txt_mod.lin.bias : tensor<18432xf16>
    %3968 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.txt_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Convert bias, activations and weight to f32 (dtype code 6) before mm.
    %int6_4943 = torch.constant.int 6
    %3969 = torch.prims.convert_element_type %3968, %int6_4943 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_4944 = torch.constant.int 6
    %3970 = torch.prims.convert_element_type %3965, %int6_4944 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_4945 = torch.constant.int 6
    %3971 = torch.prims.convert_element_type %3967, %int6_4945 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %3972 = torch.aten.mm %3970, %3971 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Multiplies by the constant 1 leave the values unchanged.
    %int1_4946 = torch.constant.int 1
    %3973 = torch.aten.mul.Scalar %3972, %int1_4946 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_4947 = torch.constant.int 1
    %3974 = torch.aten.mul.Scalar %3969, %int1_4947 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    // Bias add, then back to f16 (dtype code 5).
    %int1_4948 = torch.constant.int 1
    %3975 = torch.aten.add.Tensor %3973, %3974, %int1_4948 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_4949 = torch.constant.int 5
    %3976 = torch.prims.convert_element_type %3975, %int5_4949 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (shape unchanged), then insert a size-1 dim at
    // position 1 -> [1,1,18432].
    %int0_4950 = torch.constant.int 0
    %int0_4951 = torch.constant.int 0
    %int9223372036854775807_4952 = torch.constant.int 9223372036854775807
    %int1_4953 = torch.constant.int 1
    %3977 = torch.aten.slice.Tensor %3976, %int0_4950, %int0_4951, %int9223372036854775807_4952, %int1_4953 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    %int1_4954 = torch.constant.int 1
    %3978 = torch.aten.unsqueeze %3977, %int1_4954 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Full-range slice on dim 2: shape is unchanged.
    %int2_4955 = torch.constant.int 2
    %int0_4956 = torch.constant.int 0
    %int9223372036854775807_4957 = torch.constant.int 9223372036854775807
    %int1_4958 = torch.constant.int 1
    %3979 = torch.aten.slice.Tensor %3978, %int2_4955, %int0_4956, %int9223372036854775807_4957, %int1_4958 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Split the last dim into six contiguous 3072-wide pieces: [0, 3072),
    // [3072, 6144), [6144, 9216), [9216, 12288), [12288, 15360), [15360, 18432).
    // None of %3980..%3985 is used within this chunk.
    // NOTE(review): presumably shift1, scale1, gate1, shift2, scale2, gate2 for the
    // text stream of block 12 - confirm against their uses below this chunk.
    %int-1_4959 = torch.constant.int -1
    %int0_4960 = torch.constant.int 0
    %int3072_4961 = torch.constant.int 3072
    %int1_4962 = torch.constant.int 1
    %3980 = torch.aten.slice.Tensor %3979, %int-1_4959, %int0_4960, %int3072_4961, %int1_4962 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_4963 = torch.constant.int -1
    %int3072_4964 = torch.constant.int 3072
    %int6144_4965 = torch.constant.int 6144
    %int1_4966 = torch.constant.int 1
    %3981 = torch.aten.slice.Tensor %3979, %int-1_4963, %int3072_4964, %int6144_4965, %int1_4966 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_4967 = torch.constant.int -1
    %int6144_4968 = torch.constant.int 6144
    %int9216_4969 = torch.constant.int 9216
    %int1_4970 = torch.constant.int 1
    %3982 = torch.aten.slice.Tensor %3979, %int-1_4967, %int6144_4968, %int9216_4969, %int1_4970 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_4971 = torch.constant.int -1
    %int9216_4972 = torch.constant.int 9216
    %int12288_4973 = torch.constant.int 12288
    %int1_4974 = torch.constant.int 1
    %3983 = torch.aten.slice.Tensor %3979, %int-1_4971, %int9216_4972, %int12288_4973, %int1_4974 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_4975 = torch.constant.int -1
    %int12288_4976 = torch.constant.int 12288
    %int15360_4977 = torch.constant.int 15360
    %int1_4978 = torch.constant.int 1
    %3984 = torch.aten.slice.Tensor %3979, %int-1_4975, %int12288_4976, %int15360_4977, %int1_4978 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_4979 = torch.constant.int -1
    %int15360_4980 = torch.constant.int 15360
    %int18432_4981 = torch.constant.int 18432
    %int1_4982 = torch.constant.int 1
    %3985 = torch.aten.slice.Tensor %3979, %int-1_4979, %int15360_4980, %int18432_4981, %int1_4982 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // ---- double_blocks.12 img: normalize %3883 over dim 2, then scale and shift ----
    // Computes (1 + %3960) * normalize(%3883) + %3959, where %3883 is the image
    // stream leaving block 11 and %3959 / %3960 are the first two slices of the
    // block-12 img_mod.lin output.
    // Statistics are computed in f32 (dtype code 6).
    %int6_4983 = torch.constant.int 6
    %3986 = torch.prims.convert_element_type %3883, %int6_4983 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // Variance (%result0) and mean (%result1) over dim 2, correction = 0,
    // keepdim = true, giving [1,4096,1] outputs.
    %int2_4984 = torch.constant.int 2
    %3987 = torch.prim.ListConstruct %int2_4984 : (!torch.int) -> !torch.list<int>
    %int0_4985 = torch.constant.int 0
    %true_4986 = torch.constant.bool true
    %result0_4987, %result1_4988 = torch.aten.var_mean.correction %3986, %3987, %int0_4985, %true_4986 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(variance + eps), with eps given by the float constant below (about 1e-6).
    %float9.999990e-07_4989 = torch.constant.float 9.9999999999999995E-7
    %int1_4990 = torch.constant.int 1
    %3988 = torch.aten.add.Scalar %result0_4987, %float9.999990e-07_4989, %int1_4990 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %3989 = torch.aten.rsqrt %3988 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // Center with the f32 mean: the f16 operand %3883 yields an f32 result here.
    %int1_4991 = torch.constant.int 1
    %3990 = torch.aten.sub.Tensor %3883, %result1_4988, %int1_4991 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %3991 = torch.aten.mul.Tensor %3990, %3989 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Back to f16 (dtype code 5); the scale and shift are applied in f16.
    %int5_4992 = torch.constant.int 5
    %3992 = torch.prims.convert_element_type %3991, %int5_4992 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // scale = %3960 + 1
    %int1_4993 = torch.constant.int 1
    %int1_4994 = torch.constant.int 1
    %3993 = torch.aten.add.Scalar %3960, %int1_4993, %int1_4994 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %3994 = torch.aten.mul.Tensor %3993, %3992 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_4995 = torch.constant.int 1
    %3995 = torch.aten.add.Tensor %3994, %3959, %int1_4995 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // ---- double_blocks.12 img_attn.qkv: linear 3072 -> 9216 ----
    // Flatten the modulated activations %3995 to [4096,3072] for the matmul.
    %int4096_4996 = torch.constant.int 4096
    %int3072_4997 = torch.constant.int 3072
    %3996 = torch.prim.ListConstruct %int4096_4996, %int3072_4997 : (!torch.int, !torch.int) -> !torch.list<int>
    %3997 = torch.aten.view %3995, %3996 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Load the [9216,3072] weight and transpose it to [3072,9216] for mm.
    %__auto.sampler.double_blocks.12.img_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.12.img_attn.qkv.weight : tensor<9216x3072xf16>
    %3998 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.img_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_4998 = torch.constant.int 0
    %int1_4999 = torch.constant.int 1
    %3999 = torch.aten.transpose.int %3998, %int0_4998, %int1_4999 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.12.img_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.12.img_attn.qkv.bias : tensor<9216xf16>
    %4000 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.img_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Convert bias, activations and weight to f32 (dtype code 6) before mm.
    %int6_5000 = torch.constant.int 6
    %4001 = torch.prims.convert_element_type %4000, %int6_5000 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_5001 = torch.constant.int 6
    %4002 = torch.prims.convert_element_type %3997, %int6_5001 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_5002 = torch.constant.int 6
    %4003 = torch.prims.convert_element_type %3999, %int6_5002 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %4004 = torch.aten.mm %4002, %4003 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[4096,9216],f32>
    // Multiplies by the constant 1 leave the values unchanged.
    %int1_5003 = torch.constant.int 1
    %4005 = torch.aten.mul.Scalar %4004, %int1_5003 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int1_5004 = torch.constant.int 1
    %4006 = torch.aten.mul.Scalar %4001, %int1_5004 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    // Bias add (broadcast over rows), then back to f16 (dtype code 5).
    %int1_5005 = torch.constant.int 1
    %4007 = torch.aten.add.Tensor %4005, %4006, %int1_5005 : !torch.vtensor<[4096,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int5_5006 = torch.constant.int 5
    %4008 = torch.prims.convert_element_type %4007, %int5_5006 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f16>
    // Restore the leading dimension of size 1 -> [1,4096,9216].
    %int1_5007 = torch.constant.int 1
    %int4096_5008 = torch.constant.int 4096
    %int9216_5009 = torch.constant.int 9216
    %4009 = torch.prim.ListConstruct %int1_5007, %int4096_5008, %int9216_5009 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4010 = torch.aten.view %4008, %4009 : !torch.vtensor<[4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,9216],f16>
    // ---- double_blocks.12 img: split the fused 9216-wide output into three tensors ----
    // View the 9216 features as 3 x 24 x 128 -> [1,4096,3,24,128].
    // NOTE(review): presumably (q/k/v, heads, head_dim) - confirm.
    %int1_5010 = torch.constant.int 1
    %int4096_5011 = torch.constant.int 4096
    %int3_5012 = torch.constant.int 3
    %int24_5013 = torch.constant.int 24
    %int128_5014 = torch.constant.int 128
    %4011 = torch.prim.ListConstruct %int1_5010, %int4096_5011, %int3_5012, %int24_5013, %int128_5014 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4012 = torch.aten.view %4010, %4011 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    // Permute with order (2, 0, 3, 1, 4) -> [3,1,24,4096,128], moving the size-3
    // dimension to the front.
    %int2_5015 = torch.constant.int 2
    %int0_5016 = torch.constant.int 0
    %int3_5017 = torch.constant.int 3
    %int1_5018 = torch.constant.int 1
    %int4_5019 = torch.constant.int 4
    %4013 = torch.prim.ListConstruct %int2_5015, %int0_5016, %int3_5017, %int1_5018, %int4_5019 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4014 = torch.aten.permute %4012, %4013 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // Select indices 0, 1 and 2 along dim 0, each of shape [1,24,4096,128].
    // NOTE(review): presumably query, key and value in that order - confirm.
    %int0_5020 = torch.constant.int 0
    %int0_5021 = torch.constant.int 0
    %4015 = torch.aten.select.int %4014, %int0_5020, %int0_5021 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int0_5022 = torch.constant.int 0
    %int1_5023 = torch.constant.int 1
    %4016 = torch.aten.select.int %4014, %int0_5022, %int1_5023 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int0_5024 = torch.constant.int 0
    %int2_5025 = torch.constant.int 2
    %4017 = torch.aten.select.int %4014, %int0_5024, %int2_5025 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // RMS-normalize %4015 over its last axis, in f32:
    //   y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps), eps ~= 1e-6.
    %int6_5026 = torch.constant.int 6
    %4018 = torch.prims.convert_element_type %4015, %int6_5026 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    // Element-wise square.
    %int2_5027 = torch.constant.int 2
    %4019 = torch.aten.pow.Tensor_Scalar %4018, %int2_5027 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    // Mean over the last axis with keepdim=true -> [1,24,4096,1].
    %int-1_5028 = torch.constant.int -1
    %4020 = torch.prim.ListConstruct %int-1_5028 : (!torch.int) -> !torch.list<int>
    %true_5029 = torch.constant.bool true
    %none_5030 = torch.constant.none
    %4021 = torch.aten.mean.dim %4019, %4020, %true_5029, %none_5030 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    // Add the epsilon (alpha = 1) before taking the reciprocal square root.
    %float9.999990e-07_5031 = torch.constant.float 9.9999999999999995E-7
    %int1_5032 = torch.constant.int 1
    %4022 = torch.aten.add.Scalar %4021, %float9.999990e-07_5031, %int1_5032 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %4023 = torch.aten.rsqrt %4022 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    // Scale the f32 input by the per-row factor, then downcast to f16.
    %4024 = torch.aten.mul.Tensor %4018, %4023 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    %int5_5033 = torch.constant.int 5
    %4025 = torch.prims.convert_element_type %4024, %int5_5033 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Apply the [128] query_norm scale parameter, broadcast over the last axis.
    %__auto.sampler.double_blocks.12.img_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.12.img_attn.norm.query_norm.scale : tensor<128xf16>
    %4026 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.img_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %4027 = torch.aten.mul.Tensor %4025, %4026 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // RMS-normalize %4016 over its last axis, in f32 (same recipe as the query path):
    //   y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps), eps ~= 1e-6.
    %int6_5034 = torch.constant.int 6
    %4028 = torch.prims.convert_element_type %4016, %int6_5034 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    // Element-wise square.
    %int2_5035 = torch.constant.int 2
    %4029 = torch.aten.pow.Tensor_Scalar %4028, %int2_5035 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    // Mean over the last axis with keepdim=true -> [1,24,4096,1].
    %int-1_5036 = torch.constant.int -1
    %4030 = torch.prim.ListConstruct %int-1_5036 : (!torch.int) -> !torch.list<int>
    %true_5037 = torch.constant.bool true
    %none_5038 = torch.constant.none
    %4031 = torch.aten.mean.dim %4029, %4030, %true_5037, %none_5038 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    // Add the epsilon (alpha = 1) before taking the reciprocal square root.
    %float9.999990e-07_5039 = torch.constant.float 9.9999999999999995E-7
    %int1_5040 = torch.constant.int 1
    %4032 = torch.aten.add.Scalar %4031, %float9.999990e-07_5039, %int1_5040 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %4033 = torch.aten.rsqrt %4032 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    // Scale the f32 input by the per-row factor, then downcast to f16.
    %4034 = torch.aten.mul.Tensor %4028, %4033 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    %int5_5041 = torch.constant.int 5
    %4035 = torch.prims.convert_element_type %4034, %int5_5041 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Apply the [128] key_norm scale parameter, broadcast over the last axis.
    %__auto.sampler.double_blocks.12.img_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.12.img_attn.norm.key_norm.scale : tensor<128xf16>
    %4036 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.img_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %4037 = torch.aten.mul.Tensor %4035, %4036 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // f16 -> f16 element-type conversions of the normalized img q (%4027) and k (%4037);
    // input and result types are identical, so these do not change the data.
    // NOTE(review): likely an artifact of a `.to(dtype)` call in the exported model.
    %int5_5042 = torch.constant.int 5
    %4038 = torch.prims.convert_element_type %4027, %int5_5042 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_5043 = torch.constant.int 5
    %4039 = torch.prims.convert_element_type %4037, %int5_5043 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Normalize the [1,512,3072] tensor %3943 (defined above this chunk) over dim 2
    // without learned affine parameters, then apply an affine modulation:
    //   out = (1 + %3981) * ((x - mean) * rsqrt(var + eps)) + %3980, eps ~= 1e-6.
    // NOTE(review): %3943 is presumably the txt stream and %3981/%3980 its
    // modulation scale/shift — confirm where they are produced.
    %int6_5044 = torch.constant.int 6
    %4040 = torch.prims.convert_element_type %3943, %int6_5044 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // Variance (correction = 0) and mean over dim 2 with keepdim=true:
    // result0 is the variance, result1 the mean, both [1,512,1].
    %int2_5045 = torch.constant.int 2
    %4041 = torch.prim.ListConstruct %int2_5045 : (!torch.int) -> !torch.list<int>
    %int0_5046 = torch.constant.int 0
    %true_5047 = torch.constant.bool true
    %result0_5048, %result1_5049 = torch.aten.var_mean.correction %4040, %4041, %int0_5046, %true_5047 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(var + eps).
    %float9.999990e-07_5050 = torch.constant.float 9.9999999999999995E-7
    %int1_5051 = torch.constant.int 1
    %4042 = torch.aten.add.Scalar %result0_5048, %float9.999990e-07_5050, %int1_5051 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %4043 = torch.aten.rsqrt %4042 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // Center using the original f16 tensor; the mixed f16/f32 subtraction yields f32.
    %int1_5052 = torch.constant.int 1
    %4044 = torch.aten.sub.Tensor %3943, %result1_5049, %int1_5052 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %4045 = torch.aten.mul.Tensor %4044, %4043 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    // Downcast the normalized tensor to f16.
    %int5_5053 = torch.constant.int 5
    %4046 = torch.prims.convert_element_type %4045, %int5_5053 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // (1 + %3981), broadcast-multiplied over the 512 rows.
    %int1_5054 = torch.constant.int 1
    %int1_5055 = torch.constant.int 1
    %4047 = torch.aten.add.Scalar %3981, %int1_5054, %int1_5055 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %4048 = torch.aten.mul.Tensor %4047, %4046 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    // Add %3980, broadcast over the 512 rows (alpha = 1).
    %int1_5056 = torch.constant.int 1
    %4049 = torch.aten.add.Tensor %4048, %3980, %int1_5056 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Linear layer double_blocks.12.txt_attn.qkv: y = x @ W^T + b, computed in f32
    // and stored back as f16. Input %4049 is [1,512,3072]; output %4064 is [1,512,9216].
    // Drop the leading unit dimension so a 2-D matmul can be used.
    %int512_5057 = torch.constant.int 512
    %int3072_5058 = torch.constant.int 3072
    %4050 = torch.prim.ListConstruct %int512_5057, %int3072_5058 : (!torch.int, !torch.int) -> !torch.list<int>
    %4051 = torch.aten.view %4049, %4050 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Load the [9216,3072] weight and transpose it to [3072,9216].
    %__auto.sampler.double_blocks.12.txt_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.12.txt_attn.qkv.weight : tensor<9216x3072xf16>
    %4052 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.txt_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_5059 = torch.constant.int 0
    %int1_5060 = torch.constant.int 1
    %4053 = torch.aten.transpose.int %4052, %int0_5059, %int1_5060 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    // Load the [9216] bias.
    %__auto.sampler.double_blocks.12.txt_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.12.txt_attn.qkv.bias : tensor<9216xf16>
    %4054 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.txt_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, activation and weight to f32.
    %int6_5061 = torch.constant.int 6
    %4055 = torch.prims.convert_element_type %4054, %int6_5061 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_5062 = torch.constant.int 6
    %4056 = torch.prims.convert_element_type %4051, %int6_5062 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_5063 = torch.constant.int 6
    %4057 = torch.prims.convert_element_type %4053, %int6_5063 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    // [512,3072] x [3072,9216] -> [512,9216].
    %4058 = torch.aten.mm %4056, %4057 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[512,9216],f32>
    // Multiplications by the integer 1 leave the product and the bias unchanged.
    %int1_5064 = torch.constant.int 1
    %4059 = torch.aten.mul.Scalar %4058, %int1_5064 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int1_5065 = torch.constant.int 1
    %4060 = torch.aten.mul.Scalar %4055, %int1_5065 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    // Add the bias to every row (alpha = 1), then downcast to f16.
    %int1_5066 = torch.constant.int 1
    %4061 = torch.aten.add.Tensor %4059, %4060, %int1_5066 : !torch.vtensor<[512,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int5_5067 = torch.constant.int 5
    %4062 = torch.prims.convert_element_type %4061, %int5_5067 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f16>
    // Restore the leading unit dimension: [512,9216] -> [1,512,9216].
    %int1_5068 = torch.constant.int 1
    %int512_5069 = torch.constant.int 512
    %int9216_5070 = torch.constant.int 9216
    %4063 = torch.prim.ListConstruct %int1_5068, %int512_5069, %int9216_5070 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4064 = torch.aten.view %4062, %4063 : !torch.vtensor<[512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,9216],f16>
    // Split the fused txt projection into 3 x 24 x 128:
    // [1,512,9216] -> [1,512,3,24,128].
    // NOTE(review): the 3/24/128 axes look like (q|k|v, heads, head_dim) — confirm.
    %int1_5071 = torch.constant.int 1
    %int512_5072 = torch.constant.int 512
    %int3_5073 = torch.constant.int 3
    %int24_5074 = torch.constant.int 24
    %int128_5075 = torch.constant.int 128
    %4065 = torch.prim.ListConstruct %int1_5071, %int512_5072, %int3_5073, %int24_5074, %int128_5075 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4066 = torch.aten.view %4064, %4065 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    // Permute with order (2,0,3,1,4) so the size-3 axis leads: -> [3,1,24,512,128].
    %int2_5076 = torch.constant.int 2
    %int0_5077 = torch.constant.int 0
    %int3_5078 = torch.constant.int 3
    %int1_5079 = torch.constant.int 1
    %int4_5080 = torch.constant.int 4
    %4067 = torch.prim.ListConstruct %int2_5076, %int0_5077, %int3_5078, %int1_5079, %int4_5080 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4068 = torch.aten.permute %4066, %4067 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // Index 0 of the leading axis; later normalized with txt_attn.norm.query_norm.scale.
    %int0_5081 = torch.constant.int 0
    %int0_5082 = torch.constant.int 0
    %4069 = torch.aten.select.int %4068, %int0_5081, %int0_5082 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Index 1 of the leading axis; later normalized with txt_attn.norm.key_norm.scale.
    %int0_5083 = torch.constant.int 0
    %int1_5084 = torch.constant.int 1
    %4070 = torch.aten.select.int %4068, %int0_5083, %int1_5084 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Index 2 of the leading axis; later concatenated into the third attention operand.
    %int0_5085 = torch.constant.int 0
    %int2_5086 = torch.constant.int 2
    %4071 = torch.aten.select.int %4068, %int0_5085, %int2_5086 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // RMS-normalize %4069 over its last axis, in f32:
    //   y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps), eps ~= 1e-6.
    %int6_5087 = torch.constant.int 6
    %4072 = torch.prims.convert_element_type %4069, %int6_5087 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    // Element-wise square.
    %int2_5088 = torch.constant.int 2
    %4073 = torch.aten.pow.Tensor_Scalar %4072, %int2_5088 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    // Mean over the last axis with keepdim=true -> [1,24,512,1].
    %int-1_5089 = torch.constant.int -1
    %4074 = torch.prim.ListConstruct %int-1_5089 : (!torch.int) -> !torch.list<int>
    %true_5090 = torch.constant.bool true
    %none_5091 = torch.constant.none
    %4075 = torch.aten.mean.dim %4073, %4074, %true_5090, %none_5091 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    // Add the epsilon (alpha = 1) before taking the reciprocal square root.
    %float9.999990e-07_5092 = torch.constant.float 9.9999999999999995E-7
    %int1_5093 = torch.constant.int 1
    %4076 = torch.aten.add.Scalar %4075, %float9.999990e-07_5092, %int1_5093 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %4077 = torch.aten.rsqrt %4076 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    // Scale the f32 input by the per-row factor, then downcast to f16.
    %4078 = torch.aten.mul.Tensor %4072, %4077 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_5094 = torch.constant.int 5
    %4079 = torch.prims.convert_element_type %4078, %int5_5094 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Apply the [128] query_norm scale parameter, broadcast over the last axis.
    %__auto.sampler.double_blocks.12.txt_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.12.txt_attn.norm.query_norm.scale : tensor<128xf16>
    %4080 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.txt_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %4081 = torch.aten.mul.Tensor %4079, %4080 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // RMS-normalize %4070 over its last axis, in f32 (same recipe as the query path):
    //   y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps), eps ~= 1e-6.
    %int6_5095 = torch.constant.int 6
    %4082 = torch.prims.convert_element_type %4070, %int6_5095 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    // Element-wise square.
    %int2_5096 = torch.constant.int 2
    %4083 = torch.aten.pow.Tensor_Scalar %4082, %int2_5096 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    // Mean over the last axis with keepdim=true -> [1,24,512,1].
    %int-1_5097 = torch.constant.int -1
    %4084 = torch.prim.ListConstruct %int-1_5097 : (!torch.int) -> !torch.list<int>
    %true_5098 = torch.constant.bool true
    %none_5099 = torch.constant.none
    %4085 = torch.aten.mean.dim %4083, %4084, %true_5098, %none_5099 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    // Add the epsilon (alpha = 1) before taking the reciprocal square root.
    %float9.999990e-07_5100 = torch.constant.float 9.9999999999999995E-7
    %int1_5101 = torch.constant.int 1
    %4086 = torch.aten.add.Scalar %4085, %float9.999990e-07_5100, %int1_5101 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %4087 = torch.aten.rsqrt %4086 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    // Scale the f32 input by the per-row factor, then downcast to f16.
    %4088 = torch.aten.mul.Tensor %4082, %4087 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_5102 = torch.constant.int 5
    %4089 = torch.prims.convert_element_type %4088, %int5_5102 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Apply the [128] key_norm scale parameter, broadcast over the last axis.
    %__auto.sampler.double_blocks.12.txt_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.12.txt_attn.norm.key_norm.scale : tensor<128xf16>
    %4090 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.txt_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %4091 = torch.aten.mul.Tensor %4089, %4090 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // f16 -> f16 element-type conversions of the normalized txt q (%4081) and k (%4091);
    // input and result types are identical, so these do not change the data.
    // NOTE(review): likely an artifact of a `.to(dtype)` call in the exported model.
    %int5_5103 = torch.constant.int 5
    %4092 = torch.prims.convert_element_type %4081, %int5_5103 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int5_5104 = torch.constant.int 5
    %4093 = torch.prims.convert_element_type %4091, %int5_5104 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Concatenate the txt (512) and img (4096) tensors along dim 2, txt first,
    // giving 4608 positions per head.
    // Queries: txt %4092 followed by img %4038.
    %4094 = torch.prim.ListConstruct %4092, %4038 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_5105 = torch.constant.int 2
    %4095 = torch.aten.cat %4094, %int2_5105 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Keys: txt %4093 followed by img %4039.
    %4096 = torch.prim.ListConstruct %4093, %4039 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_5106 = torch.constant.int 2
    %4097 = torch.aten.cat %4096, %int2_5106 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Values (not normalized): txt %4071 followed by img %4017.
    %4098 = torch.prim.ListConstruct %4071, %4017 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_5107 = torch.constant.int 2
    %4099 = torch.aten.cat %4098, %int2_5107 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Upcast the concatenated q (%4095) to f32 and view its 128-wide last axis as
    // 64 pairs: [1,24,4608,128] -> [1,24,4608,64,1,2] (the -1 resolves to 64).
    %int6_5108 = torch.constant.int 6
    %4100 = torch.prims.convert_element_type %4095, %int6_5108 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_5109 = torch.constant.int 1
    %int24_5110 = torch.constant.int 24
    %int4608_5111 = torch.constant.int 4608
    %int-1_5112 = torch.constant.int -1
    %int1_5113 = torch.constant.int 1
    %int2_5114 = torch.constant.int 2
    %4101 = torch.prim.ListConstruct %int1_5109, %int24_5110, %int4608_5111, %int-1_5112, %int1_5113, %int2_5114 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4102 = torch.aten.view %4100, %4101 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Same upcast and pair view for the concatenated k (%4097).
    %int6_5115 = torch.constant.int 6
    %4103 = torch.prims.convert_element_type %4097, %int6_5115 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_5116 = torch.constant.int 1
    %int24_5117 = torch.constant.int 24
    %int4608_5118 = torch.constant.int 4608
    %int-1_5119 = torch.constant.int -1
    %int1_5120 = torch.constant.int 1
    %int2_5121 = torch.constant.int 2
    %4104 = torch.prim.ListConstruct %int1_5116, %int24_5117, %int4608_5118, %int-1_5119, %int1_5120, %int2_5121 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4105 = torch.aten.view %4103, %4104 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Combine the paired q (%4102) with the [1,1,4608,64,2,2] table %211
    // (defined above this chunk):
    //   out = %211[..., 0] * q[..., 0] + %211[..., 1] * q[..., 1]
    // NOTE(review): %211 is presumably the rotary position-embedding table of
    // per-position 2x2 rotation entries — confirm where it is built.
    // First term: index 0 along the last axis (dim 5) of both operands.
    %int5_5122 = torch.constant.int 5
    %int0_5123 = torch.constant.int 0
    %4106 = torch.aten.select.int %211, %int5_5122, %int0_5123 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_5124 = torch.constant.int 5
    %int0_5125 = torch.constant.int 0
    %4107 = torch.aten.select.int %4102, %int5_5124, %int0_5125 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    // Broadcasts over dim 1 (1 -> 24) and the last axis (1 -> 2).
    %4108 = torch.aten.mul.Tensor %4106, %4107 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second term: index 1 along the last axis of both operands.
    %int5_5126 = torch.constant.int 5
    %int1_5127 = torch.constant.int 1
    %4109 = torch.aten.select.int %211, %int5_5126, %int1_5127 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_5128 = torch.constant.int 5
    %int1_5129 = torch.constant.int 1
    %4110 = torch.aten.select.int %4102, %int5_5128, %int1_5129 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %4111 = torch.aten.mul.Tensor %4109, %4110 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum the two terms (alpha = 1).
    %int1_5130 = torch.constant.int 1
    %4112 = torch.aten.add.Tensor %4108, %4111, %int1_5130 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Combine the paired k (%4105) with the [1,1,4608,64,2,2] table %211
    // (defined above this chunk), exactly as done for q:
    //   out = %211[..., 0] * k[..., 0] + %211[..., 1] * k[..., 1]
    // NOTE(review): %211 is presumably the rotary position-embedding table — confirm.
    // First term: index 0 along the last axis (dim 5) of both operands.
    %int5_5131 = torch.constant.int 5
    %int0_5132 = torch.constant.int 0
    %4113 = torch.aten.select.int %211, %int5_5131, %int0_5132 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_5133 = torch.constant.int 5
    %int0_5134 = torch.constant.int 0
    %4114 = torch.aten.select.int %4105, %int5_5133, %int0_5134 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    // Broadcasts over dim 1 (1 -> 24) and the last axis (1 -> 2).
    %4115 = torch.aten.mul.Tensor %4113, %4114 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second term: index 1 along the last axis of both operands.
    %int5_5135 = torch.constant.int 5
    %int1_5136 = torch.constant.int 1
    %4116 = torch.aten.select.int %211, %int5_5135, %int1_5136 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_5137 = torch.constant.int 5
    %int1_5138 = torch.constant.int 1
    %4117 = torch.aten.select.int %4105, %int5_5137, %int1_5138 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %4118 = torch.aten.mul.Tensor %4116, %4117 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum the two terms (alpha = 1).
    %int1_5139 = torch.constant.int 1
    %4119 = torch.aten.add.Tensor %4115, %4118, %int1_5139 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Flatten the (64,2) pair axes of the transformed q (%4112) back to 128 and
    // downcast to f16: [1,24,4608,64,2] f32 -> [1,24,4608,128] f16.
    %int1_5140 = torch.constant.int 1
    %int24_5141 = torch.constant.int 24
    %int4608_5142 = torch.constant.int 4608
    %int128_5143 = torch.constant.int 128
    %4120 = torch.prim.ListConstruct %int1_5140, %int24_5141, %int4608_5142, %int128_5143 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4121 = torch.aten.view %4112, %4120 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_5144 = torch.constant.int 5
    %4122 = torch.prims.convert_element_type %4121, %int5_5144 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Same flatten and downcast for the transformed k (%4119).
    %int1_5145 = torch.constant.int 1
    %int24_5146 = torch.constant.int 24
    %int4608_5147 = torch.constant.int 4608
    %int128_5148 = torch.constant.int 128
    %4123 = torch.prim.ListConstruct %int1_5145, %int24_5146, %int4608_5147, %int128_5148 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4124 = torch.aten.view %4119, %4123 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_5149 = torch.constant.int 5
    %4125 = torch.prims.convert_element_type %4124, %int5_5149 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled-dot-product attention over the joint 4608-position sequence with
    // q = %4122, k = %4125, v = %4099. The remaining operands are 0.0, false, none, none.
    // NOTE(review): per the ATen schema these should be dropout_p, is_causal,
    // attn_mask and scale (none = default scaling) — confirm against the op definition.
    // Result #0 is the [1,24,4608,128] attention output; result #1 is a
    // [1,24,4608] f32 tensor that is not used within this chunk.
    %float0.000000e00_5150 = torch.constant.float 0.000000e+00
    %false_5151 = torch.constant.bool false
    %none_5152 = torch.constant.none
    %none_5153 = torch.constant.none
    %4126:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%4122, %4125, %4099, %float0.000000e00_5150, %false_5151, %none_5152, %none_5153) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2 of the attention output: [1,24,4608,128] -> [1,4608,24,128].
    %int0_5154 = torch.constant.int 0
    %int2_5155 = torch.constant.int 2
    %int1_5156 = torch.constant.int 1
    %int3_5157 = torch.constant.int 3
    %4127 = torch.prim.ListConstruct %int0_5154, %int2_5155, %int1_5156, %int3_5157 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4128 = torch.aten.permute %4126#0, %4127 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the trailing 24 x 128 axes into 3072: -> [1,4608,3072].
    %int1_5158 = torch.constant.int 1
    %int4608_5159 = torch.constant.int 4608
    %int3072_5160 = torch.constant.int 3072
    %4129 = torch.prim.ListConstruct %int1_5158, %int4608_5159, %int3072_5160 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4130 = torch.aten.view %4128, %4129 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Split the joint attention output back into its two parts along dim 1,
    // mirroring the txt-then-img concatenation order used for q/k/v.
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): shape is unchanged.
    %int0_5161 = torch.constant.int 0
    %int0_5162 = torch.constant.int 0
    %int9223372036854775807_5163 = torch.constant.int 9223372036854775807
    %int1_5164 = torch.constant.int 1
    %4131 = torch.aten.slice.Tensor %4130, %int0_5161, %int0_5162, %int9223372036854775807_5163, %int1_5164 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Positions [0, 512) on dim 1 -> [1,512,3072] (the txt part; not consumed in this chunk).
    %int1_5165 = torch.constant.int 1
    %int0_5166 = torch.constant.int 0
    %int512_5167 = torch.constant.int 512
    %int1_5168 = torch.constant.int 1
    %4132 = torch.aten.slice.Tensor %4131, %int1_5165, %int0_5166, %int512_5167, %int1_5168 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Another full-range slice on dim 0: shape is unchanged.
    %int0_5169 = torch.constant.int 0
    %int0_5170 = torch.constant.int 0
    %int9223372036854775807_5171 = torch.constant.int 9223372036854775807
    %int1_5172 = torch.constant.int 1
    %4133 = torch.aten.slice.Tensor %4130, %int0_5169, %int0_5170, %int9223372036854775807_5171, %int1_5172 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Positions [512, end) on dim 1 -> [1,4096,3072] (the img part).
    %int1_5173 = torch.constant.int 1
    %int512_5174 = torch.constant.int 512
    %int9223372036854775807_5175 = torch.constant.int 9223372036854775807
    %int1_5176 = torch.constant.int 1
    %4134 = torch.aten.slice.Tensor %4133, %int1_5173, %int512_5174, %int9223372036854775807_5175, %int1_5176 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer double_blocks.12.img_attn.proj: y = x @ W^T + b, computed in f32
    // and stored back as f16. Input %4134 is [1,4096,3072]; output %4149 is [1,4096,3072].
    // Drop the leading unit dimension so a 2-D matmul can be used.
    %int4096_5177 = torch.constant.int 4096
    %int3072_5178 = torch.constant.int 3072
    %4135 = torch.prim.ListConstruct %int4096_5177, %int3072_5178 : (!torch.int, !torch.int) -> !torch.list<int>
    %4136 = torch.aten.view %4134, %4135 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Load the [3072,3072] weight and transpose it.
    %__auto.sampler.double_blocks.12.img_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.12.img_attn.proj.weight : tensor<3072x3072xf16>
    %4137 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.img_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_5179 = torch.constant.int 0
    %int1_5180 = torch.constant.int 1
    %4138 = torch.aten.transpose.int %4137, %int0_5179, %int1_5180 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    // Load the [3072] bias.
    %__auto.sampler.double_blocks.12.img_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.12.img_attn.proj.bias : tensor<3072xf16>
    %4139 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.img_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, activation and weight to f32.
    %int6_5181 = torch.constant.int 6
    %4140 = torch.prims.convert_element_type %4139, %int6_5181 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_5182 = torch.constant.int 6
    %4141 = torch.prims.convert_element_type %4136, %int6_5182 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_5183 = torch.constant.int 6
    %4142 = torch.prims.convert_element_type %4138, %int6_5183 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    // [4096,3072] x [3072,3072] -> [4096,3072].
    %4143 = torch.aten.mm %4141, %4142 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Multiplications by the integer 1 leave the product and the bias unchanged.
    %int1_5184 = torch.constant.int 1
    %4144 = torch.aten.mul.Scalar %4143, %int1_5184 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_5185 = torch.constant.int 1
    %4145 = torch.aten.mul.Scalar %4140, %int1_5185 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Add the bias to every row (alpha = 1), then downcast to f16.
    %int1_5186 = torch.constant.int 1
    %4146 = torch.aten.add.Tensor %4144, %4145, %int1_5186 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_5187 = torch.constant.int 5
    %4147 = torch.prims.convert_element_type %4146, %int5_5187 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    // Restore the leading unit dimension: [4096,3072] -> [1,4096,3072].
    %int1_5188 = torch.constant.int 1
    %int4096_5189 = torch.constant.int 4096
    %int3072_5190 = torch.constant.int 3072
    %4148 = torch.prim.ListConstruct %int1_5188, %int4096_5189, %int3072_5190 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4149 = torch.aten.view %4147, %4148 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Residual update: %4151 = %3883 + %3961 * %4149 (alpha = 1), with the
    // [1,1,3072] factor %3961 broadcast over the 4096 rows.
    // NOTE(review): %3883 is presumably the img residual stream and %3961 the
    // attention gate from the block's modulation — both are defined above this chunk.
    %4150 = torch.aten.mul.Tensor %3961, %4149 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_5191 = torch.constant.int 1
    %4151 = torch.aten.add.Tensor %3883, %4150, %int1_5191 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Normalize the updated [1,4096,3072] tensor %4151 over dim 2 without learned
    // affine parameters, then apply an affine modulation:
    //   out = (1 + %3963) * ((x - mean) * rsqrt(var + eps)) + %3962, eps ~= 1e-6.
    // NOTE(review): %3963/%3962 are presumably the MLP-branch modulation
    // scale/shift — both are defined above this chunk; confirm.
    // (1 + %3963), computed up front.
    %int1_5192 = torch.constant.int 1
    %int1_5193 = torch.constant.int 1
    %4152 = torch.aten.add.Scalar %3963, %int1_5192, %int1_5193 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Upcast to f32 for the statistics.
    %int6_5194 = torch.constant.int 6
    %4153 = torch.prims.convert_element_type %4151, %int6_5194 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // Variance (correction = 0) and mean over dim 2 with keepdim=true:
    // result0 is the variance, result1 the mean, both [1,4096,1].
    %int2_5195 = torch.constant.int 2
    %4154 = torch.prim.ListConstruct %int2_5195 : (!torch.int) -> !torch.list<int>
    %int0_5196 = torch.constant.int 0
    %true_5197 = torch.constant.bool true
    %result0_5198, %result1_5199 = torch.aten.var_mean.correction %4153, %4154, %int0_5196, %true_5197 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(var + eps).
    %float9.999990e-07_5200 = torch.constant.float 9.9999999999999995E-7
    %int1_5201 = torch.constant.int 1
    %4155 = torch.aten.add.Scalar %result0_5198, %float9.999990e-07_5200, %int1_5201 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %4156 = torch.aten.rsqrt %4155 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // Center using the original f16 tensor; the mixed f16/f32 subtraction yields f32.
    %int1_5202 = torch.constant.int 1
    %4157 = torch.aten.sub.Tensor %4151, %result1_5199, %int1_5202 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %4158 = torch.aten.mul.Tensor %4157, %4156 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Downcast the normalized tensor to f16, then scale and shift it.
    %int5_5203 = torch.constant.int 5
    %4159 = torch.prims.convert_element_type %4158, %int5_5203 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %4160 = torch.aten.mul.Tensor %4152, %4159 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_5204 = torch.constant.int 1
    %4161 = torch.aten.add.Tensor %4160, %3962, %int1_5204 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer with double_blocks.12.img_mlp.0 followed by tanh-approximated
    // GELU. Flatten %4161 from [1,4096,3072] to [4096,3072] for the 2-D matmul.
    %int4096_5205 = torch.constant.int 4096
    %int3072_5206 = torch.constant.int 3072
    %4162 = torch.prim.ListConstruct %int4096_5205, %int3072_5206 : (!torch.int, !torch.int) -> !torch.list<int>
    %4163 = torch.aten.view %4161, %4162 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Load the [12288,3072] weight and transpose it to [3072,12288].
    %__auto.sampler.double_blocks.12.img_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.12.img_mlp.0.weight : tensor<12288x3072xf16>
    %4164 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.img_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_5207 = torch.constant.int 0
    %int1_5208 = torch.constant.int 1
    %4165 = torch.aten.transpose.int %4164, %int0_5207, %int1_5208 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.12.img_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.12.img_mlp.0.bias : tensor<12288xf16>
    %4166 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.img_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Bias, activation and weight are all converted to f32 before the matmul.
    %int6_5209 = torch.constant.int 6
    %4167 = torch.prims.convert_element_type %4166, %int6_5209 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_5210 = torch.constant.int 6
    %4168 = torch.prims.convert_element_type %4163, %int6_5210 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_5211 = torch.constant.int 6
    %4169 = torch.prims.convert_element_type %4165, %int6_5211 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %4170 = torch.aten.mm %4168, %4169 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[4096,12288],f32>
    // Both the product and the bias are multiplied by the constant 1 before
    // being added; presumably leftover alpha/beta factors from the exporter.
    %int1_5212 = torch.constant.int 1
    %4171 = torch.aten.mul.Scalar %4170, %int1_5212 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int1_5213 = torch.constant.int 1
    %4172 = torch.aten.mul.Scalar %4167, %int1_5213 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_5214 = torch.constant.int 1
    %4173 = torch.aten.add.Tensor %4171, %4172, %int1_5214 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    // Cast the result to f16 and restore the leading batch dimension.
    %int5_5215 = torch.constant.int 5
    %4174 = torch.prims.convert_element_type %4173, %int5_5215 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f16>
    %int1_5216 = torch.constant.int 1
    %int4096_5217 = torch.constant.int 4096
    %int12288_5218 = torch.constant.int 12288
    %4175 = torch.prim.ListConstruct %int1_5216, %int4096_5217, %int12288_5218 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4176 = torch.aten.view %4174, %4175 : !torch.vtensor<[4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,12288],f16>
    // GELU with approximate = "tanh", computed on the f16 tensor.
    %str_5219 = torch.constant.str "tanh"
    %4177 = torch.aten.gelu %4176, %str_5219 : !torch.vtensor<[1,4096,12288],f16>, !torch.str -> !torch.vtensor<[1,4096,12288],f16>
    // Flatten again to [4096,12288] for the following matmul.
    %int4096_5220 = torch.constant.int 4096
    %int12288_5221 = torch.constant.int 12288
    %4178 = torch.prim.ListConstruct %int4096_5220, %int12288_5221 : (!torch.int, !torch.int) -> !torch.list<int>
    %4179 = torch.aten.view %4177, %4178 : !torch.vtensor<[1,4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[4096,12288],f16>
    // Linear layer with double_blocks.12.img_mlp.2 (12288 -> 3072 features),
    // then a scaled residual add onto %4151.
    // Load the [3072,12288] weight and transpose it to [12288,3072].
    %__auto.sampler.double_blocks.12.img_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.12.img_mlp.2.weight : tensor<3072x12288xf16>
    %4180 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.img_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_5222 = torch.constant.int 0
    %int1_5223 = torch.constant.int 1
    %4181 = torch.aten.transpose.int %4180, %int0_5222, %int1_5223 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.12.img_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.12.img_mlp.2.bias : tensor<3072xf16>
    %4182 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.img_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Bias, activation (%4179) and weight are converted to f32 for the matmul.
    %int6_5224 = torch.constant.int 6
    %4183 = torch.prims.convert_element_type %4182, %int6_5224 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_5225 = torch.constant.int 6
    %4184 = torch.prims.convert_element_type %4179, %int6_5225 : !torch.vtensor<[4096,12288],f16>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int6_5226 = torch.constant.int 6
    %4185 = torch.prims.convert_element_type %4181, %int6_5226 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %4186 = torch.aten.mm %4184, %4185 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Product and bias are each multiplied by the constant 1, then added.
    %int1_5227 = torch.constant.int 1
    %4187 = torch.aten.mul.Scalar %4186, %int1_5227 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_5228 = torch.constant.int 1
    %4188 = torch.aten.mul.Scalar %4183, %int1_5228 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_5229 = torch.constant.int 1
    %4189 = torch.aten.add.Tensor %4187, %4188, %int1_5229 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    // Cast to f16 and reshape back to [1,4096,3072].
    %int5_5230 = torch.constant.int 5
    %4190 = torch.prims.convert_element_type %4189, %int5_5230 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_5231 = torch.constant.int 1
    %int4096_5232 = torch.constant.int 4096
    %int3072_5233 = torch.constant.int 3072
    %4191 = torch.prim.ListConstruct %int1_5231, %int4096_5232, %int3072_5233 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4192 = torch.aten.view %4190, %4191 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // %4194 = %4151 + %3964 * %4192. %3964 is defined outside this view;
    // presumably a per-channel gate -- TODO confirm against producer.
    %4193 = torch.aten.mul.Tensor %3964, %4192 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_5234 = torch.constant.int 1
    %4194 = torch.aten.add.Tensor %4151, %4193, %int1_5234 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer with double_blocks.12.txt_attn.proj applied to %4132
    // (produced outside this view), then a scaled residual add onto %3943.
    // Flatten [1,512,3072] to [512,3072] for the 2-D matmul.
    %int512_5235 = torch.constant.int 512
    %int3072_5236 = torch.constant.int 3072
    %4195 = torch.prim.ListConstruct %int512_5235, %int3072_5236 : (!torch.int, !torch.int) -> !torch.list<int>
    %4196 = torch.aten.view %4132, %4195 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Load the [3072,3072] weight and swap its two dimensions.
    %__auto.sampler.double_blocks.12.txt_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.12.txt_attn.proj.weight : tensor<3072x3072xf16>
    %4197 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.txt_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_5237 = torch.constant.int 0
    %int1_5238 = torch.constant.int 1
    %4198 = torch.aten.transpose.int %4197, %int0_5237, %int1_5238 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.12.txt_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.12.txt_attn.proj.bias : tensor<3072xf16>
    %4199 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.txt_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Bias, activation and weight are converted to f32 for the matmul.
    %int6_5239 = torch.constant.int 6
    %4200 = torch.prims.convert_element_type %4199, %int6_5239 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_5240 = torch.constant.int 6
    %4201 = torch.prims.convert_element_type %4196, %int6_5240 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_5241 = torch.constant.int 6
    %4202 = torch.prims.convert_element_type %4198, %int6_5241 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %4203 = torch.aten.mm %4201, %4202 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Product and bias are each multiplied by the constant 1, then added.
    %int1_5242 = torch.constant.int 1
    %4204 = torch.aten.mul.Scalar %4203, %int1_5242 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_5243 = torch.constant.int 1
    %4205 = torch.aten.mul.Scalar %4200, %int1_5243 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_5244 = torch.constant.int 1
    %4206 = torch.aten.add.Tensor %4204, %4205, %int1_5244 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    // Cast to f16 and reshape back to [1,512,3072].
    %int5_5245 = torch.constant.int 5
    %4207 = torch.prims.convert_element_type %4206, %int5_5245 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_5246 = torch.constant.int 1
    %int512_5247 = torch.constant.int 512
    %int3072_5248 = torch.constant.int 3072
    %4208 = torch.prim.ListConstruct %int1_5246, %int512_5247, %int3072_5248 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4209 = torch.aten.view %4207, %4208 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // %4211 = %3943 + %3982 * %4209. %3982 and %3943 are defined outside this
    // view; presumably a gate and the residual input -- TODO confirm.
    %4210 = torch.aten.mul.Tensor %3982, %4209 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_5249 = torch.constant.int 1
    %4211 = torch.aten.add.Tensor %3943, %4210, %int1_5249 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Normalize %4211 over dim 2, then apply a scale and a shift; the flattened
    // result (%4223) feeds the double_blocks.12.txt_mlp.0 matmul below.
    // %4212 = %3984 + 1. %3984 and %3983 are defined outside this view;
    // presumably the modulation scale and shift -- TODO confirm against producer.
    %int1_5250 = torch.constant.int 1
    %int1_5251 = torch.constant.int 1
    %4212 = torch.aten.add.Scalar %3984, %int1_5250, %int1_5251 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Statistics are taken in f32.
    %int6_5252 = torch.constant.int 6
    %4213 = torch.prims.convert_element_type %4211, %int6_5252 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // Variance and mean over dim 2 with correction = 0 and keepdim = true.
    %int2_5253 = torch.constant.int 2
    %4214 = torch.prim.ListConstruct %int2_5253 : (!torch.int) -> !torch.list<int>
    %int0_5254 = torch.constant.int 0
    %true_5255 = torch.constant.bool true
    %result0_5256, %result1_5257 = torch.aten.var_mean.correction %4213, %4214, %int0_5254, %true_5255 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(variance + eps), with eps of about 1e-6.
    %float9.999990e-07_5258 = torch.constant.float 9.9999999999999995E-7
    %int1_5259 = torch.constant.int 1
    %4215 = torch.aten.add.Scalar %result0_5256, %float9.999990e-07_5258, %int1_5259 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %4216 = torch.aten.rsqrt %4215 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // (x - mean) * rsqrt(var + eps), produced in f32.
    %int1_5260 = torch.constant.int 1
    %4217 = torch.aten.sub.Tensor %4211, %result1_5257, %int1_5260 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %4218 = torch.aten.mul.Tensor %4217, %4216 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    // Cast back to f16, then %4221 = %4212 * normalized + %3983.
    %int5_5261 = torch.constant.int 5
    %4219 = torch.prims.convert_element_type %4218, %int5_5261 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %4220 = torch.aten.mul.Tensor %4212, %4219 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_5262 = torch.constant.int 1
    %4221 = torch.aten.add.Tensor %4220, %3983, %int1_5262 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Flatten [1,512,3072] to [512,3072] for the 2-D matmul that follows.
    %int512_5263 = torch.constant.int 512
    %int3072_5264 = torch.constant.int 3072
    %4222 = torch.prim.ListConstruct %int512_5263, %int3072_5264 : (!torch.int, !torch.int) -> !torch.list<int>
    %4223 = torch.aten.view %4221, %4222 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Linear layer with double_blocks.12.txt_mlp.0 (3072 -> 12288 features)
    // applied to %4223, followed by tanh-approximated GELU.
    // Load the [12288,3072] weight and transpose it to [3072,12288].
    %__auto.sampler.double_blocks.12.txt_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.12.txt_mlp.0.weight : tensor<12288x3072xf16>
    %4224 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.txt_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_5265 = torch.constant.int 0
    %int1_5266 = torch.constant.int 1
    %4225 = torch.aten.transpose.int %4224, %int0_5265, %int1_5266 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.12.txt_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.12.txt_mlp.0.bias : tensor<12288xf16>
    %4226 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.txt_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Bias, activation and weight are converted to f32 for the matmul.
    %int6_5267 = torch.constant.int 6
    %4227 = torch.prims.convert_element_type %4226, %int6_5267 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_5268 = torch.constant.int 6
    %4228 = torch.prims.convert_element_type %4223, %int6_5268 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_5269 = torch.constant.int 6
    %4229 = torch.prims.convert_element_type %4225, %int6_5269 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %4230 = torch.aten.mm %4228, %4229 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[512,12288],f32>
    // Product and bias are each multiplied by the constant 1, then added.
    %int1_5270 = torch.constant.int 1
    %4231 = torch.aten.mul.Scalar %4230, %int1_5270 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int1_5271 = torch.constant.int 1
    %4232 = torch.aten.mul.Scalar %4227, %int1_5271 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_5272 = torch.constant.int 1
    %4233 = torch.aten.add.Tensor %4231, %4232, %int1_5272 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    // Cast to f16 and restore the leading batch dimension.
    %int5_5273 = torch.constant.int 5
    %4234 = torch.prims.convert_element_type %4233, %int5_5273 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f16>
    %int1_5274 = torch.constant.int 1
    %int512_5275 = torch.constant.int 512
    %int12288_5276 = torch.constant.int 12288
    %4235 = torch.prim.ListConstruct %int1_5274, %int512_5275, %int12288_5276 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4236 = torch.aten.view %4234, %4235 : !torch.vtensor<[512,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,512,12288],f16>
    // GELU with approximate = "tanh", computed on the f16 tensor.
    %str_5277 = torch.constant.str "tanh"
    %4237 = torch.aten.gelu %4236, %str_5277 : !torch.vtensor<[1,512,12288],f16>, !torch.str -> !torch.vtensor<[1,512,12288],f16>
    // Flatten again to [512,12288] for the following matmul.
    %int512_5278 = torch.constant.int 512
    %int12288_5279 = torch.constant.int 12288
    %4238 = torch.prim.ListConstruct %int512_5278, %int12288_5279 : (!torch.int, !torch.int) -> !torch.list<int>
    %4239 = torch.aten.view %4237, %4238 : !torch.vtensor<[1,512,12288],f16>, !torch.list<int> -> !torch.vtensor<[512,12288],f16>
    // Linear layer with double_blocks.12.txt_mlp.2 (12288 -> 3072 features),
    // then a scaled residual add onto %4211.
    // Load the [3072,12288] weight and transpose it to [12288,3072].
    %__auto.sampler.double_blocks.12.txt_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.12.txt_mlp.2.weight : tensor<3072x12288xf16>
    %4240 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.txt_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_5280 = torch.constant.int 0
    %int1_5281 = torch.constant.int 1
    %4241 = torch.aten.transpose.int %4240, %int0_5280, %int1_5281 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.12.txt_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.12.txt_mlp.2.bias : tensor<3072xf16>
    %4242 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.12.txt_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Bias, activation (%4239) and weight are converted to f32 for the matmul.
    %int6_5282 = torch.constant.int 6
    %4243 = torch.prims.convert_element_type %4242, %int6_5282 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_5283 = torch.constant.int 6
    %4244 = torch.prims.convert_element_type %4239, %int6_5283 : !torch.vtensor<[512,12288],f16>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int6_5284 = torch.constant.int 6
    %4245 = torch.prims.convert_element_type %4241, %int6_5284 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %4246 = torch.aten.mm %4244, %4245 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Product and bias are each multiplied by the constant 1, then added.
    %int1_5285 = torch.constant.int 1
    %4247 = torch.aten.mul.Scalar %4246, %int1_5285 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_5286 = torch.constant.int 1
    %4248 = torch.aten.mul.Scalar %4243, %int1_5286 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_5287 = torch.constant.int 1
    %4249 = torch.aten.add.Tensor %4247, %4248, %int1_5287 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    // Cast to f16 and reshape back to [1,512,3072].
    %int5_5288 = torch.constant.int 5
    %4250 = torch.prims.convert_element_type %4249, %int5_5288 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_5289 = torch.constant.int 1
    %int512_5290 = torch.constant.int 512
    %int3072_5291 = torch.constant.int 3072
    %4251 = torch.prim.ListConstruct %int1_5289, %int512_5290, %int3072_5291 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4252 = torch.aten.view %4250, %4251 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // %4254 = %4211 + %3985 * %4252. %3985 is defined outside this view;
    // presumably a per-channel gate -- TODO confirm against producer.
    %4253 = torch.aten.mul.Tensor %3985, %4252 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_5292 = torch.constant.int 1
    %4254 = torch.aten.add.Tensor %4211, %4253, %int1_5292 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // silu(%119) followed by a linear layer with double_blocks.13.img_mod.lin
    // (3072 -> 18432 features); the output is then cut into six 3072-wide
    // slices %4270..%4275. %119 is produced outside this view.
    %4255 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the [18432,3072] weight and transpose it to [3072,18432].
    %__auto.sampler.double_blocks.13.img_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.13.img_mod.lin.weight : tensor<18432x3072xf16>
    %4256 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.img_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_5293 = torch.constant.int 0
    %int1_5294 = torch.constant.int 1
    %4257 = torch.aten.transpose.int %4256, %int0_5293, %int1_5294 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.13.img_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.13.img_mod.lin.bias : tensor<18432xf16>
    %4258 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.img_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Bias, activation and weight are converted to f32 for the matmul.
    %int6_5295 = torch.constant.int 6
    %4259 = torch.prims.convert_element_type %4258, %int6_5295 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_5296 = torch.constant.int 6
    %4260 = torch.prims.convert_element_type %4255, %int6_5296 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_5297 = torch.constant.int 6
    %4261 = torch.prims.convert_element_type %4257, %int6_5297 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %4262 = torch.aten.mm %4260, %4261 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Product and bias are each multiplied by the constant 1, then added.
    %int1_5298 = torch.constant.int 1
    %4263 = torch.aten.mul.Scalar %4262, %int1_5298 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_5299 = torch.constant.int 1
    %4264 = torch.aten.mul.Scalar %4259, %int1_5299 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_5300 = torch.constant.int 1
    %4265 = torch.aten.add.Tensor %4263, %4264, %int1_5300 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    // Cast the [1,18432] result back to f16.
    %int5_5301 = torch.constant.int 5
    %4266 = torch.prims.convert_element_type %4265, %int5_5301 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1); shape is unchanged.
    %int0_5302 = torch.constant.int 0
    %int0_5303 = torch.constant.int 0
    %int9223372036854775807_5304 = torch.constant.int 9223372036854775807
    %int1_5305 = torch.constant.int 1
    %4267 = torch.aten.slice.Tensor %4266, %int0_5302, %int0_5303, %int9223372036854775807_5304, %int1_5305 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a size-1 axis at position 1: [1,18432] -> [1,1,18432].
    %int1_5306 = torch.constant.int 1
    %4268 = torch.aten.unsqueeze %4267, %int1_5306 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Full-range slice on dim 2; shape is unchanged.
    %int2_5307 = torch.constant.int 2
    %int0_5308 = torch.constant.int 0
    %int9223372036854775807_5309 = torch.constant.int 9223372036854775807
    %int1_5310 = torch.constant.int 1
    %4269 = torch.aten.slice.Tensor %4268, %int2_5307, %int0_5308, %int9223372036854775807_5309, %int1_5310 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // %4270 = last-axis range [0, 3072). Later added as the shift term in %4306.
    %int-1_5311 = torch.constant.int -1
    %int0_5312 = torch.constant.int 0
    %int3072_5313 = torch.constant.int 3072
    %int1_5314 = torch.constant.int 1
    %4270 = torch.aten.slice.Tensor %4269, %int-1_5311, %int0_5312, %int3072_5313, %int1_5314 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %4271 = last-axis range [3072, 6144). Later used as (%4271 + 1) scale in %4304.
    %int-1_5315 = torch.constant.int -1
    %int3072_5316 = torch.constant.int 3072
    %int6144_5317 = torch.constant.int 6144
    %int1_5318 = torch.constant.int 1
    %4271 = torch.aten.slice.Tensor %4269, %int-1_5315, %int3072_5316, %int6144_5317, %int1_5318 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %4272 = last-axis range [6144, 9216). Consumer is outside this view.
    %int-1_5319 = torch.constant.int -1
    %int6144_5320 = torch.constant.int 6144
    %int9216_5321 = torch.constant.int 9216
    %int1_5322 = torch.constant.int 1
    %4272 = torch.aten.slice.Tensor %4269, %int-1_5319, %int6144_5320, %int9216_5321, %int1_5322 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %4273 = last-axis range [9216, 12288). Consumer is outside this view.
    %int-1_5323 = torch.constant.int -1
    %int9216_5324 = torch.constant.int 9216
    %int12288_5325 = torch.constant.int 12288
    %int1_5326 = torch.constant.int 1
    %4273 = torch.aten.slice.Tensor %4269, %int-1_5323, %int9216_5324, %int12288_5325, %int1_5326 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %4274 = last-axis range [12288, 15360). Consumer is outside this view.
    %int-1_5327 = torch.constant.int -1
    %int12288_5328 = torch.constant.int 12288
    %int15360_5329 = torch.constant.int 15360
    %int1_5330 = torch.constant.int 1
    %4274 = torch.aten.slice.Tensor %4269, %int-1_5327, %int12288_5328, %int15360_5329, %int1_5330 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %4275 = last-axis range [15360, 18432). Consumer is outside this view.
    %int-1_5331 = torch.constant.int -1
    %int15360_5332 = torch.constant.int 15360
    %int18432_5333 = torch.constant.int 18432
    %int1_5334 = torch.constant.int 1
    %4275 = torch.aten.slice.Tensor %4269, %int-1_5331, %int15360_5332, %int18432_5333, %int1_5334 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // silu(%119) followed by a linear layer with double_blocks.13.txt_mod.lin
    // (3072 -> 18432 features); the output is then cut into six 3072-wide
    // slices %4291..%4296, whose consumers are outside this view.
    // NOTE(review): this silu takes the same operand (%119) as the earlier
    // %4255 = silu(%119); the duplicate is left exactly as generated.
    %4276 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the [18432,3072] weight and transpose it to [3072,18432].
    %__auto.sampler.double_blocks.13.txt_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.13.txt_mod.lin.weight : tensor<18432x3072xf16>
    %4277 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.txt_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_5335 = torch.constant.int 0
    %int1_5336 = torch.constant.int 1
    %4278 = torch.aten.transpose.int %4277, %int0_5335, %int1_5336 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.13.txt_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.13.txt_mod.lin.bias : tensor<18432xf16>
    %4279 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.txt_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Bias, activation and weight are converted to f32 for the matmul.
    %int6_5337 = torch.constant.int 6
    %4280 = torch.prims.convert_element_type %4279, %int6_5337 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_5338 = torch.constant.int 6
    %4281 = torch.prims.convert_element_type %4276, %int6_5338 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_5339 = torch.constant.int 6
    %4282 = torch.prims.convert_element_type %4278, %int6_5339 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %4283 = torch.aten.mm %4281, %4282 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Product and bias are each multiplied by the constant 1, then added.
    %int1_5340 = torch.constant.int 1
    %4284 = torch.aten.mul.Scalar %4283, %int1_5340 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_5341 = torch.constant.int 1
    %4285 = torch.aten.mul.Scalar %4280, %int1_5341 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_5342 = torch.constant.int 1
    %4286 = torch.aten.add.Tensor %4284, %4285, %int1_5342 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    // Cast the [1,18432] result back to f16.
    %int5_5343 = torch.constant.int 5
    %4287 = torch.prims.convert_element_type %4286, %int5_5343 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1); shape is unchanged.
    %int0_5344 = torch.constant.int 0
    %int0_5345 = torch.constant.int 0
    %int9223372036854775807_5346 = torch.constant.int 9223372036854775807
    %int1_5347 = torch.constant.int 1
    %4288 = torch.aten.slice.Tensor %4287, %int0_5344, %int0_5345, %int9223372036854775807_5346, %int1_5347 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a size-1 axis at position 1: [1,18432] -> [1,1,18432].
    %int1_5348 = torch.constant.int 1
    %4289 = torch.aten.unsqueeze %4288, %int1_5348 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Full-range slice on dim 2; shape is unchanged.
    %int2_5349 = torch.constant.int 2
    %int0_5350 = torch.constant.int 0
    %int9223372036854775807_5351 = torch.constant.int 9223372036854775807
    %int1_5352 = torch.constant.int 1
    %4290 = torch.aten.slice.Tensor %4289, %int2_5349, %int0_5350, %int9223372036854775807_5351, %int1_5352 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // %4291 = last-axis range [0, 3072).
    %int-1_5353 = torch.constant.int -1
    %int0_5354 = torch.constant.int 0
    %int3072_5355 = torch.constant.int 3072
    %int1_5356 = torch.constant.int 1
    %4291 = torch.aten.slice.Tensor %4290, %int-1_5353, %int0_5354, %int3072_5355, %int1_5356 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %4292 = last-axis range [3072, 6144).
    %int-1_5357 = torch.constant.int -1
    %int3072_5358 = torch.constant.int 3072
    %int6144_5359 = torch.constant.int 6144
    %int1_5360 = torch.constant.int 1
    %4292 = torch.aten.slice.Tensor %4290, %int-1_5357, %int3072_5358, %int6144_5359, %int1_5360 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %4293 = last-axis range [6144, 9216).
    %int-1_5361 = torch.constant.int -1
    %int6144_5362 = torch.constant.int 6144
    %int9216_5363 = torch.constant.int 9216
    %int1_5364 = torch.constant.int 1
    %4293 = torch.aten.slice.Tensor %4290, %int-1_5361, %int6144_5362, %int9216_5363, %int1_5364 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %4294 = last-axis range [9216, 12288).
    %int-1_5365 = torch.constant.int -1
    %int9216_5366 = torch.constant.int 9216
    %int12288_5367 = torch.constant.int 12288
    %int1_5368 = torch.constant.int 1
    %4294 = torch.aten.slice.Tensor %4290, %int-1_5365, %int9216_5366, %int12288_5367, %int1_5368 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %4295 = last-axis range [12288, 15360).
    %int-1_5369 = torch.constant.int -1
    %int12288_5370 = torch.constant.int 12288
    %int15360_5371 = torch.constant.int 15360
    %int1_5372 = torch.constant.int 1
    %4295 = torch.aten.slice.Tensor %4290, %int-1_5369, %int12288_5370, %int15360_5371, %int1_5372 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %4296 = last-axis range [15360, 18432).
    %int-1_5373 = torch.constant.int -1
    %int15360_5374 = torch.constant.int 15360
    %int18432_5375 = torch.constant.int 18432
    %int1_5376 = torch.constant.int 1
    %4296 = torch.aten.slice.Tensor %4290, %int-1_5373, %int15360_5374, %int18432_5375, %int1_5376 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %4194 over dim 2, then apply (%4271 + 1) as a scale and %4270
    // as a shift; the flattened result (%4308) feeds the
    // double_blocks.13.img_attn.qkv matmul that follows.
    // Statistics are taken in f32.
    %int6_5377 = torch.constant.int 6
    %4297 = torch.prims.convert_element_type %4194, %int6_5377 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // Variance and mean over dim 2 with correction = 0 and keepdim = true.
    %int2_5378 = torch.constant.int 2
    %4298 = torch.prim.ListConstruct %int2_5378 : (!torch.int) -> !torch.list<int>
    %int0_5379 = torch.constant.int 0
    %true_5380 = torch.constant.bool true
    %result0_5381, %result1_5382 = torch.aten.var_mean.correction %4297, %4298, %int0_5379, %true_5380 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(variance + eps), with eps of about 1e-6.
    %float9.999990e-07_5383 = torch.constant.float 9.9999999999999995E-7
    %int1_5384 = torch.constant.int 1
    %4299 = torch.aten.add.Scalar %result0_5381, %float9.999990e-07_5383, %int1_5384 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %4300 = torch.aten.rsqrt %4299 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // (x - mean) * rsqrt(var + eps), produced in f32.
    %int1_5385 = torch.constant.int 1
    %4301 = torch.aten.sub.Tensor %4194, %result1_5382, %int1_5385 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %4302 = torch.aten.mul.Tensor %4301, %4300 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Cast the normalized tensor back to f16.
    %int5_5386 = torch.constant.int 5
    %4303 = torch.prims.convert_element_type %4302, %int5_5386 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // %4306 = (%4271 + 1) * normalized + %4270, broadcasting [1,1,3072] over
    // the 4096 rows.
    %int1_5387 = torch.constant.int 1
    %int1_5388 = torch.constant.int 1
    %4304 = torch.aten.add.Scalar %4271, %int1_5387, %int1_5388 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %4305 = torch.aten.mul.Tensor %4304, %4303 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_5389 = torch.constant.int 1
    %4306 = torch.aten.add.Tensor %4305, %4270, %int1_5389 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Flatten [1,4096,3072] to [4096,3072] for the 2-D matmul that follows.
    %int4096_5390 = torch.constant.int 4096
    %int3072_5391 = torch.constant.int 3072
    %4307 = torch.prim.ListConstruct %int4096_5390, %int3072_5391 : (!torch.int, !torch.int) -> !torch.list<int>
    %4308 = torch.aten.view %4306, %4307 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.13.img_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.13.img_attn.qkv.weight : tensor<9216x3072xf16>
    %4309 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.img_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_5392 = torch.constant.int 0
    %int1_5393 = torch.constant.int 1
    %4310 = torch.aten.transpose.int %4309, %int0_5392, %int1_5393 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.13.img_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.13.img_attn.qkv.bias : tensor<9216xf16>
    %4311 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.img_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    %int6_5394 = torch.constant.int 6
    %4312 = torch.prims.convert_element_type %4311, %int6_5394 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_5395 = torch.constant.int 6
    %4313 = torch.prims.convert_element_type %4308, %int6_5395 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_5396 = torch.constant.int 6
    %4314 = torch.prims.convert_element_type %4310, %int6_5396 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %4315 = torch.aten.mm %4313, %4314 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[4096,9216],f32>
    %int1_5397 = torch.constant.int 1
    %4316 = torch.aten.mul.Scalar %4315, %int1_5397 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int1_5398 = torch.constant.int 1
    %4317 = torch.aten.mul.Scalar %4312, %int1_5398 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_5399 = torch.constant.int 1
    %4318 = torch.aten.add.Tensor %4316, %4317, %int1_5399 : !torch.vtensor<[4096,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int5_5400 = torch.constant.int 5
    %4319 = torch.prims.convert_element_type %4318, %int5_5400 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f16>
    %int1_5401 = torch.constant.int 1
    %int4096_5402 = torch.constant.int 4096
    %int9216_5403 = torch.constant.int 9216
    %4320 = torch.prim.ListConstruct %int1_5401, %int4096_5402, %int9216_5403 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4321 = torch.aten.view %4319, %4320 : !torch.vtensor<[4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,9216],f16>
    // Split the fused projection: view [1,4096,9216] as [1,4096,3,24,128], permute with
    // order (2,0,3,1,4) to [3,1,24,4096,128], then select index 0, 1 and 2 along dim 0.
    // Later in this function %4326 is scaled by query_norm.scale, %4327 by key_norm.scale,
    // and %4328 feeds the third tensor operand of the attention op.
    %int1_5404 = torch.constant.int 1
    %int4096_5405 = torch.constant.int 4096
    %int3_5406 = torch.constant.int 3
    %int24_5407 = torch.constant.int 24
    %int128_5408 = torch.constant.int 128
    %4322 = torch.prim.ListConstruct %int1_5404, %int4096_5405, %int3_5406, %int24_5407, %int128_5408 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4323 = torch.aten.view %4321, %4322 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    %int2_5409 = torch.constant.int 2
    %int0_5410 = torch.constant.int 0
    %int3_5411 = torch.constant.int 3
    %int1_5412 = torch.constant.int 1
    %int4_5413 = torch.constant.int 4
    %4324 = torch.prim.ListConstruct %int2_5409, %int0_5410, %int3_5411, %int1_5412, %int4_5413 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4325 = torch.aten.permute %4323, %4324 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // Index 0 along dim 0.
    %int0_5414 = torch.constant.int 0
    %int0_5415 = torch.constant.int 0
    %4326 = torch.aten.select.int %4325, %int0_5414, %int0_5415 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Index 1 along dim 0.
    %int0_5416 = torch.constant.int 0
    %int1_5417 = torch.constant.int 1
    %4327 = torch.aten.select.int %4325, %int0_5416, %int1_5417 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Index 2 along dim 0.
    %int0_5418 = torch.constant.int 0
    %int2_5419 = torch.constant.int 2
    %4328 = torch.aten.select.int %4325, %int0_5418, %int2_5419 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Root-mean-square normalization of %4326 over the last dim, computed in f32:
    // x * rsqrt(mean(x^2, dim=-1, keepdim=true) + ~1e-6), cast to f16, then multiplied
    // elementwise by the [128] parameter img_attn.norm.query_norm.scale.
    %int6_5420 = torch.constant.int 6
    %4329 = torch.prims.convert_element_type %4326, %int6_5420 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_5421 = torch.constant.int 2
    %4330 = torch.aten.pow.Tensor_Scalar %4329, %int2_5421 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_5422 = torch.constant.int -1
    %4331 = torch.prim.ListConstruct %int-1_5422 : (!torch.int) -> !torch.list<int>
    %true_5423 = torch.constant.bool true
    %none_5424 = torch.constant.none
    %4332 = torch.aten.mean.dim %4330, %4331, %true_5423, %none_5424 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_5425 = torch.constant.float 9.9999999999999995E-7
    %int1_5426 = torch.constant.int 1
    %4333 = torch.aten.add.Scalar %4332, %float9.999990e-07_5425, %int1_5426 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %4334 = torch.aten.rsqrt %4333 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %4335 = torch.aten.mul.Tensor %4329, %4334 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    %int5_5427 = torch.constant.int 5
    %4336 = torch.prims.convert_element_type %4335, %int5_5427 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.13.img_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.13.img_attn.norm.query_norm.scale : tensor<128xf16>
    %4337 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.img_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %4338 = torch.aten.mul.Tensor %4336, %4337 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // Same root-mean-square normalization applied to %4327, scaled by the [128] parameter
    // img_attn.norm.key_norm.scale: x * rsqrt(mean(x^2, dim=-1, keepdim=true) + ~1e-6) in f32,
    // cast to f16, times scale.
    %int6_5428 = torch.constant.int 6
    %4339 = torch.prims.convert_element_type %4327, %int6_5428 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_5429 = torch.constant.int 2
    %4340 = torch.aten.pow.Tensor_Scalar %4339, %int2_5429 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_5430 = torch.constant.int -1
    %4341 = torch.prim.ListConstruct %int-1_5430 : (!torch.int) -> !torch.list<int>
    %true_5431 = torch.constant.bool true
    %none_5432 = torch.constant.none
    %4342 = torch.aten.mean.dim %4340, %4341, %true_5431, %none_5432 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_5433 = torch.constant.float 9.9999999999999995E-7
    %int1_5434 = torch.constant.int 1
    %4343 = torch.aten.add.Scalar %4342, %float9.999990e-07_5433, %int1_5434 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %4344 = torch.aten.rsqrt %4343 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %4345 = torch.aten.mul.Tensor %4339, %4344 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    %int5_5435 = torch.constant.int 5
    %4346 = torch.prims.convert_element_type %4345, %int5_5435 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.13.img_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.13.img_attn.norm.key_norm.scale : tensor<128xf16>
    %4347 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.img_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %4348 = torch.aten.mul.Tensor %4346, %4347 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // The next two conversions are f16 -> f16, so they do not change the element type.
    // NOTE(review): presumably a `.to(dtype)` in the traced model that was already satisfied - confirm.
    %int5_5436 = torch.constant.int 5
    %4349 = torch.prims.convert_element_type %4338, %int5_5436 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_5437 = torch.constant.int 5
    %4350 = torch.prims.convert_element_type %4348, %int5_5437 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Normalize the [1,512,3072] tensor %4254 over dim 2 without learned affine parameters:
    // var_mean (correction 0, keepdim true) gives (variance, mean); the result is
    // (%4254 - mean) * rsqrt(variance + ~1e-6), cast to f16, then multiplied by (%4292 + 1)
    // and offset by %4291.
    // NOTE(review): %4291/%4292 are defined above this excerpt and look like per-channel
    // modulation shift/scale - confirm.
    %int6_5438 = torch.constant.int 6
    %4351 = torch.prims.convert_element_type %4254, %int6_5438 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %int2_5439 = torch.constant.int 2
    %4352 = torch.prim.ListConstruct %int2_5439 : (!torch.int) -> !torch.list<int>
    %int0_5440 = torch.constant.int 0
    %true_5441 = torch.constant.bool true
    %result0_5442, %result1_5443 = torch.aten.var_mean.correction %4351, %4352, %int0_5440, %true_5441 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    %float9.999990e-07_5444 = torch.constant.float 9.9999999999999995E-7
    %int1_5445 = torch.constant.int 1
    %4353 = torch.aten.add.Scalar %result0_5442, %float9.999990e-07_5444, %int1_5445 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %4354 = torch.aten.rsqrt %4353 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    %int1_5446 = torch.constant.int 1
    // The f16 input is subtracted against the f32 mean; the result type is f32.
    %4355 = torch.aten.sub.Tensor %4254, %result1_5443, %int1_5446 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %4356 = torch.aten.mul.Tensor %4355, %4354 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_5447 = torch.constant.int 5
    %4357 = torch.prims.convert_element_type %4356, %int5_5447 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %int1_5448 = torch.constant.int 1
    %int1_5449 = torch.constant.int 1
    %4358 = torch.aten.add.Scalar %4292, %int1_5448, %int1_5449 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %4359 = torch.aten.mul.Tensor %4358, %4357 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_5450 = torch.constant.int 1
    %4360 = torch.aten.add.Tensor %4359, %4291, %int1_5450 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Linear projection with double_blocks.13.txt_attn.qkv: flatten %4360 to [512,3072],
    // compute x @ weight^T + bias in f32, cast the result to f16 and restore the leading
    // batch dim, giving [1,512,9216].
    %int512_5451 = torch.constant.int 512
    %int3072_5452 = torch.constant.int 3072
    %4361 = torch.prim.ListConstruct %int512_5451, %int3072_5452 : (!torch.int, !torch.int) -> !torch.list<int>
    %4362 = torch.aten.view %4360, %4361 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.13.txt_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.13.txt_attn.qkv.weight : tensor<9216x3072xf16>
    %4363 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.txt_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_5453 = torch.constant.int 0
    %int1_5454 = torch.constant.int 1
    // The weight is stored as [out=9216, in=3072]; transpose so it can be the right-hand matmul operand.
    %4364 = torch.aten.transpose.int %4363, %int0_5453, %int1_5454 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.13.txt_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.13.txt_attn.qkv.bias : tensor<9216xf16>
    %4365 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.txt_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // dtype code 6 converts to f32: bias, activations and transposed weight are all upcast before the matmul.
    %int6_5455 = torch.constant.int 6
    %4366 = torch.prims.convert_element_type %4365, %int6_5455 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_5456 = torch.constant.int 6
    %4367 = torch.prims.convert_element_type %4362, %int6_5456 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_5457 = torch.constant.int 6
    %4368 = torch.prims.convert_element_type %4364, %int6_5457 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %4369 = torch.aten.mm %4367, %4368 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[512,9216],f32>
    // Both operands are multiplied by the integer 1, which leaves the values unchanged.
    // NOTE(review): presumably the alpha/beta factors of a decomposed addmm - confirm against the exporter.
    %int1_5458 = torch.constant.int 1
    %4370 = torch.aten.mul.Scalar %4369, %int1_5458 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int1_5459 = torch.constant.int 1
    %4371 = torch.aten.mul.Scalar %4366, %int1_5459 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_5460 = torch.constant.int 1
    %4372 = torch.aten.add.Tensor %4370, %4371, %int1_5460 : !torch.vtensor<[512,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    // dtype code 5 converts back to f16.
    %int5_5461 = torch.constant.int 5
    %4373 = torch.prims.convert_element_type %4372, %int5_5461 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f16>
    %int1_5462 = torch.constant.int 1
    %int512_5463 = torch.constant.int 512
    %int9216_5464 = torch.constant.int 9216
    %4374 = torch.prim.ListConstruct %int1_5462, %int512_5463, %int9216_5464 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4375 = torch.aten.view %4373, %4374 : !torch.vtensor<[512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,9216],f16>
    // Split the fused projection: view [1,512,9216] as [1,512,3,24,128], permute with
    // order (2,0,3,1,4) to [3,1,24,512,128], then select index 0, 1 and 2 along dim 0.
    // Later in this function %4380 is scaled by query_norm.scale, %4381 by key_norm.scale,
    // and %4382 feeds the third tensor operand of the attention op.
    %int1_5465 = torch.constant.int 1
    %int512_5466 = torch.constant.int 512
    %int3_5467 = torch.constant.int 3
    %int24_5468 = torch.constant.int 24
    %int128_5469 = torch.constant.int 128
    %4376 = torch.prim.ListConstruct %int1_5465, %int512_5466, %int3_5467, %int24_5468, %int128_5469 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4377 = torch.aten.view %4375, %4376 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    %int2_5470 = torch.constant.int 2
    %int0_5471 = torch.constant.int 0
    %int3_5472 = torch.constant.int 3
    %int1_5473 = torch.constant.int 1
    %int4_5474 = torch.constant.int 4
    %4378 = torch.prim.ListConstruct %int2_5470, %int0_5471, %int3_5472, %int1_5473, %int4_5474 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4379 = torch.aten.permute %4377, %4378 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // Index 0 along dim 0.
    %int0_5475 = torch.constant.int 0
    %int0_5476 = torch.constant.int 0
    %4380 = torch.aten.select.int %4379, %int0_5475, %int0_5476 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Index 1 along dim 0.
    %int0_5477 = torch.constant.int 0
    %int1_5478 = torch.constant.int 1
    %4381 = torch.aten.select.int %4379, %int0_5477, %int1_5478 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Index 2 along dim 0.
    %int0_5479 = torch.constant.int 0
    %int2_5480 = torch.constant.int 2
    %4382 = torch.aten.select.int %4379, %int0_5479, %int2_5480 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Root-mean-square normalization of %4380 over the last dim, computed in f32:
    // x * rsqrt(mean(x^2, dim=-1, keepdim=true) + ~1e-6), cast to f16, then multiplied
    // elementwise by the [128] parameter txt_attn.norm.query_norm.scale.
    %int6_5481 = torch.constant.int 6
    %4383 = torch.prims.convert_element_type %4380, %int6_5481 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_5482 = torch.constant.int 2
    %4384 = torch.aten.pow.Tensor_Scalar %4383, %int2_5482 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_5483 = torch.constant.int -1
    %4385 = torch.prim.ListConstruct %int-1_5483 : (!torch.int) -> !torch.list<int>
    %true_5484 = torch.constant.bool true
    %none_5485 = torch.constant.none
    %4386 = torch.aten.mean.dim %4384, %4385, %true_5484, %none_5485 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_5486 = torch.constant.float 9.9999999999999995E-7
    %int1_5487 = torch.constant.int 1
    %4387 = torch.aten.add.Scalar %4386, %float9.999990e-07_5486, %int1_5487 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %4388 = torch.aten.rsqrt %4387 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %4389 = torch.aten.mul.Tensor %4383, %4388 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_5488 = torch.constant.int 5
    %4390 = torch.prims.convert_element_type %4389, %int5_5488 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.13.txt_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.13.txt_attn.norm.query_norm.scale : tensor<128xf16>
    %4391 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.txt_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %4392 = torch.aten.mul.Tensor %4390, %4391 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // Same root-mean-square normalization applied to %4381, scaled by the [128] parameter
    // txt_attn.norm.key_norm.scale: x * rsqrt(mean(x^2, dim=-1, keepdim=true) + ~1e-6) in f32,
    // cast to f16, times scale.
    %int6_5489 = torch.constant.int 6
    %4393 = torch.prims.convert_element_type %4381, %int6_5489 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_5490 = torch.constant.int 2
    %4394 = torch.aten.pow.Tensor_Scalar %4393, %int2_5490 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_5491 = torch.constant.int -1
    %4395 = torch.prim.ListConstruct %int-1_5491 : (!torch.int) -> !torch.list<int>
    %true_5492 = torch.constant.bool true
    %none_5493 = torch.constant.none
    %4396 = torch.aten.mean.dim %4394, %4395, %true_5492, %none_5493 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_5494 = torch.constant.float 9.9999999999999995E-7
    %int1_5495 = torch.constant.int 1
    %4397 = torch.aten.add.Scalar %4396, %float9.999990e-07_5494, %int1_5495 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %4398 = torch.aten.rsqrt %4397 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %4399 = torch.aten.mul.Tensor %4393, %4398 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_5496 = torch.constant.int 5
    %4400 = torch.prims.convert_element_type %4399, %int5_5496 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.13.txt_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.13.txt_attn.norm.key_norm.scale : tensor<128xf16>
    %4401 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.txt_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %4402 = torch.aten.mul.Tensor %4400, %4401 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // The next two conversions are f16 -> f16, so they do not change the element type.
    // NOTE(review): presumably a `.to(dtype)` in the traced model that was already satisfied - confirm.
    %int5_5497 = torch.constant.int 5
    %4403 = torch.prims.convert_element_type %4392, %int5_5497 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int5_5498 = torch.constant.int 5
    %4404 = torch.prims.convert_element_type %4402, %int5_5498 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Concatenate the 512-row and 4096-row tensors along dim 2, with the 512-row tensor first,
    // giving [1,24,4608,128] for each of the three attention inputs.
    // Rows 0..511 therefore come from the txt_attn branch and rows 512..4607 from the img_attn branch.
    // Normalized queries: %4403 (txt) followed by %4349 (img).
    %4405 = torch.prim.ListConstruct %4403, %4349 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_5499 = torch.constant.int 2
    %4406 = torch.aten.cat %4405, %int2_5499 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Normalized keys: %4404 (txt) followed by %4350 (img).
    %4407 = torch.prim.ListConstruct %4404, %4350 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_5500 = torch.constant.int 2
    %4408 = torch.aten.cat %4407, %int2_5500 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Values (not normalized): %4382 (txt) followed by %4328 (img).
    %4409 = torch.prim.ListConstruct %4382, %4328 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_5501 = torch.constant.int 2
    %4410 = torch.aten.cat %4409, %int2_5501 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Upcast the concatenated query (%4406) and key (%4408) to f32 and view the last dim of 128
    // as [64,1,2]; the -1 entry in the shape list resolves to 64 (see the result types).
    %int6_5502 = torch.constant.int 6
    %4411 = torch.prims.convert_element_type %4406, %int6_5502 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_5503 = torch.constant.int 1
    %int24_5504 = torch.constant.int 24
    %int4608_5505 = torch.constant.int 4608
    %int-1_5506 = torch.constant.int -1
    %int1_5507 = torch.constant.int 1
    %int2_5508 = torch.constant.int 2
    %4412 = torch.prim.ListConstruct %int1_5503, %int24_5504, %int4608_5505, %int-1_5506, %int1_5507, %int2_5508 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4413 = torch.aten.view %4411, %4412 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    %int6_5509 = torch.constant.int 6
    %4414 = torch.prims.convert_element_type %4408, %int6_5509 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_5510 = torch.constant.int 1
    %int24_5511 = torch.constant.int 24
    %int4608_5512 = torch.constant.int 4608
    %int-1_5513 = torch.constant.int -1
    %int1_5514 = torch.constant.int 1
    %int2_5515 = torch.constant.int 2
    %4415 = torch.prim.ListConstruct %int1_5510, %int24_5511, %int4608_5512, %int-1_5513, %int1_5514, %int2_5515 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4416 = torch.aten.view %4414, %4415 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Combine the [1,1,4608,64,2,2] table %211 (defined above this excerpt) with the paired
    // query %4413 and key %4416. For each input x:
    //   out = %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]
    // where the index is taken on dim 5. The size-1 dims broadcast, so the table is shared
    // across the 24 heads and the result is [1,24,4608,64,2].
    // NOTE(review): %211 is presumably the rotary position-embedding table - confirm at its definition.
    // Query, first term: table[..., 0] * q[..., 0].
    %int5_5516 = torch.constant.int 5
    %int0_5517 = torch.constant.int 0
    %4417 = torch.aten.select.int %211, %int5_5516, %int0_5517 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_5518 = torch.constant.int 5
    %int0_5519 = torch.constant.int 0
    %4418 = torch.aten.select.int %4413, %int5_5518, %int0_5519 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %4419 = torch.aten.mul.Tensor %4417, %4418 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Query, second term: table[..., 1] * q[..., 1].
    %int5_5520 = torch.constant.int 5
    %int1_5521 = torch.constant.int 1
    %4420 = torch.aten.select.int %211, %int5_5520, %int1_5521 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_5522 = torch.constant.int 5
    %int1_5523 = torch.constant.int 1
    %4421 = torch.aten.select.int %4413, %int5_5522, %int1_5523 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %4422 = torch.aten.mul.Tensor %4420, %4421 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_5524 = torch.constant.int 1
    %4423 = torch.aten.add.Tensor %4419, %4422, %int1_5524 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key, first term: table[..., 0] * k[..., 0].
    %int5_5525 = torch.constant.int 5
    %int0_5526 = torch.constant.int 0
    %4424 = torch.aten.select.int %211, %int5_5525, %int0_5526 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_5527 = torch.constant.int 5
    %int0_5528 = torch.constant.int 0
    %4425 = torch.aten.select.int %4416, %int5_5527, %int0_5528 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %4426 = torch.aten.mul.Tensor %4424, %4425 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key, second term: table[..., 1] * k[..., 1].
    %int5_5529 = torch.constant.int 5
    %int1_5530 = torch.constant.int 1
    %4427 = torch.aten.select.int %211, %int5_5529, %int1_5530 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_5531 = torch.constant.int 5
    %int1_5532 = torch.constant.int 1
    %4428 = torch.aten.select.int %4416, %int5_5531, %int1_5532 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %4429 = torch.aten.mul.Tensor %4427, %4428 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_5533 = torch.constant.int 1
    %4430 = torch.aten.add.Tensor %4426, %4429, %int1_5533 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Collapse the trailing [64,2] dims of the query (%4423) and key (%4430) back to 128
    // and cast both from f32 to f16 (dtype code 5).
    %int1_5534 = torch.constant.int 1
    %int24_5535 = torch.constant.int 24
    %int4608_5536 = torch.constant.int 4608
    %int128_5537 = torch.constant.int 128
    %4431 = torch.prim.ListConstruct %int1_5534, %int24_5535, %int4608_5536, %int128_5537 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4432 = torch.aten.view %4423, %4431 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_5538 = torch.constant.int 5
    %4433 = torch.prims.convert_element_type %4432, %int5_5538 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_5539 = torch.constant.int 1
    %int24_5540 = torch.constant.int 24
    %int4608_5541 = torch.constant.int 4608
    %int128_5542 = torch.constant.int 128
    %4434 = torch.prim.ListConstruct %int1_5539, %int24_5540, %int4608_5541, %int128_5542 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4435 = torch.aten.view %4430, %4434 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_5543 = torch.constant.int 5
    %4436 = torch.prims.convert_element_type %4435, %int5_5543 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled dot-product attention over the joint 4608-row sequence, with 24 heads of width 128.
    // The tensor operands are %4433, %4436 and %4410, followed by float 0.0, bool false and two
    // `none` operands.
    // NOTE(review): per the aten signature these should be (query, key, value, dropout_p,
    // is_causal, attn_mask, scale) - confirm against the PyTorch op schema.
    // Only result #0 is used in the visible part of this function.
    %float0.000000e00_5544 = torch.constant.float 0.000000e+00
    %false_5545 = torch.constant.bool false
    %none_5546 = torch.constant.none
    %none_5547 = torch.constant.none
    %4437:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%4433, %4436, %4410, %float0.000000e00_5544, %false_5545, %none_5546, %none_5547) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Move the head dim next to the feature dim ([1,24,4608,128] -> [1,4608,24,128]), merge
    // heads into a 3072-wide feature dim, then slice dim 1 into rows 0..511 and rows 512..4607.
    // The concatenation feeding attention placed the 512 txt rows before the 4096 img rows.
    %int0_5548 = torch.constant.int 0
    %int2_5549 = torch.constant.int 2
    %int1_5550 = torch.constant.int 1
    %int3_5551 = torch.constant.int 3
    %4438 = torch.prim.ListConstruct %int0_5548, %int2_5549, %int1_5550, %int3_5551 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4439 = torch.aten.permute %4437#0, %4438 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_5552 = torch.constant.int 1
    %int4608_5553 = torch.constant.int 4608
    %int3072_5554 = torch.constant.int 3072
    %4440 = torch.prim.ListConstruct %int1_5552, %int4608_5553, %int3072_5554 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4441 = torch.aten.view %4439, %4440 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Slice on dim 0 from 0 to INT64_MAX with step 1: the result shape equals the input shape.
    %int0_5555 = torch.constant.int 0
    %int0_5556 = torch.constant.int 0
    %int9223372036854775807_5557 = torch.constant.int 9223372036854775807
    %int1_5558 = torch.constant.int 1
    %4442 = torch.aten.slice.Tensor %4441, %int0_5555, %int0_5556, %int9223372036854775807_5557, %int1_5558 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Rows [0, 512) of dim 1 -> [1,512,3072]; %4443 is not consumed within this excerpt.
    %int1_5559 = torch.constant.int 1
    %int0_5560 = torch.constant.int 0
    %int512_5561 = torch.constant.int 512
    %int1_5562 = torch.constant.int 1
    %4443 = torch.aten.slice.Tensor %4442, %int1_5559, %int0_5560, %int512_5561, %int1_5562 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Another full-range slice on dim 0, again shape-preserving.
    %int0_5563 = torch.constant.int 0
    %int0_5564 = torch.constant.int 0
    %int9223372036854775807_5565 = torch.constant.int 9223372036854775807
    %int1_5566 = torch.constant.int 1
    %4444 = torch.aten.slice.Tensor %4441, %int0_5563, %int0_5564, %int9223372036854775807_5565, %int1_5566 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Rows [512, end) of dim 1 -> [1,4096,3072].
    %int1_5567 = torch.constant.int 1
    %int512_5568 = torch.constant.int 512
    %int9223372036854775807_5569 = torch.constant.int 9223372036854775807
    %int1_5570 = torch.constant.int 1
    %4445 = torch.aten.slice.Tensor %4444, %int1_5567, %int512_5568, %int9223372036854775807_5569, %int1_5570 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear projection with double_blocks.13.img_attn.proj applied to the 4096-row slice %4445:
    // flatten to [4096,3072], compute x @ weight^T + bias in f32, cast the result to f16 and
    // restore the leading batch dim, giving [1,4096,3072].
    %int4096_5571 = torch.constant.int 4096
    %int3072_5572 = torch.constant.int 3072
    %4446 = torch.prim.ListConstruct %int4096_5571, %int3072_5572 : (!torch.int, !torch.int) -> !torch.list<int>
    %4447 = torch.aten.view %4445, %4446 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.13.img_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.13.img_attn.proj.weight : tensor<3072x3072xf16>
    %4448 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.img_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_5573 = torch.constant.int 0
    %int1_5574 = torch.constant.int 1
    // Transpose dims 0 and 1 of the weight so it can be the right-hand matmul operand.
    %4449 = torch.aten.transpose.int %4448, %int0_5573, %int1_5574 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.13.img_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.13.img_attn.proj.bias : tensor<3072xf16>
    %4450 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.img_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // dtype code 6 converts to f32: bias, activations and transposed weight are all upcast before the matmul.
    %int6_5575 = torch.constant.int 6
    %4451 = torch.prims.convert_element_type %4450, %int6_5575 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_5576 = torch.constant.int 6
    %4452 = torch.prims.convert_element_type %4447, %int6_5576 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_5577 = torch.constant.int 6
    %4453 = torch.prims.convert_element_type %4449, %int6_5577 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %4454 = torch.aten.mm %4452, %4453 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Both operands are multiplied by the integer 1, which leaves the values unchanged.
    // NOTE(review): presumably the alpha/beta factors of a decomposed addmm - confirm against the exporter.
    %int1_5578 = torch.constant.int 1
    %4455 = torch.aten.mul.Scalar %4454, %int1_5578 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_5579 = torch.constant.int 1
    %4456 = torch.aten.mul.Scalar %4451, %int1_5579 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_5580 = torch.constant.int 1
    %4457 = torch.aten.add.Tensor %4455, %4456, %int1_5580 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    // dtype code 5 converts back to f16.
    %int5_5581 = torch.constant.int 5
    %4458 = torch.prims.convert_element_type %4457, %int5_5581 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_5582 = torch.constant.int 1
    %int4096_5583 = torch.constant.int 4096
    %int3072_5584 = torch.constant.int 3072
    %4459 = torch.prim.ListConstruct %int1_5582, %int4096_5583, %int3072_5584 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4460 = torch.aten.view %4458, %4459 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Scale the projected output %4460 by the [1,1,3072] tensor %4272, then add it to %4194 (alpha = 1).
    // NOTE(review): %4272 and %4194 are defined outside this excerpt; presumably the attention gate and the residual image stream. Confirm.
    %4461 = torch.aten.mul.Tensor %4272, %4460 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_5585 = torch.constant.int 1
    %4462 = torch.aten.add.Tensor %4194, %4461, %int1_5585 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // ---- Normalize %4462 over its last dim, then apply (1 + %4274) * x_norm + %4273 ----
    // %4463 = %4274 + 1.
    // NOTE(review): %4274 is defined outside this excerpt; presumably the MLP scale modulation. Confirm.
    %int1_5586 = torch.constant.int 1
    %int1_5587 = torch.constant.int 1
    %4463 = torch.aten.add.Scalar %4274, %int1_5586, %int1_5587 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Compute variance and mean in f32 over dim 2, with correction = 0 and keepdim = true.
    %int6_5588 = torch.constant.int 6
    %4464 = torch.prims.convert_element_type %4462, %int6_5588 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %int2_5589 = torch.constant.int 2
    %4465 = torch.prim.ListConstruct %int2_5589 : (!torch.int) -> !torch.list<int>
    %int0_5590 = torch.constant.int 0
    %true_5591 = torch.constant.bool true
    %result0_5592, %result1_5593 = torch.aten.var_mean.correction %4464, %4465, %int0_5590, %true_5591 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(var + 1e-6), where %result0 is the variance.
    %float9.999990e-07_5594 = torch.constant.float 9.9999999999999995E-7
    %int1_5595 = torch.constant.int 1
    %4466 = torch.aten.add.Scalar %result0_5592, %float9.999990e-07_5594, %int1_5595 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %4467 = torch.aten.rsqrt %4466 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // (x - mean) * rsqrt(var + eps); the f16 input minus the f32 mean gives an f32 result.
    %int1_5596 = torch.constant.int 1
    %4468 = torch.aten.sub.Tensor %4462, %result1_5593, %int1_5596 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %4469 = torch.aten.mul.Tensor %4468, %4467 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Convert back to f16, multiply by %4463 and add %4273.
    // NOTE(review): %4273 is defined outside this excerpt; presumably the MLP shift modulation. Confirm.
    %int5_5597 = torch.constant.int 5
    %4470 = torch.prims.convert_element_type %4469, %int5_5597 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %4471 = torch.aten.mul.Tensor %4463, %4470 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_5598 = torch.constant.int 1
    %4472 = torch.aten.add.Tensor %4471, %4273, %int1_5598 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // ---- double_blocks.13.img_mlp.0: linear layer 3072 -> 12288 on %4472 ----
    // View the input as a [4096,3072] matrix.
    %int4096_5599 = torch.constant.int 4096
    %int3072_5600 = torch.constant.int 3072
    %4473 = torch.prim.ListConstruct %int4096_5599, %int3072_5600 : (!torch.int, !torch.int) -> !torch.list<int>
    %4474 = torch.aten.view %4472, %4473 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Load the weight [12288,3072] and transpose it to [3072,12288]; load the bias [12288].
    %__auto.sampler.double_blocks.13.img_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.13.img_mlp.0.weight : tensor<12288x3072xf16>
    %4475 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.img_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_5601 = torch.constant.int 0
    %int1_5602 = torch.constant.int 1
    %4476 = torch.aten.transpose.int %4475, %int0_5601, %int1_5602 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.13.img_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.13.img_mlp.0.bias : tensor<12288xf16>
    %4477 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.img_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Convert all three operands to f32 (dtype code 6) and multiply.
    %int6_5603 = torch.constant.int 6
    %4478 = torch.prims.convert_element_type %4477, %int6_5603 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_5604 = torch.constant.int 6
    %4479 = torch.prims.convert_element_type %4474, %int6_5604 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_5605 = torch.constant.int 6
    %4480 = torch.prims.convert_element_type %4476, %int6_5605 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %4481 = torch.aten.mm %4479, %4480 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[4096,12288],f32>
    // Multiply product and bias by the integer constant 1, then add the bias.
    %int1_5606 = torch.constant.int 1
    %4482 = torch.aten.mul.Scalar %4481, %int1_5606 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int1_5607 = torch.constant.int 1
    %4483 = torch.aten.mul.Scalar %4478, %int1_5607 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_5608 = torch.constant.int 1
    %4484 = torch.aten.add.Tensor %4482, %4483, %int1_5608 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    // Convert back to f16 (dtype code 5) and view as [1,4096,12288].
    %int5_5609 = torch.constant.int 5
    %4485 = torch.prims.convert_element_type %4484, %int5_5609 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f16>
    %int1_5610 = torch.constant.int 1
    %int4096_5611 = torch.constant.int 4096
    %int12288_5612 = torch.constant.int 12288
    %4486 = torch.prim.ListConstruct %int1_5610, %int4096_5611, %int12288_5612 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4487 = torch.aten.view %4485, %4486 : !torch.vtensor<[4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,12288],f16>
    // GELU on %4487 using the "tanh" approximation, computed in f16.
    %str_5613 = torch.constant.str "tanh"
    %4488 = torch.aten.gelu %4487, %str_5613 : !torch.vtensor<[1,4096,12288],f16>, !torch.str -> !torch.vtensor<[1,4096,12288],f16>
    // ---- double_blocks.13.img_mlp.2: linear layer 12288 -> 3072 on %4488 ----
    // View the activation as a [4096,12288] matrix.
    %int4096_5614 = torch.constant.int 4096
    %int12288_5615 = torch.constant.int 12288
    %4489 = torch.prim.ListConstruct %int4096_5614, %int12288_5615 : (!torch.int, !torch.int) -> !torch.list<int>
    %4490 = torch.aten.view %4488, %4489 : !torch.vtensor<[1,4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[4096,12288],f16>
    // Load the weight [3072,12288] and transpose it to [12288,3072]; load the bias [3072].
    %__auto.sampler.double_blocks.13.img_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.13.img_mlp.2.weight : tensor<3072x12288xf16>
    %4491 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.img_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_5616 = torch.constant.int 0
    %int1_5617 = torch.constant.int 1
    %4492 = torch.aten.transpose.int %4491, %int0_5616, %int1_5617 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.13.img_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.13.img_mlp.2.bias : tensor<3072xf16>
    %4493 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.img_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Convert all three operands to f32 (dtype code 6) and multiply.
    %int6_5618 = torch.constant.int 6
    %4494 = torch.prims.convert_element_type %4493, %int6_5618 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_5619 = torch.constant.int 6
    %4495 = torch.prims.convert_element_type %4490, %int6_5619 : !torch.vtensor<[4096,12288],f16>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int6_5620 = torch.constant.int 6
    %4496 = torch.prims.convert_element_type %4492, %int6_5620 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %4497 = torch.aten.mm %4495, %4496 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Multiply product and bias by the integer constant 1, then add the bias.
    %int1_5621 = torch.constant.int 1
    %4498 = torch.aten.mul.Scalar %4497, %int1_5621 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_5622 = torch.constant.int 1
    %4499 = torch.aten.mul.Scalar %4494, %int1_5622 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_5623 = torch.constant.int 1
    %4500 = torch.aten.add.Tensor %4498, %4499, %int1_5623 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    // Convert back to f16 (dtype code 5) and view as [1,4096,3072].
    %int5_5624 = torch.constant.int 5
    %4501 = torch.prims.convert_element_type %4500, %int5_5624 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_5625 = torch.constant.int 1
    %int4096_5626 = torch.constant.int 4096
    %int3072_5627 = torch.constant.int 3072
    %4502 = torch.prim.ListConstruct %int1_5625, %int4096_5626, %int3072_5627 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4503 = torch.aten.view %4501, %4502 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Scale the MLP output %4503 by the [1,1,3072] tensor %4275, then add it to %4462 (alpha = 1).
    // %4505 is converted to f32 later in this function for a var_mean normalization.
    // NOTE(review): %4275 is defined outside this excerpt; presumably the MLP gate modulation. Confirm.
    %4504 = torch.aten.mul.Tensor %4275, %4503 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_5628 = torch.constant.int 1
    %4505 = torch.aten.add.Tensor %4462, %4504, %int1_5628 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // ---- double_blocks.13.txt_attn.proj: linear projection on %4443 ----
    // View the [1,512,3072] input as a [512,3072] matrix.
    %int512_5629 = torch.constant.int 512
    %int3072_5630 = torch.constant.int 3072
    %4506 = torch.prim.ListConstruct %int512_5629, %int3072_5630 : (!torch.int, !torch.int) -> !torch.list<int>
    %4507 = torch.aten.view %4443, %4506 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Load the weight [3072,3072] and transpose dims 0 and 1; load the bias [3072].
    %__auto.sampler.double_blocks.13.txt_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.13.txt_attn.proj.weight : tensor<3072x3072xf16>
    %4508 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.txt_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_5631 = torch.constant.int 0
    %int1_5632 = torch.constant.int 1
    %4509 = torch.aten.transpose.int %4508, %int0_5631, %int1_5632 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.13.txt_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.13.txt_attn.proj.bias : tensor<3072xf16>
    %4510 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.txt_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Convert all three operands to f32 (dtype code 6) and multiply.
    %int6_5633 = torch.constant.int 6
    %4511 = torch.prims.convert_element_type %4510, %int6_5633 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_5634 = torch.constant.int 6
    %4512 = torch.prims.convert_element_type %4507, %int6_5634 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_5635 = torch.constant.int 6
    %4513 = torch.prims.convert_element_type %4509, %int6_5635 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %4514 = torch.aten.mm %4512, %4513 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Multiply product and bias by the integer constant 1, then add the bias.
    %int1_5636 = torch.constant.int 1
    %4515 = torch.aten.mul.Scalar %4514, %int1_5636 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_5637 = torch.constant.int 1
    %4516 = torch.aten.mul.Scalar %4511, %int1_5637 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_5638 = torch.constant.int 1
    %4517 = torch.aten.add.Tensor %4515, %4516, %int1_5638 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    // Convert back to f16 (dtype code 5) and view as [1,512,3072].
    %int5_5639 = torch.constant.int 5
    %4518 = torch.prims.convert_element_type %4517, %int5_5639 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_5640 = torch.constant.int 1
    %int512_5641 = torch.constant.int 512
    %int3072_5642 = torch.constant.int 3072
    %4519 = torch.prim.ListConstruct %int1_5640, %int512_5641, %int3072_5642 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4520 = torch.aten.view %4518, %4519 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Scale the projected output %4520 by the [1,1,3072] tensor %4293, then add it to %4254 (alpha = 1).
    // NOTE(review): %4293 and %4254 are defined outside this excerpt; presumably the attention gate and the residual text stream. Confirm.
    %4521 = torch.aten.mul.Tensor %4293, %4520 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_5643 = torch.constant.int 1
    %4522 = torch.aten.add.Tensor %4254, %4521, %int1_5643 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // ---- Normalize %4522 over its last dim, then apply (1 + %4295) * x_norm + %4294 ----
    // %4523 = %4295 + 1.
    // NOTE(review): %4295 is defined outside this excerpt; presumably the MLP scale modulation. Confirm.
    %int1_5644 = torch.constant.int 1
    %int1_5645 = torch.constant.int 1
    %4523 = torch.aten.add.Scalar %4295, %int1_5644, %int1_5645 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Compute variance and mean in f32 over dim 2, with correction = 0 and keepdim = true.
    %int6_5646 = torch.constant.int 6
    %4524 = torch.prims.convert_element_type %4522, %int6_5646 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %int2_5647 = torch.constant.int 2
    %4525 = torch.prim.ListConstruct %int2_5647 : (!torch.int) -> !torch.list<int>
    %int0_5648 = torch.constant.int 0
    %true_5649 = torch.constant.bool true
    %result0_5650, %result1_5651 = torch.aten.var_mean.correction %4524, %4525, %int0_5648, %true_5649 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(var + 1e-6), where %result0 is the variance.
    %float9.999990e-07_5652 = torch.constant.float 9.9999999999999995E-7
    %int1_5653 = torch.constant.int 1
    %4526 = torch.aten.add.Scalar %result0_5650, %float9.999990e-07_5652, %int1_5653 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %4527 = torch.aten.rsqrt %4526 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // (x - mean) * rsqrt(var + eps); the f16 input minus the f32 mean gives an f32 result.
    %int1_5654 = torch.constant.int 1
    %4528 = torch.aten.sub.Tensor %4522, %result1_5651, %int1_5654 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %4529 = torch.aten.mul.Tensor %4528, %4527 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    // Convert back to f16, multiply by %4523 and add %4294.
    // NOTE(review): %4294 is defined outside this excerpt; presumably the MLP shift modulation. Confirm.
    %int5_5655 = torch.constant.int 5
    %4530 = torch.prims.convert_element_type %4529, %int5_5655 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %4531 = torch.aten.mul.Tensor %4523, %4530 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_5656 = torch.constant.int 1
    %4532 = torch.aten.add.Tensor %4531, %4294, %int1_5656 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // ---- double_blocks.13.txt_mlp.0: linear layer 3072 -> 12288 on %4532 ----
    // View the input as a [512,3072] matrix.
    %int512_5657 = torch.constant.int 512
    %int3072_5658 = torch.constant.int 3072
    %4533 = torch.prim.ListConstruct %int512_5657, %int3072_5658 : (!torch.int, !torch.int) -> !torch.list<int>
    %4534 = torch.aten.view %4532, %4533 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Load the weight [12288,3072] and transpose it to [3072,12288]; load the bias [12288].
    %__auto.sampler.double_blocks.13.txt_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.13.txt_mlp.0.weight : tensor<12288x3072xf16>
    %4535 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.txt_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_5659 = torch.constant.int 0
    %int1_5660 = torch.constant.int 1
    %4536 = torch.aten.transpose.int %4535, %int0_5659, %int1_5660 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.13.txt_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.13.txt_mlp.0.bias : tensor<12288xf16>
    %4537 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.txt_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Convert all three operands to f32 (dtype code 6) and multiply.
    %int6_5661 = torch.constant.int 6
    %4538 = torch.prims.convert_element_type %4537, %int6_5661 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_5662 = torch.constant.int 6
    %4539 = torch.prims.convert_element_type %4534, %int6_5662 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_5663 = torch.constant.int 6
    %4540 = torch.prims.convert_element_type %4536, %int6_5663 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %4541 = torch.aten.mm %4539, %4540 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[512,12288],f32>
    // Multiply product and bias by the integer constant 1, then add the bias.
    %int1_5664 = torch.constant.int 1
    %4542 = torch.aten.mul.Scalar %4541, %int1_5664 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int1_5665 = torch.constant.int 1
    %4543 = torch.aten.mul.Scalar %4538, %int1_5665 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_5666 = torch.constant.int 1
    %4544 = torch.aten.add.Tensor %4542, %4543, %int1_5666 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    // Convert back to f16 (dtype code 5) and view as [1,512,12288].
    %int5_5667 = torch.constant.int 5
    %4545 = torch.prims.convert_element_type %4544, %int5_5667 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f16>
    %int1_5668 = torch.constant.int 1
    %int512_5669 = torch.constant.int 512
    %int12288_5670 = torch.constant.int 12288
    %4546 = torch.prim.ListConstruct %int1_5668, %int512_5669, %int12288_5670 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4547 = torch.aten.view %4545, %4546 : !torch.vtensor<[512,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,512,12288],f16>
    // GELU on %4547 using the "tanh" approximation, computed in f16.
    %str_5671 = torch.constant.str "tanh"
    %4548 = torch.aten.gelu %4547, %str_5671 : !torch.vtensor<[1,512,12288],f16>, !torch.str -> !torch.vtensor<[1,512,12288],f16>
    // ---- double_blocks.13.txt_mlp.2: linear layer 12288 -> 3072 on %4548 ----
    // View the activation as a [512,12288] matrix.
    %int512_5672 = torch.constant.int 512
    %int12288_5673 = torch.constant.int 12288
    %4549 = torch.prim.ListConstruct %int512_5672, %int12288_5673 : (!torch.int, !torch.int) -> !torch.list<int>
    %4550 = torch.aten.view %4548, %4549 : !torch.vtensor<[1,512,12288],f16>, !torch.list<int> -> !torch.vtensor<[512,12288],f16>
    // Load the weight [3072,12288] and transpose it to [12288,3072]; load the bias [3072].
    %__auto.sampler.double_blocks.13.txt_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.13.txt_mlp.2.weight : tensor<3072x12288xf16>
    %4551 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.txt_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_5674 = torch.constant.int 0
    %int1_5675 = torch.constant.int 1
    %4552 = torch.aten.transpose.int %4551, %int0_5674, %int1_5675 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.13.txt_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.13.txt_mlp.2.bias : tensor<3072xf16>
    %4553 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.13.txt_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Convert all three operands to f32 (dtype code 6) and multiply.
    %int6_5676 = torch.constant.int 6
    %4554 = torch.prims.convert_element_type %4553, %int6_5676 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_5677 = torch.constant.int 6
    %4555 = torch.prims.convert_element_type %4550, %int6_5677 : !torch.vtensor<[512,12288],f16>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int6_5678 = torch.constant.int 6
    %4556 = torch.prims.convert_element_type %4552, %int6_5678 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %4557 = torch.aten.mm %4555, %4556 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Multiply product and bias by the integer constant 1, then add the bias.
    %int1_5679 = torch.constant.int 1
    %4558 = torch.aten.mul.Scalar %4557, %int1_5679 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_5680 = torch.constant.int 1
    %4559 = torch.aten.mul.Scalar %4554, %int1_5680 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_5681 = torch.constant.int 1
    %4560 = torch.aten.add.Tensor %4558, %4559, %int1_5681 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    // Convert back to f16 (dtype code 5) and view as [1,512,3072].
    %int5_5682 = torch.constant.int 5
    %4561 = torch.prims.convert_element_type %4560, %int5_5682 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_5683 = torch.constant.int 1
    %int512_5684 = torch.constant.int 512
    %int3072_5685 = torch.constant.int 3072
    %4562 = torch.prim.ListConstruct %int1_5683, %int512_5684, %int3072_5685 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4563 = torch.aten.view %4561, %4562 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Scale the MLP output %4563 by the [1,1,3072] tensor %4296, then add it to %4522 (alpha = 1).
    // NOTE(review): %4296 is defined outside this excerpt; presumably the MLP gate modulation. Confirm.
    %4564 = torch.aten.mul.Tensor %4296, %4563 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_5686 = torch.constant.int 1
    %4565 = torch.aten.add.Tensor %4522, %4564, %int1_5686 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // ---- double_blocks.14.img_mod.lin: SiLU(%119) followed by a linear layer 3072 -> 18432 ----
    // NOTE(review): %119 is defined outside this excerpt; presumably the shared conditioning vector. Confirm.
    %4566 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the weight [18432,3072] and transpose it to [3072,18432]; load the bias [18432].
    %__auto.sampler.double_blocks.14.img_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.14.img_mod.lin.weight : tensor<18432x3072xf16>
    %4567 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.img_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_5687 = torch.constant.int 0
    %int1_5688 = torch.constant.int 1
    %4568 = torch.aten.transpose.int %4567, %int0_5687, %int1_5688 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.14.img_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.14.img_mod.lin.bias : tensor<18432xf16>
    %4569 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.img_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Convert all three operands to f32 (dtype code 6) and multiply.
    %int6_5689 = torch.constant.int 6
    %4570 = torch.prims.convert_element_type %4569, %int6_5689 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_5690 = torch.constant.int 6
    %4571 = torch.prims.convert_element_type %4566, %int6_5690 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_5691 = torch.constant.int 6
    %4572 = torch.prims.convert_element_type %4568, %int6_5691 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %4573 = torch.aten.mm %4571, %4572 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Multiply product and bias by the integer constant 1, then add the bias.
    %int1_5692 = torch.constant.int 1
    %4574 = torch.aten.mul.Scalar %4573, %int1_5692 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_5693 = torch.constant.int 1
    %4575 = torch.aten.mul.Scalar %4570, %int1_5693 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_5694 = torch.constant.int 1
    %4576 = torch.aten.add.Tensor %4574, %4575, %int1_5694 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    // Convert back to f16 (dtype code 5).
    %int5_5695 = torch.constant.int 5
    %4577 = torch.prims.convert_element_type %4576, %int5_5695 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // ---- Reshape %4577 to [1,1,18432] and split the last dim into six [1,1,3072] slices ----
    // Slice dim 0 over [0, INT64_MAX) with step 1; the shape stays [1,18432].
    %int0_5696 = torch.constant.int 0
    %int0_5697 = torch.constant.int 0
    %int9223372036854775807_5698 = torch.constant.int 9223372036854775807
    %int1_5699 = torch.constant.int 1
    %4578 = torch.aten.slice.Tensor %4577, %int0_5696, %int0_5697, %int9223372036854775807_5698, %int1_5699 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a size-1 dim at position 1, giving [1,1,18432].
    %int1_5700 = torch.constant.int 1
    %4579 = torch.aten.unsqueeze %4578, %int1_5700 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Slice dim 2 over its full range; the shape is unchanged.
    %int2_5701 = torch.constant.int 2
    %int0_5702 = torch.constant.int 0
    %int9223372036854775807_5703 = torch.constant.int 9223372036854775807
    %int1_5704 = torch.constant.int 1
    %4580 = torch.aten.slice.Tensor %4579, %int2_5701, %int0_5702, %int9223372036854775807_5703, %int1_5704 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // NOTE(review): the six slices below are presumably (shift, scale, gate) for attention followed by (shift, scale, gate) for the MLP; their uses are outside this excerpt. Confirm.
    // Slice 1: last dim [0, 3072).
    %int-1_5705 = torch.constant.int -1
    %int0_5706 = torch.constant.int 0
    %int3072_5707 = torch.constant.int 3072
    %int1_5708 = torch.constant.int 1
    %4581 = torch.aten.slice.Tensor %4580, %int-1_5705, %int0_5706, %int3072_5707, %int1_5708 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice 2: last dim [3072, 6144).
    %int-1_5709 = torch.constant.int -1
    %int3072_5710 = torch.constant.int 3072
    %int6144_5711 = torch.constant.int 6144
    %int1_5712 = torch.constant.int 1
    %4582 = torch.aten.slice.Tensor %4580, %int-1_5709, %int3072_5710, %int6144_5711, %int1_5712 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice 3: last dim [6144, 9216).
    %int-1_5713 = torch.constant.int -1
    %int6144_5714 = torch.constant.int 6144
    %int9216_5715 = torch.constant.int 9216
    %int1_5716 = torch.constant.int 1
    %4583 = torch.aten.slice.Tensor %4580, %int-1_5713, %int6144_5714, %int9216_5715, %int1_5716 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice 4: last dim [9216, 12288).
    %int-1_5717 = torch.constant.int -1
    %int9216_5718 = torch.constant.int 9216
    %int12288_5719 = torch.constant.int 12288
    %int1_5720 = torch.constant.int 1
    %4584 = torch.aten.slice.Tensor %4580, %int-1_5717, %int9216_5718, %int12288_5719, %int1_5720 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice 5: last dim [12288, 15360).
    %int-1_5721 = torch.constant.int -1
    %int12288_5722 = torch.constant.int 12288
    %int15360_5723 = torch.constant.int 15360
    %int1_5724 = torch.constant.int 1
    %4585 = torch.aten.slice.Tensor %4580, %int-1_5721, %int12288_5722, %int15360_5723, %int1_5724 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice 6: last dim [15360, 18432).
    %int-1_5725 = torch.constant.int -1
    %int15360_5726 = torch.constant.int 15360
    %int18432_5727 = torch.constant.int 18432
    %int1_5728 = torch.constant.int 1
    %4586 = torch.aten.slice.Tensor %4580, %int-1_5725, %int15360_5726, %int18432_5727, %int1_5728 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // ---- double_blocks.14.txt_mod.lin: SiLU(%119) followed by a linear layer 3072 -> 18432 ----
    // NOTE(review): %119 is defined outside this excerpt; presumably the shared conditioning vector. Confirm.
    %4587 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the weight [18432,3072] and transpose it to [3072,18432]; load the bias [18432].
    %__auto.sampler.double_blocks.14.txt_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.14.txt_mod.lin.weight : tensor<18432x3072xf16>
    %4588 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.txt_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_5729 = torch.constant.int 0
    %int1_5730 = torch.constant.int 1
    %4589 = torch.aten.transpose.int %4588, %int0_5729, %int1_5730 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.14.txt_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.14.txt_mod.lin.bias : tensor<18432xf16>
    %4590 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.txt_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Convert all three operands to f32 (dtype code 6) and multiply.
    %int6_5731 = torch.constant.int 6
    %4591 = torch.prims.convert_element_type %4590, %int6_5731 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_5732 = torch.constant.int 6
    %4592 = torch.prims.convert_element_type %4587, %int6_5732 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_5733 = torch.constant.int 6
    %4593 = torch.prims.convert_element_type %4589, %int6_5733 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %4594 = torch.aten.mm %4592, %4593 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Multiply product and bias by the integer constant 1, then add the bias.
    %int1_5734 = torch.constant.int 1
    %4595 = torch.aten.mul.Scalar %4594, %int1_5734 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_5735 = torch.constant.int 1
    %4596 = torch.aten.mul.Scalar %4591, %int1_5735 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_5736 = torch.constant.int 1
    %4597 = torch.aten.add.Tensor %4595, %4596, %int1_5736 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    // Convert back to f16 (dtype code 5).
    %int5_5737 = torch.constant.int 5
    %4598 = torch.prims.convert_element_type %4597, %int5_5737 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // ---- Reshape %4598 to [1,1,18432] and split the last dim into six [1,1,3072] slices ----
    // Slice dim 0 over [0, INT64_MAX) with step 1; the shape stays [1,18432].
    %int0_5738 = torch.constant.int 0
    %int0_5739 = torch.constant.int 0
    %int9223372036854775807_5740 = torch.constant.int 9223372036854775807
    %int1_5741 = torch.constant.int 1
    %4599 = torch.aten.slice.Tensor %4598, %int0_5738, %int0_5739, %int9223372036854775807_5740, %int1_5741 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a size-1 dim at position 1, giving [1,1,18432].
    %int1_5742 = torch.constant.int 1
    %4600 = torch.aten.unsqueeze %4599, %int1_5742 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Slice dim 2 over its full range; the shape is unchanged.
    %int2_5743 = torch.constant.int 2
    %int0_5744 = torch.constant.int 0
    %int9223372036854775807_5745 = torch.constant.int 9223372036854775807
    %int1_5746 = torch.constant.int 1
    %4601 = torch.aten.slice.Tensor %4600, %int2_5743, %int0_5744, %int9223372036854775807_5745, %int1_5746 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // NOTE(review): the six slices below are presumably (shift, scale, gate) for attention followed by (shift, scale, gate) for the MLP; their uses are outside this excerpt. Confirm.
    // Slice 1: last dim [0, 3072).
    %int-1_5747 = torch.constant.int -1
    %int0_5748 = torch.constant.int 0
    %int3072_5749 = torch.constant.int 3072
    %int1_5750 = torch.constant.int 1
    %4602 = torch.aten.slice.Tensor %4601, %int-1_5747, %int0_5748, %int3072_5749, %int1_5750 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice 2: last dim [3072, 6144).
    %int-1_5751 = torch.constant.int -1
    %int3072_5752 = torch.constant.int 3072
    %int6144_5753 = torch.constant.int 6144
    %int1_5754 = torch.constant.int 1
    %4603 = torch.aten.slice.Tensor %4601, %int-1_5751, %int3072_5752, %int6144_5753, %int1_5754 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice 3: last dim [6144, 9216).
    %int-1_5755 = torch.constant.int -1
    %int6144_5756 = torch.constant.int 6144
    %int9216_5757 = torch.constant.int 9216
    %int1_5758 = torch.constant.int 1
    %4604 = torch.aten.slice.Tensor %4601, %int-1_5755, %int6144_5756, %int9216_5757, %int1_5758 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice 4: last dim [9216, 12288).
    %int-1_5759 = torch.constant.int -1
    %int9216_5760 = torch.constant.int 9216
    %int12288_5761 = torch.constant.int 12288
    %int1_5762 = torch.constant.int 1
    %4605 = torch.aten.slice.Tensor %4601, %int-1_5759, %int9216_5760, %int12288_5761, %int1_5762 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice 5: last dim [12288, 15360).
    %int-1_5763 = torch.constant.int -1
    %int12288_5764 = torch.constant.int 12288
    %int15360_5765 = torch.constant.int 15360
    %int1_5766 = torch.constant.int 1
    %4606 = torch.aten.slice.Tensor %4601, %int-1_5763, %int12288_5764, %int15360_5765, %int1_5766 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Slice 6: last dim [15360, 18432).
    %int-1_5767 = torch.constant.int -1
    %int15360_5768 = torch.constant.int 15360
    %int18432_5769 = torch.constant.int 18432
    %int1_5770 = torch.constant.int 1
    %4607 = torch.aten.slice.Tensor %4601, %int-1_5767, %int15360_5768, %int18432_5769, %int1_5770 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize the [1,4096,3072] stream %4505 over its last dimension, then
    // apply a shift/scale modulation and flatten to 2-D for the qkv matmul.
    // Computes: ((x - mean) * rsqrt(var + 1e-6)) * (1 + %4582) + %4581.
    // %4505, %4581 and %4582 are defined earlier in the function.
    // Statistics are computed in f32 (dtype code 6 -> f32 per the result type).
    %int6_5771 = torch.constant.int 6
    %4608 = torch.prims.convert_element_type %4505, %int6_5771 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %int2_5772 = torch.constant.int 2
    %4609 = torch.prim.ListConstruct %int2_5772 : (!torch.int) -> !torch.list<int>
    %int0_5773 = torch.constant.int 0
    %true_5774 = torch.constant.bool true
    // Variance (result0) and mean (result1) over dim 2, correction = 0, keepdim = true.
    %result0_5775, %result1_5776 = torch.aten.var_mean.correction %4608, %4609, %int0_5773, %true_5774 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    %float9.999990e-07_5777 = torch.constant.float 9.9999999999999995E-7
    %int1_5778 = torch.constant.int 1
    // rsqrt(var + eps), with eps ~= 1e-6.
    %4610 = torch.aten.add.Scalar %result0_5775, %float9.999990e-07_5777, %int1_5778 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %4611 = torch.aten.rsqrt %4610 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    %int1_5779 = torch.constant.int 1
    // The subtraction takes the original f16 tensor and the f32 mean; the result is f32.
    %4612 = torch.aten.sub.Tensor %4505, %result1_5776, %int1_5779 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %4613 = torch.aten.mul.Tensor %4612, %4611 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Cast the normalized tensor back to f16 (dtype code 5 -> f16 per the result type).
    %int5_5780 = torch.constant.int 5
    %4614 = torch.prims.convert_element_type %4613, %int5_5780 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %int1_5781 = torch.constant.int 1
    %int1_5782 = torch.constant.int 1
    // (1 + %4582) broadcast-multiplied over the 4096 rows, then %4581 added.
    %4615 = torch.aten.add.Scalar %4582, %int1_5781, %int1_5782 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %4616 = torch.aten.mul.Tensor %4615, %4614 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_5783 = torch.constant.int 1
    %4617 = torch.aten.add.Tensor %4616, %4581, %int1_5783 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Drop the leading unit dimension: [1,4096,3072] -> [4096,3072].
    %int4096_5784 = torch.constant.int 4096
    %int3072_5785 = torch.constant.int 3072
    %4618 = torch.prim.ListConstruct %int4096_5784, %int3072_5785 : (!torch.int, !torch.int) -> !torch.list<int>
    %4619 = torch.aten.view %4617, %4618 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Linear projection with the double_blocks.14 img_attn.qkv parameters:
    // [4096,3072] x transpose([9216,3072]) + bias[9216] -> [1,4096,9216].
    // The matmul and bias add run in f32; the result is cast back to f16.
    %__auto.sampler.double_blocks.14.img_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.14.img_attn.qkv.weight : tensor<9216x3072xf16>
    %4620 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.img_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_5786 = torch.constant.int 0
    %int1_5787 = torch.constant.int 1
    // Transpose the weight to [in, out] = [3072, 9216].
    %4621 = torch.aten.transpose.int %4620, %int0_5786, %int1_5787 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.14.img_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.14.img_attn.qkv.bias : tensor<9216xf16>
    %4622 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.img_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, activation and weight to f32 before the matmul.
    %int6_5788 = torch.constant.int 6
    %4623 = torch.prims.convert_element_type %4622, %int6_5788 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_5789 = torch.constant.int 6
    %4624 = torch.prims.convert_element_type %4619, %int6_5789 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_5790 = torch.constant.int 6
    %4625 = torch.prims.convert_element_type %4621, %int6_5790 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %4626 = torch.aten.mm %4624, %4625 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[4096,9216],f32>
    // Both operands are scaled by the integer 1, which leaves the values unchanged.
    // NOTE(review): presumably alpha/beta left over from an addmm decomposition -- confirm.
    %int1_5791 = torch.constant.int 1
    %4627 = torch.aten.mul.Scalar %4626, %int1_5791 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int1_5792 = torch.constant.int 1
    %4628 = torch.aten.mul.Scalar %4623, %int1_5792 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_5793 = torch.constant.int 1
    // Bias is broadcast across the 4096 rows.
    %4629 = torch.aten.add.Tensor %4627, %4628, %int1_5793 : !torch.vtensor<[4096,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int5_5794 = torch.constant.int 5
    %4630 = torch.prims.convert_element_type %4629, %int5_5794 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f16>
    // Restore the leading unit dimension: [4096,9216] -> [1,4096,9216].
    %int1_5795 = torch.constant.int 1
    %int4096_5796 = torch.constant.int 4096
    %int9216_5797 = torch.constant.int 9216
    %4631 = torch.prim.ListConstruct %int1_5795, %int4096_5796, %int9216_5797 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4632 = torch.aten.view %4630, %4631 : !torch.vtensor<[4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,9216],f16>
    // Split the fused [1,4096,9216] projection into three [1,24,4096,128]
    // tensors: view as [1,4096,3,24,128], permute to [3,1,24,4096,128], then
    // select indices 0, 1 and 2 along dim 0.
    // Further down, %4637 goes through the query_norm scale, %4638 through the
    // key_norm scale, and %4639 is concatenated into the third attention operand.
    %int1_5798 = torch.constant.int 1
    %int4096_5799 = torch.constant.int 4096
    %int3_5800 = torch.constant.int 3
    %int24_5801 = torch.constant.int 24
    %int128_5802 = torch.constant.int 128
    %4633 = torch.prim.ListConstruct %int1_5798, %int4096_5799, %int3_5800, %int24_5801, %int128_5802 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4634 = torch.aten.view %4632, %4633 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    // Permutation [2,0,3,1,4] moves the size-3 axis to the front and the
    // size-24 axis ahead of the 4096-token axis.
    %int2_5803 = torch.constant.int 2
    %int0_5804 = torch.constant.int 0
    %int3_5805 = torch.constant.int 3
    %int1_5806 = torch.constant.int 1
    %int4_5807 = torch.constant.int 4
    %4635 = torch.prim.ListConstruct %int2_5803, %int0_5804, %int3_5805, %int1_5806, %int4_5807 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4636 = torch.aten.permute %4634, %4635 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // Index 0 along dim 0.
    %int0_5808 = torch.constant.int 0
    %int0_5809 = torch.constant.int 0
    %4637 = torch.aten.select.int %4636, %int0_5808, %int0_5809 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Index 1 along dim 0.
    %int0_5810 = torch.constant.int 0
    %int1_5811 = torch.constant.int 1
    %4638 = torch.aten.select.int %4636, %int0_5810, %int1_5811 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Index 2 along dim 0.
    %int0_5812 = torch.constant.int 0
    %int2_5813 = torch.constant.int 2
    %4639 = torch.aten.select.int %4636, %int0_5812, %int2_5813 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Root-mean-square normalization of %4637 over its last dimension (128),
    // followed by the learned img_attn.norm.query_norm.scale:
    //   out = f16(x * rsqrt(mean(x^2, dim=-1) + 1e-6)) * scale
    // The normalization itself runs in f32; the scale multiply runs in f16.
    %int6_5814 = torch.constant.int 6
    %4640 = torch.prims.convert_element_type %4637, %int6_5814 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_5815 = torch.constant.int 2
    %4641 = torch.aten.pow.Tensor_Scalar %4640, %int2_5815 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_5816 = torch.constant.int -1
    %4642 = torch.prim.ListConstruct %int-1_5816 : (!torch.int) -> !torch.list<int>
    %true_5817 = torch.constant.bool true
    %none_5818 = torch.constant.none
    // Mean of squares over the last dim, keepdim = true.
    %4643 = torch.aten.mean.dim %4641, %4642, %true_5817, %none_5818 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_5819 = torch.constant.float 9.9999999999999995E-7
    %int1_5820 = torch.constant.int 1
    %4644 = torch.aten.add.Scalar %4643, %float9.999990e-07_5819, %int1_5820 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %4645 = torch.aten.rsqrt %4644 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %4646 = torch.aten.mul.Tensor %4640, %4645 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    %int5_5821 = torch.constant.int 5
    %4647 = torch.prims.convert_element_type %4646, %int5_5821 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // The [128] scale is broadcast over the leading dimensions.
    %__auto.sampler.double_blocks.14.img_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.14.img_attn.norm.query_norm.scale : tensor<128xf16>
    %4648 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.img_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %4649 = torch.aten.mul.Tensor %4647, %4648 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // Root-mean-square normalization of %4638 over its last dimension (128),
    // followed by the learned img_attn.norm.key_norm.scale:
    //   out = f16(x * rsqrt(mean(x^2, dim=-1) + 1e-6)) * scale
    // The normalization itself runs in f32; the scale multiply runs in f16.
    %int6_5822 = torch.constant.int 6
    %4650 = torch.prims.convert_element_type %4638, %int6_5822 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_5823 = torch.constant.int 2
    %4651 = torch.aten.pow.Tensor_Scalar %4650, %int2_5823 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_5824 = torch.constant.int -1
    %4652 = torch.prim.ListConstruct %int-1_5824 : (!torch.int) -> !torch.list<int>
    %true_5825 = torch.constant.bool true
    %none_5826 = torch.constant.none
    // Mean of squares over the last dim, keepdim = true.
    %4653 = torch.aten.mean.dim %4651, %4652, %true_5825, %none_5826 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_5827 = torch.constant.float 9.9999999999999995E-7
    %int1_5828 = torch.constant.int 1
    %4654 = torch.aten.add.Scalar %4653, %float9.999990e-07_5827, %int1_5828 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %4655 = torch.aten.rsqrt %4654 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %4656 = torch.aten.mul.Tensor %4650, %4655 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    %int5_5829 = torch.constant.int 5
    %4657 = torch.prims.convert_element_type %4656, %int5_5829 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // The [128] scale is broadcast over the leading dimensions.
    %__auto.sampler.double_blocks.14.img_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.14.img_attn.norm.key_norm.scale : tensor<128xf16>
    %4658 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.img_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %4659 = torch.aten.mul.Tensor %4657, %4658 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // Element-type conversions of the normalized img query (%4649) and key
    // (%4659). Source and result types are both f16, so no values change.
    // NOTE(review): presumably emitted from a `.to(dtype)` call in the traced
    // model -- confirm; later ops reference %4660/%4661, so the names must stay.
    %int5_5830 = torch.constant.int 5
    %4660 = torch.prims.convert_element_type %4649, %int5_5830 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_5831 = torch.constant.int 5
    %4661 = torch.prims.convert_element_type %4659, %int5_5831 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Normalize the [1,512,3072] stream %4565 over its last dimension, then
    // apply a shift/scale modulation and flatten to 2-D for the qkv matmul.
    // Computes: ((x - mean) * rsqrt(var + 1e-6)) * (1 + %4603) + %4602.
    // %4565 is defined earlier in the function; %4602 and %4603 are the first
    // two 3072-wide slices of %4601.
    // Statistics are computed in f32 (dtype code 6 -> f32 per the result type).
    %int6_5832 = torch.constant.int 6
    %4662 = torch.prims.convert_element_type %4565, %int6_5832 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %int2_5833 = torch.constant.int 2
    %4663 = torch.prim.ListConstruct %int2_5833 : (!torch.int) -> !torch.list<int>
    %int0_5834 = torch.constant.int 0
    %true_5835 = torch.constant.bool true
    // Variance (result0) and mean (result1) over dim 2, correction = 0, keepdim = true.
    %result0_5836, %result1_5837 = torch.aten.var_mean.correction %4662, %4663, %int0_5834, %true_5835 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    %float9.999990e-07_5838 = torch.constant.float 9.9999999999999995E-7
    %int1_5839 = torch.constant.int 1
    // rsqrt(var + eps), with eps ~= 1e-6.
    %4664 = torch.aten.add.Scalar %result0_5836, %float9.999990e-07_5838, %int1_5839 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %4665 = torch.aten.rsqrt %4664 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    %int1_5840 = torch.constant.int 1
    // The subtraction takes the original f16 tensor and the f32 mean; the result is f32.
    %4666 = torch.aten.sub.Tensor %4565, %result1_5837, %int1_5840 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %4667 = torch.aten.mul.Tensor %4666, %4665 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    // Cast the normalized tensor back to f16 (dtype code 5 -> f16 per the result type).
    %int5_5841 = torch.constant.int 5
    %4668 = torch.prims.convert_element_type %4667, %int5_5841 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %int1_5842 = torch.constant.int 1
    %int1_5843 = torch.constant.int 1
    // (1 + %4603) broadcast-multiplied over the 512 rows, then %4602 added.
    %4669 = torch.aten.add.Scalar %4603, %int1_5842, %int1_5843 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %4670 = torch.aten.mul.Tensor %4669, %4668 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_5844 = torch.constant.int 1
    %4671 = torch.aten.add.Tensor %4670, %4602, %int1_5844 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Drop the leading unit dimension: [1,512,3072] -> [512,3072].
    %int512_5845 = torch.constant.int 512
    %int3072_5846 = torch.constant.int 3072
    %4672 = torch.prim.ListConstruct %int512_5845, %int3072_5846 : (!torch.int, !torch.int) -> !torch.list<int>
    %4673 = torch.aten.view %4671, %4672 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Linear projection with the double_blocks.14 txt_attn.qkv parameters:
    // [512,3072] x transpose([9216,3072]) + bias[9216] -> [1,512,9216].
    // The matmul and bias add run in f32; the result is cast back to f16.
    %__auto.sampler.double_blocks.14.txt_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.14.txt_attn.qkv.weight : tensor<9216x3072xf16>
    %4674 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.txt_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_5847 = torch.constant.int 0
    %int1_5848 = torch.constant.int 1
    // Transpose the weight to [in, out] = [3072, 9216].
    %4675 = torch.aten.transpose.int %4674, %int0_5847, %int1_5848 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.14.txt_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.14.txt_attn.qkv.bias : tensor<9216xf16>
    %4676 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.txt_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, activation and weight to f32 before the matmul.
    %int6_5849 = torch.constant.int 6
    %4677 = torch.prims.convert_element_type %4676, %int6_5849 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_5850 = torch.constant.int 6
    %4678 = torch.prims.convert_element_type %4673, %int6_5850 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_5851 = torch.constant.int 6
    %4679 = torch.prims.convert_element_type %4675, %int6_5851 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %4680 = torch.aten.mm %4678, %4679 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[512,9216],f32>
    // Both operands are scaled by the integer 1, which leaves the values unchanged.
    // NOTE(review): presumably alpha/beta left over from an addmm decomposition -- confirm.
    %int1_5852 = torch.constant.int 1
    %4681 = torch.aten.mul.Scalar %4680, %int1_5852 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int1_5853 = torch.constant.int 1
    %4682 = torch.aten.mul.Scalar %4677, %int1_5853 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_5854 = torch.constant.int 1
    // Bias is broadcast across the 512 rows.
    %4683 = torch.aten.add.Tensor %4681, %4682, %int1_5854 : !torch.vtensor<[512,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int5_5855 = torch.constant.int 5
    %4684 = torch.prims.convert_element_type %4683, %int5_5855 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f16>
    // Restore the leading unit dimension: [512,9216] -> [1,512,9216].
    %int1_5856 = torch.constant.int 1
    %int512_5857 = torch.constant.int 512
    %int9216_5858 = torch.constant.int 9216
    %4685 = torch.prim.ListConstruct %int1_5856, %int512_5857, %int9216_5858 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4686 = torch.aten.view %4684, %4685 : !torch.vtensor<[512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,9216],f16>
    // Split the fused [1,512,9216] projection into three [1,24,512,128]
    // tensors: view as [1,512,3,24,128], permute to [3,1,24,512,128], then
    // select indices 0, 1 and 2 along dim 0.
    // Further down, %4691 goes through the query_norm scale, %4692 through the
    // key_norm scale, and %4693 is concatenated into the third attention operand.
    %int1_5859 = torch.constant.int 1
    %int512_5860 = torch.constant.int 512
    %int3_5861 = torch.constant.int 3
    %int24_5862 = torch.constant.int 24
    %int128_5863 = torch.constant.int 128
    %4687 = torch.prim.ListConstruct %int1_5859, %int512_5860, %int3_5861, %int24_5862, %int128_5863 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4688 = torch.aten.view %4686, %4687 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    // Permutation [2,0,3,1,4] moves the size-3 axis to the front and the
    // size-24 axis ahead of the 512-token axis.
    %int2_5864 = torch.constant.int 2
    %int0_5865 = torch.constant.int 0
    %int3_5866 = torch.constant.int 3
    %int1_5867 = torch.constant.int 1
    %int4_5868 = torch.constant.int 4
    %4689 = torch.prim.ListConstruct %int2_5864, %int0_5865, %int3_5866, %int1_5867, %int4_5868 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4690 = torch.aten.permute %4688, %4689 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // Index 0 along dim 0.
    %int0_5869 = torch.constant.int 0
    %int0_5870 = torch.constant.int 0
    %4691 = torch.aten.select.int %4690, %int0_5869, %int0_5870 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Index 1 along dim 0.
    %int0_5871 = torch.constant.int 0
    %int1_5872 = torch.constant.int 1
    %4692 = torch.aten.select.int %4690, %int0_5871, %int1_5872 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Index 2 along dim 0.
    %int0_5873 = torch.constant.int 0
    %int2_5874 = torch.constant.int 2
    %4693 = torch.aten.select.int %4690, %int0_5873, %int2_5874 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Root-mean-square normalization of %4691 over its last dimension (128),
    // followed by the learned txt_attn.norm.query_norm.scale:
    //   out = f16(x * rsqrt(mean(x^2, dim=-1) + 1e-6)) * scale
    // The normalization itself runs in f32; the scale multiply runs in f16.
    %int6_5875 = torch.constant.int 6
    %4694 = torch.prims.convert_element_type %4691, %int6_5875 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_5876 = torch.constant.int 2
    %4695 = torch.aten.pow.Tensor_Scalar %4694, %int2_5876 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_5877 = torch.constant.int -1
    %4696 = torch.prim.ListConstruct %int-1_5877 : (!torch.int) -> !torch.list<int>
    %true_5878 = torch.constant.bool true
    %none_5879 = torch.constant.none
    // Mean of squares over the last dim, keepdim = true.
    %4697 = torch.aten.mean.dim %4695, %4696, %true_5878, %none_5879 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_5880 = torch.constant.float 9.9999999999999995E-7
    %int1_5881 = torch.constant.int 1
    %4698 = torch.aten.add.Scalar %4697, %float9.999990e-07_5880, %int1_5881 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %4699 = torch.aten.rsqrt %4698 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %4700 = torch.aten.mul.Tensor %4694, %4699 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_5882 = torch.constant.int 5
    %4701 = torch.prims.convert_element_type %4700, %int5_5882 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // The [128] scale is broadcast over the leading dimensions.
    %__auto.sampler.double_blocks.14.txt_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.14.txt_attn.norm.query_norm.scale : tensor<128xf16>
    %4702 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.txt_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %4703 = torch.aten.mul.Tensor %4701, %4702 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // Root-mean-square normalization of %4692 over its last dimension (128),
    // followed by the learned txt_attn.norm.key_norm.scale:
    //   out = f16(x * rsqrt(mean(x^2, dim=-1) + 1e-6)) * scale
    // The normalization itself runs in f32; the scale multiply runs in f16.
    %int6_5883 = torch.constant.int 6
    %4704 = torch.prims.convert_element_type %4692, %int6_5883 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_5884 = torch.constant.int 2
    %4705 = torch.aten.pow.Tensor_Scalar %4704, %int2_5884 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_5885 = torch.constant.int -1
    %4706 = torch.prim.ListConstruct %int-1_5885 : (!torch.int) -> !torch.list<int>
    %true_5886 = torch.constant.bool true
    %none_5887 = torch.constant.none
    // Mean of squares over the last dim, keepdim = true.
    %4707 = torch.aten.mean.dim %4705, %4706, %true_5886, %none_5887 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_5888 = torch.constant.float 9.9999999999999995E-7
    %int1_5889 = torch.constant.int 1
    %4708 = torch.aten.add.Scalar %4707, %float9.999990e-07_5888, %int1_5889 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %4709 = torch.aten.rsqrt %4708 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %4710 = torch.aten.mul.Tensor %4704, %4709 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_5890 = torch.constant.int 5
    %4711 = torch.prims.convert_element_type %4710, %int5_5890 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // The [128] scale is broadcast over the leading dimensions.
    %__auto.sampler.double_blocks.14.txt_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.14.txt_attn.norm.key_norm.scale : tensor<128xf16>
    %4712 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.txt_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %4713 = torch.aten.mul.Tensor %4711, %4712 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // Element-type conversions of the normalized txt query (%4703) and key
    // (%4713). Source and result types are both f16, so no values change.
    // NOTE(review): presumably emitted from a `.to(dtype)` call in the traced
    // model -- confirm; later ops reference %4714/%4715, so the names must stay.
    %int5_5891 = torch.constant.int 5
    %4714 = torch.prims.convert_element_type %4703, %int5_5891 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int5_5892 = torch.constant.int 5
    %4715 = torch.prims.convert_element_type %4713, %int5_5892 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Concatenate the txt (512 tokens) and img (4096 tokens) tensors along
    // dim 2, txt first, giving [1,24,4608,128] for each of q, k and v.
    // Queries: txt %4714 followed by img %4660.
    %4716 = torch.prim.ListConstruct %4714, %4660 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_5893 = torch.constant.int 2
    %4717 = torch.aten.cat %4716, %int2_5893 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Keys: txt %4715 followed by img %4661.
    %4718 = torch.prim.ListConstruct %4715, %4661 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_5894 = torch.constant.int 2
    %4719 = torch.aten.cat %4718, %int2_5894 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Values: txt %4693 followed by img %4639 (neither went through the q/k normalization).
    %4720 = torch.prim.ListConstruct %4693, %4639 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_5895 = torch.constant.int 2
    %4721 = torch.aten.cat %4720, %int2_5895 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Upcast the concatenated q (%4717) and k (%4719) to f32 and view the last
    // dimension of 128 as 64 pairs: [1,24,4608,128] -> [1,24,4608,64,1,2].
    // The -1 entry in the view shape resolves to 64 per the result type.
    %int6_5896 = torch.constant.int 6
    %4722 = torch.prims.convert_element_type %4717, %int6_5896 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_5897 = torch.constant.int 1
    %int24_5898 = torch.constant.int 24
    %int4608_5899 = torch.constant.int 4608
    %int-1_5900 = torch.constant.int -1
    %int1_5901 = torch.constant.int 1
    %int2_5902 = torch.constant.int 2
    %4723 = torch.prim.ListConstruct %int1_5897, %int24_5898, %int4608_5899, %int-1_5900, %int1_5901, %int2_5902 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4724 = torch.aten.view %4722, %4723 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Same upcast and view for the keys.
    %int6_5903 = torch.constant.int 6
    %4725 = torch.prims.convert_element_type %4719, %int6_5903 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_5904 = torch.constant.int 1
    %int24_5905 = torch.constant.int 24
    %int4608_5906 = torch.constant.int 4608
    %int-1_5907 = torch.constant.int -1
    %int1_5908 = torch.constant.int 1
    %int2_5909 = torch.constant.int 2
    %4726 = torch.prim.ListConstruct %int1_5904, %int24_5905, %int4608_5906, %int-1_5907, %int1_5908, %int2_5909 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4727 = torch.aten.view %4725, %4726 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Apply the per-position 2x2 table %211 ([1,1,4608,64,2,2], defined earlier
    // in the function) to each query pair:
    //   out[..., i] = %211[..., i, 0] * q[..., 0] + %211[..., i, 1] * q[..., 1]
    // The size-1 axes broadcast, so the one table is shared by all 24 heads.
    // NOTE(review): %211 is presumably the rotary position-embedding table -- confirm.
    %int5_5910 = torch.constant.int 5
    %int0_5911 = torch.constant.int 0
    // Column 0 of the table times component 0 of each pair.
    %4728 = torch.aten.select.int %211, %int5_5910, %int0_5911 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_5912 = torch.constant.int 5
    %int0_5913 = torch.constant.int 0
    %4729 = torch.aten.select.int %4724, %int5_5912, %int0_5913 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %4730 = torch.aten.mul.Tensor %4728, %4729 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Column 1 of the table times component 1 of each pair.
    %int5_5914 = torch.constant.int 5
    %int1_5915 = torch.constant.int 1
    %4731 = torch.aten.select.int %211, %int5_5914, %int1_5915 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_5916 = torch.constant.int 5
    %int1_5917 = torch.constant.int 1
    %4732 = torch.aten.select.int %4724, %int5_5916, %int1_5917 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %4733 = torch.aten.mul.Tensor %4731, %4732 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two products.
    %int1_5918 = torch.constant.int 1
    %4734 = torch.aten.add.Tensor %4730, %4733, %int1_5918 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Apply the per-position 2x2 table %211 ([1,1,4608,64,2,2], defined earlier
    // in the function) to each key pair:
    //   out[..., i] = %211[..., i, 0] * k[..., 0] + %211[..., i, 1] * k[..., 1]
    // The size-1 axes broadcast, so the one table is shared by all 24 heads.
    // NOTE(review): %211 is presumably the rotary position-embedding table -- confirm.
    %int5_5919 = torch.constant.int 5
    %int0_5920 = torch.constant.int 0
    // Column 0 of the table times component 0 of each pair.
    %4735 = torch.aten.select.int %211, %int5_5919, %int0_5920 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_5921 = torch.constant.int 5
    %int0_5922 = torch.constant.int 0
    %4736 = torch.aten.select.int %4727, %int5_5921, %int0_5922 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %4737 = torch.aten.mul.Tensor %4735, %4736 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Column 1 of the table times component 1 of each pair.
    %int5_5923 = torch.constant.int 5
    %int1_5924 = torch.constant.int 1
    %4738 = torch.aten.select.int %211, %int5_5923, %int1_5924 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_5925 = torch.constant.int 5
    %int1_5926 = torch.constant.int 1
    %4739 = torch.aten.select.int %4727, %int5_5925, %int1_5926 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %4740 = torch.aten.mul.Tensor %4738, %4739 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two products.
    %int1_5927 = torch.constant.int 1
    %4741 = torch.aten.add.Tensor %4737, %4740, %int1_5927 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Collapse the (64, 2) pair axes back into one 128-wide dimension and cast
    // the transformed q (%4734) and k (%4741) from f32 to f16.
    %int1_5928 = torch.constant.int 1
    %int24_5929 = torch.constant.int 24
    %int4608_5930 = torch.constant.int 4608
    %int128_5931 = torch.constant.int 128
    %4742 = torch.prim.ListConstruct %int1_5928, %int24_5929, %int4608_5930, %int128_5931 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4743 = torch.aten.view %4734, %4742 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_5932 = torch.constant.int 5
    // Query in f16.
    %4744 = torch.prims.convert_element_type %4743, %int5_5932 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_5933 = torch.constant.int 1
    %int24_5934 = torch.constant.int 24
    %int4608_5935 = torch.constant.int 4608
    %int128_5936 = torch.constant.int 128
    %4745 = torch.prim.ListConstruct %int1_5933, %int24_5934, %int4608_5935, %int128_5936 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4746 = torch.aten.view %4741, %4745 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_5937 = torch.constant.int 5
    // Key in f16.
    %4747 = torch.prims.convert_element_type %4746, %int5_5937 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Attention over the joint 4608-token sequence with q = %4744, k = %4747,
    // v = %4721, each [1,24,4608,128] f16.
    // Result #0 is the [1,24,4608,128] f16 output; result #1 is a [1,24,4608]
    // f32 tensor that the lines below do not use.
    // NOTE(review): the trailing operands (0.0, false, none, none) presumably
    // map to dropout_p, is_causal, attn_mask and scale in the ATen signature --
    // confirm against the op definition.
    %float0.000000e00_5938 = torch.constant.float 0.000000e+00
    %false_5939 = torch.constant.bool false
    %none_5940 = torch.constant.none
    %none_5941 = torch.constant.none
    %4748:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%4744, %4747, %4721, %float0.000000e00_5938, %false_5939, %none_5940, %none_5941) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Merge the 24 heads back into the feature dimension: permute
    // [1,24,4608,128] -> [1,4608,24,128], then view as [1,4608,3072].
    %int0_5942 = torch.constant.int 0
    %int2_5943 = torch.constant.int 2
    %int1_5944 = torch.constant.int 1
    %int3_5945 = torch.constant.int 3
    %4749 = torch.prim.ListConstruct %int0_5942, %int2_5943, %int1_5944, %int3_5945 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4750 = torch.aten.permute %4748#0, %4749 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // 24 * 128 = 3072 features per token.
    %int1_5946 = torch.constant.int 1
    %int4608_5947 = torch.constant.int 4608
    %int3072_5948 = torch.constant.int 3072
    %4751 = torch.prim.ListConstruct %int1_5946, %int4608_5947, %int3072_5948 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4752 = torch.aten.view %4750, %4751 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    %int0_5949 = torch.constant.int 0
    %int0_5950 = torch.constant.int 0
    %int9223372036854775807_5951 = torch.constant.int 9223372036854775807
    %int1_5952 = torch.constant.int 1
    %4753 = torch.aten.slice.Tensor %4752, %int0_5949, %int0_5950, %int9223372036854775807_5951, %int1_5952 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_5953 = torch.constant.int 1
    %int0_5954 = torch.constant.int 0
    %int512_5955 = torch.constant.int 512
    %int1_5956 = torch.constant.int 1
    %4754 = torch.aten.slice.Tensor %4753, %int1_5953, %int0_5954, %int512_5955, %int1_5956 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %int0_5957 = torch.constant.int 0
    %int0_5958 = torch.constant.int 0
    %int9223372036854775807_5959 = torch.constant.int 9223372036854775807
    %int1_5960 = torch.constant.int 1
    %4755 = torch.aten.slice.Tensor %4752, %int0_5957, %int0_5958, %int9223372036854775807_5959, %int1_5960 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_5961 = torch.constant.int 1
    %int512_5962 = torch.constant.int 512
    %int9223372036854775807_5963 = torch.constant.int 9223372036854775807
    %int1_5964 = torch.constant.int 1
    %4756 = torch.aten.slice.Tensor %4755, %int1_5961, %int512_5962, %int9223372036854775807_5963, %int1_5964 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // double_blocks.14.img_attn.proj: linear projection of the 4096-row attention slice (%4756),
    // followed by an element-wise scale and a residual add.
    // Drop the leading unit dim so the projection can be expressed as a 2-D matmul.
    %int4096_5965 = torch.constant.int 4096
    %int3072_5966 = torch.constant.int 3072
    %4757 = torch.prim.ListConstruct %int4096_5965, %int3072_5966 : (!torch.int, !torch.int) -> !torch.list<int>
    %4758 = torch.aten.view %4756, %4757 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Load the weight parameter and transpose dims 0 and 1 so it can be the right-hand mm operand.
    %__auto.sampler.double_blocks.14.img_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.14.img_attn.proj.weight : tensor<3072x3072xf16>
    %4759 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.img_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_5967 = torch.constant.int 0
    %int1_5968 = torch.constant.int 1
    %4760 = torch.aten.transpose.int %4759, %int0_5967, %int1_5968 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.14.img_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.14.img_attn.proj.bias : tensor<3072xf16>
    %4761 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.img_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Convert bias, input and weight from f16 to f32 (dtype code 6) before the matmul.
    %int6_5969 = torch.constant.int 6
    %4762 = torch.prims.convert_element_type %4761, %int6_5969 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_5970 = torch.constant.int 6
    %4763 = torch.prims.convert_element_type %4758, %int6_5970 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_5971 = torch.constant.int 6
    %4764 = torch.prims.convert_element_type %4760, %int6_5971 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %4765 = torch.aten.mm %4763, %4764 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Both the product and the bias are multiplied by the integer constant 1, then summed (alpha = 1).
    %int1_5972 = torch.constant.int 1
    %4766 = torch.aten.mul.Scalar %4765, %int1_5972 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_5973 = torch.constant.int 1
    %4767 = torch.aten.mul.Scalar %4762, %int1_5973 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_5974 = torch.constant.int 1
    %4768 = torch.aten.add.Tensor %4766, %4767, %int1_5974 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    // Convert the result back to f16 (dtype code 5) and restore the leading unit dim.
    %int5_5975 = torch.constant.int 5
    %4769 = torch.prims.convert_element_type %4768, %int5_5975 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_5976 = torch.constant.int 1
    %int4096_5977 = torch.constant.int 4096
    %int3072_5978 = torch.constant.int 3072
    %4770 = torch.prim.ListConstruct %int1_5976, %int4096_5977, %int3072_5978 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4771 = torch.aten.view %4769, %4770 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Scale by %4583 ([1,1,3072], broadcast over the 4096 rows) and add onto %4505.
    // NOTE(review): %4583 and %4505 are defined above this chunk; presumably the block-14 image
    // modulation gate and the incoming image residual stream -- confirm.
    %4772 = torch.aten.mul.Tensor %4583, %4771 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_5979 = torch.constant.int 1
    %4773 = torch.aten.add.Tensor %4505, %4772, %int1_5979 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Normalize %4773 over its last dim (no learned affine parameters appear here), then apply
    // (1 + %4585) * normalized + %4584.
    // NOTE(review): %4585 and %4584 are defined above this chunk; presumably the modulation
    // scale and shift for the image MLP branch -- confirm.
    // Compute 1 + %4585 (add.Scalar with scalar 1 and alpha 1).
    %int1_5980 = torch.constant.int 1
    %int1_5981 = torch.constant.int 1
    %4774 = torch.aten.add.Scalar %4585, %int1_5980, %int1_5981 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Statistics are computed on an f32 copy of the input.
    %int6_5982 = torch.constant.int 6
    %4775 = torch.prims.convert_element_type %4773, %int6_5982 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // Variance (result0) and mean (result1) over dim 2, correction = 0, keepdim = true.
    %int2_5983 = torch.constant.int 2
    %4776 = torch.prim.ListConstruct %int2_5983 : (!torch.int) -> !torch.list<int>
    %int0_5984 = torch.constant.int 0
    %true_5985 = torch.constant.bool true
    %result0_5986, %result1_5987 = torch.aten.var_mean.correction %4775, %4776, %int0_5984, %true_5985 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(variance + ~1e-6).
    %float9.999990e-07_5988 = torch.constant.float 9.9999999999999995E-7
    %int1_5989 = torch.constant.int 1
    %4777 = torch.aten.add.Scalar %result0_5986, %float9.999990e-07_5988, %int1_5989 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %4778 = torch.aten.rsqrt %4777 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // The mean is subtracted from the original f16 tensor (%4773), not the f32 copy; the result is f32.
    %int1_5990 = torch.constant.int 1
    %4779 = torch.aten.sub.Tensor %4773, %result1_5987, %int1_5990 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %4780 = torch.aten.mul.Tensor %4779, %4778 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Back to f16, then multiply by (1 + %4585) and add %4584.
    %int5_5991 = torch.constant.int 5
    %4781 = torch.prims.convert_element_type %4780, %int5_5991 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %4782 = torch.aten.mul.Tensor %4774, %4781 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_5992 = torch.constant.int 1
    %4783 = torch.aten.add.Tensor %4782, %4584, %int1_5992 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // double_blocks.14.img_mlp.0: linear layer 3072 -> 12288 on the modulated tensor (%4783),
    // followed by GELU with the "tanh" approximation.
    // Flatten to 2-D for the matmul.
    %int4096_5993 = torch.constant.int 4096
    %int3072_5994 = torch.constant.int 3072
    %4784 = torch.prim.ListConstruct %int4096_5993, %int3072_5994 : (!torch.int, !torch.int) -> !torch.list<int>
    %4785 = torch.aten.view %4783, %4784 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Weight is stored as [12288,3072]; transpose to [3072,12288] for use as the right-hand operand.
    %__auto.sampler.double_blocks.14.img_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.14.img_mlp.0.weight : tensor<12288x3072xf16>
    %4786 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.img_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_5995 = torch.constant.int 0
    %int1_5996 = torch.constant.int 1
    %4787 = torch.aten.transpose.int %4786, %int0_5995, %int1_5996 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.14.img_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.14.img_mlp.0.bias : tensor<12288xf16>
    %4788 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.img_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Convert bias, input and weight to f32 (dtype code 6) before the matmul.
    %int6_5997 = torch.constant.int 6
    %4789 = torch.prims.convert_element_type %4788, %int6_5997 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_5998 = torch.constant.int 6
    %4790 = torch.prims.convert_element_type %4785, %int6_5998 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_5999 = torch.constant.int 6
    %4791 = torch.prims.convert_element_type %4787, %int6_5999 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %4792 = torch.aten.mm %4790, %4791 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[4096,12288],f32>
    // Product and bias are each multiplied by the integer constant 1, then summed (alpha = 1).
    %int1_6000 = torch.constant.int 1
    %4793 = torch.aten.mul.Scalar %4792, %int1_6000 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int1_6001 = torch.constant.int 1
    %4794 = torch.aten.mul.Scalar %4789, %int1_6001 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_6002 = torch.constant.int 1
    %4795 = torch.aten.add.Tensor %4793, %4794, %int1_6002 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    // Back to f16 (dtype code 5) and restore the leading unit dim.
    %int5_6003 = torch.constant.int 5
    %4796 = torch.prims.convert_element_type %4795, %int5_6003 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f16>
    %int1_6004 = torch.constant.int 1
    %int4096_6005 = torch.constant.int 4096
    %int12288_6006 = torch.constant.int 12288
    %4797 = torch.prim.ListConstruct %int1_6004, %int4096_6005, %int12288_6006 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4798 = torch.aten.view %4796, %4797 : !torch.vtensor<[4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,12288],f16>
    // GELU activation, approximate = "tanh", evaluated in f16.
    %str_6007 = torch.constant.str "tanh"
    %4799 = torch.aten.gelu %4798, %str_6007 : !torch.vtensor<[1,4096,12288],f16>, !torch.str -> !torch.vtensor<[1,4096,12288],f16>
    // Flatten again to 2-D for the next matmul.
    %int4096_6008 = torch.constant.int 4096
    %int12288_6009 = torch.constant.int 12288
    %4800 = torch.prim.ListConstruct %int4096_6008, %int12288_6009 : (!torch.int, !torch.int) -> !torch.list<int>
    %4801 = torch.aten.view %4799, %4800 : !torch.vtensor<[1,4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[4096,12288],f16>
    // double_blocks.14.img_mlp.2: linear layer 12288 -> 3072 on the GELU output (%4801),
    // followed by an element-wise scale and a residual add onto %4773.
    // Weight is stored as [3072,12288]; transpose to [12288,3072] for use as the right-hand operand.
    %__auto.sampler.double_blocks.14.img_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.14.img_mlp.2.weight : tensor<3072x12288xf16>
    %4802 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.img_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_6010 = torch.constant.int 0
    %int1_6011 = torch.constant.int 1
    %4803 = torch.aten.transpose.int %4802, %int0_6010, %int1_6011 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.14.img_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.14.img_mlp.2.bias : tensor<3072xf16>
    %4804 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.img_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Convert bias, input and weight to f32 (dtype code 6) before the matmul.
    %int6_6012 = torch.constant.int 6
    %4805 = torch.prims.convert_element_type %4804, %int6_6012 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_6013 = torch.constant.int 6
    %4806 = torch.prims.convert_element_type %4801, %int6_6013 : !torch.vtensor<[4096,12288],f16>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int6_6014 = torch.constant.int 6
    %4807 = torch.prims.convert_element_type %4803, %int6_6014 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %4808 = torch.aten.mm %4806, %4807 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Product and bias are each multiplied by the integer constant 1, then summed (alpha = 1).
    %int1_6015 = torch.constant.int 1
    %4809 = torch.aten.mul.Scalar %4808, %int1_6015 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_6016 = torch.constant.int 1
    %4810 = torch.aten.mul.Scalar %4805, %int1_6016 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_6017 = torch.constant.int 1
    %4811 = torch.aten.add.Tensor %4809, %4810, %int1_6017 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    // Back to f16 (dtype code 5) and restore the leading unit dim.
    %int5_6018 = torch.constant.int 5
    %4812 = torch.prims.convert_element_type %4811, %int5_6018 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_6019 = torch.constant.int 1
    %int4096_6020 = torch.constant.int 4096
    %int3072_6021 = torch.constant.int 3072
    %4813 = torch.prim.ListConstruct %int1_6019, %int4096_6020, %int3072_6021 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4814 = torch.aten.view %4812, %4813 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Scale by %4586 ([1,1,3072], broadcast over rows) and add onto %4773 (the post-attention sum).
    // NOTE(review): %4586 is defined above this chunk; presumably the MLP modulation gate -- confirm.
    %4815 = torch.aten.mul.Tensor %4586, %4814 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_6022 = torch.constant.int 1
    %4816 = torch.aten.add.Tensor %4773, %4815, %int1_6022 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // double_blocks.14.txt_attn.proj: linear projection of the 512-row attention slice (%4754),
    // followed by an element-wise scale and a residual add.
    // Drop the leading unit dim so the projection can be expressed as a 2-D matmul.
    %int512_6023 = torch.constant.int 512
    %int3072_6024 = torch.constant.int 3072
    %4817 = torch.prim.ListConstruct %int512_6023, %int3072_6024 : (!torch.int, !torch.int) -> !torch.list<int>
    %4818 = torch.aten.view %4754, %4817 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Load the weight parameter and transpose dims 0 and 1 so it can be the right-hand mm operand.
    %__auto.sampler.double_blocks.14.txt_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.14.txt_attn.proj.weight : tensor<3072x3072xf16>
    %4819 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.txt_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_6025 = torch.constant.int 0
    %int1_6026 = torch.constant.int 1
    %4820 = torch.aten.transpose.int %4819, %int0_6025, %int1_6026 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.14.txt_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.14.txt_attn.proj.bias : tensor<3072xf16>
    %4821 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.txt_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Convert bias, input and weight to f32 (dtype code 6) before the matmul.
    %int6_6027 = torch.constant.int 6
    %4822 = torch.prims.convert_element_type %4821, %int6_6027 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_6028 = torch.constant.int 6
    %4823 = torch.prims.convert_element_type %4818, %int6_6028 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_6029 = torch.constant.int 6
    %4824 = torch.prims.convert_element_type %4820, %int6_6029 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %4825 = torch.aten.mm %4823, %4824 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Product and bias are each multiplied by the integer constant 1, then summed (alpha = 1).
    %int1_6030 = torch.constant.int 1
    %4826 = torch.aten.mul.Scalar %4825, %int1_6030 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_6031 = torch.constant.int 1
    %4827 = torch.aten.mul.Scalar %4822, %int1_6031 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_6032 = torch.constant.int 1
    %4828 = torch.aten.add.Tensor %4826, %4827, %int1_6032 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    // Back to f16 (dtype code 5) and restore the leading unit dim.
    %int5_6033 = torch.constant.int 5
    %4829 = torch.prims.convert_element_type %4828, %int5_6033 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_6034 = torch.constant.int 1
    %int512_6035 = torch.constant.int 512
    %int3072_6036 = torch.constant.int 3072
    %4830 = torch.prim.ListConstruct %int1_6034, %int512_6035, %int3072_6036 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4831 = torch.aten.view %4829, %4830 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Scale by %4604 ([1,1,3072], broadcast over the 512 rows) and add onto %4565.
    // NOTE(review): %4604 and %4565 are defined above this chunk; presumably the block-14 text
    // modulation gate and the incoming text residual stream -- confirm.
    %4832 = torch.aten.mul.Tensor %4604, %4831 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_6037 = torch.constant.int 1
    %4833 = torch.aten.add.Tensor %4565, %4832, %int1_6037 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Normalize %4833 over its last dim (no learned affine parameters appear here), then apply
    // (1 + %4606) * normalized + %4605.
    // NOTE(review): %4606 and %4605 are defined above this chunk; presumably the modulation
    // scale and shift for the text MLP branch -- confirm.
    // Compute 1 + %4606 (add.Scalar with scalar 1 and alpha 1).
    %int1_6038 = torch.constant.int 1
    %int1_6039 = torch.constant.int 1
    %4834 = torch.aten.add.Scalar %4606, %int1_6038, %int1_6039 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Statistics are computed on an f32 copy of the input.
    %int6_6040 = torch.constant.int 6
    %4835 = torch.prims.convert_element_type %4833, %int6_6040 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // Variance (result0) and mean (result1) over dim 2, correction = 0, keepdim = true.
    %int2_6041 = torch.constant.int 2
    %4836 = torch.prim.ListConstruct %int2_6041 : (!torch.int) -> !torch.list<int>
    %int0_6042 = torch.constant.int 0
    %true_6043 = torch.constant.bool true
    %result0_6044, %result1_6045 = torch.aten.var_mean.correction %4835, %4836, %int0_6042, %true_6043 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(variance + ~1e-6).
    %float9.999990e-07_6046 = torch.constant.float 9.9999999999999995E-7
    %int1_6047 = torch.constant.int 1
    %4837 = torch.aten.add.Scalar %result0_6044, %float9.999990e-07_6046, %int1_6047 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %4838 = torch.aten.rsqrt %4837 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // The mean is subtracted from the original f16 tensor (%4833), not the f32 copy; the result is f32.
    %int1_6048 = torch.constant.int 1
    %4839 = torch.aten.sub.Tensor %4833, %result1_6045, %int1_6048 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %4840 = torch.aten.mul.Tensor %4839, %4838 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    // Back to f16, then multiply by (1 + %4606) and add %4605.
    %int5_6049 = torch.constant.int 5
    %4841 = torch.prims.convert_element_type %4840, %int5_6049 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %4842 = torch.aten.mul.Tensor %4834, %4841 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_6050 = torch.constant.int 1
    %4843 = torch.aten.add.Tensor %4842, %4605, %int1_6050 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // double_blocks.14.txt_mlp.0: linear layer 3072 -> 12288 on the modulated tensor (%4843),
    // followed by GELU with the "tanh" approximation.
    // Flatten to 2-D for the matmul.
    %int512_6051 = torch.constant.int 512
    %int3072_6052 = torch.constant.int 3072
    %4844 = torch.prim.ListConstruct %int512_6051, %int3072_6052 : (!torch.int, !torch.int) -> !torch.list<int>
    %4845 = torch.aten.view %4843, %4844 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Weight is stored as [12288,3072]; transpose to [3072,12288] for use as the right-hand operand.
    %__auto.sampler.double_blocks.14.txt_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.14.txt_mlp.0.weight : tensor<12288x3072xf16>
    %4846 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.txt_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_6053 = torch.constant.int 0
    %int1_6054 = torch.constant.int 1
    %4847 = torch.aten.transpose.int %4846, %int0_6053, %int1_6054 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.14.txt_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.14.txt_mlp.0.bias : tensor<12288xf16>
    %4848 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.txt_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Convert bias, input and weight to f32 (dtype code 6) before the matmul.
    %int6_6055 = torch.constant.int 6
    %4849 = torch.prims.convert_element_type %4848, %int6_6055 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_6056 = torch.constant.int 6
    %4850 = torch.prims.convert_element_type %4845, %int6_6056 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_6057 = torch.constant.int 6
    %4851 = torch.prims.convert_element_type %4847, %int6_6057 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %4852 = torch.aten.mm %4850, %4851 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[512,12288],f32>
    // Product and bias are each multiplied by the integer constant 1, then summed (alpha = 1).
    %int1_6058 = torch.constant.int 1
    %4853 = torch.aten.mul.Scalar %4852, %int1_6058 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int1_6059 = torch.constant.int 1
    %4854 = torch.aten.mul.Scalar %4849, %int1_6059 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_6060 = torch.constant.int 1
    %4855 = torch.aten.add.Tensor %4853, %4854, %int1_6060 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    // Back to f16 (dtype code 5) and restore the leading unit dim.
    %int5_6061 = torch.constant.int 5
    %4856 = torch.prims.convert_element_type %4855, %int5_6061 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f16>
    %int1_6062 = torch.constant.int 1
    %int512_6063 = torch.constant.int 512
    %int12288_6064 = torch.constant.int 12288
    %4857 = torch.prim.ListConstruct %int1_6062, %int512_6063, %int12288_6064 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4858 = torch.aten.view %4856, %4857 : !torch.vtensor<[512,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,512,12288],f16>
    // GELU activation, approximate = "tanh", evaluated in f16.
    %str_6065 = torch.constant.str "tanh"
    %4859 = torch.aten.gelu %4858, %str_6065 : !torch.vtensor<[1,512,12288],f16>, !torch.str -> !torch.vtensor<[1,512,12288],f16>
    // Flatten again to 2-D for the next matmul.
    %int512_6066 = torch.constant.int 512
    %int12288_6067 = torch.constant.int 12288
    %4860 = torch.prim.ListConstruct %int512_6066, %int12288_6067 : (!torch.int, !torch.int) -> !torch.list<int>
    %4861 = torch.aten.view %4859, %4860 : !torch.vtensor<[1,512,12288],f16>, !torch.list<int> -> !torch.vtensor<[512,12288],f16>
    // double_blocks.14.txt_mlp.2: linear layer 12288 -> 3072 on the GELU output (%4861),
    // followed by an element-wise scale and a residual add onto %4833.
    // Weight is stored as [3072,12288]; transpose to [12288,3072] for use as the right-hand operand.
    %__auto.sampler.double_blocks.14.txt_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.14.txt_mlp.2.weight : tensor<3072x12288xf16>
    %4862 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.txt_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_6068 = torch.constant.int 0
    %int1_6069 = torch.constant.int 1
    %4863 = torch.aten.transpose.int %4862, %int0_6068, %int1_6069 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.14.txt_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.14.txt_mlp.2.bias : tensor<3072xf16>
    %4864 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.14.txt_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Convert bias, input and weight to f32 (dtype code 6) before the matmul.
    %int6_6070 = torch.constant.int 6
    %4865 = torch.prims.convert_element_type %4864, %int6_6070 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_6071 = torch.constant.int 6
    %4866 = torch.prims.convert_element_type %4861, %int6_6071 : !torch.vtensor<[512,12288],f16>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int6_6072 = torch.constant.int 6
    %4867 = torch.prims.convert_element_type %4863, %int6_6072 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %4868 = torch.aten.mm %4866, %4867 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Product and bias are each multiplied by the integer constant 1, then summed (alpha = 1).
    %int1_6073 = torch.constant.int 1
    %4869 = torch.aten.mul.Scalar %4868, %int1_6073 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_6074 = torch.constant.int 1
    %4870 = torch.aten.mul.Scalar %4865, %int1_6074 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_6075 = torch.constant.int 1
    %4871 = torch.aten.add.Tensor %4869, %4870, %int1_6075 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    // Back to f16 (dtype code 5) and restore the leading unit dim.
    %int5_6076 = torch.constant.int 5
    %4872 = torch.prims.convert_element_type %4871, %int5_6076 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_6077 = torch.constant.int 1
    %int512_6078 = torch.constant.int 512
    %int3072_6079 = torch.constant.int 3072
    %4873 = torch.prim.ListConstruct %int1_6077, %int512_6078, %int3072_6079 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4874 = torch.aten.view %4872, %4873 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Scale by %4607 ([1,1,3072], broadcast over rows) and add onto %4833 (the post-attention sum).
    // NOTE(review): %4607 is defined above this chunk; presumably the MLP modulation gate -- confirm.
    %4875 = torch.aten.mul.Tensor %4607, %4874 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_6080 = torch.constant.int 1
    %4876 = torch.aten.add.Tensor %4833, %4875, %int1_6080 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // double_blocks.15.img_mod.lin: SiLU of %119, a linear layer 3072 -> 18432, then a split of the
    // result into six contiguous 3072-wide pieces along the last dim.
    // NOTE(review): %119 ([1,3072]) is defined far above this chunk; presumably the conditioning
    // vector shared by all blocks -- confirm.
    %4877 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Weight is stored as [18432,3072]; transpose to [3072,18432] for use as the right-hand operand.
    %__auto.sampler.double_blocks.15.img_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.15.img_mod.lin.weight : tensor<18432x3072xf16>
    %4878 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.img_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_6081 = torch.constant.int 0
    %int1_6082 = torch.constant.int 1
    %4879 = torch.aten.transpose.int %4878, %int0_6081, %int1_6082 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.15.img_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.15.img_mod.lin.bias : tensor<18432xf16>
    %4880 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.img_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Convert bias, input and weight to f32 (dtype code 6) before the matmul.
    %int6_6083 = torch.constant.int 6
    %4881 = torch.prims.convert_element_type %4880, %int6_6083 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_6084 = torch.constant.int 6
    %4882 = torch.prims.convert_element_type %4877, %int6_6084 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_6085 = torch.constant.int 6
    %4883 = torch.prims.convert_element_type %4879, %int6_6085 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %4884 = torch.aten.mm %4882, %4883 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Product and bias are each multiplied by the integer constant 1, then summed (alpha = 1).
    %int1_6086 = torch.constant.int 1
    %4885 = torch.aten.mul.Scalar %4884, %int1_6086 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_6087 = torch.constant.int 1
    %4886 = torch.aten.mul.Scalar %4881, %int1_6087 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_6088 = torch.constant.int 1
    %4887 = torch.aten.add.Tensor %4885, %4886, %int1_6088 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    // Back to f16 (dtype code 5).
    %int5_6089 = torch.constant.int 5
    %4888 = torch.prims.convert_element_type %4887, %int5_6089 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (shape unchanged), insert a unit dim at position 1 -> [1,1,18432],
    // then a full-range slice on dim 2 (shape unchanged).
    %int0_6090 = torch.constant.int 0
    %int0_6091 = torch.constant.int 0
    %int9223372036854775807_6092 = torch.constant.int 9223372036854775807
    %int1_6093 = torch.constant.int 1
    %4889 = torch.aten.slice.Tensor %4888, %int0_6090, %int0_6091, %int9223372036854775807_6092, %int1_6093 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    %int1_6094 = torch.constant.int 1
    %4890 = torch.aten.unsqueeze %4889, %int1_6094 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    %int2_6095 = torch.constant.int 2
    %int0_6096 = torch.constant.int 0
    %int9223372036854775807_6097 = torch.constant.int 9223372036854775807
    %int1_6098 = torch.constant.int 1
    %4891 = torch.aten.slice.Tensor %4890, %int2_6095, %int0_6096, %int9223372036854775807_6097, %int1_6098 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Six slices of %4891 along the last dim (dim -1), each 3072 wide, step 1.
    // NOTE(review): the roles of the six pieces are not visible in this chunk; by analogy with how
    // block 14 consumes its pieces they are presumably shift/scale/gate for the attention branch
    // followed by shift/scale/gate for the MLP branch -- confirm against their uses below.
    // Piece 0: columns [0, 3072).
    %int-1_6099 = torch.constant.int -1
    %int0_6100 = torch.constant.int 0
    %int3072_6101 = torch.constant.int 3072
    %int1_6102 = torch.constant.int 1
    %4892 = torch.aten.slice.Tensor %4891, %int-1_6099, %int0_6100, %int3072_6101, %int1_6102 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Piece 1: columns [3072, 6144).
    %int-1_6103 = torch.constant.int -1
    %int3072_6104 = torch.constant.int 3072
    %int6144_6105 = torch.constant.int 6144
    %int1_6106 = torch.constant.int 1
    %4893 = torch.aten.slice.Tensor %4891, %int-1_6103, %int3072_6104, %int6144_6105, %int1_6106 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Piece 2: columns [6144, 9216).
    %int-1_6107 = torch.constant.int -1
    %int6144_6108 = torch.constant.int 6144
    %int9216_6109 = torch.constant.int 9216
    %int1_6110 = torch.constant.int 1
    %4894 = torch.aten.slice.Tensor %4891, %int-1_6107, %int6144_6108, %int9216_6109, %int1_6110 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Piece 3: columns [9216, 12288).
    %int-1_6111 = torch.constant.int -1
    %int9216_6112 = torch.constant.int 9216
    %int12288_6113 = torch.constant.int 12288
    %int1_6114 = torch.constant.int 1
    %4895 = torch.aten.slice.Tensor %4891, %int-1_6111, %int9216_6112, %int12288_6113, %int1_6114 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Piece 4: columns [12288, 15360).
    %int-1_6115 = torch.constant.int -1
    %int12288_6116 = torch.constant.int 12288
    %int15360_6117 = torch.constant.int 15360
    %int1_6118 = torch.constant.int 1
    %4896 = torch.aten.slice.Tensor %4891, %int-1_6115, %int12288_6116, %int15360_6117, %int1_6118 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Piece 5: columns [15360, 18432).
    %int-1_6119 = torch.constant.int -1
    %int15360_6120 = torch.constant.int 15360
    %int18432_6121 = torch.constant.int 18432
    %int1_6122 = torch.constant.int 1
    %4897 = torch.aten.slice.Tensor %4891, %int-1_6119, %int15360_6120, %int18432_6121, %int1_6122 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // double_blocks.15 txt_mod.lin: SiLU(%119) is projected through the transposed
    // [18432,3072] weight and the [18432] bias is added. The operands are upcast to
    // f32 (dtype code 6) for the matmul/bias add and the result is cast back to f16
    // (dtype code 5), giving a [1,18432] f16 tensor in %4909.
    %4898 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.double_blocks.15.txt_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.15.txt_mod.lin.weight : tensor<18432x3072xf16>
    %4899 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.txt_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_6123 = torch.constant.int 0
    %int1_6124 = torch.constant.int 1
    // Swap dims 0 and 1 so the weight can be used as the right-hand matmul operand.
    %4900 = torch.aten.transpose.int %4899, %int0_6123, %int1_6124 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.15.txt_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.15.txt_mod.lin.bias : tensor<18432xf16>
    %4901 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.txt_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Upcast bias, activation and weight to f32 before the matmul.
    %int6_6125 = torch.constant.int 6
    %4902 = torch.prims.convert_element_type %4901, %int6_6125 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_6126 = torch.constant.int 6
    %4903 = torch.prims.convert_element_type %4898, %int6_6126 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_6127 = torch.constant.int 6
    %4904 = torch.prims.convert_element_type %4900, %int6_6127 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %4905 = torch.aten.mm %4903, %4904 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Both operands are multiplied by the integer constant 1 before the add
    // (presumably a decomposed addmm with alpha = beta = 1 -- TODO confirm against the exporter).
    %int1_6128 = torch.constant.int 1
    %4906 = torch.aten.mul.Scalar %4905, %int1_6128 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_6129 = torch.constant.int 1
    %4907 = torch.aten.mul.Scalar %4902, %int1_6129 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_6130 = torch.constant.int 1
    %4908 = torch.aten.add.Tensor %4906, %4907, %int1_6130 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    // Cast the f32 result back to f16.
    %int5_6131 = torch.constant.int 5
    %4909 = torch.prims.convert_element_type %4908, %int5_6131 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Reshape the txt_mod output %4909 to [1,1,18432] and split its last dim into six
    // contiguous [1,1,3072] chunks (%4913..%4918). Within this excerpt %4913 is added
    // to, and (%4914 + 1) is multiplied into, the normalized %4876; the remaining four
    // chunks are consumed outside this excerpt.
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): shape is unchanged.
    %int0_6132 = torch.constant.int 0
    %int0_6133 = torch.constant.int 0
    %int9223372036854775807_6134 = torch.constant.int 9223372036854775807
    %int1_6135 = torch.constant.int 1
    %4910 = torch.aten.slice.Tensor %4909, %int0_6132, %int0_6133, %int9223372036854775807_6134, %int1_6135 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a size-1 dim at position 1 so the chunks broadcast over a [1,N,3072] tensor.
    %int1_6136 = torch.constant.int 1
    %4911 = torch.aten.unsqueeze %4910, %int1_6136 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Full-range slice on dim 2: shape is unchanged.
    %int2_6137 = torch.constant.int 2
    %int0_6138 = torch.constant.int 0
    %int9223372036854775807_6139 = torch.constant.int 9223372036854775807
    %int1_6140 = torch.constant.int 1
    %4912 = torch.aten.slice.Tensor %4911, %int2_6137, %int0_6138, %int9223372036854775807_6139, %int1_6140 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Chunk 0: last-dim elements [0, 3072).
    %int-1_6141 = torch.constant.int -1
    %int0_6142 = torch.constant.int 0
    %int3072_6143 = torch.constant.int 3072
    %int1_6144 = torch.constant.int 1
    %4913 = torch.aten.slice.Tensor %4912, %int-1_6141, %int0_6142, %int3072_6143, %int1_6144 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1: last-dim elements [3072, 6144).
    %int-1_6145 = torch.constant.int -1
    %int3072_6146 = torch.constant.int 3072
    %int6144_6147 = torch.constant.int 6144
    %int1_6148 = torch.constant.int 1
    %4914 = torch.aten.slice.Tensor %4912, %int-1_6145, %int3072_6146, %int6144_6147, %int1_6148 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2: last-dim elements [6144, 9216).
    %int-1_6149 = torch.constant.int -1
    %int6144_6150 = torch.constant.int 6144
    %int9216_6151 = torch.constant.int 9216
    %int1_6152 = torch.constant.int 1
    %4915 = torch.aten.slice.Tensor %4912, %int-1_6149, %int6144_6150, %int9216_6151, %int1_6152 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 3: last-dim elements [9216, 12288).
    %int-1_6153 = torch.constant.int -1
    %int9216_6154 = torch.constant.int 9216
    %int12288_6155 = torch.constant.int 12288
    %int1_6156 = torch.constant.int 1
    %4916 = torch.aten.slice.Tensor %4912, %int-1_6153, %int9216_6154, %int12288_6155, %int1_6156 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 4: last-dim elements [12288, 15360).
    %int-1_6157 = torch.constant.int -1
    %int12288_6158 = torch.constant.int 12288
    %int15360_6159 = torch.constant.int 15360
    %int1_6160 = torch.constant.int 1
    %4917 = torch.aten.slice.Tensor %4912, %int-1_6157, %int12288_6158, %int15360_6159, %int1_6160 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 5: last-dim elements [15360, 18432).
    %int-1_6161 = torch.constant.int -1
    %int15360_6162 = torch.constant.int 15360
    %int18432_6163 = torch.constant.int 18432
    %int1_6164 = torch.constant.int 1
    %4918 = torch.aten.slice.Tensor %4912, %int-1_6161, %int15360_6162, %int18432_6163, %int1_6164 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %4816 ([1,4096,3072] f16; presumably the image-stream activations, since
    // the img_attn.qkv projection is applied to the result) over its last dim without a
    // learned affine, then modulate it: %4928 = (1 + %4893) * norm(%4816) + %4892.
    // %4892 / %4893 are defined before this excerpt (presumably the shift and scale
    // chunks of the image modulation -- TODO confirm).
    %int6_6165 = torch.constant.int 6
    %4919 = torch.prims.convert_element_type %4816, %int6_6165 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // Variance and mean over dim 2 in f32, correction = 0, keepdim = true.
    %int2_6166 = torch.constant.int 2
    %4920 = torch.prim.ListConstruct %int2_6166 : (!torch.int) -> !torch.list<int>
    %int0_6167 = torch.constant.int 0
    %true_6168 = torch.constant.bool true
    %result0_6169, %result1_6170 = torch.aten.var_mean.correction %4919, %4920, %int0_6167, %true_6168 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(var + eps) with eps ~= 1e-6.
    %float9.999990e-07_6171 = torch.constant.float 9.9999999999999995E-7
    %int1_6172 = torch.constant.int 1
    %4921 = torch.aten.add.Scalar %result0_6169, %float9.999990e-07_6171, %int1_6172 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %4922 = torch.aten.rsqrt %4921 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // The f16 input minus the f32 mean yields an f32 result; scale by the inverse std.
    %int1_6173 = torch.constant.int 1
    %4923 = torch.aten.sub.Tensor %4816, %result1_6170, %int1_6173 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %4924 = torch.aten.mul.Tensor %4923, %4922 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Cast the normalized tensor back to f16.
    %int5_6174 = torch.constant.int 5
    %4925 = torch.prims.convert_element_type %4924, %int5_6174 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Modulation in f16: multiply by (1 + %4893), then add %4892; both broadcast over dim 1.
    %int1_6175 = torch.constant.int 1
    %int1_6176 = torch.constant.int 1
    %4926 = torch.aten.add.Scalar %4893, %int1_6175, %int1_6176 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %4927 = torch.aten.mul.Tensor %4926, %4925 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_6177 = torch.constant.int 1
    %4928 = torch.aten.add.Tensor %4927, %4892, %int1_6177 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // double_blocks.15 img_attn.qkv: project the modulated tensor %4928 through the
    // transposed [9216,3072] weight plus bias (computed in f32, stored as f16), then
    // reshape to [1,4096,3,24,128], permute to [3,1,24,4096,128] and select the three
    // slices along dim 0 as %4948, %4949, %4950 (used below as query, key and value).
    // Flatten the leading batch dim so a 2-D matmul can be used.
    %int4096_6178 = torch.constant.int 4096
    %int3072_6179 = torch.constant.int 3072
    %4929 = torch.prim.ListConstruct %int4096_6178, %int3072_6179 : (!torch.int, !torch.int) -> !torch.list<int>
    %4930 = torch.aten.view %4928, %4929 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.15.img_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.15.img_attn.qkv.weight : tensor<9216x3072xf16>
    %4931 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.img_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_6180 = torch.constant.int 0
    %int1_6181 = torch.constant.int 1
    %4932 = torch.aten.transpose.int %4931, %int0_6180, %int1_6181 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.15.img_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.15.img_attn.qkv.bias : tensor<9216xf16>
    %4933 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.img_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, activation and weight to f32 (dtype code 6) for the matmul.
    %int6_6182 = torch.constant.int 6
    %4934 = torch.prims.convert_element_type %4933, %int6_6182 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_6183 = torch.constant.int 6
    %4935 = torch.prims.convert_element_type %4930, %int6_6183 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_6184 = torch.constant.int 6
    %4936 = torch.prims.convert_element_type %4932, %int6_6184 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %4937 = torch.aten.mm %4935, %4936 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[4096,9216],f32>
    // Both operands are multiplied by the integer constant 1 before the bias add.
    %int1_6185 = torch.constant.int 1
    %4938 = torch.aten.mul.Scalar %4937, %int1_6185 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int1_6186 = torch.constant.int 1
    %4939 = torch.aten.mul.Scalar %4934, %int1_6186 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_6187 = torch.constant.int 1
    %4940 = torch.aten.add.Tensor %4938, %4939, %int1_6187 : !torch.vtensor<[4096,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    // Cast back to f16 (dtype code 5) and restore the leading batch dim.
    %int5_6188 = torch.constant.int 5
    %4941 = torch.prims.convert_element_type %4940, %int5_6188 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f16>
    %int1_6189 = torch.constant.int 1
    %int4096_6190 = torch.constant.int 4096
    %int9216_6191 = torch.constant.int 9216
    %4942 = torch.prim.ListConstruct %int1_6189, %int4096_6190, %int9216_6191 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4943 = torch.aten.view %4941, %4942 : !torch.vtensor<[4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,9216],f16>
    // Split the 9216 features into 3 x 24 x 128.
    %int1_6192 = torch.constant.int 1
    %int4096_6193 = torch.constant.int 4096
    %int3_6194 = torch.constant.int 3
    %int24_6195 = torch.constant.int 24
    %int128_6196 = torch.constant.int 128
    %4944 = torch.prim.ListConstruct %int1_6192, %int4096_6193, %int3_6194, %int24_6195, %int128_6196 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4945 = torch.aten.view %4943, %4944 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    // Permute with order (2,0,3,1,4) -> [3,1,24,4096,128].
    %int2_6197 = torch.constant.int 2
    %int0_6198 = torch.constant.int 0
    %int3_6199 = torch.constant.int 3
    %int1_6200 = torch.constant.int 1
    %int4_6201 = torch.constant.int 4
    %4946 = torch.prim.ListConstruct %int2_6197, %int0_6198, %int3_6199, %int1_6200, %int4_6201 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4947 = torch.aten.permute %4945, %4946 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // Index 0 of dim 0: normalized below with query_norm.scale.
    %int0_6202 = torch.constant.int 0
    %int0_6203 = torch.constant.int 0
    %4948 = torch.aten.select.int %4947, %int0_6202, %int0_6203 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Index 1 of dim 0: normalized below with key_norm.scale.
    %int0_6204 = torch.constant.int 0
    %int1_6205 = torch.constant.int 1
    %4949 = torch.aten.select.int %4947, %int0_6204, %int1_6205 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Index 2 of dim 0: concatenated un-normalized into the value operand of the attention op.
    %int0_6206 = torch.constant.int 0
    %int2_6207 = torch.constant.int 2
    %4950 = torch.aten.select.int %4947, %int0_6206, %int2_6207 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // double_blocks.15 img_attn.norm: RMS-style normalization of %4948 and %4949 over
    // the last dim (128). Each is computed as x * rsqrt(mean(x^2) + eps) in f32
    // (eps ~= 1e-6), cast to f16, and multiplied by a learned [128] scale
    // (query_norm.scale / key_norm.scale). Results: %4971 and %4972.
    // --- %4948 path (query_norm.scale) ---
    %int6_6208 = torch.constant.int 6
    %4951 = torch.prims.convert_element_type %4948, %int6_6208 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_6209 = torch.constant.int 2
    %4952 = torch.aten.pow.Tensor_Scalar %4951, %int2_6209 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    // Mean of squares over dim -1, keepdim = true, no dtype override.
    %int-1_6210 = torch.constant.int -1
    %4953 = torch.prim.ListConstruct %int-1_6210 : (!torch.int) -> !torch.list<int>
    %true_6211 = torch.constant.bool true
    %none_6212 = torch.constant.none
    %4954 = torch.aten.mean.dim %4952, %4953, %true_6211, %none_6212 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_6213 = torch.constant.float 9.9999999999999995E-7
    %int1_6214 = torch.constant.int 1
    %4955 = torch.aten.add.Scalar %4954, %float9.999990e-07_6213, %int1_6214 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %4956 = torch.aten.rsqrt %4955 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %4957 = torch.aten.mul.Tensor %4951, %4956 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    %int5_6215 = torch.constant.int 5
    %4958 = torch.prims.convert_element_type %4957, %int5_6215 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.15.img_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.15.img_attn.norm.query_norm.scale : tensor<128xf16>
    %4959 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.img_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %4960 = torch.aten.mul.Tensor %4958, %4959 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // --- %4949 path (key_norm.scale) ---
    %int6_6216 = torch.constant.int 6
    %4961 = torch.prims.convert_element_type %4949, %int6_6216 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_6217 = torch.constant.int 2
    %4962 = torch.aten.pow.Tensor_Scalar %4961, %int2_6217 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_6218 = torch.constant.int -1
    %4963 = torch.prim.ListConstruct %int-1_6218 : (!torch.int) -> !torch.list<int>
    %true_6219 = torch.constant.bool true
    %none_6220 = torch.constant.none
    %4964 = torch.aten.mean.dim %4962, %4963, %true_6219, %none_6220 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_6221 = torch.constant.float 9.9999999999999995E-7
    %int1_6222 = torch.constant.int 1
    %4965 = torch.aten.add.Scalar %4964, %float9.999990e-07_6221, %int1_6222 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %4966 = torch.aten.rsqrt %4965 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %4967 = torch.aten.mul.Tensor %4961, %4966 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    %int5_6223 = torch.constant.int 5
    %4968 = torch.prims.convert_element_type %4967, %int5_6223 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.15.img_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.15.img_attn.norm.key_norm.scale : tensor<128xf16>
    %4969 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.img_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %4970 = torch.aten.mul.Tensor %4968, %4969 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // f16 -> f16 conversions: input and result element types are identical, so these
    // casts leave the data unchanged (presumably a `.to(v)` in the source model).
    %int5_6224 = torch.constant.int 5
    %4971 = torch.prims.convert_element_type %4960, %int5_6224 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_6225 = torch.constant.int 5
    %4972 = torch.prims.convert_element_type %4970, %int5_6225 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Normalize %4876 ([1,512,3072] f16; presumably the text-stream activations, since
    // the txt_attn.qkv projection is applied to the result) over its last dim without a
    // learned affine, then modulate it with the txt_mod chunks:
    // %4982 = (1 + %4914) * norm(%4876) + %4913.
    %int6_6226 = torch.constant.int 6
    %4973 = torch.prims.convert_element_type %4876, %int6_6226 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // Variance and mean over dim 2 in f32, correction = 0, keepdim = true.
    %int2_6227 = torch.constant.int 2
    %4974 = torch.prim.ListConstruct %int2_6227 : (!torch.int) -> !torch.list<int>
    %int0_6228 = torch.constant.int 0
    %true_6229 = torch.constant.bool true
    %result0_6230, %result1_6231 = torch.aten.var_mean.correction %4973, %4974, %int0_6228, %true_6229 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(var + eps) with eps ~= 1e-6.
    %float9.999990e-07_6232 = torch.constant.float 9.9999999999999995E-7
    %int1_6233 = torch.constant.int 1
    %4975 = torch.aten.add.Scalar %result0_6230, %float9.999990e-07_6232, %int1_6233 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %4976 = torch.aten.rsqrt %4975 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // The f16 input minus the f32 mean yields an f32 result; scale by the inverse std.
    %int1_6234 = torch.constant.int 1
    %4977 = torch.aten.sub.Tensor %4876, %result1_6231, %int1_6234 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %4978 = torch.aten.mul.Tensor %4977, %4976 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    // Cast the normalized tensor back to f16.
    %int5_6235 = torch.constant.int 5
    %4979 = torch.prims.convert_element_type %4978, %int5_6235 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Modulation in f16: multiply by (1 + chunk 1), then add chunk 0; both broadcast over dim 1.
    %int1_6236 = torch.constant.int 1
    %int1_6237 = torch.constant.int 1
    %4980 = torch.aten.add.Scalar %4914, %int1_6236, %int1_6237 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %4981 = torch.aten.mul.Tensor %4980, %4979 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_6238 = torch.constant.int 1
    %4982 = torch.aten.add.Tensor %4981, %4913, %int1_6238 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // double_blocks.15 txt_attn.qkv: project the modulated tensor %4982 through the
    // transposed [9216,3072] weight plus bias (computed in f32, stored as f16), then
    // reshape to [1,512,3,24,128], permute to [3,1,24,512,128] and select the three
    // slices along dim 0 as %5002, %5003, %5004 (used below as query, key and value).
    // Flatten the leading batch dim so a 2-D matmul can be used.
    %int512_6239 = torch.constant.int 512
    %int3072_6240 = torch.constant.int 3072
    %4983 = torch.prim.ListConstruct %int512_6239, %int3072_6240 : (!torch.int, !torch.int) -> !torch.list<int>
    %4984 = torch.aten.view %4982, %4983 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.15.txt_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.15.txt_attn.qkv.weight : tensor<9216x3072xf16>
    %4985 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.txt_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_6241 = torch.constant.int 0
    %int1_6242 = torch.constant.int 1
    %4986 = torch.aten.transpose.int %4985, %int0_6241, %int1_6242 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.15.txt_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.15.txt_attn.qkv.bias : tensor<9216xf16>
    %4987 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.txt_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, activation and weight to f32 (dtype code 6) for the matmul.
    %int6_6243 = torch.constant.int 6
    %4988 = torch.prims.convert_element_type %4987, %int6_6243 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_6244 = torch.constant.int 6
    %4989 = torch.prims.convert_element_type %4984, %int6_6244 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_6245 = torch.constant.int 6
    %4990 = torch.prims.convert_element_type %4986, %int6_6245 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %4991 = torch.aten.mm %4989, %4990 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[512,9216],f32>
    // Both operands are multiplied by the integer constant 1 before the bias add.
    %int1_6246 = torch.constant.int 1
    %4992 = torch.aten.mul.Scalar %4991, %int1_6246 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int1_6247 = torch.constant.int 1
    %4993 = torch.aten.mul.Scalar %4988, %int1_6247 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_6248 = torch.constant.int 1
    %4994 = torch.aten.add.Tensor %4992, %4993, %int1_6248 : !torch.vtensor<[512,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    // Cast back to f16 (dtype code 5) and restore the leading batch dim.
    %int5_6249 = torch.constant.int 5
    %4995 = torch.prims.convert_element_type %4994, %int5_6249 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f16>
    %int1_6250 = torch.constant.int 1
    %int512_6251 = torch.constant.int 512
    %int9216_6252 = torch.constant.int 9216
    %4996 = torch.prim.ListConstruct %int1_6250, %int512_6251, %int9216_6252 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4997 = torch.aten.view %4995, %4996 : !torch.vtensor<[512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,9216],f16>
    // Split the 9216 features into 3 x 24 x 128.
    %int1_6253 = torch.constant.int 1
    %int512_6254 = torch.constant.int 512
    %int3_6255 = torch.constant.int 3
    %int24_6256 = torch.constant.int 24
    %int128_6257 = torch.constant.int 128
    %4998 = torch.prim.ListConstruct %int1_6253, %int512_6254, %int3_6255, %int24_6256, %int128_6257 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %4999 = torch.aten.view %4997, %4998 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    // Permute with order (2,0,3,1,4) -> [3,1,24,512,128].
    %int2_6258 = torch.constant.int 2
    %int0_6259 = torch.constant.int 0
    %int3_6260 = torch.constant.int 3
    %int1_6261 = torch.constant.int 1
    %int4_6262 = torch.constant.int 4
    %5000 = torch.prim.ListConstruct %int2_6258, %int0_6259, %int3_6260, %int1_6261, %int4_6262 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5001 = torch.aten.permute %4999, %5000 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // Index 0 of dim 0: normalized below with query_norm.scale.
    %int0_6263 = torch.constant.int 0
    %int0_6264 = torch.constant.int 0
    %5002 = torch.aten.select.int %5001, %int0_6263, %int0_6264 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Index 1 of dim 0: normalized below with key_norm.scale.
    %int0_6265 = torch.constant.int 0
    %int1_6266 = torch.constant.int 1
    %5003 = torch.aten.select.int %5001, %int0_6265, %int1_6266 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Index 2 of dim 0: concatenated un-normalized into the value operand of the attention op.
    %int0_6267 = torch.constant.int 0
    %int2_6268 = torch.constant.int 2
    %5004 = torch.aten.select.int %5001, %int0_6267, %int2_6268 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // double_blocks.15 txt_attn.norm: RMS-style normalization of %5002 and %5003 over
    // the last dim (128). Each is computed as x * rsqrt(mean(x^2) + eps) in f32
    // (eps ~= 1e-6), cast to f16, and multiplied by a learned [128] scale
    // (query_norm.scale / key_norm.scale). Results: %5025 and %5026.
    // --- %5002 path (query_norm.scale) ---
    %int6_6269 = torch.constant.int 6
    %5005 = torch.prims.convert_element_type %5002, %int6_6269 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_6270 = torch.constant.int 2
    %5006 = torch.aten.pow.Tensor_Scalar %5005, %int2_6270 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    // Mean of squares over dim -1, keepdim = true, no dtype override.
    %int-1_6271 = torch.constant.int -1
    %5007 = torch.prim.ListConstruct %int-1_6271 : (!torch.int) -> !torch.list<int>
    %true_6272 = torch.constant.bool true
    %none_6273 = torch.constant.none
    %5008 = torch.aten.mean.dim %5006, %5007, %true_6272, %none_6273 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_6274 = torch.constant.float 9.9999999999999995E-7
    %int1_6275 = torch.constant.int 1
    %5009 = torch.aten.add.Scalar %5008, %float9.999990e-07_6274, %int1_6275 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %5010 = torch.aten.rsqrt %5009 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %5011 = torch.aten.mul.Tensor %5005, %5010 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_6276 = torch.constant.int 5
    %5012 = torch.prims.convert_element_type %5011, %int5_6276 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.15.txt_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.15.txt_attn.norm.query_norm.scale : tensor<128xf16>
    %5013 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.txt_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %5014 = torch.aten.mul.Tensor %5012, %5013 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // --- %5003 path (key_norm.scale) ---
    %int6_6277 = torch.constant.int 6
    %5015 = torch.prims.convert_element_type %5003, %int6_6277 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_6278 = torch.constant.int 2
    %5016 = torch.aten.pow.Tensor_Scalar %5015, %int2_6278 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_6279 = torch.constant.int -1
    %5017 = torch.prim.ListConstruct %int-1_6279 : (!torch.int) -> !torch.list<int>
    %true_6280 = torch.constant.bool true
    %none_6281 = torch.constant.none
    %5018 = torch.aten.mean.dim %5016, %5017, %true_6280, %none_6281 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_6282 = torch.constant.float 9.9999999999999995E-7
    %int1_6283 = torch.constant.int 1
    %5019 = torch.aten.add.Scalar %5018, %float9.999990e-07_6282, %int1_6283 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %5020 = torch.aten.rsqrt %5019 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %5021 = torch.aten.mul.Tensor %5015, %5020 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_6284 = torch.constant.int 5
    %5022 = torch.prims.convert_element_type %5021, %int5_6284 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.15.txt_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.15.txt_attn.norm.key_norm.scale : tensor<128xf16>
    %5023 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.txt_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %5024 = torch.aten.mul.Tensor %5022, %5023 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // f16 -> f16 conversions: input and result element types are identical, so these
    // casts leave the data unchanged (presumably a `.to(v)` in the source model).
    %int5_6285 = torch.constant.int 5
    %5025 = torch.prims.convert_element_type %5014, %int5_6285 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int5_6286 = torch.constant.int 5
    %5026 = torch.prims.convert_element_type %5024, %int5_6286 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Concatenate the text-side (512) and image-side (4096) tensors along dim 2, text
    // first, giving joint [1,24,4608,128] query (%5028), key (%5030) and value (%5032).
    // Joint query: normalized text q followed by normalized image q.
    %5027 = torch.prim.ListConstruct %5025, %4971 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_6287 = torch.constant.int 2
    %5028 = torch.aten.cat %5027, %int2_6287 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Joint key: normalized text k followed by normalized image k.
    %5029 = torch.prim.ListConstruct %5026, %4972 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_6288 = torch.constant.int 2
    %5030 = torch.aten.cat %5029, %int2_6288 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Joint value: un-normalized text v followed by un-normalized image v.
    %5031 = torch.prim.ListConstruct %5004, %4950 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_6289 = torch.constant.int 2
    %5032 = torch.aten.cat %5031, %int2_6289 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Apply the [1,1,4608,64,2,2] f32 table %211 (defined before this excerpt;
    // presumably the rotary position-embedding matrices -- TODO confirm) to the joint
    // query %5028 and key %5030. Each 128-wide head vector is viewed as 64 pairs
    // [..., 64, 1, 2]; for each pair the output is
    //   %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]
    // computed in f32, then flattened back to 128 and cast to f16.
    // Results: %5055 (query) and %5058 (key).
    // View the query as f32 pairs: [1,24,4608,64,1,2] (the -1 resolves to 64).
    %int6_6290 = torch.constant.int 6
    %5033 = torch.prims.convert_element_type %5028, %int6_6290 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_6291 = torch.constant.int 1
    %int24_6292 = torch.constant.int 24
    %int4608_6293 = torch.constant.int 4608
    %int-1_6294 = torch.constant.int -1
    %int1_6295 = torch.constant.int 1
    %int2_6296 = torch.constant.int 2
    %5034 = torch.prim.ListConstruct %int1_6291, %int24_6292, %int4608_6293, %int-1_6294, %int1_6295, %int2_6296 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5035 = torch.aten.view %5033, %5034 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // View the key as f32 pairs in the same layout.
    %int6_6297 = torch.constant.int 6
    %5036 = torch.prims.convert_element_type %5030, %int6_6297 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_6298 = torch.constant.int 1
    %int24_6299 = torch.constant.int 24
    %int4608_6300 = torch.constant.int 4608
    %int-1_6301 = torch.constant.int -1
    %int1_6302 = torch.constant.int 1
    %int2_6303 = torch.constant.int 2
    %5037 = torch.prim.ListConstruct %int1_6298, %int24_6299, %int4608_6300, %int-1_6301, %int1_6302, %int2_6303 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5038 = torch.aten.view %5036, %5037 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Query, first term: %211[..., 0] * q[..., 0] (broadcast over the 24 heads and the pair dim).
    %int5_6304 = torch.constant.int 5
    %int0_6305 = torch.constant.int 0
    %5039 = torch.aten.select.int %211, %int5_6304, %int0_6305 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_6306 = torch.constant.int 5
    %int0_6307 = torch.constant.int 0
    %5040 = torch.aten.select.int %5035, %int5_6306, %int0_6307 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %5041 = torch.aten.mul.Tensor %5039, %5040 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Query, second term: %211[..., 1] * q[..., 1].
    %int5_6308 = torch.constant.int 5
    %int1_6309 = torch.constant.int 1
    %5042 = torch.aten.select.int %211, %int5_6308, %int1_6309 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_6310 = torch.constant.int 5
    %int1_6311 = torch.constant.int 1
    %5043 = torch.aten.select.int %5035, %int5_6310, %int1_6311 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %5044 = torch.aten.mul.Tensor %5042, %5043 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two query terms.
    %int1_6312 = torch.constant.int 1
    %5045 = torch.aten.add.Tensor %5041, %5044, %int1_6312 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key, first term: %211[..., 0] * k[..., 0].
    %int5_6313 = torch.constant.int 5
    %int0_6314 = torch.constant.int 0
    %5046 = torch.aten.select.int %211, %int5_6313, %int0_6314 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_6315 = torch.constant.int 5
    %int0_6316 = torch.constant.int 0
    %5047 = torch.aten.select.int %5038, %int5_6315, %int0_6316 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %5048 = torch.aten.mul.Tensor %5046, %5047 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key, second term: %211[..., 1] * k[..., 1].
    %int5_6317 = torch.constant.int 5
    %int1_6318 = torch.constant.int 1
    %5049 = torch.aten.select.int %211, %int5_6317, %int1_6318 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_6319 = torch.constant.int 5
    %int1_6320 = torch.constant.int 1
    %5050 = torch.aten.select.int %5038, %int5_6319, %int1_6320 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %5051 = torch.aten.mul.Tensor %5049, %5050 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two key terms.
    %int1_6321 = torch.constant.int 1
    %5052 = torch.aten.add.Tensor %5048, %5051, %int1_6321 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Flatten the query pairs back to 128 features and cast to f16 (dtype code 5).
    %int1_6322 = torch.constant.int 1
    %int24_6323 = torch.constant.int 24
    %int4608_6324 = torch.constant.int 4608
    %int128_6325 = torch.constant.int 128
    %5053 = torch.prim.ListConstruct %int1_6322, %int24_6323, %int4608_6324, %int128_6325 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5054 = torch.aten.view %5045, %5053 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_6326 = torch.constant.int 5
    %5055 = torch.prims.convert_element_type %5054, %int5_6326 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Flatten the key pairs back to 128 features and cast to f16.
    %int1_6327 = torch.constant.int 1
    %int24_6328 = torch.constant.int 24
    %int4608_6329 = torch.constant.int 4608
    %int128_6330 = torch.constant.int 128
    %5056 = torch.prim.ListConstruct %int1_6327, %int24_6328, %int4608_6329, %int128_6330 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5057 = torch.aten.view %5052, %5056 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_6331 = torch.constant.int 5
    %5058 = torch.prims.convert_element_type %5057, %int5_6331 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled dot-product attention over three [1,24,4608,128] f16 operands: %5055, %5058 and
    // %5032 (defined above this excerpt). The remaining operands are 0.0, false, none, none.
    // NOTE(review): assuming the aten schema order (query, key, value, dropout_p, is_causal,
    // attn_mask, scale), this is non-causal attention without dropout, mask or explicit scale —
    // confirm against the op schema.
    %float0.000000e00_6332 = torch.constant.float 0.000000e+00
    %false_6333 = torch.constant.bool false
    %none_6334 = torch.constant.none
    %none_6335 = torch.constant.none
    // Result #0 is the [1,24,4608,128] f16 output; result #1 ([1,24,4608] f32) is not used in this excerpt.
    %5059:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%5055, %5058, %5032, %float0.000000e00_6332, %false_6333, %none_6334, %none_6335) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2 ([1,24,4608,128] -> [1,4608,24,128]) so the 24 and 128 axes are adjacent ...
    %int0_6336 = torch.constant.int 0
    %int2_6337 = torch.constant.int 2
    %int1_6338 = torch.constant.int 1
    %int3_6339 = torch.constant.int 3
    %5060 = torch.prim.ListConstruct %int0_6336, %int2_6337, %int1_6338, %int3_6339 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5061 = torch.aten.permute %5059#0, %5060 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // ... then merge them: 24 * 128 = 3072.
    %int1_6340 = torch.constant.int 1
    %int4608_6341 = torch.constant.int 4608
    %int3072_6342 = torch.constant.int 3072
    %5062 = torch.prim.ListConstruct %int1_6340, %int4608_6341, %int3072_6342 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5063 = torch.aten.view %5061, %5062 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Full-range slice along dim 0 (start 0, end INT64_MAX, step 1); the shape is unchanged.
    %int0_6343 = torch.constant.int 0
    %int0_6344 = torch.constant.int 0
    %int9223372036854775807_6345 = torch.constant.int 9223372036854775807
    %int1_6346 = torch.constant.int 1
    %5064 = torch.aten.slice.Tensor %5063, %int0_6343, %int0_6344, %int9223372036854775807_6345, %int1_6346 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Rows [0, 512) along dim 1 -> %5065. Later in the file %5065 is the input to the
    // double_blocks.15.txt_attn.proj linear layer.
    %int1_6347 = torch.constant.int 1
    %int0_6348 = torch.constant.int 0
    %int512_6349 = torch.constant.int 512
    %int1_6350 = torch.constant.int 1
    %5065 = torch.aten.slice.Tensor %5064, %int1_6347, %int0_6348, %int512_6349, %int1_6350 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Second full-range slice of %5063 along dim 0 (shape unchanged), feeding the complementary row range.
    %int0_6351 = torch.constant.int 0
    %int0_6352 = torch.constant.int 0
    %int9223372036854775807_6353 = torch.constant.int 9223372036854775807
    %int1_6354 = torch.constant.int 1
    %5066 = torch.aten.slice.Tensor %5063, %int0_6351, %int0_6352, %int9223372036854775807_6353, %int1_6354 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Rows [512, end) along dim 1 -> 4096 rows.
    %int1_6355 = torch.constant.int 1
    %int512_6356 = torch.constant.int 512
    %int9223372036854775807_6357 = torch.constant.int 9223372036854775807
    %int1_6358 = torch.constant.int 1
    %5067 = torch.aten.slice.Tensor %5066, %int1_6355, %int512_6356, %int9223372036854775807_6357, %int1_6358 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Drop the leading size-1 axis so the 4096-row segment can be used as a 2-D matmul operand.
    %int4096_6359 = torch.constant.int 4096
    %int3072_6360 = torch.constant.int 3072
    %5068 = torch.prim.ListConstruct %int4096_6359, %int3072_6360 : (!torch.int, !torch.int) -> !torch.list<int>
    %5069 = torch.aten.view %5067, %5068 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Linear layer with parameters double_blocks.15.img_attn.proj.{weight,bias}, applied to
    // %5069 ([4096,3072] f16): result = %5069 x weight^T + bias, computed in f32 and cast back to f16.
    %__auto.sampler.double_blocks.15.img_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.15.img_attn.proj.weight : tensor<3072x3072xf16>
    %5070 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.img_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    // Transpose the weight so it can be the right-hand operand of mm.
    %int0_6361 = torch.constant.int 0
    %int1_6362 = torch.constant.int 1
    %5071 = torch.aten.transpose.int %5070, %int0_6361, %int1_6362 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.15.img_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.15.img_attn.proj.bias : tensor<3072xf16>
    %5072 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.img_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, input and weight to f32 (dtype code 6, per the result types) before the matmul.
    %int6_6363 = torch.constant.int 6
    %5073 = torch.prims.convert_element_type %5072, %int6_6363 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_6364 = torch.constant.int 6
    %5074 = torch.prims.convert_element_type %5069, %int6_6364 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_6365 = torch.constant.int 6
    %5075 = torch.prims.convert_element_type %5071, %int6_6365 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %5076 = torch.aten.mm %5074, %5075 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    // NOTE(review): presumably the alpha/beta factors of a decomposed addmm — confirm with the exporter.
    %int1_6366 = torch.constant.int 1
    %5077 = torch.aten.mul.Scalar %5076, %int1_6366 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_6367 = torch.constant.int 1
    %5078 = torch.aten.mul.Scalar %5073, %int1_6367 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Add the bias, broadcast over the 4096 rows.
    %int1_6368 = torch.constant.int 1
    %5079 = torch.aten.add.Tensor %5077, %5078, %int1_6368 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    // Cast back to f16 (dtype code 5) and restore the leading size-1 axis.
    %int5_6369 = torch.constant.int 5
    %5080 = torch.prims.convert_element_type %5079, %int5_6369 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_6370 = torch.constant.int 1
    %int4096_6371 = torch.constant.int 4096
    %int3072_6372 = torch.constant.int 3072
    %5081 = torch.prim.ListConstruct %int1_6370, %int4096_6371, %int3072_6372 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5082 = torch.aten.view %5080, %5081 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Scale the projection by %4894 ([1,1,3072], broadcast over rows) and add it to %4816:
    //   %5084 = %4816 + %4894 * %5082
    // NOTE(review): %4894 and %4816 are defined above this excerpt; presumably a modulation gate
    // and the incoming residual stream — confirm.
    %5083 = torch.aten.mul.Tensor %4894, %5082 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_6373 = torch.constant.int 1
    %5084 = torch.aten.add.Tensor %4816, %5083, %int1_6373 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Normalize %5084 over its last axis (no learned affine visible here), then apply a
    // scale and shift:
    //   %5094 = (1 + %4896) * normalize(%5084) + %4895
    // NOTE(review): %4896 and %4895 are [1,1,3072] tensors defined above this excerpt;
    // presumably modulation scale and shift — confirm.
    %int1_6374 = torch.constant.int 1
    %int1_6375 = torch.constant.int 1
    // %5085 = %4896 + 1 (the second integer is add.Scalar's alpha multiplier).
    %5085 = torch.aten.add.Scalar %4896, %int1_6374, %int1_6375 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Statistics are computed in f32 (dtype code 6).
    %int6_6376 = torch.constant.int 6
    %5086 = torch.prims.convert_element_type %5084, %int6_6376 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // Variance (result0) and mean (result1) over dim 2, correction = 0, keepdim = true.
    %int2_6377 = torch.constant.int 2
    %5087 = torch.prim.ListConstruct %int2_6377 : (!torch.int) -> !torch.list<int>
    %int0_6378 = torch.constant.int 0
    %true_6379 = torch.constant.bool true
    %result0_6380, %result1_6381 = torch.aten.var_mean.correction %5086, %5087, %int0_6378, %true_6379 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(var + eps) with eps ~= 1e-6.
    %float9.999990e-07_6382 = torch.constant.float 9.9999999999999995E-7
    %int1_6383 = torch.constant.int 1
    %5088 = torch.aten.add.Scalar %result0_6380, %float9.999990e-07_6382, %int1_6383 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %5089 = torch.aten.rsqrt %5088 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // (x - mean) * rsqrt(var + eps). The subtraction takes the f16 %5084 and the f32 mean and yields f32.
    %int1_6384 = torch.constant.int 1
    %5090 = torch.aten.sub.Tensor %5084, %result1_6381, %int1_6384 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %5091 = torch.aten.mul.Tensor %5090, %5089 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Back to f16 (dtype code 5) before the scale/shift.
    %int5_6385 = torch.constant.int 5
    %5092 = torch.prims.convert_element_type %5091, %int5_6385 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %5093 = torch.aten.mul.Tensor %5085, %5092 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_6386 = torch.constant.int 1
    %5094 = torch.aten.add.Tensor %5093, %4895, %int1_6386 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer double_blocks.15.img_mlp.0 (3072 -> 12288 features) followed by GELU with
    // the "tanh" approximation. Input is %5094 flattened to [4096,3072]; the matmul runs in f32.
    %int4096_6387 = torch.constant.int 4096
    %int3072_6388 = torch.constant.int 3072
    %5095 = torch.prim.ListConstruct %int4096_6387, %int3072_6388 : (!torch.int, !torch.int) -> !torch.list<int>
    %5096 = torch.aten.view %5094, %5095 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.15.img_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.15.img_mlp.0.weight : tensor<12288x3072xf16>
    %5097 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.img_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    // Transpose the weight to [3072,12288] so it can be the right-hand operand of mm.
    %int0_6389 = torch.constant.int 0
    %int1_6390 = torch.constant.int 1
    %5098 = torch.aten.transpose.int %5097, %int0_6389, %int1_6390 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.15.img_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.15.img_mlp.0.bias : tensor<12288xf16>
    %5099 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.img_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Upcast bias, input and weight to f32 (dtype code 6).
    %int6_6391 = torch.constant.int 6
    %5100 = torch.prims.convert_element_type %5099, %int6_6391 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_6392 = torch.constant.int 6
    %5101 = torch.prims.convert_element_type %5096, %int6_6392 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_6393 = torch.constant.int 6
    %5102 = torch.prims.convert_element_type %5098, %int6_6393 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %5103 = torch.aten.mm %5101, %5102 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[4096,12288],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    %int1_6394 = torch.constant.int 1
    %5104 = torch.aten.mul.Scalar %5103, %int1_6394 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int1_6395 = torch.constant.int 1
    %5105 = torch.aten.mul.Scalar %5100, %int1_6395 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    // Add the bias, broadcast over rows, then cast back to f16 (dtype code 5).
    %int1_6396 = torch.constant.int 1
    %5106 = torch.aten.add.Tensor %5104, %5105, %int1_6396 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int5_6397 = torch.constant.int 5
    %5107 = torch.prims.convert_element_type %5106, %int5_6397 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f16>
    %int1_6398 = torch.constant.int 1
    %int4096_6399 = torch.constant.int 4096
    %int12288_6400 = torch.constant.int 12288
    %5108 = torch.prim.ListConstruct %int1_6398, %int4096_6399, %int12288_6400 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5109 = torch.aten.view %5107, %5108 : !torch.vtensor<[4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,12288],f16>
    // GELU activation in f16 with approximate = "tanh".
    %str_6401 = torch.constant.str "tanh"
    %5110 = torch.aten.gelu %5109, %str_6401 : !torch.vtensor<[1,4096,12288],f16>, !torch.str -> !torch.vtensor<[1,4096,12288],f16>
    // Flatten to 2-D again for the next matmul.
    %int4096_6402 = torch.constant.int 4096
    %int12288_6403 = torch.constant.int 12288
    %5111 = torch.prim.ListConstruct %int4096_6402, %int12288_6403 : (!torch.int, !torch.int) -> !torch.list<int>
    %5112 = torch.aten.view %5110, %5111 : !torch.vtensor<[1,4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[4096,12288],f16>
    // Linear layer double_blocks.15.img_mlp.2 (12288 -> 3072 features) applied to %5112,
    // computed in f32 and cast back to f16, then scaled and added back onto %5084.
    %__auto.sampler.double_blocks.15.img_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.15.img_mlp.2.weight : tensor<3072x12288xf16>
    %5113 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.img_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    // Transpose the weight to [12288,3072] so it can be the right-hand operand of mm.
    %int0_6404 = torch.constant.int 0
    %int1_6405 = torch.constant.int 1
    %5114 = torch.aten.transpose.int %5113, %int0_6404, %int1_6405 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.15.img_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.15.img_mlp.2.bias : tensor<3072xf16>
    %5115 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.img_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, input and weight to f32 (dtype code 6).
    %int6_6406 = torch.constant.int 6
    %5116 = torch.prims.convert_element_type %5115, %int6_6406 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_6407 = torch.constant.int 6
    %5117 = torch.prims.convert_element_type %5112, %int6_6407 : !torch.vtensor<[4096,12288],f16>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int6_6408 = torch.constant.int 6
    %5118 = torch.prims.convert_element_type %5114, %int6_6408 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %5119 = torch.aten.mm %5117, %5118 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    %int1_6409 = torch.constant.int 1
    %5120 = torch.aten.mul.Scalar %5119, %int1_6409 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_6410 = torch.constant.int 1
    %5121 = torch.aten.mul.Scalar %5116, %int1_6410 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Add the bias, broadcast over rows, then cast back to f16 (dtype code 5).
    %int1_6411 = torch.constant.int 1
    %5122 = torch.aten.add.Tensor %5120, %5121, %int1_6411 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_6412 = torch.constant.int 5
    %5123 = torch.prims.convert_element_type %5122, %int5_6412 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_6413 = torch.constant.int 1
    %int4096_6414 = torch.constant.int 4096
    %int3072_6415 = torch.constant.int 3072
    %5124 = torch.prim.ListConstruct %int1_6413, %int4096_6414, %int3072_6415 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5125 = torch.aten.view %5123, %5124 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // %5127 = %5084 + %4897 * %5125, with %4897 ([1,1,3072]) broadcast over the 4096 rows.
    // NOTE(review): %4897 is defined above this excerpt; presumably a modulation gate — confirm.
    %5126 = torch.aten.mul.Tensor %4897, %5125 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_6416 = torch.constant.int 1
    %5127 = torch.aten.add.Tensor %5084, %5126, %int1_6416 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer with parameters double_blocks.15.txt_attn.proj.{weight,bias}, applied to the
    // 512-row segment %5065 of the attention output: result = x x weight^T + bias, computed in
    // f32 and cast back to f16, then scaled and added onto %4876.
    %int512_6417 = torch.constant.int 512
    %int3072_6418 = torch.constant.int 3072
    %5128 = torch.prim.ListConstruct %int512_6417, %int3072_6418 : (!torch.int, !torch.int) -> !torch.list<int>
    // Drop the leading size-1 axis so the segment can be used as a 2-D matmul operand.
    %5129 = torch.aten.view %5065, %5128 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.15.txt_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.15.txt_attn.proj.weight : tensor<3072x3072xf16>
    %5130 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.txt_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    // Transpose the weight so it can be the right-hand operand of mm.
    %int0_6419 = torch.constant.int 0
    %int1_6420 = torch.constant.int 1
    %5131 = torch.aten.transpose.int %5130, %int0_6419, %int1_6420 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.15.txt_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.15.txt_attn.proj.bias : tensor<3072xf16>
    %5132 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.txt_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, input and weight to f32 (dtype code 6).
    %int6_6421 = torch.constant.int 6
    %5133 = torch.prims.convert_element_type %5132, %int6_6421 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_6422 = torch.constant.int 6
    %5134 = torch.prims.convert_element_type %5129, %int6_6422 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_6423 = torch.constant.int 6
    %5135 = torch.prims.convert_element_type %5131, %int6_6423 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %5136 = torch.aten.mm %5134, %5135 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    %int1_6424 = torch.constant.int 1
    %5137 = torch.aten.mul.Scalar %5136, %int1_6424 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_6425 = torch.constant.int 1
    %5138 = torch.aten.mul.Scalar %5133, %int1_6425 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Add the bias, broadcast over rows, then cast back to f16 (dtype code 5).
    %int1_6426 = torch.constant.int 1
    %5139 = torch.aten.add.Tensor %5137, %5138, %int1_6426 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_6427 = torch.constant.int 5
    %5140 = torch.prims.convert_element_type %5139, %int5_6427 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_6428 = torch.constant.int 1
    %int512_6429 = torch.constant.int 512
    %int3072_6430 = torch.constant.int 3072
    %5141 = torch.prim.ListConstruct %int1_6428, %int512_6429, %int3072_6430 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5142 = torch.aten.view %5140, %5141 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // %5144 = %4876 + %4915 * %5142, with %4915 ([1,1,3072]) broadcast over the 512 rows.
    // NOTE(review): %4915 and %4876 are defined above this excerpt; presumably a modulation gate
    // and the incoming residual stream — confirm.
    %5143 = torch.aten.mul.Tensor %4915, %5142 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_6431 = torch.constant.int 1
    %5144 = torch.aten.add.Tensor %4876, %5143, %int1_6431 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Normalize %5144 over its last axis (no learned affine visible here), then apply a
    // scale and shift:
    //   %5154 = (1 + %4917) * normalize(%5144) + %4916
    // NOTE(review): %4917 and %4916 are [1,1,3072] tensors defined above this excerpt;
    // presumably modulation scale and shift — confirm.
    %int1_6432 = torch.constant.int 1
    %int1_6433 = torch.constant.int 1
    // %5145 = %4917 + 1 (the second integer is add.Scalar's alpha multiplier).
    %5145 = torch.aten.add.Scalar %4917, %int1_6432, %int1_6433 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Statistics are computed in f32 (dtype code 6).
    %int6_6434 = torch.constant.int 6
    %5146 = torch.prims.convert_element_type %5144, %int6_6434 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // Variance (result0) and mean (result1) over dim 2, correction = 0, keepdim = true.
    %int2_6435 = torch.constant.int 2
    %5147 = torch.prim.ListConstruct %int2_6435 : (!torch.int) -> !torch.list<int>
    %int0_6436 = torch.constant.int 0
    %true_6437 = torch.constant.bool true
    %result0_6438, %result1_6439 = torch.aten.var_mean.correction %5146, %5147, %int0_6436, %true_6437 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(var + eps) with eps ~= 1e-6.
    %float9.999990e-07_6440 = torch.constant.float 9.9999999999999995E-7
    %int1_6441 = torch.constant.int 1
    %5148 = torch.aten.add.Scalar %result0_6438, %float9.999990e-07_6440, %int1_6441 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %5149 = torch.aten.rsqrt %5148 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // (x - mean) * rsqrt(var + eps). The subtraction takes the f16 %5144 and the f32 mean and yields f32.
    %int1_6442 = torch.constant.int 1
    %5150 = torch.aten.sub.Tensor %5144, %result1_6439, %int1_6442 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %5151 = torch.aten.mul.Tensor %5150, %5149 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    // Back to f16 (dtype code 5) before the scale/shift.
    %int5_6443 = torch.constant.int 5
    %5152 = torch.prims.convert_element_type %5151, %int5_6443 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %5153 = torch.aten.mul.Tensor %5145, %5152 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_6444 = torch.constant.int 1
    %5154 = torch.aten.add.Tensor %5153, %4916, %int1_6444 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Linear layer double_blocks.15.txt_mlp.0 (3072 -> 12288 features) followed by GELU with
    // the "tanh" approximation. Input is %5154 flattened to [512,3072]; the matmul runs in f32.
    %int512_6445 = torch.constant.int 512
    %int3072_6446 = torch.constant.int 3072
    %5155 = torch.prim.ListConstruct %int512_6445, %int3072_6446 : (!torch.int, !torch.int) -> !torch.list<int>
    %5156 = torch.aten.view %5154, %5155 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.15.txt_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.15.txt_mlp.0.weight : tensor<12288x3072xf16>
    %5157 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.txt_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    // Transpose the weight to [3072,12288] so it can be the right-hand operand of mm.
    %int0_6447 = torch.constant.int 0
    %int1_6448 = torch.constant.int 1
    %5158 = torch.aten.transpose.int %5157, %int0_6447, %int1_6448 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.15.txt_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.15.txt_mlp.0.bias : tensor<12288xf16>
    %5159 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.txt_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Upcast bias, input and weight to f32 (dtype code 6).
    %int6_6449 = torch.constant.int 6
    %5160 = torch.prims.convert_element_type %5159, %int6_6449 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_6450 = torch.constant.int 6
    %5161 = torch.prims.convert_element_type %5156, %int6_6450 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_6451 = torch.constant.int 6
    %5162 = torch.prims.convert_element_type %5158, %int6_6451 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %5163 = torch.aten.mm %5161, %5162 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[512,12288],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    %int1_6452 = torch.constant.int 1
    %5164 = torch.aten.mul.Scalar %5163, %int1_6452 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int1_6453 = torch.constant.int 1
    %5165 = torch.aten.mul.Scalar %5160, %int1_6453 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    // Add the bias, broadcast over rows, then cast back to f16 (dtype code 5).
    %int1_6454 = torch.constant.int 1
    %5166 = torch.aten.add.Tensor %5164, %5165, %int1_6454 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int5_6455 = torch.constant.int 5
    %5167 = torch.prims.convert_element_type %5166, %int5_6455 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f16>
    %int1_6456 = torch.constant.int 1
    %int512_6457 = torch.constant.int 512
    %int12288_6458 = torch.constant.int 12288
    %5168 = torch.prim.ListConstruct %int1_6456, %int512_6457, %int12288_6458 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5169 = torch.aten.view %5167, %5168 : !torch.vtensor<[512,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,512,12288],f16>
    // GELU activation in f16 with approximate = "tanh".
    %str_6459 = torch.constant.str "tanh"
    %5170 = torch.aten.gelu %5169, %str_6459 : !torch.vtensor<[1,512,12288],f16>, !torch.str -> !torch.vtensor<[1,512,12288],f16>
    // Flatten to 2-D again for the next matmul.
    %int512_6460 = torch.constant.int 512
    %int12288_6461 = torch.constant.int 12288
    %5171 = torch.prim.ListConstruct %int512_6460, %int12288_6461 : (!torch.int, !torch.int) -> !torch.list<int>
    %5172 = torch.aten.view %5170, %5171 : !torch.vtensor<[1,512,12288],f16>, !torch.list<int> -> !torch.vtensor<[512,12288],f16>
    // Linear layer double_blocks.15.txt_mlp.2 (12288 -> 3072 features) applied to %5172,
    // computed in f32 and cast back to f16, then scaled and added back onto %5144.
    %__auto.sampler.double_blocks.15.txt_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.15.txt_mlp.2.weight : tensor<3072x12288xf16>
    %5173 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.txt_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    // Transpose the weight to [12288,3072] so it can be the right-hand operand of mm.
    %int0_6462 = torch.constant.int 0
    %int1_6463 = torch.constant.int 1
    %5174 = torch.aten.transpose.int %5173, %int0_6462, %int1_6463 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.15.txt_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.15.txt_mlp.2.bias : tensor<3072xf16>
    %5175 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.15.txt_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, input and weight to f32 (dtype code 6).
    %int6_6464 = torch.constant.int 6
    %5176 = torch.prims.convert_element_type %5175, %int6_6464 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_6465 = torch.constant.int 6
    %5177 = torch.prims.convert_element_type %5172, %int6_6465 : !torch.vtensor<[512,12288],f16>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int6_6466 = torch.constant.int 6
    %5178 = torch.prims.convert_element_type %5174, %int6_6466 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %5179 = torch.aten.mm %5177, %5178 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    %int1_6467 = torch.constant.int 1
    %5180 = torch.aten.mul.Scalar %5179, %int1_6467 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_6468 = torch.constant.int 1
    %5181 = torch.aten.mul.Scalar %5176, %int1_6468 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Add the bias, broadcast over rows, then cast back to f16 (dtype code 5).
    %int1_6469 = torch.constant.int 1
    %5182 = torch.aten.add.Tensor %5180, %5181, %int1_6469 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_6470 = torch.constant.int 5
    %5183 = torch.prims.convert_element_type %5182, %int5_6470 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_6471 = torch.constant.int 1
    %int512_6472 = torch.constant.int 512
    %int3072_6473 = torch.constant.int 3072
    %5184 = torch.prim.ListConstruct %int1_6471, %int512_6472, %int3072_6473 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5185 = torch.aten.view %5183, %5184 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // %5187 = %5144 + %4918 * %5185, with %4918 ([1,1,3072]) broadcast over the 512 rows.
    // NOTE(review): %4918 is defined above this excerpt; presumably a modulation gate — confirm.
    %5186 = torch.aten.mul.Tensor %4918, %5185 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_6474 = torch.constant.int 1
    %5187 = torch.aten.add.Tensor %5144, %5186, %int1_6474 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // SiLU of %119 ([1,3072] f16, defined above this excerpt) followed by the linear layer
    // double_blocks.16.img_mod.lin (3072 -> 18432 features). The 18432-wide result is then cut
    // into six consecutive 3072-wide pieces, %5203..%5208.
    // NOTE(review): %119 is presumably the conditioning vector, and the six pieces presumably two
    // (shift, scale, gate) triples; their consumers are outside this excerpt — confirm.
    %5188 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.double_blocks.16.img_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.16.img_mod.lin.weight : tensor<18432x3072xf16>
    %5189 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.img_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    // Transpose the weight to [3072,18432] so it can be the right-hand operand of mm.
    %int0_6475 = torch.constant.int 0
    %int1_6476 = torch.constant.int 1
    %5190 = torch.aten.transpose.int %5189, %int0_6475, %int1_6476 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.16.img_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.16.img_mod.lin.bias : tensor<18432xf16>
    %5191 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.img_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Upcast bias, input and weight to f32 (dtype code 6).
    %int6_6477 = torch.constant.int 6
    %5192 = torch.prims.convert_element_type %5191, %int6_6477 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_6478 = torch.constant.int 6
    %5193 = torch.prims.convert_element_type %5188, %int6_6478 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_6479 = torch.constant.int 6
    %5194 = torch.prims.convert_element_type %5190, %int6_6479 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %5195 = torch.aten.mm %5193, %5194 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    %int1_6480 = torch.constant.int 1
    %5196 = torch.aten.mul.Scalar %5195, %int1_6480 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_6481 = torch.constant.int 1
    %5197 = torch.aten.mul.Scalar %5192, %int1_6481 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    // Add the bias, then cast back to f16 (dtype code 5).
    %int1_6482 = torch.constant.int 1
    %5198 = torch.aten.add.Tensor %5196, %5197, %int1_6482 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_6483 = torch.constant.int 5
    %5199 = torch.prims.convert_element_type %5198, %int5_6483 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice along dim 0 (shape unchanged), then insert a size-1 axis at dim 1:
    // [1,18432] -> [1,1,18432].
    %int0_6484 = torch.constant.int 0
    %int0_6485 = torch.constant.int 0
    %int9223372036854775807_6486 = torch.constant.int 9223372036854775807
    %int1_6487 = torch.constant.int 1
    %5200 = torch.aten.slice.Tensor %5199, %int0_6484, %int0_6485, %int9223372036854775807_6486, %int1_6487 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    %int1_6488 = torch.constant.int 1
    %5201 = torch.aten.unsqueeze %5200, %int1_6488 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Full-range slice along dim 2 (shape unchanged).
    %int2_6489 = torch.constant.int 2
    %int0_6490 = torch.constant.int 0
    %int9223372036854775807_6491 = torch.constant.int 9223372036854775807
    %int1_6492 = torch.constant.int 1
    %5202 = torch.aten.slice.Tensor %5201, %int2_6489, %int0_6490, %int9223372036854775807_6491, %int1_6492 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Piece 1 of 6: columns [0, 3072) along the last axis (dim -1).
    %int-1_6493 = torch.constant.int -1
    %int0_6494 = torch.constant.int 0
    %int3072_6495 = torch.constant.int 3072
    %int1_6496 = torch.constant.int 1
    %5203 = torch.aten.slice.Tensor %5202, %int-1_6493, %int0_6494, %int3072_6495, %int1_6496 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Piece 2 of 6: columns [3072, 6144).
    %int-1_6497 = torch.constant.int -1
    %int3072_6498 = torch.constant.int 3072
    %int6144_6499 = torch.constant.int 6144
    %int1_6500 = torch.constant.int 1
    %5204 = torch.aten.slice.Tensor %5202, %int-1_6497, %int3072_6498, %int6144_6499, %int1_6500 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Piece 3 of 6: columns [6144, 9216).
    %int-1_6501 = torch.constant.int -1
    %int6144_6502 = torch.constant.int 6144
    %int9216_6503 = torch.constant.int 9216
    %int1_6504 = torch.constant.int 1
    %5205 = torch.aten.slice.Tensor %5202, %int-1_6501, %int6144_6502, %int9216_6503, %int1_6504 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Piece 4 of 6: columns [9216, 12288).
    %int-1_6505 = torch.constant.int -1
    %int9216_6506 = torch.constant.int 9216
    %int12288_6507 = torch.constant.int 12288
    %int1_6508 = torch.constant.int 1
    %5206 = torch.aten.slice.Tensor %5202, %int-1_6505, %int9216_6506, %int12288_6507, %int1_6508 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Piece 5 of 6: columns [12288, 15360).
    %int-1_6509 = torch.constant.int -1
    %int12288_6510 = torch.constant.int 12288
    %int15360_6511 = torch.constant.int 15360
    %int1_6512 = torch.constant.int 1
    %5207 = torch.aten.slice.Tensor %5202, %int-1_6509, %int12288_6510, %int15360_6511, %int1_6512 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Piece 6 of 6: columns [15360, 18432).
    %int-1_6513 = torch.constant.int -1
    %int15360_6514 = torch.constant.int 15360
    %int18432_6515 = torch.constant.int 18432
    %int1_6516 = torch.constant.int 1
    %5208 = torch.aten.slice.Tensor %5202, %int-1_6513, %int15360_6514, %int18432_6515, %int1_6516 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // double_blocks.16.txt_mod: SiLU on %119 ([1,3072] f16, defined outside this
    // excerpt), then a linear layer 3072 -> 18432 using the txt_mod.lin
    // weight/bias, then a split into six [1,1,3072] chunks (%5224..%5229).
    %5209 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the weight ([18432,3072] f16) and transpose it to [3072,18432] so it
    // can be the right-hand operand of the matmul.
    %__auto.sampler.double_blocks.16.txt_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.16.txt_mod.lin.weight : tensor<18432x3072xf16>
    %5210 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.txt_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_6517 = torch.constant.int 0
    %int1_6518 = torch.constant.int 1
    %5211 = torch.aten.transpose.int %5210, %int0_6517, %int1_6518 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.16.txt_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.16.txt_mod.lin.bias : tensor<18432xf16>
    %5212 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.txt_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Bias, activation and weight are converted to f32 (dtype code 6 here yields
    // f32 results) so the matmul and bias add are carried out in f32.
    %int6_6519 = torch.constant.int 6
    %5213 = torch.prims.convert_element_type %5212, %int6_6519 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_6520 = torch.constant.int 6
    %5214 = torch.prims.convert_element_type %5209, %int6_6520 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_6521 = torch.constant.int 6
    %5215 = torch.prims.convert_element_type %5211, %int6_6521 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %5216 = torch.aten.mm %5214, %5215 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Both operands are multiplied by the integer 1 before being added with
    // alpha = 1. NOTE(review): presumably left over from a decomposed addmm with
    // alpha = beta = 1 - confirm against the exporter.
    %int1_6522 = torch.constant.int 1
    %5217 = torch.aten.mul.Scalar %5216, %int1_6522 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_6523 = torch.constant.int 1
    %5218 = torch.aten.mul.Scalar %5213, %int1_6523 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_6524 = torch.constant.int 1
    %5219 = torch.aten.add.Tensor %5217, %5218, %int1_6524 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    // Convert the linear output back to f16 (dtype code 5 here yields f16).
    %int5_6525 = torch.constant.int 5
    %5220 = torch.prims.convert_element_type %5219, %int5_6525 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (shape unchanged), then insert a unit dim at
    // position 1, then a full-range slice on dim 2: [1,18432] -> [1,1,18432].
    %int0_6526 = torch.constant.int 0
    %int0_6527 = torch.constant.int 0
    %int9223372036854775807_6528 = torch.constant.int 9223372036854775807
    %int1_6529 = torch.constant.int 1
    %5221 = torch.aten.slice.Tensor %5220, %int0_6526, %int0_6527, %int9223372036854775807_6528, %int1_6529 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    %int1_6530 = torch.constant.int 1
    %5222 = torch.aten.unsqueeze %5221, %int1_6530 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    %int2_6531 = torch.constant.int 2
    %int0_6532 = torch.constant.int 0
    %int9223372036854775807_6533 = torch.constant.int 9223372036854775807
    %int1_6534 = torch.constant.int 1
    %5223 = torch.aten.slice.Tensor %5222, %int2_6531, %int0_6532, %int9223372036854775807_6533, %int1_6534 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Chunk 0: last-dim range [0, 3072). Used further down as the additive term
    // of the text-stream modulation (see %5293).
    %int-1_6535 = torch.constant.int -1
    %int0_6536 = torch.constant.int 0
    %int3072_6537 = torch.constant.int 3072
    %int1_6538 = torch.constant.int 1
    %5224 = torch.aten.slice.Tensor %5223, %int-1_6535, %int0_6536, %int3072_6537, %int1_6538 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1: last-dim range [3072, 6144). Used further down as the
    // multiplicative term (1 + chunk) of the text-stream modulation (see %5291).
    %int-1_6539 = torch.constant.int -1
    %int3072_6540 = torch.constant.int 3072
    %int6144_6541 = torch.constant.int 6144
    %int1_6542 = torch.constant.int 1
    %5225 = torch.aten.slice.Tensor %5223, %int-1_6539, %int3072_6540, %int6144_6541, %int1_6542 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2: last-dim range [6144, 9216). Not consumed within this excerpt.
    %int-1_6543 = torch.constant.int -1
    %int6144_6544 = torch.constant.int 6144
    %int9216_6545 = torch.constant.int 9216
    %int1_6546 = torch.constant.int 1
    %5226 = torch.aten.slice.Tensor %5223, %int-1_6543, %int6144_6544, %int9216_6545, %int1_6546 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 3: last-dim range [9216, 12288). Not consumed within this excerpt.
    %int-1_6547 = torch.constant.int -1
    %int9216_6548 = torch.constant.int 9216
    %int12288_6549 = torch.constant.int 12288
    %int1_6550 = torch.constant.int 1
    %5227 = torch.aten.slice.Tensor %5223, %int-1_6547, %int9216_6548, %int12288_6549, %int1_6550 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 4: last-dim range [12288, 15360). Not consumed within this excerpt.
    %int-1_6551 = torch.constant.int -1
    %int12288_6552 = torch.constant.int 12288
    %int15360_6553 = torch.constant.int 15360
    %int1_6554 = torch.constant.int 1
    %5228 = torch.aten.slice.Tensor %5223, %int-1_6551, %int12288_6552, %int15360_6553, %int1_6554 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 5: last-dim range [15360, 18432). Not consumed within this excerpt.
    %int-1_6555 = torch.constant.int -1
    %int15360_6556 = torch.constant.int 15360
    %int18432_6557 = torch.constant.int 18432
    %int1_6558 = torch.constant.int 1
    %5229 = torch.aten.slice.Tensor %5223, %int-1_6555, %int15360_6556, %int18432_6557, %int1_6558 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Image stream: normalise %5127 ([1,4096,3072] f16, defined outside this
    // excerpt) over its last dim with no learned affine parameters, then apply
    // (1 + %5204) * x_norm + %5203 and flatten to [4096,3072] for the QKV matmul.
    %int6_6559 = torch.constant.int 6
    %5230 = torch.prims.convert_element_type %5127, %int6_6559 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // Variance (result0) and mean (result1) over dim 2, computed in f32 with
    // correction = 0 and keepdim = true, giving [1,4096,1] each.
    %int2_6560 = torch.constant.int 2
    %5231 = torch.prim.ListConstruct %int2_6560 : (!torch.int) -> !torch.list<int>
    %int0_6561 = torch.constant.int 0
    %true_6562 = torch.constant.bool true
    %result0_6563, %result1_6564 = torch.aten.var_mean.correction %5230, %5231, %int0_6561, %true_6562 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(var + eps) with eps ~= 1e-6.
    %float9.999990e-07_6565 = torch.constant.float 9.9999999999999995E-7
    %int1_6566 = torch.constant.int 1
    %5232 = torch.aten.add.Scalar %result0_6563, %float9.999990e-07_6565, %int1_6566 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %5233 = torch.aten.rsqrt %5232 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // (x - mean) takes the original f16 tensor %5127 and the f32 mean; the
    // declared result type is f32.
    %int1_6567 = torch.constant.int 1
    %5234 = torch.aten.sub.Tensor %5127, %result1_6564, %int1_6567 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %5235 = torch.aten.mul.Tensor %5234, %5233 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Back to f16 for the modulation.
    %int5_6568 = torch.constant.int 5
    %5236 = torch.prims.convert_element_type %5235, %int5_6568 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Modulation: (%5204 + 1) * normalised + %5203; the [1,1,3072] operands
    // broadcast over the 4096 dimension.
    %int1_6569 = torch.constant.int 1
    %int1_6570 = torch.constant.int 1
    %5237 = torch.aten.add.Scalar %5204, %int1_6569, %int1_6570 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %5238 = torch.aten.mul.Tensor %5237, %5236 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_6571 = torch.constant.int 1
    %5239 = torch.aten.add.Tensor %5238, %5203, %int1_6571 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Drop the leading unit dim: [1,4096,3072] -> [4096,3072].
    %int4096_6572 = torch.constant.int 4096
    %int3072_6573 = torch.constant.int 3072
    %5240 = torch.prim.ListConstruct %int4096_6572, %int3072_6573 : (!torch.int, !torch.int) -> !torch.list<int>
    %5241 = torch.aten.view %5239, %5240 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // double_blocks.16.img_attn.qkv: linear layer 3072 -> 9216 on the modulated
    // image tokens (%5241), then a split of the output into three
    // [1,24,4096,128] tensors (%5259, %5260, %5261).
    // Load the weight ([9216,3072] f16) and transpose it to [3072,9216].
    %__auto.sampler.double_blocks.16.img_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.16.img_attn.qkv.weight : tensor<9216x3072xf16>
    %5242 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.img_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_6574 = torch.constant.int 0
    %int1_6575 = torch.constant.int 1
    %5243 = torch.aten.transpose.int %5242, %int0_6574, %int1_6575 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.16.img_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.16.img_attn.qkv.bias : tensor<9216xf16>
    %5244 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.img_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Convert bias, activation and weight to f32; the matmul and bias add are
    // carried out in f32.
    %int6_6576 = torch.constant.int 6
    %5245 = torch.prims.convert_element_type %5244, %int6_6576 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_6577 = torch.constant.int 6
    %5246 = torch.prims.convert_element_type %5241, %int6_6577 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_6578 = torch.constant.int 6
    %5247 = torch.prims.convert_element_type %5243, %int6_6578 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %5248 = torch.aten.mm %5246, %5247 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[4096,9216],f32>
    // Multiplications by the integer 1 followed by an add with alpha = 1.
    // NOTE(review): presumably a decomposed addmm with alpha = beta = 1 - confirm.
    %int1_6579 = torch.constant.int 1
    %5249 = torch.aten.mul.Scalar %5248, %int1_6579 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int1_6580 = torch.constant.int 1
    %5250 = torch.aten.mul.Scalar %5245, %int1_6580 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_6581 = torch.constant.int 1
    %5251 = torch.aten.add.Tensor %5249, %5250, %int1_6581 : !torch.vtensor<[4096,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    // Back to f16.
    %int5_6582 = torch.constant.int 5
    %5252 = torch.prims.convert_element_type %5251, %int5_6582 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f16>
    // Restore the leading unit dim: [4096,9216] -> [1,4096,9216].
    %int1_6583 = torch.constant.int 1
    %int4096_6584 = torch.constant.int 4096
    %int9216_6585 = torch.constant.int 9216
    %5253 = torch.prim.ListConstruct %int1_6583, %int4096_6584, %int9216_6585 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5254 = torch.aten.view %5252, %5253 : !torch.vtensor<[4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,9216],f16>
    // Split the 9216 axis as 3 x 24 x 128: [1,4096,9216] -> [1,4096,3,24,128].
    %int1_6586 = torch.constant.int 1
    %int4096_6587 = torch.constant.int 4096
    %int3_6588 = torch.constant.int 3
    %int24_6589 = torch.constant.int 24
    %int128_6590 = torch.constant.int 128
    %5255 = torch.prim.ListConstruct %int1_6586, %int4096_6587, %int3_6588, %int24_6589, %int128_6590 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5256 = torch.aten.view %5254, %5255 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    // Permute with order (2,0,3,1,4): [1,4096,3,24,128] -> [3,1,24,4096,128].
    %int2_6591 = torch.constant.int 2
    %int0_6592 = torch.constant.int 0
    %int3_6593 = torch.constant.int 3
    %int1_6594 = torch.constant.int 1
    %int4_6595 = torch.constant.int 4
    %5257 = torch.prim.ListConstruct %int2_6591, %int0_6592, %int3_6593, %int1_6594, %int4_6595 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5258 = torch.aten.permute %5256, %5257 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // Index 0 of the leading size-3 dim; normalised below with the query_norm scale.
    %int0_6596 = torch.constant.int 0
    %int0_6597 = torch.constant.int 0
    %5259 = torch.aten.select.int %5258, %int0_6596, %int0_6597 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Index 1 of the leading dim; normalised below with the key_norm scale.
    %int0_6598 = torch.constant.int 0
    %int1_6599 = torch.constant.int 1
    %5260 = torch.aten.select.int %5258, %int0_6598, %int1_6599 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Index 2 of the leading dim; concatenated un-normalised further down (%5343).
    %int0_6600 = torch.constant.int 0
    %int2_6601 = torch.constant.int 2
    %5261 = torch.aten.select.int %5258, %int0_6600, %int2_6601 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // double_blocks.16.img_attn.norm: for each of %5259 and %5260 compute
    // x * rsqrt(mean(x^2, dim=-1) + eps) in f32, convert to f16 and multiply by a
    // learned [128] scale. No mean is subtracted.
    // --- %5259 with query_norm.scale ---
    %int6_6602 = torch.constant.int 6
    %5262 = torch.prims.convert_element_type %5259, %int6_6602 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_6603 = torch.constant.int 2
    %5263 = torch.aten.pow.Tensor_Scalar %5262, %int2_6603 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    // Mean of squares over the last dim, keepdim = true -> [1,24,4096,1].
    %int-1_6604 = torch.constant.int -1
    %5264 = torch.prim.ListConstruct %int-1_6604 : (!torch.int) -> !torch.list<int>
    %true_6605 = torch.constant.bool true
    %none_6606 = torch.constant.none
    %5265 = torch.aten.mean.dim %5263, %5264, %true_6605, %none_6606 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    // rsqrt(mean + eps) with eps ~= 1e-6.
    %float9.999990e-07_6607 = torch.constant.float 9.9999999999999995E-7
    %int1_6608 = torch.constant.int 1
    %5266 = torch.aten.add.Scalar %5265, %float9.999990e-07_6607, %int1_6608 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %5267 = torch.aten.rsqrt %5266 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %5268 = torch.aten.mul.Tensor %5262, %5267 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    %int5_6609 = torch.constant.int 5
    %5269 = torch.prims.convert_element_type %5268, %int5_6609 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Per-channel scale ([128] f16) broadcast over the leading dims.
    %__auto.sampler.double_blocks.16.img_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.16.img_attn.norm.query_norm.scale : tensor<128xf16>
    %5270 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.img_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %5271 = torch.aten.mul.Tensor %5269, %5270 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // --- %5260 with key_norm.scale (same sequence of ops) ---
    %int6_6610 = torch.constant.int 6
    %5272 = torch.prims.convert_element_type %5260, %int6_6610 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_6611 = torch.constant.int 2
    %5273 = torch.aten.pow.Tensor_Scalar %5272, %int2_6611 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_6612 = torch.constant.int -1
    %5274 = torch.prim.ListConstruct %int-1_6612 : (!torch.int) -> !torch.list<int>
    %true_6613 = torch.constant.bool true
    %none_6614 = torch.constant.none
    %5275 = torch.aten.mean.dim %5273, %5274, %true_6613, %none_6614 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_6615 = torch.constant.float 9.9999999999999995E-7
    %int1_6616 = torch.constant.int 1
    %5276 = torch.aten.add.Scalar %5275, %float9.999990e-07_6615, %int1_6616 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %5277 = torch.aten.rsqrt %5276 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %5278 = torch.aten.mul.Tensor %5272, %5277 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    %int5_6617 = torch.constant.int 5
    %5279 = torch.prims.convert_element_type %5278, %int5_6617 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.16.img_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.16.img_attn.norm.key_norm.scale : tensor<128xf16>
    %5280 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.img_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %5281 = torch.aten.mul.Tensor %5279, %5280 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // f16 -> f16 conversions: input and result types are identical, so these do
    // not change the dtype. NOTE(review): presumably a traced `.to(dtype)` - confirm.
    %int5_6618 = torch.constant.int 5
    %5282 = torch.prims.convert_element_type %5271, %int5_6618 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_6619 = torch.constant.int 5
    %5283 = torch.prims.convert_element_type %5281, %int5_6619 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Text stream: normalise %5187 ([1,512,3072] f16, defined outside this
    // excerpt) over its last dim with no learned affine parameters, then apply
    // (1 + %5225) * x_norm + %5224 and flatten to [512,3072] for the QKV matmul.
    %int6_6620 = torch.constant.int 6
    %5284 = torch.prims.convert_element_type %5187, %int6_6620 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // Variance (result0) and mean (result1) over dim 2, computed in f32 with
    // correction = 0 and keepdim = true, giving [1,512,1] each.
    %int2_6621 = torch.constant.int 2
    %5285 = torch.prim.ListConstruct %int2_6621 : (!torch.int) -> !torch.list<int>
    %int0_6622 = torch.constant.int 0
    %true_6623 = torch.constant.bool true
    %result0_6624, %result1_6625 = torch.aten.var_mean.correction %5284, %5285, %int0_6622, %true_6623 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(var + eps) with eps ~= 1e-6.
    %float9.999990e-07_6626 = torch.constant.float 9.9999999999999995E-7
    %int1_6627 = torch.constant.int 1
    %5286 = torch.aten.add.Scalar %result0_6624, %float9.999990e-07_6626, %int1_6627 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %5287 = torch.aten.rsqrt %5286 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // (x - mean) takes the original f16 tensor %5187 and the f32 mean; the
    // declared result type is f32.
    %int1_6628 = torch.constant.int 1
    %5288 = torch.aten.sub.Tensor %5187, %result1_6625, %int1_6628 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %5289 = torch.aten.mul.Tensor %5288, %5287 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    // Back to f16 for the modulation.
    %int5_6629 = torch.constant.int 5
    %5290 = torch.prims.convert_element_type %5289, %int5_6629 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Modulation: (%5225 + 1) * normalised + %5224; the [1,1,3072] operands
    // broadcast over the 512 dimension.
    %int1_6630 = torch.constant.int 1
    %int1_6631 = torch.constant.int 1
    %5291 = torch.aten.add.Scalar %5225, %int1_6630, %int1_6631 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %5292 = torch.aten.mul.Tensor %5291, %5290 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_6632 = torch.constant.int 1
    %5293 = torch.aten.add.Tensor %5292, %5224, %int1_6632 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Drop the leading unit dim: [1,512,3072] -> [512,3072].
    %int512_6633 = torch.constant.int 512
    %int3072_6634 = torch.constant.int 3072
    %5294 = torch.prim.ListConstruct %int512_6633, %int3072_6634 : (!torch.int, !torch.int) -> !torch.list<int>
    %5295 = torch.aten.view %5293, %5294 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // double_blocks.16.txt_attn.qkv: linear layer 3072 -> 9216 on the modulated
    // text tokens (%5295), then a split of the output into three
    // [1,24,512,128] tensors (%5313, %5314, %5315).
    // Load the weight ([9216,3072] f16) and transpose it to [3072,9216].
    %__auto.sampler.double_blocks.16.txt_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.16.txt_attn.qkv.weight : tensor<9216x3072xf16>
    %5296 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.txt_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_6635 = torch.constant.int 0
    %int1_6636 = torch.constant.int 1
    %5297 = torch.aten.transpose.int %5296, %int0_6635, %int1_6636 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.16.txt_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.16.txt_attn.qkv.bias : tensor<9216xf16>
    %5298 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.txt_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Convert bias, activation and weight to f32; the matmul and bias add are
    // carried out in f32.
    %int6_6637 = torch.constant.int 6
    %5299 = torch.prims.convert_element_type %5298, %int6_6637 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_6638 = torch.constant.int 6
    %5300 = torch.prims.convert_element_type %5295, %int6_6638 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_6639 = torch.constant.int 6
    %5301 = torch.prims.convert_element_type %5297, %int6_6639 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %5302 = torch.aten.mm %5300, %5301 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[512,9216],f32>
    // Multiplications by the integer 1 followed by an add with alpha = 1.
    // NOTE(review): presumably a decomposed addmm with alpha = beta = 1 - confirm.
    %int1_6640 = torch.constant.int 1
    %5303 = torch.aten.mul.Scalar %5302, %int1_6640 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int1_6641 = torch.constant.int 1
    %5304 = torch.aten.mul.Scalar %5299, %int1_6641 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_6642 = torch.constant.int 1
    %5305 = torch.aten.add.Tensor %5303, %5304, %int1_6642 : !torch.vtensor<[512,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    // Back to f16.
    %int5_6643 = torch.constant.int 5
    %5306 = torch.prims.convert_element_type %5305, %int5_6643 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f16>
    // Restore the leading unit dim: [512,9216] -> [1,512,9216].
    %int1_6644 = torch.constant.int 1
    %int512_6645 = torch.constant.int 512
    %int9216_6646 = torch.constant.int 9216
    %5307 = torch.prim.ListConstruct %int1_6644, %int512_6645, %int9216_6646 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5308 = torch.aten.view %5306, %5307 : !torch.vtensor<[512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,9216],f16>
    // Split the 9216 axis as 3 x 24 x 128: [1,512,9216] -> [1,512,3,24,128].
    %int1_6647 = torch.constant.int 1
    %int512_6648 = torch.constant.int 512
    %int3_6649 = torch.constant.int 3
    %int24_6650 = torch.constant.int 24
    %int128_6651 = torch.constant.int 128
    %5309 = torch.prim.ListConstruct %int1_6647, %int512_6648, %int3_6649, %int24_6650, %int128_6651 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5310 = torch.aten.view %5308, %5309 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    // Permute with order (2,0,3,1,4): [1,512,3,24,128] -> [3,1,24,512,128].
    %int2_6652 = torch.constant.int 2
    %int0_6653 = torch.constant.int 0
    %int3_6654 = torch.constant.int 3
    %int1_6655 = torch.constant.int 1
    %int4_6656 = torch.constant.int 4
    %5311 = torch.prim.ListConstruct %int2_6652, %int0_6653, %int3_6654, %int1_6655, %int4_6656 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5312 = torch.aten.permute %5310, %5311 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // Index 0 of the leading size-3 dim; normalised below with the query_norm scale.
    %int0_6657 = torch.constant.int 0
    %int0_6658 = torch.constant.int 0
    %5313 = torch.aten.select.int %5312, %int0_6657, %int0_6658 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Index 1 of the leading dim; normalised below with the key_norm scale.
    %int0_6659 = torch.constant.int 0
    %int1_6660 = torch.constant.int 1
    %5314 = torch.aten.select.int %5312, %int0_6659, %int1_6660 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Index 2 of the leading dim; concatenated un-normalised further down (%5343).
    %int0_6661 = torch.constant.int 0
    %int2_6662 = torch.constant.int 2
    %5315 = torch.aten.select.int %5312, %int0_6661, %int2_6662 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // double_blocks.16.txt_attn.norm: for each of %5313 and %5314 compute
    // x * rsqrt(mean(x^2, dim=-1) + eps) in f32, convert to f16 and multiply by a
    // learned [128] scale. No mean is subtracted.
    // --- %5313 with query_norm.scale ---
    %int6_6663 = torch.constant.int 6
    %5316 = torch.prims.convert_element_type %5313, %int6_6663 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_6664 = torch.constant.int 2
    %5317 = torch.aten.pow.Tensor_Scalar %5316, %int2_6664 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    // Mean of squares over the last dim, keepdim = true -> [1,24,512,1].
    %int-1_6665 = torch.constant.int -1
    %5318 = torch.prim.ListConstruct %int-1_6665 : (!torch.int) -> !torch.list<int>
    %true_6666 = torch.constant.bool true
    %none_6667 = torch.constant.none
    %5319 = torch.aten.mean.dim %5317, %5318, %true_6666, %none_6667 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    // rsqrt(mean + eps) with eps ~= 1e-6.
    %float9.999990e-07_6668 = torch.constant.float 9.9999999999999995E-7
    %int1_6669 = torch.constant.int 1
    %5320 = torch.aten.add.Scalar %5319, %float9.999990e-07_6668, %int1_6669 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %5321 = torch.aten.rsqrt %5320 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %5322 = torch.aten.mul.Tensor %5316, %5321 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_6670 = torch.constant.int 5
    %5323 = torch.prims.convert_element_type %5322, %int5_6670 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Per-channel scale ([128] f16) broadcast over the leading dims.
    %__auto.sampler.double_blocks.16.txt_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.16.txt_attn.norm.query_norm.scale : tensor<128xf16>
    %5324 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.txt_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %5325 = torch.aten.mul.Tensor %5323, %5324 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // --- %5314 with key_norm.scale (same sequence of ops) ---
    %int6_6671 = torch.constant.int 6
    %5326 = torch.prims.convert_element_type %5314, %int6_6671 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_6672 = torch.constant.int 2
    %5327 = torch.aten.pow.Tensor_Scalar %5326, %int2_6672 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_6673 = torch.constant.int -1
    %5328 = torch.prim.ListConstruct %int-1_6673 : (!torch.int) -> !torch.list<int>
    %true_6674 = torch.constant.bool true
    %none_6675 = torch.constant.none
    %5329 = torch.aten.mean.dim %5327, %5328, %true_6674, %none_6675 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_6676 = torch.constant.float 9.9999999999999995E-7
    %int1_6677 = torch.constant.int 1
    %5330 = torch.aten.add.Scalar %5329, %float9.999990e-07_6676, %int1_6677 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %5331 = torch.aten.rsqrt %5330 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %5332 = torch.aten.mul.Tensor %5326, %5331 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_6678 = torch.constant.int 5
    %5333 = torch.prims.convert_element_type %5332, %int5_6678 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.16.txt_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.16.txt_attn.norm.key_norm.scale : tensor<128xf16>
    %5334 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.txt_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %5335 = torch.aten.mul.Tensor %5333, %5334 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // f16 -> f16 conversions: input and result types are identical, so these do
    // not change the dtype. NOTE(review): presumably a traced `.to(dtype)` - confirm.
    %int5_6679 = torch.constant.int 5
    %5336 = torch.prims.convert_element_type %5325, %int5_6679 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int5_6680 = torch.constant.int 5
    %5337 = torch.prims.convert_element_type %5335, %int5_6680 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Concatenate the text-stream and image-stream tensors along dim 2, text
    // first: 512 + 4096 = 4608 positions, giving [1,24,4608,128] each.
    // %5339: the two query_norm-scaled tensors (%5336 text, %5282 image).
    %5338 = torch.prim.ListConstruct %5336, %5282 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_6681 = torch.constant.int 2
    %5339 = torch.aten.cat %5338, %int2_6681 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // %5341: the two key_norm-scaled tensors (%5337 text, %5283 image).
    %5340 = torch.prim.ListConstruct %5337, %5283 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_6682 = torch.constant.int 2
    %5341 = torch.aten.cat %5340, %int2_6682 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // %5343: the two un-normalised third slices of the QKV outputs (%5315 text, %5261 image).
    %5342 = torch.prim.ListConstruct %5315, %5261 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_6683 = torch.constant.int 2
    %5343 = torch.aten.cat %5342, %int2_6683 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int6_6684 = torch.constant.int 6
    %5344 = torch.prims.convert_element_type %5339, %int6_6684 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_6685 = torch.constant.int 1
    %int24_6686 = torch.constant.int 24
    %int4608_6687 = torch.constant.int 4608
    %int-1_6688 = torch.constant.int -1
    %int1_6689 = torch.constant.int 1
    %int2_6690 = torch.constant.int 2
    %5345 = torch.prim.ListConstruct %int1_6685, %int24_6686, %int4608_6687, %int-1_6688, %int1_6689, %int2_6690 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5346 = torch.aten.view %5344, %5345 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    %int6_6691 = torch.constant.int 6
    %5347 = torch.prims.convert_element_type %5341, %int6_6691 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_6692 = torch.constant.int 1
    %int24_6693 = torch.constant.int 24
    %int4608_6694 = torch.constant.int 4608
    %int-1_6695 = torch.constant.int -1
    %int1_6696 = torch.constant.int 1
    %int2_6697 = torch.constant.int 2
    %5348 = torch.prim.ListConstruct %int1_6692, %int24_6693, %int4608_6694, %int-1_6695, %int1_6696, %int2_6697 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5349 = torch.aten.view %5347, %5348 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Combine %211 with the paired view %5346:
    //   %5356 = %211[..., 0] * %5346[..., 0] + %211[..., 1] * %5346[..., 1]
    // where [..., i] selects index i on dim 5. The size-1 dims (heads on the %211
    // side, the trailing 1 on the %5346 side) broadcast, giving [1,24,4608,64,2].
    // NOTE(review): %211 is defined outside this chunk; the pattern matches applying
    // a precomputed table of 2x2 rotary position matrices — confirm upstream.
    %int5_6698 = torch.constant.int 5
    %int0_6699 = torch.constant.int 0
    %5350 = torch.aten.select.int %211, %int5_6698, %int0_6699 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_6700 = torch.constant.int 5
    %int0_6701 = torch.constant.int 0
    %5351 = torch.aten.select.int %5346, %int5_6700, %int0_6701 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %5352 = torch.aten.mul.Tensor %5350, %5351 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second term: index 1 of the last dim on both operands.
    %int5_6702 = torch.constant.int 5
    %int1_6703 = torch.constant.int 1
    %5353 = torch.aten.select.int %211, %int5_6702, %int1_6703 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_6704 = torch.constant.int 5
    %int1_6705 = torch.constant.int 1
    %5354 = torch.aten.select.int %5346, %int5_6704, %int1_6705 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %5355 = torch.aten.mul.Tensor %5353, %5354 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two terms (the trailing int 1 is the add's alpha multiplier).
    %int1_6706 = torch.constant.int 1
    %5356 = torch.aten.add.Tensor %5352, %5355, %int1_6706 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Same combination as applied to %5346, now for the second paired view %5349:
    //   %5363 = %211[..., 0] * %5349[..., 0] + %211[..., 1] * %5349[..., 1]
    // The selects on %211 are re-emitted rather than reusing the earlier results.
    // NOTE(review): %211 presumably holds rotary position matrices — defined
    // outside this chunk, confirm upstream.
    %int5_6707 = torch.constant.int 5
    %int0_6708 = torch.constant.int 0
    %5357 = torch.aten.select.int %211, %int5_6707, %int0_6708 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_6709 = torch.constant.int 5
    %int0_6710 = torch.constant.int 0
    %5358 = torch.aten.select.int %5349, %int5_6709, %int0_6710 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %5359 = torch.aten.mul.Tensor %5357, %5358 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second term: index 1 of the last dim on both operands.
    %int5_6711 = torch.constant.int 5
    %int1_6712 = torch.constant.int 1
    %5360 = torch.aten.select.int %211, %int5_6711, %int1_6712 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_6713 = torch.constant.int 5
    %int1_6714 = torch.constant.int 1
    %5361 = torch.aten.select.int %5349, %int5_6713, %int1_6714 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %5362 = torch.aten.mul.Tensor %5360, %5361 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two terms (alpha = 1).
    %int1_6715 = torch.constant.int 1
    %5363 = torch.aten.add.Tensor %5359, %5362, %int1_6715 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Collapse the (64, 2) pair dims of both combined tensors back into a single
    // last dim of 128 and narrow f32 -> f16 (dtype code 5 yields the f16 result
    // type). %5366 and %5369 become operands 0 and 1 of the attention op.
    %int1_6716 = torch.constant.int 1
    %int24_6717 = torch.constant.int 24
    %int4608_6718 = torch.constant.int 4608
    %int128_6719 = torch.constant.int 128
    %5364 = torch.prim.ListConstruct %int1_6716, %int24_6717, %int4608_6718, %int128_6719 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5365 = torch.aten.view %5356, %5364 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_6720 = torch.constant.int 5
    %5366 = torch.prims.convert_element_type %5365, %int5_6720 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Same flatten + cast for the second tensor.
    %int1_6721 = torch.constant.int 1
    %int24_6722 = torch.constant.int 24
    %int4608_6723 = torch.constant.int 4608
    %int128_6724 = torch.constant.int 128
    %5367 = torch.prim.ListConstruct %int1_6721, %int24_6722, %int4608_6723, %int128_6724 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5368 = torch.aten.view %5363, %5367 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_6725 = torch.constant.int 5
    %5369 = torch.prims.convert_element_type %5368, %int5_6725 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled-dot-product attention over the joint 4608-row sequence with 24 heads
    // of width 128. Operands after the three tensors are, in order: 0.0, false,
    // none, none. Per the PyTorch op schema these are presumably dropout_p,
    // is_causal, attn_mask and scale — confirm against the schema. %5343 (third
    // tensor operand, presumably the value) is defined before this chunk.
    // The op has two results; within this chunk only result #0 is consumed.
    %float0.000000e00_6726 = torch.constant.float 0.000000e+00
    %false_6727 = torch.constant.bool false
    %none_6728 = torch.constant.none
    %none_6729 = torch.constant.none
    %5370:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%5366, %5369, %5343, %float0.000000e00_6726, %false_6727, %none_6728, %none_6729) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2: [1,24,4608,128] -> [1,4608,24,128].
    %int0_6730 = torch.constant.int 0
    %int2_6731 = torch.constant.int 2
    %int1_6732 = torch.constant.int 1
    %int3_6733 = torch.constant.int 3
    %5371 = torch.prim.ListConstruct %int0_6730, %int2_6731, %int1_6732, %int3_6733 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5372 = torch.aten.permute %5370#0, %5371 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the 24 x 128 head dims into one feature dim of 3072.
    %int1_6734 = torch.constant.int 1
    %int4608_6735 = torch.constant.int 4608
    %int3072_6736 = torch.constant.int 3072
    %5373 = torch.prim.ListConstruct %int1_6734, %int4608_6735, %int3072_6736 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5374 = torch.aten.view %5372, %5373 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Split the attention output along dim 1 (4608 rows):
    //   %5376 = rows [0, 512)     -> later fed to txt_attn.proj
    //   %5378 = rows [512, 4608)  -> fed to img_attn.proj (4096 rows)
    // The slices on dim 0 span [0, INT64_MAX) with step 1 and leave the shape
    // unchanged.
    %int0_6737 = torch.constant.int 0
    %int0_6738 = torch.constant.int 0
    %int9223372036854775807_6739 = torch.constant.int 9223372036854775807
    %int1_6740 = torch.constant.int 1
    %5375 = torch.aten.slice.Tensor %5374, %int0_6737, %int0_6738, %int9223372036854775807_6739, %int1_6740 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_6741 = torch.constant.int 1
    %int0_6742 = torch.constant.int 0
    %int512_6743 = torch.constant.int 512
    %int1_6744 = torch.constant.int 1
    %5376 = torch.aten.slice.Tensor %5375, %int1_6741, %int0_6742, %int512_6743, %int1_6744 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Remaining rows, from 512 to the end of dim 1.
    %int0_6745 = torch.constant.int 0
    %int0_6746 = torch.constant.int 0
    %int9223372036854775807_6747 = torch.constant.int 9223372036854775807
    %int1_6748 = torch.constant.int 1
    %5377 = torch.aten.slice.Tensor %5374, %int0_6745, %int0_6746, %int9223372036854775807_6747, %int1_6748 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_6749 = torch.constant.int 1
    %int512_6750 = torch.constant.int 512
    %int9223372036854775807_6751 = torch.constant.int 9223372036854775807
    %int1_6752 = torch.constant.int 1
    %5378 = torch.aten.slice.Tensor %5377, %int1_6749, %int512_6750, %int9223372036854775807_6751, %int1_6752 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer double_blocks.16.img_attn.proj on the 4096-row slice:
    //   out = x @ W^T + b, computed in f32 and narrowed back to f16.
    // The input is first flattened from [1,4096,3072] to [4096,3072] for the 2-D mm.
    %int4096_6753 = torch.constant.int 4096
    %int3072_6754 = torch.constant.int 3072
    %5379 = torch.prim.ListConstruct %int4096_6753, %int3072_6754 : (!torch.int, !torch.int) -> !torch.list<int>
    %5380 = torch.aten.view %5378, %5379 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Load the [3072,3072] weight and transpose it for the matmul.
    %__auto.sampler.double_blocks.16.img_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.16.img_attn.proj.weight : tensor<3072x3072xf16>
    %5381 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.img_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_6755 = torch.constant.int 0
    %int1_6756 = torch.constant.int 1
    %5382 = torch.aten.transpose.int %5381, %int0_6755, %int1_6756 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.16.img_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.16.img_attn.proj.bias : tensor<3072xf16>
    %5383 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.img_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Widen bias, input and weight to f32 (dtype code 6) before the matmul.
    %int6_6757 = torch.constant.int 6
    %5384 = torch.prims.convert_element_type %5383, %int6_6757 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_6758 = torch.constant.int 6
    %5385 = torch.prims.convert_element_type %5380, %int6_6758 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_6759 = torch.constant.int 6
    %5386 = torch.prims.convert_element_type %5382, %int6_6759 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %5387 = torch.aten.mm %5385, %5386 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Multiplications by the integer 1 leave the values unchanged; they look like
    // the alpha/beta factors of a decomposed addmm — assumption, not shown here.
    %int1_6760 = torch.constant.int 1
    %5388 = torch.aten.mul.Scalar %5387, %int1_6760 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_6761 = torch.constant.int 1
    %5389 = torch.aten.mul.Scalar %5384, %int1_6761 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Add the bias (broadcast over rows), then narrow to f16 (dtype code 5).
    %int1_6762 = torch.constant.int 1
    %5390 = torch.aten.add.Tensor %5388, %5389, %int1_6762 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_6763 = torch.constant.int 5
    %5391 = torch.prims.convert_element_type %5390, %int5_6763 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    // Restore the leading batch dim.
    %int1_6764 = torch.constant.int 1
    %int4096_6765 = torch.constant.int 4096
    %int3072_6766 = torch.constant.int 3072
    %5392 = torch.prim.ListConstruct %int1_6764, %int4096_6765, %int3072_6766 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5393 = torch.aten.view %5391, %5392 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Scale the projection by %5205 (broadcast over the 4096 rows) and add it to
    // %5127: %5395 = %5127 + %5205 * proj.
    // NOTE(review): %5205 and %5127 come from before this chunk — presumably the
    // attention gate and the incoming image stream; confirm upstream.
    %5394 = torch.aten.mul.Tensor %5205, %5393 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_6767 = torch.constant.int 1
    %5395 = torch.aten.add.Tensor %5127, %5394, %int1_6767 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Normalise %5395 over its last dim and modulate it:
    //   %5405 = (1 + %5207) * ((x - mean) * rsqrt(var + eps)) + %5206
    // No learned weight or bias is applied by the normalisation in this span.
    // NOTE(review): %5207 / %5206 are defined before this chunk — presumably the
    // MLP scale and shift modulation vectors; confirm upstream.
    %int1_6768 = torch.constant.int 1
    %int1_6769 = torch.constant.int 1
    %5396 = torch.aten.add.Scalar %5207, %int1_6768, %int1_6769 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Statistics are computed on an f32 copy of the input.
    %int6_6770 = torch.constant.int 6
    %5397 = torch.prims.convert_element_type %5395, %int6_6770 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // var_mean over dim 2 with correction 0 and keepdim = true; result0 is the
    // variance and result1 the mean (see how they are used below).
    %int2_6771 = torch.constant.int 2
    %5398 = torch.prim.ListConstruct %int2_6771 : (!torch.int) -> !torch.list<int>
    %int0_6772 = torch.constant.int 0
    %true_6773 = torch.constant.bool true
    %result0_6774, %result1_6775 = torch.aten.var_mean.correction %5397, %5398, %int0_6772, %true_6773 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // eps (about 1e-6) is added to the variance before the reciprocal square root.
    %float9.999990e-07_6776 = torch.constant.float 9.9999999999999995E-7
    %int1_6777 = torch.constant.int 1
    %5399 = torch.aten.add.Scalar %result0_6774, %float9.999990e-07_6776, %int1_6777 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %5400 = torch.aten.rsqrt %5399 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // The subtraction takes the f16 tensor %5395 (not the f32 copy) with the f32
    // mean; the declared result type is f32.
    %int1_6778 = torch.constant.int 1
    %5401 = torch.aten.sub.Tensor %5395, %result1_6775, %int1_6778 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %5402 = torch.aten.mul.Tensor %5401, %5400 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Narrow back to f16, then apply the (1 + scale) factor and the shift.
    %int5_6779 = torch.constant.int 5
    %5403 = torch.prims.convert_element_type %5402, %int5_6779 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %5404 = torch.aten.mul.Tensor %5396, %5403 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_6780 = torch.constant.int 1
    %5405 = torch.aten.add.Tensor %5404, %5206, %int1_6780 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer double_blocks.16.img_mlp.0 (3072 -> 12288 features) followed by
    // GELU with the "tanh" approximation string. The matmul runs in f32 and the
    // result is narrowed to f16 before the activation.
    // Flatten [1,4096,3072] -> [4096,3072] for the 2-D mm.
    %int4096_6781 = torch.constant.int 4096
    %int3072_6782 = torch.constant.int 3072
    %5406 = torch.prim.ListConstruct %int4096_6781, %int3072_6782 : (!torch.int, !torch.int) -> !torch.list<int>
    %5407 = torch.aten.view %5405, %5406 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Load the [12288,3072] weight and transpose it to [3072,12288].
    %__auto.sampler.double_blocks.16.img_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.16.img_mlp.0.weight : tensor<12288x3072xf16>
    %5408 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.img_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_6783 = torch.constant.int 0
    %int1_6784 = torch.constant.int 1
    %5409 = torch.aten.transpose.int %5408, %int0_6783, %int1_6784 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.16.img_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.16.img_mlp.0.bias : tensor<12288xf16>
    %5410 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.img_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Widen bias, input and weight to f32 (dtype code 6).
    %int6_6785 = torch.constant.int 6
    %5411 = torch.prims.convert_element_type %5410, %int6_6785 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_6786 = torch.constant.int 6
    %5412 = torch.prims.convert_element_type %5407, %int6_6786 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_6787 = torch.constant.int 6
    %5413 = torch.prims.convert_element_type %5409, %int6_6787 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %5414 = torch.aten.mm %5412, %5413 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[4096,12288],f32>
    // Multiplications by the integer 1 leave the values unchanged (they look like
    // decomposed addmm alpha/beta factors — assumption).
    %int1_6788 = torch.constant.int 1
    %5415 = torch.aten.mul.Scalar %5414, %int1_6788 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int1_6789 = torch.constant.int 1
    %5416 = torch.aten.mul.Scalar %5411, %int1_6789 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    // Add the bias, narrow to f16 (dtype code 5), restore the batch dim.
    %int1_6790 = torch.constant.int 1
    %5417 = torch.aten.add.Tensor %5415, %5416, %int1_6790 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int5_6791 = torch.constant.int 5
    %5418 = torch.prims.convert_element_type %5417, %int5_6791 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f16>
    %int1_6792 = torch.constant.int 1
    %int4096_6793 = torch.constant.int 4096
    %int12288_6794 = torch.constant.int 12288
    %5419 = torch.prim.ListConstruct %int1_6792, %int4096_6793, %int12288_6794 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5420 = torch.aten.view %5418, %5419 : !torch.vtensor<[4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,12288],f16>
    // Activation: GELU, approximate = "tanh".
    %str_6795 = torch.constant.str "tanh"
    %5421 = torch.aten.gelu %5420, %str_6795 : !torch.vtensor<[1,4096,12288],f16>, !torch.str -> !torch.vtensor<[1,4096,12288],f16>
    // Linear layer double_blocks.16.img_mlp.2 (12288 -> 3072 features) on the GELU
    // output, followed by a scaled residual add. The matmul runs in f32.
    // Flatten [1,4096,12288] -> [4096,12288] for the 2-D mm.
    %int4096_6796 = torch.constant.int 4096
    %int12288_6797 = torch.constant.int 12288
    %5422 = torch.prim.ListConstruct %int4096_6796, %int12288_6797 : (!torch.int, !torch.int) -> !torch.list<int>
    %5423 = torch.aten.view %5421, %5422 : !torch.vtensor<[1,4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[4096,12288],f16>
    // Load the [3072,12288] weight and transpose it to [12288,3072].
    %__auto.sampler.double_blocks.16.img_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.16.img_mlp.2.weight : tensor<3072x12288xf16>
    %5424 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.img_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_6798 = torch.constant.int 0
    %int1_6799 = torch.constant.int 1
    %5425 = torch.aten.transpose.int %5424, %int0_6798, %int1_6799 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.16.img_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.16.img_mlp.2.bias : tensor<3072xf16>
    %5426 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.img_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Widen bias, input and weight to f32 (dtype code 6).
    %int6_6800 = torch.constant.int 6
    %5427 = torch.prims.convert_element_type %5426, %int6_6800 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_6801 = torch.constant.int 6
    %5428 = torch.prims.convert_element_type %5423, %int6_6801 : !torch.vtensor<[4096,12288],f16>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int6_6802 = torch.constant.int 6
    %5429 = torch.prims.convert_element_type %5425, %int6_6802 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %5430 = torch.aten.mm %5428, %5429 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Multiplications by the integer 1 leave the values unchanged (they look like
    // decomposed addmm alpha/beta factors — assumption).
    %int1_6803 = torch.constant.int 1
    %5431 = torch.aten.mul.Scalar %5430, %int1_6803 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_6804 = torch.constant.int 1
    %5432 = torch.aten.mul.Scalar %5427, %int1_6804 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Add the bias, narrow to f16 (dtype code 5), restore the batch dim.
    %int1_6805 = torch.constant.int 1
    %5433 = torch.aten.add.Tensor %5431, %5432, %int1_6805 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_6806 = torch.constant.int 5
    %5434 = torch.prims.convert_element_type %5433, %int5_6806 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_6807 = torch.constant.int 1
    %int4096_6808 = torch.constant.int 4096
    %int3072_6809 = torch.constant.int 3072
    %5435 = torch.prim.ListConstruct %int1_6807, %int4096_6808, %int3072_6809 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5436 = torch.aten.view %5434, %5435 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Scaled residual: %5438 = %5395 + %5208 * mlp_out.
    // NOTE(review): %5208 is defined before this chunk — presumably the MLP gate;
    // %5438 is not consumed within this chunk.
    %5437 = torch.aten.mul.Tensor %5208, %5436 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_6810 = torch.constant.int 1
    %5438 = torch.aten.add.Tensor %5395, %5437, %int1_6810 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer double_blocks.16.txt_attn.proj on the 512-row slice %5376 of the
    // attention output: out = x @ W^T + b, computed in f32 and narrowed to f16.
    // Flatten [1,512,3072] -> [512,3072] for the 2-D mm.
    %int512_6811 = torch.constant.int 512
    %int3072_6812 = torch.constant.int 3072
    %5439 = torch.prim.ListConstruct %int512_6811, %int3072_6812 : (!torch.int, !torch.int) -> !torch.list<int>
    %5440 = torch.aten.view %5376, %5439 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Load the [3072,3072] weight and transpose it for the matmul.
    %__auto.sampler.double_blocks.16.txt_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.16.txt_attn.proj.weight : tensor<3072x3072xf16>
    %5441 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.txt_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_6813 = torch.constant.int 0
    %int1_6814 = torch.constant.int 1
    %5442 = torch.aten.transpose.int %5441, %int0_6813, %int1_6814 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.16.txt_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.16.txt_attn.proj.bias : tensor<3072xf16>
    %5443 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.txt_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Widen bias, input and weight to f32 (dtype code 6).
    %int6_6815 = torch.constant.int 6
    %5444 = torch.prims.convert_element_type %5443, %int6_6815 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_6816 = torch.constant.int 6
    %5445 = torch.prims.convert_element_type %5440, %int6_6816 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_6817 = torch.constant.int 6
    %5446 = torch.prims.convert_element_type %5442, %int6_6817 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %5447 = torch.aten.mm %5445, %5446 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Multiplications by the integer 1 leave the values unchanged (they look like
    // decomposed addmm alpha/beta factors — assumption).
    %int1_6818 = torch.constant.int 1
    %5448 = torch.aten.mul.Scalar %5447, %int1_6818 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_6819 = torch.constant.int 1
    %5449 = torch.aten.mul.Scalar %5444, %int1_6819 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Add the bias, narrow to f16 (dtype code 5), restore the batch dim.
    %int1_6820 = torch.constant.int 1
    %5450 = torch.aten.add.Tensor %5448, %5449, %int1_6820 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_6821 = torch.constant.int 5
    %5451 = torch.prims.convert_element_type %5450, %int5_6821 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_6822 = torch.constant.int 1
    %int512_6823 = torch.constant.int 512
    %int3072_6824 = torch.constant.int 3072
    %5452 = torch.prim.ListConstruct %int1_6822, %int512_6823, %int3072_6824 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5453 = torch.aten.view %5451, %5452 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Scaled residual: %5455 = %5187 + %5226 * proj.
    // NOTE(review): %5226 and %5187 come from before this chunk — presumably the
    // text attention gate and the incoming text stream; confirm upstream.
    %5454 = torch.aten.mul.Tensor %5226, %5453 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_6825 = torch.constant.int 1
    %5455 = torch.aten.add.Tensor %5187, %5454, %int1_6825 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Normalise %5455 over its last dim and modulate it:
    //   %5465 = (1 + %5228) * ((x - mean) * rsqrt(var + eps)) + %5227
    // No learned weight or bias is applied by the normalisation in this span.
    // NOTE(review): %5228 / %5227 are defined before this chunk — presumably the
    // text MLP scale and shift modulation vectors; confirm upstream.
    %int1_6826 = torch.constant.int 1
    %int1_6827 = torch.constant.int 1
    %5456 = torch.aten.add.Scalar %5228, %int1_6826, %int1_6827 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Statistics are computed on an f32 copy of the input.
    %int6_6828 = torch.constant.int 6
    %5457 = torch.prims.convert_element_type %5455, %int6_6828 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // var_mean over dim 2 with correction 0 and keepdim = true; result0 is the
    // variance and result1 the mean (see how they are used below).
    %int2_6829 = torch.constant.int 2
    %5458 = torch.prim.ListConstruct %int2_6829 : (!torch.int) -> !torch.list<int>
    %int0_6830 = torch.constant.int 0
    %true_6831 = torch.constant.bool true
    %result0_6832, %result1_6833 = torch.aten.var_mean.correction %5457, %5458, %int0_6830, %true_6831 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // eps (about 1e-6) is added to the variance before the reciprocal square root.
    %float9.999990e-07_6834 = torch.constant.float 9.9999999999999995E-7
    %int1_6835 = torch.constant.int 1
    %5459 = torch.aten.add.Scalar %result0_6832, %float9.999990e-07_6834, %int1_6835 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %5460 = torch.aten.rsqrt %5459 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // The subtraction takes the f16 tensor %5455 (not the f32 copy) with the f32
    // mean; the declared result type is f32.
    %int1_6836 = torch.constant.int 1
    %5461 = torch.aten.sub.Tensor %5455, %result1_6833, %int1_6836 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %5462 = torch.aten.mul.Tensor %5461, %5460 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    // Narrow back to f16, then apply the (1 + scale) factor and the shift.
    %int5_6837 = torch.constant.int 5
    %5463 = torch.prims.convert_element_type %5462, %int5_6837 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %5464 = torch.aten.mul.Tensor %5456, %5463 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_6838 = torch.constant.int 1
    %5465 = torch.aten.add.Tensor %5464, %5227, %int1_6838 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Linear layer double_blocks.16.txt_mlp.0 (3072 -> 12288 features) followed by
    // GELU with the "tanh" approximation string. The matmul runs in f32 and the
    // result is narrowed to f16 before the activation.
    // Flatten [1,512,3072] -> [512,3072] for the 2-D mm.
    %int512_6839 = torch.constant.int 512
    %int3072_6840 = torch.constant.int 3072
    %5466 = torch.prim.ListConstruct %int512_6839, %int3072_6840 : (!torch.int, !torch.int) -> !torch.list<int>
    %5467 = torch.aten.view %5465, %5466 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Load the [12288,3072] weight and transpose it to [3072,12288].
    %__auto.sampler.double_blocks.16.txt_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.16.txt_mlp.0.weight : tensor<12288x3072xf16>
    %5468 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.txt_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_6841 = torch.constant.int 0
    %int1_6842 = torch.constant.int 1
    %5469 = torch.aten.transpose.int %5468, %int0_6841, %int1_6842 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.16.txt_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.16.txt_mlp.0.bias : tensor<12288xf16>
    %5470 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.txt_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Widen bias, input and weight to f32 (dtype code 6).
    %int6_6843 = torch.constant.int 6
    %5471 = torch.prims.convert_element_type %5470, %int6_6843 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_6844 = torch.constant.int 6
    %5472 = torch.prims.convert_element_type %5467, %int6_6844 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_6845 = torch.constant.int 6
    %5473 = torch.prims.convert_element_type %5469, %int6_6845 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %5474 = torch.aten.mm %5472, %5473 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[512,12288],f32>
    // Multiplications by the integer 1 leave the values unchanged (they look like
    // decomposed addmm alpha/beta factors — assumption).
    %int1_6846 = torch.constant.int 1
    %5475 = torch.aten.mul.Scalar %5474, %int1_6846 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int1_6847 = torch.constant.int 1
    %5476 = torch.aten.mul.Scalar %5471, %int1_6847 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    // Add the bias, narrow to f16 (dtype code 5), restore the batch dim.
    %int1_6848 = torch.constant.int 1
    %5477 = torch.aten.add.Tensor %5475, %5476, %int1_6848 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int5_6849 = torch.constant.int 5
    %5478 = torch.prims.convert_element_type %5477, %int5_6849 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f16>
    %int1_6850 = torch.constant.int 1
    %int512_6851 = torch.constant.int 512
    %int12288_6852 = torch.constant.int 12288
    %5479 = torch.prim.ListConstruct %int1_6850, %int512_6851, %int12288_6852 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5480 = torch.aten.view %5478, %5479 : !torch.vtensor<[512,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,512,12288],f16>
    // Activation: GELU, approximate = "tanh".
    %str_6853 = torch.constant.str "tanh"
    %5481 = torch.aten.gelu %5480, %str_6853 : !torch.vtensor<[1,512,12288],f16>, !torch.str -> !torch.vtensor<[1,512,12288],f16>
    // Linear layer double_blocks.16.txt_mlp.2 (12288 -> 3072 features) on the GELU
    // output, followed by a scaled residual add. The matmul runs in f32.
    // Flatten [1,512,12288] -> [512,12288] for the 2-D mm.
    %int512_6854 = torch.constant.int 512
    %int12288_6855 = torch.constant.int 12288
    %5482 = torch.prim.ListConstruct %int512_6854, %int12288_6855 : (!torch.int, !torch.int) -> !torch.list<int>
    %5483 = torch.aten.view %5481, %5482 : !torch.vtensor<[1,512,12288],f16>, !torch.list<int> -> !torch.vtensor<[512,12288],f16>
    // Load the [3072,12288] weight and transpose it to [12288,3072].
    %__auto.sampler.double_blocks.16.txt_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.16.txt_mlp.2.weight : tensor<3072x12288xf16>
    %5484 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.txt_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_6856 = torch.constant.int 0
    %int1_6857 = torch.constant.int 1
    %5485 = torch.aten.transpose.int %5484, %int0_6856, %int1_6857 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.16.txt_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.16.txt_mlp.2.bias : tensor<3072xf16>
    %5486 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.16.txt_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Widen bias, input and weight to f32 (dtype code 6).
    %int6_6858 = torch.constant.int 6
    %5487 = torch.prims.convert_element_type %5486, %int6_6858 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_6859 = torch.constant.int 6
    %5488 = torch.prims.convert_element_type %5483, %int6_6859 : !torch.vtensor<[512,12288],f16>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int6_6860 = torch.constant.int 6
    %5489 = torch.prims.convert_element_type %5485, %int6_6860 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %5490 = torch.aten.mm %5488, %5489 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Multiplications by the integer 1 leave the values unchanged (they look like
    // decomposed addmm alpha/beta factors — assumption).
    %int1_6861 = torch.constant.int 1
    %5491 = torch.aten.mul.Scalar %5490, %int1_6861 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_6862 = torch.constant.int 1
    %5492 = torch.aten.mul.Scalar %5487, %int1_6862 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Add the bias, narrow to f16 (dtype code 5), restore the batch dim.
    %int1_6863 = torch.constant.int 1
    %5493 = torch.aten.add.Tensor %5491, %5492, %int1_6863 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_6864 = torch.constant.int 5
    %5494 = torch.prims.convert_element_type %5493, %int5_6864 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_6865 = torch.constant.int 1
    %int512_6866 = torch.constant.int 512
    %int3072_6867 = torch.constant.int 3072
    %5495 = torch.prim.ListConstruct %int1_6865, %int512_6866, %int3072_6867 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5496 = torch.aten.view %5494, %5495 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Scaled residual: %5498 = %5455 + %5229 * mlp_out.
    // NOTE(review): %5229 is defined before this chunk — presumably the text MLP
    // gate; %5498 is not consumed within this chunk.
    %5497 = torch.aten.mul.Tensor %5229, %5496 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_6868 = torch.constant.int 1
    %5498 = torch.aten.add.Tensor %5455, %5497, %int1_6868 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // double_blocks.17.img_mod.lin: SiLU(%119) -> linear (3072 -> 18432) -> six
    // contiguous 3072-wide slices %5514..%5519 of shape [1,1,3072].
    // NOTE(review): %119 is presumably the shared conditioning vector -- it is
    // defined outside this part of the file; confirm.
    %5499 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.double_blocks.17.img_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.17.img_mod.lin.weight : tensor<18432x3072xf16>
    %5500 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.img_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_6869 = torch.constant.int 0
    %int1_6870 = torch.constant.int 1
    // Transpose the weight to [3072,18432] so that it can be the right-hand mm operand.
    %5501 = torch.aten.transpose.int %5500, %int0_6869, %int1_6870 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.17.img_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.17.img_mod.lin.bias : tensor<18432xf16>
    %5502 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.img_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Bias, activation and weight are all widened to f32; the matmul and the
    // bias add run in f32 and the result is narrowed back to f16 afterwards.
    %int6_6871 = torch.constant.int 6
    %5503 = torch.prims.convert_element_type %5502, %int6_6871 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_6872 = torch.constant.int 6
    %5504 = torch.prims.convert_element_type %5499, %int6_6872 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_6873 = torch.constant.int 6
    %5505 = torch.prims.convert_element_type %5501, %int6_6873 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %5506 = torch.aten.mm %5504, %5505 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Both multiplications by the integer 1 leave the values unchanged.
    %int1_6874 = torch.constant.int 1
    %5507 = torch.aten.mul.Scalar %5506, %int1_6874 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_6875 = torch.constant.int 1
    %5508 = torch.aten.mul.Scalar %5503, %int1_6875 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_6876 = torch.constant.int 1
    %5509 = torch.aten.add.Tensor %5507, %5508, %int1_6876 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_6877 = torch.constant.int 5
    %5510 = torch.prims.convert_element_type %5509, %int5_6877 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): shape is unchanged.
    %int0_6878 = torch.constant.int 0
    %int0_6879 = torch.constant.int 0
    %int9223372036854775807_6880 = torch.constant.int 9223372036854775807
    %int1_6881 = torch.constant.int 1
    %5511 = torch.aten.slice.Tensor %5510, %int0_6878, %int0_6879, %int9223372036854775807_6880, %int1_6881 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a unit dim at position 1 so the slices broadcast against [1,N,3072].
    %int1_6882 = torch.constant.int 1
    %5512 = torch.aten.unsqueeze %5511, %int1_6882 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Full-range slice on dim 2: shape is unchanged.
    %int2_6883 = torch.constant.int 2
    %int0_6884 = torch.constant.int 0
    %int9223372036854775807_6885 = torch.constant.int 9223372036854775807
    %int1_6886 = torch.constant.int 1
    %5513 = torch.aten.slice.Tensor %5512, %int2_6883, %int0_6884, %int9223372036854775807_6885, %int1_6886 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Split the last dim into six chunks of 3072.
    // NOTE(review): presumably two (shift, scale, gate) triples -- confirm
    // against the model definition.
    // %5514 = [0, 3072)
    %int-1_6887 = torch.constant.int -1
    %int0_6888 = torch.constant.int 0
    %int3072_6889 = torch.constant.int 3072
    %int1_6890 = torch.constant.int 1
    %5514 = torch.aten.slice.Tensor %5513, %int-1_6887, %int0_6888, %int3072_6889, %int1_6890 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %5515 = [3072, 6144)
    %int-1_6891 = torch.constant.int -1
    %int3072_6892 = torch.constant.int 3072
    %int6144_6893 = torch.constant.int 6144
    %int1_6894 = torch.constant.int 1
    %5515 = torch.aten.slice.Tensor %5513, %int-1_6891, %int3072_6892, %int6144_6893, %int1_6894 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %5516 = [6144, 9216)
    %int-1_6895 = torch.constant.int -1
    %int6144_6896 = torch.constant.int 6144
    %int9216_6897 = torch.constant.int 9216
    %int1_6898 = torch.constant.int 1
    %5516 = torch.aten.slice.Tensor %5513, %int-1_6895, %int6144_6896, %int9216_6897, %int1_6898 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %5517 = [9216, 12288)
    %int-1_6899 = torch.constant.int -1
    %int9216_6900 = torch.constant.int 9216
    %int12288_6901 = torch.constant.int 12288
    %int1_6902 = torch.constant.int 1
    %5517 = torch.aten.slice.Tensor %5513, %int-1_6899, %int9216_6900, %int12288_6901, %int1_6902 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %5518 = [12288, 15360)
    %int-1_6903 = torch.constant.int -1
    %int12288_6904 = torch.constant.int 12288
    %int15360_6905 = torch.constant.int 15360
    %int1_6906 = torch.constant.int 1
    %5518 = torch.aten.slice.Tensor %5513, %int-1_6903, %int12288_6904, %int15360_6905, %int1_6906 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %5519 = [15360, 18432)
    %int-1_6907 = torch.constant.int -1
    %int15360_6908 = torch.constant.int 15360
    %int18432_6909 = torch.constant.int 18432
    %int1_6910 = torch.constant.int 1
    %5519 = torch.aten.slice.Tensor %5513, %int-1_6907, %int15360_6908, %int18432_6909, %int1_6910 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // double_blocks.17.txt_mod.lin: SiLU(%119) -> linear (3072 -> 18432) -> six
    // contiguous 3072-wide slices %5535..%5540 of shape [1,1,3072].
    // The op sequence is the same as for img_mod, with the txt_mod parameters.
    %5520 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.double_blocks.17.txt_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.17.txt_mod.lin.weight : tensor<18432x3072xf16>
    %5521 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.txt_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_6911 = torch.constant.int 0
    %int1_6912 = torch.constant.int 1
    // Transpose the weight to [3072,18432] so that it can be the right-hand mm operand.
    %5522 = torch.aten.transpose.int %5521, %int0_6911, %int1_6912 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.17.txt_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.17.txt_mod.lin.bias : tensor<18432xf16>
    %5523 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.txt_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Matmul and bias add run in f32; the result is narrowed back to f16 afterwards.
    %int6_6913 = torch.constant.int 6
    %5524 = torch.prims.convert_element_type %5523, %int6_6913 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_6914 = torch.constant.int 6
    %5525 = torch.prims.convert_element_type %5520, %int6_6914 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_6915 = torch.constant.int 6
    %5526 = torch.prims.convert_element_type %5522, %int6_6915 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %5527 = torch.aten.mm %5525, %5526 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Both multiplications by the integer 1 leave the values unchanged.
    %int1_6916 = torch.constant.int 1
    %5528 = torch.aten.mul.Scalar %5527, %int1_6916 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_6917 = torch.constant.int 1
    %5529 = torch.aten.mul.Scalar %5524, %int1_6917 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_6918 = torch.constant.int 1
    %5530 = torch.aten.add.Tensor %5528, %5529, %int1_6918 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int5_6919 = torch.constant.int 5
    %5531 = torch.prims.convert_element_type %5530, %int5_6919 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): shape is unchanged.
    %int0_6920 = torch.constant.int 0
    %int0_6921 = torch.constant.int 0
    %int9223372036854775807_6922 = torch.constant.int 9223372036854775807
    %int1_6923 = torch.constant.int 1
    %5532 = torch.aten.slice.Tensor %5531, %int0_6920, %int0_6921, %int9223372036854775807_6922, %int1_6923 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Insert a unit dim at position 1 so the slices broadcast against [1,N,3072].
    %int1_6924 = torch.constant.int 1
    %5533 = torch.aten.unsqueeze %5532, %int1_6924 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Full-range slice on dim 2: shape is unchanged.
    %int2_6925 = torch.constant.int 2
    %int0_6926 = torch.constant.int 0
    %int9223372036854775807_6927 = torch.constant.int 9223372036854775807
    %int1_6928 = torch.constant.int 1
    %5534 = torch.aten.slice.Tensor %5533, %int2_6925, %int0_6926, %int9223372036854775807_6927, %int1_6928 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Split the last dim into six chunks of 3072.
    // NOTE(review): presumably two (shift, scale, gate) triples -- confirm
    // against the model definition.
    // %5535 = [0, 3072)
    %int-1_6929 = torch.constant.int -1
    %int0_6930 = torch.constant.int 0
    %int3072_6931 = torch.constant.int 3072
    %int1_6932 = torch.constant.int 1
    %5535 = torch.aten.slice.Tensor %5534, %int-1_6929, %int0_6930, %int3072_6931, %int1_6932 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %5536 = [3072, 6144)
    %int-1_6933 = torch.constant.int -1
    %int3072_6934 = torch.constant.int 3072
    %int6144_6935 = torch.constant.int 6144
    %int1_6936 = torch.constant.int 1
    %5536 = torch.aten.slice.Tensor %5534, %int-1_6933, %int3072_6934, %int6144_6935, %int1_6936 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %5537 = [6144, 9216)
    %int-1_6937 = torch.constant.int -1
    %int6144_6938 = torch.constant.int 6144
    %int9216_6939 = torch.constant.int 9216
    %int1_6940 = torch.constant.int 1
    %5537 = torch.aten.slice.Tensor %5534, %int-1_6937, %int6144_6938, %int9216_6939, %int1_6940 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %5538 = [9216, 12288)
    %int-1_6941 = torch.constant.int -1
    %int9216_6942 = torch.constant.int 9216
    %int12288_6943 = torch.constant.int 12288
    %int1_6944 = torch.constant.int 1
    %5538 = torch.aten.slice.Tensor %5534, %int-1_6941, %int9216_6942, %int12288_6943, %int1_6944 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %5539 = [12288, 15360)
    %int-1_6945 = torch.constant.int -1
    %int12288_6946 = torch.constant.int 12288
    %int15360_6947 = torch.constant.int 15360
    %int1_6948 = torch.constant.int 1
    %5539 = torch.aten.slice.Tensor %5534, %int-1_6945, %int12288_6946, %int15360_6947, %int1_6948 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %5540 = [15360, 18432)
    %int-1_6949 = torch.constant.int -1
    %int15360_6950 = torch.constant.int 15360
    %int18432_6951 = torch.constant.int 18432
    %int1_6952 = torch.constant.int 1
    %5540 = torch.aten.slice.Tensor %5534, %int-1_6949, %int15360_6950, %int18432_6951, %int1_6952 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalise the [1,4096,3072] tensor %5438 over dim 2 with no learned
    // scale or bias, then modulate it:
    //   %5550 = (1 + %5515) * norm(%5438) + %5514
    // and flatten the result to [4096,3072] for the following matmul.
    %int6_6953 = torch.constant.int 6
    %5541 = torch.prims.convert_element_type %5438, %int6_6953 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %int2_6954 = torch.constant.int 2
    %5542 = torch.prim.ListConstruct %int2_6954 : (!torch.int) -> !torch.list<int>
    %int0_6955 = torch.constant.int 0
    %true_6956 = torch.constant.bool true
    // Variance (correction = 0) and mean over dim 2, keepdim = true, computed in f32.
    %result0_6957, %result1_6958 = torch.aten.var_mean.correction %5541, %5542, %int0_6955, %true_6956 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(var + eps) with eps ~= 1e-6.
    %float9.999990e-07_6959 = torch.constant.float 9.9999999999999995E-7
    %int1_6960 = torch.constant.int 1
    %5543 = torch.aten.add.Scalar %result0_6957, %float9.999990e-07_6959, %int1_6960 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %5544 = torch.aten.rsqrt %5543 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // The subtraction takes the f16 input %5438 and the f32 mean and yields f32.
    %int1_6961 = torch.constant.int 1
    %5545 = torch.aten.sub.Tensor %5438, %result1_6958, %int1_6961 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %5546 = torch.aten.mul.Tensor %5545, %5544 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    %int5_6962 = torch.constant.int 5
    %5547 = torch.prims.convert_element_type %5546, %int5_6962 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Modulation in f16: %5515 + 1 is the multiplicative term, %5514 the additive term.
    %int1_6963 = torch.constant.int 1
    %int1_6964 = torch.constant.int 1
    %5548 = torch.aten.add.Scalar %5515, %int1_6963, %int1_6964 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %5549 = torch.aten.mul.Tensor %5548, %5547 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_6965 = torch.constant.int 1
    %5550 = torch.aten.add.Tensor %5549, %5514, %int1_6965 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %int4096_6966 = torch.constant.int 4096
    %int3072_6967 = torch.constant.int 3072
    %5551 = torch.prim.ListConstruct %int4096_6966, %int3072_6967 : (!torch.int, !torch.int) -> !torch.list<int>
    %5552 = torch.aten.view %5550, %5551 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // double_blocks.17.img_attn.qkv: linear 3072 -> 9216 applied to %5552, then
    // reshape to [1,4096,3,24,128], permute to [3,1,24,4096,128] and split along
    // dim 0 into three [1,24,4096,128] tensors %5570, %5571 and %5572.
    %__auto.sampler.double_blocks.17.img_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.17.img_attn.qkv.weight : tensor<9216x3072xf16>
    %5553 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.img_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_6968 = torch.constant.int 0
    %int1_6969 = torch.constant.int 1
    %5554 = torch.aten.transpose.int %5553, %int0_6968, %int1_6969 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.17.img_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.17.img_attn.qkv.bias : tensor<9216xf16>
    %5555 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.img_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Matmul and bias add run in f32; the result is narrowed back to f16 afterwards.
    %int6_6970 = torch.constant.int 6
    %5556 = torch.prims.convert_element_type %5555, %int6_6970 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_6971 = torch.constant.int 6
    %5557 = torch.prims.convert_element_type %5552, %int6_6971 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_6972 = torch.constant.int 6
    %5558 = torch.prims.convert_element_type %5554, %int6_6972 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %5559 = torch.aten.mm %5557, %5558 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[4096,9216],f32>
    // Both multiplications by the integer 1 leave the values unchanged.
    %int1_6973 = torch.constant.int 1
    %5560 = torch.aten.mul.Scalar %5559, %int1_6973 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int1_6974 = torch.constant.int 1
    %5561 = torch.aten.mul.Scalar %5556, %int1_6974 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_6975 = torch.constant.int 1
    %5562 = torch.aten.add.Tensor %5560, %5561, %int1_6975 : !torch.vtensor<[4096,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int5_6976 = torch.constant.int 5
    %5563 = torch.prims.convert_element_type %5562, %int5_6976 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f16>
    // Restore the leading unit dim: [4096,9216] -> [1,4096,9216].
    %int1_6977 = torch.constant.int 1
    %int4096_6978 = torch.constant.int 4096
    %int9216_6979 = torch.constant.int 9216
    %5564 = torch.prim.ListConstruct %int1_6977, %int4096_6978, %int9216_6979 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5565 = torch.aten.view %5563, %5564 : !torch.vtensor<[4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,9216],f16>
    // Factor 9216 into 3 x 24 x 128.
    // NOTE(review): presumably (q/k/v, heads, head_dim) -- confirm against the model definition.
    %int1_6980 = torch.constant.int 1
    %int4096_6981 = torch.constant.int 4096
    %int3_6982 = torch.constant.int 3
    %int24_6983 = torch.constant.int 24
    %int128_6984 = torch.constant.int 128
    %5566 = torch.prim.ListConstruct %int1_6980, %int4096_6981, %int3_6982, %int24_6983, %int128_6984 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5567 = torch.aten.view %5565, %5566 : !torch.vtensor<[1,4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    // Permute with order (2, 0, 3, 1, 4): the size-3 axis becomes dim 0.
    %int2_6985 = torch.constant.int 2
    %int0_6986 = torch.constant.int 0
    %int3_6987 = torch.constant.int 3
    %int1_6988 = torch.constant.int 1
    %int4_6989 = torch.constant.int 4
    %5568 = torch.prim.ListConstruct %int2_6985, %int0_6986, %int3_6987, %int1_6988, %int4_6989 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5569 = torch.aten.permute %5567, %5568 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // Index 0 -> %5570 (later scaled by query_norm.scale), index 1 -> %5571
    // (later scaled by key_norm.scale), index 2 -> %5572.
    %int0_6990 = torch.constant.int 0
    %int0_6991 = torch.constant.int 0
    %5570 = torch.aten.select.int %5569, %int0_6990, %int0_6991 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int0_6992 = torch.constant.int 0
    %int1_6993 = torch.constant.int 1
    %5571 = torch.aten.select.int %5569, %int0_6992, %int1_6993 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int0_6994 = torch.constant.int 0
    %int2_6995 = torch.constant.int 2
    %5572 = torch.aten.select.int %5569, %int0_6994, %int2_6995 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // double_blocks.17.img_attn.norm: RMS-normalise %5570 and %5571 over the
    // last dim (size 128) in f32,
    //   y = x * rsqrt(mean(x^2, dim=-1, keepdim) + eps),  eps ~= 1e-6
    // narrow to f16, then multiply by the learned [128] scale vectors.
    // Query path: %5570 -> %5582.
    %int6_6996 = torch.constant.int 6
    %5573 = torch.prims.convert_element_type %5570, %int6_6996 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_6997 = torch.constant.int 2
    %5574 = torch.aten.pow.Tensor_Scalar %5573, %int2_6997 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_6998 = torch.constant.int -1
    %5575 = torch.prim.ListConstruct %int-1_6998 : (!torch.int) -> !torch.list<int>
    %true_6999 = torch.constant.bool true
    %none_7000 = torch.constant.none
    %5576 = torch.aten.mean.dim %5574, %5575, %true_6999, %none_7000 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_7001 = torch.constant.float 9.9999999999999995E-7
    %int1_7002 = torch.constant.int 1
    %5577 = torch.aten.add.Scalar %5576, %float9.999990e-07_7001, %int1_7002 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %5578 = torch.aten.rsqrt %5577 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %5579 = torch.aten.mul.Tensor %5573, %5578 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    %int5_7003 = torch.constant.int 5
    %5580 = torch.prims.convert_element_type %5579, %int5_7003 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.17.img_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.17.img_attn.norm.query_norm.scale : tensor<128xf16>
    %5581 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.img_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %5582 = torch.aten.mul.Tensor %5580, %5581 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // Key path: %5571 -> %5592, same computation with key_norm.scale.
    %int6_7004 = torch.constant.int 6
    %5583 = torch.prims.convert_element_type %5571, %int6_7004 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_7005 = torch.constant.int 2
    %5584 = torch.aten.pow.Tensor_Scalar %5583, %int2_7005 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_7006 = torch.constant.int -1
    %5585 = torch.prim.ListConstruct %int-1_7006 : (!torch.int) -> !torch.list<int>
    %true_7007 = torch.constant.bool true
    %none_7008 = torch.constant.none
    %5586 = torch.aten.mean.dim %5584, %5585, %true_7007, %none_7008 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_7009 = torch.constant.float 9.9999999999999995E-7
    %int1_7010 = torch.constant.int 1
    %5587 = torch.aten.add.Scalar %5586, %float9.999990e-07_7009, %int1_7010 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %5588 = torch.aten.rsqrt %5587 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %5589 = torch.aten.mul.Tensor %5583, %5588 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    %int5_7011 = torch.constant.int 5
    %5590 = torch.prims.convert_element_type %5589, %int5_7011 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.17.img_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.17.img_attn.norm.key_norm.scale : tensor<128xf16>
    %5591 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.img_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %5592 = torch.aten.mul.Tensor %5590, %5591 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // Both conversions below are f16 -> f16, so the element type does not change.
    %int5_7012 = torch.constant.int 5
    %5593 = torch.prims.convert_element_type %5582, %int5_7012 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_7013 = torch.constant.int 5
    %5594 = torch.prims.convert_element_type %5592, %int5_7013 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Normalise the [1,512,3072] tensor %5498 over dim 2 with no learned
    // scale or bias, then modulate it:
    //   %5604 = (1 + %5536) * norm(%5498) + %5535
    // and flatten the result to [512,3072] for the following matmul.
    %int6_7014 = torch.constant.int 6
    %5595 = torch.prims.convert_element_type %5498, %int6_7014 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %int2_7015 = torch.constant.int 2
    %5596 = torch.prim.ListConstruct %int2_7015 : (!torch.int) -> !torch.list<int>
    %int0_7016 = torch.constant.int 0
    %true_7017 = torch.constant.bool true
    // Variance (correction = 0) and mean over dim 2, keepdim = true, computed in f32.
    %result0_7018, %result1_7019 = torch.aten.var_mean.correction %5595, %5596, %int0_7016, %true_7017 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(var + eps) with eps ~= 1e-6.
    %float9.999990e-07_7020 = torch.constant.float 9.9999999999999995E-7
    %int1_7021 = torch.constant.int 1
    %5597 = torch.aten.add.Scalar %result0_7018, %float9.999990e-07_7020, %int1_7021 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %5598 = torch.aten.rsqrt %5597 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // The subtraction takes the f16 input %5498 and the f32 mean and yields f32.
    %int1_7022 = torch.constant.int 1
    %5599 = torch.aten.sub.Tensor %5498, %result1_7019, %int1_7022 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %5600 = torch.aten.mul.Tensor %5599, %5598 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_7023 = torch.constant.int 5
    %5601 = torch.prims.convert_element_type %5600, %int5_7023 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Modulation in f16: %5536 + 1 is the multiplicative term, %5535 the additive term.
    %int1_7024 = torch.constant.int 1
    %int1_7025 = torch.constant.int 1
    %5602 = torch.aten.add.Scalar %5536, %int1_7024, %int1_7025 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %5603 = torch.aten.mul.Tensor %5602, %5601 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_7026 = torch.constant.int 1
    %5604 = torch.aten.add.Tensor %5603, %5535, %int1_7026 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %int512_7027 = torch.constant.int 512
    %int3072_7028 = torch.constant.int 3072
    %5605 = torch.prim.ListConstruct %int512_7027, %int3072_7028 : (!torch.int, !torch.int) -> !torch.list<int>
    %5606 = torch.aten.view %5604, %5605 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // double_blocks.17.txt_attn.qkv: linear 3072 -> 9216 applied to %5606, then
    // reshape to [1,512,3,24,128], permute to [3,1,24,512,128] and split along
    // dim 0 into three [1,24,512,128] tensors %5624, %5625 and %5626.
    %__auto.sampler.double_blocks.17.txt_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.17.txt_attn.qkv.weight : tensor<9216x3072xf16>
    %5607 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.txt_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_7029 = torch.constant.int 0
    %int1_7030 = torch.constant.int 1
    %5608 = torch.aten.transpose.int %5607, %int0_7029, %int1_7030 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.17.txt_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.17.txt_attn.qkv.bias : tensor<9216xf16>
    %5609 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.txt_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Matmul and bias add run in f32; the result is narrowed back to f16 afterwards.
    %int6_7031 = torch.constant.int 6
    %5610 = torch.prims.convert_element_type %5609, %int6_7031 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_7032 = torch.constant.int 6
    %5611 = torch.prims.convert_element_type %5606, %int6_7032 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_7033 = torch.constant.int 6
    %5612 = torch.prims.convert_element_type %5608, %int6_7033 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %5613 = torch.aten.mm %5611, %5612 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[512,9216],f32>
    // Both multiplications by the integer 1 leave the values unchanged.
    %int1_7034 = torch.constant.int 1
    %5614 = torch.aten.mul.Scalar %5613, %int1_7034 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int1_7035 = torch.constant.int 1
    %5615 = torch.aten.mul.Scalar %5610, %int1_7035 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_7036 = torch.constant.int 1
    %5616 = torch.aten.add.Tensor %5614, %5615, %int1_7036 : !torch.vtensor<[512,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int5_7037 = torch.constant.int 5
    %5617 = torch.prims.convert_element_type %5616, %int5_7037 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f16>
    // Restore the leading unit dim: [512,9216] -> [1,512,9216].
    %int1_7038 = torch.constant.int 1
    %int512_7039 = torch.constant.int 512
    %int9216_7040 = torch.constant.int 9216
    %5618 = torch.prim.ListConstruct %int1_7038, %int512_7039, %int9216_7040 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5619 = torch.aten.view %5617, %5618 : !torch.vtensor<[512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,9216],f16>
    // Factor 9216 into 3 x 24 x 128.
    // NOTE(review): presumably (q/k/v, heads, head_dim) -- confirm against the model definition.
    %int1_7041 = torch.constant.int 1
    %int512_7042 = torch.constant.int 512
    %int3_7043 = torch.constant.int 3
    %int24_7044 = torch.constant.int 24
    %int128_7045 = torch.constant.int 128
    %5620 = torch.prim.ListConstruct %int1_7041, %int512_7042, %int3_7043, %int24_7044, %int128_7045 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5621 = torch.aten.view %5619, %5620 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    // Permute with order (2, 0, 3, 1, 4): the size-3 axis becomes dim 0.
    %int2_7046 = torch.constant.int 2
    %int0_7047 = torch.constant.int 0
    %int3_7048 = torch.constant.int 3
    %int1_7049 = torch.constant.int 1
    %int4_7050 = torch.constant.int 4
    %5622 = torch.prim.ListConstruct %int2_7046, %int0_7047, %int3_7048, %int1_7049, %int4_7050 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5623 = torch.aten.permute %5621, %5622 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // Index 0 -> %5624 (later scaled by query_norm.scale), index 1 -> %5625
    // (later scaled by key_norm.scale), index 2 -> %5626.
    %int0_7051 = torch.constant.int 0
    %int0_7052 = torch.constant.int 0
    %5624 = torch.aten.select.int %5623, %int0_7051, %int0_7052 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int0_7053 = torch.constant.int 0
    %int1_7054 = torch.constant.int 1
    %5625 = torch.aten.select.int %5623, %int0_7053, %int1_7054 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int0_7055 = torch.constant.int 0
    %int2_7056 = torch.constant.int 2
    %5626 = torch.aten.select.int %5623, %int0_7055, %int2_7056 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // double_blocks.17.txt_attn.norm: RMS-normalise %5624 and %5625 over the
    // last dim (size 128) in f32,
    //   y = x * rsqrt(mean(x^2, dim=-1, keepdim) + eps),  eps ~= 1e-6
    // narrow to f16, then multiply by the learned [128] scale vectors.
    // Query path: %5624 -> %5636.
    %int6_7057 = torch.constant.int 6
    %5627 = torch.prims.convert_element_type %5624, %int6_7057 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_7058 = torch.constant.int 2
    %5628 = torch.aten.pow.Tensor_Scalar %5627, %int2_7058 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_7059 = torch.constant.int -1
    %5629 = torch.prim.ListConstruct %int-1_7059 : (!torch.int) -> !torch.list<int>
    %true_7060 = torch.constant.bool true
    %none_7061 = torch.constant.none
    %5630 = torch.aten.mean.dim %5628, %5629, %true_7060, %none_7061 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_7062 = torch.constant.float 9.9999999999999995E-7
    %int1_7063 = torch.constant.int 1
    %5631 = torch.aten.add.Scalar %5630, %float9.999990e-07_7062, %int1_7063 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %5632 = torch.aten.rsqrt %5631 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %5633 = torch.aten.mul.Tensor %5627, %5632 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_7064 = torch.constant.int 5
    %5634 = torch.prims.convert_element_type %5633, %int5_7064 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.17.txt_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.17.txt_attn.norm.query_norm.scale : tensor<128xf16>
    %5635 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.txt_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %5636 = torch.aten.mul.Tensor %5634, %5635 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // Key path: %5625 -> %5646, same computation with key_norm.scale.
    %int6_7065 = torch.constant.int 6
    %5637 = torch.prims.convert_element_type %5625, %int6_7065 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_7066 = torch.constant.int 2
    %5638 = torch.aten.pow.Tensor_Scalar %5637, %int2_7066 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_7067 = torch.constant.int -1
    %5639 = torch.prim.ListConstruct %int-1_7067 : (!torch.int) -> !torch.list<int>
    %true_7068 = torch.constant.bool true
    %none_7069 = torch.constant.none
    %5640 = torch.aten.mean.dim %5638, %5639, %true_7068, %none_7069 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    %float9.999990e-07_7070 = torch.constant.float 9.9999999999999995E-7
    %int1_7071 = torch.constant.int 1
    %5641 = torch.aten.add.Scalar %5640, %float9.999990e-07_7070, %int1_7071 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %5642 = torch.aten.rsqrt %5641 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %5643 = torch.aten.mul.Tensor %5637, %5642 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_7072 = torch.constant.int 5
    %5644 = torch.prims.convert_element_type %5643, %int5_7072 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.17.txt_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.17.txt_attn.norm.key_norm.scale : tensor<128xf16>
    %5645 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.txt_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %5646 = torch.aten.mul.Tensor %5644, %5645 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // Both conversions below are f16 -> f16, so the element type does not change.
    %int5_7073 = torch.constant.int 5
    %5647 = torch.prims.convert_element_type %5636, %int5_7073 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int5_7074 = torch.constant.int 5
    %5648 = torch.prims.convert_element_type %5646, %int5_7074 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %5649 = torch.prim.ListConstruct %5647, %5593 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_7075 = torch.constant.int 2
    %5650 = torch.aten.cat %5649, %int2_7075 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %5651 = torch.prim.ListConstruct %5648, %5594 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_7076 = torch.constant.int 2
    %5652 = torch.aten.cat %5651, %int2_7076 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %5653 = torch.prim.ListConstruct %5626, %5572 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_7077 = torch.constant.int 2
    %5654 = torch.aten.cat %5653, %int2_7077 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Apply the same pairwise linear map to %5650 and %5652 using table %211
    // ([1,1,4608,64,2,2], f32, defined before this chunk).
    // NOTE(review): the structure looks like a rotary position embedding with
    // %211 as the precomputed per-position table -- confirm where %211 is built.
    // For each input x (f16 [1,24,4608,128]):
    //   x6  = view(f32(x), [1,24,4608,-1,1,2])           -> [1,24,4608,64,1,2]
    //   out = %211[...,0] * x6[...,0] + %211[...,1] * x6[...,1]
    //   y   = f16(view(out, [1,24,4608,128]))
    // The 24-head axis broadcasts against the size-1 axis of %211.

    // Upcast and reshape the first operand (%5650) into 64 pairs per head.
    %int6_7078 = torch.constant.int 6
    %5655 = torch.prims.convert_element_type %5650, %int6_7078 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_7079 = torch.constant.int 1
    %int24_7080 = torch.constant.int 24
    %int4608_7081 = torch.constant.int 4608
    %int-1_7082 = torch.constant.int -1
    %int1_7083 = torch.constant.int 1
    %int2_7084 = torch.constant.int 2
    %5656 = torch.prim.ListConstruct %int1_7079, %int24_7080, %int4608_7081, %int-1_7082, %int1_7083, %int2_7084 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5657 = torch.aten.view %5655, %5656 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Same upcast and reshape for the second operand (%5652).
    %int6_7085 = torch.constant.int 6
    %5658 = torch.prims.convert_element_type %5652, %int6_7085 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_7086 = torch.constant.int 1
    %int24_7087 = torch.constant.int 24
    %int4608_7088 = torch.constant.int 4608
    %int-1_7089 = torch.constant.int -1
    %int1_7090 = torch.constant.int 1
    %int2_7091 = torch.constant.int 2
    %5659 = torch.prim.ListConstruct %int1_7086, %int24_7087, %int4608_7088, %int-1_7089, %int1_7090, %int2_7091 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5660 = torch.aten.view %5658, %5659 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // First operand: %5667 = %211[...,0] * %5657[...,0] + %211[...,1] * %5657[...,1]
    %int5_7092 = torch.constant.int 5
    %int0_7093 = torch.constant.int 0
    %5661 = torch.aten.select.int %211, %int5_7092, %int0_7093 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_7094 = torch.constant.int 5
    %int0_7095 = torch.constant.int 0
    %5662 = torch.aten.select.int %5657, %int5_7094, %int0_7095 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %5663 = torch.aten.mul.Tensor %5661, %5662 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_7096 = torch.constant.int 5
    %int1_7097 = torch.constant.int 1
    %5664 = torch.aten.select.int %211, %int5_7096, %int1_7097 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_7098 = torch.constant.int 5
    %int1_7099 = torch.constant.int 1
    %5665 = torch.aten.select.int %5657, %int5_7098, %int1_7099 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %5666 = torch.aten.mul.Tensor %5664, %5665 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_7100 = torch.constant.int 1
    %5667 = torch.aten.add.Tensor %5663, %5666, %int1_7100 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second operand: %5674 = %211[...,0] * %5660[...,0] + %211[...,1] * %5660[...,1]
    %int5_7101 = torch.constant.int 5
    %int0_7102 = torch.constant.int 0
    %5668 = torch.aten.select.int %211, %int5_7101, %int0_7102 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_7103 = torch.constant.int 5
    %int0_7104 = torch.constant.int 0
    %5669 = torch.aten.select.int %5660, %int5_7103, %int0_7104 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %5670 = torch.aten.mul.Tensor %5668, %5669 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_7105 = torch.constant.int 5
    %int1_7106 = torch.constant.int 1
    %5671 = torch.aten.select.int %211, %int5_7105, %int1_7106 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_7107 = torch.constant.int 5
    %int1_7108 = torch.constant.int 1
    %5672 = torch.aten.select.int %5660, %int5_7107, %int1_7108 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %5673 = torch.aten.mul.Tensor %5671, %5672 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_7109 = torch.constant.int 1
    %5674 = torch.aten.add.Tensor %5670, %5673, %int1_7109 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Collapse the (64,2) pair axes back to 128 and return to f16.
    %int1_7110 = torch.constant.int 1
    %int24_7111 = torch.constant.int 24
    %int4608_7112 = torch.constant.int 4608
    %int128_7113 = torch.constant.int 128
    %5675 = torch.prim.ListConstruct %int1_7110, %int24_7111, %int4608_7112, %int128_7113 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5676 = torch.aten.view %5667, %5675 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_7114 = torch.constant.int 5
    %5677 = torch.prims.convert_element_type %5676, %int5_7114 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_7115 = torch.constant.int 1
    %int24_7116 = torch.constant.int 24
    %int4608_7117 = torch.constant.int 4608
    %int128_7118 = torch.constant.int 128
    %5678 = torch.prim.ListConstruct %int1_7115, %int24_7116, %int4608_7117, %int128_7118 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5679 = torch.aten.view %5674, %5678 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_7119 = torch.constant.int 5
    %5680 = torch.prims.convert_element_type %5679, %int5_7119 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled dot-product attention over the joint 4608-row sequence with 24 heads
    // of width 128. Operands are (%5677, %5680, %5654), followed by 0.0, false,
    // none, none. Per the ATen schema these trailing arguments are dropout_p,
    // is_causal, attn_mask and scale (TODO confirm against the op registration).
    // Only result #0 is consumed in this chunk; the f32 [1,24,4608] result #1 is
    // not used in the lines visible here.
    %float0.000000e00_7120 = torch.constant.float 0.000000e+00
    %false_7121 = torch.constant.bool false
    %none_7122 = torch.constant.none
    %none_7123 = torch.constant.none
    %5681:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%5677, %5680, %5654, %float0.000000e00_7120, %false_7121, %none_7122, %none_7123) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap the head and sequence axes: [1,24,4608,128] -> [1,4608,24,128].
    %int0_7124 = torch.constant.int 0
    %int2_7125 = torch.constant.int 2
    %int1_7126 = torch.constant.int 1
    %int3_7127 = torch.constant.int 3
    %5682 = torch.prim.ListConstruct %int0_7124, %int2_7125, %int1_7126, %int3_7127 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5683 = torch.aten.permute %5681#0, %5682 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the 24 heads of 128 into a single 3072-wide feature axis.
    %int1_7128 = torch.constant.int 1
    %int4608_7129 = torch.constant.int 4608
    %int3072_7130 = torch.constant.int 3072
    %5684 = torch.prim.ListConstruct %int1_7128, %int4608_7129, %int3072_7130 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5685 = torch.aten.view %5683, %5684 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Split the attention output %5685 along dim 1, undoing the earlier concat:
    //   %5687 = %5685[:, 0:512]   -> [1,512,3072]   (consumed by txt_attn.proj)
    //   %5689 = %5685[:, 512:]    -> [1,4096,3072]  (consumed by img_attn.proj)
    // The dim-0 slices span [0, INT64_MAX) with step 1 and leave the shape
    // unchanged; 9223372036854775807 stands for "to the end".
    %int0_7131 = torch.constant.int 0
    %int0_7132 = torch.constant.int 0
    %int9223372036854775807_7133 = torch.constant.int 9223372036854775807
    %int1_7134 = torch.constant.int 1
    %5686 = torch.aten.slice.Tensor %5685, %int0_7131, %int0_7132, %int9223372036854775807_7133, %int1_7134 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_7135 = torch.constant.int 1
    %int0_7136 = torch.constant.int 0
    %int512_7137 = torch.constant.int 512
    %int1_7138 = torch.constant.int 1
    %5687 = torch.aten.slice.Tensor %5686, %int1_7135, %int0_7136, %int512_7137, %int1_7138 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %int0_7139 = torch.constant.int 0
    %int0_7140 = torch.constant.int 0
    %int9223372036854775807_7141 = torch.constant.int 9223372036854775807
    %int1_7142 = torch.constant.int 1
    %5688 = torch.aten.slice.Tensor %5685, %int0_7139, %int0_7140, %int9223372036854775807_7141, %int1_7142 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_7143 = torch.constant.int 1
    %int512_7144 = torch.constant.int 512
    %int9223372036854775807_7145 = torch.constant.int 9223372036854775807
    %int1_7146 = torch.constant.int 1
    %5689 = torch.aten.slice.Tensor %5688, %int1_7143, %int512_7144, %int9223372036854775807_7145, %int1_7146 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear projection double_blocks.17.img_attn.proj on the 4096-row slice,
    // followed by a scaled residual add:
    //   y     = f16(f32(x) @ f32(W^T) + f32(b))     with x = %5689 flattened to [4096,3072]
    //   %5706 = %5438 + %5516 * y
    // The matmul and bias add run in f32; the result is cast back to f16.
    // %5516 ([1,1,3072]) and %5438 ([1,4096,3072]) are defined before this chunk
    // (presumably a modulation gate and the running img residual -- confirm).
    %int4096_7147 = torch.constant.int 4096
    %int3072_7148 = torch.constant.int 3072
    %5690 = torch.prim.ListConstruct %int4096_7147, %int3072_7148 : (!torch.int, !torch.int) -> !torch.list<int>
    %5691 = torch.aten.view %5689, %5690 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.17.img_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.17.img_attn.proj.weight : tensor<3072x3072xf16>
    %5692 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.img_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_7149 = torch.constant.int 0
    %int1_7150 = torch.constant.int 1
    %5693 = torch.aten.transpose.int %5692, %int0_7149, %int1_7150 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.17.img_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.17.img_attn.proj.bias : tensor<3072xf16>
    %5694 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.img_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, activations and transposed weight to f32 for the matmul.
    %int6_7151 = torch.constant.int 6
    %5695 = torch.prims.convert_element_type %5694, %int6_7151 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_7152 = torch.constant.int 6
    %5696 = torch.prims.convert_element_type %5691, %int6_7152 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_7153 = torch.constant.int 6
    %5697 = torch.prims.convert_element_type %5693, %int6_7153 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %5698 = torch.aten.mm %5696, %5697 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Both operands are multiplied by the integer constant 1 before the add.
    %int1_7154 = torch.constant.int 1
    %5699 = torch.aten.mul.Scalar %5698, %int1_7154 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_7155 = torch.constant.int 1
    %5700 = torch.aten.mul.Scalar %5695, %int1_7155 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_7156 = torch.constant.int 1
    %5701 = torch.aten.add.Tensor %5699, %5700, %int1_7156 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_7157 = torch.constant.int 5
    %5702 = torch.prims.convert_element_type %5701, %int5_7157 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    // Restore the leading batch axis, then apply the multiplier and residual add.
    %int1_7158 = torch.constant.int 1
    %int4096_7159 = torch.constant.int 4096
    %int3072_7160 = torch.constant.int 3072
    %5703 = torch.prim.ListConstruct %int1_7158, %int4096_7159, %int3072_7160 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5704 = torch.aten.view %5702, %5703 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    %5705 = torch.aten.mul.Tensor %5516, %5704 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_7161 = torch.constant.int 1
    %5706 = torch.aten.add.Tensor %5438, %5705, %int1_7161 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Normalize %5706 over its last axis (dim 2) and apply a scale and shift:
    //   var, mean = var_mean(f32(%5706), dim=[2], correction=0, keepdim=true)
    //   n         = f16((%5706 - mean) * rsqrt(var + ~1e-6))
    //   %5716     = (%5518 + 1) * n + %5517
    // No learned weight or bias is loaded for this normalization here.
    // %5518 and %5517 ([1,1,3072]) are defined before this chunk (presumably
    // modulation scale and shift -- confirm upstream).
    %int1_7162 = torch.constant.int 1
    %int1_7163 = torch.constant.int 1
    %5707 = torch.aten.add.Scalar %5518, %int1_7162, %int1_7163 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int6_7164 = torch.constant.int 6
    %5708 = torch.prims.convert_element_type %5706, %int6_7164 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %int2_7165 = torch.constant.int 2
    %5709 = torch.prim.ListConstruct %int2_7165 : (!torch.int) -> !torch.list<int>
    %int0_7166 = torch.constant.int 0
    %true_7167 = torch.constant.bool true
    // result0 is the variance and result1 the mean, both [1,4096,1] in f32.
    %result0_7168, %result1_7169 = torch.aten.var_mean.correction %5708, %5709, %int0_7166, %true_7167 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    %float9.999990e-07_7170 = torch.constant.float 9.9999999999999995E-7
    %int1_7171 = torch.constant.int 1
    %5710 = torch.aten.add.Scalar %result0_7168, %float9.999990e-07_7170, %int1_7171 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %5711 = torch.aten.rsqrt %5710 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // The subtraction takes the f16 tensor %5706 and the f32 mean; the result is f32.
    %int1_7172 = torch.constant.int 1
    %5712 = torch.aten.sub.Tensor %5706, %result1_7169, %int1_7172 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %5713 = torch.aten.mul.Tensor %5712, %5711 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    %int5_7173 = torch.constant.int 5
    %5714 = torch.prims.convert_element_type %5713, %int5_7173 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    %5715 = torch.aten.mul.Tensor %5707, %5714 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_7174 = torch.constant.int 1
    %5716 = torch.aten.add.Tensor %5715, %5517, %int1_7174 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Two-layer MLP double_blocks.17.img_mlp on %5716, plus a scaled residual add:
    //   h     = gelu(f16(f32(x) @ f32(W0^T) + f32(b0)), "tanh")   3072 -> 12288
    //   y     = f16(f32(h) @ f32(W2^T) + f32(b2))                 12288 -> 3072
    //   %5749 = %5706 + %5519 * y
    // Each matmul and bias add runs in f32; results are cast back to f16.
    // %5519 ([1,1,3072]) is defined before this chunk (presumably a modulation
    // gate -- confirm upstream).

    // --- img_mlp.0: [4096,3072] x [3072,12288] ---
    %int4096_7175 = torch.constant.int 4096
    %int3072_7176 = torch.constant.int 3072
    %5717 = torch.prim.ListConstruct %int4096_7175, %int3072_7176 : (!torch.int, !torch.int) -> !torch.list<int>
    %5718 = torch.aten.view %5716, %5717 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    %__auto.sampler.double_blocks.17.img_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.17.img_mlp.0.weight : tensor<12288x3072xf16>
    %5719 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.img_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_7177 = torch.constant.int 0
    %int1_7178 = torch.constant.int 1
    %5720 = torch.aten.transpose.int %5719, %int0_7177, %int1_7178 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.17.img_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.17.img_mlp.0.bias : tensor<12288xf16>
    %5721 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.img_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    %int6_7179 = torch.constant.int 6
    %5722 = torch.prims.convert_element_type %5721, %int6_7179 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_7180 = torch.constant.int 6
    %5723 = torch.prims.convert_element_type %5718, %int6_7180 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_7181 = torch.constant.int 6
    %5724 = torch.prims.convert_element_type %5720, %int6_7181 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %5725 = torch.aten.mm %5723, %5724 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[4096,12288],f32>
    %int1_7182 = torch.constant.int 1
    %5726 = torch.aten.mul.Scalar %5725, %int1_7182 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int1_7183 = torch.constant.int 1
    %5727 = torch.aten.mul.Scalar %5722, %int1_7183 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_7184 = torch.constant.int 1
    %5728 = torch.aten.add.Tensor %5726, %5727, %int1_7184 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int5_7185 = torch.constant.int 5
    %5729 = torch.prims.convert_element_type %5728, %int5_7185 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f16>
    %int1_7186 = torch.constant.int 1
    %int4096_7187 = torch.constant.int 4096
    %int12288_7188 = torch.constant.int 12288
    %5730 = torch.prim.ListConstruct %int1_7186, %int4096_7187, %int12288_7188 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5731 = torch.aten.view %5729, %5730 : !torch.vtensor<[4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,12288],f16>
    // --- activation: GELU with the "tanh" approximation, in f16 ---
    %str_7189 = torch.constant.str "tanh"
    %5732 = torch.aten.gelu %5731, %str_7189 : !torch.vtensor<[1,4096,12288],f16>, !torch.str -> !torch.vtensor<[1,4096,12288],f16>
    // --- img_mlp.2: [4096,12288] x [12288,3072] ---
    %int4096_7190 = torch.constant.int 4096
    %int12288_7191 = torch.constant.int 12288
    %5733 = torch.prim.ListConstruct %int4096_7190, %int12288_7191 : (!torch.int, !torch.int) -> !torch.list<int>
    %5734 = torch.aten.view %5732, %5733 : !torch.vtensor<[1,4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[4096,12288],f16>
    %__auto.sampler.double_blocks.17.img_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.17.img_mlp.2.weight : tensor<3072x12288xf16>
    %5735 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.img_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_7192 = torch.constant.int 0
    %int1_7193 = torch.constant.int 1
    %5736 = torch.aten.transpose.int %5735, %int0_7192, %int1_7193 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.17.img_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.17.img_mlp.2.bias : tensor<3072xf16>
    %5737 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.img_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_7194 = torch.constant.int 6
    %5738 = torch.prims.convert_element_type %5737, %int6_7194 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_7195 = torch.constant.int 6
    %5739 = torch.prims.convert_element_type %5734, %int6_7195 : !torch.vtensor<[4096,12288],f16>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int6_7196 = torch.constant.int 6
    %5740 = torch.prims.convert_element_type %5736, %int6_7196 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %5741 = torch.aten.mm %5739, %5740 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    %int1_7197 = torch.constant.int 1
    %5742 = torch.aten.mul.Scalar %5741, %int1_7197 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_7198 = torch.constant.int 1
    %5743 = torch.aten.mul.Scalar %5738, %int1_7198 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_7199 = torch.constant.int 1
    %5744 = torch.aten.add.Tensor %5742, %5743, %int1_7199 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_7200 = torch.constant.int 5
    %5745 = torch.prims.convert_element_type %5744, %int5_7200 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    %int1_7201 = torch.constant.int 1
    %int4096_7202 = torch.constant.int 4096
    %int3072_7203 = torch.constant.int 3072
    %5746 = torch.prim.ListConstruct %int1_7201, %int4096_7202, %int3072_7203 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5747 = torch.aten.view %5745, %5746 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // --- scaled residual: %5749 = %5706 + %5519 * mlp_out ---
    %5748 = torch.aten.mul.Tensor %5519, %5747 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_7204 = torch.constant.int 1
    %5749 = torch.aten.add.Tensor %5706, %5748, %int1_7204 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear projection double_blocks.17.txt_attn.proj on the 512-row slice of
    // the attention output, followed by a scaled residual add:
    //   y     = f16(f32(x) @ f32(W^T) + f32(b))     with x = %5687 flattened to [512,3072]
    //   %5766 = %5498 + %5537 * y
    // The matmul and bias add run in f32; the result is cast back to f16.
    // %5537 ([1,1,3072]) and %5498 ([1,512,3072]) are defined before this chunk
    // (presumably a modulation gate and the running txt residual -- confirm).
    %int512_7205 = torch.constant.int 512
    %int3072_7206 = torch.constant.int 3072
    %5750 = torch.prim.ListConstruct %int512_7205, %int3072_7206 : (!torch.int, !torch.int) -> !torch.list<int>
    %5751 = torch.aten.view %5687, %5750 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.17.txt_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.17.txt_attn.proj.weight : tensor<3072x3072xf16>
    %5752 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.txt_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_7207 = torch.constant.int 0
    %int1_7208 = torch.constant.int 1
    %5753 = torch.aten.transpose.int %5752, %int0_7207, %int1_7208 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.17.txt_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.17.txt_attn.proj.bias : tensor<3072xf16>
    %5754 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.txt_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, activations and transposed weight to f32 for the matmul.
    %int6_7209 = torch.constant.int 6
    %5755 = torch.prims.convert_element_type %5754, %int6_7209 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_7210 = torch.constant.int 6
    %5756 = torch.prims.convert_element_type %5751, %int6_7210 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_7211 = torch.constant.int 6
    %5757 = torch.prims.convert_element_type %5753, %int6_7211 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %5758 = torch.aten.mm %5756, %5757 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[512,3072],f32>
    %int1_7212 = torch.constant.int 1
    %5759 = torch.aten.mul.Scalar %5758, %int1_7212 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_7213 = torch.constant.int 1
    %5760 = torch.aten.mul.Scalar %5755, %int1_7213 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_7214 = torch.constant.int 1
    %5761 = torch.aten.add.Tensor %5759, %5760, %int1_7214 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_7215 = torch.constant.int 5
    %5762 = torch.prims.convert_element_type %5761, %int5_7215 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    // Restore the leading batch axis, then apply the multiplier and residual add.
    %int1_7216 = torch.constant.int 1
    %int512_7217 = torch.constant.int 512
    %int3072_7218 = torch.constant.int 3072
    %5763 = torch.prim.ListConstruct %int1_7216, %int512_7217, %int3072_7218 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5764 = torch.aten.view %5762, %5763 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    %5765 = torch.aten.mul.Tensor %5537, %5764 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_7219 = torch.constant.int 1
    %5766 = torch.aten.add.Tensor %5498, %5765, %int1_7219 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Normalize %5766 over its last axis (dim 2) and apply a scale and shift:
    //   var, mean = var_mean(f32(%5766), dim=[2], correction=0, keepdim=true)
    //   n         = f16((%5766 - mean) * rsqrt(var + ~1e-6))
    //   %5776     = (%5539 + 1) * n + %5538
    // No learned weight or bias is loaded for this normalization here.
    // %5539 and %5538 ([1,1,3072]) are defined before this chunk (presumably
    // modulation scale and shift -- confirm upstream).
    %int1_7220 = torch.constant.int 1
    %int1_7221 = torch.constant.int 1
    %5767 = torch.aten.add.Scalar %5539, %int1_7220, %int1_7221 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int6_7222 = torch.constant.int 6
    %5768 = torch.prims.convert_element_type %5766, %int6_7222 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %int2_7223 = torch.constant.int 2
    %5769 = torch.prim.ListConstruct %int2_7223 : (!torch.int) -> !torch.list<int>
    %int0_7224 = torch.constant.int 0
    %true_7225 = torch.constant.bool true
    // result0 is the variance and result1 the mean, both [1,512,1] in f32.
    %result0_7226, %result1_7227 = torch.aten.var_mean.correction %5768, %5769, %int0_7224, %true_7225 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    %float9.999990e-07_7228 = torch.constant.float 9.9999999999999995E-7
    %int1_7229 = torch.constant.int 1
    %5770 = torch.aten.add.Scalar %result0_7226, %float9.999990e-07_7228, %int1_7229 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %5771 = torch.aten.rsqrt %5770 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // The subtraction takes the f16 tensor %5766 and the f32 mean; the result is f32.
    %int1_7230 = torch.constant.int 1
    %5772 = torch.aten.sub.Tensor %5766, %result1_7227, %int1_7230 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %5773 = torch.aten.mul.Tensor %5772, %5771 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_7231 = torch.constant.int 5
    %5774 = torch.prims.convert_element_type %5773, %int5_7231 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    %5775 = torch.aten.mul.Tensor %5767, %5774 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_7232 = torch.constant.int 1
    %5776 = torch.aten.add.Tensor %5775, %5538, %int1_7232 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Two-layer MLP double_blocks.17.txt_mlp on %5776, plus a scaled residual add:
    //   h     = gelu(f16(f32(x) @ f32(W0^T) + f32(b0)), "tanh")   3072 -> 12288
    //   y     = f16(f32(h) @ f32(W2^T) + f32(b2))                 12288 -> 3072
    //   %5809 = %5766 + %5540 * y
    // Each matmul and bias add runs in f32; results are cast back to f16.
    // %5540 ([1,1,3072]) is defined before this chunk (presumably a modulation
    // gate -- confirm upstream).

    // --- txt_mlp.0: [512,3072] x [3072,12288] ---
    %int512_7233 = torch.constant.int 512
    %int3072_7234 = torch.constant.int 3072
    %5777 = torch.prim.ListConstruct %int512_7233, %int3072_7234 : (!torch.int, !torch.int) -> !torch.list<int>
    %5778 = torch.aten.view %5776, %5777 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    %__auto.sampler.double_blocks.17.txt_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.17.txt_mlp.0.weight : tensor<12288x3072xf16>
    %5779 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.txt_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_7235 = torch.constant.int 0
    %int1_7236 = torch.constant.int 1
    %5780 = torch.aten.transpose.int %5779, %int0_7235, %int1_7236 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.17.txt_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.17.txt_mlp.0.bias : tensor<12288xf16>
    %5781 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.txt_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    %int6_7237 = torch.constant.int 6
    %5782 = torch.prims.convert_element_type %5781, %int6_7237 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_7238 = torch.constant.int 6
    %5783 = torch.prims.convert_element_type %5778, %int6_7238 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_7239 = torch.constant.int 6
    %5784 = torch.prims.convert_element_type %5780, %int6_7239 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %5785 = torch.aten.mm %5783, %5784 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[512,12288],f32>
    %int1_7240 = torch.constant.int 1
    %5786 = torch.aten.mul.Scalar %5785, %int1_7240 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int1_7241 = torch.constant.int 1
    %5787 = torch.aten.mul.Scalar %5782, %int1_7241 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    %int1_7242 = torch.constant.int 1
    %5788 = torch.aten.add.Tensor %5786, %5787, %int1_7242 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int5_7243 = torch.constant.int 5
    %5789 = torch.prims.convert_element_type %5788, %int5_7243 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f16>
    %int1_7244 = torch.constant.int 1
    %int512_7245 = torch.constant.int 512
    %int12288_7246 = torch.constant.int 12288
    %5790 = torch.prim.ListConstruct %int1_7244, %int512_7245, %int12288_7246 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5791 = torch.aten.view %5789, %5790 : !torch.vtensor<[512,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,512,12288],f16>
    // --- activation: GELU with the "tanh" approximation, in f16 ---
    %str_7247 = torch.constant.str "tanh"
    %5792 = torch.aten.gelu %5791, %str_7247 : !torch.vtensor<[1,512,12288],f16>, !torch.str -> !torch.vtensor<[1,512,12288],f16>
    // --- txt_mlp.2: [512,12288] x [12288,3072] ---
    %int512_7248 = torch.constant.int 512
    %int12288_7249 = torch.constant.int 12288
    %5793 = torch.prim.ListConstruct %int512_7248, %int12288_7249 : (!torch.int, !torch.int) -> !torch.list<int>
    %5794 = torch.aten.view %5792, %5793 : !torch.vtensor<[1,512,12288],f16>, !torch.list<int> -> !torch.vtensor<[512,12288],f16>
    %__auto.sampler.double_blocks.17.txt_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.17.txt_mlp.2.weight : tensor<3072x12288xf16>
    %5795 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.txt_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_7250 = torch.constant.int 0
    %int1_7251 = torch.constant.int 1
    %5796 = torch.aten.transpose.int %5795, %int0_7250, %int1_7251 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.17.txt_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.17.txt_mlp.2.bias : tensor<3072xf16>
    %5797 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.17.txt_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_7252 = torch.constant.int 6
    %5798 = torch.prims.convert_element_type %5797, %int6_7252 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_7253 = torch.constant.int 6
    %5799 = torch.prims.convert_element_type %5794, %int6_7253 : !torch.vtensor<[512,12288],f16>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int6_7254 = torch.constant.int 6
    %5800 = torch.prims.convert_element_type %5796, %int6_7254 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %5801 = torch.aten.mm %5799, %5800 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[512,3072],f32>
    %int1_7255 = torch.constant.int 1
    %5802 = torch.aten.mul.Scalar %5801, %int1_7255 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_7256 = torch.constant.int 1
    %5803 = torch.aten.mul.Scalar %5798, %int1_7256 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_7257 = torch.constant.int 1
    %5804 = torch.aten.add.Tensor %5802, %5803, %int1_7257 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_7258 = torch.constant.int 5
    %5805 = torch.prims.convert_element_type %5804, %int5_7258 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_7259 = torch.constant.int 1
    %int512_7260 = torch.constant.int 512
    %int3072_7261 = torch.constant.int 3072
    %5806 = torch.prim.ListConstruct %int1_7259, %int512_7260, %int3072_7261 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5807 = torch.aten.view %5805, %5806 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // --- scaled residual: %5809 = %5766 + %5540 * mlp_out ---
    %5808 = torch.aten.mul.Tensor %5540, %5807 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_7262 = torch.constant.int 1
    %5809 = torch.aten.add.Tensor %5766, %5808, %int1_7262 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // double_blocks.18 img_mod: SiLU of the [1,3072] tensor %119, a linear
    // projection 3072 -> 18432, then a split of the result into six [1,1,3072]
    // chunks (%5825 .. %5830).
    // NOTE(review): %119 is defined earlier in the function; presumably the
    // conditioning vector shared by all blocks - confirm.
    %5810 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the [18432,3072] weight and transpose it to [3072,18432] for the matmul.
    %__auto.sampler.double_blocks.18.img_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.18.img_mod.lin.weight : tensor<18432x3072xf16>
    %5811 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.img_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_7263 = torch.constant.int 0
    %int1_7264 = torch.constant.int 1
    %5812 = torch.aten.transpose.int %5811, %int0_7263, %int1_7264 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.18.img_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.18.img_mod.lin.bias : tensor<18432xf16>
    %5813 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.img_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Upcast bias, input and weight to f32 (dtype code 6); matmul and bias add
    // run in f32.
    %int6_7265 = torch.constant.int 6
    %5814 = torch.prims.convert_element_type %5813, %int6_7265 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_7266 = torch.constant.int 6
    %5815 = torch.prims.convert_element_type %5810, %int6_7266 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_7267 = torch.constant.int 6
    %5816 = torch.prims.convert_element_type %5812, %int6_7267 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %5817 = torch.aten.mm %5815, %5816 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Multiplications by the integer 1: values are unchanged.
    %int1_7268 = torch.constant.int 1
    %5818 = torch.aten.mul.Scalar %5817, %int1_7268 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_7269 = torch.constant.int 1
    %5819 = torch.aten.mul.Scalar %5814, %int1_7269 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_7270 = torch.constant.int 1
    %5820 = torch.aten.add.Tensor %5818, %5819, %int1_7270 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    // Cast the projection back to f16 (dtype code 5).
    %int5_7271 = torch.constant.int 5
    %5821 = torch.prims.convert_element_type %5820, %int5_7271 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): the shape is
    // unchanged. The unsqueeze then inserts a size-1 dim at position 1, and the
    // second full-range slice on dim 2 again keeps everything.
    %int0_7272 = torch.constant.int 0
    %int0_7273 = torch.constant.int 0
    %int9223372036854775807_7274 = torch.constant.int 9223372036854775807
    %int1_7275 = torch.constant.int 1
    %5822 = torch.aten.slice.Tensor %5821, %int0_7272, %int0_7273, %int9223372036854775807_7274, %int1_7275 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    %int1_7276 = torch.constant.int 1
    %5823 = torch.aten.unsqueeze %5822, %int1_7276 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    %int2_7277 = torch.constant.int 2
    %int0_7278 = torch.constant.int 0
    %int9223372036854775807_7279 = torch.constant.int 9223372036854775807
    %int1_7280 = torch.constant.int 1
    %5824 = torch.aten.slice.Tensor %5823, %int2_7277, %int0_7278, %int9223372036854775807_7279, %int1_7280 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Six consecutive 3072-wide slices along the last dim (18432 = 6 * 3072).
    // Chunk 0, [0, 3072): added after the image-stream normalization further down.
    %int-1_7281 = torch.constant.int -1
    %int0_7282 = torch.constant.int 0
    %int3072_7283 = torch.constant.int 3072
    %int1_7284 = torch.constant.int 1
    %5825 = torch.aten.slice.Tensor %5824, %int-1_7281, %int0_7282, %int3072_7283, %int1_7284 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1, [3072, 6144): used as (1 + chunk) multiplier on the normalized
    // image stream further down.
    %int-1_7285 = torch.constant.int -1
    %int3072_7286 = torch.constant.int 3072
    %int6144_7287 = torch.constant.int 6144
    %int1_7288 = torch.constant.int 1
    %5826 = torch.aten.slice.Tensor %5824, %int-1_7285, %int3072_7286, %int6144_7287, %int1_7288 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunks 2-5. NOTE(review): presumably the attention gate followed by the
    // MLP shift/scale/gate - confirm against their uses later in the function.
    // Chunk 2, [6144, 9216).
    %int-1_7289 = torch.constant.int -1
    %int6144_7290 = torch.constant.int 6144
    %int9216_7291 = torch.constant.int 9216
    %int1_7292 = torch.constant.int 1
    %5827 = torch.aten.slice.Tensor %5824, %int-1_7289, %int6144_7290, %int9216_7291, %int1_7292 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 3, [9216, 12288).
    %int-1_7293 = torch.constant.int -1
    %int9216_7294 = torch.constant.int 9216
    %int12288_7295 = torch.constant.int 12288
    %int1_7296 = torch.constant.int 1
    %5828 = torch.aten.slice.Tensor %5824, %int-1_7293, %int9216_7294, %int12288_7295, %int1_7296 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 4, [12288, 15360).
    %int-1_7297 = torch.constant.int -1
    %int12288_7298 = torch.constant.int 12288
    %int15360_7299 = torch.constant.int 15360
    %int1_7300 = torch.constant.int 1
    %5829 = torch.aten.slice.Tensor %5824, %int-1_7297, %int12288_7298, %int15360_7299, %int1_7300 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 5, [15360, 18432).
    %int-1_7301 = torch.constant.int -1
    %int15360_7302 = torch.constant.int 15360
    %int18432_7303 = torch.constant.int 18432
    %int1_7304 = torch.constant.int 1
    %5830 = torch.aten.slice.Tensor %5824, %int-1_7301, %int15360_7302, %int18432_7303, %int1_7304 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // double_blocks.18 txt_mod: same structure as the img_mod projection, using
    // the txt_mod weights. SiLU of %119, linear 3072 -> 18432, then a split into
    // six [1,1,3072] chunks (%5846 .. %5851).
    // The SiLU of %119 is recomputed here rather than reusing an earlier result.
    %5831 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the [18432,3072] weight and transpose it to [3072,18432] for the matmul.
    %__auto.sampler.double_blocks.18.txt_mod.lin.weight = util.global.load @__auto.sampler.double_blocks.18.txt_mod.lin.weight : tensor<18432x3072xf16>
    %5832 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.txt_mod.lin.weight : tensor<18432x3072xf16> -> !torch.vtensor<[18432,3072],f16>
    %int0_7305 = torch.constant.int 0
    %int1_7306 = torch.constant.int 1
    %5833 = torch.aten.transpose.int %5832, %int0_7305, %int1_7306 : !torch.vtensor<[18432,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,18432],f16>
    %__auto.sampler.double_blocks.18.txt_mod.lin.bias = util.global.load @__auto.sampler.double_blocks.18.txt_mod.lin.bias : tensor<18432xf16>
    %5834 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.txt_mod.lin.bias : tensor<18432xf16> -> !torch.vtensor<[18432],f16>
    // Upcast bias, input and weight to f32 (dtype code 6); matmul and bias add
    // run in f32.
    %int6_7307 = torch.constant.int 6
    %5835 = torch.prims.convert_element_type %5834, %int6_7307 : !torch.vtensor<[18432],f16>, !torch.int -> !torch.vtensor<[18432],f32>
    %int6_7308 = torch.constant.int 6
    %5836 = torch.prims.convert_element_type %5831, %int6_7308 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_7309 = torch.constant.int 6
    %5837 = torch.prims.convert_element_type %5833, %int6_7309 : !torch.vtensor<[3072,18432],f16>, !torch.int -> !torch.vtensor<[3072,18432],f32>
    %5838 = torch.aten.mm %5836, %5837 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,18432],f32> -> !torch.vtensor<[1,18432],f32>
    // Multiplications by the integer 1: values are unchanged.
    %int1_7310 = torch.constant.int 1
    %5839 = torch.aten.mul.Scalar %5838, %int1_7310 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    %int1_7311 = torch.constant.int 1
    %5840 = torch.aten.mul.Scalar %5835, %int1_7311 : !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[18432],f32>
    %int1_7312 = torch.constant.int 1
    %5841 = torch.aten.add.Tensor %5839, %5840, %int1_7312 : !torch.vtensor<[1,18432],f32>, !torch.vtensor<[18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f32>
    // Cast the projection back to f16 (dtype code 5).
    %int5_7313 = torch.constant.int 5
    %5842 = torch.prims.convert_element_type %5841, %int5_7313 : !torch.vtensor<[1,18432],f32>, !torch.int -> !torch.vtensor<[1,18432],f16>
    // Full-range slice on dim 0 (shape unchanged), unsqueeze at dim 1, then a
    // full-range slice on dim 2 (shape unchanged): result is [1,1,18432].
    %int0_7314 = torch.constant.int 0
    %int0_7315 = torch.constant.int 0
    %int9223372036854775807_7316 = torch.constant.int 9223372036854775807
    %int1_7317 = torch.constant.int 1
    %5843 = torch.aten.slice.Tensor %5842, %int0_7314, %int0_7315, %int9223372036854775807_7316, %int1_7317 : !torch.vtensor<[1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,18432],f16>
    %int1_7318 = torch.constant.int 1
    %5844 = torch.aten.unsqueeze %5843, %int1_7318 : !torch.vtensor<[1,18432],f16>, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    %int2_7319 = torch.constant.int 2
    %int0_7320 = torch.constant.int 0
    %int9223372036854775807_7321 = torch.constant.int 9223372036854775807
    %int1_7322 = torch.constant.int 1
    %5845 = torch.aten.slice.Tensor %5844, %int2_7319, %int0_7320, %int9223372036854775807_7321, %int1_7322 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,18432],f16>
    // Six consecutive 3072-wide slices along the last dim (18432 = 6 * 3072).
    // Chunk 0, [0, 3072): added after the text-stream normalization further down.
    %int-1_7323 = torch.constant.int -1
    %int0_7324 = torch.constant.int 0
    %int3072_7325 = torch.constant.int 3072
    %int1_7326 = torch.constant.int 1
    %5846 = torch.aten.slice.Tensor %5845, %int-1_7323, %int0_7324, %int3072_7325, %int1_7326 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1, [3072, 6144): used as (1 + chunk) multiplier on the normalized
    // text stream further down.
    %int-1_7327 = torch.constant.int -1
    %int3072_7328 = torch.constant.int 3072
    %int6144_7329 = torch.constant.int 6144
    %int1_7330 = torch.constant.int 1
    %5847 = torch.aten.slice.Tensor %5845, %int-1_7327, %int3072_7328, %int6144_7329, %int1_7330 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunks 2-5. NOTE(review): presumably the attention gate followed by the
    // MLP shift/scale/gate - confirm against their uses later in the function.
    // Chunk 2, [6144, 9216).
    %int-1_7331 = torch.constant.int -1
    %int6144_7332 = torch.constant.int 6144
    %int9216_7333 = torch.constant.int 9216
    %int1_7334 = torch.constant.int 1
    %5848 = torch.aten.slice.Tensor %5845, %int-1_7331, %int6144_7332, %int9216_7333, %int1_7334 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 3, [9216, 12288).
    %int-1_7335 = torch.constant.int -1
    %int9216_7336 = torch.constant.int 9216
    %int12288_7337 = torch.constant.int 12288
    %int1_7338 = torch.constant.int 1
    %5849 = torch.aten.slice.Tensor %5845, %int-1_7335, %int9216_7336, %int12288_7337, %int1_7338 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 4, [12288, 15360).
    %int-1_7339 = torch.constant.int -1
    %int12288_7340 = torch.constant.int 12288
    %int15360_7341 = torch.constant.int 15360
    %int1_7342 = torch.constant.int 1
    %5850 = torch.aten.slice.Tensor %5845, %int-1_7339, %int12288_7340, %int15360_7341, %int1_7342 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 5, [15360, 18432).
    %int-1_7343 = torch.constant.int -1
    %int15360_7344 = torch.constant.int 15360
    %int18432_7345 = torch.constant.int 18432
    %int1_7346 = torch.constant.int 1
    %5851 = torch.aten.slice.Tensor %5845, %int-1_7343, %int15360_7344, %int18432_7345, %int1_7346 : !torch.vtensor<[1,1,18432],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Image stream, pre-attention: normalize %5749 over its last dim as
    // (x - mean) * rsqrt(var + eps) with no learned scale or bias, then compute
    // (1 + %5826) * normalized + %5825 and flatten to [4096,3072].
    // NOTE(review): %5749 is defined earlier in the function; presumably the
    // image hidden state entering double_blocks.18 - confirm.
    // Statistics are computed on an f32 copy of the f16 input (dtype code 6).
    %int6_7347 = torch.constant.int 6
    %5852 = torch.prims.convert_element_type %5749, %int6_7347 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // var_mean over dim 2 with correction 0 and keepdim = true:
    // %result0_7351 is the variance, %result1_7352 the mean, both [1,4096,1].
    %int2_7348 = torch.constant.int 2
    %5853 = torch.prim.ListConstruct %int2_7348 : (!torch.int) -> !torch.list<int>
    %int0_7349 = torch.constant.int 0
    %true_7350 = torch.constant.bool true
    %result0_7351, %result1_7352 = torch.aten.var_mean.correction %5852, %5853, %int0_7349, %true_7350 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // rsqrt(var + eps), eps = 9.9999999999999995E-7.
    %float9.999990e-07_7353 = torch.constant.float 9.9999999999999995E-7
    %int1_7354 = torch.constant.int 1
    %5854 = torch.aten.add.Scalar %result0_7351, %float9.999990e-07_7353, %int1_7354 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %5855 = torch.aten.rsqrt %5854 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // The subtraction takes the original f16 tensor %5749 (not the f32 copy) and
    // the f32 mean; the result type is f32.
    %int1_7355 = torch.constant.int 1
    %5856 = torch.aten.sub.Tensor %5749, %result1_7352, %int1_7355 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %5857 = torch.aten.mul.Tensor %5856, %5855 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    // Cast the normalized tensor back to f16 (dtype code 5).
    %int5_7356 = torch.constant.int 5
    %5858 = torch.prims.convert_element_type %5857, %int5_7356 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Modulation in f16: (1 + %5826) * normalized + %5825, where %5826 and %5825
    // are [1,1,3072] chunks of the img_mod projection, broadcast over the 4096 rows.
    %int1_7357 = torch.constant.int 1
    %int1_7358 = torch.constant.int 1
    %5859 = torch.aten.add.Scalar %5826, %int1_7357, %int1_7358 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %5860 = torch.aten.mul.Tensor %5859, %5858 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_7359 = torch.constant.int 1
    %5861 = torch.aten.add.Tensor %5860, %5825, %int1_7359 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Drop the leading size-1 dim so the result can feed a 2-D matmul.
    %int4096_7360 = torch.constant.int 4096
    %int3072_7361 = torch.constant.int 3072
    %5862 = torch.prim.ListConstruct %int4096_7360, %int3072_7361 : (!torch.int, !torch.int) -> !torch.list<int>
    %5863 = torch.aten.view %5861, %5862 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // double_blocks.18 img_attn.qkv: linear projection 3072 -> 9216 of the
    // modulated image tokens %5863, reshaped to [3,1,24,4096,128] and split along
    // dim 0 into three [1,24,4096,128] tensors (%5881, %5882, %5883).
    // Load the [9216,3072] weight and transpose it to [3072,9216] for the matmul.
    %__auto.sampler.double_blocks.18.img_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.18.img_attn.qkv.weight : tensor<9216x3072xf16>
    %5864 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.img_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_7362 = torch.constant.int 0
    %int1_7363 = torch.constant.int 1
    %5865 = torch.aten.transpose.int %5864, %int0_7362, %int1_7363 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.18.img_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.18.img_attn.qkv.bias : tensor<9216xf16>
    %5866 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.img_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, activations and weight to f32 (dtype code 6); matmul and bias
    // add run in f32.
    %int6_7364 = torch.constant.int 6
    %5867 = torch.prims.convert_element_type %5866, %int6_7364 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_7365 = torch.constant.int 6
    %5868 = torch.prims.convert_element_type %5863, %int6_7365 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_7366 = torch.constant.int 6
    %5869 = torch.prims.convert_element_type %5865, %int6_7366 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %5870 = torch.aten.mm %5868, %5869 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[4096,9216],f32>
    // Multiplications by the integer 1: values are unchanged.
    %int1_7367 = torch.constant.int 1
    %5871 = torch.aten.mul.Scalar %5870, %int1_7367 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    %int1_7368 = torch.constant.int 1
    %5872 = torch.aten.mul.Scalar %5867, %int1_7368 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_7369 = torch.constant.int 1
    %5873 = torch.aten.add.Tensor %5871, %5872, %int1_7369 : !torch.vtensor<[4096,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f32>
    // Cast back to f16 (dtype code 5) and restore the leading size-1 dim.
    %int5_7370 = torch.constant.int 5
    %5874 = torch.prims.convert_element_type %5873, %int5_7370 : !torch.vtensor<[4096,9216],f32>, !torch.int -> !torch.vtensor<[4096,9216],f16>
    %int1_7371 = torch.constant.int 1
    %int4096_7372 = torch.constant.int 4096
    %int9216_7373 = torch.constant.int 9216
    %5875 = torch.prim.ListConstruct %int1_7371, %int4096_7372, %int9216_7373 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5876 = torch.aten.view %5874, %5875 : !torch.vtensor<[4096,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,9216],f16>
    // Split the 9216 features as 3 x 24 x 128.
    // NOTE(review): presumably (q/k/v, heads, head_dim) - confirm.
    %int1_7374 = torch.constant.int 1
    %int4096_7375 = torch.constant.int 4096
    %int3_7376 = torch.constant.int 3
    %int24_7377 = torch.constant.int 24
    %int128_7378 = torch.constant.int 128
    %5877 = torch.prim.ListConstruct %int1_7374, %int4096_7375, %int3_7376, %int24_7377, %int128_7378 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5878 = torch.aten.view %5876, %5877 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3,24,128],f16>
    // Permute with order (2,0,3,1,4): [1,4096,3,24,128] -> [3,1,24,4096,128].
    %int2_7379 = torch.constant.int 2
    %int0_7380 = torch.constant.int 0
    %int3_7381 = torch.constant.int 3
    %int1_7382 = torch.constant.int 1
    %int4_7383 = torch.constant.int 4
    %5879 = torch.prim.ListConstruct %int2_7379, %int0_7380, %int3_7381, %int1_7382, %int4_7383 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5880 = torch.aten.permute %5878, %5879 : !torch.vtensor<[1,4096,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4096,128],f16>
    // Index 0 along dim 0: later scaled by img_attn.norm.query_norm.scale.
    %int0_7384 = torch.constant.int 0
    %int0_7385 = torch.constant.int 0
    %5881 = torch.aten.select.int %5880, %int0_7384, %int0_7385 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Index 1 along dim 0: later scaled by img_attn.norm.key_norm.scale.
    %int0_7386 = torch.constant.int 0
    %int1_7387 = torch.constant.int 1
    %5882 = torch.aten.select.int %5880, %int0_7386, %int1_7387 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Index 2 along dim 0. NOTE(review): presumably the attention value tensor - confirm.
    %int0_7388 = torch.constant.int 0
    %int2_7389 = torch.constant.int 2
    %5883 = torch.aten.select.int %5880, %int0_7388, %int2_7389 : !torch.vtensor<[3,1,24,4096,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // double_blocks.18 img_attn.norm: RMS-style normalization of %5881 and %5882
    // over the last (128-wide) dim, x * rsqrt(mean(x^2) + eps), each followed by
    // a multiply with a learned [128] scale.
    // Query path: %5881 upcast to f32 (dtype code 6).
    %int6_7390 = torch.constant.int 6
    %5884 = torch.prims.convert_element_type %5881, %int6_7390 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    // mean(x^2) over the last dim with keepdim = true -> [1,24,4096,1].
    %int2_7391 = torch.constant.int 2
    %5885 = torch.aten.pow.Tensor_Scalar %5884, %int2_7391 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_7392 = torch.constant.int -1
    %5886 = torch.prim.ListConstruct %int-1_7392 : (!torch.int) -> !torch.list<int>
    %true_7393 = torch.constant.bool true
    %none_7394 = torch.constant.none
    %5887 = torch.aten.mean.dim %5885, %5886, %true_7393, %none_7394 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    // rsqrt(mean + eps), eps = 9.9999999999999995E-7, then scale x by it.
    %float9.999990e-07_7395 = torch.constant.float 9.9999999999999995E-7
    %int1_7396 = torch.constant.int 1
    %5888 = torch.aten.add.Scalar %5887, %float9.999990e-07_7395, %int1_7396 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %5889 = torch.aten.rsqrt %5888 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %5890 = torch.aten.mul.Tensor %5884, %5889 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    // Cast back to f16 (dtype code 5) and apply the query_norm scale in f16.
    %int5_7397 = torch.constant.int 5
    %5891 = torch.prims.convert_element_type %5890, %int5_7397 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.18.img_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.18.img_attn.norm.query_norm.scale : tensor<128xf16>
    %5892 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.img_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %5893 = torch.aten.mul.Tensor %5891, %5892 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // Key path: the same sequence applied to %5882 with the key_norm scale.
    %int6_7398 = torch.constant.int 6
    %5894 = torch.prims.convert_element_type %5882, %int6_7398 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int2_7399 = torch.constant.int 2
    %5895 = torch.aten.pow.Tensor_Scalar %5894, %int2_7399 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f32>
    %int-1_7400 = torch.constant.int -1
    %5896 = torch.prim.ListConstruct %int-1_7400 : (!torch.int) -> !torch.list<int>
    %true_7401 = torch.constant.bool true
    %none_7402 = torch.constant.none
    %5897 = torch.aten.mean.dim %5895, %5896, %true_7401, %none_7402 : !torch.vtensor<[1,24,4096,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4096,1],f32>
    %float9.999990e-07_7403 = torch.constant.float 9.9999999999999995E-7
    %int1_7404 = torch.constant.int 1
    %5898 = torch.aten.add.Scalar %5897, %float9.999990e-07_7403, %int1_7404 : !torch.vtensor<[1,24,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4096,1],f32>
    %5899 = torch.aten.rsqrt %5898 : !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,1],f32>
    %5900 = torch.aten.mul.Tensor %5894, %5899 : !torch.vtensor<[1,24,4096,128],f32>, !torch.vtensor<[1,24,4096,1],f32> -> !torch.vtensor<[1,24,4096,128],f32>
    %int5_7405 = torch.constant.int 5
    %5901 = torch.prims.convert_element_type %5900, %int5_7405 : !torch.vtensor<[1,24,4096,128],f32>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %__auto.sampler.double_blocks.18.img_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.18.img_attn.norm.key_norm.scale : tensor<128xf16>
    %5902 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.img_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %5903 = torch.aten.mul.Tensor %5901, %5902 : !torch.vtensor<[1,24,4096,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4096,128],f16>
    // f16 -> f16 conversions (dtype code 5 on tensors that are already f16): the
    // element type does not change.
    // NOTE(review): presumably a `.to(v)`-style cast in the source model that the
    // exporter kept - confirm before removing.
    %int5_7406 = torch.constant.int 5
    %5904 = torch.prims.convert_element_type %5893, %int5_7406 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    %int5_7407 = torch.constant.int 5
    %5905 = torch.prims.convert_element_type %5903, %int5_7407 : !torch.vtensor<[1,24,4096,128],f16>, !torch.int -> !torch.vtensor<[1,24,4096,128],f16>
    // Text stream, pre-attention: normalize %5809 over its last dim as
    // (x - mean) * rsqrt(var + eps) with no learned scale or bias, then compute
    // (1 + %5847) * normalized + %5846 and flatten to [512,3072].
    // Statistics are computed on an f32 copy of the f16 input (dtype code 6).
    %int6_7408 = torch.constant.int 6
    %5906 = torch.prims.convert_element_type %5809, %int6_7408 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    // var_mean over dim 2 with correction 0 and keepdim = true:
    // %result0_7412 is the variance, %result1_7413 the mean, both [1,512,1].
    %int2_7409 = torch.constant.int 2
    %5907 = torch.prim.ListConstruct %int2_7409 : (!torch.int) -> !torch.list<int>
    %int0_7410 = torch.constant.int 0
    %true_7411 = torch.constant.bool true
    %result0_7412, %result1_7413 = torch.aten.var_mean.correction %5906, %5907, %int0_7410, %true_7411 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    // rsqrt(var + eps), eps = 9.9999999999999995E-7.
    %float9.999990e-07_7414 = torch.constant.float 9.9999999999999995E-7
    %int1_7415 = torch.constant.int 1
    %5908 = torch.aten.add.Scalar %result0_7412, %float9.999990e-07_7414, %int1_7415 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %5909 = torch.aten.rsqrt %5908 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // The subtraction takes the original f16 tensor %5809 (not the f32 copy) and
    // the f32 mean; the result type is f32.
    %int1_7416 = torch.constant.int 1
    %5910 = torch.aten.sub.Tensor %5809, %result1_7413, %int1_7416 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %5911 = torch.aten.mul.Tensor %5910, %5909 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    // Cast the normalized tensor back to f16 (dtype code 5).
    %int5_7417 = torch.constant.int 5
    %5912 = torch.prims.convert_element_type %5911, %int5_7417 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Modulation in f16: (1 + %5847) * normalized + %5846, where %5847 and %5846
    // are [1,1,3072] chunks of the txt_mod projection, broadcast over the 512 rows.
    %int1_7418 = torch.constant.int 1
    %int1_7419 = torch.constant.int 1
    %5913 = torch.aten.add.Scalar %5847, %int1_7418, %int1_7419 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %5914 = torch.aten.mul.Tensor %5913, %5912 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_7420 = torch.constant.int 1
    %5915 = torch.aten.add.Tensor %5914, %5846, %int1_7420 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Drop the leading size-1 dim so the result can feed a 2-D matmul.
    %int512_7421 = torch.constant.int 512
    %int3072_7422 = torch.constant.int 3072
    %5916 = torch.prim.ListConstruct %int512_7421, %int3072_7422 : (!torch.int, !torch.int) -> !torch.list<int>
    %5917 = torch.aten.view %5915, %5916 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // double_blocks.18 txt_attn.qkv: linear projection 3072 -> 9216 of the
    // modulated text tokens %5917, reshaped to [3,1,24,512,128] and split along
    // dim 0 into three [1,24,512,128] tensors (%5935, %5936, %5937).
    // Load the [9216,3072] weight and transpose it to [3072,9216] for the matmul.
    %__auto.sampler.double_blocks.18.txt_attn.qkv.weight = util.global.load @__auto.sampler.double_blocks.18.txt_attn.qkv.weight : tensor<9216x3072xf16>
    %5918 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.txt_attn.qkv.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_7423 = torch.constant.int 0
    %int1_7424 = torch.constant.int 1
    %5919 = torch.aten.transpose.int %5918, %int0_7423, %int1_7424 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.double_blocks.18.txt_attn.qkv.bias = util.global.load @__auto.sampler.double_blocks.18.txt_attn.qkv.bias : tensor<9216xf16>
    %5920 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.txt_attn.qkv.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, activations and weight to f32 (dtype code 6); matmul and bias
    // add run in f32.
    %int6_7425 = torch.constant.int 6
    %5921 = torch.prims.convert_element_type %5920, %int6_7425 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_7426 = torch.constant.int 6
    %5922 = torch.prims.convert_element_type %5917, %int6_7426 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_7427 = torch.constant.int 6
    %5923 = torch.prims.convert_element_type %5919, %int6_7427 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %5924 = torch.aten.mm %5922, %5923 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[512,9216],f32>
    // Multiplications by the integer 1: values are unchanged.
    %int1_7428 = torch.constant.int 1
    %5925 = torch.aten.mul.Scalar %5924, %int1_7428 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    %int1_7429 = torch.constant.int 1
    %5926 = torch.aten.mul.Scalar %5921, %int1_7429 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_7430 = torch.constant.int 1
    %5927 = torch.aten.add.Tensor %5925, %5926, %int1_7430 : !torch.vtensor<[512,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f32>
    // Cast back to f16 (dtype code 5) and restore the leading size-1 dim.
    %int5_7431 = torch.constant.int 5
    %5928 = torch.prims.convert_element_type %5927, %int5_7431 : !torch.vtensor<[512,9216],f32>, !torch.int -> !torch.vtensor<[512,9216],f16>
    %int1_7432 = torch.constant.int 1
    %int512_7433 = torch.constant.int 512
    %int9216_7434 = torch.constant.int 9216
    %5929 = torch.prim.ListConstruct %int1_7432, %int512_7433, %int9216_7434 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5930 = torch.aten.view %5928, %5929 : !torch.vtensor<[512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,9216],f16>
    // Split the 9216 features as 3 x 24 x 128.
    // NOTE(review): presumably (q/k/v, heads, head_dim) - confirm.
    %int1_7435 = torch.constant.int 1
    %int512_7436 = torch.constant.int 512
    %int3_7437 = torch.constant.int 3
    %int24_7438 = torch.constant.int 24
    %int128_7439 = torch.constant.int 128
    %5931 = torch.prim.ListConstruct %int1_7435, %int512_7436, %int3_7437, %int24_7438, %int128_7439 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5932 = torch.aten.view %5930, %5931 : !torch.vtensor<[1,512,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3,24,128],f16>
    // Permute with order (2,0,3,1,4): [1,512,3,24,128] -> [3,1,24,512,128].
    %int2_7440 = torch.constant.int 2
    %int0_7441 = torch.constant.int 0
    %int3_7442 = torch.constant.int 3
    %int1_7443 = torch.constant.int 1
    %int4_7444 = torch.constant.int 4
    %5933 = torch.prim.ListConstruct %int2_7440, %int0_7441, %int3_7442, %int1_7443, %int4_7444 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5934 = torch.aten.permute %5932, %5933 : !torch.vtensor<[1,512,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,512,128],f16>
    // Index 0 along dim 0: later scaled by txt_attn.norm.query_norm.scale.
    %int0_7445 = torch.constant.int 0
    %int0_7446 = torch.constant.int 0
    %5935 = torch.aten.select.int %5934, %int0_7445, %int0_7446 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Index 1 along dim 0. NOTE(review): presumably the key tensor - confirm.
    %int0_7447 = torch.constant.int 0
    %int1_7448 = torch.constant.int 1
    %5936 = torch.aten.select.int %5934, %int0_7447, %int1_7448 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Index 2 along dim 0. NOTE(review): presumably the value tensor - confirm.
    %int0_7449 = torch.constant.int 0
    %int2_7450 = torch.constant.int 2
    %5937 = torch.aten.select.int %5934, %int0_7449, %int2_7450 : !torch.vtensor<[3,1,24,512,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // double_blocks.18 txt_attn.norm.query_norm: RMS-style normalization of
    // %5935 over the last (128-wide) dim, x * rsqrt(mean(x^2) + eps), followed by
    // a multiply with the learned [128] query_norm scale.
    // Upcast to f32 (dtype code 6) for the statistics.
    %int6_7451 = torch.constant.int 6
    %5938 = torch.prims.convert_element_type %5935, %int6_7451 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    // mean(x^2) over the last dim with keepdim = true -> [1,24,512,1].
    %int2_7452 = torch.constant.int 2
    %5939 = torch.aten.pow.Tensor_Scalar %5938, %int2_7452 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int-1_7453 = torch.constant.int -1
    %5940 = torch.prim.ListConstruct %int-1_7453 : (!torch.int) -> !torch.list<int>
    %true_7454 = torch.constant.bool true
    %none_7455 = torch.constant.none
    %5941 = torch.aten.mean.dim %5939, %5940, %true_7454, %none_7455 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    // rsqrt(mean + eps), eps = 9.9999999999999995E-7, then scale x by it.
    %float9.999990e-07_7456 = torch.constant.float 9.9999999999999995E-7
    %int1_7457 = torch.constant.int 1
    %5942 = torch.aten.add.Scalar %5941, %float9.999990e-07_7456, %int1_7457 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %5943 = torch.aten.rsqrt %5942 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %5944 = torch.aten.mul.Tensor %5938, %5943 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    // Cast back to f16 (dtype code 5) and apply the query_norm scale in f16.
    %int5_7458 = torch.constant.int 5
    %5945 = torch.prims.convert_element_type %5944, %int5_7458 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %__auto.sampler.double_blocks.18.txt_attn.norm.query_norm.scale = util.global.load @__auto.sampler.double_blocks.18.txt_attn.norm.query_norm.scale : tensor<128xf16>
    %5946 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.txt_attn.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %5947 = torch.aten.mul.Tensor %5945, %5946 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // Same normalization as applied to %5935, now on %5936:
    //   y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps),  eps ~= 1e-6
    // computed in f32, cast back to f16, then multiplied by
    // txt_attn.norm.key_norm.scale ([128]).
    %int6_7459 = torch.constant.int 6
    %5948 = torch.prims.convert_element_type %5936, %int6_7459 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    %int2_7460 = torch.constant.int 2
    %5949 = torch.aten.pow.Tensor_Scalar %5948, %int2_7460 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f32>
    // Mean of squares over dim -1 with keepdim=true -> [1,24,512,1].
    %int-1_7461 = torch.constant.int -1
    %5950 = torch.prim.ListConstruct %int-1_7461 : (!torch.int) -> !torch.list<int>
    %true_7462 = torch.constant.bool true
    %none_7463 = torch.constant.none
    %5951 = torch.aten.mean.dim %5949, %5950, %true_7462, %none_7463 : !torch.vtensor<[1,24,512,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,512,1],f32>
    // Add eps (alpha = 1) before the reciprocal square root.
    %float9.999990e-07_7464 = torch.constant.float 9.9999999999999995E-7
    %int1_7465 = torch.constant.int 1
    %5952 = torch.aten.add.Scalar %5951, %float9.999990e-07_7464, %int1_7465 : !torch.vtensor<[1,24,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,512,1],f32>
    %5953 = torch.aten.rsqrt %5952 : !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,1],f32>
    %5954 = torch.aten.mul.Tensor %5948, %5953 : !torch.vtensor<[1,24,512,128],f32>, !torch.vtensor<[1,24,512,1],f32> -> !torch.vtensor<[1,24,512,128],f32>
    %int5_7466 = torch.constant.int 5
    %5955 = torch.prims.convert_element_type %5954, %int5_7466 : !torch.vtensor<[1,24,512,128],f32>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Apply the key-norm scale parameter.
    %__auto.sampler.double_blocks.18.txt_attn.norm.key_norm.scale = util.global.load @__auto.sampler.double_blocks.18.txt_attn.norm.key_norm.scale : tensor<128xf16>
    %5956 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.txt_attn.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %5957 = torch.aten.mul.Tensor %5955, %5956 : !torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,512,128],f16>
    // Element-type conversions whose operand and result are both f16 (no dtype change).
    %int5_7467 = torch.constant.int 5
    %5958 = torch.prims.convert_element_type %5947, %int5_7467 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    %int5_7468 = torch.constant.int 5
    %5959 = torch.prims.convert_element_type %5957, %int5_7468 : !torch.vtensor<[1,24,512,128],f16>, !torch.int -> !torch.vtensor<[1,24,512,128],f16>
    // Concatenate along dim 2: the 512-row tensors from this section come first,
    // followed by the 4096-row tensors %5904 / %5905 / %5883 defined earlier,
    // giving 512 + 4096 = 4608 rows.
    // NOTE(review): %5904/%5905/%5883 are presumably the img-stream q/k/v — confirm
    // against their definitions before this chunk.
    %5960 = torch.prim.ListConstruct %5958, %5904 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_7469 = torch.constant.int 2
    %5961 = torch.aten.cat %5960, %int2_7469 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %5962 = torch.prim.ListConstruct %5959, %5905 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_7470 = torch.constant.int 2
    %5963 = torch.aten.cat %5962, %int2_7470 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // %5937 is used directly (it did not go through the normalization above).
    %5964 = torch.prim.ListConstruct %5937, %5883 : (!torch.vtensor<[1,24,512,128],f16>, !torch.vtensor<[1,24,4096,128],f16>) -> !torch.list<vtensor>
    %int2_7471 = torch.constant.int 2
    %5965 = torch.aten.cat %5964, %int2_7471 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Cast the concatenated %5961 to f32 and view its last axis (128) as
    // [64, 1, 2]; the -1 in the shape list resolves to 64 (see result type).
    %int6_7472 = torch.constant.int 6
    %5966 = torch.prims.convert_element_type %5961, %int6_7472 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_7473 = torch.constant.int 1
    %int24_7474 = torch.constant.int 24
    %int4608_7475 = torch.constant.int 4608
    %int-1_7476 = torch.constant.int -1
    %int1_7477 = torch.constant.int 1
    %int2_7478 = torch.constant.int 2
    %5967 = torch.prim.ListConstruct %int1_7473, %int24_7474, %int4608_7475, %int-1_7476, %int1_7477, %int2_7478 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5968 = torch.aten.view %5966, %5967 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Same cast and view for the concatenated %5963.
    %int6_7479 = torch.constant.int 6
    %5969 = torch.prims.convert_element_type %5963, %int6_7479 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_7480 = torch.constant.int 1
    %int24_7481 = torch.constant.int 24
    %int4608_7482 = torch.constant.int 4608
    %int-1_7483 = torch.constant.int -1
    %int1_7484 = torch.constant.int 1
    %int2_7485 = torch.constant.int 2
    %5970 = torch.prim.ListConstruct %int1_7480, %int24_7481, %int4608_7482, %int-1_7483, %int1_7484, %int2_7485 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5971 = torch.aten.view %5969, %5970 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // For x in {%5968, %5971} this computes, with broadcasting over dim 1 (1 -> 24)
    // and over the last axis (1 -> 2):
    //   out = %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]
    // where "[..., i]" is a select of index i along dim 5.
    // %211 ([1,1,4608,64,2,2], f32) is defined outside this chunk.
    // NOTE(review): presumably the rotary position-embedding coefficients — confirm.
    //
    // --- x = %5968 ---
    %int5_7486 = torch.constant.int 5
    %int0_7487 = torch.constant.int 0
    %5972 = torch.aten.select.int %211, %int5_7486, %int0_7487 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_7488 = torch.constant.int 5
    %int0_7489 = torch.constant.int 0
    %5973 = torch.aten.select.int %5968, %int5_7488, %int0_7489 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %5974 = torch.aten.mul.Tensor %5972, %5973 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_7490 = torch.constant.int 5
    %int1_7491 = torch.constant.int 1
    %5975 = torch.aten.select.int %211, %int5_7490, %int1_7491 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_7492 = torch.constant.int 5
    %int1_7493 = torch.constant.int 1
    %5976 = torch.aten.select.int %5968, %int5_7492, %int1_7493 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %5977 = torch.aten.mul.Tensor %5975, %5976 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two products (alpha = 1).
    %int1_7494 = torch.constant.int 1
    %5978 = torch.aten.add.Tensor %5974, %5977, %int1_7494 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // --- x = %5971 ---
    %int5_7495 = torch.constant.int 5
    %int0_7496 = torch.constant.int 0
    %5979 = torch.aten.select.int %211, %int5_7495, %int0_7496 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_7497 = torch.constant.int 5
    %int0_7498 = torch.constant.int 0
    %5980 = torch.aten.select.int %5971, %int5_7497, %int0_7498 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %5981 = torch.aten.mul.Tensor %5979, %5980 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_7499 = torch.constant.int 5
    %int1_7500 = torch.constant.int 1
    %5982 = torch.aten.select.int %211, %int5_7499, %int1_7500 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_7501 = torch.constant.int 5
    %int1_7502 = torch.constant.int 1
    %5983 = torch.aten.select.int %5971, %int5_7501, %int1_7502 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %5984 = torch.aten.mul.Tensor %5982, %5983 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two products (alpha = 1).
    %int1_7503 = torch.constant.int 1
    %5985 = torch.aten.add.Tensor %5981, %5984, %int1_7503 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Collapse the trailing [64,2] axes of %5978 back into 128 and cast f32 -> f16.
    %int1_7504 = torch.constant.int 1
    %int24_7505 = torch.constant.int 24
    %int4608_7506 = torch.constant.int 4608
    %int128_7507 = torch.constant.int 128
    %5986 = torch.prim.ListConstruct %int1_7504, %int24_7505, %int4608_7506, %int128_7507 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5987 = torch.aten.view %5978, %5986 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_7508 = torch.constant.int 5
    %5988 = torch.prims.convert_element_type %5987, %int5_7508 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Same view and cast for %5985.
    %int1_7509 = torch.constant.int 1
    %int24_7510 = torch.constant.int 24
    %int4608_7511 = torch.constant.int 4608
    %int128_7512 = torch.constant.int 128
    %5989 = torch.prim.ListConstruct %int1_7509, %int24_7510, %int4608_7511, %int128_7512 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5990 = torch.aten.view %5985, %5989 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_7513 = torch.constant.int 5
    %5991 = torch.prims.convert_element_type %5990, %int5_7513 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled-dot-product attention over the full 4608-row sequence, emitted as a
    // generic torch.operator. Tensor operands are %5988, %5991, %5965 in that
    // order; the remaining operands are the float 0.0, the bool false and two
    // `none` values.
    // NOTE(review): per the ATen schema these are presumably (query, key, value,
    // dropout_p, is_causal, attn_mask, scale) — confirm against the PyTorch op
    // definition.
    // Result #0 is [1,24,4608,128] f16; result #1 is [1,24,4608] f32.
    %float0.000000e00_7514 = torch.constant.float 0.000000e+00
    %false_7515 = torch.constant.bool false
    %none_7516 = torch.constant.none
    %none_7517 = torch.constant.none
    %5992:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%5988, %5991, %5965, %float0.000000e00_7514, %false_7515, %none_7516, %none_7517) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2 of the attention output: [1,24,4608,128] -> [1,4608,24,128].
    %int0_7518 = torch.constant.int 0
    %int2_7519 = torch.constant.int 2
    %int1_7520 = torch.constant.int 1
    %int3_7521 = torch.constant.int 3
    %5993 = torch.prim.ListConstruct %int0_7518, %int2_7519, %int1_7520, %int3_7521 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5994 = torch.aten.permute %5992#0, %5993 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the trailing [24,128] axes into 3072.
    %int1_7522 = torch.constant.int 1
    %int4608_7523 = torch.constant.int 4608
    %int3072_7524 = torch.constant.int 3072
    %5995 = torch.prim.ListConstruct %int1_7522, %int4608_7523, %int3072_7524 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %5996 = torch.aten.view %5994, %5995 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1); shape is unchanged.
    %int0_7525 = torch.constant.int 0
    %int0_7526 = torch.constant.int 0
    %int9223372036854775807_7527 = torch.constant.int 9223372036854775807
    %int1_7528 = torch.constant.int 1
    %5997 = torch.aten.slice.Tensor %5996, %int0_7525, %int0_7526, %int9223372036854775807_7527, %int1_7528 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Rows [0, 512) along dim 1 -> %5998 (later fed to txt_attn.proj).
    %int1_7529 = torch.constant.int 1
    %int0_7530 = torch.constant.int 0
    %int512_7531 = torch.constant.int 512
    %int1_7532 = torch.constant.int 1
    %5998 = torch.aten.slice.Tensor %5997, %int1_7529, %int0_7530, %int512_7531, %int1_7532 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Second full-range slice on dim 0, then rows [512, end) along dim 1 -> %6000
    // (4096 rows; later fed to img_attn.proj).
    %int0_7533 = torch.constant.int 0
    %int0_7534 = torch.constant.int 0
    %int9223372036854775807_7535 = torch.constant.int 9223372036854775807
    %int1_7536 = torch.constant.int 1
    %5999 = torch.aten.slice.Tensor %5996, %int0_7533, %int0_7534, %int9223372036854775807_7535, %int1_7536 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_7537 = torch.constant.int 1
    %int512_7538 = torch.constant.int 512
    %int9223372036854775807_7539 = torch.constant.int 9223372036854775807
    %int1_7540 = torch.constant.int 1
    %6000 = torch.aten.slice.Tensor %5999, %int1_7537, %int512_7538, %int9223372036854775807_7539, %int1_7540 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer img_attn.proj applied to %6000:
    //   y = x @ W^T + b, with x, W^T and b all promoted to f32 for the matmul/add,
    // then cast back to f16 and reshaped to [1,4096,3072].
    // Drop the leading unit axis so the input is a 2-D matrix for aten.mm.
    %int4096_7541 = torch.constant.int 4096
    %int3072_7542 = torch.constant.int 3072
    %6001 = torch.prim.ListConstruct %int4096_7541, %int3072_7542 : (!torch.int, !torch.int) -> !torch.list<int>
    %6002 = torch.aten.view %6000, %6001 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Load the weight and transpose dims 0 and 1.
    %__auto.sampler.double_blocks.18.img_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.18.img_attn.proj.weight : tensor<3072x3072xf16>
    %6003 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.img_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_7543 = torch.constant.int 0
    %int1_7544 = torch.constant.int 1
    %6004 = torch.aten.transpose.int %6003, %int0_7543, %int1_7544 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.18.img_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.18.img_attn.proj.bias : tensor<3072xf16>
    %6005 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.img_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Promote bias, input and transposed weight to f32.
    %int6_7545 = torch.constant.int 6
    %6006 = torch.prims.convert_element_type %6005, %int6_7545 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_7546 = torch.constant.int 6
    %6007 = torch.prims.convert_element_type %6002, %int6_7546 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_7547 = torch.constant.int 6
    %6008 = torch.prims.convert_element_type %6004, %int6_7547 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %6009 = torch.aten.mm %6007, %6008 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    // NOTE(review): presumably residue of an addmm decomposition (alpha/beta = 1).
    %int1_7548 = torch.constant.int 1
    %6010 = torch.aten.mul.Scalar %6009, %int1_7548 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_7549 = torch.constant.int 1
    %6011 = torch.aten.mul.Scalar %6006, %int1_7549 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Add the bias (broadcast over rows), then cast back to f16.
    %int1_7550 = torch.constant.int 1
    %6012 = torch.aten.add.Tensor %6010, %6011, %int1_7550 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_7551 = torch.constant.int 5
    %6013 = torch.prims.convert_element_type %6012, %int5_7551 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    // Restore the leading unit axis.
    %int1_7552 = torch.constant.int 1
    %int4096_7553 = torch.constant.int 4096
    %int3072_7554 = torch.constant.int 3072
    %6014 = torch.prim.ListConstruct %int1_7552, %int4096_7553, %int3072_7554 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6015 = torch.aten.view %6013, %6014 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Gated residual: %6017 = %5749 + %5827 * %6015 (alpha = 1).
    // %5749, %5827, %5828 and %5829 are defined before this chunk.
    // NOTE(review): presumably %5749 is the img residual stream and %5827/%5828/%5829
    // are the img modulation gate/shift/scale vectors — confirm at their definitions.
    %6016 = torch.aten.mul.Tensor %5827, %6015 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_7555 = torch.constant.int 1
    %6017 = torch.aten.add.Tensor %5749, %6016, %int1_7555 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // %6018 = %5829 + 1; used below as the multiplicative factor on the normalized tensor.
    %int1_7556 = torch.constant.int 1
    %int1_7557 = torch.constant.int 1
    %6018 = torch.aten.add.Scalar %5829, %int1_7556, %int1_7557 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %6017 over dim 2 with statistics computed in f32:
    //   (x - mean) * rsqrt(var + eps), eps ~= 1e-6, variance correction = 0, keepdim = true.
    // No weight/bias parameter is loaded for this normalization.
    %int6_7558 = torch.constant.int 6
    %6019 = torch.prims.convert_element_type %6017, %int6_7558 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %int2_7559 = torch.constant.int 2
    %6020 = torch.prim.ListConstruct %int2_7559 : (!torch.int) -> !torch.list<int>
    %int0_7560 = torch.constant.int 0
    %true_7561 = torch.constant.bool true
    // result0 = variance, result1 = mean, both [1,4096,1].
    %result0_7562, %result1_7563 = torch.aten.var_mean.correction %6019, %6020, %int0_7560, %true_7561 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    %float9.999990e-07_7564 = torch.constant.float 9.9999999999999995E-7
    %int1_7565 = torch.constant.int 1
    %6021 = torch.aten.add.Scalar %result0_7562, %float9.999990e-07_7564, %int1_7565 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %6022 = torch.aten.rsqrt %6021 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // The subtraction takes the f16 tensor %6017 and the f32 mean; the result type is f32.
    %int1_7566 = torch.constant.int 1
    %6023 = torch.aten.sub.Tensor %6017, %result1_7563, %int1_7566 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    %6024 = torch.aten.mul.Tensor %6023, %6022 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    %int5_7567 = torch.constant.int 5
    %6025 = torch.prims.convert_element_type %6024, %int5_7567 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Modulate: %6027 = (%5829 + 1) * normalized + %5828.
    %6026 = torch.aten.mul.Tensor %6018, %6025 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_7568 = torch.constant.int 1
    %6027 = torch.aten.add.Tensor %6026, %5828, %int1_7568 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer img_mlp.0 (3072 -> 12288) applied to %6027:
    //   y = x @ W^T + b, computed in f32, cast back to f16, reshaped to [1,4096,12288].
    // Drop the leading unit axis for aten.mm.
    %int4096_7569 = torch.constant.int 4096
    %int3072_7570 = torch.constant.int 3072
    %6028 = torch.prim.ListConstruct %int4096_7569, %int3072_7570 : (!torch.int, !torch.int) -> !torch.list<int>
    %6029 = torch.aten.view %6027, %6028 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Weight is stored as [12288,3072]; transpose to [3072,12288].
    %__auto.sampler.double_blocks.18.img_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.18.img_mlp.0.weight : tensor<12288x3072xf16>
    %6030 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.img_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_7571 = torch.constant.int 0
    %int1_7572 = torch.constant.int 1
    %6031 = torch.aten.transpose.int %6030, %int0_7571, %int1_7572 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.18.img_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.18.img_mlp.0.bias : tensor<12288xf16>
    %6032 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.img_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Promote bias, input and transposed weight to f32.
    %int6_7573 = torch.constant.int 6
    %6033 = torch.prims.convert_element_type %6032, %int6_7573 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_7574 = torch.constant.int 6
    %6034 = torch.prims.convert_element_type %6029, %int6_7574 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_7575 = torch.constant.int 6
    %6035 = torch.prims.convert_element_type %6031, %int6_7575 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %6036 = torch.aten.mm %6034, %6035 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[4096,12288],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    %int1_7576 = torch.constant.int 1
    %6037 = torch.aten.mul.Scalar %6036, %int1_7576 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int1_7577 = torch.constant.int 1
    %6038 = torch.aten.mul.Scalar %6033, %int1_7577 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    // Add the bias (broadcast over rows), then cast back to f16.
    %int1_7578 = torch.constant.int 1
    %6039 = torch.aten.add.Tensor %6037, %6038, %int1_7578 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int5_7579 = torch.constant.int 5
    %6040 = torch.prims.convert_element_type %6039, %int5_7579 : !torch.vtensor<[4096,12288],f32>, !torch.int -> !torch.vtensor<[4096,12288],f16>
    // Restore the leading unit axis.
    %int1_7580 = torch.constant.int 1
    %int4096_7581 = torch.constant.int 4096
    %int12288_7582 = torch.constant.int 12288
    %6041 = torch.prim.ListConstruct %int1_7580, %int4096_7581, %int12288_7582 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6042 = torch.aten.view %6040, %6041 : !torch.vtensor<[4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,12288],f16>
    // GELU activation with approximate = "tanh" on the img_mlp.0 output (f16),
    // then flatten to [4096,12288] for the next matmul.
    %str_7583 = torch.constant.str "tanh"
    %6043 = torch.aten.gelu %6042, %str_7583 : !torch.vtensor<[1,4096,12288],f16>, !torch.str -> !torch.vtensor<[1,4096,12288],f16>
    %int4096_7584 = torch.constant.int 4096
    %int12288_7585 = torch.constant.int 12288
    %6044 = torch.prim.ListConstruct %int4096_7584, %int12288_7585 : (!torch.int, !torch.int) -> !torch.list<int>
    %6045 = torch.aten.view %6043, %6044 : !torch.vtensor<[1,4096,12288],f16>, !torch.list<int> -> !torch.vtensor<[4096,12288],f16>
    // Linear layer img_mlp.2 (12288 -> 3072) applied to %6045:
    //   y = x @ W^T + b, computed in f32, cast back to f16, reshaped to [1,4096,3072].
    // Weight is stored as [3072,12288]; transpose to [12288,3072].
    %__auto.sampler.double_blocks.18.img_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.18.img_mlp.2.weight : tensor<3072x12288xf16>
    %6046 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.img_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_7586 = torch.constant.int 0
    %int1_7587 = torch.constant.int 1
    %6047 = torch.aten.transpose.int %6046, %int0_7586, %int1_7587 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.18.img_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.18.img_mlp.2.bias : tensor<3072xf16>
    %6048 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.img_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Promote bias, input and transposed weight to f32.
    %int6_7588 = torch.constant.int 6
    %6049 = torch.prims.convert_element_type %6048, %int6_7588 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_7589 = torch.constant.int 6
    %6050 = torch.prims.convert_element_type %6045, %int6_7589 : !torch.vtensor<[4096,12288],f16>, !torch.int -> !torch.vtensor<[4096,12288],f32>
    %int6_7590 = torch.constant.int 6
    %6051 = torch.prims.convert_element_type %6047, %int6_7590 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %6052 = torch.aten.mm %6050, %6051 : !torch.vtensor<[4096,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[4096,3072],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    %int1_7591 = torch.constant.int 1
    %6053 = torch.aten.mul.Scalar %6052, %int1_7591 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int1_7592 = torch.constant.int 1
    %6054 = torch.aten.mul.Scalar %6049, %int1_7592 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Add the bias (broadcast over rows), then cast back to f16.
    %int1_7593 = torch.constant.int 1
    %6055 = torch.aten.add.Tensor %6053, %6054, %int1_7593 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int5_7594 = torch.constant.int 5
    %6056 = torch.prims.convert_element_type %6055, %int5_7594 : !torch.vtensor<[4096,3072],f32>, !torch.int -> !torch.vtensor<[4096,3072],f16>
    // Restore the leading unit axis.
    %int1_7595 = torch.constant.int 1
    %int4096_7596 = torch.constant.int 4096
    %int3072_7597 = torch.constant.int 3072
    %6057 = torch.prim.ListConstruct %int1_7595, %int4096_7596, %int3072_7597 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6058 = torch.aten.view %6056, %6057 : !torch.vtensor<[4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,3072],f16>
    // Gated residual after the img MLP: %6060 = %6017 + %5830 * %6058 (alpha = 1).
    // %5830 ([1,1,3072]) is defined before this chunk.
    // NOTE(review): presumably the second img modulation gate — confirm at its definition.
    %6059 = torch.aten.mul.Tensor %5830, %6058 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    %int1_7598 = torch.constant.int 1
    %6060 = torch.aten.add.Tensor %6017, %6059, %int1_7598 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Linear layer txt_attn.proj applied to %5998 (the first 512 rows of the
    // attention output):
    //   y = x @ W^T + b, computed in f32, cast back to f16, reshaped to [1,512,3072].
    // Drop the leading unit axis for aten.mm.
    %int512_7599 = torch.constant.int 512
    %int3072_7600 = torch.constant.int 3072
    %6061 = torch.prim.ListConstruct %int512_7599, %int3072_7600 : (!torch.int, !torch.int) -> !torch.list<int>
    %6062 = torch.aten.view %5998, %6061 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Load the weight and transpose dims 0 and 1.
    %__auto.sampler.double_blocks.18.txt_attn.proj.weight = util.global.load @__auto.sampler.double_blocks.18.txt_attn.proj.weight : tensor<3072x3072xf16>
    %6063 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.txt_attn.proj.weight : tensor<3072x3072xf16> -> !torch.vtensor<[3072,3072],f16>
    %int0_7601 = torch.constant.int 0
    %int1_7602 = torch.constant.int 1
    %6064 = torch.aten.transpose.int %6063, %int0_7601, %int1_7602 : !torch.vtensor<[3072,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,3072],f16>
    %__auto.sampler.double_blocks.18.txt_attn.proj.bias = util.global.load @__auto.sampler.double_blocks.18.txt_attn.proj.bias : tensor<3072xf16>
    %6065 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.txt_attn.proj.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Promote bias, input and transposed weight to f32.
    %int6_7603 = torch.constant.int 6
    %6066 = torch.prims.convert_element_type %6065, %int6_7603 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_7604 = torch.constant.int 6
    %6067 = torch.prims.convert_element_type %6062, %int6_7604 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_7605 = torch.constant.int 6
    %6068 = torch.prims.convert_element_type %6064, %int6_7605 : !torch.vtensor<[3072,3072],f16>, !torch.int -> !torch.vtensor<[3072,3072],f32>
    %6069 = torch.aten.mm %6067, %6068 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,3072],f32> -> !torch.vtensor<[512,3072],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    %int1_7606 = torch.constant.int 1
    %6070 = torch.aten.mul.Scalar %6069, %int1_7606 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_7607 = torch.constant.int 1
    %6071 = torch.aten.mul.Scalar %6066, %int1_7607 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Add the bias (broadcast over rows), then cast back to f16.
    %int1_7608 = torch.constant.int 1
    %6072 = torch.aten.add.Tensor %6070, %6071, %int1_7608 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_7609 = torch.constant.int 5
    %6073 = torch.prims.convert_element_type %6072, %int5_7609 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    // Restore the leading unit axis.
    %int1_7610 = torch.constant.int 1
    %int512_7611 = torch.constant.int 512
    %int3072_7612 = torch.constant.int 3072
    %6074 = torch.prim.ListConstruct %int1_7610, %int512_7611, %int3072_7612 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6075 = torch.aten.view %6073, %6074 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    // Gated residual: %6077 = %5809 + %5848 * %6075 (alpha = 1).
    // %5809, %5848, %5849 and %5850 are defined before this chunk.
    // NOTE(review): presumably %5809 is the txt residual stream and %5848/%5849/%5850
    // are the txt modulation gate/shift/scale vectors — confirm at their definitions.
    %6076 = torch.aten.mul.Tensor %5848, %6075 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_7613 = torch.constant.int 1
    %6077 = torch.aten.add.Tensor %5809, %6076, %int1_7613 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // %6078 = %5850 + 1; used below as the multiplicative factor on the normalized tensor.
    %int1_7614 = torch.constant.int 1
    %int1_7615 = torch.constant.int 1
    %6078 = torch.aten.add.Scalar %5850, %int1_7614, %int1_7615 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %6077 over dim 2 with statistics computed in f32:
    //   (x - mean) * rsqrt(var + eps), eps ~= 1e-6, variance correction = 0, keepdim = true.
    // No weight/bias parameter is loaded for this normalization.
    %int6_7616 = torch.constant.int 6
    %6079 = torch.prims.convert_element_type %6077, %int6_7616 : !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %int2_7617 = torch.constant.int 2
    %6080 = torch.prim.ListConstruct %int2_7617 : (!torch.int) -> !torch.list<int>
    %int0_7618 = torch.constant.int 0
    %true_7619 = torch.constant.bool true
    // result0 = variance, result1 = mean, both [1,512,1].
    %result0_7620, %result1_7621 = torch.aten.var_mean.correction %6079, %6080, %int0_7618, %true_7619 : !torch.vtensor<[1,512,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,512,1],f32>, !torch.vtensor<[1,512,1],f32>
    %float9.999990e-07_7622 = torch.constant.float 9.9999999999999995E-7
    %int1_7623 = torch.constant.int 1
    %6081 = torch.aten.add.Scalar %result0_7620, %float9.999990e-07_7622, %int1_7623 : !torch.vtensor<[1,512,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,512,1],f32>
    %6082 = torch.aten.rsqrt %6081 : !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,1],f32>
    // The subtraction takes the f16 tensor %6077 and the f32 mean; the result type is f32.
    %int1_7624 = torch.constant.int 1
    %6083 = torch.aten.sub.Tensor %6077, %result1_7621, %int1_7624 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,1],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f32>
    %6084 = torch.aten.mul.Tensor %6083, %6082 : !torch.vtensor<[1,512,3072],f32>, !torch.vtensor<[1,512,1],f32> -> !torch.vtensor<[1,512,3072],f32>
    %int5_7625 = torch.constant.int 5
    %6085 = torch.prims.convert_element_type %6084, %int5_7625 : !torch.vtensor<[1,512,3072],f32>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Modulate: %6087 = (%5850 + 1) * normalized + %5849.
    %6086 = torch.aten.mul.Tensor %6078, %6085 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_7626 = torch.constant.int 1
    %6087 = torch.aten.add.Tensor %6086, %5849, %int1_7626 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Linear layer txt_mlp.0 (3072 -> 12288) applied to %6087:
    //   y = x @ W^T + b, computed in f32, cast back to f16, reshaped to [1,512,12288].
    // Drop the leading unit axis for aten.mm.
    %int512_7627 = torch.constant.int 512
    %int3072_7628 = torch.constant.int 3072
    %6088 = torch.prim.ListConstruct %int512_7627, %int3072_7628 : (!torch.int, !torch.int) -> !torch.list<int>
    %6089 = torch.aten.view %6087, %6088 : !torch.vtensor<[1,512,3072],f16>, !torch.list<int> -> !torch.vtensor<[512,3072],f16>
    // Weight is stored as [12288,3072]; transpose to [3072,12288].
    %__auto.sampler.double_blocks.18.txt_mlp.0.weight = util.global.load @__auto.sampler.double_blocks.18.txt_mlp.0.weight : tensor<12288x3072xf16>
    %6090 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.txt_mlp.0.weight : tensor<12288x3072xf16> -> !torch.vtensor<[12288,3072],f16>
    %int0_7629 = torch.constant.int 0
    %int1_7630 = torch.constant.int 1
    %6091 = torch.aten.transpose.int %6090, %int0_7629, %int1_7630 : !torch.vtensor<[12288,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,12288],f16>
    %__auto.sampler.double_blocks.18.txt_mlp.0.bias = util.global.load @__auto.sampler.double_blocks.18.txt_mlp.0.bias : tensor<12288xf16>
    %6092 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.txt_mlp.0.bias : tensor<12288xf16> -> !torch.vtensor<[12288],f16>
    // Promote bias, input and transposed weight to f32.
    %int6_7631 = torch.constant.int 6
    %6093 = torch.prims.convert_element_type %6092, %int6_7631 : !torch.vtensor<[12288],f16>, !torch.int -> !torch.vtensor<[12288],f32>
    %int6_7632 = torch.constant.int 6
    %6094 = torch.prims.convert_element_type %6089, %int6_7632 : !torch.vtensor<[512,3072],f16>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int6_7633 = torch.constant.int 6
    %6095 = torch.prims.convert_element_type %6091, %int6_7633 : !torch.vtensor<[3072,12288],f16>, !torch.int -> !torch.vtensor<[3072,12288],f32>
    %6096 = torch.aten.mm %6094, %6095 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072,12288],f32> -> !torch.vtensor<[512,12288],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    %int1_7634 = torch.constant.int 1
    %6097 = torch.aten.mul.Scalar %6096, %int1_7634 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int1_7635 = torch.constant.int 1
    %6098 = torch.aten.mul.Scalar %6093, %int1_7635 : !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[12288],f32>
    // Add the bias (broadcast over rows), then cast back to f16.
    %int1_7636 = torch.constant.int 1
    %6099 = torch.aten.add.Tensor %6097, %6098, %int1_7636 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int5_7637 = torch.constant.int 5
    %6100 = torch.prims.convert_element_type %6099, %int5_7637 : !torch.vtensor<[512,12288],f32>, !torch.int -> !torch.vtensor<[512,12288],f16>
    // Restore the leading unit axis.
    %int1_7638 = torch.constant.int 1
    %int512_7639 = torch.constant.int 512
    %int12288_7640 = torch.constant.int 12288
    %6101 = torch.prim.ListConstruct %int1_7638, %int512_7639, %int12288_7640 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6102 = torch.aten.view %6100, %6101 : !torch.vtensor<[512,12288],f16>, !torch.list<int> -> !torch.vtensor<[1,512,12288],f16>
    %str_7641 = torch.constant.str "tanh"
    %6103 = torch.aten.gelu %6102, %str_7641 : !torch.vtensor<[1,512,12288],f16>, !torch.str -> !torch.vtensor<[1,512,12288],f16>
    %int512_7642 = torch.constant.int 512
    %int12288_7643 = torch.constant.int 12288
    %6104 = torch.prim.ListConstruct %int512_7642, %int12288_7643 : (!torch.int, !torch.int) -> !torch.list<int>
    %6105 = torch.aten.view %6103, %6104 : !torch.vtensor<[1,512,12288],f16>, !torch.list<int> -> !torch.vtensor<[512,12288],f16>
    %__auto.sampler.double_blocks.18.txt_mlp.2.weight = util.global.load @__auto.sampler.double_blocks.18.txt_mlp.2.weight : tensor<3072x12288xf16>
    %6106 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.txt_mlp.2.weight : tensor<3072x12288xf16> -> !torch.vtensor<[3072,12288],f16>
    %int0_7644 = torch.constant.int 0
    %int1_7645 = torch.constant.int 1
    %6107 = torch.aten.transpose.int %6106, %int0_7644, %int1_7645 : !torch.vtensor<[3072,12288],f16>, !torch.int, !torch.int -> !torch.vtensor<[12288,3072],f16>
    %__auto.sampler.double_blocks.18.txt_mlp.2.bias = util.global.load @__auto.sampler.double_blocks.18.txt_mlp.2.bias : tensor<3072xf16>
    %6108 = torch_c.from_builtin_tensor %__auto.sampler.double_blocks.18.txt_mlp.2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_7646 = torch.constant.int 6
    %6109 = torch.prims.convert_element_type %6108, %int6_7646 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_7647 = torch.constant.int 6
    %6110 = torch.prims.convert_element_type %6105, %int6_7647 : !torch.vtensor<[512,12288],f16>, !torch.int -> !torch.vtensor<[512,12288],f32>
    %int6_7648 = torch.constant.int 6
    %6111 = torch.prims.convert_element_type %6107, %int6_7648 : !torch.vtensor<[12288,3072],f16>, !torch.int -> !torch.vtensor<[12288,3072],f32>
    %6112 = torch.aten.mm %6110, %6111 : !torch.vtensor<[512,12288],f32>, !torch.vtensor<[12288,3072],f32> -> !torch.vtensor<[512,3072],f32>
    %int1_7649 = torch.constant.int 1
    %6113 = torch.aten.mul.Scalar %6112, %int1_7649 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int1_7650 = torch.constant.int 1
    %6114 = torch.aten.mul.Scalar %6109, %int1_7650 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_7651 = torch.constant.int 1
    %6115 = torch.aten.add.Tensor %6113, %6114, %int1_7651 : !torch.vtensor<[512,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f32>
    %int5_7652 = torch.constant.int 5
    %6116 = torch.prims.convert_element_type %6115, %int5_7652 : !torch.vtensor<[512,3072],f32>, !torch.int -> !torch.vtensor<[512,3072],f16>
    %int1_7653 = torch.constant.int 1
    %int512_7654 = torch.constant.int 512
    %int3072_7655 = torch.constant.int 3072
    %6117 = torch.prim.ListConstruct %int1_7653, %int512_7654, %int3072_7655 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6118 = torch.aten.view %6116, %6117 : !torch.vtensor<[512,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,512,3072],f16>
    %6119 = torch.aten.mul.Tensor %5851, %6118 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,512,3072],f16> -> !torch.vtensor<[1,512,3072],f16>
    %int1_7656 = torch.constant.int 1
    %6120 = torch.aten.add.Tensor %6077, %6119, %int1_7656 : !torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,512,3072],f16>, !torch.int -> !torch.vtensor<[1,512,3072],f16>
    // Concatenate %6120 ([1,512,3072]) and %6060 ([1,4096,3072]) along dim 1,
    // with the 512-row tensor first, giving one [1,4608,3072] sequence.
    // NOTE(review): %6060 is defined before this chunk; presumably the image-stream
    // output of the last double block -- confirm.
    %6121 = torch.prim.ListConstruct %6120, %6060 : (!torch.vtensor<[1,512,3072],f16>, !torch.vtensor<[1,4096,3072],f16>) -> !torch.list<vtensor>
    %int1_7657 = torch.constant.int 1
    %6122 = torch.aten.cat %6121, %int1_7657 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // ---- single_blocks.0, modulation ----
    // SiLU of %119 followed by modulation.lin (3072 -> 9216); the 9216-wide result
    // is cut into three 3072-wide pieces along the last dim.
    // NOTE(review): %119 is defined before this chunk; presumably the pooled
    // conditioning vector -- confirm.
    %6123 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Weight is stored [9216,3072] and transposed to [3072,9216].
    %__auto.sampler.single_blocks.0.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.0.modulation.lin.weight : tensor<9216x3072xf16>
    %6124 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.0.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_7658 = torch.constant.int 0
    %int1_7659 = torch.constant.int 1
    %6125 = torch.aten.transpose.int %6124, %int0_7658, %int1_7659 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.0.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.0.modulation.lin.bias : tensor<9216xf16>
    %6126 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.0.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Widen operands to f32 (dtype code 6), matmul + bias add in f32, narrow back
    // to f16 (dtype code 5). The mul.Scalar ops multiply by the constant 1.
    %int6_7660 = torch.constant.int 6
    %6127 = torch.prims.convert_element_type %6126, %int6_7660 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_7661 = torch.constant.int 6
    %6128 = torch.prims.convert_element_type %6123, %int6_7661 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_7662 = torch.constant.int 6
    %6129 = torch.prims.convert_element_type %6125, %int6_7662 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %6130 = torch.aten.mm %6128, %6129 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    %int1_7663 = torch.constant.int 1
    %6131 = torch.aten.mul.Scalar %6130, %int1_7663 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_7664 = torch.constant.int 1
    %6132 = torch.aten.mul.Scalar %6127, %int1_7664 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_7665 = torch.constant.int 1
    %6133 = torch.aten.add.Tensor %6131, %6132, %int1_7665 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_7666 = torch.constant.int 5
    %6134 = torch.prims.convert_element_type %6133, %int5_7666 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): shape unchanged.
    %int0_7667 = torch.constant.int 0
    %int0_7668 = torch.constant.int 0
    %int9223372036854775807_7669 = torch.constant.int 9223372036854775807
    %int1_7670 = torch.constant.int 1
    %6135 = torch.aten.slice.Tensor %6134, %int0_7667, %int0_7668, %int9223372036854775807_7669, %int1_7670 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a unit dim at position 1 so the result broadcasts over the sequence dim.
    %int1_7671 = torch.constant.int 1
    %6136 = torch.aten.unsqueeze %6135, %int1_7671 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Another full-range slice, this time on dim 2: shape unchanged.
    %int2_7672 = torch.constant.int 2
    %int0_7673 = torch.constant.int 0
    %int9223372036854775807_7674 = torch.constant.int 9223372036854775807
    %int1_7675 = torch.constant.int 1
    %6137 = torch.aten.slice.Tensor %6136, %int2_7672, %int0_7673, %int9223372036854775807_7674, %int1_7675 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // %6138 = last-dim range [0, 3072); used later in this function as an additive term.
    %int-1_7676 = torch.constant.int -1
    %int0_7677 = torch.constant.int 0
    %int3072_7678 = torch.constant.int 3072
    %int1_7679 = torch.constant.int 1
    %6138 = torch.aten.slice.Tensor %6137, %int-1_7676, %int0_7677, %int3072_7678, %int1_7679 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %6139 = last-dim range [3072, 6144); incremented by 1 below to form %6141.
    %int-1_7680 = torch.constant.int -1
    %int3072_7681 = torch.constant.int 3072
    %int6144_7682 = torch.constant.int 6144
    %int1_7683 = torch.constant.int 1
    %6139 = torch.aten.slice.Tensor %6137, %int-1_7680, %int3072_7681, %int6144_7682, %int1_7683 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %6140 = last-dim range [6144, 9216); used later in this function as a
    // multiplier on the block output before the residual add.
    %int-1_7684 = torch.constant.int -1
    %int6144_7685 = torch.constant.int 6144
    %int9216_7686 = torch.constant.int 9216
    %int1_7687 = torch.constant.int 1
    %6140 = torch.aten.slice.Tensor %6137, %int-1_7684, %int6144_7685, %int9216_7686, %int1_7687 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %6141 = %6139 + 1 (scalar 1 with alpha 1).
    %int1_7688 = torch.constant.int 1
    %int1_7689 = torch.constant.int 1
    %6141 = torch.aten.add.Scalar %6139, %int1_7688, %int1_7689 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // ---- single_blocks.0, normalization + modulation of the joint sequence ----
    // Computes (x - mean) * rsqrt(var + eps) over the last dim of %6122 with no
    // learned affine parameters, then applies %6141 * normed + %6138.
    // Statistics are taken on an f32 copy of the f16 input.
    %int6_7690 = torch.constant.int 6
    %6142 = torch.prims.convert_element_type %6122, %int6_7690 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    // Reduce over dim 2 with correction 0 and keepdim = true.
    // result0 is the variance (it receives eps and rsqrt below); result1 is the mean.
    %int2_7691 = torch.constant.int 2
    %6143 = torch.prim.ListConstruct %int2_7691 : (!torch.int) -> !torch.list<int>
    %int0_7692 = torch.constant.int 0
    %true_7693 = torch.constant.bool true
    %result0_7694, %result1_7695 = torch.aten.var_mean.correction %6142, %6143, %int0_7692, %true_7693 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // eps ~= 1e-6 is added to the variance before the reciprocal square root.
    %float9.999990e-07_7696 = torch.constant.float 9.9999999999999995E-7
    %int1_7697 = torch.constant.int 1
    %6144 = torch.aten.add.Scalar %result0_7694, %float9.999990e-07_7696, %int1_7697 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %6145 = torch.aten.rsqrt %6144 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the original f16 tensor %6122 (not the f32 copy %6142)
    // together with the f32 mean; the declared result type is f32.
    %int1_7698 = torch.constant.int 1
    %6146 = torch.aten.sub.Tensor %6122, %result1_7695, %int1_7698 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %6147 = torch.aten.mul.Tensor %6146, %6145 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    // Narrow back to f16 (dtype code 5) before the modulation arithmetic.
    %int5_7699 = torch.constant.int 5
    %6148 = torch.prims.convert_element_type %6147, %int5_7699 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Broadcast multiply by %6141 ([1,1,3072]) and broadcast add of %6138 ([1,1,3072]).
    %6149 = torch.aten.mul.Tensor %6141, %6148 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_7700 = torch.constant.int 1
    %6150 = torch.aten.add.Tensor %6149, %6138, %int1_7700 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // ---- single_blocks.0, linear1 (3072 -> 21504) and split ----
    // Projects the modulated sequence and splits the 21504-wide output into a
    // 9216-wide part (%6166) and a 12288-wide part (%6167).
    // Flatten [1,4608,3072] to [4608,3072] for the 2-D matmul.
    %int4608_7701 = torch.constant.int 4608
    %int3072_7702 = torch.constant.int 3072
    %6151 = torch.prim.ListConstruct %int4608_7701, %int3072_7702 : (!torch.int, !torch.int) -> !torch.list<int>
    %6152 = torch.aten.view %6150, %6151 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // Weight is stored [21504,3072] and transposed to [3072,21504].
    %__auto.sampler.single_blocks.0.linear1.weight = util.global.load @__auto.sampler.single_blocks.0.linear1.weight : tensor<21504x3072xf16>
    %6153 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.0.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_7703 = torch.constant.int 0
    %int1_7704 = torch.constant.int 1
    %6154 = torch.aten.transpose.int %6153, %int0_7703, %int1_7704 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.0.linear1.bias = util.global.load @__auto.sampler.single_blocks.0.linear1.bias : tensor<21504xf16>
    %6155 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.0.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Widen to f32 (dtype code 6), matmul + bias add in f32, narrow to f16 (code 5).
    // The mul.Scalar ops multiply by the constant 1 and leave values unchanged.
    %int6_7705 = torch.constant.int 6
    %6156 = torch.prims.convert_element_type %6155, %int6_7705 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_7706 = torch.constant.int 6
    %6157 = torch.prims.convert_element_type %6152, %int6_7706 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_7707 = torch.constant.int 6
    %6158 = torch.prims.convert_element_type %6154, %int6_7707 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %6159 = torch.aten.mm %6157, %6158 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    %int1_7708 = torch.constant.int 1
    %6160 = torch.aten.mul.Scalar %6159, %int1_7708 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_7709 = torch.constant.int 1
    %6161 = torch.aten.mul.Scalar %6156, %int1_7709 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_7710 = torch.constant.int 1
    %6162 = torch.aten.add.Tensor %6160, %6161, %int1_7710 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_7711 = torch.constant.int 5
    %6163 = torch.prims.convert_element_type %6162, %int5_7711 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    // Restore the leading unit dim.
    %int1_7712 = torch.constant.int 1
    %int4608_7713 = torch.constant.int 4608
    %int21504 = torch.constant.int 21504
    %6164 = torch.prim.ListConstruct %int1_7712, %int4608_7713, %int21504 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6165 = torch.aten.view %6163, %6164 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // %6166 = last-dim range [0, 9216): reshaped into three 24x128 groups afterwards.
    %int-1_7714 = torch.constant.int -1
    %int0_7715 = torch.constant.int 0
    %int9216_7716 = torch.constant.int 9216
    %int1_7717 = torch.constant.int 1
    %6166 = torch.aten.slice.Tensor %6165, %int-1_7714, %int0_7715, %int9216_7716, %int1_7717 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    // %6167 = last-dim range [9216, 21504): the 12288-wide part that later goes
    // through GELU in this function.
    %int-1_7718 = torch.constant.int -1
    %int9216_7719 = torch.constant.int 9216
    %int21504_7720 = torch.constant.int 21504
    %int1_7721 = torch.constant.int 1
    %6167 = torch.aten.slice.Tensor %6165, %int-1_7718, %int9216_7719, %int21504_7720, %int1_7721 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // ---- single_blocks.0, split the 9216-wide projection into three head tensors ----
    // View [1,4608,9216] as [1,4608,3,24,128] (3 groups x 24 x 128).
    %int1_7722 = torch.constant.int 1
    %int4608_7723 = torch.constant.int 4608
    %int3_7724 = torch.constant.int 3
    %int24_7725 = torch.constant.int 24
    %int128_7726 = torch.constant.int 128
    %6168 = torch.prim.ListConstruct %int1_7722, %int4608_7723, %int3_7724, %int24_7725, %int128_7726 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6169 = torch.aten.view %6166, %6168 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2,0,3,1,4) to [3,1,24,4608,128] so the group index leads.
    %int2_7727 = torch.constant.int 2
    %int0_7728 = torch.constant.int 0
    %int3_7729 = torch.constant.int 3
    %int1_7730 = torch.constant.int 1
    %int4_7731 = torch.constant.int 4
    %6170 = torch.prim.ListConstruct %int2_7727, %int0_7728, %int3_7729, %int1_7730, %int4_7731 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6171 = torch.aten.permute %6169, %6170 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0 along dim 0 -> %6172; it is normalized with query_norm.scale later.
    %int0_7732 = torch.constant.int 0
    %int0_7733 = torch.constant.int 0
    %6172 = torch.aten.select.int %6171, %int0_7732, %int0_7733 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 1 along dim 0 -> %6173; it is normalized with key_norm.scale later.
    %int0_7734 = torch.constant.int 0
    %int1_7735 = torch.constant.int 1
    %6173 = torch.aten.select.int %6171, %int0_7734, %int1_7735 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 2 along dim 0 -> %6174; passed unmodified as the third attention operand.
    %int0_7736 = torch.constant.int 0
    %int2_7737 = torch.constant.int 2
    %6174 = torch.aten.select.int %6171, %int0_7736, %int2_7737 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- single_blocks.0, RMS-style normalization of %6172 and %6173 ----
    // For each tensor: y = x * rsqrt(mean(x^2, dim=-1, keepdim) + eps) in f32,
    // narrowed to f16, then multiplied by a learned [128] scale vector.
    // -- query path (%6172 with norm.query_norm.scale) --
    %int6_7738 = torch.constant.int 6
    %6175 = torch.prims.convert_element_type %6172, %int6_7738 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_7739 = torch.constant.int 2
    %6176 = torch.aten.pow.Tensor_Scalar %6175, %int2_7739 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean of squares over the last dim, keepdim = true, no dtype override (none).
    %int-1_7740 = torch.constant.int -1
    %6177 = torch.prim.ListConstruct %int-1_7740 : (!torch.int) -> !torch.list<int>
    %true_7741 = torch.constant.bool true
    %none_7742 = torch.constant.none
    %6178 = torch.aten.mean.dim %6176, %6177, %true_7741, %none_7742 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // eps ~= 1e-6 is added before the reciprocal square root.
    %float9.999990e-07_7743 = torch.constant.float 9.9999999999999995E-7
    %int1_7744 = torch.constant.int 1
    %6179 = torch.aten.add.Scalar %6178, %float9.999990e-07_7743, %int1_7744 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %6180 = torch.aten.rsqrt %6179 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %6181 = torch.aten.mul.Tensor %6175, %6180 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_7745 = torch.constant.int 5
    %6182 = torch.prims.convert_element_type %6181, %int5_7745 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // The learned scale is applied in f16, after the narrowing cast.
    %__auto.sampler.single_blocks.0.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.0.norm.query_norm.scale : tensor<128xf16>
    %6183 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.0.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %6184 = torch.aten.mul.Tensor %6182, %6183 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // -- key path (%6173 with norm.key_norm.scale), same sequence of ops --
    %int6_7746 = torch.constant.int 6
    %6185 = torch.prims.convert_element_type %6173, %int6_7746 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_7747 = torch.constant.int 2
    %6186 = torch.aten.pow.Tensor_Scalar %6185, %int2_7747 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_7748 = torch.constant.int -1
    %6187 = torch.prim.ListConstruct %int-1_7748 : (!torch.int) -> !torch.list<int>
    %true_7749 = torch.constant.bool true
    %none_7750 = torch.constant.none
    %6188 = torch.aten.mean.dim %6186, %6187, %true_7749, %none_7750 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_7751 = torch.constant.float 9.9999999999999995E-7
    %int1_7752 = torch.constant.int 1
    %6189 = torch.aten.add.Scalar %6188, %float9.999990e-07_7751, %int1_7752 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %6190 = torch.aten.rsqrt %6189 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %6191 = torch.aten.mul.Tensor %6185, %6190 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_7753 = torch.constant.int 5
    %6192 = torch.prims.convert_element_type %6191, %int5_7753 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.0.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.0.norm.key_norm.scale : tensor<128xf16>
    %6193 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.0.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %6194 = torch.aten.mul.Tensor %6192, %6193 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- single_blocks.0, pairwise 2x2 combination of %6184 / %6194 with %211 ----
    // Each 128-wide vector is viewed as 64 pairs; for both tensors the result is
    //   out = %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]
    // computed in f32 and narrowed to f16.
    // NOTE(review): %211 ([1,1,4608,64,2,2], f32) is defined before this chunk;
    // presumably the rotary position-embedding table -- confirm.
    // These two casts are f16 -> f16, so the element type is unchanged.
    %int5_7754 = torch.constant.int 5
    %6195 = torch.prims.convert_element_type %6184, %int5_7754 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_7755 = torch.constant.int 5
    %6196 = torch.prims.convert_element_type %6194, %int5_7755 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Widen %6195 to f32 and view it as [1,24,4608,64,1,2]: 64 pairs, with a unit
    // dim that broadcasts against the size-2 dim 4 of %211 after the select.
    %int6_7756 = torch.constant.int 6
    %6197 = torch.prims.convert_element_type %6195, %int6_7756 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_7757 = torch.constant.int 1
    %int24_7758 = torch.constant.int 24
    %int4608_7759 = torch.constant.int 4608
    %int64_7760 = torch.constant.int 64
    %int1_7761 = torch.constant.int 1
    %int2_7762 = torch.constant.int 2
    %6198 = torch.prim.ListConstruct %int1_7757, %int24_7758, %int4608_7759, %int64_7760, %int1_7761, %int2_7762 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6199 = torch.aten.view %6197, %6198 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Same widening and view for %6196.
    %int6_7763 = torch.constant.int 6
    %6200 = torch.prims.convert_element_type %6196, %int6_7763 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_7764 = torch.constant.int 1
    %int24_7765 = torch.constant.int 24
    %int4608_7766 = torch.constant.int 4608
    %int64_7767 = torch.constant.int 64
    %int1_7768 = torch.constant.int 1
    %int2_7769 = torch.constant.int 2
    %6201 = torch.prim.ListConstruct %int1_7764, %int24_7765, %int4608_7766, %int64_7767, %int1_7768, %int2_7769 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6202 = torch.aten.view %6200, %6201 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // -- first tensor (%6199): term for index 0 of dim 5 --
    %int5_7770 = torch.constant.int 5
    %int0_7771 = torch.constant.int 0
    %6203 = torch.aten.select.int %211, %int5_7770, %int0_7771 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_7772 = torch.constant.int 5
    %int0_7773 = torch.constant.int 0
    %6204 = torch.aten.select.int %6199, %int5_7772, %int0_7773 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6205 = torch.aten.mul.Tensor %6203, %6204 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // -- first tensor: term for index 1 of dim 5, then the sum of both terms --
    %int5_7774 = torch.constant.int 5
    %int1_7775 = torch.constant.int 1
    %6206 = torch.aten.select.int %211, %int5_7774, %int1_7775 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_7776 = torch.constant.int 5
    %int1_7777 = torch.constant.int 1
    %6207 = torch.aten.select.int %6199, %int5_7776, %int1_7777 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6208 = torch.aten.mul.Tensor %6206, %6207 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_7778 = torch.constant.int 1
    %6209 = torch.aten.add.Tensor %6205, %6208, %int1_7778 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // -- second tensor (%6202): the same two terms and sum; the selects on %211
    //    are repeated rather than reused --
    %int5_7779 = torch.constant.int 5
    %int0_7780 = torch.constant.int 0
    %6210 = torch.aten.select.int %211, %int5_7779, %int0_7780 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_7781 = torch.constant.int 5
    %int0_7782 = torch.constant.int 0
    %6211 = torch.aten.select.int %6202, %int5_7781, %int0_7782 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6212 = torch.aten.mul.Tensor %6210, %6211 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_7783 = torch.constant.int 5
    %int1_7784 = torch.constant.int 1
    %6213 = torch.aten.select.int %211, %int5_7783, %int1_7784 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_7785 = torch.constant.int 5
    %int1_7786 = torch.constant.int 1
    %6214 = torch.aten.select.int %6202, %int5_7785, %int1_7786 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6215 = torch.aten.mul.Tensor %6213, %6214 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_7787 = torch.constant.int 1
    %6216 = torch.aten.add.Tensor %6212, %6215, %int1_7787 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Collapse the (64, 2) pair dims back to 128 and narrow to f16 -> %6219.
    %int1_7788 = torch.constant.int 1
    %int24_7789 = torch.constant.int 24
    %int4608_7790 = torch.constant.int 4608
    %int128_7791 = torch.constant.int 128
    %6217 = torch.prim.ListConstruct %int1_7788, %int24_7789, %int4608_7790, %int128_7791 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6218 = torch.aten.view %6209, %6217 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_7792 = torch.constant.int 5
    %6219 = torch.prims.convert_element_type %6218, %int5_7792 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Same collapse and narrowing for the second tensor -> %6222.
    %int1_7793 = torch.constant.int 1
    %int24_7794 = torch.constant.int 24
    %int4608_7795 = torch.constant.int 4608
    %int128_7796 = torch.constant.int 128
    %6220 = torch.prim.ListConstruct %int1_7793, %int24_7794, %int4608_7795, %int128_7796 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6221 = torch.aten.view %6216, %6220 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_7797 = torch.constant.int 5
    %6222 = torch.prims.convert_element_type %6221, %int5_7797 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- single_blocks.0, attention over the 4608-token joint sequence ----
    // Operands are %6219, %6222 and %6174 (each [1,24,4608,128], f16), followed by
    // float 0.0, bool false and two `none` values.
    // NOTE(review): per the ATen signature these trailing operands are presumably
    // dropout_p, is_causal, attn_mask and scale -- confirm against the op schema.
    %float0.000000e00_7798 = torch.constant.float 0.000000e+00
    %false_7799 = torch.constant.bool false
    %none_7800 = torch.constant.none
    %none_7801 = torch.constant.none
    // The op returns two results; only result #0 is consumed in the lines below.
    %6223:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%6219, %6222, %6174, %float0.000000e00_7798, %false_7799, %none_7800, %none_7801) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2: [1,24,4608,128] -> [1,4608,24,128].
    %int0_7802 = torch.constant.int 0
    %int2_7803 = torch.constant.int 2
    %int1_7804 = torch.constant.int 1
    %int3_7805 = torch.constant.int 3
    %6224 = torch.prim.ListConstruct %int0_7802, %int2_7803, %int1_7804, %int3_7805 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6225 = torch.aten.permute %6223#0, %6224 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the (24, 128) dims into a single 3072-wide feature dim.
    %int1_7806 = torch.constant.int 1
    %int4608_7807 = torch.constant.int 4608
    %int3072_7808 = torch.constant.int 3072
    %6226 = torch.prim.ListConstruct %int1_7806, %int4608_7807, %int3072_7808 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6227 = torch.aten.view %6225, %6226 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // ---- single_blocks.0, output projection and gated residual ----
    // GELU("tanh") on the 12288-wide part (%6167), concatenated after the attention
    // output (%6227) along dim 2, projected by linear2 (15360 -> 3072), multiplied
    // by %6140 and added onto the block input %6122.
    %str_7809 = torch.constant.str "tanh"
    %6228 = torch.aten.gelu %6167, %str_7809 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    // Order matters: attention features (3072) first, then GELU features (12288).
    %6229 = torch.prim.ListConstruct %6227, %6228 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_7810 = torch.constant.int 2
    %6230 = torch.aten.cat %6229, %int2_7810 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Flatten to [4608,15360] for the 2-D matmul.
    %int4608_7811 = torch.constant.int 4608
    %int15360_7812 = torch.constant.int 15360
    %6231 = torch.prim.ListConstruct %int4608_7811, %int15360_7812 : (!torch.int, !torch.int) -> !torch.list<int>
    %6232 = torch.aten.view %6230, %6231 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // Weight is stored [3072,15360] and transposed to [15360,3072].
    %__auto.sampler.single_blocks.0.linear2.weight = util.global.load @__auto.sampler.single_blocks.0.linear2.weight : tensor<3072x15360xf16>
    %6233 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.0.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_7813 = torch.constant.int 0
    %int1_7814 = torch.constant.int 1
    %6234 = torch.aten.transpose.int %6233, %int0_7813, %int1_7814 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.0.linear2.bias = util.global.load @__auto.sampler.single_blocks.0.linear2.bias : tensor<3072xf16>
    %6235 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.0.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Widen to f32 (dtype code 6), matmul + bias add in f32, narrow to f16 (code 5).
    // The mul.Scalar ops multiply by the constant 1 and leave values unchanged.
    %int6_7815 = torch.constant.int 6
    %6236 = torch.prims.convert_element_type %6235, %int6_7815 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_7816 = torch.constant.int 6
    %6237 = torch.prims.convert_element_type %6232, %int6_7816 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_7817 = torch.constant.int 6
    %6238 = torch.prims.convert_element_type %6234, %int6_7817 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %6239 = torch.aten.mm %6237, %6238 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    %int1_7818 = torch.constant.int 1
    %6240 = torch.aten.mul.Scalar %6239, %int1_7818 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_7819 = torch.constant.int 1
    %6241 = torch.aten.mul.Scalar %6236, %int1_7819 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_7820 = torch.constant.int 1
    %6242 = torch.aten.add.Tensor %6240, %6241, %int1_7820 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_7821 = torch.constant.int 5
    %6243 = torch.prims.convert_element_type %6242, %int5_7821 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    // Restore the leading unit dim.
    %int1_7822 = torch.constant.int 1
    %int4608_7823 = torch.constant.int 4608
    %int3072_7824 = torch.constant.int 3072
    %6244 = torch.prim.ListConstruct %int1_7822, %int4608_7823, %int3072_7824 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6245 = torch.aten.view %6243, %6244 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Broadcast multiply by %6140 ([1,1,3072]), then add onto %6122 (alpha = 1).
    %6246 = torch.aten.mul.Tensor %6140, %6245 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_7825 = torch.constant.int 1
    %6247 = torch.aten.add.Tensor %6122, %6246, %int1_7825 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.1 modulation: SiLU of %119 (defined outside this chunk), then a linear layer
    // to 9216 features that is sliced into three [1,1,3072] chunks along the last dim.
    %6248 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.1.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.1.modulation.lin.weight : tensor<9216x3072xf16>
    %6249 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.1.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    // Swap dims 0 and 1 of the weight: [9216,3072] -> [3072,9216].
    %int0_7826 = torch.constant.int 0
    %int1_7827 = torch.constant.int 1
    %6250 = torch.aten.transpose.int %6249, %int0_7826, %int1_7827 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.1.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.1.modulation.lin.bias : tensor<9216xf16>
    %6251 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.1.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Cast bias, input and weight to f32 (dtype code 6), matmul, add bias, cast back to f16 (dtype code 5).
    %int6_7828 = torch.constant.int 6
    %6252 = torch.prims.convert_element_type %6251, %int6_7828 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_7829 = torch.constant.int 6
    %6253 = torch.prims.convert_element_type %6248, %int6_7829 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_7830 = torch.constant.int 6
    %6254 = torch.prims.convert_element_type %6250, %int6_7830 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %6255 = torch.aten.mm %6253, %6254 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    %int1_7831 = torch.constant.int 1
    %6256 = torch.aten.mul.Scalar %6255, %int1_7831 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_7832 = torch.constant.int 1
    %6257 = torch.aten.mul.Scalar %6252, %int1_7832 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_7833 = torch.constant.int 1
    %6258 = torch.aten.add.Tensor %6256, %6257, %int1_7833 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_7834 = torch.constant.int 5
    %6259 = torch.prims.convert_element_type %6258, %int5_7834 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Slice dim 0 from 0 to 9223372036854775807 with step 1; the result type equals the input type.
    %int0_7835 = torch.constant.int 0
    %int0_7836 = torch.constant.int 0
    %int9223372036854775807_7837 = torch.constant.int 9223372036854775807
    %int1_7838 = torch.constant.int 1
    %6260 = torch.aten.slice.Tensor %6259, %int0_7835, %int0_7836, %int9223372036854775807_7837, %int1_7838 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a size-1 dim at position 1: [1,9216] -> [1,1,9216].
    %int1_7839 = torch.constant.int 1
    %6261 = torch.aten.unsqueeze %6260, %int1_7839 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Same kind of slice on dim 2; the result type again equals the input type.
    %int2_7840 = torch.constant.int 2
    %int0_7841 = torch.constant.int 0
    %int9223372036854775807_7842 = torch.constant.int 9223372036854775807
    %int1_7843 = torch.constant.int 1
    %6262 = torch.aten.slice.Tensor %6261, %int2_7840, %int0_7841, %int9223372036854775807_7842, %int1_7843 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // %6263 = last-dim elements [0, 3072); later added to the normalized activations.
    %int-1_7844 = torch.constant.int -1
    %int0_7845 = torch.constant.int 0
    %int3072_7846 = torch.constant.int 3072
    %int1_7847 = torch.constant.int 1
    %6263 = torch.aten.slice.Tensor %6262, %int-1_7844, %int0_7845, %int3072_7846, %int1_7847 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %6264 = last-dim elements [3072, 6144); incremented by 1 below and used as a multiplier.
    %int-1_7848 = torch.constant.int -1
    %int3072_7849 = torch.constant.int 3072
    %int6144_7850 = torch.constant.int 6144
    %int1_7851 = torch.constant.int 1
    %6264 = torch.aten.slice.Tensor %6262, %int-1_7848, %int3072_7849, %int6144_7850, %int1_7851 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %6265 = last-dim elements [6144, 9216); later multiplies the single_blocks.1 linear2 output.
    %int-1_7852 = torch.constant.int -1
    %int6144_7853 = torch.constant.int 6144
    %int9216_7854 = torch.constant.int 9216
    %int1_7855 = torch.constant.int 1
    %6265 = torch.aten.slice.Tensor %6262, %int-1_7852, %int6144_7853, %int9216_7854, %int1_7855 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %6266 = %6264 + 1.
    %int1_7856 = torch.constant.int 1
    %int1_7857 = torch.constant.int 1
    %6266 = torch.aten.add.Scalar %6264, %int1_7856, %int1_7857 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %6247 over dim 2 (no learned weight or bias appears in these lines), then apply
    // the modulation chunks: %6275 = %6266 * normalized + %6263.
    %int6_7858 = torch.constant.int 6
    %6267 = torch.prims.convert_element_type %6247, %int6_7858 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    // Variance (%result0) and mean (%result1) over dim 2 with correction 0 and keepdim = true.
    %int2_7859 = torch.constant.int 2
    %6268 = torch.prim.ListConstruct %int2_7859 : (!torch.int) -> !torch.list<int>
    %int0_7860 = torch.constant.int 0
    %true_7861 = torch.constant.bool true
    %result0_7862, %result1_7863 = torch.aten.var_mean.correction %6267, %6268, %int0_7860, %true_7861 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // %6270 = rsqrt(variance + 9.9999999999999995E-7).
    %float9.999990e-07_7864 = torch.constant.float 9.9999999999999995E-7
    %int1_7865 = torch.constant.int 1
    %6269 = torch.aten.add.Scalar %result0_7862, %float9.999990e-07_7864, %int1_7865 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %6270 = torch.aten.rsqrt %6269 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the f16 tensor %6247 (not the f32 copy %6267) and the f32 mean; the result type is f32.
    %int1_7866 = torch.constant.int 1
    %6271 = torch.aten.sub.Tensor %6247, %result1_7863, %int1_7866 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %6272 = torch.aten.mul.Tensor %6271, %6270 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    // Cast to f16 (dtype code 5), multiply by %6266 and add %6263, both [1,1,3072] broadcast over dim 1.
    %int5_7867 = torch.constant.int 5
    %6273 = torch.prims.convert_element_type %6272, %int5_7867 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %6274 = torch.aten.mul.Tensor %6266, %6273 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_7868 = torch.constant.int 1
    %6275 = torch.aten.add.Tensor %6274, %6263, %int1_7868 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.1 linear1: project the modulated activations %6275 from 3072 to 21504 features.
    // View [1,4608,3072] as a 2-D [4608,3072] matrix for the matmul.
    %int4608_7869 = torch.constant.int 4608
    %int3072_7870 = torch.constant.int 3072
    %6276 = torch.prim.ListConstruct %int4608_7869, %int3072_7870 : (!torch.int, !torch.int) -> !torch.list<int>
    %6277 = torch.aten.view %6275, %6276 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    %__auto.sampler.single_blocks.1.linear1.weight = util.global.load @__auto.sampler.single_blocks.1.linear1.weight : tensor<21504x3072xf16>
    %6278 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.1.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    // Swap dims 0 and 1 of the weight: [21504,3072] -> [3072,21504].
    %int0_7871 = torch.constant.int 0
    %int1_7872 = torch.constant.int 1
    %6279 = torch.aten.transpose.int %6278, %int0_7871, %int1_7872 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.1.linear1.bias = util.global.load @__auto.sampler.single_blocks.1.linear1.bias : tensor<21504xf16>
    %6280 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.1.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Cast bias, input and weight to f32 (dtype code 6) and multiply: [4608,3072] x [3072,21504].
    %int6_7873 = torch.constant.int 6
    %6281 = torch.prims.convert_element_type %6280, %int6_7873 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_7874 = torch.constant.int 6
    %6282 = torch.prims.convert_element_type %6277, %int6_7874 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_7875 = torch.constant.int 6
    %6283 = torch.prims.convert_element_type %6279, %int6_7875 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %6284 = torch.aten.mm %6282, %6283 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Multiply product and bias by the integer constant 1, then add them.
    %int1_7876 = torch.constant.int 1
    %6285 = torch.aten.mul.Scalar %6284, %int1_7876 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_7877 = torch.constant.int 1
    %6286 = torch.aten.mul.Scalar %6281, %int1_7877 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_7878 = torch.constant.int 1
    %6287 = torch.aten.add.Tensor %6285, %6286, %int1_7878 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    // Cast back to f16 (dtype code 5) and view as [1,4608,21504].
    %int5_7879 = torch.constant.int 5
    %6288 = torch.prims.convert_element_type %6287, %int5_7879 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_7880 = torch.constant.int 1
    %int4608_7881 = torch.constant.int 4608
    %int21504_7882 = torch.constant.int 21504
    %6289 = torch.prim.ListConstruct %int1_7880, %int4608_7881, %int21504_7882 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6290 = torch.aten.view %6288, %6289 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the linear1 output along the last dim into a 9216-wide part (%6291) and a 12288-wide
    // part (%6292), then break %6291 into three [1,24,4608,128] tensors.
    // %6291 = last-dim elements [0, 9216).
    %int-1_7883 = torch.constant.int -1
    %int0_7884 = torch.constant.int 0
    %int9216_7885 = torch.constant.int 9216
    %int1_7886 = torch.constant.int 1
    %6291 = torch.aten.slice.Tensor %6290, %int-1_7883, %int0_7884, %int9216_7885, %int1_7886 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    // %6292 = last-dim elements [9216, 21504); consumed by the GELU further down.
    %int-1_7887 = torch.constant.int -1
    %int9216_7888 = torch.constant.int 9216
    %int21504_7889 = torch.constant.int 21504
    %int1_7890 = torch.constant.int 1
    %6292 = torch.aten.slice.Tensor %6290, %int-1_7887, %int9216_7888, %int21504_7889, %int1_7890 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View [1,4608,9216] as [1,4608,3,24,128].
    %int1_7891 = torch.constant.int 1
    %int4608_7892 = torch.constant.int 4608
    %int3_7893 = torch.constant.int 3
    %int24_7894 = torch.constant.int 24
    %int128_7895 = torch.constant.int 128
    %6293 = torch.prim.ListConstruct %int1_7891, %int4608_7892, %int3_7893, %int24_7894, %int128_7895 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6294 = torch.aten.view %6291, %6293 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2,0,3,1,4): [1,4608,3,24,128] -> [3,1,24,4608,128].
    %int2_7896 = torch.constant.int 2
    %int0_7897 = torch.constant.int 0
    %int3_7898 = torch.constant.int 3
    %int1_7899 = torch.constant.int 1
    %int4_7900 = torch.constant.int 4
    %6295 = torch.prim.ListConstruct %int2_7896, %int0_7897, %int3_7898, %int1_7899, %int4_7900 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6296 = torch.aten.permute %6294, %6295 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0 along dim 0; this tensor is later scaled by norm.query_norm.scale.
    %int0_7901 = torch.constant.int 0
    %int0_7902 = torch.constant.int 0
    %6297 = torch.aten.select.int %6296, %int0_7901, %int0_7902 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 1 along dim 0; this tensor is later scaled by norm.key_norm.scale.
    %int0_7903 = torch.constant.int 0
    %int1_7904 = torch.constant.int 1
    %6298 = torch.aten.select.int %6296, %int0_7903, %int1_7904 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 2 along dim 0; passed unchanged as the third operand of the attention op.
    %int0_7905 = torch.constant.int 0
    %int2_7906 = torch.constant.int 2
    %6299 = torch.aten.select.int %6296, %int0_7905, %int2_7906 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Root-mean-square normalization of %6297 and %6298 over the last dim, each followed by a
    // multiply with a learned 128-element scale (norm.query_norm.scale / norm.key_norm.scale).
    // First tensor: x * rsqrt(mean(x^2, dim=-1, keepdim=true) + 9.9999999999999995E-7), computed in f32.
    %int6_7907 = torch.constant.int 6
    %6300 = torch.prims.convert_element_type %6297, %int6_7907 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_7908 = torch.constant.int 2
    %6301 = torch.aten.pow.Tensor_Scalar %6300, %int2_7908 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_7909 = torch.constant.int -1
    %6302 = torch.prim.ListConstruct %int-1_7909 : (!torch.int) -> !torch.list<int>
    %true_7910 = torch.constant.bool true
    %none_7911 = torch.constant.none
    %6303 = torch.aten.mean.dim %6301, %6302, %true_7910, %none_7911 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_7912 = torch.constant.float 9.9999999999999995E-7
    %int1_7913 = torch.constant.int 1
    %6304 = torch.aten.add.Scalar %6303, %float9.999990e-07_7912, %int1_7913 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %6305 = torch.aten.rsqrt %6304 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %6306 = torch.aten.mul.Tensor %6300, %6305 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    // Cast to f16 (dtype code 5) before applying the scale parameter.
    %int5_7914 = torch.constant.int 5
    %6307 = torch.prims.convert_element_type %6306, %int5_7914 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.1.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.1.norm.query_norm.scale : tensor<128xf16>
    %6308 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.1.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %6309 = torch.aten.mul.Tensor %6307, %6308 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Second tensor: the same sequence of ops applied to %6298 with norm.key_norm.scale.
    %int6_7915 = torch.constant.int 6
    %6310 = torch.prims.convert_element_type %6298, %int6_7915 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_7916 = torch.constant.int 2
    %6311 = torch.aten.pow.Tensor_Scalar %6310, %int2_7916 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_7917 = torch.constant.int -1
    %6312 = torch.prim.ListConstruct %int-1_7917 : (!torch.int) -> !torch.list<int>
    %true_7918 = torch.constant.bool true
    %none_7919 = torch.constant.none
    %6313 = torch.aten.mean.dim %6311, %6312, %true_7918, %none_7919 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_7920 = torch.constant.float 9.9999999999999995E-7
    %int1_7921 = torch.constant.int 1
    %6314 = torch.aten.add.Scalar %6313, %float9.999990e-07_7920, %int1_7921 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %6315 = torch.aten.rsqrt %6314 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %6316 = torch.aten.mul.Tensor %6310, %6315 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_7922 = torch.constant.int 5
    %6317 = torch.prims.convert_element_type %6316, %int5_7922 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.1.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.1.norm.key_norm.scale : tensor<128xf16>
    %6318 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.1.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %6319 = torch.aten.mul.Tensor %6317, %6318 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Apply the [1,1,4608,64,2,2] table %211 (defined outside this chunk) to the two normalized
    // tensors. Each 128-wide row is viewed as 64 pairs (x0, x1), and for every pair the output is
    //   out[i] = %211[..., i, 0] * x0 + %211[..., i, 1] * x1,   i in {0, 1}.
    // NOTE(review): this looks like a rotary position embedding with %211 holding per-position
    // 2x2 matrices; confirm against the definition of %211.
    // These two conversions have f16 as both input and result type.
    %int5_7923 = torch.constant.int 5
    %6320 = torch.prims.convert_element_type %6309, %int5_7923 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_7924 = torch.constant.int 5
    %6321 = torch.prims.convert_element_type %6319, %int5_7924 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // First tensor: cast to f32 and view as [1,24,4608,64,1,2].
    %int6_7925 = torch.constant.int 6
    %6322 = torch.prims.convert_element_type %6320, %int6_7925 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_7926 = torch.constant.int 1
    %int24_7927 = torch.constant.int 24
    %int4608_7928 = torch.constant.int 4608
    %int64_7929 = torch.constant.int 64
    %int1_7930 = torch.constant.int 1
    %int2_7931 = torch.constant.int 2
    %6323 = torch.prim.ListConstruct %int1_7926, %int24_7927, %int4608_7928, %int64_7929, %int1_7930, %int2_7931 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6324 = torch.aten.view %6322, %6323 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Second tensor: cast to f32 and view as [1,24,4608,64,1,2].
    %int6_7932 = torch.constant.int 6
    %6325 = torch.prims.convert_element_type %6321, %int6_7932 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_7933 = torch.constant.int 1
    %int24_7934 = torch.constant.int 24
    %int4608_7935 = torch.constant.int 4608
    %int64_7936 = torch.constant.int 64
    %int1_7937 = torch.constant.int 1
    %int2_7938 = torch.constant.int 2
    %6326 = torch.prim.ListConstruct %int1_7933, %int24_7934, %int4608_7935, %int64_7936, %int1_7937, %int2_7938 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6327 = torch.aten.view %6325, %6326 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // First tensor, term 0: %211[..., 0] * x0, broadcast from 1 to 24 along dim 1.
    %int5_7939 = torch.constant.int 5
    %int0_7940 = torch.constant.int 0
    %6328 = torch.aten.select.int %211, %int5_7939, %int0_7940 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_7941 = torch.constant.int 5
    %int0_7942 = torch.constant.int 0
    %6329 = torch.aten.select.int %6324, %int5_7941, %int0_7942 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6330 = torch.aten.mul.Tensor %6328, %6329 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // First tensor, term 1: %211[..., 1] * x1.
    %int5_7943 = torch.constant.int 5
    %int1_7944 = torch.constant.int 1
    %6331 = torch.aten.select.int %211, %int5_7943, %int1_7944 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_7945 = torch.constant.int 5
    %int1_7946 = torch.constant.int 1
    %6332 = torch.aten.select.int %6324, %int5_7945, %int1_7946 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6333 = torch.aten.mul.Tensor %6331, %6332 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two terms for the first tensor.
    %int1_7947 = torch.constant.int 1
    %6334 = torch.aten.add.Tensor %6330, %6333, %int1_7947 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second tensor: the same two terms and sum, using %6327.
    %int5_7948 = torch.constant.int 5
    %int0_7949 = torch.constant.int 0
    %6335 = torch.aten.select.int %211, %int5_7948, %int0_7949 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_7950 = torch.constant.int 5
    %int0_7951 = torch.constant.int 0
    %6336 = torch.aten.select.int %6327, %int5_7950, %int0_7951 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6337 = torch.aten.mul.Tensor %6335, %6336 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_7952 = torch.constant.int 5
    %int1_7953 = torch.constant.int 1
    %6338 = torch.aten.select.int %211, %int5_7952, %int1_7953 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_7954 = torch.constant.int 5
    %int1_7955 = torch.constant.int 1
    %6339 = torch.aten.select.int %6327, %int5_7954, %int1_7955 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6340 = torch.aten.mul.Tensor %6338, %6339 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_7956 = torch.constant.int 1
    %6341 = torch.aten.add.Tensor %6337, %6340, %int1_7956 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Merge the trailing [64,2] dims back into 128 and cast to f16 (dtype code 5): first tensor.
    %int1_7957 = torch.constant.int 1
    %int24_7958 = torch.constant.int 24
    %int4608_7959 = torch.constant.int 4608
    %int128_7960 = torch.constant.int 128
    %6342 = torch.prim.ListConstruct %int1_7957, %int24_7958, %int4608_7959, %int128_7960 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6343 = torch.aten.view %6334, %6342 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_7961 = torch.constant.int 5
    %6344 = torch.prims.convert_element_type %6343, %int5_7961 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Same merge and cast for the second tensor.
    %int1_7962 = torch.constant.int 1
    %int24_7963 = torch.constant.int 24
    %int4608_7964 = torch.constant.int 4608
    %int128_7965 = torch.constant.int 128
    %6345 = torch.prim.ListConstruct %int1_7962, %int24_7963, %int4608_7964, %int128_7965 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6346 = torch.aten.view %6341, %6345 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_7966 = torch.constant.int 5
    %6347 = torch.prims.convert_element_type %6346, %int5_7966 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled dot-product attention over %6344, %6347 and %6299, then a reshape back to [1,4608,3072].
    // The op is invoked through the generic torch.operator form. The trailing operands are 0.0,
    // false, none, none (presumably dropout_p, is_causal, attn_mask, scale; TODO confirm against the
    // aten schema).
    %float0.000000e00_7967 = torch.constant.float 0.000000e+00
    %false_7968 = torch.constant.bool false
    %none_7969 = torch.constant.none
    %none_7970 = torch.constant.none
    %6348:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%6344, %6347, %6299, %float0.000000e00_7967, %false_7968, %none_7969, %none_7970) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Only result #0 is used in the visible lines. Permute with order (0,2,1,3):
    // [1,24,4608,128] -> [1,4608,24,128].
    %int0_7971 = torch.constant.int 0
    %int2_7972 = torch.constant.int 2
    %int1_7973 = torch.constant.int 1
    %int3_7974 = torch.constant.int 3
    %6349 = torch.prim.ListConstruct %int0_7971, %int2_7972, %int1_7973, %int3_7974 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6350 = torch.aten.permute %6348#0, %6349 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the trailing [24,128] dims into 3072.
    %int1_7975 = torch.constant.int 1
    %int4608_7976 = torch.constant.int 4608
    %int3072_7977 = torch.constant.int 3072
    %6351 = torch.prim.ListConstruct %int1_7975, %int4608_7976, %int3072_7977 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6352 = torch.aten.view %6350, %6351 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // GELU (called with the "tanh" string argument) on the 12288-wide slice %6292, then concatenate
    // it after the attention output %6352 along dim 2: 3072 + 12288 = 15360 features.
    %str_7978 = torch.constant.str "tanh"
    %6353 = torch.aten.gelu %6292, %str_7978 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %6354 = torch.prim.ListConstruct %6352, %6353 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_7979 = torch.constant.int 2
    %6355 = torch.aten.cat %6354, %int2_7979 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // View as a 2-D [4608,15360] matrix for the linear2 matmul.
    %int4608_7980 = torch.constant.int 4608
    %int15360_7981 = torch.constant.int 15360
    %6356 = torch.prim.ListConstruct %int4608_7980, %int15360_7981 : (!torch.int, !torch.int) -> !torch.list<int>
    %6357 = torch.aten.view %6355, %6356 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // single_blocks.1 linear2: project the concatenated [4608,15360] activations to 3072 features,
    // scale by the modulation chunk %6265, and add the result to the block input %6247.
    %__auto.sampler.single_blocks.1.linear2.weight = util.global.load @__auto.sampler.single_blocks.1.linear2.weight : tensor<3072x15360xf16>
    %6358 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.1.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    // Swap dims 0 and 1 of the weight: [3072,15360] -> [15360,3072].
    %int0_7982 = torch.constant.int 0
    %int1_7983 = torch.constant.int 1
    %6359 = torch.aten.transpose.int %6358, %int0_7982, %int1_7983 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.1.linear2.bias = util.global.load @__auto.sampler.single_blocks.1.linear2.bias : tensor<3072xf16>
    %6360 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.1.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Cast bias, input and weight to f32 (dtype code 6) and multiply: [4608,15360] x [15360,3072].
    %int6_7984 = torch.constant.int 6
    %6361 = torch.prims.convert_element_type %6360, %int6_7984 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_7985 = torch.constant.int 6
    %6362 = torch.prims.convert_element_type %6357, %int6_7985 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_7986 = torch.constant.int 6
    %6363 = torch.prims.convert_element_type %6359, %int6_7986 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %6364 = torch.aten.mm %6362, %6363 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Multiply product and bias by the integer constant 1, then add them.
    %int1_7987 = torch.constant.int 1
    %6365 = torch.aten.mul.Scalar %6364, %int1_7987 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_7988 = torch.constant.int 1
    %6366 = torch.aten.mul.Scalar %6361, %int1_7988 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_7989 = torch.constant.int 1
    %6367 = torch.aten.add.Tensor %6365, %6366, %int1_7989 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    // Cast back to f16 (dtype code 5) and view as [1,4608,3072].
    %int5_7990 = torch.constant.int 5
    %6368 = torch.prims.convert_element_type %6367, %int5_7990 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_7991 = torch.constant.int 1
    %int4608_7992 = torch.constant.int 4608
    %int3072_7993 = torch.constant.int 3072
    %6369 = torch.prim.ListConstruct %int1_7991, %int4608_7992, %int3072_7993 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6370 = torch.aten.view %6368, %6369 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // %6372 = %6247 + %6265 * projection; %6265 is [1,1,3072] and broadcasts over dim 1.
    %6371 = torch.aten.mul.Tensor %6265, %6370 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_7994 = torch.constant.int 1
    %6372 = torch.aten.add.Tensor %6247, %6371, %int1_7994 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.2 modulation: SiLU of %119 (defined outside this chunk), then a linear layer
    // to 9216 features that is sliced into three [1,1,3072] chunks along the last dim.
    %6373 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.2.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.2.modulation.lin.weight : tensor<9216x3072xf16>
    %6374 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.2.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    // Swap dims 0 and 1 of the weight: [9216,3072] -> [3072,9216].
    %int0_7995 = torch.constant.int 0
    %int1_7996 = torch.constant.int 1
    %6375 = torch.aten.transpose.int %6374, %int0_7995, %int1_7996 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.2.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.2.modulation.lin.bias : tensor<9216xf16>
    %6376 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.2.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Cast bias, input and weight to f32 (dtype code 6), matmul, add bias, cast back to f16 (dtype code 5).
    %int6_7997 = torch.constant.int 6
    %6377 = torch.prims.convert_element_type %6376, %int6_7997 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_7998 = torch.constant.int 6
    %6378 = torch.prims.convert_element_type %6373, %int6_7998 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_7999 = torch.constant.int 6
    %6379 = torch.prims.convert_element_type %6375, %int6_7999 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %6380 = torch.aten.mm %6378, %6379 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    %int1_8000 = torch.constant.int 1
    %6381 = torch.aten.mul.Scalar %6380, %int1_8000 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_8001 = torch.constant.int 1
    %6382 = torch.aten.mul.Scalar %6377, %int1_8001 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_8002 = torch.constant.int 1
    %6383 = torch.aten.add.Tensor %6381, %6382, %int1_8002 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_8003 = torch.constant.int 5
    %6384 = torch.prims.convert_element_type %6383, %int5_8003 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Slice dim 0 from 0 to 9223372036854775807 with step 1; the result type equals the input type.
    %int0_8004 = torch.constant.int 0
    %int0_8005 = torch.constant.int 0
    %int9223372036854775807_8006 = torch.constant.int 9223372036854775807
    %int1_8007 = torch.constant.int 1
    %6385 = torch.aten.slice.Tensor %6384, %int0_8004, %int0_8005, %int9223372036854775807_8006, %int1_8007 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a size-1 dim at position 1: [1,9216] -> [1,1,9216].
    %int1_8008 = torch.constant.int 1
    %6386 = torch.aten.unsqueeze %6385, %int1_8008 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Same kind of slice on dim 2; the result type again equals the input type.
    %int2_8009 = torch.constant.int 2
    %int0_8010 = torch.constant.int 0
    %int9223372036854775807_8011 = torch.constant.int 9223372036854775807
    %int1_8012 = torch.constant.int 1
    %6387 = torch.aten.slice.Tensor %6386, %int2_8009, %int0_8010, %int9223372036854775807_8011, %int1_8012 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // %6388 = last-dim elements [0, 3072).
    %int-1_8013 = torch.constant.int -1
    %int0_8014 = torch.constant.int 0
    %int3072_8015 = torch.constant.int 3072
    %int1_8016 = torch.constant.int 1
    %6388 = torch.aten.slice.Tensor %6387, %int-1_8013, %int0_8014, %int3072_8015, %int1_8016 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %6389 = last-dim elements [3072, 6144); incremented by 1 below.
    %int-1_8017 = torch.constant.int -1
    %int3072_8018 = torch.constant.int 3072
    %int6144_8019 = torch.constant.int 6144
    %int1_8020 = torch.constant.int 1
    %6389 = torch.aten.slice.Tensor %6387, %int-1_8017, %int3072_8018, %int6144_8019, %int1_8020 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %6390 = last-dim elements [6144, 9216).
    %int-1_8021 = torch.constant.int -1
    %int6144_8022 = torch.constant.int 6144
    %int9216_8023 = torch.constant.int 9216
    %int1_8024 = torch.constant.int 1
    %6390 = torch.aten.slice.Tensor %6387, %int-1_8021, %int6144_8022, %int9216_8023, %int1_8024 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %6391 = %6389 + 1.
    %int1_8025 = torch.constant.int 1
    %int1_8026 = torch.constant.int 1
    %6391 = torch.aten.add.Scalar %6389, %int1_8025, %int1_8026 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %6372 over its last dim (no learned affine parameters are
    // applied here), then modulate it as %6391 * normalized + %6388.
    // Statistics are computed in f32 (dtype code 6 yields f32 here).
    %int6_8027 = torch.constant.int 6
    %6392 = torch.prims.convert_element_type %6372, %int6_8027 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    // var_mean over dim 2 with correction 0 and keepdim = true;
    // result0 is consumed as the variance and result1 as the mean below.
    %int2_8028 = torch.constant.int 2
    %6393 = torch.prim.ListConstruct %int2_8028 : (!torch.int) -> !torch.list<int>
    %int0_8029 = torch.constant.int 0
    %true_8030 = torch.constant.bool true
    %result0_8031, %result1_8032 = torch.aten.var_mean.correction %6392, %6393, %int0_8029, %true_8030 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // rsqrt(variance + ~1e-6).
    %float9.999990e-07_8033 = torch.constant.float 9.9999999999999995E-7
    %int1_8034 = torch.constant.int 1
    %6394 = torch.aten.add.Scalar %result0_8031, %float9.999990e-07_8033, %int1_8034 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %6395 = torch.aten.rsqrt %6394 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // (x - mean): the f16 input minus the f32 mean produces an f32 result.
    %int1_8035 = torch.constant.int 1
    %6396 = torch.aten.sub.Tensor %6372, %result1_8032, %int1_8035 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %6397 = torch.aten.mul.Tensor %6396, %6395 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    // Back to f16 before applying the modulation terms.
    %int5_8036 = torch.constant.int 5
    %6398 = torch.prims.convert_element_type %6397, %int5_8036 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Multiply by %6391, then add %6388; both [1,1,3072] operands broadcast over dim 1.
    %6399 = torch.aten.mul.Tensor %6391, %6398 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_8037 = torch.constant.int 1
    %6400 = torch.aten.add.Tensor %6399, %6388, %int1_8037 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Drop the leading unit dim so the following matmul sees a 2-D operand: [4608,3072].
    %int4608_8038 = torch.constant.int 4608
    %int3072_8039 = torch.constant.int 3072
    %6401 = torch.prim.ListConstruct %int4608_8038, %int3072_8039 : (!torch.int, !torch.int) -> !torch.list<int>
    %6402 = torch.aten.view %6400, %6401 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // single_blocks.2.linear1: computes %6402 @ weight^T + bias in f32, then
    // casts the result to f16 and restores the leading unit dim.
    // Load the [21504,3072] f16 weight and transpose it to [3072,21504].
    %__auto.sampler.single_blocks.2.linear1.weight = util.global.load @__auto.sampler.single_blocks.2.linear1.weight : tensor<21504x3072xf16>
    %6403 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.2.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_8040 = torch.constant.int 0
    %int1_8041 = torch.constant.int 1
    %6404 = torch.aten.transpose.int %6403, %int0_8040, %int1_8041 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    // Load the [21504] f16 bias.
    %__auto.sampler.single_blocks.2.linear1.bias = util.global.load @__auto.sampler.single_blocks.2.linear1.bias : tensor<21504xf16>
    %6405 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.2.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Promote bias, activation and weight to f32 (dtype code 6 yields f32 here).
    %int6_8042 = torch.constant.int 6
    %6406 = torch.prims.convert_element_type %6405, %int6_8042 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_8043 = torch.constant.int 6
    %6407 = torch.prims.convert_element_type %6402, %int6_8043 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_8044 = torch.constant.int 6
    %6408 = torch.prims.convert_element_type %6404, %int6_8044 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    // [4608,3072] x [3072,21504] -> [4608,21504].
    %6409 = torch.aten.mm %6407, %6408 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Both operands are scaled by the integer constant 1 before the add, so the
    // values are unchanged.
    %int1_8045 = torch.constant.int 1
    %6410 = torch.aten.mul.Scalar %6409, %int1_8045 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_8046 = torch.constant.int 1
    %6411 = torch.aten.mul.Scalar %6406, %int1_8046 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    // Add the bias (broadcast over rows).
    %int1_8047 = torch.constant.int 1
    %6412 = torch.aten.add.Tensor %6410, %6411, %int1_8047 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    // Cast back to f16 (dtype code 5 yields f16 here).
    %int5_8048 = torch.constant.int 5
    %6413 = torch.prims.convert_element_type %6412, %int5_8048 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    // Reshape [4608,21504] -> [1,4608,21504].
    %int1_8049 = torch.constant.int 1
    %int4608_8050 = torch.constant.int 4608
    %int21504_8051 = torch.constant.int 21504
    %6414 = torch.prim.ListConstruct %int1_8049, %int4608_8050, %int21504_8051 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6415 = torch.aten.view %6413, %6414 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the 21504-wide projection %6415 along the last dim:
    //   [0:9216]     -> %6416, reshaped below into three [1,24,4608,128] tensors
    //   [9216:21504] -> %6417, the 12288-wide branch that is passed through GELU later
    %int-1_8052 = torch.constant.int -1
    %int0_8053 = torch.constant.int 0
    %int9216_8054 = torch.constant.int 9216
    %int1_8055 = torch.constant.int 1
    %6416 = torch.aten.slice.Tensor %6415, %int-1_8052, %int0_8053, %int9216_8054, %int1_8055 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_8056 = torch.constant.int -1
    %int9216_8057 = torch.constant.int 9216
    %int21504_8058 = torch.constant.int 21504
    %int1_8059 = torch.constant.int 1
    %6417 = torch.aten.slice.Tensor %6415, %int-1_8056, %int9216_8057, %int21504_8058, %int1_8059 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View 9216 as 3 x 24 x 128: [1,4608,9216] -> [1,4608,3,24,128].
    %int1_8060 = torch.constant.int 1
    %int4608_8061 = torch.constant.int 4608
    %int3_8062 = torch.constant.int 3
    %int24_8063 = torch.constant.int 24
    %int128_8064 = torch.constant.int 128
    %6418 = torch.prim.ListConstruct %int1_8060, %int4608_8061, %int3_8062, %int24_8063, %int128_8064 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6419 = torch.aten.view %6416, %6418 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2,0,3,1,4): [1,4608,3,24,128] -> [3,1,24,4608,128],
    // moving the size-3 axis to the front.
    %int2_8065 = torch.constant.int 2
    %int0_8066 = torch.constant.int 0
    %int3_8067 = torch.constant.int 3
    %int1_8068 = torch.constant.int 1
    %int4_8069 = torch.constant.int 4
    %6420 = torch.prim.ListConstruct %int2_8065, %int0_8066, %int3_8067, %int1_8068, %int4_8069 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6421 = torch.aten.permute %6419, %6420 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0 along dim 0 -> %6422 (first attention operand after norm and rotation).
    %int0_8070 = torch.constant.int 0
    %int0_8071 = torch.constant.int 0
    %6422 = torch.aten.select.int %6421, %int0_8070, %int0_8071 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 1 along dim 0 -> %6423 (second attention operand after norm and rotation).
    %int0_8072 = torch.constant.int 0
    %int1_8073 = torch.constant.int 1
    %6423 = torch.aten.select.int %6421, %int0_8072, %int1_8073 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 2 along dim 0 -> %6424 (passed to the attention op unchanged as its third operand).
    %int0_8074 = torch.constant.int 0
    %int2_8075 = torch.constant.int 2
    %6424 = torch.aten.select.int %6421, %int0_8074, %int2_8075 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // RMS-style normalization of %6422 over the last (128-wide) dim:
    // x * rsqrt(mean(x^2, dim=-1, keepdim=true) + ~1e-6), computed in f32,
    // cast to f16, then multiplied by the learned query_norm.scale vector.
    %int6_8076 = torch.constant.int 6
    %6425 = torch.prims.convert_element_type %6422, %int6_8076 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // x^2
    %int2_8077 = torch.constant.int 2
    %6426 = torch.aten.pow.Tensor_Scalar %6425, %int2_8077 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // mean over dim -1 with keepdim = true and no dtype override.
    %int-1_8078 = torch.constant.int -1
    %6427 = torch.prim.ListConstruct %int-1_8078 : (!torch.int) -> !torch.list<int>
    %true_8079 = torch.constant.bool true
    %none_8080 = torch.constant.none
    %6428 = torch.aten.mean.dim %6426, %6427, %true_8079, %none_8080 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // rsqrt(mean + ~1e-6), then scale the f32 input by it.
    %float9.999990e-07_8081 = torch.constant.float 9.9999999999999995E-7
    %int1_8082 = torch.constant.int 1
    %6429 = torch.aten.add.Scalar %6428, %float9.999990e-07_8081, %int1_8082 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %6430 = torch.aten.rsqrt %6429 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %6431 = torch.aten.mul.Tensor %6425, %6430 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    // Cast to f16 and apply the [128] query_norm scale (broadcast over leading dims).
    %int5_8083 = torch.constant.int 5
    %6432 = torch.prims.convert_element_type %6431, %int5_8083 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.2.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.2.norm.query_norm.scale : tensor<128xf16>
    %6433 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.2.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %6434 = torch.aten.mul.Tensor %6432, %6433 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Same normalization applied to %6423, using key_norm.scale.
    %int6_8084 = torch.constant.int 6
    %6435 = torch.prims.convert_element_type %6423, %int6_8084 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_8085 = torch.constant.int 2
    %6436 = torch.aten.pow.Tensor_Scalar %6435, %int2_8085 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_8086 = torch.constant.int -1
    %6437 = torch.prim.ListConstruct %int-1_8086 : (!torch.int) -> !torch.list<int>
    %true_8087 = torch.constant.bool true
    %none_8088 = torch.constant.none
    %6438 = torch.aten.mean.dim %6436, %6437, %true_8087, %none_8088 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_8089 = torch.constant.float 9.9999999999999995E-7
    %int1_8090 = torch.constant.int 1
    %6439 = torch.aten.add.Scalar %6438, %float9.999990e-07_8089, %int1_8090 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %6440 = torch.aten.rsqrt %6439 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %6441 = torch.aten.mul.Tensor %6435, %6440 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_8091 = torch.constant.int 5
    %6442 = torch.prims.convert_element_type %6441, %int5_8091 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.2.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.2.norm.key_norm.scale : tensor<128xf16>
    %6443 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.2.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %6444 = torch.aten.mul.Tensor %6442, %6443 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Mix adjacent element pairs of the normalized tensors %6434 and %6444 with
    // the per-position 2x2 coefficients in %211 ([1,1,4608,64,2,2], f32, defined
    // above this section).
    // NOTE(review): %211 is presumably the precomputed rotary position embedding
    // table - confirm at its definition.
    // For each tensor x viewed as [1,24,4608,64,1,2], the result is
    //   out[..., i] = %211[..., i, 0] * x[..., 0] + %211[..., i, 1] * x[..., 1]
    // f16 -> f16 casts: input and result types are identical.
    %int5_8092 = torch.constant.int 5
    %6445 = torch.prims.convert_element_type %6434, %int5_8092 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_8093 = torch.constant.int 5
    %6446 = torch.prims.convert_element_type %6444, %int5_8093 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // First tensor: promote to f32 and view the 128-wide dim as 64 x 1 x 2.
    %int6_8094 = torch.constant.int 6
    %6447 = torch.prims.convert_element_type %6445, %int6_8094 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_8095 = torch.constant.int 1
    %int24_8096 = torch.constant.int 24
    %int4608_8097 = torch.constant.int 4608
    %int64_8098 = torch.constant.int 64
    %int1_8099 = torch.constant.int 1
    %int2_8100 = torch.constant.int 2
    %6448 = torch.prim.ListConstruct %int1_8095, %int24_8096, %int4608_8097, %int64_8098, %int1_8099, %int2_8100 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6449 = torch.aten.view %6447, %6448 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Second tensor: same promotion and view.
    %int6_8101 = torch.constant.int 6
    %6450 = torch.prims.convert_element_type %6446, %int6_8101 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_8102 = torch.constant.int 1
    %int24_8103 = torch.constant.int 24
    %int4608_8104 = torch.constant.int 4608
    %int64_8105 = torch.constant.int 64
    %int1_8106 = torch.constant.int 1
    %int2_8107 = torch.constant.int 2
    %6451 = torch.prim.ListConstruct %int1_8102, %int24_8103, %int4608_8104, %int64_8105, %int1_8106, %int2_8107 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6452 = torch.aten.view %6450, %6451 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // First tensor, term 0: %211[..., 0] * x[..., 0]; the size-1 head dim of
    // %211 broadcasts across the 24 heads.
    %int5_8108 = torch.constant.int 5
    %int0_8109 = torch.constant.int 0
    %6453 = torch.aten.select.int %211, %int5_8108, %int0_8109 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_8110 = torch.constant.int 5
    %int0_8111 = torch.constant.int 0
    %6454 = torch.aten.select.int %6449, %int5_8110, %int0_8111 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6455 = torch.aten.mul.Tensor %6453, %6454 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // First tensor, term 1: %211[..., 1] * x[..., 1].
    %int5_8112 = torch.constant.int 5
    %int1_8113 = torch.constant.int 1
    %6456 = torch.aten.select.int %211, %int5_8112, %int1_8113 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_8114 = torch.constant.int 5
    %int1_8115 = torch.constant.int 1
    %6457 = torch.aten.select.int %6449, %int5_8114, %int1_8115 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6458 = torch.aten.mul.Tensor %6456, %6457 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two terms for the first tensor.
    %int1_8116 = torch.constant.int 1
    %6459 = torch.aten.add.Tensor %6455, %6458, %int1_8116 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second tensor, term 0.
    %int5_8117 = torch.constant.int 5
    %int0_8118 = torch.constant.int 0
    %6460 = torch.aten.select.int %211, %int5_8117, %int0_8118 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_8119 = torch.constant.int 5
    %int0_8120 = torch.constant.int 0
    %6461 = torch.aten.select.int %6452, %int5_8119, %int0_8120 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6462 = torch.aten.mul.Tensor %6460, %6461 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second tensor, term 1.
    %int5_8121 = torch.constant.int 5
    %int1_8122 = torch.constant.int 1
    %6463 = torch.aten.select.int %211, %int5_8121, %int1_8122 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_8123 = torch.constant.int 5
    %int1_8124 = torch.constant.int 1
    %6464 = torch.aten.select.int %6452, %int5_8123, %int1_8124 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6465 = torch.aten.mul.Tensor %6463, %6464 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two terms for the second tensor.
    %int1_8125 = torch.constant.int 1
    %6466 = torch.aten.add.Tensor %6462, %6465, %int1_8125 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Collapse [.., 64, 2] back to 128 and cast to f16: first tensor -> %6469.
    %int1_8126 = torch.constant.int 1
    %int24_8127 = torch.constant.int 24
    %int4608_8128 = torch.constant.int 4608
    %int128_8129 = torch.constant.int 128
    %6467 = torch.prim.ListConstruct %int1_8126, %int24_8127, %int4608_8128, %int128_8129 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6468 = torch.aten.view %6459, %6467 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_8130 = torch.constant.int 5
    %6469 = torch.prims.convert_element_type %6468, %int5_8130 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Same collapse and cast for the second tensor -> %6472.
    %int1_8131 = torch.constant.int 1
    %int24_8132 = torch.constant.int 24
    %int4608_8133 = torch.constant.int 4608
    %int128_8134 = torch.constant.int 128
    %6470 = torch.prim.ListConstruct %int1_8131, %int24_8132, %int4608_8133, %int128_8134 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6471 = torch.aten.view %6466, %6470 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_8135 = torch.constant.int 5
    %6472 = torch.prims.convert_element_type %6471, %int5_8135 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled dot-product attention over three [1,24,4608,128] f16 operands:
    // %6469, %6472 and %6424.
    // NOTE(review): per the PyTorch schema for this op the trailing arguments
    // should be dropout_p (0.0), is_causal (false), attn_mask (none) and
    // scale (none) - confirm against the PyTorch op definition.
    %float0.000000e00_8136 = torch.constant.float 0.000000e+00
    %false_8137 = torch.constant.bool false
    %none_8138 = torch.constant.none
    %none_8139 = torch.constant.none
    // Result #0 is the [1,24,4608,128] f16 attention output; result #1
    // ([1,24,4608], f32) is not referenced anywhere in this section.
    %6473:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%6469, %6472, %6424, %float0.000000e00_8136, %false_8137, %none_8138, %none_8139) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2: [1,24,4608,128] -> [1,4608,24,128].
    %int0_8140 = torch.constant.int 0
    %int2_8141 = torch.constant.int 2
    %int1_8142 = torch.constant.int 1
    %int3_8143 = torch.constant.int 3
    %6474 = torch.prim.ListConstruct %int0_8140, %int2_8141, %int1_8142, %int3_8143 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6475 = torch.aten.permute %6473#0, %6474 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the 24 x 128 trailing dims into 3072: -> [1,4608,3072].
    %int1_8144 = torch.constant.int 1
    %int4608_8145 = torch.constant.int 4608
    %int3072_8146 = torch.constant.int 3072
    %6476 = torch.prim.ListConstruct %int1_8144, %int4608_8145, %int3072_8146 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6477 = torch.aten.view %6475, %6476 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Apply GELU with the "tanh" approximation to the 12288-wide branch %6417.
    %str_8147 = torch.constant.str "tanh"
    %6478 = torch.aten.gelu %6417, %str_8147 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    // Concatenate the attention result %6477 (3072) and the GELU result (12288)
    // along dim 2, giving a 15360-wide tensor.
    %6479 = torch.prim.ListConstruct %6477, %6478 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_8148 = torch.constant.int 2
    %6480 = torch.aten.cat %6479, %int2_8148 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Drop the leading unit dim so the following matmul sees a 2-D operand: [4608,15360].
    %int4608_8149 = torch.constant.int 4608
    %int15360_8150 = torch.constant.int 15360
    %6481 = torch.prim.ListConstruct %int4608_8149, %int15360_8150 : (!torch.int, !torch.int) -> !torch.list<int>
    %6482 = torch.aten.view %6480, %6481 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // single_blocks.2.linear2: computes %6482 @ weight^T + bias in f32, then
    // casts the result to f16 and restores the leading unit dim.
    // Load the [3072,15360] f16 weight and transpose it to [15360,3072].
    %__auto.sampler.single_blocks.2.linear2.weight = util.global.load @__auto.sampler.single_blocks.2.linear2.weight : tensor<3072x15360xf16>
    %6483 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.2.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_8151 = torch.constant.int 0
    %int1_8152 = torch.constant.int 1
    %6484 = torch.aten.transpose.int %6483, %int0_8151, %int1_8152 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    // Load the [3072] f16 bias.
    %__auto.sampler.single_blocks.2.linear2.bias = util.global.load @__auto.sampler.single_blocks.2.linear2.bias : tensor<3072xf16>
    %6485 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.2.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Promote bias, activation and weight to f32 (dtype code 6 yields f32 here).
    %int6_8153 = torch.constant.int 6
    %6486 = torch.prims.convert_element_type %6485, %int6_8153 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_8154 = torch.constant.int 6
    %6487 = torch.prims.convert_element_type %6482, %int6_8154 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_8155 = torch.constant.int 6
    %6488 = torch.prims.convert_element_type %6484, %int6_8155 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    // [4608,15360] x [15360,3072] -> [4608,3072].
    %6489 = torch.aten.mm %6487, %6488 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Both operands are scaled by the integer constant 1 before the add, so the
    // values are unchanged.
    %int1_8156 = torch.constant.int 1
    %6490 = torch.aten.mul.Scalar %6489, %int1_8156 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_8157 = torch.constant.int 1
    %6491 = torch.aten.mul.Scalar %6486, %int1_8157 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Add the bias (broadcast over rows).
    %int1_8158 = torch.constant.int 1
    %6492 = torch.aten.add.Tensor %6490, %6491, %int1_8158 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    // Cast back to f16 (dtype code 5 yields f16 here).
    %int5_8159 = torch.constant.int 5
    %6493 = torch.prims.convert_element_type %6492, %int5_8159 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    // Reshape [4608,3072] -> [1,4608,3072].
    %int1_8160 = torch.constant.int 1
    %int4608_8161 = torch.constant.int 4608
    %int3072_8162 = torch.constant.int 3072
    %6494 = torch.prim.ListConstruct %int1_8160, %int4608_8161, %int3072_8162 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6495 = torch.aten.view %6493, %6494 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Gate the linear2 output %6495 with the [1,1,3072] chunk %6390 (broadcast
    // over dim 1), then add the result to %6372, the tensor that was normalized
    // at the start of this block: %6497 = %6372 + %6390 * %6495.
    %6496 = torch.aten.mul.Tensor %6390, %6495 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_8163 = torch.constant.int 1
    %6497 = torch.aten.add.Tensor %6372, %6496, %int1_8163 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.3 modulation: SiLU(%119) -> linear (3072 -> 9216) -> three
    // 3072-wide chunks.
    // NOTE(review): %119 ([1,3072], f16) is defined above this section;
    // presumably the shared conditioning vector - confirm at its definition.
    %6498 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the [9216,3072] f16 weight and transpose it to [3072,9216].
    %__auto.sampler.single_blocks.3.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.3.modulation.lin.weight : tensor<9216x3072xf16>
    %6499 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.3.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_8164 = torch.constant.int 0
    %int1_8165 = torch.constant.int 1
    %6500 = torch.aten.transpose.int %6499, %int0_8164, %int1_8165 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    // Load the [9216] f16 bias.
    %__auto.sampler.single_blocks.3.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.3.modulation.lin.bias : tensor<9216xf16>
    %6501 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.3.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Promote bias, activation and weight to f32 (dtype code 6 yields f32 here).
    %int6_8166 = torch.constant.int 6
    %6502 = torch.prims.convert_element_type %6501, %int6_8166 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_8167 = torch.constant.int 6
    %6503 = torch.prims.convert_element_type %6498, %int6_8167 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_8168 = torch.constant.int 6
    %6504 = torch.prims.convert_element_type %6500, %int6_8168 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    // [1,3072] x [3072,9216] -> [1,9216].
    %6505 = torch.aten.mm %6503, %6504 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Both operands are scaled by the integer constant 1 before the add, so the
    // values are unchanged.
    %int1_8169 = torch.constant.int 1
    %6506 = torch.aten.mul.Scalar %6505, %int1_8169 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_8170 = torch.constant.int 1
    %6507 = torch.aten.mul.Scalar %6502, %int1_8170 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    // Add the bias, then cast to f16 (dtype code 5 yields f16 here).
    %int1_8171 = torch.constant.int 1
    %6508 = torch.aten.add.Tensor %6506, %6507, %int1_8171 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_8172 = torch.constant.int 5
    %6509 = torch.prims.convert_element_type %6508, %int5_8172 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1); shape-preserving.
    %int0_8173 = torch.constant.int 0
    %int0_8174 = torch.constant.int 0
    %int9223372036854775807_8175 = torch.constant.int 9223372036854775807
    %int1_8176 = torch.constant.int 1
    %6510 = torch.aten.slice.Tensor %6509, %int0_8173, %int0_8174, %int9223372036854775807_8175, %int1_8176 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a unit dim at position 1: [1,9216] -> [1,1,9216].
    %int1_8177 = torch.constant.int 1
    %6511 = torch.aten.unsqueeze %6510, %int1_8177 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Full-range slice on dim 2; shape-preserving.
    %int2_8178 = torch.constant.int 2
    %int0_8179 = torch.constant.int 0
    %int9223372036854775807_8180 = torch.constant.int 9223372036854775807
    %int1_8181 = torch.constant.int 1
    %6512 = torch.aten.slice.Tensor %6511, %int2_8178, %int0_8179, %int9223372036854775807_8180, %int1_8181 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Chunk [0:3072] -> %6513; used below as the additive term after normalization.
    %int-1_8182 = torch.constant.int -1
    %int0_8183 = torch.constant.int 0
    %int3072_8184 = torch.constant.int 3072
    %int1_8185 = torch.constant.int 1
    %6513 = torch.aten.slice.Tensor %6512, %int-1_8182, %int0_8183, %int3072_8184, %int1_8185 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk [3072:6144] -> %6514; becomes the multiplicative term once 1 is added (%6516).
    %int-1_8186 = torch.constant.int -1
    %int3072_8187 = torch.constant.int 3072
    %int6144_8188 = torch.constant.int 6144
    %int1_8189 = torch.constant.int 1
    %6514 = torch.aten.slice.Tensor %6512, %int-1_8186, %int3072_8187, %int6144_8188, %int1_8189 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk [6144:9216] -> %6515; its consumer is not within this section.
    %int-1_8190 = torch.constant.int -1
    %int6144_8191 = torch.constant.int 6144
    %int9216_8192 = torch.constant.int 9216
    %int1_8193 = torch.constant.int 1
    %6515 = torch.aten.slice.Tensor %6512, %int-1_8190, %int6144_8191, %int9216_8192, %int1_8193 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %6516 = %6514 + 1 (add.Scalar with alpha 1).
    %int1_8194 = torch.constant.int 1
    %int1_8195 = torch.constant.int 1
    %6516 = torch.aten.add.Scalar %6514, %int1_8194, %int1_8195 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %6497 over its last dim (no learned affine parameters are
    // applied here), then modulate it as %6516 * normalized + %6513.
    // Statistics are computed in f32 (dtype code 6 yields f32 here).
    %int6_8196 = torch.constant.int 6
    %6517 = torch.prims.convert_element_type %6497, %int6_8196 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    // var_mean over dim 2 with correction 0 and keepdim = true;
    // result0 is consumed as the variance and result1 as the mean below.
    %int2_8197 = torch.constant.int 2
    %6518 = torch.prim.ListConstruct %int2_8197 : (!torch.int) -> !torch.list<int>
    %int0_8198 = torch.constant.int 0
    %true_8199 = torch.constant.bool true
    %result0_8200, %result1_8201 = torch.aten.var_mean.correction %6517, %6518, %int0_8198, %true_8199 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // rsqrt(variance + ~1e-6).
    %float9.999990e-07_8202 = torch.constant.float 9.9999999999999995E-7
    %int1_8203 = torch.constant.int 1
    %6519 = torch.aten.add.Scalar %result0_8200, %float9.999990e-07_8202, %int1_8203 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %6520 = torch.aten.rsqrt %6519 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // (x - mean): the f16 input minus the f32 mean produces an f32 result.
    %int1_8204 = torch.constant.int 1
    %6521 = torch.aten.sub.Tensor %6497, %result1_8201, %int1_8204 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %6522 = torch.aten.mul.Tensor %6521, %6520 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    // Back to f16 before applying the modulation terms.
    %int5_8205 = torch.constant.int 5
    %6523 = torch.prims.convert_element_type %6522, %int5_8205 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Multiply by %6516, then add %6513; both [1,1,3072] operands broadcast over dim 1.
    %6524 = torch.aten.mul.Tensor %6516, %6523 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_8206 = torch.constant.int 1
    %6525 = torch.aten.add.Tensor %6524, %6513, %int1_8206 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Drop the leading unit dim so the following matmul sees a 2-D operand: [4608,3072].
    %int4608_8207 = torch.constant.int 4608
    %int3072_8208 = torch.constant.int 3072
    %6526 = torch.prim.ListConstruct %int4608_8207, %int3072_8208 : (!torch.int, !torch.int) -> !torch.list<int>
    %6527 = torch.aten.view %6525, %6526 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // single_blocks.3.linear1: computes %6527 @ weight^T + bias in f32, then
    // casts the result to f16 and restores the leading unit dim.
    // Load the [21504,3072] f16 weight and transpose it to [3072,21504].
    %__auto.sampler.single_blocks.3.linear1.weight = util.global.load @__auto.sampler.single_blocks.3.linear1.weight : tensor<21504x3072xf16>
    %6528 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.3.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_8209 = torch.constant.int 0
    %int1_8210 = torch.constant.int 1
    %6529 = torch.aten.transpose.int %6528, %int0_8209, %int1_8210 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    // Load the [21504] f16 bias.
    %__auto.sampler.single_blocks.3.linear1.bias = util.global.load @__auto.sampler.single_blocks.3.linear1.bias : tensor<21504xf16>
    %6530 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.3.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Promote bias, activation and weight to f32 (dtype code 6 yields f32 here).
    %int6_8211 = torch.constant.int 6
    %6531 = torch.prims.convert_element_type %6530, %int6_8211 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_8212 = torch.constant.int 6
    %6532 = torch.prims.convert_element_type %6527, %int6_8212 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_8213 = torch.constant.int 6
    %6533 = torch.prims.convert_element_type %6529, %int6_8213 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    // [4608,3072] x [3072,21504] -> [4608,21504].
    %6534 = torch.aten.mm %6532, %6533 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Both operands are scaled by the integer constant 1 before the add, so the
    // values are unchanged.
    %int1_8214 = torch.constant.int 1
    %6535 = torch.aten.mul.Scalar %6534, %int1_8214 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_8215 = torch.constant.int 1
    %6536 = torch.aten.mul.Scalar %6531, %int1_8215 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    // Add the bias (broadcast over rows).
    %int1_8216 = torch.constant.int 1
    %6537 = torch.aten.add.Tensor %6535, %6536, %int1_8216 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    // Cast back to f16 (dtype code 5 yields f16 here).
    %int5_8217 = torch.constant.int 5
    %6538 = torch.prims.convert_element_type %6537, %int5_8217 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    // Reshape [4608,21504] -> [1,4608,21504].
    %int1_8218 = torch.constant.int 1
    %int4608_8219 = torch.constant.int 4608
    %int21504_8220 = torch.constant.int 21504
    %6539 = torch.prim.ListConstruct %int1_8218, %int4608_8219, %int21504_8220 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6540 = torch.aten.view %6538, %6539 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the linear1 output %6540 along its last dim:
    //   columns [0, 9216)     -> %6541, reshaped below into three tensors
    //   columns [9216, 21504) -> %6542 (12288 wide), fed to gelu further down.
    %int-1_8221 = torch.constant.int -1
    %int0_8222 = torch.constant.int 0
    %int9216_8223 = torch.constant.int 9216
    %int1_8224 = torch.constant.int 1
    %6541 = torch.aten.slice.Tensor %6540, %int-1_8221, %int0_8222, %int9216_8223, %int1_8224 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_8225 = torch.constant.int -1
    %int9216_8226 = torch.constant.int 9216
    %int21504_8227 = torch.constant.int 21504
    %int1_8228 = torch.constant.int 1
    %6542 = torch.aten.slice.Tensor %6540, %int-1_8225, %int9216_8226, %int21504_8227, %int1_8228 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View the 9216 columns as [3,24,128].
    // NOTE(review): presumably 3 = q/k/v, 24 = attention heads, 128 = head dim —
    // confirm against the model config.
    %int1_8229 = torch.constant.int 1
    %int4608_8230 = torch.constant.int 4608
    %int3_8231 = torch.constant.int 3
    %int24_8232 = torch.constant.int 24
    %int128_8233 = torch.constant.int 128
    %6543 = torch.prim.ListConstruct %int1_8229, %int4608_8230, %int3_8231, %int24_8232, %int128_8233 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6544 = torch.aten.view %6541, %6543 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2,0,3,1,4): [1,4608,3,24,128] -> [3,1,24,4608,128],
    // moving the size-3 axis to the front so it can be indexed with select.
    %int2_8234 = torch.constant.int 2
    %int0_8235 = torch.constant.int 0
    %int3_8236 = torch.constant.int 3
    %int1_8237 = torch.constant.int 1
    %int4_8238 = torch.constant.int 4
    %6545 = torch.prim.ListConstruct %int2_8234, %int0_8235, %int3_8236, %int1_8237, %int4_8238 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6546 = torch.aten.permute %6544, %6545 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0 -> %6547 (later scaled by query_norm.scale),
    // index 1 -> %6548 (later scaled by key_norm.scale),
    // index 2 -> %6549 (passed as the third operand of the attention op).
    %int0_8239 = torch.constant.int 0
    %int0_8240 = torch.constant.int 0
    %6547 = torch.aten.select.int %6546, %int0_8239, %int0_8240 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_8241 = torch.constant.int 0
    %int1_8242 = torch.constant.int 1
    %6548 = torch.aten.select.int %6546, %int0_8241, %int1_8242 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_8243 = torch.constant.int 0
    %int2_8244 = torch.constant.int 2
    %6549 = torch.aten.select.int %6546, %int0_8243, %int2_8244 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Query normalization for single_blocks.3. Computed in f32 on %6547:
    //   x * rsqrt(mean(x^2 over last dim, keepdim) + ~1e-6)
    // (an RMS-style normalization), then cast to f16 and multiplied by the
    // learned [128] query_norm.scale -> %6559.
    %int6_8245 = torch.constant.int 6
    %6550 = torch.prims.convert_element_type %6547, %int6_8245 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_8246 = torch.constant.int 2
    %6551 = torch.aten.pow.Tensor_Scalar %6550, %int2_8246 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean over dim -1 with keepdim = true, dtype argument left as none.
    %int-1_8247 = torch.constant.int -1
    %6552 = torch.prim.ListConstruct %int-1_8247 : (!torch.int) -> !torch.list<int>
    %true_8248 = torch.constant.bool true
    %none_8249 = torch.constant.none
    %6553 = torch.aten.mean.dim %6551, %6552, %true_8248, %none_8249 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // Epsilon added before rsqrt; the literal is the double closest to 1e-6.
    %float9.999990e-07_8250 = torch.constant.float 9.9999999999999995E-7
    %int1_8251 = torch.constant.int 1
    %6554 = torch.aten.add.Scalar %6553, %float9.999990e-07_8250, %int1_8251 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %6555 = torch.aten.rsqrt %6554 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %6556 = torch.aten.mul.Tensor %6550, %6555 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_8252 = torch.constant.int 5
    %6557 = torch.prims.convert_element_type %6556, %int5_8252 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.3.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.3.norm.query_norm.scale : tensor<128xf16>
    %6558 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.3.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %6559 = torch.aten.mul.Tensor %6557, %6558 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Key normalization: the same op sequence applied to %6548, scaled by
    // key_norm.scale -> %6569.
    %int6_8253 = torch.constant.int 6
    %6560 = torch.prims.convert_element_type %6548, %int6_8253 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_8254 = torch.constant.int 2
    %6561 = torch.aten.pow.Tensor_Scalar %6560, %int2_8254 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_8255 = torch.constant.int -1
    %6562 = torch.prim.ListConstruct %int-1_8255 : (!torch.int) -> !torch.list<int>
    %true_8256 = torch.constant.bool true
    %none_8257 = torch.constant.none
    %6563 = torch.aten.mean.dim %6561, %6562, %true_8256, %none_8257 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_8258 = torch.constant.float 9.9999999999999995E-7
    %int1_8259 = torch.constant.int 1
    %6564 = torch.aten.add.Scalar %6563, %float9.999990e-07_8258, %int1_8259 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %6565 = torch.aten.rsqrt %6564 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %6566 = torch.aten.mul.Tensor %6560, %6565 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_8260 = torch.constant.int 5
    %6567 = torch.prims.convert_element_type %6566, %int5_8260 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.3.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.3.norm.key_norm.scale : tensor<128xf16>
    %6568 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.3.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %6569 = torch.aten.mul.Tensor %6567, %6568 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Combine the normalized query (%6559) and key (%6569) with the table %211
    // ([1,1,4608,64,2,2] f32, defined before this chunk). For each of the 64
    // pairs along the last dim the result is
    //   out[..., i] = %211[..., i, 0] * x[..., 0] + %211[..., i, 1] * x[..., 1]
    // i.e. a 2x2 matrix applied to each pair, broadcast over the size-24 dim.
    // NOTE(review): %211 is presumably the precomputed rotary position-embedding
    // table — confirm at its definition.
    //
    // These two conversions are f16 -> f16, i.e. they do not change the type.
    %int5_8261 = torch.constant.int 5
    %6570 = torch.prims.convert_element_type %6559, %int5_8261 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_8262 = torch.constant.int 5
    %6571 = torch.prims.convert_element_type %6569, %int5_8262 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Query: promote to f32 and view the last dim 128 as [64,1,2].
    %int6_8263 = torch.constant.int 6
    %6572 = torch.prims.convert_element_type %6570, %int6_8263 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_8264 = torch.constant.int 1
    %int24_8265 = torch.constant.int 24
    %int4608_8266 = torch.constant.int 4608
    %int64_8267 = torch.constant.int 64
    %int1_8268 = torch.constant.int 1
    %int2_8269 = torch.constant.int 2
    %6573 = torch.prim.ListConstruct %int1_8264, %int24_8265, %int4608_8266, %int64_8267, %int1_8268, %int2_8269 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6574 = torch.aten.view %6572, %6573 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Key: same promotion and view.
    %int6_8270 = torch.constant.int 6
    %6575 = torch.prims.convert_element_type %6571, %int6_8270 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_8271 = torch.constant.int 1
    %int24_8272 = torch.constant.int 24
    %int4608_8273 = torch.constant.int 4608
    %int64_8274 = torch.constant.int 64
    %int1_8275 = torch.constant.int 1
    %int2_8276 = torch.constant.int 2
    %6576 = torch.prim.ListConstruct %int1_8271, %int24_8272, %int4608_8273, %int64_8274, %int1_8275, %int2_8276 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6577 = torch.aten.view %6575, %6576 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Query, first term: %211[..., 0] * q[..., 0].
    %int5_8277 = torch.constant.int 5
    %int0_8278 = torch.constant.int 0
    %6578 = torch.aten.select.int %211, %int5_8277, %int0_8278 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_8279 = torch.constant.int 5
    %int0_8280 = torch.constant.int 0
    %6579 = torch.aten.select.int %6574, %int5_8279, %int0_8280 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6580 = torch.aten.mul.Tensor %6578, %6579 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Query, second term: %211[..., 1] * q[..., 1]; then sum the two terms.
    %int5_8281 = torch.constant.int 5
    %int1_8282 = torch.constant.int 1
    %6581 = torch.aten.select.int %211, %int5_8281, %int1_8282 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_8283 = torch.constant.int 5
    %int1_8284 = torch.constant.int 1
    %6582 = torch.aten.select.int %6574, %int5_8283, %int1_8284 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6583 = torch.aten.mul.Tensor %6581, %6582 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_8285 = torch.constant.int 1
    %6584 = torch.aten.add.Tensor %6580, %6583, %int1_8285 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key: the same two-term combination, using %6577.
    %int5_8286 = torch.constant.int 5
    %int0_8287 = torch.constant.int 0
    %6585 = torch.aten.select.int %211, %int5_8286, %int0_8287 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_8288 = torch.constant.int 5
    %int0_8289 = torch.constant.int 0
    %6586 = torch.aten.select.int %6577, %int5_8288, %int0_8289 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6587 = torch.aten.mul.Tensor %6585, %6586 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_8290 = torch.constant.int 5
    %int1_8291 = torch.constant.int 1
    %6588 = torch.aten.select.int %211, %int5_8290, %int1_8291 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_8292 = torch.constant.int 5
    %int1_8293 = torch.constant.int 1
    %6589 = torch.aten.select.int %6577, %int5_8292, %int1_8293 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6590 = torch.aten.mul.Tensor %6588, %6589 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_8294 = torch.constant.int 1
    %6591 = torch.aten.add.Tensor %6587, %6590, %int1_8294 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Merge [64,2] back to 128 and cast to f16: query -> %6594, key -> %6597.
    %int1_8295 = torch.constant.int 1
    %int24_8296 = torch.constant.int 24
    %int4608_8297 = torch.constant.int 4608
    %int128_8298 = torch.constant.int 128
    %6592 = torch.prim.ListConstruct %int1_8295, %int24_8296, %int4608_8297, %int128_8298 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6593 = torch.aten.view %6584, %6592 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_8299 = torch.constant.int 5
    %6594 = torch.prims.convert_element_type %6593, %int5_8299 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_8300 = torch.constant.int 1
    %int24_8301 = torch.constant.int 24
    %int4608_8302 = torch.constant.int 4608
    %int128_8303 = torch.constant.int 128
    %6595 = torch.prim.ListConstruct %int1_8300, %int24_8301, %int4608_8302, %int128_8303 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6596 = torch.aten.view %6591, %6595 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_8304 = torch.constant.int 5
    %6597 = torch.prims.convert_element_type %6596, %int5_8304 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Attention over %6594, %6597 and %6549, in that operand order. The trailing
    // operands are a float 0.0, a bool false and two `none` values.
    // NOTE(review): per the PyTorch schema these are presumably dropout_p,
    // is_causal, attn_mask and scale — confirm against the op definition.
    // The op returns two results; only #0 is used within this chunk.
    %float0.000000e00_8305 = torch.constant.float 0.000000e+00
    %false_8306 = torch.constant.bool false
    %none_8307 = torch.constant.none
    %none_8308 = torch.constant.none
    %6598:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%6594, %6597, %6549, %float0.000000e00_8305, %false_8306, %none_8307, %none_8308) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Permute with order (0,2,1,3): [1,24,4608,128] -> [1,4608,24,128], then
    // merge the last two dims (24*128 = 3072) -> %6602 [1,4608,3072].
    %int0_8309 = torch.constant.int 0
    %int2_8310 = torch.constant.int 2
    %int1_8311 = torch.constant.int 1
    %int3_8312 = torch.constant.int 3
    %6599 = torch.prim.ListConstruct %int0_8309, %int2_8310, %int1_8311, %int3_8312 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6600 = torch.aten.permute %6598#0, %6599 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_8313 = torch.constant.int 1
    %int4608_8314 = torch.constant.int 4608
    %int3072_8315 = torch.constant.int 3072
    %6601 = torch.prim.ListConstruct %int1_8313, %int4608_8314, %int3072_8315 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6602 = torch.aten.view %6600, %6601 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Apply gelu with the "tanh" approximation string to the 12288-wide slice
    // %6542, then concatenate [attention output %6602, gelu output %6603] along
    // dim 2 -> [1,4608,15360] (3072 + 12288).
    %str_8316 = torch.constant.str "tanh"
    %6603 = torch.aten.gelu %6542, %str_8316 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %6604 = torch.prim.ListConstruct %6602, %6603 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_8317 = torch.constant.int 2
    %6605 = torch.aten.cat %6604, %int2_8317 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // single_blocks.3 linear2: flatten to [4608,15360], multiply by the
    // transposed [3072,15360] weight in f32, add the bias, cast back to f16 and
    // reshape to [1,4608,3072] -> %6620.
    %int4608_8318 = torch.constant.int 4608
    %int15360_8319 = torch.constant.int 15360
    %6606 = torch.prim.ListConstruct %int4608_8318, %int15360_8319 : (!torch.int, !torch.int) -> !torch.list<int>
    %6607 = torch.aten.view %6605, %6606 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    %__auto.sampler.single_blocks.3.linear2.weight = util.global.load @__auto.sampler.single_blocks.3.linear2.weight : tensor<3072x15360xf16>
    %6608 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.3.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_8320 = torch.constant.int 0
    %int1_8321 = torch.constant.int 1
    %6609 = torch.aten.transpose.int %6608, %int0_8320, %int1_8321 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.3.linear2.bias = util.global.load @__auto.sampler.single_blocks.3.linear2.bias : tensor<3072xf16>
    %6610 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.3.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Promote bias, activations and weight to f32 (dtype code 6) for the matmul.
    %int6_8322 = torch.constant.int 6
    %6611 = torch.prims.convert_element_type %6610, %int6_8322 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_8323 = torch.constant.int 6
    %6612 = torch.prims.convert_element_type %6607, %int6_8323 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_8324 = torch.constant.int 6
    %6613 = torch.prims.convert_element_type %6609, %int6_8324 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %6614 = torch.aten.mm %6612, %6613 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Multiplications by the integer 1 followed by an add with alpha = 1:
    // this is mm + bias.
    %int1_8325 = torch.constant.int 1
    %6615 = torch.aten.mul.Scalar %6614, %int1_8325 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_8326 = torch.constant.int 1
    %6616 = torch.aten.mul.Scalar %6611, %int1_8326 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_8327 = torch.constant.int 1
    %6617 = torch.aten.add.Tensor %6615, %6616, %int1_8327 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_8328 = torch.constant.int 5
    %6618 = torch.prims.convert_element_type %6617, %int5_8328 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_8329 = torch.constant.int 1
    %int4608_8330 = torch.constant.int 4608
    %int3072_8331 = torch.constant.int 3072
    %6619 = torch.prim.ListConstruct %int1_8329, %int4608_8330, %int3072_8331 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6620 = torch.aten.view %6618, %6619 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Block output: %6622 = %6497 + %6515 * %6620. Both %6497 ([1,4608,3072])
    // and %6515 ([1,1,3072], broadcast over dim 1) are defined before this chunk.
    // NOTE(review): %6515 is presumably this block's modulation gate and %6497
    // the block input (residual) — confirm at their definitions.
    %6621 = torch.aten.mul.Tensor %6515, %6620 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_8332 = torch.constant.int 1
    %6622 = torch.aten.add.Tensor %6497, %6621, %int1_8332 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.4 modulation: silu(%119) is passed through the
    // modulation.lin linear layer (3072 -> 9216, computed in f32, cast to f16)
    // and split into three [1,1,3072] slices. %119 ([1,3072] f16) is defined
    // before this chunk.
    // NOTE(review): %119 is presumably the conditioning vector — confirm.
    %6623 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.4.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.4.modulation.lin.weight : tensor<9216x3072xf16>
    %6624 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.4.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_8333 = torch.constant.int 0
    %int1_8334 = torch.constant.int 1
    %6625 = torch.aten.transpose.int %6624, %int0_8333, %int1_8334 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.4.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.4.modulation.lin.bias : tensor<9216xf16>
    %6626 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.4.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Promote bias, input and weight to f32 (dtype code 6) for the matmul.
    %int6_8335 = torch.constant.int 6
    %6627 = torch.prims.convert_element_type %6626, %int6_8335 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_8336 = torch.constant.int 6
    %6628 = torch.prims.convert_element_type %6623, %int6_8336 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_8337 = torch.constant.int 6
    %6629 = torch.prims.convert_element_type %6625, %int6_8337 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %6630 = torch.aten.mm %6628, %6629 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Multiplications by the integer 1, then add with alpha = 1: mm + bias.
    %int1_8338 = torch.constant.int 1
    %6631 = torch.aten.mul.Scalar %6630, %int1_8338 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_8339 = torch.constant.int 1
    %6632 = torch.aten.mul.Scalar %6627, %int1_8339 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_8340 = torch.constant.int 1
    %6633 = torch.aten.add.Tensor %6631, %6632, %int1_8340 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_8341 = torch.constant.int 5
    %6634 = torch.prims.convert_element_type %6633, %int5_8341 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Slice dim 0 from 0 to INT64_MAX (result shape equals the input shape),
    // insert a new dim at index 1, then slice dim 2 the same way.
    %int0_8342 = torch.constant.int 0
    %int0_8343 = torch.constant.int 0
    %int9223372036854775807_8344 = torch.constant.int 9223372036854775807
    %int1_8345 = torch.constant.int 1
    %6635 = torch.aten.slice.Tensor %6634, %int0_8342, %int0_8343, %int9223372036854775807_8344, %int1_8345 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    %int1_8346 = torch.constant.int 1
    %6636 = torch.aten.unsqueeze %6635, %int1_8346 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int2_8347 = torch.constant.int 2
    %int0_8348 = torch.constant.int 0
    %int9223372036854775807_8349 = torch.constant.int 9223372036854775807
    %int1_8350 = torch.constant.int 1
    %6637 = torch.aten.slice.Tensor %6636, %int2_8347, %int0_8348, %int9223372036854775807_8349, %int1_8350 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Three 3072-wide slices of the last dim:
    //   [0, 3072)    -> %6638, added after the normalization that follows
    //   [3072, 6144) -> %6639, used as (1 + %6639) multiplier (%6641)
    //   [6144, 9216) -> %6640, not consumed within this chunk
    // NOTE(review): %6640 is presumably this block's gate, by analogy with how
    // %6515 is used at the end of single_blocks.3 — confirm downstream.
    %int-1_8351 = torch.constant.int -1
    %int0_8352 = torch.constant.int 0
    %int3072_8353 = torch.constant.int 3072
    %int1_8354 = torch.constant.int 1
    %6638 = torch.aten.slice.Tensor %6637, %int-1_8351, %int0_8352, %int3072_8353, %int1_8354 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_8355 = torch.constant.int -1
    %int3072_8356 = torch.constant.int 3072
    %int6144_8357 = torch.constant.int 6144
    %int1_8358 = torch.constant.int 1
    %6639 = torch.aten.slice.Tensor %6637, %int-1_8355, %int3072_8356, %int6144_8357, %int1_8358 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_8359 = torch.constant.int -1
    %int6144_8360 = torch.constant.int 6144
    %int9216_8361 = torch.constant.int 9216
    %int1_8362 = torch.constant.int 1
    %6640 = torch.aten.slice.Tensor %6637, %int-1_8359, %int6144_8360, %int9216_8361, %int1_8362 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %6641 = %6639 + 1 (scalar 1, alpha 1).
    %int1_8363 = torch.constant.int 1
    %int1_8364 = torch.constant.int 1
    %6641 = torch.aten.add.Scalar %6639, %int1_8363, %int1_8364 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %6622 over dim 2 with no learned weight or bias, then modulate:
    //   %6650 = (1 + %6639) * normalize(%6622) + %6638
    // Variance and mean are computed on an f32 copy of %6622.
    %int6_8365 = torch.constant.int 6
    %6642 = torch.prims.convert_element_type %6622, %int6_8365 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    // var_mean over dim 2 with correction = 0 and keepdim = true;
    // result0 is the variance, result1 the mean (both [1,4608,1]).
    %int2_8366 = torch.constant.int 2
    %6643 = torch.prim.ListConstruct %int2_8366 : (!torch.int) -> !torch.list<int>
    %int0_8367 = torch.constant.int 0
    %true_8368 = torch.constant.bool true
    %result0_8369, %result1_8370 = torch.aten.var_mean.correction %6642, %6643, %int0_8367, %true_8368 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // rsqrt(var + eps); the literal is the double closest to 1e-6.
    %float9.999990e-07_8371 = torch.constant.float 9.9999999999999995E-7
    %int1_8372 = torch.constant.int 1
    %6644 = torch.aten.add.Scalar %result0_8369, %float9.999990e-07_8371, %int1_8372 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %6645 = torch.aten.rsqrt %6644 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the f16 %6622 (not the f32 copy %6642) and the f32
    // mean; the result type is f32.
    %int1_8373 = torch.constant.int 1
    %6646 = torch.aten.sub.Tensor %6622, %result1_8370, %int1_8373 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %6647 = torch.aten.mul.Tensor %6646, %6645 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_8374 = torch.constant.int 5
    %6648 = torch.prims.convert_element_type %6647, %int5_8374 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Modulation in f16: multiply by %6641 (= 1 + %6639), then add %6638; both
    // are [1,1,3072] and broadcast over dim 1.
    %6649 = torch.aten.mul.Tensor %6641, %6648 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_8375 = torch.constant.int 1
    %6650 = torch.aten.add.Tensor %6649, %6638, %int1_8375 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.4 linear1: flatten %6650 to [4608,3072], multiply by the
    // transposed [21504,3072] weight in f32, add the bias, cast back to f16 and
    // restore the leading dim -> %6665 [1,4608,21504].
    %int4608_8376 = torch.constant.int 4608
    %int3072_8377 = torch.constant.int 3072
    %6651 = torch.prim.ListConstruct %int4608_8376, %int3072_8377 : (!torch.int, !torch.int) -> !torch.list<int>
    %6652 = torch.aten.view %6650, %6651 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // Transpose the weight to [3072,21504] so it can be the right-hand mm operand.
    %__auto.sampler.single_blocks.4.linear1.weight = util.global.load @__auto.sampler.single_blocks.4.linear1.weight : tensor<21504x3072xf16>
    %6653 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.4.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_8378 = torch.constant.int 0
    %int1_8379 = torch.constant.int 1
    %6654 = torch.aten.transpose.int %6653, %int0_8378, %int1_8379 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.4.linear1.bias = util.global.load @__auto.sampler.single_blocks.4.linear1.bias : tensor<21504xf16>
    %6655 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.4.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Promote bias, activations and weight to f32 (dtype code 6) for the matmul.
    %int6_8380 = torch.constant.int 6
    %6656 = torch.prims.convert_element_type %6655, %int6_8380 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_8381 = torch.constant.int 6
    %6657 = torch.prims.convert_element_type %6652, %int6_8381 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_8382 = torch.constant.int 6
    %6658 = torch.prims.convert_element_type %6654, %int6_8382 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %6659 = torch.aten.mm %6657, %6658 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Multiplications by the integer 1, then add with alpha = 1: mm + bias.
    %int1_8383 = torch.constant.int 1
    %6660 = torch.aten.mul.Scalar %6659, %int1_8383 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_8384 = torch.constant.int 1
    %6661 = torch.aten.mul.Scalar %6656, %int1_8384 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_8385 = torch.constant.int 1
    %6662 = torch.aten.add.Tensor %6660, %6661, %int1_8385 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    // Cast back to f16 (dtype code 5) and reshape to 3-D.
    %int5_8386 = torch.constant.int 5
    %6663 = torch.prims.convert_element_type %6662, %int5_8386 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_8387 = torch.constant.int 1
    %int4608_8388 = torch.constant.int 4608
    %int21504_8389 = torch.constant.int 21504
    %6664 = torch.prim.ListConstruct %int1_8387, %int4608_8388, %int21504_8389 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6665 = torch.aten.view %6663, %6664 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the single_blocks.4 linear1 output %6665 along its last dim:
    //   columns [0, 9216)     -> %6666, reshaped below into three tensors
    //   columns [9216, 21504) -> %6667 (12288 wide), not consumed in this chunk.
    %int-1_8390 = torch.constant.int -1
    %int0_8391 = torch.constant.int 0
    %int9216_8392 = torch.constant.int 9216
    %int1_8393 = torch.constant.int 1
    %6666 = torch.aten.slice.Tensor %6665, %int-1_8390, %int0_8391, %int9216_8392, %int1_8393 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_8394 = torch.constant.int -1
    %int9216_8395 = torch.constant.int 9216
    %int21504_8396 = torch.constant.int 21504
    %int1_8397 = torch.constant.int 1
    %6667 = torch.aten.slice.Tensor %6665, %int-1_8394, %int9216_8395, %int21504_8396, %int1_8397 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View the 9216 columns as [3,24,128].
    // NOTE(review): presumably 3 = q/k/v, 24 = attention heads, 128 = head dim —
    // confirm against the model config.
    %int1_8398 = torch.constant.int 1
    %int4608_8399 = torch.constant.int 4608
    %int3_8400 = torch.constant.int 3
    %int24_8401 = torch.constant.int 24
    %int128_8402 = torch.constant.int 128
    %6668 = torch.prim.ListConstruct %int1_8398, %int4608_8399, %int3_8400, %int24_8401, %int128_8402 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6669 = torch.aten.view %6666, %6668 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2,0,3,1,4): [1,4608,3,24,128] -> [3,1,24,4608,128].
    %int2_8403 = torch.constant.int 2
    %int0_8404 = torch.constant.int 0
    %int3_8405 = torch.constant.int 3
    %int1_8406 = torch.constant.int 1
    %int4_8407 = torch.constant.int 4
    %6670 = torch.prim.ListConstruct %int2_8403, %int0_8404, %int3_8405, %int1_8406, %int4_8407 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6671 = torch.aten.permute %6669, %6670 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0 -> %6672 (later scaled by query_norm.scale), index 1 -> %6673,
    // index 2 -> %6674. The uses of %6673 beyond its normalization and of %6674
    // fall outside this chunk.
    %int0_8408 = torch.constant.int 0
    %int0_8409 = torch.constant.int 0
    %6672 = torch.aten.select.int %6671, %int0_8408, %int0_8409 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_8410 = torch.constant.int 0
    %int1_8411 = torch.constant.int 1
    %6673 = torch.aten.select.int %6671, %int0_8410, %int1_8411 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_8412 = torch.constant.int 0
    %int2_8413 = torch.constant.int 2
    %6674 = torch.aten.select.int %6671, %int0_8412, %int2_8413 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.4 query normalization: x * rsqrt(mean(x^2, dim=-1) + eps), then a learned per-channel scale.
    // Upcast the query to f32 (dtype code 6) so the reduction runs in full precision.
    %int6_8414 = torch.constant.int 6
    %6675 = torch.prims.convert_element_type %6672, %int6_8414 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Element-wise square.
    %int2_8415 = torch.constant.int 2
    %6676 = torch.aten.pow.Tensor_Scalar %6675, %int2_8415 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean over the last axis (128), keepdim=true so the result broadcasts back.
    %int-1_8416 = torch.constant.int -1
    %6677 = torch.prim.ListConstruct %int-1_8416 : (!torch.int) -> !torch.list<int>
    %true_8417 = torch.constant.bool true
    %none_8418 = torch.constant.none
    %6678 = torch.aten.mean.dim %6676, %6677, %true_8417, %none_8418 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // Add the epsilon (~1e-6) before the reciprocal square root.
    %float9.999990e-07_8419 = torch.constant.float 9.9999999999999995E-7
    %int1_8420 = torch.constant.int 1
    %6679 = torch.aten.add.Scalar %6678, %float9.999990e-07_8419, %int1_8420 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %6680 = torch.aten.rsqrt %6679 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    // Scale the f32 query by the per-row factor, then cast back to f16 (dtype code 5).
    %6681 = torch.aten.mul.Tensor %6675, %6680 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_8421 = torch.constant.int 5
    %6682 = torch.prims.convert_element_type %6681, %int5_8421 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Multiply by the [128] query_norm.scale parameter (broadcast over the leading axes) in f16.
    %__auto.sampler.single_blocks.4.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.4.norm.query_norm.scale : tensor<128xf16>
    %6683 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.4.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %6684 = torch.aten.mul.Tensor %6682, %6683 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.4 key normalization: same x * rsqrt(mean(x^2, dim=-1) + eps) sequence as the query, using key_norm.scale.
    // Upcast the key to f32 (dtype code 6).
    %int6_8422 = torch.constant.int 6
    %6685 = torch.prims.convert_element_type %6673, %int6_8422 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Element-wise square.
    %int2_8423 = torch.constant.int 2
    %6686 = torch.aten.pow.Tensor_Scalar %6685, %int2_8423 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean over the last axis with keepdim=true.
    %int-1_8424 = torch.constant.int -1
    %6687 = torch.prim.ListConstruct %int-1_8424 : (!torch.int) -> !torch.list<int>
    %true_8425 = torch.constant.bool true
    %none_8426 = torch.constant.none
    %6688 = torch.aten.mean.dim %6686, %6687, %true_8425, %none_8426 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // rsqrt(mean + eps) with eps ~1e-6.
    %float9.999990e-07_8427 = torch.constant.float 9.9999999999999995E-7
    %int1_8428 = torch.constant.int 1
    %6689 = torch.aten.add.Scalar %6688, %float9.999990e-07_8427, %int1_8428 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %6690 = torch.aten.rsqrt %6689 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    // Apply the per-row factor in f32, then cast back to f16 (dtype code 5).
    %6691 = torch.aten.mul.Tensor %6685, %6690 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_8429 = torch.constant.int 5
    %6692 = torch.prims.convert_element_type %6691, %int5_8429 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Multiply by the [128] key_norm.scale parameter in f16.
    %__auto.sampler.single_blocks.4.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.4.norm.key_norm.scale : tensor<128xf16>
    %6693 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.4.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %6694 = torch.aten.mul.Tensor %6692, %6693 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.4: combine the normalized query and key with the per-position coefficient tensor %211.
    // NOTE(review): %211 ([1,1,4608,64,2,2] f32) is defined outside this chunk; presumably the rotary
    // position-embedding table — confirm at its definition.
    // These two casts request f16 (dtype code 5) on tensors that are already f16, so the element type is unchanged.
    %int5_8430 = torch.constant.int 5
    %6695 = torch.prims.convert_element_type %6684, %int5_8430 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_8431 = torch.constant.int 5
    %6696 = torch.prims.convert_element_type %6694, %int5_8431 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Query: upcast to f32 and view the last axis (128) as 64 pairs with a singleton axis: [1,24,4608,64,1,2].
    %int6_8432 = torch.constant.int 6
    %6697 = torch.prims.convert_element_type %6695, %int6_8432 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_8433 = torch.constant.int 1
    %int24_8434 = torch.constant.int 24
    %int4608_8435 = torch.constant.int 4608
    %int64_8436 = torch.constant.int 64
    %int1_8437 = torch.constant.int 1
    %int2_8438 = torch.constant.int 2
    %6698 = torch.prim.ListConstruct %int1_8433, %int24_8434, %int4608_8435, %int64_8436, %int1_8437, %int2_8438 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6699 = torch.aten.view %6697, %6698 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Key: same upcast and [1,24,4608,64,1,2] view.
    %int6_8439 = torch.constant.int 6
    %6700 = torch.prims.convert_element_type %6696, %int6_8439 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_8440 = torch.constant.int 1
    %int24_8441 = torch.constant.int 24
    %int4608_8442 = torch.constant.int 4608
    %int64_8443 = torch.constant.int 64
    %int1_8444 = torch.constant.int 1
    %int2_8445 = torch.constant.int 2
    %6701 = torch.prim.ListConstruct %int1_8440, %int24_8441, %int4608_8442, %int64_8443, %int1_8444, %int2_8445 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6702 = torch.aten.view %6700, %6701 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Query, first term: %211[..., 0] * q[..., 0]. The size-1 head axis of %211 broadcasts to 24 and
    // the singleton axis of q broadcasts to 2.
    %int5_8446 = torch.constant.int 5
    %int0_8447 = torch.constant.int 0
    %6703 = torch.aten.select.int %211, %int5_8446, %int0_8447 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_8448 = torch.constant.int 5
    %int0_8449 = torch.constant.int 0
    %6704 = torch.aten.select.int %6699, %int5_8448, %int0_8449 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6705 = torch.aten.mul.Tensor %6703, %6704 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Query, second term: %211[..., 1] * q[..., 1].
    %int5_8450 = torch.constant.int 5
    %int1_8451 = torch.constant.int 1
    %6706 = torch.aten.select.int %211, %int5_8450, %int1_8451 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_8452 = torch.constant.int 5
    %int1_8453 = torch.constant.int 1
    %6707 = torch.aten.select.int %6699, %int5_8452, %int1_8453 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6708 = torch.aten.mul.Tensor %6706, %6707 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two terms (alpha = 1): the transformed query pairs.
    %int1_8454 = torch.constant.int 1
    %6709 = torch.aten.add.Tensor %6705, %6708, %int1_8454 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key, first term: %211[..., 0] * k[..., 0].
    %int5_8455 = torch.constant.int 5
    %int0_8456 = torch.constant.int 0
    %6710 = torch.aten.select.int %211, %int5_8455, %int0_8456 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_8457 = torch.constant.int 5
    %int0_8458 = torch.constant.int 0
    %6711 = torch.aten.select.int %6702, %int5_8457, %int0_8458 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6712 = torch.aten.mul.Tensor %6710, %6711 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key, second term: %211[..., 1] * k[..., 1].
    %int5_8459 = torch.constant.int 5
    %int1_8460 = torch.constant.int 1
    %6713 = torch.aten.select.int %211, %int5_8459, %int1_8460 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_8461 = torch.constant.int 5
    %int1_8462 = torch.constant.int 1
    %6714 = torch.aten.select.int %6702, %int5_8461, %int1_8462 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6715 = torch.aten.mul.Tensor %6713, %6714 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two terms (alpha = 1): the transformed key pairs.
    %int1_8463 = torch.constant.int 1
    %6716 = torch.aten.add.Tensor %6712, %6715, %int1_8463 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Flatten the query pairs back to [1,24,4608,128] and cast to f16 for attention.
    %int1_8464 = torch.constant.int 1
    %int24_8465 = torch.constant.int 24
    %int4608_8466 = torch.constant.int 4608
    %int128_8467 = torch.constant.int 128
    %6717 = torch.prim.ListConstruct %int1_8464, %int24_8465, %int4608_8466, %int128_8467 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6718 = torch.aten.view %6709, %6717 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_8468 = torch.constant.int 5
    %6719 = torch.prims.convert_element_type %6718, %int5_8468 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Flatten the key pairs back to [1,24,4608,128] and cast to f16 for attention.
    %int1_8469 = torch.constant.int 1
    %int24_8470 = torch.constant.int 24
    %int4608_8471 = torch.constant.int 4608
    %int128_8472 = torch.constant.int 128
    %6720 = torch.prim.ListConstruct %int1_8469, %int24_8470, %int4608_8471, %int128_8472 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6721 = torch.aten.view %6716, %6720 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_8473 = torch.constant.int 5
    %6722 = torch.prims.convert_element_type %6721, %int5_8473 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.4 attention over the transformed query %6719, transformed key %6722 and value %6674.
    // The remaining operands are 0.0, false, none, none; by the ATen schema for this op these are
    // dropout_p, is_causal, attn_mask and scale (the op is left as an opaque torch.operator here).
    %float0.000000e00_8474 = torch.constant.float 0.000000e+00
    %false_8475 = torch.constant.bool false
    %none_8476 = torch.constant.none
    %none_8477 = torch.constant.none
    // Two results: #0 is the [1,24,4608,128] f16 attention output; #1 is a [1,24,4608] f32 tensor
    // (logsumexp per the ATen schema).
    %6723:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%6719, %6722, %6674, %float0.000000e00_8474, %false_8475, %none_8476, %none_8477) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap axes 1 and 2: [1,24,4608,128] -> [1,4608,24,128].
    %int0_8478 = torch.constant.int 0
    %int2_8479 = torch.constant.int 2
    %int1_8480 = torch.constant.int 1
    %int3_8481 = torch.constant.int 3
    %6724 = torch.prim.ListConstruct %int0_8478, %int2_8479, %int1_8480, %int3_8481 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6725 = torch.aten.permute %6723#0, %6724 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the 24 x 128 trailing axes into a single 3072-wide feature axis.
    %int1_8482 = torch.constant.int 1
    %int4608_8483 = torch.constant.int 4608
    %int3072_8484 = torch.constant.int 3072
    %6726 = torch.prim.ListConstruct %int1_8482, %int4608_8483, %int3072_8484 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6727 = torch.aten.view %6725, %6726 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.4 output stage: GELU branch, concat with the attention output, linear2, then a scaled residual add.
    // Tanh-approximated GELU of %6667.
    // NOTE(review): %6667 ([1,4608,12288]) is defined outside this chunk; presumably the MLP slice of this
    // block's linear1 output — confirm.
    %str_8485 = torch.constant.str "tanh"
    %6728 = torch.aten.gelu %6667, %str_8485 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    // Concatenate along dim 2: [attention (3072) | GELU branch (12288)] -> 15360 features.
    %6729 = torch.prim.ListConstruct %6727, %6728 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_8486 = torch.constant.int 2
    %6730 = torch.aten.cat %6729, %int2_8486 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Drop the leading size-1 axis so the projection can be expressed as a 2-D matmul.
    %int4608_8487 = torch.constant.int 4608
    %int15360_8488 = torch.constant.int 15360
    %6731 = torch.prim.ListConstruct %int4608_8487, %int15360_8488 : (!torch.int, !torch.int) -> !torch.list<int>
    %6732 = torch.aten.view %6730, %6731 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // Load linear2.weight ([3072,15360]) and transpose it to [15360,3072].
    %__auto.sampler.single_blocks.4.linear2.weight = util.global.load @__auto.sampler.single_blocks.4.linear2.weight : tensor<3072x15360xf16>
    %6733 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.4.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_8489 = torch.constant.int 0
    %int1_8490 = torch.constant.int 1
    %6734 = torch.aten.transpose.int %6733, %int0_8489, %int1_8490 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    // Load linear2.bias ([3072]).
    %__auto.sampler.single_blocks.4.linear2.bias = util.global.load @__auto.sampler.single_blocks.4.linear2.bias : tensor<3072xf16>
    %6735 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.4.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Upcast bias, activations and weight to f32 (dtype code 6) so the matmul and bias add run in f32.
    %int6_8491 = torch.constant.int 6
    %6736 = torch.prims.convert_element_type %6735, %int6_8491 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_8492 = torch.constant.int 6
    %6737 = torch.prims.convert_element_type %6732, %int6_8492 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_8493 = torch.constant.int 6
    %6738 = torch.prims.convert_element_type %6734, %int6_8493 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %6739 = torch.aten.mm %6737, %6738 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Both operands are multiplied by the constant 1 before the add (alpha = 1), so the result is
    // matmul + bias.
    %int1_8494 = torch.constant.int 1
    %6740 = torch.aten.mul.Scalar %6739, %int1_8494 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_8495 = torch.constant.int 1
    %6741 = torch.aten.mul.Scalar %6736, %int1_8495 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_8496 = torch.constant.int 1
    %6742 = torch.aten.add.Tensor %6740, %6741, %int1_8496 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    // Cast back to f16 (dtype code 5) and restore the leading size-1 axis.
    %int5_8497 = torch.constant.int 5
    %6743 = torch.prims.convert_element_type %6742, %int5_8497 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_8498 = torch.constant.int 1
    %int4608_8499 = torch.constant.int 4608
    %int3072_8500 = torch.constant.int 3072
    %6744 = torch.prim.ListConstruct %int1_8498, %int4608_8499, %int3072_8500 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6745 = torch.aten.view %6743, %6744 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Scale the projection by %6640 (broadcast over the 4608 axis) and add it to %6622.
    // NOTE(review): %6640 and %6622 are defined outside this chunk; presumably this block's modulation
    // gate and its residual input — confirm.
    %6746 = torch.aten.mul.Tensor %6640, %6745 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_8501 = torch.constant.int 1
    %6747 = torch.aten.add.Tensor %6622, %6746, %int1_8501 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.5 modulation: SiLU(%119) -> linear (3072 -> 9216) -> three 3072-wide slices.
    // NOTE(review): %119 ([1,3072] f16) is defined outside this chunk; presumably the conditioning
    // vector shared by every block — confirm.
    %6748 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load modulation.lin.weight ([9216,3072]) and transpose it to [3072,9216].
    %__auto.sampler.single_blocks.5.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.5.modulation.lin.weight : tensor<9216x3072xf16>
    %6749 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.5.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_8502 = torch.constant.int 0
    %int1_8503 = torch.constant.int 1
    %6750 = torch.aten.transpose.int %6749, %int0_8502, %int1_8503 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    // Load modulation.lin.bias ([9216]).
    %__auto.sampler.single_blocks.5.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.5.modulation.lin.bias : tensor<9216xf16>
    %6751 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.5.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, input and weight to f32 (dtype code 6); matmul and bias add run in f32.
    %int6_8504 = torch.constant.int 6
    %6752 = torch.prims.convert_element_type %6751, %int6_8504 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_8505 = torch.constant.int 6
    %6753 = torch.prims.convert_element_type %6748, %int6_8505 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_8506 = torch.constant.int 6
    %6754 = torch.prims.convert_element_type %6750, %int6_8506 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %6755 = torch.aten.mm %6753, %6754 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Both operands are multiplied by the constant 1 before the add (alpha = 1): matmul + bias.
    %int1_8507 = torch.constant.int 1
    %6756 = torch.aten.mul.Scalar %6755, %int1_8507 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_8508 = torch.constant.int 1
    %6757 = torch.aten.mul.Scalar %6752, %int1_8508 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_8509 = torch.constant.int 1
    %6758 = torch.aten.add.Tensor %6756, %6757, %int1_8509 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    // Cast the result back to f16 (dtype code 5).
    %int5_8510 = torch.constant.int 5
    %6759 = torch.prims.convert_element_type %6758, %int5_8510 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Slice dim 0 over [0, INT64_MAX) with step 1: the result shape equals the input shape.
    %int0_8511 = torch.constant.int 0
    %int0_8512 = torch.constant.int 0
    %int9223372036854775807_8513 = torch.constant.int 9223372036854775807
    %int1_8514 = torch.constant.int 1
    %6760 = torch.aten.slice.Tensor %6759, %int0_8511, %int0_8512, %int9223372036854775807_8513, %int1_8514 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a size-1 axis at position 1 so the slices below broadcast over the 4608 axis.
    %int1_8515 = torch.constant.int 1
    %6761 = torch.aten.unsqueeze %6760, %int1_8515 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Slice dim 2 over [0, INT64_MAX) with step 1: again shape-preserving.
    %int2_8516 = torch.constant.int 2
    %int0_8517 = torch.constant.int 0
    %int9223372036854775807_8518 = torch.constant.int 9223372036854775807
    %int1_8519 = torch.constant.int 1
    %6762 = torch.aten.slice.Tensor %6761, %int2_8516, %int0_8517, %int9223372036854775807_8518, %int1_8519 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Columns [0, 3072): added after the scaled normalization below, i.e. the shift term.
    %int-1_8520 = torch.constant.int -1
    %int0_8521 = torch.constant.int 0
    %int3072_8522 = torch.constant.int 3072
    %int1_8523 = torch.constant.int 1
    %6763 = torch.aten.slice.Tensor %6762, %int-1_8520, %int0_8521, %int3072_8522, %int1_8523 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Columns [3072, 6144): the scale term (used as 1 + scale).
    %int-1_8524 = torch.constant.int -1
    %int3072_8525 = torch.constant.int 3072
    %int6144_8526 = torch.constant.int 6144
    %int1_8527 = torch.constant.int 1
    %6764 = torch.aten.slice.Tensor %6762, %int-1_8524, %int3072_8525, %int6144_8526, %int1_8527 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Columns [6144, 9216).
    // NOTE(review): presumably the gate applied to this block's linear2 output further down — confirm at its use.
    %int-1_8528 = torch.constant.int -1
    %int6144_8529 = torch.constant.int 6144
    %int9216_8530 = torch.constant.int 9216
    %int1_8531 = torch.constant.int 1
    %6765 = torch.aten.slice.Tensor %6762, %int-1_8528, %int6144_8529, %int9216_8530, %int1_8531 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // 1 + scale.
    %int1_8532 = torch.constant.int 1
    %int1_8533 = torch.constant.int 1
    %6766 = torch.aten.add.Scalar %6764, %int1_8532, %int1_8533 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // single_blocks.5 pre-normalization: (x - mean) * rsqrt(var + eps) over the feature axis with no learned
    // weight or bias, followed by the modulation (1 + scale) * normed + shift.
    // Upcast the block input %6747 to f32 (dtype code 6) for the statistics.
    %int6_8534 = torch.constant.int 6
    %6767 = torch.prims.convert_element_type %6747, %int6_8534 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    // Variance (correction = 0) and mean over dim 2, keepdim = true.
    %int2_8535 = torch.constant.int 2
    %6768 = torch.prim.ListConstruct %int2_8535 : (!torch.int) -> !torch.list<int>
    %int0_8536 = torch.constant.int 0
    %true_8537 = torch.constant.bool true
    %result0_8538, %result1_8539 = torch.aten.var_mean.correction %6767, %6768, %int0_8536, %true_8537 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // rsqrt(var + eps) with eps ~1e-6.
    %float9.999990e-07_8540 = torch.constant.float 9.9999999999999995E-7
    %int1_8541 = torch.constant.int 1
    %6769 = torch.aten.add.Scalar %result0_8538, %float9.999990e-07_8540, %int1_8541 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %6770 = torch.aten.rsqrt %6769 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // Subtract the f32 mean from the f16 input %6747 (not the upcast copy %6767); the result type is f32.
    %int1_8542 = torch.constant.int 1
    %6771 = torch.aten.sub.Tensor %6747, %result1_8539, %int1_8542 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %6772 = torch.aten.mul.Tensor %6771, %6770 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    // Cast the normalized activations back to f16 (dtype code 5).
    %int5_8543 = torch.constant.int 5
    %6773 = torch.prims.convert_element_type %6772, %int5_8543 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Modulate: (1 + scale) * normed + shift, with %6766 and %6763 broadcast over the 4608 axis.
    %6774 = torch.aten.mul.Tensor %6766, %6773 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_8544 = torch.constant.int 1
    %6775 = torch.aten.add.Tensor %6774, %6763, %int1_8544 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.5 linear1: project the modulated activations from 3072 to 21504 features, then split
    // the result into a 9216-wide slice and a 12288-wide slice.
    // Drop the leading size-1 axis so the projection is a 2-D matmul.
    %int4608_8545 = torch.constant.int 4608
    %int3072_8546 = torch.constant.int 3072
    %6776 = torch.prim.ListConstruct %int4608_8545, %int3072_8546 : (!torch.int, !torch.int) -> !torch.list<int>
    %6777 = torch.aten.view %6775, %6776 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // Load linear1.weight ([21504,3072]) and transpose it to [3072,21504].
    %__auto.sampler.single_blocks.5.linear1.weight = util.global.load @__auto.sampler.single_blocks.5.linear1.weight : tensor<21504x3072xf16>
    %6778 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.5.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_8547 = torch.constant.int 0
    %int1_8548 = torch.constant.int 1
    %6779 = torch.aten.transpose.int %6778, %int0_8547, %int1_8548 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    // Load linear1.bias ([21504]).
    %__auto.sampler.single_blocks.5.linear1.bias = util.global.load @__auto.sampler.single_blocks.5.linear1.bias : tensor<21504xf16>
    %6780 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.5.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Upcast bias, activations and weight to f32 (dtype code 6); matmul and bias add run in f32.
    %int6_8549 = torch.constant.int 6
    %6781 = torch.prims.convert_element_type %6780, %int6_8549 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_8550 = torch.constant.int 6
    %6782 = torch.prims.convert_element_type %6777, %int6_8550 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_8551 = torch.constant.int 6
    %6783 = torch.prims.convert_element_type %6779, %int6_8551 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %6784 = torch.aten.mm %6782, %6783 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Both operands are multiplied by the constant 1 before the add (alpha = 1): matmul + bias.
    %int1_8552 = torch.constant.int 1
    %6785 = torch.aten.mul.Scalar %6784, %int1_8552 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_8553 = torch.constant.int 1
    %6786 = torch.aten.mul.Scalar %6781, %int1_8553 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_8554 = torch.constant.int 1
    %6787 = torch.aten.add.Tensor %6785, %6786, %int1_8554 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    // Cast back to f16 (dtype code 5) and restore the leading size-1 axis.
    %int5_8555 = torch.constant.int 5
    %6788 = torch.prims.convert_element_type %6787, %int5_8555 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_8556 = torch.constant.int 1
    %int4608_8557 = torch.constant.int 4608
    %int21504_8558 = torch.constant.int 21504
    %6789 = torch.prim.ListConstruct %int1_8556, %int4608_8557, %int21504_8558 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6790 = torch.aten.view %6788, %6789 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Columns [0, 9216): reshaped below as 3 x 24 x 128 and split into query, key and value.
    %int-1_8559 = torch.constant.int -1
    %int0_8560 = torch.constant.int 0
    %int9216_8561 = torch.constant.int 9216
    %int1_8562 = torch.constant.int 1
    %6791 = torch.aten.slice.Tensor %6790, %int-1_8559, %int0_8560, %int9216_8561, %int1_8562 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    // Columns [9216, 21504): the remaining 12288 features.
    // NOTE(review): presumably the MLP branch that is passed through GELU later in this block — confirm at its use.
    %int-1_8563 = torch.constant.int -1
    %int9216_8564 = torch.constant.int 9216
    %int21504_8565 = torch.constant.int 21504
    %int1_8566 = torch.constant.int 1
    %6792 = torch.aten.slice.Tensor %6790, %int-1_8563, %int9216_8564, %int21504_8565, %int1_8566 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // single_blocks.5: unpack the 9216-wide slice into query, key and value, each [1,24,4608,128].
    // View the 9216 features as 3 x 24 x 128.
    %int1_8567 = torch.constant.int 1
    %int4608_8568 = torch.constant.int 4608
    %int3_8569 = torch.constant.int 3
    %int24_8570 = torch.constant.int 24
    %int128_8571 = torch.constant.int 128
    %6793 = torch.prim.ListConstruct %int1_8567, %int4608_8568, %int3_8569, %int24_8570, %int128_8571 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6794 = torch.aten.view %6791, %6793 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2, 0, 3, 1, 4): [1,4608,3,24,128] -> [3,1,24,4608,128], so the size-3 axis leads.
    %int2_8572 = torch.constant.int 2
    %int0_8573 = torch.constant.int 0
    %int3_8574 = torch.constant.int 3
    %int1_8575 = torch.constant.int 1
    %int4_8576 = torch.constant.int 4
    %6795 = torch.prim.ListConstruct %int2_8572, %int0_8573, %int3_8574, %int1_8575, %int4_8576 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6796 = torch.aten.permute %6794, %6795 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0 along dim 0: later normalized with query_norm.scale, i.e. the query.
    %int0_8577 = torch.constant.int 0
    %int0_8578 = torch.constant.int 0
    %6797 = torch.aten.select.int %6796, %int0_8577, %int0_8578 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 1 along dim 0: later normalized with key_norm.scale, i.e. the key.
    %int0_8579 = torch.constant.int 0
    %int1_8580 = torch.constant.int 1
    %6798 = torch.aten.select.int %6796, %int0_8579, %int1_8580 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 2 along dim 0.
    // NOTE(review): presumably the attention value, mirroring the previous block — confirm at its use.
    %int0_8581 = torch.constant.int 0
    %int2_8582 = torch.constant.int 2
    %6799 = torch.aten.select.int %6796, %int0_8581, %int2_8582 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.5 query normalization: x * rsqrt(mean(x^2, dim=-1) + eps), then a learned per-channel scale.
    // Upcast the query to f32 (dtype code 6) so the reduction runs in full precision.
    %int6_8583 = torch.constant.int 6
    %6800 = torch.prims.convert_element_type %6797, %int6_8583 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Element-wise square.
    %int2_8584 = torch.constant.int 2
    %6801 = torch.aten.pow.Tensor_Scalar %6800, %int2_8584 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean over the last axis (128), keepdim=true so the result broadcasts back.
    %int-1_8585 = torch.constant.int -1
    %6802 = torch.prim.ListConstruct %int-1_8585 : (!torch.int) -> !torch.list<int>
    %true_8586 = torch.constant.bool true
    %none_8587 = torch.constant.none
    %6803 = torch.aten.mean.dim %6801, %6802, %true_8586, %none_8587 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // rsqrt(mean + eps) with eps ~1e-6.
    %float9.999990e-07_8588 = torch.constant.float 9.9999999999999995E-7
    %int1_8589 = torch.constant.int 1
    %6804 = torch.aten.add.Scalar %6803, %float9.999990e-07_8588, %int1_8589 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %6805 = torch.aten.rsqrt %6804 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    // Apply the per-row factor in f32, then cast back to f16 (dtype code 5).
    %6806 = torch.aten.mul.Tensor %6800, %6805 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_8590 = torch.constant.int 5
    %6807 = torch.prims.convert_element_type %6806, %int5_8590 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Multiply by the [128] query_norm.scale parameter (broadcast over the leading axes) in f16.
    %__auto.sampler.single_blocks.5.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.5.norm.query_norm.scale : tensor<128xf16>
    %6808 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.5.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %6809 = torch.aten.mul.Tensor %6807, %6808 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.5 key normalization: same x * rsqrt(mean(x^2, dim=-1) + eps) sequence as the query, using key_norm.scale.
    // Upcast the key to f32 (dtype code 6).
    %int6_8591 = torch.constant.int 6
    %6810 = torch.prims.convert_element_type %6798, %int6_8591 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Element-wise square.
    %int2_8592 = torch.constant.int 2
    %6811 = torch.aten.pow.Tensor_Scalar %6810, %int2_8592 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean over the last axis with keepdim=true.
    %int-1_8593 = torch.constant.int -1
    %6812 = torch.prim.ListConstruct %int-1_8593 : (!torch.int) -> !torch.list<int>
    %true_8594 = torch.constant.bool true
    %none_8595 = torch.constant.none
    %6813 = torch.aten.mean.dim %6811, %6812, %true_8594, %none_8595 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // rsqrt(mean + eps) with eps ~1e-6.
    %float9.999990e-07_8596 = torch.constant.float 9.9999999999999995E-7
    %int1_8597 = torch.constant.int 1
    %6814 = torch.aten.add.Scalar %6813, %float9.999990e-07_8596, %int1_8597 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %6815 = torch.aten.rsqrt %6814 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    // Apply the per-row factor in f32, then cast back to f16 (dtype code 5).
    %6816 = torch.aten.mul.Tensor %6810, %6815 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_8598 = torch.constant.int 5
    %6817 = torch.prims.convert_element_type %6816, %int5_8598 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Multiply by the [128] key_norm.scale parameter in f16.
    %__auto.sampler.single_blocks.5.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.5.norm.key_norm.scale : tensor<128xf16>
    %6818 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.5.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %6819 = torch.aten.mul.Tensor %6817, %6818 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_8599 = torch.constant.int 5
    %6820 = torch.prims.convert_element_type %6809, %int5_8599 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_8600 = torch.constant.int 5
    %6821 = torch.prims.convert_element_type %6819, %int5_8600 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int6_8601 = torch.constant.int 6
    %6822 = torch.prims.convert_element_type %6820, %int6_8601 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_8602 = torch.constant.int 1
    %int24_8603 = torch.constant.int 24
    %int4608_8604 = torch.constant.int 4608
    %int64_8605 = torch.constant.int 64
    %int1_8606 = torch.constant.int 1
    %int2_8607 = torch.constant.int 2
    %6823 = torch.prim.ListConstruct %int1_8602, %int24_8603, %int4608_8604, %int64_8605, %int1_8606, %int2_8607 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6824 = torch.aten.view %6822, %6823 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    %int6_8608 = torch.constant.int 6
    %6825 = torch.prims.convert_element_type %6821, %int6_8608 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_8609 = torch.constant.int 1
    %int24_8610 = torch.constant.int 24
    %int4608_8611 = torch.constant.int 4608
    %int64_8612 = torch.constant.int 64
    %int1_8613 = torch.constant.int 1
    %int2_8614 = torch.constant.int 2
    %6826 = torch.prim.ListConstruct %int1_8609, %int24_8610, %int4608_8611, %int64_8612, %int1_8613, %int2_8614 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6827 = torch.aten.view %6825, %6826 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Mix each element pair of %6824 with the 2x2 coefficients held in %211:
    //   %6834[..., i] = %211[..., i, 0] * x[..., 0] + %211[..., i, 1] * x[..., 1]
    // The [.., 64, 1] slices of %6824 broadcast against the [.., 64, 2] slices
    // of %211 (and across %211's size-1 axis 1 to 24).
    // NOTE(review): this has the shape of a rotary position embedding
    // application; confirm against the definition of %211.
    %int5_8615 = torch.constant.int 5
    %int0_8616 = torch.constant.int 0
    %6828 = torch.aten.select.int %211, %int5_8615, %int0_8616 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_8617 = torch.constant.int 5
    %int0_8618 = torch.constant.int 0
    %6829 = torch.aten.select.int %6824, %int5_8617, %int0_8618 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6830 = torch.aten.mul.Tensor %6828, %6829 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_8619 = torch.constant.int 5
    %int1_8620 = torch.constant.int 1
    %6831 = torch.aten.select.int %211, %int5_8619, %int1_8620 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_8621 = torch.constant.int 5
    %int1_8622 = torch.constant.int 1
    %6832 = torch.aten.select.int %6824, %int5_8621, %int1_8622 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6833 = torch.aten.mul.Tensor %6831, %6832 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two products (the trailing integer operand is 1).
    %int1_8623 = torch.constant.int 1
    %6834 = torch.aten.add.Tensor %6830, %6833, %int1_8623 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Same pairwise mixing with %211, applied to the second tensor %6827:
    //   %6841[..., i] = %211[..., i, 0] * x[..., 0] + %211[..., i, 1] * x[..., 1]
    // The two slices of %211 are re-selected here rather than reusing the
    // earlier selects on the same tensor.
    %int5_8624 = torch.constant.int 5
    %int0_8625 = torch.constant.int 0
    %6835 = torch.aten.select.int %211, %int5_8624, %int0_8625 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_8626 = torch.constant.int 5
    %int0_8627 = torch.constant.int 0
    %6836 = torch.aten.select.int %6827, %int5_8626, %int0_8627 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6837 = torch.aten.mul.Tensor %6835, %6836 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_8628 = torch.constant.int 5
    %int1_8629 = torch.constant.int 1
    %6838 = torch.aten.select.int %211, %int5_8628, %int1_8629 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_8630 = torch.constant.int 5
    %int1_8631 = torch.constant.int 1
    %6839 = torch.aten.select.int %6827, %int5_8630, %int1_8631 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6840 = torch.aten.mul.Tensor %6838, %6839 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_8632 = torch.constant.int 1
    %6841 = torch.aten.add.Tensor %6837, %6840, %int1_8632 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Collapse the trailing 64 x 2 pair axes back to 128 and cast the f32
    // results to f16 (dtype code 5), for both mixed tensors.
    %int1_8633 = torch.constant.int 1
    %int24_8634 = torch.constant.int 24
    %int4608_8635 = torch.constant.int 4608
    %int128_8636 = torch.constant.int 128
    %6842 = torch.prim.ListConstruct %int1_8633, %int24_8634, %int4608_8635, %int128_8636 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6843 = torch.aten.view %6834, %6842 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_8637 = torch.constant.int 5
    %6844 = torch.prims.convert_element_type %6843, %int5_8637 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_8638 = torch.constant.int 1
    %int24_8639 = torch.constant.int 24
    %int4608_8640 = torch.constant.int 4608
    %int128_8641 = torch.constant.int 128
    %6845 = torch.prim.ListConstruct %int1_8638, %int24_8639, %int4608_8640, %int128_8641 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6846 = torch.aten.view %6841, %6845 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_8642 = torch.constant.int 5
    %6847 = torch.prims.convert_element_type %6846, %int5_8642 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled-dot-product attention over %6844, %6847 and %6799 (all
    // [1,24,4608,128] f16). The trailing operands are 0.0, false, none, none;
    // per the ATen schema these are presumably dropout_p, is_causal, attn_mask
    // and scale - confirm against the op definition.
    // Only result #0 is consumed in the lines that follow.
    %float0.000000e00_8643 = torch.constant.float 0.000000e+00
    %false_8644 = torch.constant.bool false
    %none_8645 = torch.constant.none
    %none_8646 = torch.constant.none
    %6848:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%6844, %6847, %6799, %float0.000000e00_8643, %false_8644, %none_8645, %none_8646) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap axes 1 and 2 ([1,24,4608,128] -> [1,4608,24,128]) ...
    %int0_8647 = torch.constant.int 0
    %int2_8648 = torch.constant.int 2
    %int1_8649 = torch.constant.int 1
    %int3_8650 = torch.constant.int 3
    %6849 = torch.prim.ListConstruct %int0_8647, %int2_8648, %int1_8649, %int3_8650 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6850 = torch.aten.permute %6848#0, %6849 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // ... then merge the 24 x 128 axes into a single 3072-wide axis.
    %int1_8651 = torch.constant.int 1
    %int4608_8652 = torch.constant.int 4608
    %int3072_8653 = torch.constant.int 3072
    %6851 = torch.prim.ListConstruct %int1_8651, %int4608_8652, %int3072_8653 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6852 = torch.aten.view %6850, %6851 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Apply GELU (approximate = "tanh") to the 12288-wide tensor %6792, then
    // concatenate it after the 3072-wide attention output along axis 2 to get
    // [1,4608,15360], and drop the leading size-1 axis for the matmul below.
    %str_8654 = torch.constant.str "tanh"
    %6853 = torch.aten.gelu %6792, %str_8654 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %6854 = torch.prim.ListConstruct %6852, %6853 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_8655 = torch.constant.int 2
    %6855 = torch.aten.cat %6854, %int2_8655 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    %int4608_8656 = torch.constant.int 4608
    %int15360_8657 = torch.constant.int 15360
    %6856 = torch.prim.ListConstruct %int4608_8656, %int15360_8657 : (!torch.int, !torch.int) -> !torch.list<int>
    %6857 = torch.aten.view %6855, %6856 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // single_blocks.5.linear2: %6857 @ transpose(weight) + bias.
    // Weight is [3072,15360] f16, transposed to [15360,3072]. Inputs, weight
    // and bias are upcast to f32 (dtype code 6) for the matmul and add; the
    // result is cast back to f16 (dtype code 5) and viewed as [1,4608,3072].
    %__auto.sampler.single_blocks.5.linear2.weight = util.global.load @__auto.sampler.single_blocks.5.linear2.weight : tensor<3072x15360xf16>
    %6858 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.5.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_8658 = torch.constant.int 0
    %int1_8659 = torch.constant.int 1
    %6859 = torch.aten.transpose.int %6858, %int0_8658, %int1_8659 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.5.linear2.bias = util.global.load @__auto.sampler.single_blocks.5.linear2.bias : tensor<3072xf16>
    %6860 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.5.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_8660 = torch.constant.int 6
    %6861 = torch.prims.convert_element_type %6860, %int6_8660 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_8661 = torch.constant.int 6
    %6862 = torch.prims.convert_element_type %6857, %int6_8661 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_8662 = torch.constant.int 6
    %6863 = torch.prims.convert_element_type %6859, %int6_8662 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %6864 = torch.aten.mm %6862, %6863 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Both products below multiply by the integer constant 1.
    // NOTE(review): these look like the alpha/beta factors of a decomposed
    // addmm - confirm with the exporter.
    %int1_8663 = torch.constant.int 1
    %6865 = torch.aten.mul.Scalar %6864, %int1_8663 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_8664 = torch.constant.int 1
    %6866 = torch.aten.mul.Scalar %6861, %int1_8664 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_8665 = torch.constant.int 1
    %6867 = torch.aten.add.Tensor %6865, %6866, %int1_8665 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_8666 = torch.constant.int 5
    %6868 = torch.prims.convert_element_type %6867, %int5_8666 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_8667 = torch.constant.int 1
    %int4608_8668 = torch.constant.int 4608
    %int3072_8669 = torch.constant.int 3072
    %6869 = torch.prim.ListConstruct %int1_8667, %int4608_8668, %int3072_8669 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6870 = torch.aten.view %6868, %6869 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Scale the linear2 output by %6765 ([1,1,3072], broadcast over the 4608
    // axis) and add the product to %6747.
    // NOTE(review): %6765 and %6747 are defined above this view; presumably
    // the block's gate chunk and the block input (residual) - confirm.
    %6871 = torch.aten.mul.Tensor %6765, %6870 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_8670 = torch.constant.int 1
    %6872 = torch.aten.add.Tensor %6747, %6871, %int1_8670 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.6.modulation.lin: silu(%119) @ transpose(weight) + bias.
    // %119 is [1,3072] f16; weight is [9216,3072] f16, transposed to
    // [3072,9216]. The matmul and bias add run in f32 (dtype code 6) and the
    // [1,9216] result is cast back to f16 (dtype code 5).
    %6873 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.6.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.6.modulation.lin.weight : tensor<9216x3072xf16>
    %6874 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.6.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_8671 = torch.constant.int 0
    %int1_8672 = torch.constant.int 1
    %6875 = torch.aten.transpose.int %6874, %int0_8671, %int1_8672 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.6.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.6.modulation.lin.bias : tensor<9216xf16>
    %6876 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.6.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    %int6_8673 = torch.constant.int 6
    %6877 = torch.prims.convert_element_type %6876, %int6_8673 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_8674 = torch.constant.int 6
    %6878 = torch.prims.convert_element_type %6873, %int6_8674 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_8675 = torch.constant.int 6
    %6879 = torch.prims.convert_element_type %6875, %int6_8675 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %6880 = torch.aten.mm %6878, %6879 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Both products below multiply by the integer constant 1.
    // NOTE(review): these look like the alpha/beta factors of a decomposed
    // addmm - confirm with the exporter.
    %int1_8676 = torch.constant.int 1
    %6881 = torch.aten.mul.Scalar %6880, %int1_8676 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_8677 = torch.constant.int 1
    %6882 = torch.aten.mul.Scalar %6877, %int1_8677 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_8678 = torch.constant.int 1
    %6883 = torch.aten.add.Tensor %6881, %6882, %int1_8678 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_8679 = torch.constant.int 5
    %6884 = torch.prims.convert_element_type %6883, %int5_8679 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Reshape the [1,9216] modulation vector to [1,1,9216] and cut it into
    // three 3072-wide chunks along the last axis.
    // The slices on axis 0 and axis 2 run from 0 to INT64_MAX with step 1 and
    // leave the shape unchanged (see the result types); only the unsqueeze
    // alters it.
    %int0_8680 = torch.constant.int 0
    %int0_8681 = torch.constant.int 0
    %int9223372036854775807_8682 = torch.constant.int 9223372036854775807
    %int1_8683 = torch.constant.int 1
    %6885 = torch.aten.slice.Tensor %6884, %int0_8680, %int0_8681, %int9223372036854775807_8682, %int1_8683 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    %int1_8684 = torch.constant.int 1
    %6886 = torch.aten.unsqueeze %6885, %int1_8684 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int2_8685 = torch.constant.int 2
    %int0_8686 = torch.constant.int 0
    %int9223372036854775807_8687 = torch.constant.int 9223372036854775807
    %int1_8688 = torch.constant.int 1
    %6887 = torch.aten.slice.Tensor %6886, %int2_8685, %int0_8686, %int9223372036854775807_8687, %int1_8688 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Chunk [0, 3072): later added to the normalized activations (%6900).
    %int-1_8689 = torch.constant.int -1
    %int0_8690 = torch.constant.int 0
    %int3072_8691 = torch.constant.int 3072
    %int1_8692 = torch.constant.int 1
    %6888 = torch.aten.slice.Tensor %6887, %int-1_8689, %int0_8690, %int3072_8691, %int1_8692 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk [3072, 6144): incremented by 1 below, then used as a multiplier (%6899).
    %int-1_8693 = torch.constant.int -1
    %int3072_8694 = torch.constant.int 3072
    %int6144_8695 = torch.constant.int 6144
    %int1_8696 = torch.constant.int 1
    %6889 = torch.aten.slice.Tensor %6887, %int-1_8693, %int3072_8694, %int6144_8695, %int1_8696 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk [6144, 9216).
    // NOTE(review): presumably the gate applied after this block's linear2,
    // mirroring how %6765 is used for block 5 - confirm at its use site.
    %int-1_8697 = torch.constant.int -1
    %int6144_8698 = torch.constant.int 6144
    %int9216_8699 = torch.constant.int 9216
    %int1_8700 = torch.constant.int 1
    %6890 = torch.aten.slice.Tensor %6887, %int-1_8697, %int6144_8698, %int9216_8699, %int1_8700 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %6891 = %6889 + 1 (scalar 1, trailing integer operand 1).
    %int1_8701 = torch.constant.int 1
    %int1_8702 = torch.constant.int 1
    %6891 = torch.aten.add.Scalar %6889, %int1_8701, %int1_8702 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %6872 over its last axis (no learned weight or bias appears
    // here), then apply the modulation: %6891 * normalized + %6888.
    // Variance and mean are taken in f32 over axis 2 with correction 0 and
    // keepdim = true, giving [1,4608,1] statistics.
    %int6_8703 = torch.constant.int 6
    %6892 = torch.prims.convert_element_type %6872, %int6_8703 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_8704 = torch.constant.int 2
    %6893 = torch.prim.ListConstruct %int2_8704 : (!torch.int) -> !torch.list<int>
    %int0_8705 = torch.constant.int 0
    %true_8706 = torch.constant.bool true
    %result0_8707, %result1_8708 = torch.aten.var_mean.correction %6892, %6893, %int0_8705, %true_8706 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // rsqrt(variance + epsilon), with epsilon ~= 1e-6.
    %float9.999990e-07_8709 = torch.constant.float 9.9999999999999995E-7
    %int1_8710 = torch.constant.int 1
    %6894 = torch.aten.add.Scalar %result0_8707, %float9.999990e-07_8709, %int1_8710 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %6895 = torch.aten.rsqrt %6894 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the f16 tensor %6872 (not the f32 copy %6892) and
    // the f32 mean; its result type is f32.
    %int1_8711 = torch.constant.int 1
    %6896 = torch.aten.sub.Tensor %6872, %result1_8708, %int1_8711 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %6897 = torch.aten.mul.Tensor %6896, %6895 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_8712 = torch.constant.int 5
    %6898 = torch.prims.convert_element_type %6897, %int5_8712 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Modulation in f16; the [1,1,3072] operands broadcast over the 4608 axis.
    %6899 = torch.aten.mul.Tensor %6891, %6898 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_8713 = torch.constant.int 1
    %6900 = torch.aten.add.Tensor %6899, %6888, %int1_8713 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.6.linear1: flatten %6900 to [4608,3072], then compute
    // x @ transpose(weight) + bias with weight [21504,3072] f16.
    // Matmul and bias add run in f32 (dtype code 6); the result is cast back
    // to f16 (dtype code 5) and viewed as [1,4608,21504].
    %int4608_8714 = torch.constant.int 4608
    %int3072_8715 = torch.constant.int 3072
    %6901 = torch.prim.ListConstruct %int4608_8714, %int3072_8715 : (!torch.int, !torch.int) -> !torch.list<int>
    %6902 = torch.aten.view %6900, %6901 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    %__auto.sampler.single_blocks.6.linear1.weight = util.global.load @__auto.sampler.single_blocks.6.linear1.weight : tensor<21504x3072xf16>
    %6903 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.6.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_8716 = torch.constant.int 0
    %int1_8717 = torch.constant.int 1
    %6904 = torch.aten.transpose.int %6903, %int0_8716, %int1_8717 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.6.linear1.bias = util.global.load @__auto.sampler.single_blocks.6.linear1.bias : tensor<21504xf16>
    %6905 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.6.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    %int6_8718 = torch.constant.int 6
    %6906 = torch.prims.convert_element_type %6905, %int6_8718 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_8719 = torch.constant.int 6
    %6907 = torch.prims.convert_element_type %6902, %int6_8719 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_8720 = torch.constant.int 6
    %6908 = torch.prims.convert_element_type %6904, %int6_8720 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %6909 = torch.aten.mm %6907, %6908 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Both products below multiply by the integer constant 1.
    // NOTE(review): these look like the alpha/beta factors of a decomposed
    // addmm - confirm with the exporter.
    %int1_8721 = torch.constant.int 1
    %6910 = torch.aten.mul.Scalar %6909, %int1_8721 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_8722 = torch.constant.int 1
    %6911 = torch.aten.mul.Scalar %6906, %int1_8722 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_8723 = torch.constant.int 1
    %6912 = torch.aten.add.Tensor %6910, %6911, %int1_8723 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_8724 = torch.constant.int 5
    %6913 = torch.prims.convert_element_type %6912, %int5_8724 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_8725 = torch.constant.int 1
    %int4608_8726 = torch.constant.int 4608
    %int21504_8727 = torch.constant.int 21504
    %6914 = torch.prim.ListConstruct %int1_8725, %int4608_8726, %int21504_8727 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6915 = torch.aten.view %6913, %6914 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the 21504-wide linear1 output along the last axis:
    //   [0, 9216)     -> %6916, reshaped below into three per-head tensors
    //   [9216, 21504) -> %6917, the 12288-wide branch fed to GELU later
    %int-1_8728 = torch.constant.int -1
    %int0_8729 = torch.constant.int 0
    %int9216_8730 = torch.constant.int 9216
    %int1_8731 = torch.constant.int 1
    %6916 = torch.aten.slice.Tensor %6915, %int-1_8728, %int0_8729, %int9216_8730, %int1_8731 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_8732 = torch.constant.int -1
    %int9216_8733 = torch.constant.int 9216
    %int21504_8734 = torch.constant.int 21504
    %int1_8735 = torch.constant.int 1
    %6917 = torch.aten.slice.Tensor %6915, %int-1_8732, %int9216_8733, %int21504_8734, %int1_8735 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View 9216 as 3 x 24 x 128 and permute with (2,0,3,1,4) so the size-3
    // axis leads: [1,4608,3,24,128] -> [3,1,24,4608,128].
    %int1_8736 = torch.constant.int 1
    %int4608_8737 = torch.constant.int 4608
    %int3_8738 = torch.constant.int 3
    %int24_8739 = torch.constant.int 24
    %int128_8740 = torch.constant.int 128
    %6918 = torch.prim.ListConstruct %int1_8736, %int4608_8737, %int3_8738, %int24_8739, %int128_8740 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6919 = torch.aten.view %6916, %6918 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    %int2_8741 = torch.constant.int 2
    %int0_8742 = torch.constant.int 0
    %int3_8743 = torch.constant.int 3
    %int1_8744 = torch.constant.int 1
    %int4_8745 = torch.constant.int 4
    %6920 = torch.prim.ListConstruct %int2_8741, %int0_8742, %int3_8743, %int1_8744, %int4_8745 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6921 = torch.aten.permute %6919, %6920 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0 -> %6922 (scaled by query_norm.scale below), index 1 -> %6923
    // (scaled by key_norm.scale), index 2 -> %6924 (third operand of the
    // attention op): query, key and value respectively.
    %int0_8746 = torch.constant.int 0
    %int0_8747 = torch.constant.int 0
    %6922 = torch.aten.select.int %6921, %int0_8746, %int0_8747 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_8748 = torch.constant.int 0
    %int1_8749 = torch.constant.int 1
    %6923 = torch.aten.select.int %6921, %int0_8748, %int1_8749 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_8750 = torch.constant.int 0
    %int2_8751 = torch.constant.int 2
    %6924 = torch.aten.select.int %6921, %int0_8750, %int2_8751 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Root-mean-square normalization of the query %6922 over its last axis:
    //   x * rsqrt(mean(x^2, axis=-1, keepdim=true) + eps), eps ~= 1e-6,
    // computed in f32 (dtype code 6), cast back to f16 (dtype code 5), then
    // multiplied elementwise by the [128] query_norm.scale parameter.
    %int6_8752 = torch.constant.int 6
    %6925 = torch.prims.convert_element_type %6922, %int6_8752 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_8753 = torch.constant.int 2
    %6926 = torch.aten.pow.Tensor_Scalar %6925, %int2_8753 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_8754 = torch.constant.int -1
    %6927 = torch.prim.ListConstruct %int-1_8754 : (!torch.int) -> !torch.list<int>
    %true_8755 = torch.constant.bool true
    %none_8756 = torch.constant.none
    %6928 = torch.aten.mean.dim %6926, %6927, %true_8755, %none_8756 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_8757 = torch.constant.float 9.9999999999999995E-7
    %int1_8758 = torch.constant.int 1
    %6929 = torch.aten.add.Scalar %6928, %float9.999990e-07_8757, %int1_8758 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %6930 = torch.aten.rsqrt %6929 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %6931 = torch.aten.mul.Tensor %6925, %6930 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_8759 = torch.constant.int 5
    %6932 = torch.prims.convert_element_type %6931, %int5_8759 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.6.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.6.norm.query_norm.scale : tensor<128xf16>
    %6933 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.6.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %6934 = torch.aten.mul.Tensor %6932, %6933 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Root-mean-square normalization of the key %6923 over its last axis:
    //   x * rsqrt(mean(x^2, axis=-1, keepdim=true) + eps), eps ~= 1e-6,
    // computed in f32 (dtype code 6), cast back to f16 (dtype code 5), then
    // multiplied elementwise by the [128] key_norm.scale parameter.
    %int6_8760 = torch.constant.int 6
    %6935 = torch.prims.convert_element_type %6923, %int6_8760 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_8761 = torch.constant.int 2
    %6936 = torch.aten.pow.Tensor_Scalar %6935, %int2_8761 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_8762 = torch.constant.int -1
    %6937 = torch.prim.ListConstruct %int-1_8762 : (!torch.int) -> !torch.list<int>
    %true_8763 = torch.constant.bool true
    %none_8764 = torch.constant.none
    %6938 = torch.aten.mean.dim %6936, %6937, %true_8763, %none_8764 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_8765 = torch.constant.float 9.9999999999999995E-7
    %int1_8766 = torch.constant.int 1
    %6939 = torch.aten.add.Scalar %6938, %float9.999990e-07_8765, %int1_8766 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %6940 = torch.aten.rsqrt %6939 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %6941 = torch.aten.mul.Tensor %6935, %6940 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_8767 = torch.constant.int 5
    %6942 = torch.prims.convert_element_type %6941, %int5_8767 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.6.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.6.norm.key_norm.scale : tensor<128xf16>
    %6943 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.6.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %6944 = torch.aten.mul.Tensor %6942, %6943 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Prepare the normalized query (%6934) and key (%6944) for the pairwise
    // mixing with %211 that follows.
    // The first two casts use dtype code 5 and go f16 -> f16, so they do not
    // change the element type (see the operand/result types).
    %int5_8768 = torch.constant.int 5
    %6945 = torch.prims.convert_element_type %6934, %int5_8768 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_8769 = torch.constant.int 5
    %6946 = torch.prims.convert_element_type %6944, %int5_8769 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Query: upcast to f32 (dtype code 6) and split the last axis of 128 into
    // 64 x 1 x 2, i.e. 64 pairs of adjacent elements.
    %int6_8770 = torch.constant.int 6
    %6947 = torch.prims.convert_element_type %6945, %int6_8770 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_8771 = torch.constant.int 1
    %int24_8772 = torch.constant.int 24
    %int4608_8773 = torch.constant.int 4608
    %int64_8774 = torch.constant.int 64
    %int1_8775 = torch.constant.int 1
    %int2_8776 = torch.constant.int 2
    %6948 = torch.prim.ListConstruct %int1_8771, %int24_8772, %int4608_8773, %int64_8774, %int1_8775, %int2_8776 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6949 = torch.aten.view %6947, %6948 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Key: same upcast and reshape.
    %int6_8777 = torch.constant.int 6
    %6950 = torch.prims.convert_element_type %6946, %int6_8777 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_8778 = torch.constant.int 1
    %int24_8779 = torch.constant.int 24
    %int4608_8780 = torch.constant.int 4608
    %int64_8781 = torch.constant.int 64
    %int1_8782 = torch.constant.int 1
    %int2_8783 = torch.constant.int 2
    %6951 = torch.prim.ListConstruct %int1_8778, %int24_8779, %int4608_8780, %int64_8781, %int1_8782, %int2_8783 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6952 = torch.aten.view %6950, %6951 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Mix each element pair of the query view %6949 with the 2x2 coefficients
    // held in %211:
    //   %6959[..., i] = %211[..., i, 0] * q[..., 0] + %211[..., i, 1] * q[..., 1]
    // The [.., 64, 1] slices of %6949 broadcast against the [.., 64, 2] slices
    // of %211 (and across %211's size-1 axis 1 to 24).
    // NOTE(review): this has the shape of a rotary position embedding
    // application; confirm against the definition of %211.
    %int5_8784 = torch.constant.int 5
    %int0_8785 = torch.constant.int 0
    %6953 = torch.aten.select.int %211, %int5_8784, %int0_8785 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_8786 = torch.constant.int 5
    %int0_8787 = torch.constant.int 0
    %6954 = torch.aten.select.int %6949, %int5_8786, %int0_8787 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6955 = torch.aten.mul.Tensor %6953, %6954 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_8788 = torch.constant.int 5
    %int1_8789 = torch.constant.int 1
    %6956 = torch.aten.select.int %211, %int5_8788, %int1_8789 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_8790 = torch.constant.int 5
    %int1_8791 = torch.constant.int 1
    %6957 = torch.aten.select.int %6949, %int5_8790, %int1_8791 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6958 = torch.aten.mul.Tensor %6956, %6957 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two products (the trailing integer operand is 1).
    %int1_8792 = torch.constant.int 1
    %6959 = torch.aten.add.Tensor %6955, %6958, %int1_8792 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Same pairwise mixing with %211, applied to the key view %6952:
    //   %6966[..., i] = %211[..., i, 0] * k[..., 0] + %211[..., i, 1] * k[..., 1]
    // The two slices of %211 are re-selected here rather than reusing the
    // earlier selects on the same tensor.
    %int5_8793 = torch.constant.int 5
    %int0_8794 = torch.constant.int 0
    %6960 = torch.aten.select.int %211, %int5_8793, %int0_8794 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_8795 = torch.constant.int 5
    %int0_8796 = torch.constant.int 0
    %6961 = torch.aten.select.int %6952, %int5_8795, %int0_8796 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6962 = torch.aten.mul.Tensor %6960, %6961 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_8797 = torch.constant.int 5
    %int1_8798 = torch.constant.int 1
    %6963 = torch.aten.select.int %211, %int5_8797, %int1_8798 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_8799 = torch.constant.int 5
    %int1_8800 = torch.constant.int 1
    %6964 = torch.aten.select.int %6952, %int5_8799, %int1_8800 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %6965 = torch.aten.mul.Tensor %6963, %6964 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_8801 = torch.constant.int 1
    %6966 = torch.aten.add.Tensor %6962, %6965, %int1_8801 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Collapse the trailing 64 x 2 pair axes back to 128 and cast the f32
    // results to f16 (dtype code 5): %6969 from the query path, %6972 from
    // the key path.
    %int1_8802 = torch.constant.int 1
    %int24_8803 = torch.constant.int 24
    %int4608_8804 = torch.constant.int 4608
    %int128_8805 = torch.constant.int 128
    %6967 = torch.prim.ListConstruct %int1_8802, %int24_8803, %int4608_8804, %int128_8805 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6968 = torch.aten.view %6959, %6967 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_8806 = torch.constant.int 5
    %6969 = torch.prims.convert_element_type %6968, %int5_8806 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_8807 = torch.constant.int 1
    %int24_8808 = torch.constant.int 24
    %int4608_8809 = torch.constant.int 4608
    %int128_8810 = torch.constant.int 128
    %6970 = torch.prim.ListConstruct %int1_8807, %int24_8808, %int4608_8809, %int128_8810 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6971 = torch.aten.view %6966, %6970 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_8811 = torch.constant.int 5
    %6972 = torch.prims.convert_element_type %6971, %int5_8811 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled-dot-product attention over query %6969, key %6972 and value
    // %6924 (all [1,24,4608,128] f16). The trailing operands are 0.0, false,
    // none, none; per the ATen schema these are presumably dropout_p,
    // is_causal, attn_mask and scale - confirm against the op definition.
    // Only result #0 is consumed in the lines that follow.
    %float0.000000e00_8812 = torch.constant.float 0.000000e+00
    %false_8813 = torch.constant.bool false
    %none_8814 = torch.constant.none
    %none_8815 = torch.constant.none
    %6973:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%6969, %6972, %6924, %float0.000000e00_8812, %false_8813, %none_8814, %none_8815) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap axes 1 and 2 ([1,24,4608,128] -> [1,4608,24,128]) ...
    %int0_8816 = torch.constant.int 0
    %int2_8817 = torch.constant.int 2
    %int1_8818 = torch.constant.int 1
    %int3_8819 = torch.constant.int 3
    %6974 = torch.prim.ListConstruct %int0_8816, %int2_8817, %int1_8818, %int3_8819 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6975 = torch.aten.permute %6973#0, %6974 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // ... then merge the 24 x 128 axes into a single 3072-wide axis.
    %int1_8820 = torch.constant.int 1
    %int4608_8821 = torch.constant.int 4608
    %int3072_8822 = torch.constant.int 3072
    %6976 = torch.prim.ListConstruct %int1_8820, %int4608_8821, %int3072_8822 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6977 = torch.aten.view %6975, %6976 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    %str_8823 = torch.constant.str "tanh"
    %6978 = torch.aten.gelu %6917, %str_8823 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %6979 = torch.prim.ListConstruct %6977, %6978 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_8824 = torch.constant.int 2
    %6980 = torch.aten.cat %6979, %int2_8824 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    %int4608_8825 = torch.constant.int 4608
    %int15360_8826 = torch.constant.int 15360
    %6981 = torch.prim.ListConstruct %int4608_8825, %int15360_8826 : (!torch.int, !torch.int) -> !torch.list<int>
    %6982 = torch.aten.view %6980, %6981 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    %__auto.sampler.single_blocks.6.linear2.weight = util.global.load @__auto.sampler.single_blocks.6.linear2.weight : tensor<3072x15360xf16>
    %6983 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.6.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_8827 = torch.constant.int 0
    %int1_8828 = torch.constant.int 1
    %6984 = torch.aten.transpose.int %6983, %int0_8827, %int1_8828 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.6.linear2.bias = util.global.load @__auto.sampler.single_blocks.6.linear2.bias : tensor<3072xf16>
    %6985 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.6.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_8829 = torch.constant.int 6
    %6986 = torch.prims.convert_element_type %6985, %int6_8829 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_8830 = torch.constant.int 6
    %6987 = torch.prims.convert_element_type %6982, %int6_8830 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_8831 = torch.constant.int 6
    %6988 = torch.prims.convert_element_type %6984, %int6_8831 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %6989 = torch.aten.mm %6987, %6988 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    %int1_8832 = torch.constant.int 1
    %6990 = torch.aten.mul.Scalar %6989, %int1_8832 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_8833 = torch.constant.int 1
    %6991 = torch.aten.mul.Scalar %6986, %int1_8833 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_8834 = torch.constant.int 1
    %6992 = torch.aten.add.Tensor %6990, %6991, %int1_8834 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_8835 = torch.constant.int 5
    %6993 = torch.prims.convert_element_type %6992, %int5_8835 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_8836 = torch.constant.int 1
    %int4608_8837 = torch.constant.int 4608
    %int3072_8838 = torch.constant.int 3072
    %6994 = torch.prim.ListConstruct %int1_8836, %int4608_8837, %int3072_8838 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %6995 = torch.aten.view %6993, %6994 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    %6996 = torch.aten.mul.Tensor %6890, %6995 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_8839 = torch.constant.int 1
    %6997 = torch.aten.add.Tensor %6872, %6996, %int1_8839 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // ---- single_blocks.7: modulation ----
    // Applies SiLU to %119 (defined outside this chunk) and feeds it through the
    // single_blocks.7.modulation.lin linear layer, yielding a [1,9216] f16 vector
    // that is then cut into three [1,1,3072] pieces.
    %6998 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Weight is stored as [9216,3072]; transpose it to [3072,9216] for the matmul.
    %__auto.sampler.single_blocks.7.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.7.modulation.lin.weight : tensor<9216x3072xf16>
    %6999 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.7.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_8840 = torch.constant.int 0
    %int1_8841 = torch.constant.int 1
    %7000 = torch.aten.transpose.int %6999, %int0_8840, %int1_8841 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.7.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.7.modulation.lin.bias : tensor<9216xf16>
    %7001 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.7.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Up-cast bias, activation and weight from f16 to f32 (dtype code 6) so the
    // matmul and bias add below are carried out in f32.
    %int6_8842 = torch.constant.int 6
    %7002 = torch.prims.convert_element_type %7001, %int6_8842 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_8843 = torch.constant.int 6
    %7003 = torch.prims.convert_element_type %6998, %int6_8843 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_8844 = torch.constant.int 6
    %7004 = torch.prims.convert_element_type %7000, %int6_8844 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %7005 = torch.aten.mm %7003, %7004 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // The two multiplications by the integer constant 1 and the add with alpha 1
    // leave the values unchanged.
    // NOTE(review): looks like a decomposed addmm with alpha = beta = 1 - confirm.
    %int1_8845 = torch.constant.int 1
    %7006 = torch.aten.mul.Scalar %7005, %int1_8845 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_8846 = torch.constant.int 1
    %7007 = torch.aten.mul.Scalar %7002, %int1_8846 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_8847 = torch.constant.int 1
    %7008 = torch.aten.add.Tensor %7006, %7007, %int1_8847 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    // Cast the linear output back to f16 (dtype code 5).
    %int5_8848 = torch.constant.int 5
    %7009 = torch.prims.convert_element_type %7008, %int5_8848 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): shape unchanged.
    %int0_8849 = torch.constant.int 0
    %int0_8850 = torch.constant.int 0
    %int9223372036854775807_8851 = torch.constant.int 9223372036854775807
    %int1_8852 = torch.constant.int 1
    %7010 = torch.aten.slice.Tensor %7009, %int0_8849, %int0_8850, %int9223372036854775807_8851, %int1_8852 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a size-1 axis at position 1 so the result broadcasts against
    // [1,4608,3072] tensors later on.
    %int1_8853 = torch.constant.int 1
    %7011 = torch.aten.unsqueeze %7010, %int1_8853 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Full-range slice on dim 2: shape unchanged.
    %int2_8854 = torch.constant.int 2
    %int0_8855 = torch.constant.int 0
    %int9223372036854775807_8856 = torch.constant.int 9223372036854775807
    %int1_8857 = torch.constant.int 1
    %7012 = torch.aten.slice.Tensor %7011, %int2_8854, %int0_8855, %int9223372036854775807_8856, %int1_8857 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Chunk 1 of 3, columns [0, 3072): %7013, used further down as the additive
    // shift applied after normalisation.
    %int-1_8858 = torch.constant.int -1
    %int0_8859 = torch.constant.int 0
    %int3072_8860 = torch.constant.int 3072
    %int1_8861 = torch.constant.int 1
    %7013 = torch.aten.slice.Tensor %7012, %int-1_8858, %int0_8859, %int3072_8860, %int1_8861 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2 of 3, columns [3072, 6144): %7014, the multiplicative scale.
    %int-1_8862 = torch.constant.int -1
    %int3072_8863 = torch.constant.int 3072
    %int6144_8864 = torch.constant.int 6144
    %int1_8865 = torch.constant.int 1
    %7014 = torch.aten.slice.Tensor %7012, %int-1_8862, %int3072_8863, %int6144_8864, %int1_8865 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 3 of 3, columns [6144, 9216): %7015, used further down to multiply
    // the linear2 output before the residual add (a gate).
    %int-1_8866 = torch.constant.int -1
    %int6144_8867 = torch.constant.int 6144
    %int9216_8868 = torch.constant.int 9216
    %int1_8869 = torch.constant.int 1
    %7015 = torch.aten.slice.Tensor %7012, %int-1_8866, %int6144_8867, %int9216_8868, %int1_8869 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %7016 = scale + 1 (scalar 1, alpha 1).
    %int1_8870 = torch.constant.int 1
    %int1_8871 = torch.constant.int 1
    %7016 = torch.aten.add.Scalar %7014, %int1_8870, %int1_8871 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // ---- single_blocks.7: normalisation over the last dim, then modulate ----
    // Computes (x - mean) * rsqrt(var + eps) over dim 2 of the block input %6997
    // with no learned weight/bias, then applies (1 + scale) * x_norm + shift.
    // Statistics are computed in f32 (dtype code 6).
    %int6_8872 = torch.constant.int 6
    %7017 = torch.prims.convert_element_type %6997, %int6_8872 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    // var_mean over dim 2 with correction 0 and keepdim = true:
    // %result0_8876 is the variance, %result1_8877 the mean, both [1,4608,1].
    %int2_8873 = torch.constant.int 2
    %7018 = torch.prim.ListConstruct %int2_8873 : (!torch.int) -> !torch.list<int>
    %int0_8874 = torch.constant.int 0
    %true_8875 = torch.constant.bool true
    %result0_8876, %result1_8877 = torch.aten.var_mean.correction %7017, %7018, %int0_8874, %true_8875 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // Epsilon: the literal is the nearest double to 1e-6; it is added to the
    // variance before the reciprocal square root.
    %float9.999990e-07_8878 = torch.constant.float 9.9999999999999995E-7
    %int1_8879 = torch.constant.int 1
    %7019 = torch.aten.add.Scalar %result0_8876, %float9.999990e-07_8878, %int1_8879 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %7020 = torch.aten.rsqrt %7019 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the original f16 tensor %6997 (not the f32 copy
    // %7017) together with the f32 mean; the declared result type is f32.
    %int1_8880 = torch.constant.int 1
    %7021 = torch.aten.sub.Tensor %6997, %result1_8877, %int1_8880 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %7022 = torch.aten.mul.Tensor %7021, %7020 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    // Back to f16 (dtype code 5) before modulation.
    %int5_8881 = torch.constant.int 5
    %7023 = torch.prims.convert_element_type %7022, %int5_8881 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Modulate: %7016 (1 + scale) * normalised input, then add %7013 (shift).
    // Both modulation tensors are [1,1,3072] and broadcast over the 4608 axis.
    %7024 = torch.aten.mul.Tensor %7016, %7023 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_8882 = torch.constant.int 1
    %7025 = torch.aten.add.Tensor %7024, %7013, %int1_8882 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // ---- single_blocks.7: linear1 (3072 -> 21504) and split ----
    // Flatten the modulated input %7025 to 2-D [4608,3072] for the matmul.
    %int4608_8883 = torch.constant.int 4608
    %int3072_8884 = torch.constant.int 3072
    %7026 = torch.prim.ListConstruct %int4608_8883, %int3072_8884 : (!torch.int, !torch.int) -> !torch.list<int>
    %7027 = torch.aten.view %7025, %7026 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // Weight is stored as [21504,3072]; transpose it to [3072,21504].
    %__auto.sampler.single_blocks.7.linear1.weight = util.global.load @__auto.sampler.single_blocks.7.linear1.weight : tensor<21504x3072xf16>
    %7028 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.7.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_8885 = torch.constant.int 0
    %int1_8886 = torch.constant.int 1
    %7029 = torch.aten.transpose.int %7028, %int0_8885, %int1_8886 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.7.linear1.bias = util.global.load @__auto.sampler.single_blocks.7.linear1.bias : tensor<21504xf16>
    %7030 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.7.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Up-cast bias, activation and weight to f32 (dtype code 6); the matmul and
    // bias add run in f32.
    %int6_8887 = torch.constant.int 6
    %7031 = torch.prims.convert_element_type %7030, %int6_8887 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_8888 = torch.constant.int 6
    %7032 = torch.prims.convert_element_type %7027, %int6_8888 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_8889 = torch.constant.int 6
    %7033 = torch.prims.convert_element_type %7029, %int6_8889 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %7034 = torch.aten.mm %7032, %7033 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Multiplications by the integer constant 1 and the add with alpha 1 leave
    // the values unchanged.
    // NOTE(review): looks like a decomposed addmm with alpha = beta = 1 - confirm.
    %int1_8890 = torch.constant.int 1
    %7035 = torch.aten.mul.Scalar %7034, %int1_8890 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_8891 = torch.constant.int 1
    %7036 = torch.aten.mul.Scalar %7031, %int1_8891 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_8892 = torch.constant.int 1
    %7037 = torch.aten.add.Tensor %7035, %7036, %int1_8892 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    // Cast back to f16 (dtype code 5) and restore the leading size-1 axis.
    %int5_8893 = torch.constant.int 5
    %7038 = torch.prims.convert_element_type %7037, %int5_8893 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_8894 = torch.constant.int 1
    %int4608_8895 = torch.constant.int 4608
    %int21504_8896 = torch.constant.int 21504
    %7039 = torch.prim.ListConstruct %int1_8894, %int4608_8895, %int21504_8896 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7040 = torch.aten.view %7038, %7039 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split along the last dim: columns [0, 9216) -> %7041, later reshaped into
    // three [.,24,.,128] tensors for attention.
    %int-1_8897 = torch.constant.int -1
    %int0_8898 = torch.constant.int 0
    %int9216_8899 = torch.constant.int 9216
    %int1_8900 = torch.constant.int 1
    %7041 = torch.aten.slice.Tensor %7040, %int-1_8897, %int0_8898, %int9216_8899, %int1_8900 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    // Columns [9216, 21504) -> %7042 (12288 wide), later passed through GELU.
    %int-1_8901 = torch.constant.int -1
    %int9216_8902 = torch.constant.int 9216
    %int21504_8903 = torch.constant.int 21504
    %int1_8904 = torch.constant.int 1
    %7042 = torch.aten.slice.Tensor %7040, %int-1_8901, %int9216_8902, %int21504_8903, %int1_8904 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // ---- single_blocks.7: reshape the 9216-wide slice into three tensors ----
    // View [1,4608,9216] as [1,4608,3,24,128] (9216 = 3 * 24 * 128).
    %int1_8905 = torch.constant.int 1
    %int4608_8906 = torch.constant.int 4608
    %int3_8907 = torch.constant.int 3
    %int24_8908 = torch.constant.int 24
    %int128_8909 = torch.constant.int 128
    %7043 = torch.prim.ListConstruct %int1_8905, %int4608_8906, %int3_8907, %int24_8908, %int128_8909 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7044 = torch.aten.view %7041, %7043 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2,0,3,1,4) -> [3,1,24,4608,128], moving the size-3
    // axis to the front and the size-24 axis ahead of the 4608 axis.
    %int2_8910 = torch.constant.int 2
    %int0_8911 = torch.constant.int 0
    %int3_8912 = torch.constant.int 3
    %int1_8913 = torch.constant.int 1
    %int4_8914 = torch.constant.int 4
    %7045 = torch.prim.ListConstruct %int2_8910, %int0_8911, %int3_8912, %int1_8913, %int4_8914 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7046 = torch.aten.permute %7044, %7045 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0 along dim 0 -> %7047; it becomes the first attention operand
    // (query) after normalisation and the pairwise combination with %211.
    %int0_8915 = torch.constant.int 0
    %int0_8916 = torch.constant.int 0
    %7047 = torch.aten.select.int %7046, %int0_8915, %int0_8916 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 1 along dim 0 -> %7048; it becomes the second attention operand (key).
    %int0_8917 = torch.constant.int 0
    %int1_8918 = torch.constant.int 1
    %7048 = torch.aten.select.int %7046, %int0_8917, %int1_8918 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 2 along dim 0 -> %7049; passed unmodified as the third attention
    // operand (value).
    %int0_8919 = torch.constant.int 0
    %int2_8920 = torch.constant.int 2
    %7049 = torch.aten.select.int %7046, %int0_8919, %int2_8920 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- single_blocks.7: RMS-style normalisation of %7047 and %7048 ----
    // For each tensor: y = x * rsqrt(mean(x^2, dim=-1, keepdim) + eps), computed
    // in f32, cast to f16, then multiplied by a learned [128] scale vector.
    //
    // First tensor (%7047), using norm.query_norm.scale.
    %int6_8921 = torch.constant.int 6
    %7050 = torch.prims.convert_element_type %7047, %int6_8921 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Element-wise square.
    %int2_8922 = torch.constant.int 2
    %7051 = torch.aten.pow.Tensor_Scalar %7050, %int2_8922 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean over the last dim with keepdim = true -> [1,24,4608,1].
    %int-1_8923 = torch.constant.int -1
    %7052 = torch.prim.ListConstruct %int-1_8923 : (!torch.int) -> !torch.list<int>
    %true_8924 = torch.constant.bool true
    %none_8925 = torch.constant.none
    %7053 = torch.aten.mean.dim %7051, %7052, %true_8924, %none_8925 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // Epsilon (nearest double to 1e-6) is added before the reciprocal square root.
    %float9.999990e-07_8926 = torch.constant.float 9.9999999999999995E-7
    %int1_8927 = torch.constant.int 1
    %7054 = torch.aten.add.Scalar %7053, %float9.999990e-07_8926, %int1_8927 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %7055 = torch.aten.rsqrt %7054 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %7056 = torch.aten.mul.Tensor %7050, %7055 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    // Back to f16 (dtype code 5), then apply the learned per-channel scale.
    %int5_8928 = torch.constant.int 5
    %7057 = torch.prims.convert_element_type %7056, %int5_8928 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.7.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.7.norm.query_norm.scale : tensor<128xf16>
    %7058 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.7.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %7059 = torch.aten.mul.Tensor %7057, %7058 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Second tensor (%7048): same sequence of ops, using norm.key_norm.scale.
    %int6_8929 = torch.constant.int 6
    %7060 = torch.prims.convert_element_type %7048, %int6_8929 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_8930 = torch.constant.int 2
    %7061 = torch.aten.pow.Tensor_Scalar %7060, %int2_8930 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_8931 = torch.constant.int -1
    %7062 = torch.prim.ListConstruct %int-1_8931 : (!torch.int) -> !torch.list<int>
    %true_8932 = torch.constant.bool true
    %none_8933 = torch.constant.none
    %7063 = torch.aten.mean.dim %7061, %7062, %true_8932, %none_8933 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_8934 = torch.constant.float 9.9999999999999995E-7
    %int1_8935 = torch.constant.int 1
    %7064 = torch.aten.add.Scalar %7063, %float9.999990e-07_8934, %int1_8935 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %7065 = torch.aten.rsqrt %7064 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %7066 = torch.aten.mul.Tensor %7060, %7065 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_8936 = torch.constant.int 5
    %7067 = torch.prims.convert_element_type %7066, %int5_8936 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.7.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.7.norm.key_norm.scale : tensor<128xf16>
    %7068 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.7.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %7069 = torch.aten.mul.Tensor %7067, %7068 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- single_blocks.7: pairwise combination of q/k with the table %211 ----
    // %211 has shape [1,1,4608,64,2,2] and is defined outside this chunk.
    // NOTE(review): presumably the rotary position-embedding table - confirm
    // against its definition.
    // For x in {%7059, %7069}, viewed as [1,24,4608,64,1,2], the result is
    //   out = %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]
    // with shape [1,24,4608,64,2], flattened back to [1,24,4608,128].
    //
    // f16 -> f16 casts (dtype code 5): the operand and result types are
    // identical, so these do not change the element type.
    %int5_8937 = torch.constant.int 5
    %7070 = torch.prims.convert_element_type %7059, %int5_8937 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_8938 = torch.constant.int 5
    %7071 = torch.prims.convert_element_type %7069, %int5_8938 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // First tensor: up-cast to f32 and view the 128 channels as 64 pairs with an
    // extra size-1 axis -> [1,24,4608,64,1,2].
    %int6_8939 = torch.constant.int 6
    %7072 = torch.prims.convert_element_type %7070, %int6_8939 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_8940 = torch.constant.int 1
    %int24_8941 = torch.constant.int 24
    %int4608_8942 = torch.constant.int 4608
    %int64_8943 = torch.constant.int 64
    %int1_8944 = torch.constant.int 1
    %int2_8945 = torch.constant.int 2
    %7073 = torch.prim.ListConstruct %int1_8940, %int24_8941, %int4608_8942, %int64_8943, %int1_8944, %int2_8945 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7074 = torch.aten.view %7072, %7073 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Second tensor: same up-cast and view.
    %int6_8946 = torch.constant.int 6
    %7075 = torch.prims.convert_element_type %7071, %int6_8946 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_8947 = torch.constant.int 1
    %int24_8948 = torch.constant.int 24
    %int4608_8949 = torch.constant.int 4608
    %int64_8950 = torch.constant.int 64
    %int1_8951 = torch.constant.int 1
    %int2_8952 = torch.constant.int 2
    %7076 = torch.prim.ListConstruct %int1_8947, %int24_8948, %int4608_8949, %int64_8950, %int1_8951, %int2_8952 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7077 = torch.aten.view %7075, %7076 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // First tensor, term 0: %211[..., 0] * x[..., 0]. The [.,64,1] operand
    // broadcasts against the [.,64,2] table slice, and the size-1 head axis of
    // %211 broadcasts over the 24 heads.
    %int5_8953 = torch.constant.int 5
    %int0_8954 = torch.constant.int 0
    %7078 = torch.aten.select.int %211, %int5_8953, %int0_8954 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_8955 = torch.constant.int 5
    %int0_8956 = torch.constant.int 0
    %7079 = torch.aten.select.int %7074, %int5_8955, %int0_8956 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7080 = torch.aten.mul.Tensor %7078, %7079 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // First tensor, term 1: %211[..., 1] * x[..., 1].
    %int5_8957 = torch.constant.int 5
    %int1_8958 = torch.constant.int 1
    %7081 = torch.aten.select.int %211, %int5_8957, %int1_8958 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_8959 = torch.constant.int 5
    %int1_8960 = torch.constant.int 1
    %7082 = torch.aten.select.int %7074, %int5_8959, %int1_8960 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7083 = torch.aten.mul.Tensor %7081, %7082 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two terms for the first tensor.
    %int1_8961 = torch.constant.int 1
    %7084 = torch.aten.add.Tensor %7080, %7083, %int1_8961 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second tensor, term 0: %211[..., 0] * x[..., 0].
    %int5_8962 = torch.constant.int 5
    %int0_8963 = torch.constant.int 0
    %7085 = torch.aten.select.int %211, %int5_8962, %int0_8963 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_8964 = torch.constant.int 5
    %int0_8965 = torch.constant.int 0
    %7086 = torch.aten.select.int %7077, %int5_8964, %int0_8965 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7087 = torch.aten.mul.Tensor %7085, %7086 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second tensor, term 1: %211[..., 1] * x[..., 1].
    %int5_8966 = torch.constant.int 5
    %int1_8967 = torch.constant.int 1
    %7088 = torch.aten.select.int %211, %int5_8966, %int1_8967 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_8968 = torch.constant.int 5
    %int1_8969 = torch.constant.int 1
    %7089 = torch.aten.select.int %7077, %int5_8968, %int1_8969 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7090 = torch.aten.mul.Tensor %7088, %7089 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two terms for the second tensor.
    %int1_8970 = torch.constant.int 1
    %7091 = torch.aten.add.Tensor %7087, %7090, %int1_8970 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Flatten the (64,2) pair axes back to 128 channels and cast to f16
    // (dtype code 5): %7094 is the first attention operand.
    %int1_8971 = torch.constant.int 1
    %int24_8972 = torch.constant.int 24
    %int4608_8973 = torch.constant.int 4608
    %int128_8974 = torch.constant.int 128
    %7092 = torch.prim.ListConstruct %int1_8971, %int24_8972, %int4608_8973, %int128_8974 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7093 = torch.aten.view %7084, %7092 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_8975 = torch.constant.int 5
    %7094 = torch.prims.convert_element_type %7093, %int5_8975 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Same flatten and cast for the second tensor: %7097 is the second
    // attention operand.
    %int1_8976 = torch.constant.int 1
    %int24_8977 = torch.constant.int 24
    %int4608_8978 = torch.constant.int 4608
    %int128_8979 = torch.constant.int 128
    %7095 = torch.prim.ListConstruct %int1_8976, %int24_8977, %int4608_8978, %int128_8979 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7096 = torch.aten.view %7091, %7095 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_8980 = torch.constant.int 5
    %7097 = torch.prims.convert_element_type %7096, %int5_8980 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- single_blocks.7: attention ----
    // Operands in order: %7094, %7097, %7049 (all [1,24,4608,128] f16), then the
    // float 0.0, the bool false and two `none` values.
    // NOTE(review): per the aten schema these trailing operands should be
    // dropout_p, is_causal, attn_mask and scale - confirm against the op
    // definition.
    %float0.000000e00_8981 = torch.constant.float 0.000000e+00
    %false_8982 = torch.constant.bool false
    %none_8983 = torch.constant.none
    %none_8984 = torch.constant.none
    // Two results: #0 is the [1,24,4608,128] f16 output; #1 is a [1,24,4608] f32
    // tensor that is not referenced within the visible part of the file.
    %7098:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%7094, %7097, %7049, %float0.000000e00_8981, %false_8982, %none_8983, %none_8984) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2 (order 0,2,1,3): [1,24,4608,128] -> [1,4608,24,128].
    %int0_8985 = torch.constant.int 0
    %int2_8986 = torch.constant.int 2
    %int1_8987 = torch.constant.int 1
    %int3_8988 = torch.constant.int 3
    %7099 = torch.prim.ListConstruct %int0_8985, %int2_8986, %int1_8987, %int3_8988 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7100 = torch.aten.permute %7098#0, %7099 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the (24,128) axes into one 3072-wide axis -> [1,4608,3072].
    %int1_8989 = torch.constant.int 1
    %int4608_8990 = torch.constant.int 4608
    %int3072_8991 = torch.constant.int 3072
    %7101 = torch.prim.ListConstruct %int1_8989, %int4608_8990, %int3072_8991 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7102 = torch.aten.view %7100, %7101 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // ---- single_blocks.7: GELU branch, concat, linear2, gated residual ----
    // GELU with the "tanh" approximation string on the 12288-wide slice %7042
    // of the linear1 output.
    %str_8992 = torch.constant.str "tanh"
    %7103 = torch.aten.gelu %7042, %str_8992 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    // Concatenate the attention output (3072) and the GELU output (12288) along
    // dim 2 -> [1,4608,15360].
    %7104 = torch.prim.ListConstruct %7102, %7103 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_8993 = torch.constant.int 2
    %7105 = torch.aten.cat %7104, %int2_8993 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Flatten to 2-D [4608,15360] for the matmul.
    %int4608_8994 = torch.constant.int 4608
    %int15360_8995 = torch.constant.int 15360
    %7106 = torch.prim.ListConstruct %int4608_8994, %int15360_8995 : (!torch.int, !torch.int) -> !torch.list<int>
    %7107 = torch.aten.view %7105, %7106 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // Weight is stored as [3072,15360]; transpose it to [15360,3072].
    %__auto.sampler.single_blocks.7.linear2.weight = util.global.load @__auto.sampler.single_blocks.7.linear2.weight : tensor<3072x15360xf16>
    %7108 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.7.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_8996 = torch.constant.int 0
    %int1_8997 = torch.constant.int 1
    %7109 = torch.aten.transpose.int %7108, %int0_8996, %int1_8997 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.7.linear2.bias = util.global.load @__auto.sampler.single_blocks.7.linear2.bias : tensor<3072xf16>
    %7110 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.7.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Up-cast bias, activation and weight to f32 (dtype code 6); the matmul and
    // bias add run in f32.
    %int6_8998 = torch.constant.int 6
    %7111 = torch.prims.convert_element_type %7110, %int6_8998 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_8999 = torch.constant.int 6
    %7112 = torch.prims.convert_element_type %7107, %int6_8999 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_9000 = torch.constant.int 6
    %7113 = torch.prims.convert_element_type %7109, %int6_9000 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %7114 = torch.aten.mm %7112, %7113 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Multiplications by the integer constant 1 and the add with alpha 1 leave
    // the values unchanged.
    // NOTE(review): looks like a decomposed addmm with alpha = beta = 1 - confirm.
    %int1_9001 = torch.constant.int 1
    %7115 = torch.aten.mul.Scalar %7114, %int1_9001 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_9002 = torch.constant.int 1
    %7116 = torch.aten.mul.Scalar %7111, %int1_9002 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_9003 = torch.constant.int 1
    %7117 = torch.aten.add.Tensor %7115, %7116, %int1_9003 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    // Cast back to f16 (dtype code 5) and restore the leading size-1 axis.
    %int5_9004 = torch.constant.int 5
    %7118 = torch.prims.convert_element_type %7117, %int5_9004 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_9005 = torch.constant.int 1
    %int4608_9006 = torch.constant.int 4608
    %int3072_9007 = torch.constant.int 3072
    %7119 = torch.prim.ListConstruct %int1_9005, %int4608_9006, %int3072_9007 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7120 = torch.aten.view %7118, %7119 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Gate: multiply by %7015, the third modulation chunk ([1,1,3072], broadcast
    // over the 4608 axis).
    %7121 = torch.aten.mul.Tensor %7015, %7120 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    // Residual: add the gated result to the block input %6997 (alpha 1).
    // %7122 is the output of single_blocks.7.
    %int1_9008 = torch.constant.int 1
    %7122 = torch.aten.add.Tensor %6997, %7121, %int1_9008 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // ---- single_blocks.8: modulation ----
    // Applies SiLU to %119 ([1,3072] f16; defined outside this chunk, presumably
    // the conditioning vector -- TODO confirm), then the linear projection
    // modulation.lin (3072 -> 9216), and splits the result into three
    // [1,1,3072] chunks along the last dimension.
    %7123 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.8.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.8.modulation.lin.weight : tensor<9216x3072xf16>
    %7124 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.8.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_9009 = torch.constant.int 0
    %int1_9010 = torch.constant.int 1
    // Weight is stored [out,in]; transpose to [in,out] for the matmul below.
    %7125 = torch.aten.transpose.int %7124, %int0_9009, %int1_9010 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.8.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.8.modulation.lin.bias : tensor<9216xf16>
    %7126 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.8.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Bias, activation and weight are promoted f16 -> f32 (dtype code 6) so the
    // matmul and bias add run in f32.
    %int6_9011 = torch.constant.int 6
    %7127 = torch.prims.convert_element_type %7126, %int6_9011 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_9012 = torch.constant.int 6
    %7128 = torch.prims.convert_element_type %7123, %int6_9012 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_9013 = torch.constant.int 6
    %7129 = torch.prims.convert_element_type %7125, %int6_9013 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %7130 = torch.aten.mm %7128, %7129 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // The two mul.Scalar ops multiply by the integer constant 1 and the add uses
    // alpha = 1, so together they compute mm + bias.
    %int1_9014 = torch.constant.int 1
    %7131 = torch.aten.mul.Scalar %7130, %int1_9014 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_9015 = torch.constant.int 1
    %7132 = torch.aten.mul.Scalar %7127, %int1_9015 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_9016 = torch.constant.int 1
    %7133 = torch.aten.add.Tensor %7131, %7132, %int1_9016 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    // Cast the result back to f16 (dtype code 5).
    %int5_9017 = torch.constant.int 5
    %7134 = torch.prims.convert_element_type %7133, %int5_9017 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1); the result
    // type equals the input type.
    %int0_9018 = torch.constant.int 0
    %int0_9019 = torch.constant.int 0
    %int9223372036854775807_9020 = torch.constant.int 9223372036854775807
    %int1_9021 = torch.constant.int 1
    %7135 = torch.aten.slice.Tensor %7134, %int0_9018, %int0_9019, %int9223372036854775807_9020, %int1_9021 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a unit dimension at position 1: [1,9216] -> [1,1,9216].
    %int1_9022 = torch.constant.int 1
    %7136 = torch.aten.unsqueeze %7135, %int1_9022 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Another full-range slice, this time on dim 2; shape is unchanged.
    %int2_9023 = torch.constant.int 2
    %int0_9024 = torch.constant.int 0
    %int9223372036854775807_9025 = torch.constant.int 9223372036854775807
    %int1_9026 = torch.constant.int 1
    %7137 = torch.aten.slice.Tensor %7136, %int2_9023, %int0_9024, %int9223372036854775807_9025, %int1_9026 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Chunk 0, elements [0,3072) of the last dim: added as a shift after the
    // layer norm (see %7150).
    %int-1_9027 = torch.constant.int -1
    %int0_9028 = torch.constant.int 0
    %int3072_9029 = torch.constant.int 3072
    %int1_9030 = torch.constant.int 1
    %7138 = torch.aten.slice.Tensor %7137, %int-1_9027, %int0_9028, %int3072_9029, %int1_9030 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1, elements [3072,6144): the scale term (used as 1 + scale, %7141).
    %int-1_9031 = torch.constant.int -1
    %int3072_9032 = torch.constant.int 3072
    %int6144_9033 = torch.constant.int 6144
    %int1_9034 = torch.constant.int 1
    %7139 = torch.aten.slice.Tensor %7137, %int-1_9031, %int3072_9032, %int6144_9033, %int1_9034 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2, elements [6144,9216): multiplies the block output before the
    // residual add (see %7246).
    %int-1_9035 = torch.constant.int -1
    %int6144_9036 = torch.constant.int 6144
    %int9216_9037 = torch.constant.int 9216
    %int1_9038 = torch.constant.int 1
    %7140 = torch.aten.slice.Tensor %7137, %int-1_9035, %int6144_9036, %int9216_9037, %int1_9038 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %7141 = chunk 1 + 1 (scalar 1 with alpha 1).
    %int1_9039 = torch.constant.int 1
    %int1_9040 = torch.constant.int 1
    %7141 = torch.aten.add.Scalar %7139, %int1_9039, %int1_9040 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // ---- single_blocks.8: layer norm over the last dim, then modulate ----
    // Computes (x - mean) * rsqrt(var + eps) on %7122 with no learned affine
    // parameters, then applies (1 + scale) * x_norm + shift using %7141 / %7138.
    // Variance and mean are computed from the f32 copy of %7122.
    %int6_9041 = torch.constant.int 6
    %7142 = torch.prims.convert_element_type %7122, %int6_9041 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_9042 = torch.constant.int 2
    %7143 = torch.prim.ListConstruct %int2_9042 : (!torch.int) -> !torch.list<int>
    // correction = 0 (no Bessel correction), keepdim = true;
    // result0 is the variance, result1 the mean, both [1,4608,1].
    %int0_9043 = torch.constant.int 0
    %true_9044 = torch.constant.bool true
    %result0_9045, %result1_9046 = torch.aten.var_mean.correction %7142, %7143, %int0_9043, %true_9044 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // eps ~= 1e-6 is added to the variance before rsqrt.
    %float9.999990e-07_9047 = torch.constant.float 9.9999999999999995E-7
    %int1_9048 = torch.constant.int 1
    %7144 = torch.aten.add.Scalar %result0_9045, %float9.999990e-07_9047, %int1_9048 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %7145 = torch.aten.rsqrt %7144 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the f16 tensor %7122 and the f32 mean; the result
    // type is f32.
    %int1_9049 = torch.constant.int 1
    %7146 = torch.aten.sub.Tensor %7122, %result1_9046, %int1_9049 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %7147 = torch.aten.mul.Tensor %7146, %7145 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    // Back to f16 before applying the modulation terms.
    %int5_9050 = torch.constant.int 5
    %7148 = torch.prims.convert_element_type %7147, %int5_9050 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // (1 + scale) * x_norm, broadcasting [1,1,3072] over the 4608 rows.
    %7149 = torch.aten.mul.Tensor %7141, %7148 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    // ... + shift.
    %int1_9051 = torch.constant.int 1
    %7150 = torch.aten.add.Tensor %7149, %7138, %int1_9051 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // ---- single_blocks.8: linear1 (3072 -> 21504) ----
    // Flattens the modulated input to 2-D, computes x @ W^T + b in f32, casts
    // back to f16 and restores the leading unit dimension.
    %int4608_9052 = torch.constant.int 4608
    %int3072_9053 = torch.constant.int 3072
    %7151 = torch.prim.ListConstruct %int4608_9052, %int3072_9053 : (!torch.int, !torch.int) -> !torch.list<int>
    %7152 = torch.aten.view %7150, %7151 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    %__auto.sampler.single_blocks.8.linear1.weight = util.global.load @__auto.sampler.single_blocks.8.linear1.weight : tensor<21504x3072xf16>
    %7153 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.8.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_9054 = torch.constant.int 0
    %int1_9055 = torch.constant.int 1
    // Weight is stored [out,in]; transpose to [in,out] for the matmul.
    %7154 = torch.aten.transpose.int %7153, %int0_9054, %int1_9055 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.8.linear1.bias = util.global.load @__auto.sampler.single_blocks.8.linear1.bias : tensor<21504xf16>
    %7155 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.8.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Promote bias, input and weight to f32 (dtype code 6).
    %int6_9056 = torch.constant.int 6
    %7156 = torch.prims.convert_element_type %7155, %int6_9056 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_9057 = torch.constant.int 6
    %7157 = torch.prims.convert_element_type %7152, %int6_9057 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_9058 = torch.constant.int 6
    %7158 = torch.prims.convert_element_type %7154, %int6_9058 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %7159 = torch.aten.mm %7157, %7158 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Multiplications by the constant 1 and an add with alpha = 1: mm + bias.
    %int1_9059 = torch.constant.int 1
    %7160 = torch.aten.mul.Scalar %7159, %int1_9059 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_9060 = torch.constant.int 1
    %7161 = torch.aten.mul.Scalar %7156, %int1_9060 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_9061 = torch.constant.int 1
    %7162 = torch.aten.add.Tensor %7160, %7161, %int1_9061 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    // Cast back to f16 (dtype code 5) and reshape to [1,4608,21504].
    %int5_9062 = torch.constant.int 5
    %7163 = torch.prims.convert_element_type %7162, %int5_9062 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_9063 = torch.constant.int 1
    %int4608_9064 = torch.constant.int 4608
    %int21504_9065 = torch.constant.int 21504
    %7164 = torch.prim.ListConstruct %int1_9063, %int4608_9064, %int21504_9065 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7165 = torch.aten.view %7163, %7164 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // ---- single_blocks.8: split linear1 output into qkv and MLP parts ----
    // Elements [0,9216) of the last dim feed the attention path (%7166);
    // elements [9216,21504) (12288 wide) feed the GELU path (%7167).
    %int-1_9066 = torch.constant.int -1
    %int0_9067 = torch.constant.int 0
    %int9216_9068 = torch.constant.int 9216
    %int1_9069 = torch.constant.int 1
    %7166 = torch.aten.slice.Tensor %7165, %int-1_9066, %int0_9067, %int9216_9068, %int1_9069 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_9070 = torch.constant.int -1
    %int9216_9071 = torch.constant.int 9216
    %int21504_9072 = torch.constant.int 21504
    %int1_9073 = torch.constant.int 1
    %7167 = torch.aten.slice.Tensor %7165, %int-1_9070, %int9216_9071, %int21504_9072, %int1_9073 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View the 9216-wide part as [1, 4608, 3, 24, 128]
    // (3 = q/k/v, 24 heads, 128 per head; 3 * 24 * 128 = 9216).
    %int1_9074 = torch.constant.int 1
    %int4608_9075 = torch.constant.int 4608
    %int3_9076 = torch.constant.int 3
    %int24_9077 = torch.constant.int 24
    %int128_9078 = torch.constant.int 128
    %7168 = torch.prim.ListConstruct %int1_9074, %int4608_9075, %int3_9076, %int24_9077, %int128_9078 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7169 = torch.aten.view %7166, %7168 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2,0,3,1,4) -> [3, 1, 24, 4608, 128].
    %int2_9079 = torch.constant.int 2
    %int0_9080 = torch.constant.int 0
    %int3_9081 = torch.constant.int 3
    %int1_9082 = torch.constant.int 1
    %int4_9083 = torch.constant.int 4
    %7170 = torch.prim.ListConstruct %int2_9079, %int0_9080, %int3_9081, %int1_9082, %int4_9083 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7171 = torch.aten.permute %7169, %7170 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0 of dim 0: query (normalized with query_norm.scale below).
    %int0_9084 = torch.constant.int 0
    %int0_9085 = torch.constant.int 0
    %7172 = torch.aten.select.int %7171, %int0_9084, %int0_9085 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 1 of dim 0: key (normalized with key_norm.scale below).
    %int0_9086 = torch.constant.int 0
    %int1_9087 = torch.constant.int 1
    %7173 = torch.aten.select.int %7171, %int0_9086, %int1_9087 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 2 of dim 0: value (passed unmodified as the third attention operand).
    %int0_9088 = torch.constant.int 0
    %int2_9089 = torch.constant.int 2
    %7174 = torch.aten.select.int %7171, %int0_9088, %int2_9089 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- single_blocks.8: RMS normalization of query and key ----
    // For each of q (%7172) and k (%7173): x * rsqrt(mean(x^2, dim=-1) + eps) is
    // computed in f32, cast to f16, then multiplied by a learned [128] scale.
    // Query path.
    %int6_9090 = torch.constant.int 6
    %7175 = torch.prims.convert_element_type %7172, %int6_9090 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_9091 = torch.constant.int 2
    %7176 = torch.aten.pow.Tensor_Scalar %7175, %int2_9091 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean of squares over the last dim, keepdim = true.
    %int-1_9092 = torch.constant.int -1
    %7177 = torch.prim.ListConstruct %int-1_9092 : (!torch.int) -> !torch.list<int>
    %true_9093 = torch.constant.bool true
    %none_9094 = torch.constant.none
    %7178 = torch.aten.mean.dim %7176, %7177, %true_9093, %none_9094 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // eps ~= 1e-6.
    %float9.999990e-07_9095 = torch.constant.float 9.9999999999999995E-7
    %int1_9096 = torch.constant.int 1
    %7179 = torch.aten.add.Scalar %7178, %float9.999990e-07_9095, %int1_9096 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %7180 = torch.aten.rsqrt %7179 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %7181 = torch.aten.mul.Tensor %7175, %7180 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_9097 = torch.constant.int 5
    %7182 = torch.prims.convert_element_type %7181, %int5_9097 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.8.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.8.norm.query_norm.scale : tensor<128xf16>
    %7183 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.8.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    // Normalized query times its per-channel scale.
    %7184 = torch.aten.mul.Tensor %7182, %7183 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Key path: same sequence of ops, using key_norm.scale.
    %int6_9098 = torch.constant.int 6
    %7185 = torch.prims.convert_element_type %7173, %int6_9098 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_9099 = torch.constant.int 2
    %7186 = torch.aten.pow.Tensor_Scalar %7185, %int2_9099 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_9100 = torch.constant.int -1
    %7187 = torch.prim.ListConstruct %int-1_9100 : (!torch.int) -> !torch.list<int>
    %true_9101 = torch.constant.bool true
    %none_9102 = torch.constant.none
    %7188 = torch.aten.mean.dim %7186, %7187, %true_9101, %none_9102 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_9103 = torch.constant.float 9.9999999999999995E-7
    %int1_9104 = torch.constant.int 1
    %7189 = torch.aten.add.Scalar %7188, %float9.999990e-07_9103, %int1_9104 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %7190 = torch.aten.rsqrt %7189 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %7191 = torch.aten.mul.Tensor %7185, %7190 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_9105 = torch.constant.int 5
    %7192 = torch.prims.convert_element_type %7191, %int5_9105 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.8.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.8.norm.key_norm.scale : tensor<128xf16>
    %7193 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.8.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    // Normalized key times its per-channel scale.
    %7194 = torch.aten.mul.Tensor %7192, %7193 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- single_blocks.8: combine q and k with the table %211 ----
    // %211 ([1,1,4608,64,2,2] f32) is defined outside this chunk; presumably the
    // rotary position-embedding table -- TODO confirm against its definition.
    // For x in {q, k}, with x viewed as [1,24,4608,64,1,2]:
    //   out = %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]
    // giving [1,24,4608,64,2], which is flattened back to [1,24,4608,128] f16.
    // These two casts have f16 as both source and result type (dtype code 5),
    // so they do not change the element type.
    %int5_9106 = torch.constant.int 5
    %7195 = torch.prims.convert_element_type %7184, %int5_9106 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_9107 = torch.constant.int 5
    %7196 = torch.prims.convert_element_type %7194, %int5_9107 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // q -> f32, viewed as 64 pairs: [1,24,4608,64,1,2].
    %int6_9108 = torch.constant.int 6
    %7197 = torch.prims.convert_element_type %7195, %int6_9108 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_9109 = torch.constant.int 1
    %int24_9110 = torch.constant.int 24
    %int4608_9111 = torch.constant.int 4608
    %int64_9112 = torch.constant.int 64
    %int1_9113 = torch.constant.int 1
    %int2_9114 = torch.constant.int 2
    %7198 = torch.prim.ListConstruct %int1_9109, %int24_9110, %int4608_9111, %int64_9112, %int1_9113, %int2_9114 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7199 = torch.aten.view %7197, %7198 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // k -> f32, same pairwise view.
    %int6_9115 = torch.constant.int 6
    %7200 = torch.prims.convert_element_type %7196, %int6_9115 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_9116 = torch.constant.int 1
    %int24_9117 = torch.constant.int 24
    %int4608_9118 = torch.constant.int 4608
    %int64_9119 = torch.constant.int 64
    %int1_9120 = torch.constant.int 1
    %int2_9121 = torch.constant.int 2
    %7201 = torch.prim.ListConstruct %int1_9116, %int24_9117, %int4608_9118, %int64_9119, %int1_9120, %int2_9121 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7202 = torch.aten.view %7200, %7201 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // q, first term: %211[..., 0] * q[..., 0].
    %int5_9122 = torch.constant.int 5
    %int0_9123 = torch.constant.int 0
    %7203 = torch.aten.select.int %211, %int5_9122, %int0_9123 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9124 = torch.constant.int 5
    %int0_9125 = torch.constant.int 0
    %7204 = torch.aten.select.int %7199, %int5_9124, %int0_9125 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7205 = torch.aten.mul.Tensor %7203, %7204 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // q, second term: %211[..., 1] * q[..., 1].
    %int5_9126 = torch.constant.int 5
    %int1_9127 = torch.constant.int 1
    %7206 = torch.aten.select.int %211, %int5_9126, %int1_9127 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9128 = torch.constant.int 5
    %int1_9129 = torch.constant.int 1
    %7207 = torch.aten.select.int %7199, %int5_9128, %int1_9129 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7208 = torch.aten.mul.Tensor %7206, %7207 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // q result: sum of the two terms.
    %int1_9130 = torch.constant.int 1
    %7209 = torch.aten.add.Tensor %7205, %7208, %int1_9130 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k, first term: %211[..., 0] * k[..., 0].
    %int5_9131 = torch.constant.int 5
    %int0_9132 = torch.constant.int 0
    %7210 = torch.aten.select.int %211, %int5_9131, %int0_9132 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9133 = torch.constant.int 5
    %int0_9134 = torch.constant.int 0
    %7211 = torch.aten.select.int %7202, %int5_9133, %int0_9134 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7212 = torch.aten.mul.Tensor %7210, %7211 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k, second term: %211[..., 1] * k[..., 1].
    %int5_9135 = torch.constant.int 5
    %int1_9136 = torch.constant.int 1
    %7213 = torch.aten.select.int %211, %int5_9135, %int1_9136 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9137 = torch.constant.int 5
    %int1_9138 = torch.constant.int 1
    %7214 = torch.aten.select.int %7202, %int5_9137, %int1_9138 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7215 = torch.aten.mul.Tensor %7213, %7214 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k result: sum of the two terms.
    %int1_9139 = torch.constant.int 1
    %7216 = torch.aten.add.Tensor %7212, %7215, %int1_9139 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Flatten q back to [1,24,4608,128] and cast to f16.
    %int1_9140 = torch.constant.int 1
    %int24_9141 = torch.constant.int 24
    %int4608_9142 = torch.constant.int 4608
    %int128_9143 = torch.constant.int 128
    %7217 = torch.prim.ListConstruct %int1_9140, %int24_9141, %int4608_9142, %int128_9143 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7218 = torch.aten.view %7209, %7217 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_9144 = torch.constant.int 5
    %7219 = torch.prims.convert_element_type %7218, %int5_9144 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Flatten k back to [1,24,4608,128] and cast to f16.
    %int1_9145 = torch.constant.int 1
    %int24_9146 = torch.constant.int 24
    %int4608_9147 = torch.constant.int 4608
    %int128_9148 = torch.constant.int 128
    %7220 = torch.prim.ListConstruct %int1_9145, %int24_9146, %int4608_9147, %int128_9148 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7221 = torch.aten.view %7216, %7220 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_9149 = torch.constant.int 5
    %7222 = torch.prims.convert_element_type %7221, %int5_9149 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- single_blocks.8: scaled dot-product attention and head merge ----
    // Operands are q = %7219, k = %7222, v = %7174, followed by 0.0, false,
    // none, none. NOTE(review): per the ATen signature these should be
    // dropout_p, is_causal, attn_mask and scale -- confirm against the op schema.
    %float0.000000e00_9150 = torch.constant.float 0.000000e+00
    %false_9151 = torch.constant.bool false
    %none_9152 = torch.constant.none
    %none_9153 = torch.constant.none
    // Result #0 is the attention output; result #1 ([1,24,4608] f32) is not
    // used anywhere within this chunk.
    %7223:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%7219, %7222, %7174, %float0.000000e00_9150, %false_9151, %none_9152, %none_9153) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2: [1,24,4608,128] -> [1,4608,24,128].
    %int0_9154 = torch.constant.int 0
    %int2_9155 = torch.constant.int 2
    %int1_9156 = torch.constant.int 1
    %int3_9157 = torch.constant.int 3
    %7224 = torch.prim.ListConstruct %int0_9154, %int2_9155, %int1_9156, %int3_9157 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7225 = torch.aten.permute %7223#0, %7224 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the 24 x 128 head dims into 3072.
    %int1_9158 = torch.constant.int 1
    %int4608_9159 = torch.constant.int 4608
    %int3072_9160 = torch.constant.int 3072
    %7226 = torch.prim.ListConstruct %int1_9158, %int4608_9159, %int3072_9160 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7227 = torch.aten.view %7225, %7226 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // ---- single_blocks.8: GELU branch, concat, linear2, gated residual ----
    // GELU with the "tanh" approximation on the 12288-wide part of linear1.
    %str_9161 = torch.constant.str "tanh"
    %7228 = torch.aten.gelu %7167, %str_9161 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    // Concatenate attention output (3072) and GELU output (12288) on dim 2,
    // giving 15360 features.
    %7229 = torch.prim.ListConstruct %7227, %7228 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_9162 = torch.constant.int 2
    %7230 = torch.aten.cat %7229, %int2_9162 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Flatten to 2-D for the matmul.
    %int4608_9163 = torch.constant.int 4608
    %int15360_9164 = torch.constant.int 15360
    %7231 = torch.prim.ListConstruct %int4608_9163, %int15360_9164 : (!torch.int, !torch.int) -> !torch.list<int>
    %7232 = torch.aten.view %7230, %7231 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // linear2 (15360 -> 3072): x @ W^T + b computed in f32.
    %__auto.sampler.single_blocks.8.linear2.weight = util.global.load @__auto.sampler.single_blocks.8.linear2.weight : tensor<3072x15360xf16>
    %7233 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.8.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_9165 = torch.constant.int 0
    %int1_9166 = torch.constant.int 1
    %7234 = torch.aten.transpose.int %7233, %int0_9165, %int1_9166 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.8.linear2.bias = util.global.load @__auto.sampler.single_blocks.8.linear2.bias : tensor<3072xf16>
    %7235 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.8.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Promote bias, input and weight to f32 (dtype code 6).
    %int6_9167 = torch.constant.int 6
    %7236 = torch.prims.convert_element_type %7235, %int6_9167 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_9168 = torch.constant.int 6
    %7237 = torch.prims.convert_element_type %7232, %int6_9168 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_9169 = torch.constant.int 6
    %7238 = torch.prims.convert_element_type %7234, %int6_9169 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %7239 = torch.aten.mm %7237, %7238 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Multiplications by the constant 1 and an add with alpha = 1: mm + bias.
    %int1_9170 = torch.constant.int 1
    %7240 = torch.aten.mul.Scalar %7239, %int1_9170 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_9171 = torch.constant.int 1
    %7241 = torch.aten.mul.Scalar %7236, %int1_9171 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_9172 = torch.constant.int 1
    %7242 = torch.aten.add.Tensor %7240, %7241, %int1_9172 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    // Cast back to f16 (dtype code 5) and restore the leading unit dimension.
    %int5_9173 = torch.constant.int 5
    %7243 = torch.prims.convert_element_type %7242, %int5_9173 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_9174 = torch.constant.int 1
    %int4608_9175 = torch.constant.int 4608
    %int3072_9176 = torch.constant.int 3072
    %7244 = torch.prim.ListConstruct %int1_9174, %int4608_9175, %int3072_9176 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7245 = torch.aten.view %7243, %7244 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Gated residual: %7247 = %7122 + %7140 * linear2_out, where %7140 is the
    // third modulation chunk and %7122 is the tensor this block normalized.
    %7246 = torch.aten.mul.Tensor %7140, %7245 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_9177 = torch.constant.int 1
    %7247 = torch.aten.add.Tensor %7122, %7246, %int1_9177 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // ---- single_blocks.9: modulation ----
    // Same op sequence as the single_blocks.8 modulation, with the
    // single_blocks.9 weights: SiLU(%119) -> linear (3072 -> 9216) -> three
    // [1,1,3072] chunks. %119 is defined outside this chunk.
    %7248 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.9.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.9.modulation.lin.weight : tensor<9216x3072xf16>
    %7249 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.9.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_9178 = torch.constant.int 0
    %int1_9179 = torch.constant.int 1
    // Weight is stored [out,in]; transpose to [in,out] for the matmul below.
    %7250 = torch.aten.transpose.int %7249, %int0_9178, %int1_9179 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.9.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.9.modulation.lin.bias : tensor<9216xf16>
    %7251 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.9.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Promote bias, activation and weight to f32 (dtype code 6).
    %int6_9180 = torch.constant.int 6
    %7252 = torch.prims.convert_element_type %7251, %int6_9180 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_9181 = torch.constant.int 6
    %7253 = torch.prims.convert_element_type %7248, %int6_9181 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_9182 = torch.constant.int 6
    %7254 = torch.prims.convert_element_type %7250, %int6_9182 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %7255 = torch.aten.mm %7253, %7254 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Multiplications by the constant 1 and an add with alpha = 1: mm + bias.
    %int1_9183 = torch.constant.int 1
    %7256 = torch.aten.mul.Scalar %7255, %int1_9183 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_9184 = torch.constant.int 1
    %7257 = torch.aten.mul.Scalar %7252, %int1_9184 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_9185 = torch.constant.int 1
    %7258 = torch.aten.add.Tensor %7256, %7257, %int1_9185 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    // Cast the result back to f16 (dtype code 5).
    %int5_9186 = torch.constant.int 5
    %7259 = torch.prims.convert_element_type %7258, %int5_9186 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1); shape unchanged.
    %int0_9187 = torch.constant.int 0
    %int0_9188 = torch.constant.int 0
    %int9223372036854775807_9189 = torch.constant.int 9223372036854775807
    %int1_9190 = torch.constant.int 1
    %7260 = torch.aten.slice.Tensor %7259, %int0_9187, %int0_9188, %int9223372036854775807_9189, %int1_9190 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a unit dimension at position 1: [1,9216] -> [1,1,9216].
    %int1_9191 = torch.constant.int 1
    %7261 = torch.aten.unsqueeze %7260, %int1_9191 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Full-range slice on dim 2; shape unchanged.
    %int2_9192 = torch.constant.int 2
    %int0_9193 = torch.constant.int 0
    %int9223372036854775807_9194 = torch.constant.int 9223372036854775807
    %int1_9195 = torch.constant.int 1
    %7262 = torch.aten.slice.Tensor %7261, %int2_9192, %int0_9193, %int9223372036854775807_9194, %int1_9195 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Chunk 0, elements [0,3072): added as a shift after the layer norm (%7275).
    %int-1_9196 = torch.constant.int -1
    %int0_9197 = torch.constant.int 0
    %int3072_9198 = torch.constant.int 3072
    %int1_9199 = torch.constant.int 1
    %7263 = torch.aten.slice.Tensor %7262, %int-1_9196, %int0_9197, %int3072_9198, %int1_9199 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1, elements [3072,6144): the scale term (used as 1 + scale, %7266).
    %int-1_9200 = torch.constant.int -1
    %int3072_9201 = torch.constant.int 3072
    %int6144_9202 = torch.constant.int 6144
    %int1_9203 = torch.constant.int 1
    %7264 = torch.aten.slice.Tensor %7262, %int-1_9200, %int3072_9201, %int6144_9202, %int1_9203 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2, elements [6144,9216): its use lies beyond the end of this chunk
    // (presumably the residual gate, by analogy with single_blocks.8 -- confirm).
    %int-1_9204 = torch.constant.int -1
    %int6144_9205 = torch.constant.int 6144
    %int9216_9206 = torch.constant.int 9216
    %int1_9207 = torch.constant.int 1
    %7265 = torch.aten.slice.Tensor %7262, %int-1_9204, %int6144_9205, %int9216_9206, %int1_9207 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %7266 = chunk 1 + 1 (scalar 1 with alpha 1).
    %int1_9208 = torch.constant.int 1
    %int1_9209 = torch.constant.int 1
    %7266 = torch.aten.add.Scalar %7264, %int1_9208, %int1_9209 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %7247 ([1,4608,3072] f16, defined before this chunk) over its last
    // dimension in f32: (x - mean) * rsqrt(var + eps). No learned weight or bias is
    // applied in this normalization. The result is cast back to f16, multiplied by
    // %7266, offset by %7263, and flattened to [4608,3072] for the following matmul.
    %int6_9210 = torch.constant.int 6
    // dtype code 6 yields f32 here (see result type).
    %7267 = torch.prims.convert_element_type %7247, %int6_9210 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_9211 = torch.constant.int 2
    %7268 = torch.prim.ListConstruct %int2_9211 : (!torch.int) -> !torch.list<int>
    %int0_9212 = torch.constant.int 0
    %true_9213 = torch.constant.bool true
    // Variance (%result0_9214) and mean (%result1_9215) over dim 2 with correction 0;
    // the reduced dimension is kept as size 1.
    %result0_9214, %result1_9215 = torch.aten.var_mean.correction %7267, %7268, %int0_9212, %true_9213 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // Epsilon (about 1e-6) added to the variance before the reciprocal square root.
    %float9.999990e-07_9216 = torch.constant.float 9.9999999999999995E-7
    %int1_9217 = torch.constant.int 1
    %7269 = torch.aten.add.Scalar %result0_9214, %float9.999990e-07_9216, %int1_9217 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %7270 = torch.aten.rsqrt %7269 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    %int1_9218 = torch.constant.int 1
    // The subtraction takes the f16 tensor %7247 and the f32 mean; the result is f32.
    %7271 = torch.aten.sub.Tensor %7247, %result1_9215, %int1_9218 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %7272 = torch.aten.mul.Tensor %7271, %7270 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_9219 = torch.constant.int 5
    // dtype code 5 yields f16 here (see result type).
    %7273 = torch.prims.convert_element_type %7272, %int5_9219 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Broadcast multiply by %7266 ([1,1,3072]) and broadcast add of %7263 ([1,1,3072]).
    %7274 = torch.aten.mul.Tensor %7266, %7273 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_9220 = torch.constant.int 1
    %7275 = torch.aten.add.Tensor %7274, %7263, %int1_9220 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int4608_9221 = torch.constant.int 4608
    %int3072_9222 = torch.constant.int 3072
    %7276 = torch.prim.ListConstruct %int4608_9221, %int3072_9222 : (!torch.int, !torch.int) -> !torch.list<int>
    // Drop the leading size-1 dimension: [1,4608,3072] -> [4608,3072].
    %7277 = torch.aten.view %7275, %7276 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // single_blocks.9.linear1: %7277 [4608,3072] x weight^T [3072,21504] + bias [21504].
    // Inputs, weight and bias are all upcast to f32, the matmul and bias add run in f32,
    // and the result is cast back to f16 and reshaped to [1,4608,21504].
    %__auto.sampler.single_blocks.9.linear1.weight = util.global.load @__auto.sampler.single_blocks.9.linear1.weight : tensor<21504x3072xf16>
    %7278 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.9.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_9223 = torch.constant.int 0
    %int1_9224 = torch.constant.int 1
    // Transpose the stored [21504,3072] weight to [3072,21504].
    %7279 = torch.aten.transpose.int %7278, %int0_9223, %int1_9224 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.9.linear1.bias = util.global.load @__auto.sampler.single_blocks.9.linear1.bias : tensor<21504xf16>
    %7280 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.9.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    %int6_9225 = torch.constant.int 6
    %7281 = torch.prims.convert_element_type %7280, %int6_9225 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_9226 = torch.constant.int 6
    %7282 = torch.prims.convert_element_type %7277, %int6_9226 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_9227 = torch.constant.int 6
    %7283 = torch.prims.convert_element_type %7279, %int6_9227 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %7284 = torch.aten.mm %7282, %7283 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Both the matmul result and the bias are multiplied by the integer constant 1
    // before the add, so these two ops leave the values unchanged.
    %int1_9228 = torch.constant.int 1
    %7285 = torch.aten.mul.Scalar %7284, %int1_9228 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_9229 = torch.constant.int 1
    %7286 = torch.aten.mul.Scalar %7281, %int1_9229 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_9230 = torch.constant.int 1
    %7287 = torch.aten.add.Tensor %7285, %7286, %int1_9230 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_9231 = torch.constant.int 5
    %7288 = torch.prims.convert_element_type %7287, %int5_9231 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_9232 = torch.constant.int 1
    %int4608_9233 = torch.constant.int 4608
    %int21504_9234 = torch.constant.int 21504
    %7289 = torch.prim.ListConstruct %int1_9232, %int4608_9233, %int21504_9234 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // Restore the leading size-1 dimension: [4608,21504] -> [1,4608,21504].
    %7290 = torch.aten.view %7288, %7289 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the linear1 output along the last dimension into
    //   %7291 = columns [0, 9216)     -> reshaped into three [1,24,4608,128] tensors
    //   %7292 = columns [9216, 21504) -> [1,4608,12288], consumed by the GELU further down.
    // %7297 / %7298 / %7299 are indices 0 / 1 / 2 of the size-3 axis; they are used below
    // as the first, second and third operands of the attention op (after further
    // processing of the first two).
    %int-1_9235 = torch.constant.int -1
    %int0_9236 = torch.constant.int 0
    %int9216_9237 = torch.constant.int 9216
    %int1_9238 = torch.constant.int 1
    %7291 = torch.aten.slice.Tensor %7290, %int-1_9235, %int0_9236, %int9216_9237, %int1_9238 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_9239 = torch.constant.int -1
    %int9216_9240 = torch.constant.int 9216
    %int21504_9241 = torch.constant.int 21504
    %int1_9242 = torch.constant.int 1
    %7292 = torch.aten.slice.Tensor %7290, %int-1_9239, %int9216_9240, %int21504_9241, %int1_9242 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    %int1_9243 = torch.constant.int 1
    %int4608_9244 = torch.constant.int 4608
    %int3_9245 = torch.constant.int 3
    %int24_9246 = torch.constant.int 24
    %int128_9247 = torch.constant.int 128
    %7293 = torch.prim.ListConstruct %int1_9243, %int4608_9244, %int3_9245, %int24_9246, %int128_9247 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // 9216 = 3 * 24 * 128: view the last dimension as [3, 24, 128].
    %7294 = torch.aten.view %7291, %7293 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    %int2_9248 = torch.constant.int 2
    %int0_9249 = torch.constant.int 0
    %int3_9250 = torch.constant.int 3
    %int1_9251 = torch.constant.int 1
    %int4_9252 = torch.constant.int 4
    %7295 = torch.prim.ListConstruct %int2_9248, %int0_9249, %int3_9250, %int1_9251, %int4_9252 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // Permute with order (2,0,3,1,4): [1,4608,3,24,128] -> [3,1,24,4608,128].
    %7296 = torch.aten.permute %7294, %7295 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    %int0_9253 = torch.constant.int 0
    %int0_9254 = torch.constant.int 0
    // Index 0 of dim 0.
    %7297 = torch.aten.select.int %7296, %int0_9253, %int0_9254 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_9255 = torch.constant.int 0
    %int1_9256 = torch.constant.int 1
    // Index 1 of dim 0.
    %7298 = torch.aten.select.int %7296, %int0_9255, %int1_9256 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_9257 = torch.constant.int 0
    %int2_9258 = torch.constant.int 2
    // Index 2 of dim 0.
    %7299 = torch.aten.select.int %7296, %int0_9257, %int2_9258 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Root-mean-square normalization of %7297 over its last dimension (size 128), in f32:
    //   x * rsqrt(mean(x^2, dim=-1) + eps)
    // followed by a cast to f16 and an elementwise multiply with the learned
    // [128] vector single_blocks.9.norm.query_norm.scale.
    %int6_9259 = torch.constant.int 6
    %7300 = torch.prims.convert_element_type %7297, %int6_9259 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_9260 = torch.constant.int 2
    // Elementwise square.
    %7301 = torch.aten.pow.Tensor_Scalar %7300, %int2_9260 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_9261 = torch.constant.int -1
    %7302 = torch.prim.ListConstruct %int-1_9261 : (!torch.int) -> !torch.list<int>
    %true_9262 = torch.constant.bool true
    %none_9263 = torch.constant.none
    // Mean over the last dimension; the reduced dimension is kept as size 1.
    %7303 = torch.aten.mean.dim %7301, %7302, %true_9262, %none_9263 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // Epsilon (about 1e-6).
    %float9.999990e-07_9264 = torch.constant.float 9.9999999999999995E-7
    %int1_9265 = torch.constant.int 1
    %7304 = torch.aten.add.Scalar %7303, %float9.999990e-07_9264, %int1_9265 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %7305 = torch.aten.rsqrt %7304 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %7306 = torch.aten.mul.Tensor %7300, %7305 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_9266 = torch.constant.int 5
    %7307 = torch.prims.convert_element_type %7306, %int5_9266 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.9.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.9.norm.query_norm.scale : tensor<128xf16>
    %7308 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.9.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    // The [128] scale broadcasts over the last dimension.
    %7309 = torch.aten.mul.Tensor %7307, %7308 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Same computation as the preceding query_norm sequence, applied to %7298:
    //   x * rsqrt(mean(x^2, dim=-1) + eps) in f32, cast to f16, then multiplied by
    // the learned [128] vector single_blocks.9.norm.key_norm.scale.
    %int6_9267 = torch.constant.int 6
    %7310 = torch.prims.convert_element_type %7298, %int6_9267 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_9268 = torch.constant.int 2
    // Elementwise square.
    %7311 = torch.aten.pow.Tensor_Scalar %7310, %int2_9268 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_9269 = torch.constant.int -1
    %7312 = torch.prim.ListConstruct %int-1_9269 : (!torch.int) -> !torch.list<int>
    %true_9270 = torch.constant.bool true
    %none_9271 = torch.constant.none
    // Mean over the last dimension; the reduced dimension is kept as size 1.
    %7313 = torch.aten.mean.dim %7311, %7312, %true_9270, %none_9271 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // Epsilon (about 1e-6).
    %float9.999990e-07_9272 = torch.constant.float 9.9999999999999995E-7
    %int1_9273 = torch.constant.int 1
    %7314 = torch.aten.add.Scalar %7313, %float9.999990e-07_9272, %int1_9273 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %7315 = torch.aten.rsqrt %7314 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %7316 = torch.aten.mul.Tensor %7310, %7315 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_9274 = torch.constant.int 5
    %7317 = torch.prims.convert_element_type %7316, %int5_9274 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.9.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.9.norm.key_norm.scale : tensor<128xf16>
    %7318 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.9.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    // The [128] scale broadcasts over the last dimension.
    %7319 = torch.aten.mul.Tensor %7317, %7318 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Prepare the two normalized tensors (%7309 and %7319) for the pairwise combination
    // with %211 below: cast to f32 and view the last dimension of 128 as [64, 1, 2].
    // The first two casts are f16 -> f16 (operand and result types are identical).
    %int5_9275 = torch.constant.int 5
    %7320 = torch.prims.convert_element_type %7309, %int5_9275 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_9276 = torch.constant.int 5
    %7321 = torch.prims.convert_element_type %7319, %int5_9276 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int6_9277 = torch.constant.int 6
    %7322 = torch.prims.convert_element_type %7320, %int6_9277 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_9278 = torch.constant.int 1
    %int24_9279 = torch.constant.int 24
    %int4608_9280 = torch.constant.int 4608
    %int64_9281 = torch.constant.int 64
    %int1_9282 = torch.constant.int 1
    %int2_9283 = torch.constant.int 2
    %7323 = torch.prim.ListConstruct %int1_9278, %int24_9279, %int4608_9280, %int64_9281, %int1_9282, %int2_9283 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // [1,24,4608,128] -> [1,24,4608,64,1,2] for the tensor derived from %7309.
    %7324 = torch.aten.view %7322, %7323 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    %int6_9284 = torch.constant.int 6
    %7325 = torch.prims.convert_element_type %7321, %int6_9284 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_9285 = torch.constant.int 1
    %int24_9286 = torch.constant.int 24
    %int4608_9287 = torch.constant.int 4608
    %int64_9288 = torch.constant.int 64
    %int1_9289 = torch.constant.int 1
    %int2_9290 = torch.constant.int 2
    %7326 = torch.prim.ListConstruct %int1_9285, %int24_9286, %int4608_9287, %int64_9288, %int1_9289, %int2_9290 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // [1,24,4608,128] -> [1,24,4608,64,1,2] for the tensor derived from %7319.
    %7327 = torch.aten.view %7325, %7326 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Combine each of %7324 and %7327 ([1,24,4608,64,1,2]) with %211
    // ([1,1,4608,64,2,2] f32, defined before this chunk):
    //   out = %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]
    // where the index is taken on dim 5. Broadcasting expands the size-1 axis of %211
    // to 24 and the size-1 axis of x to 2, giving [1,24,4608,64,2].
    // NOTE(review): this has the shape of a rotary position embedding applied as a
    // per-pair 2x2 matrix - confirm against the producer of %211.
    %int5_9291 = torch.constant.int 5
    %int0_9292 = torch.constant.int 0
    %7328 = torch.aten.select.int %211, %int5_9291, %int0_9292 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9293 = torch.constant.int 5
    %int0_9294 = torch.constant.int 0
    %7329 = torch.aten.select.int %7324, %int5_9293, %int0_9294 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7330 = torch.aten.mul.Tensor %7328, %7329 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_9295 = torch.constant.int 5
    %int1_9296 = torch.constant.int 1
    %7331 = torch.aten.select.int %211, %int5_9295, %int1_9296 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9297 = torch.constant.int 5
    %int1_9298 = torch.constant.int 1
    %7332 = torch.aten.select.int %7324, %int5_9297, %int1_9298 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7333 = torch.aten.mul.Tensor %7331, %7332 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_9299 = torch.constant.int 1
    // Result for the tensor derived from %7324.
    %7334 = torch.aten.add.Tensor %7330, %7333, %int1_9299 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Same sequence for %7327; the two selects on %211 are recomputed rather than reused.
    %int5_9300 = torch.constant.int 5
    %int0_9301 = torch.constant.int 0
    %7335 = torch.aten.select.int %211, %int5_9300, %int0_9301 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9302 = torch.constant.int 5
    %int0_9303 = torch.constant.int 0
    %7336 = torch.aten.select.int %7327, %int5_9302, %int0_9303 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7337 = torch.aten.mul.Tensor %7335, %7336 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_9304 = torch.constant.int 5
    %int1_9305 = torch.constant.int 1
    %7338 = torch.aten.select.int %211, %int5_9304, %int1_9305 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9306 = torch.constant.int 5
    %int1_9307 = torch.constant.int 1
    %7339 = torch.aten.select.int %7327, %int5_9306, %int1_9307 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7340 = torch.aten.mul.Tensor %7338, %7339 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_9308 = torch.constant.int 1
    // Result for the tensor derived from %7327.
    %7341 = torch.aten.add.Tensor %7337, %7340, %int1_9308 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Collapse the trailing [64,2] axes of %7334 and %7341 back to 128 and cast both
    // results from f32 to f16; %7344 and %7347 feed the attention op that follows.
    %int1_9309 = torch.constant.int 1
    %int24_9310 = torch.constant.int 24
    %int4608_9311 = torch.constant.int 4608
    %int128_9312 = torch.constant.int 128
    %7342 = torch.prim.ListConstruct %int1_9309, %int24_9310, %int4608_9311, %int128_9312 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7343 = torch.aten.view %7334, %7342 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_9313 = torch.constant.int 5
    %7344 = torch.prims.convert_element_type %7343, %int5_9313 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_9314 = torch.constant.int 1
    %int24_9315 = torch.constant.int 24
    %int4608_9316 = torch.constant.int 4608
    %int128_9317 = torch.constant.int 128
    %7345 = torch.prim.ListConstruct %int1_9314, %int24_9315, %int4608_9316, %int128_9317 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7346 = torch.aten.view %7341, %7345 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_9318 = torch.constant.int 5
    %7347 = torch.prims.convert_element_type %7346, %int5_9318 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled dot-product attention over three [1,24,4608,128] f16 operands
    // (%7344, %7347, %7299), emitted as a generic `torch.operator`.
    // The remaining operands are float 0.0, bool false and two `none` values.
    // NOTE(review): in the PyTorch op signature these correspond to dropout_p,
    // is_causal, attn_mask and scale - confirm against the op schema.
    // Only result #0 is used in this chunk; result #1 is a [1,24,4608] f32 tensor.
    %float0.000000e00_9319 = torch.constant.float 0.000000e+00
    %false_9320 = torch.constant.bool false
    %none_9321 = torch.constant.none
    %none_9322 = torch.constant.none
    %7348:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%7344, %7347, %7299, %float0.000000e00_9319, %false_9320, %none_9321, %none_9322) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    %int0_9323 = torch.constant.int 0
    %int2_9324 = torch.constant.int 2
    %int1_9325 = torch.constant.int 1
    %int3_9326 = torch.constant.int 3
    %7349 = torch.prim.ListConstruct %int0_9323, %int2_9324, %int1_9325, %int3_9326 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // Swap dims 1 and 2: [1,24,4608,128] -> [1,4608,24,128].
    %7350 = torch.aten.permute %7348#0, %7349 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_9327 = torch.constant.int 1
    %int4608_9328 = torch.constant.int 4608
    %int3072_9329 = torch.constant.int 3072
    %7351 = torch.prim.ListConstruct %int1_9327, %int4608_9328, %int3072_9329 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // Merge the trailing [24,128] axes: 24 * 128 = 3072.
    %7352 = torch.aten.view %7350, %7351 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Apply GELU (with the "tanh" approximation string) to %7292, the [1,4608,12288]
    // slice of the linear1 output, then concatenate the attention result %7352
    // ([1,4608,3072]) with it along dim 2: 3072 + 12288 = 15360.
    // The concatenation is flattened to [4608,15360] for the following matmul.
    %str_9330 = torch.constant.str "tanh"
    %7353 = torch.aten.gelu %7292, %str_9330 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %7354 = torch.prim.ListConstruct %7352, %7353 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_9331 = torch.constant.int 2
    %7355 = torch.aten.cat %7354, %int2_9331 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    %int4608_9332 = torch.constant.int 4608
    %int15360_9333 = torch.constant.int 15360
    %7356 = torch.prim.ListConstruct %int4608_9332, %int15360_9333 : (!torch.int, !torch.int) -> !torch.list<int>
    %7357 = torch.aten.view %7355, %7356 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // single_blocks.9.linear2: %7357 [4608,15360] x weight^T [15360,3072] + bias [3072].
    // Inputs, weight and bias are all upcast to f32, the matmul and bias add run in f32,
    // and the result is cast back to f16 and reshaped to [1,4608,3072].
    %__auto.sampler.single_blocks.9.linear2.weight = util.global.load @__auto.sampler.single_blocks.9.linear2.weight : tensor<3072x15360xf16>
    %7358 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.9.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_9334 = torch.constant.int 0
    %int1_9335 = torch.constant.int 1
    // Transpose the stored [3072,15360] weight to [15360,3072].
    %7359 = torch.aten.transpose.int %7358, %int0_9334, %int1_9335 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.9.linear2.bias = util.global.load @__auto.sampler.single_blocks.9.linear2.bias : tensor<3072xf16>
    %7360 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.9.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_9336 = torch.constant.int 6
    %7361 = torch.prims.convert_element_type %7360, %int6_9336 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_9337 = torch.constant.int 6
    %7362 = torch.prims.convert_element_type %7357, %int6_9337 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_9338 = torch.constant.int 6
    %7363 = torch.prims.convert_element_type %7359, %int6_9338 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %7364 = torch.aten.mm %7362, %7363 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Both the matmul result and the bias are multiplied by the integer constant 1
    // before the add, so these two ops leave the values unchanged.
    %int1_9339 = torch.constant.int 1
    %7365 = torch.aten.mul.Scalar %7364, %int1_9339 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_9340 = torch.constant.int 1
    %7366 = torch.aten.mul.Scalar %7361, %int1_9340 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_9341 = torch.constant.int 1
    %7367 = torch.aten.add.Tensor %7365, %7366, %int1_9341 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_9342 = torch.constant.int 5
    %7368 = torch.prims.convert_element_type %7367, %int5_9342 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_9343 = torch.constant.int 1
    %int4608_9344 = torch.constant.int 4608
    %int3072_9345 = torch.constant.int 3072
    %7369 = torch.prim.ListConstruct %int1_9343, %int4608_9344, %int3072_9345 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // Restore the leading size-1 dimension: [4608,3072] -> [1,4608,3072].
    %7370 = torch.aten.view %7368, %7369 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Output of this stage: %7372 = %7247 + %7265 * %7370, i.e. the linear2 result
    // scaled by the third modulation piece ([1,1,3072], broadcast over the 4608 rows)
    // and added back onto the tensor that was normalized at the start of the block.
    %7371 = torch.aten.mul.Tensor %7265, %7370 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_9346 = torch.constant.int 1
    %7372 = torch.aten.add.Tensor %7247, %7371, %int1_9346 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.10.modulation.lin: silu(%119) [1,3072] x weight^T [3072,9216] + bias [9216].
    // %119 is defined before this chunk. The matmul and bias add run in f32 and the
    // [1,9216] result is cast back to f16.
    %7373 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.10.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.10.modulation.lin.weight : tensor<9216x3072xf16>
    %7374 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.10.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_9347 = torch.constant.int 0
    %int1_9348 = torch.constant.int 1
    // Transpose the stored [9216,3072] weight to [3072,9216].
    %7375 = torch.aten.transpose.int %7374, %int0_9347, %int1_9348 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.10.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.10.modulation.lin.bias : tensor<9216xf16>
    %7376 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.10.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    %int6_9349 = torch.constant.int 6
    %7377 = torch.prims.convert_element_type %7376, %int6_9349 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_9350 = torch.constant.int 6
    %7378 = torch.prims.convert_element_type %7373, %int6_9350 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_9351 = torch.constant.int 6
    %7379 = torch.prims.convert_element_type %7375, %int6_9351 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %7380 = torch.aten.mm %7378, %7379 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Both the matmul result and the bias are multiplied by the integer constant 1
    // before the add, so these two ops leave the values unchanged.
    %int1_9352 = torch.constant.int 1
    %7381 = torch.aten.mul.Scalar %7380, %int1_9352 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_9353 = torch.constant.int 1
    %7382 = torch.aten.mul.Scalar %7377, %int1_9353 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_9354 = torch.constant.int 1
    %7383 = torch.aten.add.Tensor %7381, %7382, %int1_9354 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_9355 = torch.constant.int 5
    %7384 = torch.prims.convert_element_type %7383, %int5_9355 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Split the [1,9216] f16 modulation output %7384 into three [1,1,3072] pieces
    // along the last dimension:
    //   %7388 = columns [0, 3072), %7389 = [3072, 6144), %7390 = [6144, 9216).
    // Within this chunk %7388 is added to the normalized activations and %7391
    // (= %7389 + 1) multiplies them; %7390 is not consumed before the chunk ends.
    %int0_9356 = torch.constant.int 0
    %int0_9357 = torch.constant.int 0
    %int9223372036854775807_9358 = torch.constant.int 9223372036854775807
    %int1_9359 = torch.constant.int 1
    // Slice over dim 0 with end = INT64_MAX: result type equals the operand type.
    %7385 = torch.aten.slice.Tensor %7384, %int0_9356, %int0_9357, %int9223372036854775807_9358, %int1_9359 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    %int1_9360 = torch.constant.int 1
    // Insert a size-1 axis at position 1: [1,9216] -> [1,1,9216].
    %7386 = torch.aten.unsqueeze %7385, %int1_9360 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int2_9361 = torch.constant.int 2
    %int0_9362 = torch.constant.int 0
    %int9223372036854775807_9363 = torch.constant.int 9223372036854775807
    %int1_9364 = torch.constant.int 1
    // Slice over dim 2 from 0 to INT64_MAX: again keeps the full [1,1,9216] shape.
    %7387 = torch.aten.slice.Tensor %7386, %int2_9361, %int0_9362, %int9223372036854775807_9363, %int1_9364 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int-1_9365 = torch.constant.int -1
    %int0_9366 = torch.constant.int 0
    %int3072_9367 = torch.constant.int 3072
    %int1_9368 = torch.constant.int 1
    // First third: last-dim range [0, 3072).
    %7388 = torch.aten.slice.Tensor %7387, %int-1_9365, %int0_9366, %int3072_9367, %int1_9368 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_9369 = torch.constant.int -1
    %int3072_9370 = torch.constant.int 3072
    %int6144_9371 = torch.constant.int 6144
    %int1_9372 = torch.constant.int 1
    // Second third: last-dim range [3072, 6144).
    %7389 = torch.aten.slice.Tensor %7387, %int-1_9369, %int3072_9370, %int6144_9371, %int1_9372 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_9373 = torch.constant.int -1
    %int6144_9374 = torch.constant.int 6144
    %int9216_9375 = torch.constant.int 9216
    %int1_9376 = torch.constant.int 1
    // Final third: last-dim range [6144, 9216).
    %7390 = torch.aten.slice.Tensor %7387, %int-1_9373, %int6144_9374, %int9216_9375, %int1_9376 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int1_9377 = torch.constant.int 1
    %int1_9378 = torch.constant.int 1
    // %7391 = %7389 + 1 (scalar 1, alpha 1).
    %7391 = torch.aten.add.Scalar %7389, %int1_9377, %int1_9378 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %7372 ([1,4608,3072] f16, the output of the previous stage) over its
    // last dimension in f32: (x - mean) * rsqrt(var + eps). No learned weight or bias is
    // applied in this normalization. The result is cast back to f16, multiplied by
    // %7391, offset by %7388, and flattened to [4608,3072] for the following matmul.
    %int6_9379 = torch.constant.int 6
    // dtype code 6 yields f32 here (see result type).
    %7392 = torch.prims.convert_element_type %7372, %int6_9379 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_9380 = torch.constant.int 2
    %7393 = torch.prim.ListConstruct %int2_9380 : (!torch.int) -> !torch.list<int>
    %int0_9381 = torch.constant.int 0
    %true_9382 = torch.constant.bool true
    // Variance (%result0_9383) and mean (%result1_9384) over dim 2 with correction 0;
    // the reduced dimension is kept as size 1.
    %result0_9383, %result1_9384 = torch.aten.var_mean.correction %7392, %7393, %int0_9381, %true_9382 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // Epsilon (about 1e-6) added to the variance before the reciprocal square root.
    %float9.999990e-07_9385 = torch.constant.float 9.9999999999999995E-7
    %int1_9386 = torch.constant.int 1
    %7394 = torch.aten.add.Scalar %result0_9383, %float9.999990e-07_9385, %int1_9386 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %7395 = torch.aten.rsqrt %7394 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    %int1_9387 = torch.constant.int 1
    // The subtraction takes the f16 tensor %7372 and the f32 mean; the result is f32.
    %7396 = torch.aten.sub.Tensor %7372, %result1_9384, %int1_9387 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %7397 = torch.aten.mul.Tensor %7396, %7395 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_9388 = torch.constant.int 5
    // dtype code 5 yields f16 here (see result type).
    %7398 = torch.prims.convert_element_type %7397, %int5_9388 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Broadcast multiply by %7391 ([1,1,3072]) and broadcast add of %7388 ([1,1,3072]).
    %7399 = torch.aten.mul.Tensor %7391, %7398 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_9389 = torch.constant.int 1
    %7400 = torch.aten.add.Tensor %7399, %7388, %int1_9389 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int4608_9390 = torch.constant.int 4608
    %int3072_9391 = torch.constant.int 3072
    %7401 = torch.prim.ListConstruct %int4608_9390, %int3072_9391 : (!torch.int, !torch.int) -> !torch.list<int>
    // Drop the leading size-1 dimension: [1,4608,3072] -> [4608,3072].
    %7402 = torch.aten.view %7400, %7401 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // single_blocks.10.linear1: %7402 [4608,3072] x weight^T [3072,21504] + bias [21504].
    // Inputs, weight and bias are all upcast to f32, the matmul and bias add run in f32,
    // and the result is cast back to f16 and reshaped to [1,4608,21504].
    %__auto.sampler.single_blocks.10.linear1.weight = util.global.load @__auto.sampler.single_blocks.10.linear1.weight : tensor<21504x3072xf16>
    %7403 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.10.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_9392 = torch.constant.int 0
    %int1_9393 = torch.constant.int 1
    // Transpose the stored [21504,3072] weight to [3072,21504].
    %7404 = torch.aten.transpose.int %7403, %int0_9392, %int1_9393 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.10.linear1.bias = util.global.load @__auto.sampler.single_blocks.10.linear1.bias : tensor<21504xf16>
    %7405 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.10.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    %int6_9394 = torch.constant.int 6
    %7406 = torch.prims.convert_element_type %7405, %int6_9394 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_9395 = torch.constant.int 6
    %7407 = torch.prims.convert_element_type %7402, %int6_9395 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_9396 = torch.constant.int 6
    %7408 = torch.prims.convert_element_type %7404, %int6_9396 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %7409 = torch.aten.mm %7407, %7408 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Both the matmul result and the bias are multiplied by the integer constant 1
    // before the add, so these two ops leave the values unchanged.
    %int1_9397 = torch.constant.int 1
    %7410 = torch.aten.mul.Scalar %7409, %int1_9397 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_9398 = torch.constant.int 1
    %7411 = torch.aten.mul.Scalar %7406, %int1_9398 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_9399 = torch.constant.int 1
    %7412 = torch.aten.add.Tensor %7410, %7411, %int1_9399 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_9400 = torch.constant.int 5
    %7413 = torch.prims.convert_element_type %7412, %int5_9400 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_9401 = torch.constant.int 1
    %int4608_9402 = torch.constant.int 4608
    %int21504_9403 = torch.constant.int 21504
    %7414 = torch.prim.ListConstruct %int1_9401, %int4608_9402, %int21504_9403 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // Restore the leading size-1 dimension: [4608,21504] -> [1,4608,21504].
    %7415 = torch.aten.view %7413, %7414 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the linear1 output along the last dim: columns [0, 9216) -> %7416,
    // columns [9216, 21504) -> %7417 (12288 wide; later fed to gelu).
    %int-1_9404 = torch.constant.int -1
    %int0_9405 = torch.constant.int 0
    %int9216_9406 = torch.constant.int 9216
    %int1_9407 = torch.constant.int 1
    %7416 = torch.aten.slice.Tensor %7415, %int-1_9404, %int0_9405, %int9216_9406, %int1_9407 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_9408 = torch.constant.int -1
    %int9216_9409 = torch.constant.int 9216
    %int21504_9410 = torch.constant.int 21504
    %int1_9411 = torch.constant.int 1
    %7417 = torch.aten.slice.Tensor %7415, %int-1_9408, %int9216_9409, %int21504_9410, %int1_9411 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View the 9216-wide part as [1,4608,3,24,128] and permute with (2,0,3,1,4)
    // to [3,1,24,4608,128], so the size-3 axis becomes the leading dim.
    %int1_9412 = torch.constant.int 1
    %int4608_9413 = torch.constant.int 4608
    %int3_9414 = torch.constant.int 3
    %int24_9415 = torch.constant.int 24
    %int128_9416 = torch.constant.int 128
    %7418 = torch.prim.ListConstruct %int1_9412, %int4608_9413, %int3_9414, %int24_9415, %int128_9416 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7419 = torch.aten.view %7416, %7418 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    %int2_9417 = torch.constant.int 2
    %int0_9418 = torch.constant.int 0
    %int3_9419 = torch.constant.int 3
    %int1_9420 = torch.constant.int 1
    %int4_9421 = torch.constant.int 4
    %7420 = torch.prim.ListConstruct %int2_9417, %int0_9418, %int3_9419, %int1_9420, %int4_9421 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7421 = torch.aten.permute %7419, %7420 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Select indices 0, 1 and 2 along dim 0. Further down, %7422 is scaled by
    // query_norm.scale, %7423 by key_norm.scale, and %7424 is the third tensor
    // operand of the attention op (i.e. these act as q, k and v respectively).
    %int0_9422 = torch.constant.int 0
    %int0_9423 = torch.constant.int 0
    %7422 = torch.aten.select.int %7421, %int0_9422, %int0_9423 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_9424 = torch.constant.int 0
    %int1_9425 = torch.constant.int 1
    %7423 = torch.aten.select.int %7421, %int0_9424, %int1_9425 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_9426 = torch.constant.int 0
    %int2_9427 = torch.constant.int 2
    %7424 = torch.aten.select.int %7421, %int0_9426, %int2_9427 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Query normalisation for single_blocks.10, computed in f32:
    //   y = x * rsqrt(mean(x^2 over the last dim, keepdim=true) + ~1e-6)
    // then cast to f16 and multiplied elementwise by the 128-element query_norm.scale.
    %int6_9428 = torch.constant.int 6
    %7425 = torch.prims.convert_element_type %7422, %int6_9428 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_9429 = torch.constant.int 2
    %7426 = torch.aten.pow.Tensor_Scalar %7425, %int2_9429 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_9430 = torch.constant.int -1
    %7427 = torch.prim.ListConstruct %int-1_9430 : (!torch.int) -> !torch.list<int>
    %true_9431 = torch.constant.bool true
    %none_9432 = torch.constant.none
    %7428 = torch.aten.mean.dim %7426, %7427, %true_9431, %none_9432 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_9433 = torch.constant.float 9.9999999999999995E-7
    %int1_9434 = torch.constant.int 1
    %7429 = torch.aten.add.Scalar %7428, %float9.999990e-07_9433, %int1_9434 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %7430 = torch.aten.rsqrt %7429 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %7431 = torch.aten.mul.Tensor %7425, %7430 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_9435 = torch.constant.int 5
    %7432 = torch.prims.convert_element_type %7431, %int5_9435 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.10.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.10.norm.query_norm.scale : tensor<128xf16>
    %7433 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.10.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %7434 = torch.aten.mul.Tensor %7432, %7433 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Key normalisation: the same op sequence applied to %7423, scaled by key_norm.scale.
    %int6_9436 = torch.constant.int 6
    %7435 = torch.prims.convert_element_type %7423, %int6_9436 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_9437 = torch.constant.int 2
    %7436 = torch.aten.pow.Tensor_Scalar %7435, %int2_9437 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_9438 = torch.constant.int -1
    %7437 = torch.prim.ListConstruct %int-1_9438 : (!torch.int) -> !torch.list<int>
    %true_9439 = torch.constant.bool true
    %none_9440 = torch.constant.none
    %7438 = torch.aten.mean.dim %7436, %7437, %true_9439, %none_9440 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_9441 = torch.constant.float 9.9999999999999995E-7
    %int1_9442 = torch.constant.int 1
    %7439 = torch.aten.add.Scalar %7438, %float9.999990e-07_9441, %int1_9442 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %7440 = torch.aten.rsqrt %7439 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %7441 = torch.aten.mul.Tensor %7435, %7440 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_9443 = torch.constant.int 5
    %7442 = torch.prims.convert_element_type %7441, %int5_9443 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.10.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.10.norm.key_norm.scale : tensor<128xf16>
    %7443 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.10.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %7444 = torch.aten.mul.Tensor %7442, %7443 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Combine the normalised query (%7434) and key (%7444) with %211, a
    // [1,1,4608,64,2,2] f32 tensor defined earlier in the function
    // (NOTE(review): presumably the rotary position-embedding table -- confirm).
    //
    // These two casts target f16 on tensors that are already f16, so the result
    // type equals the input type.
    %int5_9444 = torch.constant.int 5
    %7445 = torch.prims.convert_element_type %7434, %int5_9444 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_9445 = torch.constant.int 5
    %7446 = torch.prims.convert_element_type %7444, %int5_9445 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Upcast each to f32 and view the trailing 128 elements as [64,1,2],
    // i.e. 64 pairs with an inserted size-1 axis for broadcasting against %211.
    %int6_9446 = torch.constant.int 6
    %7447 = torch.prims.convert_element_type %7445, %int6_9446 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_9447 = torch.constant.int 1
    %int24_9448 = torch.constant.int 24
    %int4608_9449 = torch.constant.int 4608
    %int64_9450 = torch.constant.int 64
    %int1_9451 = torch.constant.int 1
    %int2_9452 = torch.constant.int 2
    %7448 = torch.prim.ListConstruct %int1_9447, %int24_9448, %int4608_9449, %int64_9450, %int1_9451, %int2_9452 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7449 = torch.aten.view %7447, %7448 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    %int6_9453 = torch.constant.int 6
    %7450 = torch.prims.convert_element_type %7446, %int6_9453 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_9454 = torch.constant.int 1
    %int24_9455 = torch.constant.int 24
    %int4608_9456 = torch.constant.int 4608
    %int64_9457 = torch.constant.int 64
    %int1_9458 = torch.constant.int 1
    %int2_9459 = torch.constant.int 2
    %7451 = torch.prim.ListConstruct %int1_9454, %int24_9455, %int4608_9456, %int64_9457, %int1_9458, %int2_9459 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7452 = torch.aten.view %7450, %7451 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Query: %7459 = %211[..., 0] * q[..., 0] + %211[..., 1] * q[..., 1]
    // (selects are along dim 5; the products broadcast to [1,24,4608,64,2]).
    %int5_9460 = torch.constant.int 5
    %int0_9461 = torch.constant.int 0
    %7453 = torch.aten.select.int %211, %int5_9460, %int0_9461 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9462 = torch.constant.int 5
    %int0_9463 = torch.constant.int 0
    %7454 = torch.aten.select.int %7449, %int5_9462, %int0_9463 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7455 = torch.aten.mul.Tensor %7453, %7454 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_9464 = torch.constant.int 5
    %int1_9465 = torch.constant.int 1
    %7456 = torch.aten.select.int %211, %int5_9464, %int1_9465 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9466 = torch.constant.int 5
    %int1_9467 = torch.constant.int 1
    %7457 = torch.aten.select.int %7449, %int5_9466, %int1_9467 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7458 = torch.aten.mul.Tensor %7456, %7457 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_9468 = torch.constant.int 1
    %7459 = torch.aten.add.Tensor %7455, %7458, %int1_9468 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key: the same combination applied to %7452, giving %7466.
    %int5_9469 = torch.constant.int 5
    %int0_9470 = torch.constant.int 0
    %7460 = torch.aten.select.int %211, %int5_9469, %int0_9470 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9471 = torch.constant.int 5
    %int0_9472 = torch.constant.int 0
    %7461 = torch.aten.select.int %7452, %int5_9471, %int0_9472 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7462 = torch.aten.mul.Tensor %7460, %7461 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_9473 = torch.constant.int 5
    %int1_9474 = torch.constant.int 1
    %7463 = torch.aten.select.int %211, %int5_9473, %int1_9474 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9475 = torch.constant.int 5
    %int1_9476 = torch.constant.int 1
    %7464 = torch.aten.select.int %7452, %int5_9475, %int1_9476 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7465 = torch.aten.mul.Tensor %7463, %7464 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_9477 = torch.constant.int 1
    %7466 = torch.aten.add.Tensor %7462, %7465, %int1_9477 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Flatten the [64,2] pairs back to 128 and cast both results to f16:
    // %7469 (from the query) and %7472 (from the key).
    %int1_9478 = torch.constant.int 1
    %int24_9479 = torch.constant.int 24
    %int4608_9480 = torch.constant.int 4608
    %int128_9481 = torch.constant.int 128
    %7467 = torch.prim.ListConstruct %int1_9478, %int24_9479, %int4608_9480, %int128_9481 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7468 = torch.aten.view %7459, %7467 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_9482 = torch.constant.int 5
    %7469 = torch.prims.convert_element_type %7468, %int5_9482 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_9483 = torch.constant.int 1
    %int24_9484 = torch.constant.int 24
    %int4608_9485 = torch.constant.int 4608
    %int128_9486 = torch.constant.int 128
    %7470 = torch.prim.ListConstruct %int1_9483, %int24_9484, %int4608_9485, %int128_9486 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7471 = torch.aten.view %7466, %7470 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_9487 = torch.constant.int 5
    %7472 = torch.prims.convert_element_type %7471, %int5_9487 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Attention for single_blocks.10, emitted as a generic torch.operator.
    // Tensor operands are (%7469, %7472, %7424); the remaining operands are
    // 0.0, false, none, none.
    // NOTE(review): per the aten signature these would be dropout_p, is_causal,
    // attn_mask and scale -- confirm against the PyTorch op schema.
    // Only result #0 ([1,24,4608,128] f16) is consumed below.
    %float0.000000e00_9488 = torch.constant.float 0.000000e+00
    %false_9489 = torch.constant.bool false
    %none_9490 = torch.constant.none
    %none_9491 = torch.constant.none
    %7473:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%7469, %7472, %7424, %float0.000000e00_9488, %false_9489, %none_9490, %none_9491) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Permute (0,2,1,3) to [1,4608,24,128], then merge the 24x128 trailing dims
    // into a single 3072-wide dim.
    %int0_9492 = torch.constant.int 0
    %int2_9493 = torch.constant.int 2
    %int1_9494 = torch.constant.int 1
    %int3_9495 = torch.constant.int 3
    %7474 = torch.prim.ListConstruct %int0_9492, %int2_9493, %int1_9494, %int3_9495 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7475 = torch.aten.permute %7473#0, %7474 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_9496 = torch.constant.int 1
    %int4608_9497 = torch.constant.int 4608
    %int3072_9498 = torch.constant.int 3072
    %7476 = torch.prim.ListConstruct %int1_9496, %int4608_9497, %int3072_9498 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7477 = torch.aten.view %7475, %7476 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Apply gelu (approximate mode string "tanh") to the 12288-wide slice %7417,
    // then concatenate it after the attention output along dim 2 -> 15360 wide.
    %str_9499 = torch.constant.str "tanh"
    %7478 = torch.aten.gelu %7417, %str_9499 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %7479 = torch.prim.ListConstruct %7477, %7478 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_9500 = torch.constant.int 2
    %7480 = torch.aten.cat %7479, %int2_9500 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    %int4608_9501 = torch.constant.int 4608
    %int15360_9502 = torch.constant.int 15360
    %7481 = torch.prim.ListConstruct %int4608_9501, %int15360_9502 : (!torch.int, !torch.int) -> !torch.list<int>
    %7482 = torch.aten.view %7480, %7481 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // single_blocks.10 linear2: x @ transpose(linear2.weight) + linear2.bias,
    // evaluated in f32 and cast back to f16 afterwards.
    %__auto.sampler.single_blocks.10.linear2.weight = util.global.load @__auto.sampler.single_blocks.10.linear2.weight : tensor<3072x15360xf16>
    %7483 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.10.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_9503 = torch.constant.int 0
    %int1_9504 = torch.constant.int 1
    %7484 = torch.aten.transpose.int %7483, %int0_9503, %int1_9504 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.10.linear2.bias = util.global.load @__auto.sampler.single_blocks.10.linear2.bias : tensor<3072xf16>
    %7485 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.10.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_9505 = torch.constant.int 6
    %7486 = torch.prims.convert_element_type %7485, %int6_9505 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_9506 = torch.constant.int 6
    %7487 = torch.prims.convert_element_type %7482, %int6_9506 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_9507 = torch.constant.int 6
    %7488 = torch.prims.convert_element_type %7484, %int6_9507 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %7489 = torch.aten.mm %7487, %7488 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Product and bias are each multiplied by the integer constant 1 (values unchanged).
    %int1_9508 = torch.constant.int 1
    %7490 = torch.aten.mul.Scalar %7489, %int1_9508 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_9509 = torch.constant.int 1
    %7491 = torch.aten.mul.Scalar %7486, %int1_9509 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_9510 = torch.constant.int 1
    %7492 = torch.aten.add.Tensor %7490, %7491, %int1_9510 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_9511 = torch.constant.int 5
    %7493 = torch.prims.convert_element_type %7492, %int5_9511 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_9512 = torch.constant.int 1
    %int4608_9513 = torch.constant.int 4608
    %int3072_9514 = torch.constant.int 3072
    %7494 = torch.prim.ListConstruct %int1_9512, %int4608_9513, %int3072_9514 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7495 = torch.aten.view %7493, %7494 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Block output: %7497 = %7372 + %7390 * linear2_out.
    // %7390 ([1,1,3072]) and %7372 ([1,4608,3072]) are defined earlier in the
    // function (NOTE(review): presumably the modulation gate and the block's
    // input hidden state -- confirm at their definitions).
    %7496 = torch.aten.mul.Tensor %7390, %7495 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_9515 = torch.constant.int 1
    %7497 = torch.aten.add.Tensor %7372, %7496, %int1_9515 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.11 modulation: silu(%119) @ transpose(modulation.lin.weight)
    // + modulation.lin.bias, evaluated in f32 and cast back to f16 -> [1,9216].
    // %119 ([1,3072] f16) is defined earlier in the function
    // (NOTE(review): presumably the conditioning vector -- confirm).
    %7498 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.11.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.11.modulation.lin.weight : tensor<9216x3072xf16>
    %7499 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.11.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_9516 = torch.constant.int 0
    %int1_9517 = torch.constant.int 1
    %7500 = torch.aten.transpose.int %7499, %int0_9516, %int1_9517 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.11.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.11.modulation.lin.bias : tensor<9216xf16>
    %7501 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.11.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    %int6_9518 = torch.constant.int 6
    %7502 = torch.prims.convert_element_type %7501, %int6_9518 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_9519 = torch.constant.int 6
    %7503 = torch.prims.convert_element_type %7498, %int6_9519 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_9520 = torch.constant.int 6
    %7504 = torch.prims.convert_element_type %7500, %int6_9520 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %7505 = torch.aten.mm %7503, %7504 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    %int1_9521 = torch.constant.int 1
    %7506 = torch.aten.mul.Scalar %7505, %int1_9521 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_9522 = torch.constant.int 1
    %7507 = torch.aten.mul.Scalar %7502, %int1_9522 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_9523 = torch.constant.int 1
    %7508 = torch.aten.add.Tensor %7506, %7507, %int1_9523 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_9524 = torch.constant.int 5
    %7509 = torch.prims.convert_element_type %7508, %int5_9524 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Full-range slices (start 0, end INT64_MAX, step 1) around an unsqueeze at
    // dim 1: the shape goes [1,9216] -> [1,1,9216] with no elements dropped.
    %int0_9525 = torch.constant.int 0
    %int0_9526 = torch.constant.int 0
    %int9223372036854775807_9527 = torch.constant.int 9223372036854775807
    %int1_9528 = torch.constant.int 1
    %7510 = torch.aten.slice.Tensor %7509, %int0_9525, %int0_9526, %int9223372036854775807_9527, %int1_9528 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    %int1_9529 = torch.constant.int 1
    %7511 = torch.aten.unsqueeze %7510, %int1_9529 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int2_9530 = torch.constant.int 2
    %int0_9531 = torch.constant.int 0
    %int9223372036854775807_9532 = torch.constant.int 9223372036854775807
    %int1_9533 = torch.constant.int 1
    %7512 = torch.aten.slice.Tensor %7511, %int2_9530, %int0_9531, %int9223372036854775807_9532, %int1_9533 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Split into three 3072-wide chunks along the last dim:
    //   %7513 = [0, 3072)     -- added after normalisation further down (shift)
    //   %7514 = [3072, 6144)  -- becomes %7516 = 1 + %7514, used as a multiplier (scale)
    //   %7515 = [6144, 9216)  -- consumed later in the function
    %int-1_9534 = torch.constant.int -1
    %int0_9535 = torch.constant.int 0
    %int3072_9536 = torch.constant.int 3072
    %int1_9537 = torch.constant.int 1
    %7513 = torch.aten.slice.Tensor %7512, %int-1_9534, %int0_9535, %int3072_9536, %int1_9537 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_9538 = torch.constant.int -1
    %int3072_9539 = torch.constant.int 3072
    %int6144_9540 = torch.constant.int 6144
    %int1_9541 = torch.constant.int 1
    %7514 = torch.aten.slice.Tensor %7512, %int-1_9538, %int3072_9539, %int6144_9540, %int1_9541 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_9542 = torch.constant.int -1
    %int6144_9543 = torch.constant.int 6144
    %int9216_9544 = torch.constant.int 9216
    %int1_9545 = torch.constant.int 1
    %7515 = torch.aten.slice.Tensor %7512, %int-1_9542, %int6144_9543, %int9216_9544, %int1_9545 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int1_9546 = torch.constant.int 1
    %int1_9547 = torch.constant.int 1
    %7516 = torch.aten.add.Scalar %7514, %int1_9546, %int1_9547 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalise %7497 over dim 2 without learned affine parameters:
    //   (x - mean) * rsqrt(var + ~1e-6)
    // var/mean come from var_mean.correction with correction 0 and keepdim=true,
    // computed on an f32 copy of %7497.
    %int6_9548 = torch.constant.int 6
    %7517 = torch.prims.convert_element_type %7497, %int6_9548 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_9549 = torch.constant.int 2
    %7518 = torch.prim.ListConstruct %int2_9549 : (!torch.int) -> !torch.list<int>
    %int0_9550 = torch.constant.int 0
    %true_9551 = torch.constant.bool true
    %result0_9552, %result1_9553 = torch.aten.var_mean.correction %7517, %7518, %int0_9550, %true_9551 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    %float9.999990e-07_9554 = torch.constant.float 9.9999999999999995E-7
    %int1_9555 = torch.constant.int 1
    %7519 = torch.aten.add.Scalar %result0_9552, %float9.999990e-07_9554, %int1_9555 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %7520 = torch.aten.rsqrt %7519 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the f16 tensor %7497 and the f32 mean directly;
    // the result type is f32.
    %int1_9556 = torch.constant.int 1
    %7521 = torch.aten.sub.Tensor %7497, %result1_9553, %int1_9556 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %7522 = torch.aten.mul.Tensor %7521, %7520 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_9557 = torch.constant.int 5
    %7523 = torch.prims.convert_element_type %7522, %int5_9557 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Modulate in f16: %7525 = %7516 * normalised + %7513, then drop the
    // leading size-1 dim for the following matmul.
    %7524 = torch.aten.mul.Tensor %7516, %7523 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_9558 = torch.constant.int 1
    %7525 = torch.aten.add.Tensor %7524, %7513, %int1_9558 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int4608_9559 = torch.constant.int 4608
    %int3072_9560 = torch.constant.int 3072
    %7526 = torch.prim.ListConstruct %int4608_9559, %int3072_9560 : (!torch.int, !torch.int) -> !torch.list<int>
    %7527 = torch.aten.view %7525, %7526 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // single_blocks.11 linear1: %7527 @ transpose(linear1.weight) + linear1.bias.
    // Inputs are converted f16 -> f32 (dtype code 6), the matmul and bias add are
    // done in f32, and the result is cast back to f16 (dtype code 5).
    %__auto.sampler.single_blocks.11.linear1.weight = util.global.load @__auto.sampler.single_blocks.11.linear1.weight : tensor<21504x3072xf16>
    %7528 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.11.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_9561 = torch.constant.int 0
    %int1_9562 = torch.constant.int 1
    %7529 = torch.aten.transpose.int %7528, %int0_9561, %int1_9562 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.11.linear1.bias = util.global.load @__auto.sampler.single_blocks.11.linear1.bias : tensor<21504xf16>
    %7530 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.11.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    %int6_9563 = torch.constant.int 6
    %7531 = torch.prims.convert_element_type %7530, %int6_9563 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_9564 = torch.constant.int 6
    %7532 = torch.prims.convert_element_type %7527, %int6_9564 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_9565 = torch.constant.int 6
    %7533 = torch.prims.convert_element_type %7529, %int6_9565 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %7534 = torch.aten.mm %7532, %7533 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Product and bias are each multiplied by the integer constant 1 (values unchanged).
    %int1_9566 = torch.constant.int 1
    %7535 = torch.aten.mul.Scalar %7534, %int1_9566 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_9567 = torch.constant.int 1
    %7536 = torch.aten.mul.Scalar %7531, %int1_9567 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_9568 = torch.constant.int 1
    %7537 = torch.aten.add.Tensor %7535, %7536, %int1_9568 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_9569 = torch.constant.int 5
    %7538 = torch.prims.convert_element_type %7537, %int5_9569 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    // Restore the leading size-1 dim: [4608,21504] -> [1,4608,21504].
    %int1_9570 = torch.constant.int 1
    %int4608_9571 = torch.constant.int 4608
    %int21504_9572 = torch.constant.int 21504
    %7539 = torch.prim.ListConstruct %int1_9570, %int4608_9571, %int21504_9572 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7540 = torch.aten.view %7538, %7539 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the single_blocks.11 linear1 output along the last dim:
    // columns [0, 9216) -> %7541, columns [9216, 21504) -> %7542 (12288 wide).
    %int-1_9573 = torch.constant.int -1
    %int0_9574 = torch.constant.int 0
    %int9216_9575 = torch.constant.int 9216
    %int1_9576 = torch.constant.int 1
    %7541 = torch.aten.slice.Tensor %7540, %int-1_9573, %int0_9574, %int9216_9575, %int1_9576 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_9577 = torch.constant.int -1
    %int9216_9578 = torch.constant.int 9216
    %int21504_9579 = torch.constant.int 21504
    %int1_9580 = torch.constant.int 1
    %7542 = torch.aten.slice.Tensor %7540, %int-1_9577, %int9216_9578, %int21504_9579, %int1_9580 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View the 9216-wide part as [1,4608,3,24,128] and permute with (2,0,3,1,4)
    // to [3,1,24,4608,128], so the size-3 axis becomes the leading dim.
    %int1_9581 = torch.constant.int 1
    %int4608_9582 = torch.constant.int 4608
    %int3_9583 = torch.constant.int 3
    %int24_9584 = torch.constant.int 24
    %int128_9585 = torch.constant.int 128
    %7543 = torch.prim.ListConstruct %int1_9581, %int4608_9582, %int3_9583, %int24_9584, %int128_9585 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7544 = torch.aten.view %7541, %7543 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    %int2_9586 = torch.constant.int 2
    %int0_9587 = torch.constant.int 0
    %int3_9588 = torch.constant.int 3
    %int1_9589 = torch.constant.int 1
    %int4_9590 = torch.constant.int 4
    %7545 = torch.prim.ListConstruct %int2_9586, %int0_9587, %int3_9588, %int1_9589, %int4_9590 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7546 = torch.aten.permute %7544, %7545 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Select indices 0, 1 and 2 along dim 0. %7547 is subsequently scaled by
    // query_norm.scale and %7548 by key_norm.scale; %7549 is consumed later in
    // the function (NOTE(review): presumably the attention value -- confirm).
    %int0_9591 = torch.constant.int 0
    %int0_9592 = torch.constant.int 0
    %7547 = torch.aten.select.int %7546, %int0_9591, %int0_9592 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_9593 = torch.constant.int 0
    %int1_9594 = torch.constant.int 1
    %7548 = torch.aten.select.int %7546, %int0_9593, %int1_9594 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_9595 = torch.constant.int 0
    %int2_9596 = torch.constant.int 2
    %7549 = torch.aten.select.int %7546, %int0_9595, %int2_9596 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- single_blocks.11: RMS norm of %7547 followed by query_norm.scale ----
    // Computed in f32, then cast back to f16:
    //   x     = f32(%7547)
    //   y     = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + ~1e-6)
    //   %7559 = f16(y) * query_norm.scale        (scale is [128], broadcast over the last axis)
    // NOTE(review): %7547 is defined above this excerpt; presumably the query slice of the
    // fused QKV projection (it is paired with the query_norm scale here) - confirm.
    // dtype code 6 produces f32 here (see the result type).
    %int6_9597 = torch.constant.int 6
    %7550 = torch.prims.convert_element_type %7547, %int6_9597 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_9598 = torch.constant.int 2
    %7551 = torch.aten.pow.Tensor_Scalar %7550, %int2_9598 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean of squares over the last axis, keeping it as size 1 so it broadcasts below.
    %int-1_9599 = torch.constant.int -1
    %7552 = torch.prim.ListConstruct %int-1_9599 : (!torch.int) -> !torch.list<int>
    %true_9600 = torch.constant.bool true
    %none_9601 = torch.constant.none
    %7553 = torch.aten.mean.dim %7551, %7552, %true_9600, %none_9601 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // Epsilon added before rsqrt.
    %float9.999990e-07_9602 = torch.constant.float 9.9999999999999995E-7
    %int1_9603 = torch.constant.int 1
    %7554 = torch.aten.add.Scalar %7553, %float9.999990e-07_9602, %int1_9603 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %7555 = torch.aten.rsqrt %7554 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %7556 = torch.aten.mul.Tensor %7550, %7555 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    // dtype code 5 produces f16 here (see the result type).
    %int5_9604 = torch.constant.int 5
    %7557 = torch.prims.convert_element_type %7556, %int5_9604 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Learned per-channel scale, loaded from the named parameter global.
    %__auto.sampler.single_blocks.11.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.11.norm.query_norm.scale : tensor<128xf16>
    %7558 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.11.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %7559 = torch.aten.mul.Tensor %7557, %7558 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- single_blocks.11: RMS norm of %7548 followed by key_norm.scale ----
    // Same op sequence as the query norm, applied to %7548 with the key_norm scale:
    //   x     = f32(%7548)
    //   y     = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + ~1e-6)
    //   %7569 = f16(y) * key_norm.scale
    // NOTE(review): %7548 is defined above this excerpt; presumably the key slice of the
    // fused QKV projection - confirm.
    %int6_9605 = torch.constant.int 6
    %7560 = torch.prims.convert_element_type %7548, %int6_9605 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_9606 = torch.constant.int 2
    %7561 = torch.aten.pow.Tensor_Scalar %7560, %int2_9606 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean of squares over the last axis (keepdim=true).
    %int-1_9607 = torch.constant.int -1
    %7562 = torch.prim.ListConstruct %int-1_9607 : (!torch.int) -> !torch.list<int>
    %true_9608 = torch.constant.bool true
    %none_9609 = torch.constant.none
    %7563 = torch.aten.mean.dim %7561, %7562, %true_9608, %none_9609 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // Epsilon added before rsqrt.
    %float9.999990e-07_9610 = torch.constant.float 9.9999999999999995E-7
    %int1_9611 = torch.constant.int 1
    %7564 = torch.aten.add.Scalar %7563, %float9.999990e-07_9610, %int1_9611 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %7565 = torch.aten.rsqrt %7564 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %7566 = torch.aten.mul.Tensor %7560, %7565 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    // Cast back to f16 before applying the f16 scale.
    %int5_9612 = torch.constant.int 5
    %7567 = torch.prims.convert_element_type %7566, %int5_9612 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.11.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.11.norm.key_norm.scale : tensor<128xf16>
    %7568 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.11.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %7569 = torch.aten.mul.Tensor %7567, %7568 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- single_blocks.11: apply the per-position 2x2 transform %211 to q (%7559) and k (%7569) ----
    // Each 128-wide head vector is viewed as 64 pairs ([..., 64, 1, 2]) in f32. With
    // %211 of shape [1,1,4608,64,2,2], the result for each pair is
    //   out[..., :] = %211[..., :, 0] * x[..., 0] + %211[..., :, 1] * x[..., 1]
    // and is then flattened back to [1,24,4608,128] and cast to f16.
    // Outputs: %7594 (transformed q) and %7597 (transformed k).
    // NOTE(review): %211 is defined outside this excerpt; presumably the rotary
    // position-embedding table (RoPE) - confirm.
    // The next two casts are f16 -> f16 (input and result element types are identical).
    %int5_9613 = torch.constant.int 5
    %7570 = torch.prims.convert_element_type %7559, %int5_9613 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_9614 = torch.constant.int 5
    %7571 = torch.prims.convert_element_type %7569, %int5_9614 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // q: cast to f32 and view the last axis (128) as [64, 1, 2].
    %int6_9615 = torch.constant.int 6
    %7572 = torch.prims.convert_element_type %7570, %int6_9615 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_9616 = torch.constant.int 1
    %int24_9617 = torch.constant.int 24
    %int4608_9618 = torch.constant.int 4608
    %int64_9619 = torch.constant.int 64
    %int1_9620 = torch.constant.int 1
    %int2_9621 = torch.constant.int 2
    %7573 = torch.prim.ListConstruct %int1_9616, %int24_9617, %int4608_9618, %int64_9619, %int1_9620, %int2_9621 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7574 = torch.aten.view %7572, %7573 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // k: same cast and view.
    %int6_9622 = torch.constant.int 6
    %7575 = torch.prims.convert_element_type %7571, %int6_9622 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_9623 = torch.constant.int 1
    %int24_9624 = torch.constant.int 24
    %int4608_9625 = torch.constant.int 4608
    %int64_9626 = torch.constant.int 64
    %int1_9627 = torch.constant.int 1
    %int2_9628 = torch.constant.int 2
    %7576 = torch.prim.ListConstruct %int1_9623, %int24_9624, %int4608_9625, %int64_9626, %int1_9627, %int2_9628 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7577 = torch.aten.view %7575, %7576 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // q, first term: %211[..., 0] * q_pairs[..., 0]  (the size-1 axis of q broadcasts against 2).
    %int5_9629 = torch.constant.int 5
    %int0_9630 = torch.constant.int 0
    %7578 = torch.aten.select.int %211, %int5_9629, %int0_9630 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9631 = torch.constant.int 5
    %int0_9632 = torch.constant.int 0
    %7579 = torch.aten.select.int %7574, %int5_9631, %int0_9632 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7580 = torch.aten.mul.Tensor %7578, %7579 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // q, second term: %211[..., 1] * q_pairs[..., 1].
    %int5_9633 = torch.constant.int 5
    %int1_9634 = torch.constant.int 1
    %7581 = torch.aten.select.int %211, %int5_9633, %int1_9634 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9635 = torch.constant.int 5
    %int1_9636 = torch.constant.int 1
    %7582 = torch.aten.select.int %7574, %int5_9635, %int1_9636 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7583 = torch.aten.mul.Tensor %7581, %7582 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // q: sum of the two terms.
    %int1_9637 = torch.constant.int 1
    %7584 = torch.aten.add.Tensor %7580, %7583, %int1_9637 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k, first term: %211[..., 0] * k_pairs[..., 0].
    %int5_9638 = torch.constant.int 5
    %int0_9639 = torch.constant.int 0
    %7585 = torch.aten.select.int %211, %int5_9638, %int0_9639 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9640 = torch.constant.int 5
    %int0_9641 = torch.constant.int 0
    %7586 = torch.aten.select.int %7577, %int5_9640, %int0_9641 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7587 = torch.aten.mul.Tensor %7585, %7586 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k, second term: %211[..., 1] * k_pairs[..., 1].
    %int5_9642 = torch.constant.int 5
    %int1_9643 = torch.constant.int 1
    %7588 = torch.aten.select.int %211, %int5_9642, %int1_9643 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9644 = torch.constant.int 5
    %int1_9645 = torch.constant.int 1
    %7589 = torch.aten.select.int %7577, %int5_9644, %int1_9645 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7590 = torch.aten.mul.Tensor %7588, %7589 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k: sum of the two terms.
    %int1_9646 = torch.constant.int 1
    %7591 = torch.aten.add.Tensor %7587, %7590, %int1_9646 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // q: flatten the [64, 2] pairs back to 128 and cast to f16.
    %int1_9647 = torch.constant.int 1
    %int24_9648 = torch.constant.int 24
    %int4608_9649 = torch.constant.int 4608
    %int128_9650 = torch.constant.int 128
    %7592 = torch.prim.ListConstruct %int1_9647, %int24_9648, %int4608_9649, %int128_9650 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7593 = torch.aten.view %7584, %7592 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_9651 = torch.constant.int 5
    %7594 = torch.prims.convert_element_type %7593, %int5_9651 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // k: flatten the [64, 2] pairs back to 128 and cast to f16.
    %int1_9652 = torch.constant.int 1
    %int24_9653 = torch.constant.int 24
    %int4608_9654 = torch.constant.int 4608
    %int128_9655 = torch.constant.int 128
    %7595 = torch.prim.ListConstruct %int1_9652, %int24_9653, %int4608_9654, %int128_9655 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7596 = torch.aten.view %7591, %7595 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_9656 = torch.constant.int 5
    %7597 = torch.prims.convert_element_type %7596, %int5_9656 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- single_blocks.11: scaled-dot-product attention and head merge ----
    // Operands are (%7594, %7597, %7549) followed by 0.0, false, none, none.
    // NOTE(review): per the aten schema for _scaled_dot_product_flash_attention_for_cpu these are
    // presumably (query, key, value, dropout_p, is_causal, attn_mask, scale) - confirm.
    // Only result #0 ([1,24,4608,128] f16) is consumed below; result #1 ([1,24,4608] f32) is not
    // used within this excerpt.
    %float0.000000e00_9657 = torch.constant.float 0.000000e+00
    %false_9658 = torch.constant.bool false
    %none_9659 = torch.constant.none
    %none_9660 = torch.constant.none
    %7598:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%7594, %7597, %7549, %float0.000000e00_9657, %false_9658, %none_9659, %none_9660) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap axes 1 and 2: [1,24,4608,128] -> [1,4608,24,128].
    %int0_9661 = torch.constant.int 0
    %int2_9662 = torch.constant.int 2
    %int1_9663 = torch.constant.int 1
    %int3_9664 = torch.constant.int 3
    %7599 = torch.prim.ListConstruct %int0_9661, %int2_9662, %int1_9663, %int3_9664 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7600 = torch.aten.permute %7598#0, %7599 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the 24 x 128 trailing axes into a single 3072-wide axis.
    %int1_9665 = torch.constant.int 1
    %int4608_9666 = torch.constant.int 4608
    %int3072_9667 = torch.constant.int 3072
    %7601 = torch.prim.ListConstruct %int1_9665, %int4608_9666, %int3072_9667 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7602 = torch.aten.view %7600, %7601 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // ---- single_blocks.11: GELU branch, concat with attention, linear2, gated residual ----
    //   h     = cat([%7602 (attention, 3072), gelu_tanh(%7542) (12288)], dim=2)   -> [1,4608,15360]
    //   y     = f16( f32(h) @ f32(linear2.weight)^T + f32(linear2.bias) )         -> [1,4608,3072]
    //   %7622 = %7497 + %7515 * y
    // NOTE(review): %7542, %7515 and %7497 are defined above this excerpt; presumably the MLP
    // half of linear1's output, the modulation gate, and the block input (residual) - confirm.
    %str_9668 = torch.constant.str "tanh"
    %7603 = torch.aten.gelu %7542, %str_9668 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    // Concatenate along the feature axis (dim 2): 3072 + 12288 = 15360.
    %7604 = torch.prim.ListConstruct %7602, %7603 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_9669 = torch.constant.int 2
    %7605 = torch.aten.cat %7604, %int2_9669 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Drop the leading size-1 axis so the projection can be expressed as a 2-D mm.
    %int4608_9670 = torch.constant.int 4608
    %int15360_9671 = torch.constant.int 15360
    %7606 = torch.prim.ListConstruct %int4608_9670, %int15360_9671 : (!torch.int, !torch.int) -> !torch.list<int>
    %7607 = torch.aten.view %7605, %7606 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // linear2 weight is stored [out=3072, in=15360]; transpose to [15360, 3072] for the mm.
    %__auto.sampler.single_blocks.11.linear2.weight = util.global.load @__auto.sampler.single_blocks.11.linear2.weight : tensor<3072x15360xf16>
    %7608 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.11.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_9672 = torch.constant.int 0
    %int1_9673 = torch.constant.int 1
    %7609 = torch.aten.transpose.int %7608, %int0_9672, %int1_9673 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.11.linear2.bias = util.global.load @__auto.sampler.single_blocks.11.linear2.bias : tensor<3072xf16>
    %7610 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.11.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Bias, activation and weight are all cast to f32; the matmul and bias add run in f32.
    %int6_9674 = torch.constant.int 6
    %7611 = torch.prims.convert_element_type %7610, %int6_9674 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_9675 = torch.constant.int 6
    %7612 = torch.prims.convert_element_type %7607, %int6_9675 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_9676 = torch.constant.int 6
    %7613 = torch.prims.convert_element_type %7609, %int6_9676 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %7614 = torch.aten.mm %7612, %7613 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Both operands are multiplied by the integer constant 1 before the add.
    // NOTE(review): presumably the alpha/beta factors of a decomposed addmm - confirm.
    %int1_9677 = torch.constant.int 1
    %7615 = torch.aten.mul.Scalar %7614, %int1_9677 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_9678 = torch.constant.int 1
    %7616 = torch.aten.mul.Scalar %7611, %int1_9678 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_9679 = torch.constant.int 1
    %7617 = torch.aten.add.Tensor %7615, %7616, %int1_9679 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    // Cast the result back to f16 and restore the leading size-1 axis.
    %int5_9680 = torch.constant.int 5
    %7618 = torch.prims.convert_element_type %7617, %int5_9680 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_9681 = torch.constant.int 1
    %int4608_9682 = torch.constant.int 4608
    %int3072_9683 = torch.constant.int 3072
    %7619 = torch.prim.ListConstruct %int1_9681, %int4608_9682, %int3072_9683 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7620 = torch.aten.view %7618, %7619 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Scale by %7515 ([1,1,3072], broadcast over the 4608 axis) and add to %7497.
    %7621 = torch.aten.mul.Tensor %7515, %7620 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_9684 = torch.constant.int 1
    %7622 = torch.aten.add.Tensor %7497, %7621, %int1_9684 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // ---- single_blocks.12: modulation vector ----
    //   m = f16( f32(silu(%119)) @ f32(modulation.lin.weight)^T + f32(modulation.lin.bias) )  -> [1,9216]
    // m is reshaped to [1,1,9216] and split along the last axis into three 3072-wide chunks:
    //   %7638 = m[..., 0:3072], %7639 = m[..., 3072:6144], %7640 = m[..., 6144:9216]
    // and %7641 = %7639 + 1.
    // NOTE(review): %119 is defined outside this excerpt; presumably the conditioning vector
    // shared by all blocks - confirm.
    %7623 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Weight is stored [out=9216, in=3072]; transpose to [3072, 9216] for the mm.
    %__auto.sampler.single_blocks.12.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.12.modulation.lin.weight : tensor<9216x3072xf16>
    %7624 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.12.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_9685 = torch.constant.int 0
    %int1_9686 = torch.constant.int 1
    %7625 = torch.aten.transpose.int %7624, %int0_9685, %int1_9686 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.12.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.12.modulation.lin.bias : tensor<9216xf16>
    %7626 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.12.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Bias, input and weight are cast to f32; the matmul and bias add run in f32.
    %int6_9687 = torch.constant.int 6
    %7627 = torch.prims.convert_element_type %7626, %int6_9687 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_9688 = torch.constant.int 6
    %7628 = torch.prims.convert_element_type %7623, %int6_9688 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_9689 = torch.constant.int 6
    %7629 = torch.prims.convert_element_type %7625, %int6_9689 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %7630 = torch.aten.mm %7628, %7629 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Both operands are multiplied by the integer constant 1 before the add.
    %int1_9690 = torch.constant.int 1
    %7631 = torch.aten.mul.Scalar %7630, %int1_9690 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_9691 = torch.constant.int 1
    %7632 = torch.aten.mul.Scalar %7627, %int1_9691 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_9692 = torch.constant.int 1
    %7633 = torch.aten.add.Tensor %7631, %7632, %int1_9692 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_9693 = torch.constant.int 5
    %7634 = torch.prims.convert_element_type %7633, %int5_9693 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1); the shape is unchanged.
    %int0_9694 = torch.constant.int 0
    %int0_9695 = torch.constant.int 0
    %int9223372036854775807_9696 = torch.constant.int 9223372036854775807
    %int1_9697 = torch.constant.int 1
    %7635 = torch.aten.slice.Tensor %7634, %int0_9694, %int0_9695, %int9223372036854775807_9696, %int1_9697 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a size-1 axis at position 1: [1,9216] -> [1,1,9216].
    %int1_9698 = torch.constant.int 1
    %7636 = torch.aten.unsqueeze %7635, %int1_9698 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Full-range slice on dim 2; the shape is unchanged.
    %int2_9699 = torch.constant.int 2
    %int0_9700 = torch.constant.int 0
    %int9223372036854775807_9701 = torch.constant.int 9223372036854775807
    %int1_9702 = torch.constant.int 1
    %7637 = torch.aten.slice.Tensor %7636, %int2_9699, %int0_9700, %int9223372036854775807_9701, %int1_9702 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Chunk 0: last-axis range [0, 3072).
    %int-1_9703 = torch.constant.int -1
    %int0_9704 = torch.constant.int 0
    %int3072_9705 = torch.constant.int 3072
    %int1_9706 = torch.constant.int 1
    %7638 = torch.aten.slice.Tensor %7637, %int-1_9703, %int0_9704, %int3072_9705, %int1_9706 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1: last-axis range [3072, 6144).
    %int-1_9707 = torch.constant.int -1
    %int3072_9708 = torch.constant.int 3072
    %int6144_9709 = torch.constant.int 6144
    %int1_9710 = torch.constant.int 1
    %7639 = torch.aten.slice.Tensor %7637, %int-1_9707, %int3072_9708, %int6144_9709, %int1_9710 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2: last-axis range [6144, 9216).
    %int-1_9711 = torch.constant.int -1
    %int6144_9712 = torch.constant.int 6144
    %int9216_9713 = torch.constant.int 9216
    %int1_9714 = torch.constant.int 1
    %7640 = torch.aten.slice.Tensor %7637, %int-1_9711, %int6144_9712, %int9216_9713, %int1_9714 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %7641 = chunk 1 + 1.
    %int1_9715 = torch.constant.int 1
    %int1_9716 = torch.constant.int 1
    %7641 = torch.aten.add.Scalar %7639, %int1_9715, %int1_9716 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // ---- single_blocks.12: normalize %7622 over the feature axis, then scale and shift ----
    //   var, mean = var_mean(f32(%7622), dim=2, correction=0, keepdim=true)
    //   n         = f16( (%7622 - mean) * rsqrt(var + ~1e-6) )
    //   %7650     = %7641 * n + %7638
    // No learned weight or bias is applied in the normalization itself; the scale (%7641) and
    // shift (%7638) are [1,1,3072] tensors broadcast over the 4608 axis.
    %int6_9717 = torch.constant.int 6
    %7642 = torch.prims.convert_element_type %7622, %int6_9717 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    // Reduce over dim 2 with correction 0 and keepdim=true.
    %int2_9718 = torch.constant.int 2
    %7643 = torch.prim.ListConstruct %int2_9718 : (!torch.int) -> !torch.list<int>
    %int0_9719 = torch.constant.int 0
    %true_9720 = torch.constant.bool true
    %result0_9721, %result1_9722 = torch.aten.var_mean.correction %7642, %7643, %int0_9719, %true_9720 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // rsqrt(result0 + eps): result0 is used as the variance term.
    %float9.999990e-07_9723 = torch.constant.float 9.9999999999999995E-7
    %int1_9724 = torch.constant.int 1
    %7644 = torch.aten.add.Scalar %result0_9721, %float9.999990e-07_9723, %int1_9724 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %7645 = torch.aten.rsqrt %7644 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the f16 tensor %7622 (not the f32 copy %7642) and the f32 mean;
    // the result type is f32.
    %int1_9725 = torch.constant.int 1
    %7646 = torch.aten.sub.Tensor %7622, %result1_9722, %int1_9725 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %7647 = torch.aten.mul.Tensor %7646, %7645 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_9726 = torch.constant.int 5
    %7648 = torch.prims.convert_element_type %7647, %int5_9726 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Scale by %7641, then add %7638.
    %7649 = torch.aten.mul.Tensor %7641, %7648 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_9727 = torch.constant.int 1
    %7650 = torch.aten.add.Tensor %7649, %7638, %int1_9727 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // ---- single_blocks.12: linear1 projection (3072 -> 21504) and split ----
    //   y = f16( f32(%7650) @ f32(linear1.weight)^T + f32(linear1.bias) )   -> [1,4608,21504]
    //   %7666 = y[..., 0:9216]        (reshaped into three 24x128 groups further down)
    //   %7667 = y[..., 9216:21504]    (12288 wide; not consumed within this excerpt)
    // NOTE(review): %7667 presumably feeds the block's GELU/MLP branch later - confirm.
    // Drop the leading size-1 axis so the projection can be expressed as a 2-D mm.
    %int4608_9728 = torch.constant.int 4608
    %int3072_9729 = torch.constant.int 3072
    %7651 = torch.prim.ListConstruct %int4608_9728, %int3072_9729 : (!torch.int, !torch.int) -> !torch.list<int>
    %7652 = torch.aten.view %7650, %7651 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // Weight is stored [out=21504, in=3072]; transpose to [3072, 21504] for the mm.
    %__auto.sampler.single_blocks.12.linear1.weight = util.global.load @__auto.sampler.single_blocks.12.linear1.weight : tensor<21504x3072xf16>
    %7653 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.12.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_9730 = torch.constant.int 0
    %int1_9731 = torch.constant.int 1
    %7654 = torch.aten.transpose.int %7653, %int0_9730, %int1_9731 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.12.linear1.bias = util.global.load @__auto.sampler.single_blocks.12.linear1.bias : tensor<21504xf16>
    %7655 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.12.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Bias, activation and weight are cast to f32; the matmul and bias add run in f32.
    %int6_9732 = torch.constant.int 6
    %7656 = torch.prims.convert_element_type %7655, %int6_9732 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_9733 = torch.constant.int 6
    %7657 = torch.prims.convert_element_type %7652, %int6_9733 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_9734 = torch.constant.int 6
    %7658 = torch.prims.convert_element_type %7654, %int6_9734 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %7659 = torch.aten.mm %7657, %7658 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Both operands are multiplied by the integer constant 1 before the add.
    %int1_9735 = torch.constant.int 1
    %7660 = torch.aten.mul.Scalar %7659, %int1_9735 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_9736 = torch.constant.int 1
    %7661 = torch.aten.mul.Scalar %7656, %int1_9736 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_9737 = torch.constant.int 1
    %7662 = torch.aten.add.Tensor %7660, %7661, %int1_9737 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    // Cast back to f16 and restore the leading size-1 axis.
    %int5_9738 = torch.constant.int 5
    %7663 = torch.prims.convert_element_type %7662, %int5_9738 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_9739 = torch.constant.int 1
    %int4608_9740 = torch.constant.int 4608
    %int21504_9741 = torch.constant.int 21504
    %7664 = torch.prim.ListConstruct %int1_9739, %int4608_9740, %int21504_9741 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7665 = torch.aten.view %7663, %7664 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Last-axis range [0, 9216).
    %int-1_9742 = torch.constant.int -1
    %int0_9743 = torch.constant.int 0
    %int9216_9744 = torch.constant.int 9216
    %int1_9745 = torch.constant.int 1
    %7666 = torch.aten.slice.Tensor %7665, %int-1_9742, %int0_9743, %int9216_9744, %int1_9745 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    // Last-axis range [9216, 21504).
    %int-1_9746 = torch.constant.int -1
    %int9216_9747 = torch.constant.int 9216
    %int21504_9748 = torch.constant.int 21504
    %int1_9749 = torch.constant.int 1
    %7667 = torch.aten.slice.Tensor %7665, %int-1_9746, %int9216_9747, %int21504_9748, %int1_9749 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // ---- single_blocks.12: split the 9216-wide projection into three [1,24,4608,128] tensors ----
    //   %7666 [1,4608,9216] -> view [1,4608,3,24,128] -> permute (2,0,3,1,4) -> [3,1,24,4608,128]
    //   %7672 / %7673 / %7674 = index 0 / 1 / 2 along the leading axis.
    // %7672 is consumed by the query_norm section and %7673 by the key_norm section below.
    // NOTE(review): %7674 is not consumed within this excerpt; presumably the attention value - confirm.
    %int1_9750 = torch.constant.int 1
    %int4608_9751 = torch.constant.int 4608
    %int3_9752 = torch.constant.int 3
    %int24_9753 = torch.constant.int 24
    %int128_9754 = torch.constant.int 128
    %7668 = torch.prim.ListConstruct %int1_9750, %int4608_9751, %int3_9752, %int24_9753, %int128_9754 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7669 = torch.aten.view %7666, %7668 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permutation (2,0,3,1,4): the size-3 axis moves to the front; the 24 and 4608 axes swap order.
    %int2_9755 = torch.constant.int 2
    %int0_9756 = torch.constant.int 0
    %int3_9757 = torch.constant.int 3
    %int1_9758 = torch.constant.int 1
    %int4_9759 = torch.constant.int 4
    %7670 = torch.prim.ListConstruct %int2_9755, %int0_9756, %int3_9757, %int1_9758, %int4_9759 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7671 = torch.aten.permute %7669, %7670 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0 along dim 0.
    %int0_9760 = torch.constant.int 0
    %int0_9761 = torch.constant.int 0
    %7672 = torch.aten.select.int %7671, %int0_9760, %int0_9761 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 1 along dim 0.
    %int0_9762 = torch.constant.int 0
    %int1_9763 = torch.constant.int 1
    %7673 = torch.aten.select.int %7671, %int0_9762, %int1_9763 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 2 along dim 0.
    %int0_9764 = torch.constant.int 0
    %int2_9765 = torch.constant.int 2
    %7674 = torch.aten.select.int %7671, %int0_9764, %int2_9765 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- single_blocks.12: RMS norm of %7672 followed by query_norm.scale ----
    // Computed in f32, then cast back to f16:
    //   x     = f32(%7672)
    //   y     = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + ~1e-6)
    //   %7684 = f16(y) * query_norm.scale        (scale is [128], broadcast over the last axis)
    %int6_9766 = torch.constant.int 6
    %7675 = torch.prims.convert_element_type %7672, %int6_9766 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_9767 = torch.constant.int 2
    %7676 = torch.aten.pow.Tensor_Scalar %7675, %int2_9767 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean of squares over the last axis (keepdim=true).
    %int-1_9768 = torch.constant.int -1
    %7677 = torch.prim.ListConstruct %int-1_9768 : (!torch.int) -> !torch.list<int>
    %true_9769 = torch.constant.bool true
    %none_9770 = torch.constant.none
    %7678 = torch.aten.mean.dim %7676, %7677, %true_9769, %none_9770 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // Epsilon added before rsqrt.
    %float9.999990e-07_9771 = torch.constant.float 9.9999999999999995E-7
    %int1_9772 = torch.constant.int 1
    %7679 = torch.aten.add.Scalar %7678, %float9.999990e-07_9771, %int1_9772 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %7680 = torch.aten.rsqrt %7679 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %7681 = torch.aten.mul.Tensor %7675, %7680 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    // Cast back to f16 before applying the f16 scale.
    %int5_9773 = torch.constant.int 5
    %7682 = torch.prims.convert_element_type %7681, %int5_9773 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.12.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.12.norm.query_norm.scale : tensor<128xf16>
    %7683 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.12.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %7684 = torch.aten.mul.Tensor %7682, %7683 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- single_blocks.12: RMS norm of %7673 followed by key_norm.scale ----
    // Same op sequence as the query norm, applied to %7673 with the key_norm scale:
    //   x     = f32(%7673)
    //   y     = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + ~1e-6)
    //   %7694 = f16(y) * key_norm.scale
    %int6_9774 = torch.constant.int 6
    %7685 = torch.prims.convert_element_type %7673, %int6_9774 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_9775 = torch.constant.int 2
    %7686 = torch.aten.pow.Tensor_Scalar %7685, %int2_9775 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean of squares over the last axis (keepdim=true).
    %int-1_9776 = torch.constant.int -1
    %7687 = torch.prim.ListConstruct %int-1_9776 : (!torch.int) -> !torch.list<int>
    %true_9777 = torch.constant.bool true
    %none_9778 = torch.constant.none
    %7688 = torch.aten.mean.dim %7686, %7687, %true_9777, %none_9778 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // Epsilon added before rsqrt.
    %float9.999990e-07_9779 = torch.constant.float 9.9999999999999995E-7
    %int1_9780 = torch.constant.int 1
    %7689 = torch.aten.add.Scalar %7688, %float9.999990e-07_9779, %int1_9780 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %7690 = torch.aten.rsqrt %7689 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %7691 = torch.aten.mul.Tensor %7685, %7690 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    // Cast back to f16 before applying the f16 scale.
    %int5_9781 = torch.constant.int 5
    %7692 = torch.prims.convert_element_type %7691, %int5_9781 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.12.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.12.norm.key_norm.scale : tensor<128xf16>
    %7693 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.12.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %7694 = torch.aten.mul.Tensor %7692, %7693 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- single_blocks.12: pair views of q (%7684) and k (%7694), and the q-side 2x2 transform ----
    // q and k are cast to f32 and viewed as 64 pairs ([1,24,4608,64,1,2]). For q, with
    // %211 of shape [1,1,4608,64,2,2]:
    //   %7709[..., :] = %211[..., :, 0] * q[..., 0] + %211[..., :, 1] * q[..., 1]
    // The k-side view (%7702) is consumed by the ops that follow this section.
    // NOTE(review): %211 is defined outside this excerpt; presumably the rotary
    // position-embedding table (RoPE) - confirm.
    // The next two casts are f16 -> f16 (input and result element types are identical).
    %int5_9782 = torch.constant.int 5
    %7695 = torch.prims.convert_element_type %7684, %int5_9782 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_9783 = torch.constant.int 5
    %7696 = torch.prims.convert_element_type %7694, %int5_9783 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // q: cast to f32 and view the last axis (128) as [64, 1, 2].
    %int6_9784 = torch.constant.int 6
    %7697 = torch.prims.convert_element_type %7695, %int6_9784 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_9785 = torch.constant.int 1
    %int24_9786 = torch.constant.int 24
    %int4608_9787 = torch.constant.int 4608
    %int64_9788 = torch.constant.int 64
    %int1_9789 = torch.constant.int 1
    %int2_9790 = torch.constant.int 2
    %7698 = torch.prim.ListConstruct %int1_9785, %int24_9786, %int4608_9787, %int64_9788, %int1_9789, %int2_9790 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7699 = torch.aten.view %7697, %7698 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // k: same cast and view.
    %int6_9791 = torch.constant.int 6
    %7700 = torch.prims.convert_element_type %7696, %int6_9791 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_9792 = torch.constant.int 1
    %int24_9793 = torch.constant.int 24
    %int4608_9794 = torch.constant.int 4608
    %int64_9795 = torch.constant.int 64
    %int1_9796 = torch.constant.int 1
    %int2_9797 = torch.constant.int 2
    %7701 = torch.prim.ListConstruct %int1_9792, %int24_9793, %int4608_9794, %int64_9795, %int1_9796, %int2_9797 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7702 = torch.aten.view %7700, %7701 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // q, first term: %211[..., 0] * q_pairs[..., 0]  (the size-1 axis of q broadcasts against 2).
    %int5_9798 = torch.constant.int 5
    %int0_9799 = torch.constant.int 0
    %7703 = torch.aten.select.int %211, %int5_9798, %int0_9799 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9800 = torch.constant.int 5
    %int0_9801 = torch.constant.int 0
    %7704 = torch.aten.select.int %7699, %int5_9800, %int0_9801 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7705 = torch.aten.mul.Tensor %7703, %7704 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // q, second term: %211[..., 1] * q_pairs[..., 1].
    %int5_9802 = torch.constant.int 5
    %int1_9803 = torch.constant.int 1
    %7706 = torch.aten.select.int %211, %int5_9802, %int1_9803 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9804 = torch.constant.int 5
    %int1_9805 = torch.constant.int 1
    %7707 = torch.aten.select.int %7699, %int5_9804, %int1_9805 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7708 = torch.aten.mul.Tensor %7706, %7707 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // q: sum of the two terms.
    %int1_9806 = torch.constant.int 1
    %7709 = torch.aten.add.Tensor %7705, %7708, %int1_9806 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_9807 = torch.constant.int 5
    %int0_9808 = torch.constant.int 0
    %7710 = torch.aten.select.int %211, %int5_9807, %int0_9808 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9809 = torch.constant.int 5
    %int0_9810 = torch.constant.int 0
    %7711 = torch.aten.select.int %7702, %int5_9809, %int0_9810 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7712 = torch.aten.mul.Tensor %7710, %7711 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_9811 = torch.constant.int 5
    %int1_9812 = torch.constant.int 1
    %7713 = torch.aten.select.int %211, %int5_9811, %int1_9812 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9813 = torch.constant.int 5
    %int1_9814 = torch.constant.int 1
    %7714 = torch.aten.select.int %7702, %int5_9813, %int1_9814 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7715 = torch.aten.mul.Tensor %7713, %7714 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_9815 = torch.constant.int 1
    %7716 = torch.aten.add.Tensor %7712, %7715, %int1_9815 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_9816 = torch.constant.int 1
    %int24_9817 = torch.constant.int 24
    %int4608_9818 = torch.constant.int 4608
    %int128_9819 = torch.constant.int 128
    %7717 = torch.prim.ListConstruct %int1_9816, %int24_9817, %int4608_9818, %int128_9819 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7718 = torch.aten.view %7709, %7717 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_9820 = torch.constant.int 5
    %7719 = torch.prims.convert_element_type %7718, %int5_9820 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_9821 = torch.constant.int 1
    %int24_9822 = torch.constant.int 24
    %int4608_9823 = torch.constant.int 4608
    %int128_9824 = torch.constant.int 128
    %7720 = torch.prim.ListConstruct %int1_9821, %int24_9822, %int4608_9823, %int128_9824 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7721 = torch.aten.view %7716, %7720 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_9825 = torch.constant.int 5
    %7722 = torch.prims.convert_element_type %7721, %int5_9825 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled dot-product attention over (%7719, %7722, %7674), all [1,24,4608,128] f16.
    // %7674 is defined earlier in the function (third operand; presumably the value tensor).
    // The trailing operands are 0.0, false, none, none -- presumably dropout_p, is_causal,
    // attn_mask and scale of the PyTorch op; TODO confirm against the op schema.
    // Only result #0 is consumed in this section.
    %float0.000000e00_9826 = torch.constant.float 0.000000e+00
    %false_9827 = torch.constant.bool false
    %none_9828 = torch.constant.none
    %none_9829 = torch.constant.none
    %7723:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%7719, %7722, %7674, %float0.000000e00_9826, %false_9827, %none_9828, %none_9829) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap axes 1 and 2: [1,24,4608,128] -> [1,4608,24,128].
    %int0_9830 = torch.constant.int 0
    %int2_9831 = torch.constant.int 2
    %int1_9832 = torch.constant.int 1
    %int3_9833 = torch.constant.int 3
    %7724 = torch.prim.ListConstruct %int0_9830, %int2_9831, %int1_9832, %int3_9833 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7725 = torch.aten.permute %7723#0, %7724 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the trailing 24 x 128 axes into 3072: [1,4608,3072].
    %int1_9834 = torch.constant.int 1
    %int4608_9835 = torch.constant.int 4608
    %int3072_9836 = torch.constant.int 3072
    %7726 = torch.prim.ListConstruct %int1_9834, %int4608_9835, %int3072_9836 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7727 = torch.aten.view %7725, %7726 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // GELU with the "tanh" approximation on %7667 ([1,4608,12288], defined earlier in the function).
    %str_9837 = torch.constant.str "tanh"
    %7728 = torch.aten.gelu %7667, %str_9837 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    // Concatenate the attention output (3072) and the GELU output (12288) along axis 2 -> 15360.
    %7729 = torch.prim.ListConstruct %7727, %7728 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_9838 = torch.constant.int 2
    %7730 = torch.aten.cat %7729, %int2_9838 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Drop the leading size-1 axis so the result can feed the 2-D matmul below.
    %int4608_9839 = torch.constant.int 4608
    %int15360_9840 = torch.constant.int 15360
    %7731 = torch.prim.ListConstruct %int4608_9839, %int15360_9840 : (!torch.int, !torch.int) -> !torch.list<int>
    %7732 = torch.aten.view %7730, %7731 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // single_blocks.12.linear2: %7732 x weight^T + bias, computed in f32 and cast back to f16.
    // Load the [3072,15360] weight and transpose it to [15360,3072].
    %__auto.sampler.single_blocks.12.linear2.weight = util.global.load @__auto.sampler.single_blocks.12.linear2.weight : tensor<3072x15360xf16>
    %7733 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.12.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_9841 = torch.constant.int 0
    %int1_9842 = torch.constant.int 1
    %7734 = torch.aten.transpose.int %7733, %int0_9841, %int1_9842 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.12.linear2.bias = util.global.load @__auto.sampler.single_blocks.12.linear2.bias : tensor<3072xf16>
    %7735 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.12.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Cast bias, input and transposed weight from f16 to f32 (dtype code 6) before the matmul.
    %int6_9843 = torch.constant.int 6
    %7736 = torch.prims.convert_element_type %7735, %int6_9843 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_9844 = torch.constant.int 6
    %7737 = torch.prims.convert_element_type %7732, %int6_9844 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_9845 = torch.constant.int 6
    %7738 = torch.prims.convert_element_type %7734, %int6_9845 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %7739 = torch.aten.mm %7737, %7738 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Both multiplications below use the scalar 1 and leave the values unchanged
    // (NOTE(review): they look like the alpha/beta factors of a decomposed addmm -- confirm).
    %int1_9846 = torch.constant.int 1
    %7740 = torch.aten.mul.Scalar %7739, %int1_9846 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_9847 = torch.constant.int 1
    %7741 = torch.aten.mul.Scalar %7736, %int1_9847 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Add the bias, broadcast across the 4608 rows.
    %int1_9848 = torch.constant.int 1
    %7742 = torch.aten.add.Tensor %7740, %7741, %int1_9848 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    // Cast back to f16 (dtype code 5) and restore the leading size-1 axis.
    %int5_9849 = torch.constant.int 5
    %7743 = torch.prims.convert_element_type %7742, %int5_9849 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_9850 = torch.constant.int 1
    %int4608_9851 = torch.constant.int 4608
    %int3072_9852 = torch.constant.int 3072
    %7744 = torch.prim.ListConstruct %int1_9850, %int4608_9851, %int3072_9852 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7745 = torch.aten.view %7743, %7744 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // %7747 = %7622 + %7640 * %7745. %7640 ([1,1,3072]) broadcasts over the 4608 axis.
    // %7640 and %7622 are defined earlier in the function; presumably the block's gate
    // and its input (residual) tensor -- confirm at their definitions.
    %7746 = torch.aten.mul.Tensor %7640, %7745 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_9853 = torch.constant.int 1
    %7747 = torch.aten.add.Tensor %7622, %7746, %int1_9853 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.13.modulation.lin: silu(%119) x weight^T + bias -> [1,9216] f16.
    // %119 ([1,3072]) is defined earlier in the function (presumably the conditioning vector -- confirm).
    %7748 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the [9216,3072] weight and transpose it to [3072,9216].
    %__auto.sampler.single_blocks.13.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.13.modulation.lin.weight : tensor<9216x3072xf16>
    %7749 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.13.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_9854 = torch.constant.int 0
    %int1_9855 = torch.constant.int 1
    %7750 = torch.aten.transpose.int %7749, %int0_9854, %int1_9855 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.13.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.13.modulation.lin.bias : tensor<9216xf16>
    %7751 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.13.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Cast bias, input and transposed weight to f32 (dtype code 6) and multiply.
    %int6_9856 = torch.constant.int 6
    %7752 = torch.prims.convert_element_type %7751, %int6_9856 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_9857 = torch.constant.int 6
    %7753 = torch.prims.convert_element_type %7748, %int6_9857 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_9858 = torch.constant.int 6
    %7754 = torch.prims.convert_element_type %7750, %int6_9858 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %7755 = torch.aten.mm %7753, %7754 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Multiplications by the scalar 1 leave the values unchanged.
    %int1_9859 = torch.constant.int 1
    %7756 = torch.aten.mul.Scalar %7755, %int1_9859 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_9860 = torch.constant.int 1
    %7757 = torch.aten.mul.Scalar %7752, %int1_9860 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    // Add the bias and cast the result back to f16 (dtype code 5).
    %int1_9861 = torch.constant.int 1
    %7758 = torch.aten.add.Tensor %7756, %7757, %int1_9861 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_9862 = torch.constant.int 5
    %7759 = torch.prims.convert_element_type %7758, %int5_9862 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Reshape the [1,9216] modulation output to [1,1,9216] and cut it into three 3072-wide chunks.
    // The slice on axis 0 spans 0..INT64_MAX with step 1, so the shape is unchanged.
    %int0_9863 = torch.constant.int 0
    %int0_9864 = torch.constant.int 0
    %int9223372036854775807_9865 = torch.constant.int 9223372036854775807
    %int1_9866 = torch.constant.int 1
    %7760 = torch.aten.slice.Tensor %7759, %int0_9863, %int0_9864, %int9223372036854775807_9865, %int1_9866 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a size-1 axis at position 1.
    %int1_9867 = torch.constant.int 1
    %7761 = torch.aten.unsqueeze %7760, %int1_9867 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Another full-range slice, this time on axis 2; the shape is again unchanged.
    %int2_9868 = torch.constant.int 2
    %int0_9869 = torch.constant.int 0
    %int9223372036854775807_9870 = torch.constant.int 9223372036854775807
    %int1_9871 = torch.constant.int 1
    %7762 = torch.aten.slice.Tensor %7761, %int2_9868, %int0_9869, %int9223372036854775807_9870, %int1_9871 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Chunk [0:3072] -> %7763 (added after normalisation below, i.e. used as a shift).
    %int-1_9872 = torch.constant.int -1
    %int0_9873 = torch.constant.int 0
    %int3072_9874 = torch.constant.int 3072
    %int1_9875 = torch.constant.int 1
    %7763 = torch.aten.slice.Tensor %7762, %int-1_9872, %int0_9873, %int3072_9874, %int1_9875 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk [3072:6144] -> %7764 (turned into the multiplicative factor %7766 below).
    %int-1_9876 = torch.constant.int -1
    %int3072_9877 = torch.constant.int 3072
    %int6144_9878 = torch.constant.int 6144
    %int1_9879 = torch.constant.int 1
    %7764 = torch.aten.slice.Tensor %7762, %int-1_9876, %int3072_9877, %int6144_9878, %int1_9879 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk [6144:9216] -> %7765. It is not consumed within this excerpt
    // (presumably the gate applied after this block's linear2 -- confirm further down).
    %int-1_9880 = torch.constant.int -1
    %int6144_9881 = torch.constant.int 6144
    %int9216_9882 = torch.constant.int 9216
    %int1_9883 = torch.constant.int 1
    %7765 = torch.aten.slice.Tensor %7762, %int-1_9880, %int6144_9881, %int9216_9882, %int1_9883 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %7766 = %7764 + 1.
    %int1_9884 = torch.constant.int 1
    %int1_9885 = torch.constant.int 1
    %7766 = torch.aten.add.Scalar %7764, %int1_9884, %int1_9885 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalise %7747 over its last axis (no learned scale or bias is applied here), then
    // modulate: %7775 = %7766 * normalised + %7763.
    // Variance and mean are taken in f32 over axis 2 with correction 0 and keepdim = true.
    %int6_9886 = torch.constant.int 6
    %7767 = torch.prims.convert_element_type %7747, %int6_9886 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_9887 = torch.constant.int 2
    %7768 = torch.prim.ListConstruct %int2_9887 : (!torch.int) -> !torch.list<int>
    %int0_9888 = torch.constant.int 0
    %true_9889 = torch.constant.bool true
    %result0_9890, %result1_9891 = torch.aten.var_mean.correction %7767, %7768, %int0_9888, %true_9889 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // rsqrt(variance + eps), with eps approximately 1e-6.
    %float9.999990e-07_9892 = torch.constant.float 9.9999999999999995E-7
    %int1_9893 = torch.constant.int 1
    %7769 = torch.aten.add.Scalar %result0_9890, %float9.999990e-07_9892, %int1_9893 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %7770 = torch.aten.rsqrt %7769 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // (x - mean) * rsqrt(...). The subtraction takes the f16 %7747 and the f32 mean and yields f32.
    %int1_9894 = torch.constant.int 1
    %7771 = torch.aten.sub.Tensor %7747, %result1_9891, %int1_9894 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %7772 = torch.aten.mul.Tensor %7771, %7770 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    // Cast to f16 (dtype code 5), multiply by %7766 and add %7763; both broadcast over the 4608 axis.
    %int5_9895 = torch.constant.int 5
    %7773 = torch.prims.convert_element_type %7772, %int5_9895 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %7774 = torch.aten.mul.Tensor %7766, %7773 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_9896 = torch.constant.int 1
    %7775 = torch.aten.add.Tensor %7774, %7763, %int1_9896 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.13.linear1: %7775 x weight^T + bias, computed in f32, result [1,4608,21504] f16.
    // Drop the leading size-1 axis so the input is 2-D for the matmul.
    %int4608_9897 = torch.constant.int 4608
    %int3072_9898 = torch.constant.int 3072
    %7776 = torch.prim.ListConstruct %int4608_9897, %int3072_9898 : (!torch.int, !torch.int) -> !torch.list<int>
    %7777 = torch.aten.view %7775, %7776 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // Load the [21504,3072] weight and transpose it to [3072,21504].
    %__auto.sampler.single_blocks.13.linear1.weight = util.global.load @__auto.sampler.single_blocks.13.linear1.weight : tensor<21504x3072xf16>
    %7778 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.13.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_9899 = torch.constant.int 0
    %int1_9900 = torch.constant.int 1
    %7779 = torch.aten.transpose.int %7778, %int0_9899, %int1_9900 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.13.linear1.bias = util.global.load @__auto.sampler.single_blocks.13.linear1.bias : tensor<21504xf16>
    %7780 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.13.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Cast bias, input and transposed weight to f32 (dtype code 6) and multiply.
    %int6_9901 = torch.constant.int 6
    %7781 = torch.prims.convert_element_type %7780, %int6_9901 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_9902 = torch.constant.int 6
    %7782 = torch.prims.convert_element_type %7777, %int6_9902 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_9903 = torch.constant.int 6
    %7783 = torch.prims.convert_element_type %7779, %int6_9903 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %7784 = torch.aten.mm %7782, %7783 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Multiplications by the scalar 1 leave the values unchanged.
    %int1_9904 = torch.constant.int 1
    %7785 = torch.aten.mul.Scalar %7784, %int1_9904 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_9905 = torch.constant.int 1
    %7786 = torch.aten.mul.Scalar %7781, %int1_9905 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    // Add the bias, cast back to f16 (dtype code 5) and restore the leading size-1 axis.
    %int1_9906 = torch.constant.int 1
    %7787 = torch.aten.add.Tensor %7785, %7786, %int1_9906 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_9907 = torch.constant.int 5
    %7788 = torch.prims.convert_element_type %7787, %int5_9907 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_9908 = torch.constant.int 1
    %int4608_9909 = torch.constant.int 4608
    %int21504_9910 = torch.constant.int 21504
    %7789 = torch.prim.ListConstruct %int1_9908, %int4608_9909, %int21504_9910 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7790 = torch.aten.view %7788, %7789 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the linear1 output along its last axis: [0:9216] -> %7791 (attention inputs),
    // [9216:21504] -> %7792 (12288 wide; passed to GELU further down).
    %int-1_9911 = torch.constant.int -1
    %int0_9912 = torch.constant.int 0
    %int9216_9913 = torch.constant.int 9216
    %int1_9914 = torch.constant.int 1
    %7791 = torch.aten.slice.Tensor %7790, %int-1_9911, %int0_9912, %int9216_9913, %int1_9914 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_9915 = torch.constant.int -1
    %int9216_9916 = torch.constant.int 9216
    %int21504_9917 = torch.constant.int 21504
    %int1_9918 = torch.constant.int 1
    %7792 = torch.aten.slice.Tensor %7790, %int-1_9915, %int9216_9916, %int21504_9917, %int1_9918 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View the 9216-wide part as [1,4608,3,24,128].
    %int1_9919 = torch.constant.int 1
    %int4608_9920 = torch.constant.int 4608
    %int3_9921 = torch.constant.int 3
    %int24_9922 = torch.constant.int 24
    %int128_9923 = torch.constant.int 128
    %7793 = torch.prim.ListConstruct %int1_9919, %int4608_9920, %int3_9921, %int24_9922, %int128_9923 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7794 = torch.aten.view %7791, %7793 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2,0,3,1,4) -> [3,1,24,4608,128].
    %int2_9924 = torch.constant.int 2
    %int0_9925 = torch.constant.int 0
    %int3_9926 = torch.constant.int 3
    %int1_9927 = torch.constant.int 1
    %int4_9928 = torch.constant.int 4
    %7795 = torch.prim.ListConstruct %int2_9924, %int0_9925, %int3_9926, %int1_9927, %int4_9928 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7796 = torch.aten.permute %7794, %7795 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0 on axis 0 -> %7797; it is scaled by query_norm.scale below.
    %int0_9929 = torch.constant.int 0
    %int0_9930 = torch.constant.int 0
    %7797 = torch.aten.select.int %7796, %int0_9929, %int0_9930 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 1 -> %7798; it is scaled by key_norm.scale below.
    %int0_9931 = torch.constant.int 0
    %int1_9932 = torch.constant.int 1
    %7798 = torch.aten.select.int %7796, %int0_9931, %int1_9932 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 2 -> %7799; it is passed unmodified as the third operand of the attention op below.
    %int0_9933 = torch.constant.int 0
    %int2_9934 = torch.constant.int 2
    %7799 = torch.aten.select.int %7796, %int0_9933, %int2_9934 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Root-mean-square normalisation of %7797 over its last axis, followed by the learned
    // query_norm.scale: x * rsqrt(mean(x^2, axis=-1, keepdim) + eps) * scale.
    // The statistics are computed in f32 (dtype code 6).
    %int6_9935 = torch.constant.int 6
    %7800 = torch.prims.convert_element_type %7797, %int6_9935 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_9936 = torch.constant.int 2
    %7801 = torch.aten.pow.Tensor_Scalar %7800, %int2_9936 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_9937 = torch.constant.int -1
    %7802 = torch.prim.ListConstruct %int-1_9937 : (!torch.int) -> !torch.list<int>
    %true_9938 = torch.constant.bool true
    %none_9939 = torch.constant.none
    %7803 = torch.aten.mean.dim %7801, %7802, %true_9938, %none_9939 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // eps is approximately 1e-6.
    %float9.999990e-07_9940 = torch.constant.float 9.9999999999999995E-7
    %int1_9941 = torch.constant.int 1
    %7804 = torch.aten.add.Scalar %7803, %float9.999990e-07_9940, %int1_9941 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %7805 = torch.aten.rsqrt %7804 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %7806 = torch.aten.mul.Tensor %7800, %7805 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    // Cast back to f16 (dtype code 5) before applying the [128] scale vector.
    %int5_9942 = torch.constant.int 5
    %7807 = torch.prims.convert_element_type %7806, %int5_9942 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.13.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.13.norm.query_norm.scale : tensor<128xf16>
    %7808 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.13.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %7809 = torch.aten.mul.Tensor %7807, %7808 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Root-mean-square normalisation of %7798 over its last axis, followed by the learned
    // key_norm.scale: x * rsqrt(mean(x^2, axis=-1, keepdim) + eps) * scale.
    // The statistics are computed in f32 (dtype code 6).
    %int6_9943 = torch.constant.int 6
    %7810 = torch.prims.convert_element_type %7798, %int6_9943 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_9944 = torch.constant.int 2
    %7811 = torch.aten.pow.Tensor_Scalar %7810, %int2_9944 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_9945 = torch.constant.int -1
    %7812 = torch.prim.ListConstruct %int-1_9945 : (!torch.int) -> !torch.list<int>
    %true_9946 = torch.constant.bool true
    %none_9947 = torch.constant.none
    %7813 = torch.aten.mean.dim %7811, %7812, %true_9946, %none_9947 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // eps is approximately 1e-6.
    %float9.999990e-07_9948 = torch.constant.float 9.9999999999999995E-7
    %int1_9949 = torch.constant.int 1
    %7814 = torch.aten.add.Scalar %7813, %float9.999990e-07_9948, %int1_9949 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %7815 = torch.aten.rsqrt %7814 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %7816 = torch.aten.mul.Tensor %7810, %7815 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    // Cast back to f16 (dtype code 5) before applying the [128] scale vector.
    %int5_9950 = torch.constant.int 5
    %7817 = torch.prims.convert_element_type %7816, %int5_9950 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.13.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.13.norm.key_norm.scale : tensor<128xf16>
    %7818 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.13.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %7819 = torch.aten.mul.Tensor %7817, %7818 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Rotate the normalised query (%7809) and key (%7819) with the table %211
    // ([1,1,4608,64,2,2], f32; defined earlier in the function, presumably the rotary
    // position-embedding table -- confirm at its definition).
    //
    // These two casts go f16 -> f16 (dtype code 5), so the element type does not change.
    %int5_9951 = torch.constant.int 5
    %7820 = torch.prims.convert_element_type %7809, %int5_9951 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_9952 = torch.constant.int 5
    %7821 = torch.prims.convert_element_type %7819, %int5_9952 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Query: cast to f32 (dtype code 6) and view as [1,24,4608,64,1,2] (128 split into 64 pairs).
    %int6_9953 = torch.constant.int 6
    %7822 = torch.prims.convert_element_type %7820, %int6_9953 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_9954 = torch.constant.int 1
    %int24_9955 = torch.constant.int 24
    %int4608_9956 = torch.constant.int 4608
    %int64_9957 = torch.constant.int 64
    %int1_9958 = torch.constant.int 1
    %int2_9959 = torch.constant.int 2
    %7823 = torch.prim.ListConstruct %int1_9954, %int24_9955, %int4608_9956, %int64_9957, %int1_9958, %int2_9959 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7824 = torch.aten.view %7822, %7823 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Key: the same cast and view.
    %int6_9960 = torch.constant.int 6
    %7825 = torch.prims.convert_element_type %7821, %int6_9960 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_9961 = torch.constant.int 1
    %int24_9962 = torch.constant.int 24
    %int4608_9963 = torch.constant.int 4608
    %int64_9964 = torch.constant.int 64
    %int1_9965 = torch.constant.int 1
    %int2_9966 = torch.constant.int 2
    %7826 = torch.prim.ListConstruct %int1_9961, %int24_9962, %int4608_9963, %int64_9964, %int1_9965, %int2_9966 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7827 = torch.aten.view %7825, %7826 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Query: %7834 = %211[..., 0] * q[..., 0] + %211[..., 1] * q[..., 1], with q = %7824.
    // The size-1 axis of q broadcasts against the size-2 axis of the table slice.
    %int5_9967 = torch.constant.int 5
    %int0_9968 = torch.constant.int 0
    %7828 = torch.aten.select.int %211, %int5_9967, %int0_9968 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9969 = torch.constant.int 5
    %int0_9970 = torch.constant.int 0
    %7829 = torch.aten.select.int %7824, %int5_9969, %int0_9970 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7830 = torch.aten.mul.Tensor %7828, %7829 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_9971 = torch.constant.int 5
    %int1_9972 = torch.constant.int 1
    %7831 = torch.aten.select.int %211, %int5_9971, %int1_9972 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9973 = torch.constant.int 5
    %int1_9974 = torch.constant.int 1
    %7832 = torch.aten.select.int %7824, %int5_9973, %int1_9974 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7833 = torch.aten.mul.Tensor %7831, %7832 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_9975 = torch.constant.int 1
    %7834 = torch.aten.add.Tensor %7830, %7833, %int1_9975 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key: the same combination with k = %7827, giving %7841.
    %int5_9976 = torch.constant.int 5
    %int0_9977 = torch.constant.int 0
    %7835 = torch.aten.select.int %211, %int5_9976, %int0_9977 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9978 = torch.constant.int 5
    %int0_9979 = torch.constant.int 0
    %7836 = torch.aten.select.int %7827, %int5_9978, %int0_9979 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7837 = torch.aten.mul.Tensor %7835, %7836 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_9980 = torch.constant.int 5
    %int1_9981 = torch.constant.int 1
    %7838 = torch.aten.select.int %211, %int5_9980, %int1_9981 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_9982 = torch.constant.int 5
    %int1_9983 = torch.constant.int 1
    %7839 = torch.aten.select.int %7827, %int5_9982, %int1_9983 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7840 = torch.aten.mul.Tensor %7838, %7839 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_9984 = torch.constant.int 1
    %7841 = torch.aten.add.Tensor %7837, %7840, %int1_9984 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Merge the (64,2) pair axes back into 128 and cast f32 -> f16 (dtype code 5) for both results.
    %int1_9985 = torch.constant.int 1
    %int24_9986 = torch.constant.int 24
    %int4608_9987 = torch.constant.int 4608
    %int128_9988 = torch.constant.int 128
    %7842 = torch.prim.ListConstruct %int1_9985, %int24_9986, %int4608_9987, %int128_9988 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7843 = torch.aten.view %7834, %7842 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_9989 = torch.constant.int 5
    %7844 = torch.prims.convert_element_type %7843, %int5_9989 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_9990 = torch.constant.int 1
    %int24_9991 = torch.constant.int 24
    %int4608_9992 = torch.constant.int 4608
    %int128_9993 = torch.constant.int 128
    %7845 = torch.prim.ListConstruct %int1_9990, %int24_9991, %int4608_9992, %int128_9993 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7846 = torch.aten.view %7841, %7845 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_9994 = torch.constant.int 5
    %7847 = torch.prims.convert_element_type %7846, %int5_9994 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled dot-product attention over the rotated query %7844, rotated key %7847 and %7799,
    // all [1,24,4608,128] f16.
    // The trailing operands are 0.0, false, none, none -- presumably dropout_p, is_causal,
    // attn_mask and scale of the PyTorch op; TODO confirm against the op schema.
    // Only result #0 is consumed in this section.
    %float0.000000e00_9995 = torch.constant.float 0.000000e+00
    %false_9996 = torch.constant.bool false
    %none_9997 = torch.constant.none
    %none_9998 = torch.constant.none
    %7848:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%7844, %7847, %7799, %float0.000000e00_9995, %false_9996, %none_9997, %none_9998) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap axes 1 and 2: [1,24,4608,128] -> [1,4608,24,128].
    %int0_9999 = torch.constant.int 0
    %int2_10000 = torch.constant.int 2
    %int1_10001 = torch.constant.int 1
    %int3_10002 = torch.constant.int 3
    %7849 = torch.prim.ListConstruct %int0_9999, %int2_10000, %int1_10001, %int3_10002 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7850 = torch.aten.permute %7848#0, %7849 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the trailing 24 x 128 axes into 3072: [1,4608,3072].
    %int1_10003 = torch.constant.int 1
    %int4608_10004 = torch.constant.int 4608
    %int3072_10005 = torch.constant.int 3072
    %7851 = torch.prim.ListConstruct %int1_10003, %int4608_10004, %int3072_10005 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7852 = torch.aten.view %7850, %7851 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Apply GELU with the "tanh" approximation string to %7792 (defined above
    // this chunk), concatenate it after the attention output %7852 along dim 2
    // (3072 + 12288 = 15360), and drop the leading unit dim for the matmul below.
    %str_10006 = torch.constant.str "tanh"
    %7853 = torch.aten.gelu %7792, %str_10006 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %7854 = torch.prim.ListConstruct %7852, %7853 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_10007 = torch.constant.int 2
    %7855 = torch.aten.cat %7854, %int2_10007 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    %int4608_10008 = torch.constant.int 4608
    %int15360_10009 = torch.constant.int 15360
    %7856 = torch.prim.ListConstruct %int4608_10008, %int15360_10009 : (!torch.int, !torch.int) -> !torch.list<int>
    %7857 = torch.aten.view %7855, %7856 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // single_blocks.13.linear2: load the [3072,15360] f16 weight and transpose it
    // to [15360,3072] so it can be the right-hand operand of the matmul.
    %__auto.sampler.single_blocks.13.linear2.weight = util.global.load @__auto.sampler.single_blocks.13.linear2.weight : tensor<3072x15360xf16>
    %7858 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.13.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_10010 = torch.constant.int 0
    %int1_10011 = torch.constant.int 1
    %7859 = torch.aten.transpose.int %7858, %int0_10010, %int1_10011 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.13.linear2.bias = util.global.load @__auto.sampler.single_blocks.13.linear2.bias : tensor<3072xf16>
    %7860 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.13.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Cast bias, activation and weight to f32 (dtype code 6 produces f32 here)
    // so the matmul and bias add are carried out in f32.
    %int6_10012 = torch.constant.int 6
    %7861 = torch.prims.convert_element_type %7860, %int6_10012 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_10013 = torch.constant.int 6
    %7862 = torch.prims.convert_element_type %7857, %int6_10013 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_10014 = torch.constant.int 6
    %7863 = torch.prims.convert_element_type %7859, %int6_10014 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %7864 = torch.aten.mm %7862, %7863 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Both operands are multiplied by the integer constant 1 before the add.
    // NOTE(review): presumably unit alpha/beta factors left by the exporter's
    // addmm-style expansion -- confirm.
    %int1_10015 = torch.constant.int 1
    %7865 = torch.aten.mul.Scalar %7864, %int1_10015 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_10016 = torch.constant.int 1
    %7866 = torch.aten.mul.Scalar %7861, %int1_10016 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_10017 = torch.constant.int 1
    %7867 = torch.aten.add.Tensor %7865, %7866, %int1_10017 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    // Back to f16 and restore the leading unit dim.
    %int5_10018 = torch.constant.int 5
    %7868 = torch.prims.convert_element_type %7867, %int5_10018 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_10019 = torch.constant.int 1
    %int4608_10020 = torch.constant.int 4608
    %int3072_10021 = torch.constant.int 3072
    %7869 = torch.prim.ListConstruct %int1_10019, %int4608_10020, %int3072_10021 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7870 = torch.aten.view %7868, %7869 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Scale by the broadcast [1,1,3072] tensor %7765 and add the product to %7747.
    // NOTE(review): %7765 and %7747 are defined above this chunk; presumably the
    // block's gate chunk and its input (residual) -- confirm.
    %7871 = torch.aten.mul.Tensor %7765, %7870 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_10022 = torch.constant.int 1
    %7872 = torch.aten.add.Tensor %7747, %7871, %int1_10022 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.14.modulation: SiLU of %119 (a [1,3072] f16 tensor defined
    // outside this chunk) followed by a linear layer with a [9216,3072] weight.
    %7873 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.14.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.14.modulation.lin.weight : tensor<9216x3072xf16>
    %7874 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.14.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    // Transpose the weight to [3072,9216] for use as the matmul right-hand side.
    %int0_10023 = torch.constant.int 0
    %int1_10024 = torch.constant.int 1
    %7875 = torch.aten.transpose.int %7874, %int0_10023, %int1_10024 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.14.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.14.modulation.lin.bias : tensor<9216xf16>
    %7876 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.14.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Cast bias, input and weight to f32 (dtype code 6 produces f32 here).
    %int6_10025 = torch.constant.int 6
    %7877 = torch.prims.convert_element_type %7876, %int6_10025 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_10026 = torch.constant.int 6
    %7878 = torch.prims.convert_element_type %7873, %int6_10026 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_10027 = torch.constant.int 6
    %7879 = torch.prims.convert_element_type %7875, %int6_10027 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %7880 = torch.aten.mm %7878, %7879 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Multiplications by the integer constant 1, then the bias add.
    %int1_10028 = torch.constant.int 1
    %7881 = torch.aten.mul.Scalar %7880, %int1_10028 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_10029 = torch.constant.int 1
    %7882 = torch.aten.mul.Scalar %7877, %int1_10029 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_10030 = torch.constant.int 1
    %7883 = torch.aten.add.Tensor %7881, %7882, %int1_10030 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    // Result is cast back to f16 (dtype code 5).
    %int5_10031 = torch.constant.int 5
    %7884 = torch.prims.convert_element_type %7883, %int5_10031 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Reshape the [1,9216] modulation vector to [1,1,9216]: a full-range slice on
    // dim 0 (shape unchanged), an unsqueeze at dim 1, and a full-range slice on
    // dim 2 (shape unchanged). 9223372036854775807 is the open-ended slice end.
    %int0_10032 = torch.constant.int 0
    %int0_10033 = torch.constant.int 0
    %int9223372036854775807_10034 = torch.constant.int 9223372036854775807
    %int1_10035 = torch.constant.int 1
    %7885 = torch.aten.slice.Tensor %7884, %int0_10032, %int0_10033, %int9223372036854775807_10034, %int1_10035 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    %int1_10036 = torch.constant.int 1
    %7886 = torch.aten.unsqueeze %7885, %int1_10036 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int2_10037 = torch.constant.int 2
    %int0_10038 = torch.constant.int 0
    %int9223372036854775807_10039 = torch.constant.int 9223372036854775807
    %int1_10040 = torch.constant.int 1
    %7887 = torch.aten.slice.Tensor %7886, %int2_10037, %int0_10038, %int9223372036854775807_10039, %int1_10040 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Split the last dim into three 3072-wide chunks.
    // %7888 = [0, 3072): added after normalization below (acts as a shift).
    %int-1_10041 = torch.constant.int -1
    %int0_10042 = torch.constant.int 0
    %int3072_10043 = torch.constant.int 3072
    %int1_10044 = torch.constant.int 1
    %7888 = torch.aten.slice.Tensor %7887, %int-1_10041, %int0_10042, %int3072_10043, %int1_10044 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %7889 = [3072, 6144): becomes the multiplicative scale after the +1 below.
    %int-1_10045 = torch.constant.int -1
    %int3072_10046 = torch.constant.int 3072
    %int6144_10047 = torch.constant.int 6144
    %int1_10048 = torch.constant.int 1
    %7889 = torch.aten.slice.Tensor %7887, %int-1_10045, %int3072_10046, %int6144_10047, %int1_10048 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %7890 = [6144, 9216): multiplies the linear2 output of this block (gate).
    %int-1_10049 = torch.constant.int -1
    %int6144_10050 = torch.constant.int 6144
    %int9216_10051 = torch.constant.int 9216
    %int1_10052 = torch.constant.int 1
    %7890 = torch.aten.slice.Tensor %7887, %int-1_10049, %int6144_10050, %int9216_10051, %int1_10052 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %7891 = %7889 + 1 (scalar 1 with alpha 1).
    %int1_10053 = torch.constant.int 1
    %int1_10054 = torch.constant.int 1
    %7891 = torch.aten.add.Scalar %7889, %int1_10053, %int1_10054 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %7872 over its last dim (dim 2) in f32 with no learned
    // weight/bias: (x - mean) * rsqrt(var + eps).
    %int6_10055 = torch.constant.int 6
    %7892 = torch.prims.convert_element_type %7872, %int6_10055 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    // var_mean with correction 0 and keepdim = true; result0 is used as the
    // variance and result1 as the mean below.
    %int2_10056 = torch.constant.int 2
    %7893 = torch.prim.ListConstruct %int2_10056 : (!torch.int) -> !torch.list<int>
    %int0_10057 = torch.constant.int 0
    %true_10058 = torch.constant.bool true
    %result0_10059, %result1_10060 = torch.aten.var_mean.correction %7892, %7893, %int0_10057, %true_10058 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // eps ~= 1e-6 is added to the variance before rsqrt.
    %float9.999990e-07_10061 = torch.constant.float 9.9999999999999995E-7
    %int1_10062 = torch.constant.int 1
    %7894 = torch.aten.add.Scalar %result0_10059, %float9.999990e-07_10061, %int1_10062 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %7895 = torch.aten.rsqrt %7894 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the f16 tensor %7872 and the f32 mean; the declared
    // result type is f32.
    %int1_10063 = torch.constant.int 1
    %7896 = torch.aten.sub.Tensor %7872, %result1_10060, %int1_10063 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %7897 = torch.aten.mul.Tensor %7896, %7895 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_10064 = torch.constant.int 5
    %7898 = torch.prims.convert_element_type %7897, %int5_10064 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Modulate in f16: %7891 * normalized + %7888, both broadcast from [1,1,3072].
    %7899 = torch.aten.mul.Tensor %7891, %7898 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_10065 = torch.constant.int 1
    %7900 = torch.aten.add.Tensor %7899, %7888, %int1_10065 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.14.linear1: flatten the modulated activations to [4608,3072]
    // and project them to 21504 features.
    %int4608_10066 = torch.constant.int 4608
    %int3072_10067 = torch.constant.int 3072
    %7901 = torch.prim.ListConstruct %int4608_10066, %int3072_10067 : (!torch.int, !torch.int) -> !torch.list<int>
    %7902 = torch.aten.view %7900, %7901 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // Load the [21504,3072] weight and transpose it to [3072,21504].
    %__auto.sampler.single_blocks.14.linear1.weight = util.global.load @__auto.sampler.single_blocks.14.linear1.weight : tensor<21504x3072xf16>
    %7903 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.14.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_10068 = torch.constant.int 0
    %int1_10069 = torch.constant.int 1
    %7904 = torch.aten.transpose.int %7903, %int0_10068, %int1_10069 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.14.linear1.bias = util.global.load @__auto.sampler.single_blocks.14.linear1.bias : tensor<21504xf16>
    %7905 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.14.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Cast bias, activation and weight to f32 (dtype code 6) for the matmul.
    %int6_10070 = torch.constant.int 6
    %7906 = torch.prims.convert_element_type %7905, %int6_10070 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_10071 = torch.constant.int 6
    %7907 = torch.prims.convert_element_type %7902, %int6_10071 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_10072 = torch.constant.int 6
    %7908 = torch.prims.convert_element_type %7904, %int6_10072 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %7909 = torch.aten.mm %7907, %7908 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Multiplications by the integer constant 1, then the bias add.
    %int1_10073 = torch.constant.int 1
    %7910 = torch.aten.mul.Scalar %7909, %int1_10073 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_10074 = torch.constant.int 1
    %7911 = torch.aten.mul.Scalar %7906, %int1_10074 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_10075 = torch.constant.int 1
    %7912 = torch.aten.add.Tensor %7910, %7911, %int1_10075 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    // Cast back to f16 and restore the leading unit dim.
    %int5_10076 = torch.constant.int 5
    %7913 = torch.prims.convert_element_type %7912, %int5_10076 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_10077 = torch.constant.int 1
    %int4608_10078 = torch.constant.int 4608
    %int21504_10079 = torch.constant.int 21504
    %7914 = torch.prim.ListConstruct %int1_10077, %int4608_10078, %int21504_10079 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7915 = torch.aten.view %7913, %7914 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the linear1 output along the last dim: columns [0, 9216) feed the
    // attention path (%7916), columns [9216, 21504) are the 12288-wide tensor
    // %7917 that goes through GELU later in this block.
    %int-1_10080 = torch.constant.int -1
    %int0_10081 = torch.constant.int 0
    %int9216_10082 = torch.constant.int 9216
    %int1_10083 = torch.constant.int 1
    %7916 = torch.aten.slice.Tensor %7915, %int-1_10080, %int0_10081, %int9216_10082, %int1_10083 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_10084 = torch.constant.int -1
    %int9216_10085 = torch.constant.int 9216
    %int21504_10086 = torch.constant.int 21504
    %int1_10087 = torch.constant.int 1
    %7917 = torch.aten.slice.Tensor %7915, %int-1_10084, %int9216_10085, %int21504_10086, %int1_10087 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View the 9216 columns as [3, 24, 128] ...
    %int1_10088 = torch.constant.int 1
    %int4608_10089 = torch.constant.int 4608
    %int3_10090 = torch.constant.int 3
    %int24_10091 = torch.constant.int 24
    %int128_10092 = torch.constant.int 128
    %7918 = torch.prim.ListConstruct %int1_10088, %int4608_10089, %int3_10090, %int24_10091, %int128_10092 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7919 = torch.aten.view %7916, %7918 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // ... and permute with order (2,0,3,1,4) to [3,1,24,4608,128].
    %int2_10093 = torch.constant.int 2
    %int0_10094 = torch.constant.int 0
    %int3_10095 = torch.constant.int 3
    %int1_10096 = torch.constant.int 1
    %int4_10097 = torch.constant.int 4
    %7920 = torch.prim.ListConstruct %int2_10093, %int0_10094, %int3_10095, %int1_10096, %int4_10097 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7921 = torch.aten.permute %7919, %7920 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0 along dim 0: the tensor later scaled by query_norm.scale.
    %int0_10098 = torch.constant.int 0
    %int0_10099 = torch.constant.int 0
    %7922 = torch.aten.select.int %7921, %int0_10098, %int0_10099 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 1 along dim 0: the tensor later scaled by key_norm.scale.
    %int0_10100 = torch.constant.int 0
    %int1_10101 = torch.constant.int 1
    %7923 = torch.aten.select.int %7921, %int0_10100, %int1_10101 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 2 along dim 0: passed unmodified as the third attention operand.
    %int0_10102 = torch.constant.int 0
    %int2_10103 = torch.constant.int 2
    %7924 = torch.aten.select.int %7921, %int0_10102, %int2_10103 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Query normalization, computed in f32:
    //   x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps), with eps ~= 1e-6.
    %int6_10104 = torch.constant.int 6
    %7925 = torch.prims.convert_element_type %7922, %int6_10104 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_10105 = torch.constant.int 2
    %7926 = torch.aten.pow.Tensor_Scalar %7925, %int2_10105 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_10106 = torch.constant.int -1
    %7927 = torch.prim.ListConstruct %int-1_10106 : (!torch.int) -> !torch.list<int>
    %true_10107 = torch.constant.bool true
    %none_10108 = torch.constant.none
    %7928 = torch.aten.mean.dim %7926, %7927, %true_10107, %none_10108 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_10109 = torch.constant.float 9.9999999999999995E-7
    %int1_10110 = torch.constant.int 1
    %7929 = torch.aten.add.Scalar %7928, %float9.999990e-07_10109, %int1_10110 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %7930 = torch.aten.rsqrt %7929 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %7931 = torch.aten.mul.Tensor %7925, %7930 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    // Cast back to f16, then multiply by the 128-element query_norm.scale.
    %int5_10111 = torch.constant.int 5
    %7932 = torch.prims.convert_element_type %7931, %int5_10111 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.14.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.14.norm.query_norm.scale : tensor<128xf16>
    %7933 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.14.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %7934 = torch.aten.mul.Tensor %7932, %7933 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Key normalization, computed in f32:
    //   x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps), with eps ~= 1e-6.
    %int6_10112 = torch.constant.int 6
    %7935 = torch.prims.convert_element_type %7923, %int6_10112 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_10113 = torch.constant.int 2
    %7936 = torch.aten.pow.Tensor_Scalar %7935, %int2_10113 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_10114 = torch.constant.int -1
    %7937 = torch.prim.ListConstruct %int-1_10114 : (!torch.int) -> !torch.list<int>
    %true_10115 = torch.constant.bool true
    %none_10116 = torch.constant.none
    %7938 = torch.aten.mean.dim %7936, %7937, %true_10115, %none_10116 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_10117 = torch.constant.float 9.9999999999999995E-7
    %int1_10118 = torch.constant.int 1
    %7939 = torch.aten.add.Scalar %7938, %float9.999990e-07_10117, %int1_10118 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %7940 = torch.aten.rsqrt %7939 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %7941 = torch.aten.mul.Tensor %7935, %7940 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    // Cast back to f16, then multiply by the 128-element key_norm.scale.
    %int5_10119 = torch.constant.int 5
    %7942 = torch.prims.convert_element_type %7941, %int5_10119 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.14.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.14.norm.key_norm.scale : tensor<128xf16>
    %7943 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.14.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %7944 = torch.aten.mul.Tensor %7942, %7943 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Combine the normalized query (%7934) and key (%7944) with the table %211
    // ([1,1,4608,64,2,2] f32, defined outside this chunk).
    // NOTE(review): %211 is presumably the rotary position-embedding table --
    // confirm at its definition.
    // These two casts are f16 -> f16 (dtype code 5 on an f16 input).
    %int5_10120 = torch.constant.int 5
    %7945 = torch.prims.convert_element_type %7934, %int5_10120 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_10121 = torch.constant.int 5
    %7946 = torch.prims.convert_element_type %7944, %int5_10121 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Query: cast to f32 and view the 128-wide last dim as [64,1,2].
    %int6_10122 = torch.constant.int 6
    %7947 = torch.prims.convert_element_type %7945, %int6_10122 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_10123 = torch.constant.int 1
    %int24_10124 = torch.constant.int 24
    %int4608_10125 = torch.constant.int 4608
    %int64_10126 = torch.constant.int 64
    %int1_10127 = torch.constant.int 1
    %int2_10128 = torch.constant.int 2
    %7948 = torch.prim.ListConstruct %int1_10123, %int24_10124, %int4608_10125, %int64_10126, %int1_10127, %int2_10128 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7949 = torch.aten.view %7947, %7948 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Key: same cast and view.
    %int6_10129 = torch.constant.int 6
    %7950 = torch.prims.convert_element_type %7946, %int6_10129 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_10130 = torch.constant.int 1
    %int24_10131 = torch.constant.int 24
    %int4608_10132 = torch.constant.int 4608
    %int64_10133 = torch.constant.int 64
    %int1_10134 = torch.constant.int 1
    %int2_10135 = torch.constant.int 2
    %7951 = torch.prim.ListConstruct %int1_10130, %int24_10131, %int4608_10132, %int64_10133, %int1_10134, %int2_10135 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7952 = torch.aten.view %7950, %7951 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Query result: %211[..., 0] * q[..., 0] + %211[..., 1] * q[..., 1], where
    // the index is taken on dim 5 and the products broadcast over heads and the
    // size-2 dim of the table slice.
    %int5_10136 = torch.constant.int 5
    %int0_10137 = torch.constant.int 0
    %7953 = torch.aten.select.int %211, %int5_10136, %int0_10137 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10138 = torch.constant.int 5
    %int0_10139 = torch.constant.int 0
    %7954 = torch.aten.select.int %7949, %int5_10138, %int0_10139 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7955 = torch.aten.mul.Tensor %7953, %7954 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_10140 = torch.constant.int 5
    %int1_10141 = torch.constant.int 1
    %7956 = torch.aten.select.int %211, %int5_10140, %int1_10141 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10142 = torch.constant.int 5
    %int1_10143 = torch.constant.int 1
    %7957 = torch.aten.select.int %7949, %int5_10142, %int1_10143 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7958 = torch.aten.mul.Tensor %7956, %7957 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_10144 = torch.constant.int 1
    %7959 = torch.aten.add.Tensor %7955, %7958, %int1_10144 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key result: the same two-term combination applied to %7952.
    %int5_10145 = torch.constant.int 5
    %int0_10146 = torch.constant.int 0
    %7960 = torch.aten.select.int %211, %int5_10145, %int0_10146 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10147 = torch.constant.int 5
    %int0_10148 = torch.constant.int 0
    %7961 = torch.aten.select.int %7952, %int5_10147, %int0_10148 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7962 = torch.aten.mul.Tensor %7960, %7961 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_10149 = torch.constant.int 5
    %int1_10150 = torch.constant.int 1
    %7963 = torch.aten.select.int %211, %int5_10149, %int1_10150 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10151 = torch.constant.int 5
    %int1_10152 = torch.constant.int 1
    %7964 = torch.aten.select.int %7952, %int5_10151, %int1_10152 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %7965 = torch.aten.mul.Tensor %7963, %7964 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_10153 = torch.constant.int 1
    %7966 = torch.aten.add.Tensor %7962, %7965, %int1_10153 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Collapse the trailing [64,2] dims of the query result (%7959) back to 128
    // and cast f32 -> f16 (dtype code 5).
    %int1_10154 = torch.constant.int 1
    %int24_10155 = torch.constant.int 24
    %int4608_10156 = torch.constant.int 4608
    %int128_10157 = torch.constant.int 128
    %7967 = torch.prim.ListConstruct %int1_10154, %int24_10155, %int4608_10156, %int128_10157 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7968 = torch.aten.view %7959, %7967 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10158 = torch.constant.int 5
    %7969 = torch.prims.convert_element_type %7968, %int5_10158 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Same view + cast for the key result (%7966).
    %int1_10159 = torch.constant.int 1
    %int24_10160 = torch.constant.int 24
    %int4608_10161 = torch.constant.int 4608
    %int128_10162 = torch.constant.int 128
    %7970 = torch.prim.ListConstruct %int1_10159, %int24_10160, %int4608_10161, %int128_10162 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7971 = torch.aten.view %7966, %7970 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10163 = torch.constant.int 5
    %7972 = torch.prims.convert_element_type %7971, %int5_10163 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled-dot-product attention with operands (%7969, %7972, %7924).
    // NOTE(review): the 0.0 / false / none / none operands presumably map to
    // dropout_p, is_causal, attn_mask and scale of the PyTorch op -- confirm
    // against the op schema.
    %float0.000000e00_10164 = torch.constant.float 0.000000e+00
    %false_10165 = torch.constant.bool false
    %none_10166 = torch.constant.none
    %none_10167 = torch.constant.none
    %7973:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%7969, %7972, %7924, %float0.000000e00_10164, %false_10165, %none_10166, %none_10167) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Only result #0 is consumed below: swap dims 1 and 2, then merge the
    // trailing [24,128] dims into 3072.
    %int0_10168 = torch.constant.int 0
    %int2_10169 = torch.constant.int 2
    %int1_10170 = torch.constant.int 1
    %int3_10171 = torch.constant.int 3
    %7974 = torch.prim.ListConstruct %int0_10168, %int2_10169, %int1_10170, %int3_10171 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7975 = torch.aten.permute %7973#0, %7974 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_10172 = torch.constant.int 1
    %int4608_10173 = torch.constant.int 4608
    %int3072_10174 = torch.constant.int 3072
    %7976 = torch.prim.ListConstruct %int1_10172, %int4608_10173, %int3072_10174 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7977 = torch.aten.view %7975, %7976 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Apply GELU with the "tanh" approximation string to the 12288-wide slice
    // %7917, concatenate it after the attention output %7977 along dim 2
    // (3072 + 12288 = 15360), and drop the leading unit dim for the matmul below.
    %str_10175 = torch.constant.str "tanh"
    %7978 = torch.aten.gelu %7917, %str_10175 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %7979 = torch.prim.ListConstruct %7977, %7978 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_10176 = torch.constant.int 2
    %7980 = torch.aten.cat %7979, %int2_10176 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    %int4608_10177 = torch.constant.int 4608
    %int15360_10178 = torch.constant.int 15360
    %7981 = torch.prim.ListConstruct %int4608_10177, %int15360_10178 : (!torch.int, !torch.int) -> !torch.list<int>
    %7982 = torch.aten.view %7980, %7981 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // single_blocks.14.linear2: load the [3072,15360] f16 weight and transpose it
    // to [15360,3072] so it can be the right-hand operand of the matmul.
    %__auto.sampler.single_blocks.14.linear2.weight = util.global.load @__auto.sampler.single_blocks.14.linear2.weight : tensor<3072x15360xf16>
    %7983 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.14.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_10179 = torch.constant.int 0
    %int1_10180 = torch.constant.int 1
    %7984 = torch.aten.transpose.int %7983, %int0_10179, %int1_10180 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.14.linear2.bias = util.global.load @__auto.sampler.single_blocks.14.linear2.bias : tensor<3072xf16>
    %7985 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.14.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Cast bias, activation and weight to f32 (dtype code 6) so the matmul and
    // bias add are carried out in f32.
    %int6_10181 = torch.constant.int 6
    %7986 = torch.prims.convert_element_type %7985, %int6_10181 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_10182 = torch.constant.int 6
    %7987 = torch.prims.convert_element_type %7982, %int6_10182 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_10183 = torch.constant.int 6
    %7988 = torch.prims.convert_element_type %7984, %int6_10183 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %7989 = torch.aten.mm %7987, %7988 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Multiplications by the integer constant 1, then the bias add.
    %int1_10184 = torch.constant.int 1
    %7990 = torch.aten.mul.Scalar %7989, %int1_10184 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_10185 = torch.constant.int 1
    %7991 = torch.aten.mul.Scalar %7986, %int1_10185 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_10186 = torch.constant.int 1
    %7992 = torch.aten.add.Tensor %7990, %7991, %int1_10186 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    // Back to f16 and restore the leading unit dim.
    %int5_10187 = torch.constant.int 5
    %7993 = torch.prims.convert_element_type %7992, %int5_10187 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_10188 = torch.constant.int 1
    %int4608_10189 = torch.constant.int 4608
    %int3072_10190 = torch.constant.int 3072
    %7994 = torch.prim.ListConstruct %int1_10188, %int4608_10189, %int3072_10190 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %7995 = torch.aten.view %7993, %7994 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Gate the projection with %7890 (third modulation chunk, broadcast from
    // [1,1,3072]) and add it to %7872, the tensor this block normalized above.
    %7996 = torch.aten.mul.Tensor %7890, %7995 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_10191 = torch.constant.int 1
    %7997 = torch.aten.add.Tensor %7872, %7996, %int1_10191 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // ---- single_blocks.15: modulation projection ----
    // Applies silu to %119, runs it through the single_blocks.15.modulation.lin
    // weight/bias (matmul and bias add done in f32, result cast back to f16), and
    // cuts the [1,9216] result into three [1,1,3072] slices: %8013, %8014, %8015.
    // NOTE(review): %119 is defined outside this excerpt; presumably the
    // conditioning vector shared by all blocks — confirm at its definition.
    %7998 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.15.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.15.modulation.lin.weight : tensor<9216x3072xf16>
    %7999 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.15.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    // Weight is stored [out,in] = [9216,3072]; transpose to [3072,9216] for the mm below.
    %int0_10192 = torch.constant.int 0
    %int1_10193 = torch.constant.int 1
    %8000 = torch.aten.transpose.int %7999, %int0_10192, %int1_10193 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.15.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.15.modulation.lin.bias : tensor<9216xf16>
    %8001 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.15.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Dtype code 6 produces f32 results here (see result types); bias, input and
    // weight are all widened before the matmul.
    %int6_10194 = torch.constant.int 6
    %8002 = torch.prims.convert_element_type %8001, %int6_10194 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_10195 = torch.constant.int 6
    %8003 = torch.prims.convert_element_type %7998, %int6_10195 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_10196 = torch.constant.int 6
    %8004 = torch.prims.convert_element_type %8000, %int6_10196 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %8005 = torch.aten.mm %8003, %8004 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Both operands are multiplied by the constant 1 before the add
    // (values unchanged); the add itself also passes the constant 1.
    %int1_10197 = torch.constant.int 1
    %8006 = torch.aten.mul.Scalar %8005, %int1_10197 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_10198 = torch.constant.int 1
    %8007 = torch.aten.mul.Scalar %8002, %int1_10198 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_10199 = torch.constant.int 1
    %8008 = torch.aten.add.Tensor %8006, %8007, %int1_10199 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    // Dtype code 5 produces f16 results here: narrow the projection output.
    %int5_10200 = torch.constant.int 5
    %8009 = torch.prims.convert_element_type %8008, %int5_10200 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Slice over dim 0 from 0 to INT64_MAX with step 1: shape stays [1,9216].
    %int0_10201 = torch.constant.int 0
    %int0_10202 = torch.constant.int 0
    %int9223372036854775807_10203 = torch.constant.int 9223372036854775807
    %int1_10204 = torch.constant.int 1
    %8010 = torch.aten.slice.Tensor %8009, %int0_10201, %int0_10202, %int9223372036854775807_10203, %int1_10204 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a size-1 axis at position 1 so the slices broadcast against [1,4608,3072].
    %int1_10205 = torch.constant.int 1
    %8011 = torch.aten.unsqueeze %8010, %int1_10205 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Slice over dim 2 from 0 to INT64_MAX with step 1: shape stays [1,1,9216].
    %int2_10206 = torch.constant.int 2
    %int0_10207 = torch.constant.int 0
    %int9223372036854775807_10208 = torch.constant.int 9223372036854775807
    %int1_10209 = torch.constant.int 1
    %8012 = torch.aten.slice.Tensor %8011, %int2_10206, %int0_10207, %int9223372036854775807_10208, %int1_10209 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // %8013 = last-dim elements [0, 3072): later added to the normalized activations.
    %int-1_10210 = torch.constant.int -1
    %int0_10211 = torch.constant.int 0
    %int3072_10212 = torch.constant.int 3072
    %int1_10213 = torch.constant.int 1
    %8013 = torch.aten.slice.Tensor %8012, %int-1_10210, %int0_10211, %int3072_10212, %int1_10213 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %8014 = last-dim elements [3072, 6144): becomes the multiplicative factor %8016 below.
    %int-1_10214 = torch.constant.int -1
    %int3072_10215 = torch.constant.int 3072
    %int6144_10216 = torch.constant.int 6144
    %int1_10217 = torch.constant.int 1
    %8014 = torch.aten.slice.Tensor %8012, %int-1_10214, %int3072_10215, %int6144_10216, %int1_10217 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %8015 = last-dim elements [6144, 9216): later multiplies the linear2 output
    // before the residual add.
    %int-1_10218 = torch.constant.int -1
    %int6144_10219 = torch.constant.int 6144
    %int9216_10220 = torch.constant.int 9216
    %int1_10221 = torch.constant.int 1
    %8015 = torch.aten.slice.Tensor %8012, %int-1_10218, %int6144_10219, %int9216_10220, %int1_10221 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %8016 = %8014 + 1 (both scalar operands of add.Scalar are the constant 1).
    %int1_10222 = torch.constant.int 1
    %int1_10223 = torch.constant.int 1
    %8016 = torch.aten.add.Scalar %8014, %int1_10222, %int1_10223 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // ---- single_blocks.15: normalization of the block input + modulation ----
    // Computes variance and mean of %7997 over dim 2 (keepdim = true, correction = 0)
    // in f32, forms (%7997 - mean) * rsqrt(var + 9.9999999999999995E-7), casts to
    // f16, then evaluates %8016 * normalized + %8013.
    // No learned weight/bias is loaded for this normalization in the visible ops.
    %int6_10224 = torch.constant.int 6
    %8017 = torch.prims.convert_element_type %7997, %int6_10224 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_10225 = torch.constant.int 2
    %8018 = torch.prim.ListConstruct %int2_10225 : (!torch.int) -> !torch.list<int>
    %int0_10226 = torch.constant.int 0
    %true_10227 = torch.constant.bool true
    // %result0_10228 is fed to the epsilon add / rsqrt (variance);
    // %result1_10229 is the value subtracted from the input (mean).
    %result0_10228, %result1_10229 = torch.aten.var_mean.correction %8017, %8018, %int0_10226, %true_10227 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    %float9.999990e-07_10230 = torch.constant.float 9.9999999999999995E-7
    %int1_10231 = torch.constant.int 1
    %8019 = torch.aten.add.Scalar %result0_10228, %float9.999990e-07_10230, %int1_10231 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %8020 = torch.aten.rsqrt %8019 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the f16 value %7997 (not the f32 copy %8017) and an
    // f32 mean; the declared result type is f32.
    %int1_10232 = torch.constant.int 1
    %8021 = torch.aten.sub.Tensor %7997, %result1_10229, %int1_10232 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %8022 = torch.aten.mul.Tensor %8021, %8020 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_10233 = torch.constant.int 5
    %8023 = torch.prims.convert_element_type %8022, %int5_10233 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Apply the modulation: multiply by (1 + middle slice), then add the first slice.
    %8024 = torch.aten.mul.Tensor %8016, %8023 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_10234 = torch.constant.int 1
    %8025 = torch.aten.add.Tensor %8024, %8013, %int1_10234 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // ---- single_blocks.15: linear1 projection and split ----
    // Flattens the modulated activations to [4608,3072], applies the
    // single_blocks.15.linear1 weight/bias (f32 matmul + bias, cast back to f16),
    // restores the leading batch axis, and splits the 21504-wide result into
    // %8041 (first 9216 columns) and %8042 (remaining 12288 columns).
    %int4608_10235 = torch.constant.int 4608
    %int3072_10236 = torch.constant.int 3072
    %8026 = torch.prim.ListConstruct %int4608_10235, %int3072_10236 : (!torch.int, !torch.int) -> !torch.list<int>
    %8027 = torch.aten.view %8025, %8026 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    %__auto.sampler.single_blocks.15.linear1.weight = util.global.load @__auto.sampler.single_blocks.15.linear1.weight : tensor<21504x3072xf16>
    %8028 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.15.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    // Transpose the [21504,3072] weight to [3072,21504] for the mm below.
    %int0_10237 = torch.constant.int 0
    %int1_10238 = torch.constant.int 1
    %8029 = torch.aten.transpose.int %8028, %int0_10237, %int1_10238 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.15.linear1.bias = util.global.load @__auto.sampler.single_blocks.15.linear1.bias : tensor<21504xf16>
    %8030 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.15.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Widen bias, input and weight to f32 before the matmul.
    %int6_10239 = torch.constant.int 6
    %8031 = torch.prims.convert_element_type %8030, %int6_10239 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_10240 = torch.constant.int 6
    %8032 = torch.prims.convert_element_type %8027, %int6_10240 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_10241 = torch.constant.int 6
    %8033 = torch.prims.convert_element_type %8029, %int6_10241 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %8034 = torch.aten.mm %8032, %8033 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Multiplications by the constant 1 leave the values unchanged; then bias add.
    %int1_10242 = torch.constant.int 1
    %8035 = torch.aten.mul.Scalar %8034, %int1_10242 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_10243 = torch.constant.int 1
    %8036 = torch.aten.mul.Scalar %8031, %int1_10243 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_10244 = torch.constant.int 1
    %8037 = torch.aten.add.Tensor %8035, %8036, %int1_10244 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_10245 = torch.constant.int 5
    %8038 = torch.prims.convert_element_type %8037, %int5_10245 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    // Restore the leading size-1 axis: [4608,21504] -> [1,4608,21504].
    %int1_10246 = torch.constant.int 1
    %int4608_10247 = torch.constant.int 4608
    %int21504_10248 = torch.constant.int 21504
    %8039 = torch.prim.ListConstruct %int1_10246, %int4608_10247, %int21504_10248 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8040 = torch.aten.view %8038, %8039 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // %8041 = last-dim columns [0, 9216): reshaped into three per-head tensors next.
    %int-1_10249 = torch.constant.int -1
    %int0_10250 = torch.constant.int 0
    %int9216_10251 = torch.constant.int 9216
    %int1_10252 = torch.constant.int 1
    %8041 = torch.aten.slice.Tensor %8040, %int-1_10249, %int0_10250, %int9216_10251, %int1_10252 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    // %8042 = last-dim columns [9216, 21504): passed through gelu later in this block.
    %int-1_10253 = torch.constant.int -1
    %int9216_10254 = torch.constant.int 9216
    %int21504_10255 = torch.constant.int 21504
    %int1_10256 = torch.constant.int 1
    %8042 = torch.aten.slice.Tensor %8040, %int-1_10253, %int9216_10254, %int21504_10255, %int1_10256 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // ---- single_blocks.15: split the 9216-wide slice into three per-head tensors ----
    // Views %8041 as [1,4608,3,24,128], permutes to [3,1,24,4608,128], and selects
    // indices 0, 1 and 2 along the leading axis. Based on how they are consumed
    // below, %8047 goes through query_norm, %8048 through key_norm, and %8049 is
    // passed as the third tensor operand of the attention op.
    %int1_10257 = torch.constant.int 1
    %int4608_10258 = torch.constant.int 4608
    %int3_10259 = torch.constant.int 3
    %int24_10260 = torch.constant.int 24
    %int128_10261 = torch.constant.int 128
    %8043 = torch.prim.ListConstruct %int1_10257, %int4608_10258, %int3_10259, %int24_10260, %int128_10261 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8044 = torch.aten.view %8041, %8043 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permutation (2,0,3,1,4): move the size-3 axis to the front and the size-24
    // axis ahead of the size-4608 axis.
    %int2_10262 = torch.constant.int 2
    %int0_10263 = torch.constant.int 0
    %int3_10264 = torch.constant.int 3
    %int1_10265 = torch.constant.int 1
    %int4_10266 = torch.constant.int 4
    %8045 = torch.prim.ListConstruct %int2_10262, %int0_10263, %int3_10264, %int1_10265, %int4_10266 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8046 = torch.aten.permute %8044, %8045 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0 along dim 0.
    %int0_10267 = torch.constant.int 0
    %int0_10268 = torch.constant.int 0
    %8047 = torch.aten.select.int %8046, %int0_10267, %int0_10268 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 1 along dim 0.
    %int0_10269 = torch.constant.int 0
    %int1_10270 = torch.constant.int 1
    %8048 = torch.aten.select.int %8046, %int0_10269, %int1_10270 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 2 along dim 0.
    %int0_10271 = torch.constant.int 0
    %int2_10272 = torch.constant.int 2
    %8049 = torch.aten.select.int %8046, %int0_10271, %int2_10272 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- single_blocks.15: query_norm / key_norm ----
    // For each of %8047 and %8048: widen to f32, compute
    //   x * rsqrt(mean(x^2 over the last dim, keepdim) + 9.9999999999999995E-7),
    // cast back to f16, and multiply by the corresponding learned [128] scale
    // (norm.query_norm.scale / norm.key_norm.scale). No mean is subtracted.
    // First tensor (%8047) -> %8059.
    %int6_10273 = torch.constant.int 6
    %8050 = torch.prims.convert_element_type %8047, %int6_10273 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_10274 = torch.constant.int 2
    %8051 = torch.aten.pow.Tensor_Scalar %8050, %int2_10274 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_10275 = torch.constant.int -1
    %8052 = torch.prim.ListConstruct %int-1_10275 : (!torch.int) -> !torch.list<int>
    %true_10276 = torch.constant.bool true
    %none_10277 = torch.constant.none
    %8053 = torch.aten.mean.dim %8051, %8052, %true_10276, %none_10277 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_10278 = torch.constant.float 9.9999999999999995E-7
    %int1_10279 = torch.constant.int 1
    %8054 = torch.aten.add.Scalar %8053, %float9.999990e-07_10278, %int1_10279 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %8055 = torch.aten.rsqrt %8054 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %8056 = torch.aten.mul.Tensor %8050, %8055 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10280 = torch.constant.int 5
    %8057 = torch.prims.convert_element_type %8056, %int5_10280 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.15.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.15.norm.query_norm.scale : tensor<128xf16>
    %8058 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.15.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    // The [128] scale broadcasts over the last axis.
    %8059 = torch.aten.mul.Tensor %8057, %8058 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Second tensor (%8048) -> %8069, same sequence with the key_norm scale.
    %int6_10281 = torch.constant.int 6
    %8060 = torch.prims.convert_element_type %8048, %int6_10281 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_10282 = torch.constant.int 2
    %8061 = torch.aten.pow.Tensor_Scalar %8060, %int2_10282 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_10283 = torch.constant.int -1
    %8062 = torch.prim.ListConstruct %int-1_10283 : (!torch.int) -> !torch.list<int>
    %true_10284 = torch.constant.bool true
    %none_10285 = torch.constant.none
    %8063 = torch.aten.mean.dim %8061, %8062, %true_10284, %none_10285 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_10286 = torch.constant.float 9.9999999999999995E-7
    %int1_10287 = torch.constant.int 1
    %8064 = torch.aten.add.Scalar %8063, %float9.999990e-07_10286, %int1_10287 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %8065 = torch.aten.rsqrt %8064 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %8066 = torch.aten.mul.Tensor %8060, %8065 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10288 = torch.constant.int 5
    %8067 = torch.prims.convert_element_type %8066, %int5_10288 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.15.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.15.norm.key_norm.scale : tensor<128xf16>
    %8068 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.15.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %8069 = torch.aten.mul.Tensor %8067, %8068 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- single_blocks.15: pairwise combination of the normalized tensors with %211 ----
    // Each of %8059 and %8069 is widened to f32 and viewed as [1,24,4608,64,1,2]
    // (the last axis of 128 split into 64 pairs). With x0/x1 the two pair
    // elements, the result is
    //   %211[..., 0] * x0 + %211[..., 1] * x1
    // broadcast over the 24-sized axis, then viewed back to [1,24,4608,128] and
    // cast to f16.
    // NOTE(review): %211 ([1,1,4608,64,2,2], f32) is defined outside this excerpt;
    // presumably the rotary position-embedding table — confirm at its definition.
    // f16 -> f16 conversions: the element type is unchanged by these two ops.
    %int5_10289 = torch.constant.int 5
    %8070 = torch.prims.convert_element_type %8059, %int5_10289 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_10290 = torch.constant.int 5
    %8071 = torch.prims.convert_element_type %8069, %int5_10290 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // First tensor: widen to f32 and split the last axis into (64, 1, 2).
    %int6_10291 = torch.constant.int 6
    %8072 = torch.prims.convert_element_type %8070, %int6_10291 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_10292 = torch.constant.int 1
    %int24_10293 = torch.constant.int 24
    %int4608_10294 = torch.constant.int 4608
    %int64_10295 = torch.constant.int 64
    %int1_10296 = torch.constant.int 1
    %int2_10297 = torch.constant.int 2
    %8073 = torch.prim.ListConstruct %int1_10292, %int24_10293, %int4608_10294, %int64_10295, %int1_10296, %int2_10297 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8074 = torch.aten.view %8072, %8073 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Second tensor: same widening and view.
    %int6_10298 = torch.constant.int 6
    %8075 = torch.prims.convert_element_type %8071, %int6_10298 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_10299 = torch.constant.int 1
    %int24_10300 = torch.constant.int 24
    %int4608_10301 = torch.constant.int 4608
    %int64_10302 = torch.constant.int 64
    %int1_10303 = torch.constant.int 1
    %int2_10304 = torch.constant.int 2
    %8076 = torch.prim.ListConstruct %int1_10299, %int24_10300, %int4608_10301, %int64_10302, %int1_10303, %int2_10304 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8077 = torch.aten.view %8075, %8076 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // First tensor, term 0: %211[..., 0] * x[..., 0] (select index 0 on dim 5 of both).
    %int5_10305 = torch.constant.int 5
    %int0_10306 = torch.constant.int 0
    %8078 = torch.aten.select.int %211, %int5_10305, %int0_10306 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10307 = torch.constant.int 5
    %int0_10308 = torch.constant.int 0
    %8079 = torch.aten.select.int %8074, %int5_10307, %int0_10308 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8080 = torch.aten.mul.Tensor %8078, %8079 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // First tensor, term 1: %211[..., 1] * x[..., 1].
    %int5_10309 = torch.constant.int 5
    %int1_10310 = torch.constant.int 1
    %8081 = torch.aten.select.int %211, %int5_10309, %int1_10310 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10311 = torch.constant.int 5
    %int1_10312 = torch.constant.int 1
    %8082 = torch.aten.select.int %8074, %int5_10311, %int1_10312 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8083 = torch.aten.mul.Tensor %8081, %8082 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two terms for the first tensor.
    %int1_10313 = torch.constant.int 1
    %8084 = torch.aten.add.Tensor %8080, %8083, %int1_10313 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second tensor, term 0: %211[..., 0] * x[..., 0].
    %int5_10314 = torch.constant.int 5
    %int0_10315 = torch.constant.int 0
    %8085 = torch.aten.select.int %211, %int5_10314, %int0_10315 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10316 = torch.constant.int 5
    %int0_10317 = torch.constant.int 0
    %8086 = torch.aten.select.int %8077, %int5_10316, %int0_10317 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8087 = torch.aten.mul.Tensor %8085, %8086 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second tensor, term 1: %211[..., 1] * x[..., 1].
    %int5_10318 = torch.constant.int 5
    %int1_10319 = torch.constant.int 1
    %8088 = torch.aten.select.int %211, %int5_10318, %int1_10319 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10320 = torch.constant.int 5
    %int1_10321 = torch.constant.int 1
    %8089 = torch.aten.select.int %8077, %int5_10320, %int1_10321 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8090 = torch.aten.mul.Tensor %8088, %8089 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two terms for the second tensor.
    %int1_10322 = torch.constant.int 1
    %8091 = torch.aten.add.Tensor %8087, %8090, %int1_10322 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Merge the (64, 2) axes back into 128 and narrow to f16: first tensor -> %8094.
    %int1_10323 = torch.constant.int 1
    %int24_10324 = torch.constant.int 24
    %int4608_10325 = torch.constant.int 4608
    %int128_10326 = torch.constant.int 128
    %8092 = torch.prim.ListConstruct %int1_10323, %int24_10324, %int4608_10325, %int128_10326 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8093 = torch.aten.view %8084, %8092 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10327 = torch.constant.int 5
    %8094 = torch.prims.convert_element_type %8093, %int5_10327 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Same merge and narrowing for the second tensor -> %8097.
    %int1_10328 = torch.constant.int 1
    %int24_10329 = torch.constant.int 24
    %int4608_10330 = torch.constant.int 4608
    %int128_10331 = torch.constant.int 128
    %8095 = torch.prim.ListConstruct %int1_10328, %int24_10329, %int4608_10330, %int128_10331 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8096 = torch.aten.view %8091, %8095 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10332 = torch.constant.int 5
    %8097 = torch.prims.convert_element_type %8096, %int5_10332 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- single_blocks.15: attention and head merge ----
    // Calls the generic torch.operator form of
    // aten._scaled_dot_product_flash_attention_for_cpu with tensor operands
    // (%8094, %8097, %8049), followed by 0.0, false, none, none.
    // NOTE(review): the trailing operands are presumably dropout_p, is_causal,
    // attn_mask and scale — confirm against the ATen op signature.
    // Only result #0 ([1,24,4608,128], f16) is consumed in the visible ops;
    // result #1 ([1,24,4608], f32) is not referenced in this excerpt.
    %float0.000000e00_10333 = torch.constant.float 0.000000e+00
    %false_10334 = torch.constant.bool false
    %none_10335 = torch.constant.none
    %none_10336 = torch.constant.none
    %8098:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%8094, %8097, %8049, %float0.000000e00_10333, %false_10334, %none_10335, %none_10336) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Permutation (0,2,1,3): [1,24,4608,128] -> [1,4608,24,128].
    %int0_10337 = torch.constant.int 0
    %int2_10338 = torch.constant.int 2
    %int1_10339 = torch.constant.int 1
    %int3_10340 = torch.constant.int 3
    %8099 = torch.prim.ListConstruct %int0_10337, %int2_10338, %int1_10339, %int3_10340 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8100 = torch.aten.permute %8098#0, %8099 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the (24, 128) axes into 3072: [1,4608,24,128] -> [1,4608,3072].
    %int1_10341 = torch.constant.int 1
    %int4608_10342 = torch.constant.int 4608
    %int3072_10343 = torch.constant.int 3072
    %8101 = torch.prim.ListConstruct %int1_10341, %int4608_10342, %int3072_10343 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8102 = torch.aten.view %8100, %8101 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // ---- single_blocks.15: gelu branch, concat, linear2, gated residual ----
    // Applies gelu (with the "tanh" argument) to the 12288-wide slice %8042,
    // concatenates it after the attention output along dim 2 ([1,4608,15360]),
    // applies the single_blocks.15.linear2 weight/bias (f32 matmul + bias, cast
    // back to f16), multiplies by the third modulation slice %8015, and adds the
    // block input %7997. %8122 is the block output consumed by single_blocks.16.
    %str_10344 = torch.constant.str "tanh"
    %8103 = torch.aten.gelu %8042, %str_10344 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    // Concatenation order: attention output first (3072), gelu output second (12288).
    %8104 = torch.prim.ListConstruct %8102, %8103 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_10345 = torch.constant.int 2
    %8105 = torch.aten.cat %8104, %int2_10345 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Drop the leading size-1 axis for the 2-D matmul.
    %int4608_10346 = torch.constant.int 4608
    %int15360_10347 = torch.constant.int 15360
    %8106 = torch.prim.ListConstruct %int4608_10346, %int15360_10347 : (!torch.int, !torch.int) -> !torch.list<int>
    %8107 = torch.aten.view %8105, %8106 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    %__auto.sampler.single_blocks.15.linear2.weight = util.global.load @__auto.sampler.single_blocks.15.linear2.weight : tensor<3072x15360xf16>
    %8108 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.15.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    // Transpose the [3072,15360] weight to [15360,3072] for the mm below.
    %int0_10348 = torch.constant.int 0
    %int1_10349 = torch.constant.int 1
    %8109 = torch.aten.transpose.int %8108, %int0_10348, %int1_10349 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.15.linear2.bias = util.global.load @__auto.sampler.single_blocks.15.linear2.bias : tensor<3072xf16>
    %8110 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.15.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Widen bias, input and weight to f32 before the matmul.
    %int6_10350 = torch.constant.int 6
    %8111 = torch.prims.convert_element_type %8110, %int6_10350 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_10351 = torch.constant.int 6
    %8112 = torch.prims.convert_element_type %8107, %int6_10351 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_10352 = torch.constant.int 6
    %8113 = torch.prims.convert_element_type %8109, %int6_10352 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %8114 = torch.aten.mm %8112, %8113 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Multiplications by the constant 1 leave the values unchanged; then bias add.
    %int1_10353 = torch.constant.int 1
    %8115 = torch.aten.mul.Scalar %8114, %int1_10353 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_10354 = torch.constant.int 1
    %8116 = torch.aten.mul.Scalar %8111, %int1_10354 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_10355 = torch.constant.int 1
    %8117 = torch.aten.add.Tensor %8115, %8116, %int1_10355 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_10356 = torch.constant.int 5
    %8118 = torch.prims.convert_element_type %8117, %int5_10356 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    // Restore the leading size-1 axis: [4608,3072] -> [1,4608,3072].
    %int1_10357 = torch.constant.int 1
    %int4608_10358 = torch.constant.int 4608
    %int3072_10359 = torch.constant.int 3072
    %8119 = torch.prim.ListConstruct %int1_10357, %int4608_10358, %int3072_10359 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8120 = torch.aten.view %8118, %8119 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Scale the projection by %8015 (broadcast over the 4608 axis), then add the
    // block input %7997 as the residual.
    %8121 = torch.aten.mul.Tensor %8015, %8120 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_10360 = torch.constant.int 1
    %8122 = torch.aten.add.Tensor %7997, %8121, %int1_10360 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %8123 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.16.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.16.modulation.lin.weight : tensor<9216x3072xf16>
    %8124 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.16.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_10361 = torch.constant.int 0
    %int1_10362 = torch.constant.int 1
    %8125 = torch.aten.transpose.int %8124, %int0_10361, %int1_10362 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.16.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.16.modulation.lin.bias : tensor<9216xf16>
    %8126 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.16.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    %int6_10363 = torch.constant.int 6
    %8127 = torch.prims.convert_element_type %8126, %int6_10363 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_10364 = torch.constant.int 6
    %8128 = torch.prims.convert_element_type %8123, %int6_10364 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_10365 = torch.constant.int 6
    %8129 = torch.prims.convert_element_type %8125, %int6_10365 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %8130 = torch.aten.mm %8128, %8129 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    %int1_10366 = torch.constant.int 1
    %8131 = torch.aten.mul.Scalar %8130, %int1_10366 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_10367 = torch.constant.int 1
    %8132 = torch.aten.mul.Scalar %8127, %int1_10367 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_10368 = torch.constant.int 1
    %8133 = torch.aten.add.Tensor %8131, %8132, %int1_10368 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_10369 = torch.constant.int 5
    // Cast the [1,9216] f32 value %8133 (defined above this chunk; presumably the
    // single_blocks.16 modulation projection -- TODO confirm at its definition) to f16
    // (dtype code 5 yields an f16 result here), then split it into three [1,1,3072]
    // chunks along the last dimension.
    %8134 = torch.prims.convert_element_type %8133, %int5_10369 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    %int0_10370 = torch.constant.int 0
    %int0_10371 = torch.constant.int 0
    %int9223372036854775807_10372 = torch.constant.int 9223372036854775807
    %int1_10373 = torch.constant.int 1
    // Slice on dim 0 with start 0, end 2^63-1, step 1: the result shape equals the input shape.
    %8135 = torch.aten.slice.Tensor %8134, %int0_10370, %int0_10371, %int9223372036854775807_10372, %int1_10373 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    %int1_10374 = torch.constant.int 1
    // Insert a unit dimension at index 1: [1,9216] -> [1,1,9216].
    %8136 = torch.aten.unsqueeze %8135, %int1_10374 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int2_10375 = torch.constant.int 2
    %int0_10376 = torch.constant.int 0
    %int9223372036854775807_10377 = torch.constant.int 9223372036854775807
    %int1_10378 = torch.constant.int 1
    // Slice on dim 2 with start 0, end 2^63-1, step 1: again shape-preserving.
    %8137 = torch.aten.slice.Tensor %8136, %int2_10375, %int0_10376, %int9223372036854775807_10377, %int1_10378 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int-1_10379 = torch.constant.int -1
    %int0_10380 = torch.constant.int 0
    %int3072_10381 = torch.constant.int 3072
    %int1_10382 = torch.constant.int 1
    // %8138 = elements [0, 3072) of the last dim; added to the normalized activations at %8150.
    %8138 = torch.aten.slice.Tensor %8137, %int-1_10379, %int0_10380, %int3072_10381, %int1_10382 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_10383 = torch.constant.int -1
    %int3072_10384 = torch.constant.int 3072
    %int6144_10385 = torch.constant.int 6144
    %int1_10386 = torch.constant.int 1
    // %8139 = elements [3072, 6144) of the last dim; becomes the multiplier %8141 below.
    %8139 = torch.aten.slice.Tensor %8137, %int-1_10383, %int3072_10384, %int6144_10385, %int1_10386 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_10387 = torch.constant.int -1
    %int6144_10388 = torch.constant.int 6144
    %int9216_10389 = torch.constant.int 9216
    %int1_10390 = torch.constant.int 1
    // %8140 = elements [6144, 9216) of the last dim; multiplies the linear2 output at %8246.
    %8140 = torch.aten.slice.Tensor %8137, %int-1_10387, %int6144_10388, %int9216_10389, %int1_10390 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int1_10391 = torch.constant.int 1
    %int1_10392 = torch.constant.int 1
    // %8141 = %8139 + 1 (scalar 1 with alpha 1); used as the multiplicative factor at %8149.
    %8141 = torch.aten.add.Scalar %8139, %int1_10391, %int1_10392 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %8122 ([1,4608,3072] f16, defined above this chunk) over its last dimension
    // with no learned affine parameters, then apply the modulation factors:
    //   %8150 = %8141 * normalized + %8138
    // Statistics are computed in f32 (dtype code 6 yields an f32 result here).
    %int6_10393 = torch.constant.int 6
    %8142 = torch.prims.convert_element_type %8122, %int6_10393 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_10394 = torch.constant.int 2
    %8143 = torch.prim.ListConstruct %int2_10394 : (!torch.int) -> !torch.list<int>
    %int0_10395 = torch.constant.int 0
    %true_10396 = torch.constant.bool true
    // var_mean over dim 2 with correction 0 and keepdim = true: %result0 is fed to the
    // eps/rsqrt path (variance), %result1 is subtracted from the input (mean).
    %result0_10397, %result1_10398 = torch.aten.var_mean.correction %8142, %8143, %int0_10395, %true_10396 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    %float9.999990e-07_10399 = torch.constant.float 9.9999999999999995E-7
    %int1_10400 = torch.constant.int 1
    // Add the epsilon constant (approximately 1e-6) to the variance before taking rsqrt.
    %8144 = torch.aten.add.Scalar %result0_10397, %float9.999990e-07_10399, %int1_10400 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %8145 = torch.aten.rsqrt %8144 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    %int1_10401 = torch.constant.int 1
    // Note: the subtraction takes the f16 tensor %8122 (not the f32 copy %8142) and the f32
    // mean; the result type is f32.
    %8146 = torch.aten.sub.Tensor %8122, %result1_10398, %int1_10401 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %8147 = torch.aten.mul.Tensor %8146, %8145 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_10402 = torch.constant.int 5
    // Back to f16 before applying the f16 modulation factors.
    %8148 = torch.prims.convert_element_type %8147, %int5_10402 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // [1,1,3072] factor broadcast against [1,4608,3072].
    %8149 = torch.aten.mul.Tensor %8141, %8148 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_10403 = torch.constant.int 1
    %8150 = torch.aten.add.Tensor %8149, %8138, %int1_10403 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.16.linear1: flatten %8150 to [4608,3072], multiply by the transposed
    // [21504,3072] weight and add the [21504] bias. The matmul and bias add are done in f32;
    // the result is cast to f16 and viewed as [1,4608,21504].
    %int4608_10404 = torch.constant.int 4608
    %int3072_10405 = torch.constant.int 3072
    %8151 = torch.prim.ListConstruct %int4608_10404, %int3072_10405 : (!torch.int, !torch.int) -> !torch.list<int>
    %8152 = torch.aten.view %8150, %8151 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    %__auto.sampler.single_blocks.16.linear1.weight = util.global.load @__auto.sampler.single_blocks.16.linear1.weight : tensor<21504x3072xf16>
    %8153 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.16.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_10406 = torch.constant.int 0
    %int1_10407 = torch.constant.int 1
    // Swap dims 0 and 1 so the weight is [3072,21504] for the matmul.
    %8154 = torch.aten.transpose.int %8153, %int0_10406, %int1_10407 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.16.linear1.bias = util.global.load @__auto.sampler.single_blocks.16.linear1.bias : tensor<21504xf16>
    %8155 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.16.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    %int6_10408 = torch.constant.int 6
    // Bias, activations and weight are each converted to f32 before the matmul.
    %8156 = torch.prims.convert_element_type %8155, %int6_10408 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_10409 = torch.constant.int 6
    %8157 = torch.prims.convert_element_type %8152, %int6_10409 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_10410 = torch.constant.int 6
    %8158 = torch.prims.convert_element_type %8154, %int6_10410 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %8159 = torch.aten.mul.Tensor %8157, %8158 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    %int1_10411 = torch.constant.int 1
    // The two multiplications by the integer constant 1 leave the values unchanged
    // (presumably unit alpha/beta factors left over from a decomposed addmm -- TODO confirm).
    %8160 = torch.aten.mul.Scalar %8159, %int1_10411 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_10412 = torch.constant.int 1
    %8161 = torch.aten.mul.Scalar %8156, %int1_10412 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_10413 = torch.constant.int 1
    // Bias add: [21504] broadcast over the 4608 rows.
    %8162 = torch.aten.add.Tensor %8160, %8161, %int1_10413 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_10414 = torch.constant.int 5
    %8163 = torch.prims.convert_element_type %8162, %int5_10414 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_10415 = torch.constant.int 1
    %int4608_10416 = torch.constant.int 4608
    %int21504_10417 = torch.constant.int 21504
    %8164 = torch.prim.ListConstruct %int1_10415, %int4608_10416, %int21504_10417 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8165 = torch.aten.view %8163, %8164 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the linear1 output along its last dimension:
    //   %8166 = [0, 9216)      -> reshaped below into three [1,24,4608,128] tensors
    //   %8167 = [9216, 21504)  -> 12288-wide branch consumed by torch.aten.gelu at %8228
    %int-1_10418 = torch.constant.int -1
    %int0_10419 = torch.constant.int 0
    %int9216_10420 = torch.constant.int 9216
    %int1_10421 = torch.constant.int 1
    %8166 = torch.aten.slice.Tensor %8165, %int-1_10418, %int0_10419, %int9216_10420, %int1_10421 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_10422 = torch.constant.int -1
    %int9216_10423 = torch.constant.int 9216
    %int21504_10424 = torch.constant.int 21504
    %int1_10425 = torch.constant.int 1
    %8167 = torch.aten.slice.Tensor %8165, %int-1_10422, %int9216_10423, %int21504_10424, %int1_10425 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    %int1_10426 = torch.constant.int 1
    %int4608_10427 = torch.constant.int 4608
    %int3_10428 = torch.constant.int 3
    %int24_10429 = torch.constant.int 24
    %int128_10430 = torch.constant.int 128
    %8168 = torch.prim.ListConstruct %int1_10426, %int4608_10427, %int3_10428, %int24_10429, %int128_10430 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // View the 9216-wide slice as [1,4608,3,24,128] (3 * 24 * 128 = 9216).
    %8169 = torch.aten.view %8166, %8168 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    %int2_10431 = torch.constant.int 2
    %int0_10432 = torch.constant.int 0
    %int3_10433 = torch.constant.int 3
    %int1_10434 = torch.constant.int 1
    %int4_10435 = torch.constant.int 4
    %8170 = torch.prim.ListConstruct %int2_10431, %int0_10432, %int3_10433, %int1_10434, %int4_10435 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // Permute with order (2,0,3,1,4): the size-3 axis moves to the front -> [3,1,24,4608,128].
    %8171 = torch.aten.permute %8169, %8170 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    %int0_10436 = torch.constant.int 0
    %int0_10437 = torch.constant.int 0
    // Index 0 of the leading axis: later scaled by query_norm.scale (query path).
    %8172 = torch.aten.select.int %8171, %int0_10436, %int0_10437 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_10438 = torch.constant.int 0
    %int1_10439 = torch.constant.int 1
    // Index 1 of the leading axis: later scaled by key_norm.scale (key path).
    %8173 = torch.aten.select.int %8171, %int0_10438, %int1_10439 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_10440 = torch.constant.int 0
    %int2_10441 = torch.constant.int 2
    // Index 2 of the leading axis: passed unmodified as the third tensor operand of the
    // attention op at %8223.
    %8174 = torch.aten.select.int %8171, %int0_10440, %int2_10441 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // RMS-style normalization of %8172 and %8173 over the last (size-128) dimension:
    //   y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps)   computed in f32,
    // then cast to f16 and multiplied by the learned [128] scale vector
    // (norm.query_norm.scale for %8172, norm.key_norm.scale for %8173).
    %int6_10442 = torch.constant.int 6
    %8175 = torch.prims.convert_element_type %8172, %int6_10442 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_10443 = torch.constant.int 2
    // Element-wise square (power 2).
    %8176 = torch.aten.pow.Tensor_Scalar %8175, %int2_10443 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_10444 = torch.constant.int -1
    %8177 = torch.prim.ListConstruct %int-1_10444 : (!torch.int) -> !torch.list<int>
    %true_10445 = torch.constant.bool true
    %none_10446 = torch.constant.none
    // Mean over dim -1 with keepdim = true; the dtype operand is none.
    %8178 = torch.aten.mean.dim %8176, %8177, %true_10445, %none_10446 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_10447 = torch.constant.float 9.9999999999999995E-7
    %int1_10448 = torch.constant.int 1
    %8179 = torch.aten.add.Scalar %8178, %float9.999990e-07_10447, %int1_10448 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %8180 = torch.aten.rsqrt %8179 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %8181 = torch.aten.mul.Tensor %8175, %8180 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10449 = torch.constant.int 5
    %8182 = torch.prims.convert_element_type %8181, %int5_10449 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.16.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.16.norm.query_norm.scale : tensor<128xf16>
    %8183 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.16.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    // Normalized query times its [128] scale (broadcast over the leading dims).
    %8184 = torch.aten.mul.Tensor %8182, %8183 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Same sequence for the key tensor %8173.
    %int6_10450 = torch.constant.int 6
    %8185 = torch.prims.convert_element_type %8173, %int6_10450 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_10451 = torch.constant.int 2
    %8186 = torch.aten.pow.Tensor_Scalar %8185, %int2_10451 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_10452 = torch.constant.int -1
    %8187 = torch.prim.ListConstruct %int-1_10452 : (!torch.int) -> !torch.list<int>
    %true_10453 = torch.constant.bool true
    %none_10454 = torch.constant.none
    %8188 = torch.aten.mean.dim %8186, %8187, %true_10453, %none_10454 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_10455 = torch.constant.float 9.9999999999999995E-7
    %int1_10456 = torch.constant.int 1
    %8189 = torch.aten.add.Scalar %8188, %float9.999990e-07_10455, %int1_10456 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %8190 = torch.aten.rsqrt %8189 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %8191 = torch.aten.mul.Tensor %8185, %8190 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10457 = torch.constant.int 5
    %8192 = torch.prims.convert_element_type %8191, %int5_10457 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.16.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.16.norm.key_norm.scale : tensor<128xf16>
    %8193 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.16.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %8194 = torch.aten.mul.Tensor %8192, %8193 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Apply the per-position table %211 ([1,1,4608,64,2,2] f32, defined above this chunk;
    // presumably the rotary position-embedding matrices -- TODO confirm at its definition)
    // to the normalized query %8184 and key %8194. Each tensor is viewed as 64 pairs
    // ([...,64,1,2]) and combined as
    //   out = %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]
    // in f32, then flattened back to [1,24,4608,128] and cast to f16.
    %int5_10458 = torch.constant.int 5
    // These two conversions have f16 as both operand and result type.
    %8195 = torch.prims.convert_element_type %8184, %int5_10458 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_10459 = torch.constant.int 5
    %8196 = torch.prims.convert_element_type %8194, %int5_10459 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int6_10460 = torch.constant.int 6
    %8197 = torch.prims.convert_element_type %8195, %int6_10460 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_10461 = torch.constant.int 1
    %int24_10462 = torch.constant.int 24
    %int4608_10463 = torch.constant.int 4608
    %int64_10464 = torch.constant.int 64
    %int1_10465 = torch.constant.int 1
    %int2_10466 = torch.constant.int 2
    %8198 = torch.prim.ListConstruct %int1_10461, %int24_10462, %int4608_10463, %int64_10464, %int1_10465, %int2_10466 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // Query in f32, viewed as [1,24,4608,64,1,2].
    %8199 = torch.aten.view %8197, %8198 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    %int6_10467 = torch.constant.int 6
    %8200 = torch.prims.convert_element_type %8196, %int6_10467 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_10468 = torch.constant.int 1
    %int24_10469 = torch.constant.int 24
    %int4608_10470 = torch.constant.int 4608
    %int64_10471 = torch.constant.int 64
    %int1_10472 = torch.constant.int 1
    %int2_10473 = torch.constant.int 2
    %8201 = torch.prim.ListConstruct %int1_10468, %int24_10469, %int4608_10470, %int64_10471, %int1_10472, %int2_10473 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // Key in f32, viewed as [1,24,4608,64,1,2].
    %8202 = torch.aten.view %8200, %8201 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // --- query: index 0 term ---
    %int5_10474 = torch.constant.int 5
    %int0_10475 = torch.constant.int 0
    %8203 = torch.aten.select.int %211, %int5_10474, %int0_10475 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10476 = torch.constant.int 5
    %int0_10477 = torch.constant.int 0
    %8204 = torch.aten.select.int %8199, %int5_10476, %int0_10477 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    // Broadcast: table over the 24-sized dim, activation over the trailing size-2 dim.
    %8205 = torch.aten.mul.Tensor %8203, %8204 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // --- query: index 1 term ---
    %int5_10478 = torch.constant.int 5
    %int1_10479 = torch.constant.int 1
    %8206 = torch.aten.select.int %211, %int5_10478, %int1_10479 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10480 = torch.constant.int 5
    %int1_10481 = torch.constant.int 1
    %8207 = torch.aten.select.int %8199, %int5_10480, %int1_10481 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8208 = torch.aten.mul.Tensor %8206, %8207 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_10482 = torch.constant.int 1
    // Sum of the two query terms.
    %8209 = torch.aten.add.Tensor %8205, %8208, %int1_10482 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // --- key: index 0 term ---
    %int5_10483 = torch.constant.int 5
    %int0_10484 = torch.constant.int 0
    %8210 = torch.aten.select.int %211, %int5_10483, %int0_10484 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10485 = torch.constant.int 5
    %int0_10486 = torch.constant.int 0
    %8211 = torch.aten.select.int %8202, %int5_10485, %int0_10486 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8212 = torch.aten.mul.Tensor %8210, %8211 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // --- key: index 1 term ---
    %int5_10487 = torch.constant.int 5
    %int1_10488 = torch.constant.int 1
    %8213 = torch.aten.select.int %211, %int5_10487, %int1_10488 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10489 = torch.constant.int 5
    %int1_10490 = torch.constant.int 1
    %8214 = torch.aten.select.int %8202, %int5_10489, %int1_10490 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8215 = torch.aten.mul.Tensor %8213, %8214 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_10491 = torch.constant.int 1
    // Sum of the two key terms.
    %8216 = torch.aten.add.Tensor %8212, %8215, %int1_10491 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_10492 = torch.constant.int 1
    %int24_10493 = torch.constant.int 24
    %int4608_10494 = torch.constant.int 4608
    %int128_10495 = torch.constant.int 128
    %8217 = torch.prim.ListConstruct %int1_10492, %int24_10493, %int4608_10494, %int128_10495 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // Flatten the query pairs back to 128 and cast to f16 (%8219 is the first attention operand).
    %8218 = torch.aten.view %8209, %8217 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10496 = torch.constant.int 5
    %8219 = torch.prims.convert_element_type %8218, %int5_10496 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_10497 = torch.constant.int 1
    %int24_10498 = torch.constant.int 24
    %int4608_10499 = torch.constant.int 4608
    %int128_10500 = torch.constant.int 128
    %8220 = torch.prim.ListConstruct %int1_10497, %int24_10498, %int4608_10499, %int128_10500 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // Flatten the key pairs back to 128 and cast to f16 (%8222 is the second attention operand).
    %8221 = torch.aten.view %8216, %8220 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10501 = torch.constant.int 5
    %8222 = torch.prims.convert_element_type %8221, %int5_10501 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Attention over %8219, %8222 and %8174 (each [1,24,4608,128] f16), followed by a
    // permute/view that merges the 24 and 128 dims into a 3072-wide last dimension.
    // The scalar operands are 0.0, false, none, none (presumably dropout_p, is_causal,
    // attn_mask and scale, in that order -- TODO confirm against the op signature).
    %float0.000000e00_10502 = torch.constant.float 0.000000e+00
    %false_10503 = torch.constant.bool false
    %none_10504 = torch.constant.none
    %none_10505 = torch.constant.none
    // Two results: #0 is the [1,24,4608,128] f16 output used below; #1 is a [1,24,4608] f32
    // tensor that is not referenced anywhere in this chunk.
    %8223:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%8219, %8222, %8174, %float0.000000e00_10502, %false_10503, %none_10504, %none_10505) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    %int0_10506 = torch.constant.int 0
    %int2_10507 = torch.constant.int 2
    %int1_10508 = torch.constant.int 1
    %int3_10509 = torch.constant.int 3
    %8224 = torch.prim.ListConstruct %int0_10506, %int2_10507, %int1_10508, %int3_10509 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // Permute with order (0,2,1,3): [1,24,4608,128] -> [1,4608,24,128].
    %8225 = torch.aten.permute %8223#0, %8224 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_10510 = torch.constant.int 1
    %int4608_10511 = torch.constant.int 4608
    %int3072_10512 = torch.constant.int 3072
    %8226 = torch.prim.ListConstruct %int1_10510, %int4608_10511, %int3072_10512 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    // Merge the 24 and 128 dims: 24 * 128 = 3072.
    %8227 = torch.aten.view %8225, %8226 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Output stage of single_blocks.16: apply GELU (approximate = "tanh") to the 12288-wide
    // slice %8167, concatenate it after the attention output %8227 on dim 2 (3072 + 12288 =
    // 15360), project back to 3072 with linear2 (f32 matmul + bias, cast to f16), multiply
    // by the modulation chunk %8140 and add the result to the block input %8122.
    %str_10513 = torch.constant.str "tanh"
    %8228 = torch.aten.gelu %8167, %str_10513 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %8229 = torch.prim.ListConstruct %8227, %8228 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_10514 = torch.constant.int 2
    %8230 = torch.aten.cat %8229, %int2_10514 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    %int4608_10515 = torch.constant.int 4608
    %int15360_10516 = torch.constant.int 15360
    %8231 = torch.prim.ListConstruct %int4608_10515, %int15360_10516 : (!torch.int, !torch.int) -> !torch.list<int>
    // Drop the leading unit dim for the 2-D matmul.
    %8232 = torch.aten.view %8230, %8231 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    %__auto.sampler.single_blocks.16.linear2.weight = util.global.load @__auto.sampler.single_blocks.16.linear2.weight : tensor<3072x15360xf16>
    %8233 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.16.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_10517 = torch.constant.int 0
    %int1_10518 = torch.constant.int 1
    // Swap dims 0 and 1 so the weight is [15360,3072] for the matmul.
    %8234 = torch.aten.transpose.int %8233, %int0_10517, %int1_10518 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.16.linear2.bias = util.global.load @__auto.sampler.single_blocks.16.linear2.bias : tensor<3072xf16>
    %8235 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.16.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_10519 = torch.constant.int 6
    // Bias, activations and weight are each converted to f32 before the matmul.
    %8236 = torch.prims.convert_element_type %8235, %int6_10519 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_10520 = torch.constant.int 6
    %8237 = torch.prims.convert_element_type %8232, %int6_10520 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_10521 = torch.constant.int 6
    %8238 = torch.prims.convert_element_type %8234, %int6_10521 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %8239 = torch.aten.mm %8237, %8238 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    %int1_10522 = torch.constant.int 1
    // The two multiplications by the integer constant 1 leave the values unchanged
    // (presumably unit alpha/beta factors left over from a decomposed addmm -- TODO confirm).
    %8240 = torch.aten.mul.Scalar %8239, %int1_10522 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_10523 = torch.constant.int 1
    %8241 = torch.aten.mul.Scalar %8236, %int1_10523 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_10524 = torch.constant.int 1
    // Bias add: [3072] broadcast over the 4608 rows.
    %8242 = torch.aten.add.Tensor %8240, %8241, %int1_10524 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_10525 = torch.constant.int 5
    %8243 = torch.prims.convert_element_type %8242, %int5_10525 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_10526 = torch.constant.int 1
    %int4608_10527 = torch.constant.int 4608
    %int3072_10528 = torch.constant.int 3072
    %8244 = torch.prim.ListConstruct %int1_10526, %int4608_10527, %int3072_10528 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8245 = torch.aten.view %8243, %8244 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Multiply by the [1,1,3072] chunk %8140 (broadcast over the 4608 rows).
    %8246 = torch.aten.mul.Tensor %8140, %8245 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_10529 = torch.constant.int 1
    // Residual connection: %8247 = %8122 + %8246. %8247 is the input to the
    // single_blocks.17 normalization and residual that follow.
    %8247 = torch.aten.add.Tensor %8122, %8246, %int1_10529 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.17 modulation: SiLU of %119 ([1,3072] f16, defined above this chunk;
    // presumably the conditioning vector -- TODO confirm), projected by modulation.lin
    // ([9216,3072] weight, [9216] bias; f32 matmul + bias, cast to f16), then split into
    // three [1,1,3072] chunks along the last dimension.
    %8248 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.17.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.17.modulation.lin.weight : tensor<9216x3072xf16>
    %8249 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.17.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_10530 = torch.constant.int 0
    %int1_10531 = torch.constant.int 1
    // Swap dims 0 and 1 so the weight is [3072,9216] for the matmul.
    %8250 = torch.aten.transpose.int %8249, %int0_10530, %int1_10531 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.17.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.17.modulation.lin.bias : tensor<9216xf16>
    %8251 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.17.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    %int6_10532 = torch.constant.int 6
    // Bias, activations and weight are each converted to f32 before the matmul.
    %8252 = torch.prims.convert_element_type %8251, %int6_10532 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_10533 = torch.constant.int 6
    %8253 = torch.prims.convert_element_type %8248, %int6_10533 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_10534 = torch.constant.int 6
    %8254 = torch.prims.convert_element_type %8250, %int6_10534 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %8255 = torch.aten.mm %8253, %8254 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    %int1_10535 = torch.constant.int 1
    // The two multiplications by the integer constant 1 leave the values unchanged
    // (presumably unit alpha/beta factors left over from a decomposed addmm -- TODO confirm).
    %8256 = torch.aten.mul.Scalar %8255, %int1_10535 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_10536 = torch.constant.int 1
    %8257 = torch.aten.mul.Scalar %8252, %int1_10536 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_10537 = torch.constant.int 1
    %8258 = torch.aten.add.Tensor %8256, %8257, %int1_10537 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_10538 = torch.constant.int 5
    %8259 = torch.prims.convert_element_type %8258, %int5_10538 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    %int0_10539 = torch.constant.int 0
    %int0_10540 = torch.constant.int 0
    %int9223372036854775807_10541 = torch.constant.int 9223372036854775807
    %int1_10542 = torch.constant.int 1
    // Slice on dim 0 with start 0, end 2^63-1, step 1: the result shape equals the input shape.
    %8260 = torch.aten.slice.Tensor %8259, %int0_10539, %int0_10540, %int9223372036854775807_10541, %int1_10542 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    %int1_10543 = torch.constant.int 1
    // Insert a unit dimension at index 1: [1,9216] -> [1,1,9216].
    %8261 = torch.aten.unsqueeze %8260, %int1_10543 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int2_10544 = torch.constant.int 2
    %int0_10545 = torch.constant.int 0
    %int9223372036854775807_10546 = torch.constant.int 9223372036854775807
    %int1_10547 = torch.constant.int 1
    // Slice on dim 2 with start 0, end 2^63-1, step 1: again shape-preserving.
    %8262 = torch.aten.slice.Tensor %8261, %int2_10544, %int0_10545, %int9223372036854775807_10546, %int1_10547 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int-1_10548 = torch.constant.int -1
    %int0_10549 = torch.constant.int 0
    %int3072_10550 = torch.constant.int 3072
    %int1_10551 = torch.constant.int 1
    // %8263 = elements [0, 3072) of the last dim; added to the normalized activations at %8275.
    %8263 = torch.aten.slice.Tensor %8262, %int-1_10548, %int0_10549, %int3072_10550, %int1_10551 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_10552 = torch.constant.int -1
    %int3072_10553 = torch.constant.int 3072
    %int6144_10554 = torch.constant.int 6144
    %int1_10555 = torch.constant.int 1
    // %8264 = elements [3072, 6144) of the last dim; becomes the multiplier %8266 below.
    %8264 = torch.aten.slice.Tensor %8262, %int-1_10552, %int3072_10553, %int6144_10554, %int1_10555 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_10556 = torch.constant.int -1
    %int6144_10557 = torch.constant.int 6144
    %int9216_10558 = torch.constant.int 9216
    %int1_10559 = torch.constant.int 1
    // %8265 = elements [6144, 9216) of the last dim; not consumed within this chunk
    // (presumably the gate for this block's output, mirroring %8140 -- TODO confirm).
    %8265 = torch.aten.slice.Tensor %8262, %int-1_10556, %int6144_10557, %int9216_10558, %int1_10559 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int1_10560 = torch.constant.int 1
    %int1_10561 = torch.constant.int 1
    // %8266 = %8264 + 1 (scalar 1 with alpha 1); used as the multiplicative factor at %8274.
    %8266 = torch.aten.add.Scalar %8264, %int1_10560, %int1_10561 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %8247 (the single_blocks.16 residual output, [1,4608,3072] f16) over its
    // last dimension with no learned affine parameters, then apply the modulation factors:
    //   %8275 = %8266 * normalized + %8263
    // Statistics are computed in f32 (dtype code 6 yields an f32 result here).
    %int6_10562 = torch.constant.int 6
    %8267 = torch.prims.convert_element_type %8247, %int6_10562 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_10563 = torch.constant.int 2
    %8268 = torch.prim.ListConstruct %int2_10563 : (!torch.int) -> !torch.list<int>
    %int0_10564 = torch.constant.int 0
    %true_10565 = torch.constant.bool true
    // var_mean over dim 2 with correction 0 and keepdim = true: %result0 is fed to the
    // eps/rsqrt path (variance), %result1 is subtracted from the input (mean).
    %result0_10566, %result1_10567 = torch.aten.var_mean.correction %8267, %8268, %int0_10564, %true_10565 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    %float9.999990e-07_10568 = torch.constant.float 9.9999999999999995E-7
    %int1_10569 = torch.constant.int 1
    // Add the epsilon constant (approximately 1e-6) to the variance before taking rsqrt.
    %8269 = torch.aten.add.Scalar %result0_10566, %float9.999990e-07_10568, %int1_10569 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %8270 = torch.aten.rsqrt %8269 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    %int1_10570 = torch.constant.int 1
    // Note: the subtraction takes the f16 tensor %8247 (not the f32 copy %8267) and the f32
    // mean; the result type is f32.
    %8271 = torch.aten.sub.Tensor %8247, %result1_10567, %int1_10570 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %8272 = torch.aten.mul.Tensor %8271, %8270 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_10571 = torch.constant.int 5
    // Back to f16 before applying the f16 modulation factors.
    %8273 = torch.prims.convert_element_type %8272, %int5_10571 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // [1,1,3072] factor broadcast against [1,4608,3072].
    %8274 = torch.aten.mul.Tensor %8266, %8273 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_10572 = torch.constant.int 1
    %8275 = torch.aten.add.Tensor %8274, %8263, %int1_10572 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.17.linear1: flatten %8275 to [4608,3072], multiply by the transposed
    // [21504,3072] weight and add the [21504] bias. The matmul and bias add are done in f32;
    // the result is cast to f16 and viewed as [1,4608,21504].
    %int4608_10573 = torch.constant.int 4608
    %int3072_10574 = torch.constant.int 3072
    %8276 = torch.prim.ListConstruct %int4608_10573, %int3072_10574 : (!torch.int, !torch.int) -> !torch.list<int>
    %8277 = torch.aten.view %8275, %8276 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    %__auto.sampler.single_blocks.17.linear1.weight = util.global.load @__auto.sampler.single_blocks.17.linear1.weight : tensor<21504x3072xf16>
    %8278 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.17.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_10575 = torch.constant.int 0
    %int1_10576 = torch.constant.int 1
    // Swap dims 0 and 1 so the weight is [3072,21504] for the matmul.
    %8279 = torch.aten.transpose.int %8278, %int0_10575, %int1_10576 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.17.linear1.bias = util.global.load @__auto.sampler.single_blocks.17.linear1.bias : tensor<21504xf16>
    %8280 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.17.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    %int6_10577 = torch.constant.int 6
    // Bias, activations and weight are each converted to f32 before the matmul.
    %8281 = torch.prims.convert_element_type %8280, %int6_10577 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_10578 = torch.constant.int 6
    %8282 = torch.prims.convert_element_type %8277, %int6_10578 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_10579 = torch.constant.int 6
    %8283 = torch.prims.convert_element_type %8279, %int6_10579 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %8284 = torch.aten.mm %8282, %8283 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    %int1_10580 = torch.constant.int 1
    // The two multiplications by the integer constant 1 leave the values unchanged
    // (presumably unit alpha/beta factors left over from a decomposed addmm -- TODO confirm).
    %8285 = torch.aten.mul.Scalar %8284, %int1_10580 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_10581 = torch.constant.int 1
    %8286 = torch.aten.mul.Scalar %8281, %int1_10581 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_10582 = torch.constant.int 1
    // Bias add: [21504] broadcast over the 4608 rows.
    %8287 = torch.aten.add.Tensor %8285, %8286, %int1_10582 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_10583 = torch.constant.int 5
    %8288 = torch.prims.convert_element_type %8287, %int5_10583 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_10584 = torch.constant.int 1
    %int4608_10585 = torch.constant.int 4608
    %int21504_10586 = torch.constant.int 21504
    %8289 = torch.prim.ListConstruct %int1_10584, %int4608_10585, %int21504_10586 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8290 = torch.aten.view %8288, %8289 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the linear1 output along the last dim:
    //   %8291 = columns [0, 9216)      -> reshaped below into three per-head tensors
    //   %8292 = columns [9216, 21504)  -> 12288 columns, consumed later by the GELU
    %int-1_10587 = torch.constant.int -1
    %int0_10588 = torch.constant.int 0
    %int9216_10589 = torch.constant.int 9216
    %int1_10590 = torch.constant.int 1
    %8291 = torch.aten.slice.Tensor %8290, %int-1_10587, %int0_10588, %int9216_10589, %int1_10590 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_10591 = torch.constant.int -1
    %int9216_10592 = torch.constant.int 9216
    %int21504_10593 = torch.constant.int 21504
    %int1_10594 = torch.constant.int 1
    %8292 = torch.aten.slice.Tensor %8290, %int-1_10591, %int9216_10592, %int21504_10593, %int1_10594 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View the 9216 columns as (3, 24, 128) and permute to [3,1,24,4608,128].
    %int1_10595 = torch.constant.int 1
    %int4608_10596 = torch.constant.int 4608
    %int3_10597 = torch.constant.int 3
    %int24_10598 = torch.constant.int 24
    %int128_10599 = torch.constant.int 128
    %8293 = torch.prim.ListConstruct %int1_10595, %int4608_10596, %int3_10597, %int24_10598, %int128_10599 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8294 = torch.aten.view %8291, %8293 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    %int2_10600 = torch.constant.int 2
    %int0_10601 = torch.constant.int 0
    %int3_10602 = torch.constant.int 3
    %int1_10603 = torch.constant.int 1
    %int4_10604 = torch.constant.int 4
    %8295 = torch.prim.ListConstruct %int2_10600, %int0_10601, %int3_10602, %int1_10603, %int4_10604 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8296 = torch.aten.permute %8294, %8295 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index the leading dim: %8297 (index 0) and %8298 (index 1) feed the first and
    // second operands of the attention op below after normalisation; %8299 (index 2)
    // is passed to it directly as the third operand.
    %int0_10605 = torch.constant.int 0
    %int0_10606 = torch.constant.int 0
    %8297 = torch.aten.select.int %8296, %int0_10605, %int0_10606 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_10607 = torch.constant.int 0
    %int1_10608 = torch.constant.int 1
    %8298 = torch.aten.select.int %8296, %int0_10607, %int1_10608 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_10609 = torch.constant.int 0
    %int2_10610 = torch.constant.int 2
    %8299 = torch.aten.select.int %8296, %int0_10609, %int2_10610 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Root-mean-square normalisation of %8297 over the last (128-wide) dim, in f32:
    //   x * rsqrt(mean(x^2, dim=-1, keepdim=true) + ~1e-6)
    // The result is cast to f16 and multiplied element-wise by the learned
    // query_norm.scale vector ([128], broadcast over the leading dims).
    %int6_10611 = torch.constant.int 6
    %8300 = torch.prims.convert_element_type %8297, %int6_10611 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_10612 = torch.constant.int 2
    %8301 = torch.aten.pow.Tensor_Scalar %8300, %int2_10612 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_10613 = torch.constant.int -1
    %8302 = torch.prim.ListConstruct %int-1_10613 : (!torch.int) -> !torch.list<int>
    %true_10614 = torch.constant.bool true
    %none_10615 = torch.constant.none
    %8303 = torch.aten.mean.dim %8301, %8302, %true_10614, %none_10615 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_10616 = torch.constant.float 9.9999999999999995E-7
    %int1_10617 = torch.constant.int 1
    %8304 = torch.aten.add.Scalar %8303, %float9.999990e-07_10616, %int1_10617 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %8305 = torch.aten.rsqrt %8304 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %8306 = torch.aten.mul.Tensor %8300, %8305 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10618 = torch.constant.int 5
    %8307 = torch.prims.convert_element_type %8306, %int5_10618 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.17.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.17.norm.query_norm.scale : tensor<128xf16>
    %8308 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.17.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %8309 = torch.aten.mul.Tensor %8307, %8308 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Same root-mean-square normalisation as applied to %8297, here on %8298:
    //   x * rsqrt(mean(x^2, dim=-1, keepdim=true) + ~1e-6), computed in f32,
    // cast to f16, then multiplied by the learned key_norm.scale vector ([128]).
    %int6_10619 = torch.constant.int 6
    %8310 = torch.prims.convert_element_type %8298, %int6_10619 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_10620 = torch.constant.int 2
    %8311 = torch.aten.pow.Tensor_Scalar %8310, %int2_10620 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_10621 = torch.constant.int -1
    %8312 = torch.prim.ListConstruct %int-1_10621 : (!torch.int) -> !torch.list<int>
    %true_10622 = torch.constant.bool true
    %none_10623 = torch.constant.none
    %8313 = torch.aten.mean.dim %8311, %8312, %true_10622, %none_10623 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_10624 = torch.constant.float 9.9999999999999995E-7
    %int1_10625 = torch.constant.int 1
    %8314 = torch.aten.add.Scalar %8313, %float9.999990e-07_10624, %int1_10625 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %8315 = torch.aten.rsqrt %8314 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %8316 = torch.aten.mul.Tensor %8310, %8315 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10626 = torch.constant.int 5
    %8317 = torch.prims.convert_element_type %8316, %int5_10626 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.17.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.17.norm.key_norm.scale : tensor<128xf16>
    %8318 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.17.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %8319 = torch.aten.mul.Tensor %8317, %8318 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Prepare the two normalised tensors (%8309, %8319) for the pairwise combination
    // with %211 that follows. The first two casts are f16 -> f16 and do not change
    // the element type. Each tensor is then promoted to f32 and its last dim of 128
    // is viewed as (64, 1, 2), i.e. 64 pairs of adjacent elements.
    %int5_10627 = torch.constant.int 5
    %8320 = torch.prims.convert_element_type %8309, %int5_10627 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_10628 = torch.constant.int 5
    %8321 = torch.prims.convert_element_type %8319, %int5_10628 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int6_10629 = torch.constant.int 6
    %8322 = torch.prims.convert_element_type %8320, %int6_10629 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_10630 = torch.constant.int 1
    %int24_10631 = torch.constant.int 24
    %int4608_10632 = torch.constant.int 4608
    %int64_10633 = torch.constant.int 64
    %int1_10634 = torch.constant.int 1
    %int2_10635 = torch.constant.int 2
    %8323 = torch.prim.ListConstruct %int1_10630, %int24_10631, %int4608_10632, %int64_10633, %int1_10634, %int2_10635 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8324 = torch.aten.view %8322, %8323 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    %int6_10636 = torch.constant.int 6
    %8325 = torch.prims.convert_element_type %8321, %int6_10636 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_10637 = torch.constant.int 1
    %int24_10638 = torch.constant.int 24
    %int4608_10639 = torch.constant.int 4608
    %int64_10640 = torch.constant.int 64
    %int1_10641 = torch.constant.int 1
    %int2_10642 = torch.constant.int 2
    %8326 = torch.prim.ListConstruct %int1_10637, %int24_10638, %int4608_10639, %int64_10640, %int1_10641, %int2_10642 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8327 = torch.aten.view %8325, %8326 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Combine each element pair with the 2x2 entries of %211 ([1,1,4608,64,2,2]):
    //   out[..., i] = %211[..., i, 0] * x[..., 0] + %211[..., i, 1] * x[..., 1]
    // The head dim (24) comes from broadcasting %211's unit dim. Applied first to
    // %8324 (result %8334) and then identically to %8327 (result %8341).
    // NOTE(review): %211 is defined earlier in the function; it is presumably the
    // precomputed rotary position-embedding table - confirm.
    %int5_10643 = torch.constant.int 5
    %int0_10644 = torch.constant.int 0
    %8328 = torch.aten.select.int %211, %int5_10643, %int0_10644 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10645 = torch.constant.int 5
    %int0_10646 = torch.constant.int 0
    %8329 = torch.aten.select.int %8324, %int5_10645, %int0_10646 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8330 = torch.aten.mul.Tensor %8328, %8329 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_10647 = torch.constant.int 5
    %int1_10648 = torch.constant.int 1
    %8331 = torch.aten.select.int %211, %int5_10647, %int1_10648 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10649 = torch.constant.int 5
    %int1_10650 = torch.constant.int 1
    %8332 = torch.aten.select.int %8324, %int5_10649, %int1_10650 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8333 = torch.aten.mul.Tensor %8331, %8332 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_10651 = torch.constant.int 1
    %8334 = torch.aten.add.Tensor %8330, %8333, %int1_10651 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Same computation for the second tensor (%8327).
    %int5_10652 = torch.constant.int 5
    %int0_10653 = torch.constant.int 0
    %8335 = torch.aten.select.int %211, %int5_10652, %int0_10653 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10654 = torch.constant.int 5
    %int0_10655 = torch.constant.int 0
    %8336 = torch.aten.select.int %8327, %int5_10654, %int0_10655 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8337 = torch.aten.mul.Tensor %8335, %8336 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_10656 = torch.constant.int 5
    %int1_10657 = torch.constant.int 1
    %8338 = torch.aten.select.int %211, %int5_10656, %int1_10657 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10658 = torch.constant.int 5
    %int1_10659 = torch.constant.int 1
    %8339 = torch.aten.select.int %8327, %int5_10658, %int1_10659 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8340 = torch.aten.mul.Tensor %8338, %8339 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_10660 = torch.constant.int 1
    %8341 = torch.aten.add.Tensor %8337, %8340, %int1_10660 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Fold the (64, 2) pair layout back into a 128-wide last dim and cast both
    // results from f32 to f16: %8334 -> %8344 and %8341 -> %8347.
    %int1_10661 = torch.constant.int 1
    %int24_10662 = torch.constant.int 24
    %int4608_10663 = torch.constant.int 4608
    %int128_10664 = torch.constant.int 128
    %8342 = torch.prim.ListConstruct %int1_10661, %int24_10662, %int4608_10663, %int128_10664 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8343 = torch.aten.view %8334, %8342 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10665 = torch.constant.int 5
    %8344 = torch.prims.convert_element_type %8343, %int5_10665 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_10666 = torch.constant.int 1
    %int24_10667 = torch.constant.int 24
    %int4608_10668 = torch.constant.int 4608
    %int128_10669 = torch.constant.int 128
    %8345 = torch.prim.ListConstruct %int1_10666, %int24_10667, %int4608_10668, %int128_10669 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8346 = torch.aten.view %8341, %8345 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10670 = torch.constant.int 5
    %8347 = torch.prims.convert_element_type %8346, %int5_10670 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled dot-product attention over 24 heads of width 128, with operands
    // %8344, %8347, %8299 in that order. The remaining operands are 0.0, false,
    // none, none.
    // NOTE(review): per the PyTorch signature these should be query, key, value,
    // dropout_p, is_causal, attn_mask and scale (none = default scale) - confirm.
    // Only result #0 is used in the lines that follow.
    %float0.000000e00_10671 = torch.constant.float 0.000000e+00
    %false_10672 = torch.constant.bool false
    %none_10673 = torch.constant.none
    %none_10674 = torch.constant.none
    %8348:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%8344, %8347, %8299, %float0.000000e00_10671, %false_10672, %none_10673, %none_10674) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2 ([1,24,4608,128] -> [1,4608,24,128]) and merge the last two
    // dims (24 * 128 = 3072).
    %int0_10675 = torch.constant.int 0
    %int2_10676 = torch.constant.int 2
    %int1_10677 = torch.constant.int 1
    %int3_10678 = torch.constant.int 3
    %8349 = torch.prim.ListConstruct %int0_10675, %int2_10676, %int1_10677, %int3_10678 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8350 = torch.aten.permute %8348#0, %8349 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_10679 = torch.constant.int 1
    %int4608_10680 = torch.constant.int 4608
    %int3072_10681 = torch.constant.int 3072
    %8351 = torch.prim.ListConstruct %int1_10679, %int4608_10680, %int3072_10681 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8352 = torch.aten.view %8350, %8351 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Apply GELU (tanh approximation) to the 12288-wide slice %8292 of the linear1
    // output, concatenate it after the attention output %8352 along dim 2
    // (3072 + 12288 = 15360), and drop the leading unit dim to get [4608,15360].
    %str_10682 = torch.constant.str "tanh"
    %8353 = torch.aten.gelu %8292, %str_10682 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %8354 = torch.prim.ListConstruct %8352, %8353 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_10683 = torch.constant.int 2
    %8355 = torch.aten.cat %8354, %int2_10683 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    %int4608_10684 = torch.constant.int 4608
    %int15360_10685 = torch.constant.int 15360
    %8356 = torch.prim.ListConstruct %int4608_10684, %int15360_10685 : (!torch.int, !torch.int) -> !torch.list<int>
    %8357 = torch.aten.view %8355, %8356 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // single_blocks.17.linear2: y = x @ W^T + b, mapping 15360 -> 3072 features.
    // As with linear1, operands are promoted to f32 (dtype code 6) for the matmul and
    // bias add, and the result is cast back to f16 (dtype code 5).
    %__auto.sampler.single_blocks.17.linear2.weight = util.global.load @__auto.sampler.single_blocks.17.linear2.weight : tensor<3072x15360xf16>
    %8358 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.17.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_10686 = torch.constant.int 0
    %int1_10687 = torch.constant.int 1
    %8359 = torch.aten.transpose.int %8358, %int0_10686, %int1_10687 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.17.linear2.bias = util.global.load @__auto.sampler.single_blocks.17.linear2.bias : tensor<3072xf16>
    %8360 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.17.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_10688 = torch.constant.int 6
    %8361 = torch.prims.convert_element_type %8360, %int6_10688 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_10689 = torch.constant.int 6
    %8362 = torch.prims.convert_element_type %8357, %int6_10689 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_10690 = torch.constant.int 6
    %8363 = torch.prims.convert_element_type %8359, %int6_10690 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %8364 = torch.aten.mm %8362, %8363 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // The two multiplications by the integer 1 leave the values unchanged.
    %int1_10691 = torch.constant.int 1
    %8365 = torch.aten.mul.Scalar %8364, %int1_10691 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_10692 = torch.constant.int 1
    %8366 = torch.aten.mul.Scalar %8361, %int1_10692 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_10693 = torch.constant.int 1
    %8367 = torch.aten.add.Tensor %8365, %8366, %int1_10693 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_10694 = torch.constant.int 5
    %8368 = torch.prims.convert_element_type %8367, %int5_10694 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    // Restore the leading unit dim: [4608,3072] -> [1,4608,3072].
    %int1_10695 = torch.constant.int 1
    %int4608_10696 = torch.constant.int 4608
    %int3072_10697 = torch.constant.int 3072
    %8369 = torch.prim.ListConstruct %int1_10695, %int4608_10696, %int3072_10697 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8370 = torch.aten.view %8368, %8369 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Scale the linear2 output by %8265 ([1,1,3072], broadcast over the 4608 rows)
    // and add it to %8247. The sum %8372 is the input to the block-18 ops below.
    // NOTE(review): %8265 and %8247 are defined earlier in the function; they are
    // presumably block 17's modulation gate and its residual input - confirm.
    %8371 = torch.aten.mul.Tensor %8265, %8370 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_10698 = torch.constant.int 1
    %8372 = torch.aten.add.Tensor %8247, %8371, %int1_10698 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.18.modulation: SiLU on %119 ([1,3072]) followed by the
    // modulation.lin linear layer (3072 -> 9216), computed in f32 and cast back to f16.
    // NOTE(review): %119 is defined earlier in the function; it is presumably the
    // shared conditioning vector - confirm.
    %8373 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.18.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.18.modulation.lin.weight : tensor<9216x3072xf16>
    %8374 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.18.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_10699 = torch.constant.int 0
    %int1_10700 = torch.constant.int 1
    %8375 = torch.aten.transpose.int %8374, %int0_10699, %int1_10700 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.18.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.18.modulation.lin.bias : tensor<9216xf16>
    %8376 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.18.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    %int6_10701 = torch.constant.int 6
    %8377 = torch.prims.convert_element_type %8376, %int6_10701 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_10702 = torch.constant.int 6
    %8378 = torch.prims.convert_element_type %8373, %int6_10702 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_10703 = torch.constant.int 6
    %8379 = torch.prims.convert_element_type %8375, %int6_10703 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %8380 = torch.aten.mm %8378, %8379 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // The two multiplications by the integer 1 leave the values unchanged.
    %int1_10704 = torch.constant.int 1
    %8381 = torch.aten.mul.Scalar %8380, %int1_10704 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_10705 = torch.constant.int 1
    %8382 = torch.aten.mul.Scalar %8377, %int1_10705 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_10706 = torch.constant.int 1
    %8383 = torch.aten.add.Tensor %8381, %8382, %int1_10706 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_10707 = torch.constant.int 5
    %8384 = torch.prims.convert_element_type %8383, %int5_10707 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a unit dim ([1,9216] -> [1,1,9216]) and cut the result into three
    // 3072-wide chunks along the last dim:
    //   %8388 = [0, 3072)     added after the normalisation below
    //   %8389 = [3072, 6144)  incremented by 1 (%8391) and used as a multiplier below
    //   %8390 = [6144, 9216)  not used in the lines that follow
    // The two slices with end = INT64_MAX span the whole dim; shapes are unchanged.
    %int0_10708 = torch.constant.int 0
    %int0_10709 = torch.constant.int 0
    %int9223372036854775807_10710 = torch.constant.int 9223372036854775807
    %int1_10711 = torch.constant.int 1
    %8385 = torch.aten.slice.Tensor %8384, %int0_10708, %int0_10709, %int9223372036854775807_10710, %int1_10711 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    %int1_10712 = torch.constant.int 1
    %8386 = torch.aten.unsqueeze %8385, %int1_10712 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int2_10713 = torch.constant.int 2
    %int0_10714 = torch.constant.int 0
    %int9223372036854775807_10715 = torch.constant.int 9223372036854775807
    %int1_10716 = torch.constant.int 1
    %8387 = torch.aten.slice.Tensor %8386, %int2_10713, %int0_10714, %int9223372036854775807_10715, %int1_10716 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int-1_10717 = torch.constant.int -1
    %int0_10718 = torch.constant.int 0
    %int3072_10719 = torch.constant.int 3072
    %int1_10720 = torch.constant.int 1
    %8388 = torch.aten.slice.Tensor %8387, %int-1_10717, %int0_10718, %int3072_10719, %int1_10720 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_10721 = torch.constant.int -1
    %int3072_10722 = torch.constant.int 3072
    %int6144_10723 = torch.constant.int 6144
    %int1_10724 = torch.constant.int 1
    %8389 = torch.aten.slice.Tensor %8387, %int-1_10721, %int3072_10722, %int6144_10723, %int1_10724 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_10725 = torch.constant.int -1
    %int6144_10726 = torch.constant.int 6144
    %int9216_10727 = torch.constant.int 9216
    %int1_10728 = torch.constant.int 1
    %8390 = torch.aten.slice.Tensor %8387, %int-1_10725, %int6144_10726, %int9216_10727, %int1_10728 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %8391 = 1 + %8389
    %int1_10729 = torch.constant.int 1
    %int1_10730 = torch.constant.int 1
    %8391 = torch.aten.add.Scalar %8389, %int1_10729, %int1_10730 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalise %8372 over its last dim (3072) with no learned weight or bias:
    //   (x - mean) * rsqrt(var + ~1e-6), with var computed using correction = 0.
    // Statistics are computed in f32 and the result is cast to f16. It is then
    // modulated as %8391 * normed + %8388 and flattened to [4608,3072].
    %int6_10731 = torch.constant.int 6
    %8392 = torch.prims.convert_element_type %8372, %int6_10731 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_10732 = torch.constant.int 2
    %8393 = torch.prim.ListConstruct %int2_10732 : (!torch.int) -> !torch.list<int>
    %int0_10733 = torch.constant.int 0
    %true_10734 = torch.constant.bool true
    // result0 is the variance and result1 the mean, both with the reduced dim kept.
    %result0_10735, %result1_10736 = torch.aten.var_mean.correction %8392, %8393, %int0_10733, %true_10734 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    %float9.999990e-07_10737 = torch.constant.float 9.9999999999999995E-7
    %int1_10738 = torch.constant.int 1
    %8394 = torch.aten.add.Scalar %result0_10735, %float9.999990e-07_10737, %int1_10738 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %8395 = torch.aten.rsqrt %8394 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the f16 tensor %8372 and the f32 mean; the result is f32.
    %int1_10739 = torch.constant.int 1
    %8396 = torch.aten.sub.Tensor %8372, %result1_10736, %int1_10739 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %8397 = torch.aten.mul.Tensor %8396, %8395 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_10740 = torch.constant.int 5
    %8398 = torch.prims.convert_element_type %8397, %int5_10740 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %8399 = torch.aten.mul.Tensor %8391, %8398 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_10741 = torch.constant.int 1
    %8400 = torch.aten.add.Tensor %8399, %8388, %int1_10741 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int4608_10742 = torch.constant.int 4608
    %int3072_10743 = torch.constant.int 3072
    %8401 = torch.prim.ListConstruct %int4608_10742, %int3072_10743 : (!torch.int, !torch.int) -> !torch.list<int>
    %8402 = torch.aten.view %8400, %8401 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // single_blocks.18.linear1: y = x @ W^T + b, mapping 3072 -> 21504 features.
    // Weight and bias are loaded as f16 globals; the matmul and bias add are carried
    // out in f32 (dtype code 6) and the result is cast back to f16 (dtype code 5).
    %__auto.sampler.single_blocks.18.linear1.weight = util.global.load @__auto.sampler.single_blocks.18.linear1.weight : tensor<21504x3072xf16>
    %8403 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.18.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_10744 = torch.constant.int 0
    %int1_10745 = torch.constant.int 1
    %8404 = torch.aten.transpose.int %8403, %int0_10744, %int1_10745 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.18.linear1.bias = util.global.load @__auto.sampler.single_blocks.18.linear1.bias : tensor<21504xf16>
    %8405 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.18.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    %int6_10746 = torch.constant.int 6
    %8406 = torch.prims.convert_element_type %8405, %int6_10746 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_10747 = torch.constant.int 6
    %8407 = torch.prims.convert_element_type %8402, %int6_10747 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_10748 = torch.constant.int 6
    %8408 = torch.prims.convert_element_type %8404, %int6_10748 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %8409 = torch.aten.mm %8407, %8408 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // The two multiplications by the integer 1 leave the values unchanged.
    %int1_10749 = torch.constant.int 1
    %8410 = torch.aten.mul.Scalar %8409, %int1_10749 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_10750 = torch.constant.int 1
    %8411 = torch.aten.mul.Scalar %8406, %int1_10750 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_10751 = torch.constant.int 1
    %8412 = torch.aten.add.Tensor %8410, %8411, %int1_10751 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_10752 = torch.constant.int 5
    %8413 = torch.prims.convert_element_type %8412, %int5_10752 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    // Restore the leading unit dim: [4608,21504] -> [1,4608,21504].
    %int1_10753 = torch.constant.int 1
    %int4608_10754 = torch.constant.int 4608
    %int21504_10755 = torch.constant.int 21504
    %8414 = torch.prim.ListConstruct %int1_10753, %int4608_10754, %int21504_10755 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8415 = torch.aten.view %8413, %8414 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the block-18 linear1 output along the last dim:
    //   %8416 = columns [0, 9216)      -> reshaped below into three per-head tensors
    //   %8417 = columns [9216, 21504)  -> 12288 columns
    // NOTE(review): this mirrors the block-17 split, so %8422/%8423/%8424 are
    // presumably query/key/value and %8417 the MLP branch - confirm against the
    // ops that follow this section.
    %int-1_10756 = torch.constant.int -1
    %int0_10757 = torch.constant.int 0
    %int9216_10758 = torch.constant.int 9216
    %int1_10759 = torch.constant.int 1
    %8416 = torch.aten.slice.Tensor %8415, %int-1_10756, %int0_10757, %int9216_10758, %int1_10759 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_10760 = torch.constant.int -1
    %int9216_10761 = torch.constant.int 9216
    %int21504_10762 = torch.constant.int 21504
    %int1_10763 = torch.constant.int 1
    %8417 = torch.aten.slice.Tensor %8415, %int-1_10760, %int9216_10761, %int21504_10762, %int1_10763 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View the 9216 columns as (3, 24, 128) and permute to [3,1,24,4608,128].
    %int1_10764 = torch.constant.int 1
    %int4608_10765 = torch.constant.int 4608
    %int3_10766 = torch.constant.int 3
    %int24_10767 = torch.constant.int 24
    %int128_10768 = torch.constant.int 128
    %8418 = torch.prim.ListConstruct %int1_10764, %int4608_10765, %int3_10766, %int24_10767, %int128_10768 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8419 = torch.aten.view %8416, %8418 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    %int2_10769 = torch.constant.int 2
    %int0_10770 = torch.constant.int 0
    %int3_10771 = torch.constant.int 3
    %int1_10772 = torch.constant.int 1
    %int4_10773 = torch.constant.int 4
    %8420 = torch.prim.ListConstruct %int2_10769, %int0_10770, %int3_10771, %int1_10772, %int4_10773 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8421 = torch.aten.permute %8419, %8420 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index the leading dim at 0, 1 and 2 to obtain three [1,24,4608,128] tensors.
    %int0_10774 = torch.constant.int 0
    %int0_10775 = torch.constant.int 0
    %8422 = torch.aten.select.int %8421, %int0_10774, %int0_10775 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_10776 = torch.constant.int 0
    %int1_10777 = torch.constant.int 1
    %8423 = torch.aten.select.int %8421, %int0_10776, %int1_10777 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_10778 = torch.constant.int 0
    %int2_10779 = torch.constant.int 2
    %8424 = torch.aten.select.int %8421, %int0_10778, %int2_10779 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Root-mean-square normalisation of %8422 over the last (128-wide) dim, in f32:
    //   x * rsqrt(mean(x^2, dim=-1, keepdim=true) + ~1e-6)
    // The result is cast to f16 and multiplied element-wise by the learned
    // query_norm.scale vector ([128], broadcast over the leading dims).
    %int6_10780 = torch.constant.int 6
    %8425 = torch.prims.convert_element_type %8422, %int6_10780 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_10781 = torch.constant.int 2
    %8426 = torch.aten.pow.Tensor_Scalar %8425, %int2_10781 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_10782 = torch.constant.int -1
    %8427 = torch.prim.ListConstruct %int-1_10782 : (!torch.int) -> !torch.list<int>
    %true_10783 = torch.constant.bool true
    %none_10784 = torch.constant.none
    %8428 = torch.aten.mean.dim %8426, %8427, %true_10783, %none_10784 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_10785 = torch.constant.float 9.9999999999999995E-7
    %int1_10786 = torch.constant.int 1
    %8429 = torch.aten.add.Scalar %8428, %float9.999990e-07_10785, %int1_10786 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %8430 = torch.aten.rsqrt %8429 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %8431 = torch.aten.mul.Tensor %8425, %8430 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10787 = torch.constant.int 5
    %8432 = torch.prims.convert_element_type %8431, %int5_10787 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.18.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.18.norm.query_norm.scale : tensor<128xf16>
    %8433 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.18.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %8434 = torch.aten.mul.Tensor %8432, %8433 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Key RMS normalization for single_blocks.18:
    //   %8444 = f16( x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps) ) * key_norm.scale
    // where x is %8423 upcast to f32 (dtype code 6) and eps is ~1e-6.
    %int6_10788 = torch.constant.int 6
    %8435 = torch.prims.convert_element_type %8423, %int6_10788 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_10789 = torch.constant.int 2
    %8436 = torch.aten.pow.Tensor_Scalar %8435, %int2_10789 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean of squares over the last (size-128) axis, keeping it as size 1 for broadcasting.
    %int-1_10790 = torch.constant.int -1
    %8437 = torch.prim.ListConstruct %int-1_10790 : (!torch.int) -> !torch.list<int>
    %true_10791 = torch.constant.bool true
    %none_10792 = torch.constant.none
    %8438 = torch.aten.mean.dim %8436, %8437, %true_10791, %none_10792 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // Add eps (alpha = 1) before the reciprocal square root.
    %float9.999990e-07_10793 = torch.constant.float 9.9999999999999995E-7
    %int1_10794 = torch.constant.int 1
    %8439 = torch.aten.add.Scalar %8438, %float9.999990e-07_10793, %int1_10794 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %8440 = torch.aten.rsqrt %8439 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %8441 = torch.aten.mul.Tensor %8435, %8440 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    // Cast back to f16 (dtype code 5) before applying the learned per-channel scale.
    %int5_10795 = torch.constant.int 5
    %8442 = torch.prims.convert_element_type %8441, %int5_10795 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.18.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.18.norm.key_norm.scale : tensor<128xf16>
    %8443 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.18.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %8444 = torch.aten.mul.Tensor %8442, %8443 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Prepare normalized q (%8434) and k (%8444) for the pairwise multiply that follows:
    // upcast to f32 and view the last axis of 128 as 64 x 1 x 2, giving [1,24,4608,64,1,2].
    // The first two conversions are f16 -> f16, so the element type is unchanged.
    %int5_10796 = torch.constant.int 5
    %8445 = torch.prims.convert_element_type %8434, %int5_10796 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_10797 = torch.constant.int 5
    %8446 = torch.prims.convert_element_type %8444, %int5_10797 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // q: f16 -> f32, then view as [1,24,4608,64,1,2].
    %int6_10798 = torch.constant.int 6
    %8447 = torch.prims.convert_element_type %8445, %int6_10798 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_10799 = torch.constant.int 1
    %int24_10800 = torch.constant.int 24
    %int4608_10801 = torch.constant.int 4608
    %int64_10802 = torch.constant.int 64
    %int1_10803 = torch.constant.int 1
    %int2_10804 = torch.constant.int 2
    %8448 = torch.prim.ListConstruct %int1_10799, %int24_10800, %int4608_10801, %int64_10802, %int1_10803, %int2_10804 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8449 = torch.aten.view %8447, %8448 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // k: f16 -> f32, then view as [1,24,4608,64,1,2].
    %int6_10805 = torch.constant.int 6
    %8450 = torch.prims.convert_element_type %8446, %int6_10805 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_10806 = torch.constant.int 1
    %int24_10807 = torch.constant.int 24
    %int4608_10808 = torch.constant.int 4608
    %int64_10809 = torch.constant.int 64
    %int1_10810 = torch.constant.int 1
    %int2_10811 = torch.constant.int 2
    %8451 = torch.prim.ListConstruct %int1_10806, %int24_10807, %int4608_10808, %int64_10809, %int1_10810, %int2_10811 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8452 = torch.aten.view %8450, %8451 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Combine the pair-viewed query %8449 with %211 ([1,1,4608,64,2,2], defined outside this view):
    //   %8459 = %211[..., 0] * q[..., 0] + %211[..., 1] * q[..., 1]   (indices taken along dim 5)
    // The size-1 head axis of %211 and the size-1 axis of q broadcast to [1,24,4608,64,2].
    // NOTE(review): this looks like a rotary position embedding with %211 holding per-position
    // 2x2 rotation entries - confirm against the definition of %211.
    %int5_10812 = torch.constant.int 5
    %int0_10813 = torch.constant.int 0
    %8453 = torch.aten.select.int %211, %int5_10812, %int0_10813 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10814 = torch.constant.int 5
    %int0_10815 = torch.constant.int 0
    %8454 = torch.aten.select.int %8449, %int5_10814, %int0_10815 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8455 = torch.aten.mul.Tensor %8453, %8454 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_10816 = torch.constant.int 5
    %int1_10817 = torch.constant.int 1
    %8456 = torch.aten.select.int %211, %int5_10816, %int1_10817 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10818 = torch.constant.int 5
    %int1_10819 = torch.constant.int 1
    %8457 = torch.aten.select.int %8449, %int5_10818, %int1_10819 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8458 = torch.aten.mul.Tensor %8456, %8457 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two products (alpha = 1).
    %int1_10820 = torch.constant.int 1
    %8459 = torch.aten.add.Tensor %8455, %8458, %int1_10820 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Combine the pair-viewed key %8452 with %211 ([1,1,4608,64,2,2], defined outside this view):
    //   %8466 = %211[..., 0] * k[..., 0] + %211[..., 1] * k[..., 1]   (indices taken along dim 5)
    // The size-1 head axis of %211 and the size-1 axis of k broadcast to [1,24,4608,64,2].
    // NOTE(review): this looks like a rotary position embedding with %211 holding per-position
    // 2x2 rotation entries - confirm against the definition of %211.
    %int5_10821 = torch.constant.int 5
    %int0_10822 = torch.constant.int 0
    %8460 = torch.aten.select.int %211, %int5_10821, %int0_10822 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10823 = torch.constant.int 5
    %int0_10824 = torch.constant.int 0
    %8461 = torch.aten.select.int %8452, %int5_10823, %int0_10824 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8462 = torch.aten.mul.Tensor %8460, %8461 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_10825 = torch.constant.int 5
    %int1_10826 = torch.constant.int 1
    %8463 = torch.aten.select.int %211, %int5_10825, %int1_10826 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10827 = torch.constant.int 5
    %int1_10828 = torch.constant.int 1
    %8464 = torch.aten.select.int %8452, %int5_10827, %int1_10828 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8465 = torch.aten.mul.Tensor %8463, %8464 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two products (alpha = 1).
    %int1_10829 = torch.constant.int 1
    %8466 = torch.aten.add.Tensor %8462, %8465, %int1_10829 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Collapse the trailing 64 x 2 axes of %8459 (q) and %8466 (k) back to 128 and cast the
    // f32 results to f16 (dtype code 5). %8469 and %8472 are the first two tensor operands
    // of the attention op that follows.
    %int1_10830 = torch.constant.int 1
    %int24_10831 = torch.constant.int 24
    %int4608_10832 = torch.constant.int 4608
    %int128_10833 = torch.constant.int 128
    %8467 = torch.prim.ListConstruct %int1_10830, %int24_10831, %int4608_10832, %int128_10833 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8468 = torch.aten.view %8459, %8467 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10834 = torch.constant.int 5
    %8469 = torch.prims.convert_element_type %8468, %int5_10834 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_10835 = torch.constant.int 1
    %int24_10836 = torch.constant.int 24
    %int4608_10837 = torch.constant.int 4608
    %int128_10838 = torch.constant.int 128
    %8470 = torch.prim.ListConstruct %int1_10835, %int24_10836, %int4608_10837, %int128_10838 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8471 = torch.aten.view %8466, %8470 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_10839 = torch.constant.int 5
    %8472 = torch.prims.convert_element_type %8471, %int5_10839 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Attention for single_blocks.18 over q = %8469, k = %8472, v = %8424 (each [1,24,4608,128] f16).
    // The trailing operands are 0.0, false, none, none.
    // NOTE(review): per the PyTorch signature these should be dropout_p, is_causal, attn_mask
    // and scale (i.e. no dropout, non-causal, no mask, default scale) - confirm.
    // Only result #0 is consumed below; the [1,24,4608] f32 second result is not used in this section.
    %float0.000000e00_10840 = torch.constant.float 0.000000e+00
    %false_10841 = torch.constant.bool false
    %none_10842 = torch.constant.none
    %none_10843 = torch.constant.none
    %8473:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%8469, %8472, %8424, %float0.000000e00_10840, %false_10841, %none_10842, %none_10843) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2 ([1,24,4608,128] -> [1,4608,24,128]) ...
    %int0_10844 = torch.constant.int 0
    %int2_10845 = torch.constant.int 2
    %int1_10846 = torch.constant.int 1
    %int3_10847 = torch.constant.int 3
    %8474 = torch.prim.ListConstruct %int0_10844, %int2_10845, %int1_10846, %int3_10847 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8475 = torch.aten.permute %8473#0, %8474 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // ... and merge the trailing 24 x 128 axes into 3072.
    %int1_10848 = torch.constant.int 1
    %int4608_10849 = torch.constant.int 4608
    %int3072_10850 = torch.constant.int 3072
    %8476 = torch.prim.ListConstruct %int1_10848, %int4608_10849, %int3072_10850 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8477 = torch.aten.view %8475, %8476 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Apply GELU with the "tanh" approximation to %8417 ([1,4608,12288], defined outside this view),
    // concatenate it after the attention output %8477 along dim 2 (3072 + 12288 = 15360),
    // and drop the leading size-1 axis so the result can feed a 2-D matmul.
    %str_10851 = torch.constant.str "tanh"
    %8478 = torch.aten.gelu %8417, %str_10851 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %8479 = torch.prim.ListConstruct %8477, %8478 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_10852 = torch.constant.int 2
    %8480 = torch.aten.cat %8479, %int2_10852 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    %int4608_10853 = torch.constant.int 4608
    %int15360_10854 = torch.constant.int 15360
    %8481 = torch.prim.ListConstruct %int4608_10853, %int15360_10854 : (!torch.int, !torch.int) -> !torch.list<int>
    %8482 = torch.aten.view %8480, %8481 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // single_blocks.18 linear2: %8495 = f16( f32(%8482) @ f32(weight^T) + f32(bias) ), viewed as [1,4608,3072].
    // The f16 parameters and activations are upcast to f32 (dtype code 6) so the matmul and
    // bias add are computed in f32; the result is cast back to f16 (dtype code 5).
    %__auto.sampler.single_blocks.18.linear2.weight = util.global.load @__auto.sampler.single_blocks.18.linear2.weight : tensor<3072x15360xf16>
    %8483 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.18.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    // Transpose the [3072,15360] weight to [15360,3072] so it can be the right-hand matmul operand.
    %int0_10855 = torch.constant.int 0
    %int1_10856 = torch.constant.int 1
    %8484 = torch.aten.transpose.int %8483, %int0_10855, %int1_10856 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.18.linear2.bias = util.global.load @__auto.sampler.single_blocks.18.linear2.bias : tensor<3072xf16>
    %8485 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.18.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_10857 = torch.constant.int 6
    %8486 = torch.prims.convert_element_type %8485, %int6_10857 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_10858 = torch.constant.int 6
    %8487 = torch.prims.convert_element_type %8482, %int6_10858 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_10859 = torch.constant.int 6
    %8488 = torch.prims.convert_element_type %8484, %int6_10859 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %8489 = torch.aten.mm %8487, %8488 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Both the product and the bias are multiplied by the integer constant 1 before being added.
    // NOTE(review): presumably left over from an addmm decomposition with alpha = beta = 1 - confirm.
    %int1_10860 = torch.constant.int 1
    %8490 = torch.aten.mul.Scalar %8489, %int1_10860 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_10861 = torch.constant.int 1
    %8491 = torch.aten.mul.Scalar %8486, %int1_10861 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_10862 = torch.constant.int 1
    %8492 = torch.aten.add.Tensor %8490, %8491, %int1_10862 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_10863 = torch.constant.int 5
    %8493 = torch.prims.convert_element_type %8492, %int5_10863 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    // Restore the leading size-1 axis.
    %int1_10864 = torch.constant.int 1
    %int4608_10865 = torch.constant.int 4608
    %int3072_10866 = torch.constant.int 3072
    %8494 = torch.prim.ListConstruct %int1_10864, %int4608_10865, %int3072_10866 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8495 = torch.aten.view %8493, %8494 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Output of single_blocks.18: %8497 = %8372 + %8390 * %8495 (alpha = 1).
    // %8390 ([1,1,3072]) broadcasts over the 4608 axis; %8390 and %8372 are defined outside this view.
    // NOTE(review): presumably %8390 is this block's modulation gate and %8372 the block input
    // (residual) - confirm against their definitions.
    %8496 = torch.aten.mul.Tensor %8390, %8495 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_10867 = torch.constant.int 1
    %8497 = torch.aten.add.Tensor %8372, %8496, %int1_10867 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.19 modulation: %8509 = f16( f32(silu(%119)) @ f32(weight^T) + f32(bias) ), shape [1,9216].
    // %119 ([1,3072] f16) is defined outside this view.
    // NOTE(review): presumably %119 is the shared conditioning vector - confirm.
    %8498 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.19.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.19.modulation.lin.weight : tensor<9216x3072xf16>
    %8499 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.19.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    // Transpose the [9216,3072] weight to [3072,9216] so it can be the right-hand matmul operand.
    %int0_10868 = torch.constant.int 0
    %int1_10869 = torch.constant.int 1
    %8500 = torch.aten.transpose.int %8499, %int0_10868, %int1_10869 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.19.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.19.modulation.lin.bias : tensor<9216xf16>
    %8501 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.19.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, activation and weight to f32 (dtype code 6) for the matmul and bias add.
    %int6_10870 = torch.constant.int 6
    %8502 = torch.prims.convert_element_type %8501, %int6_10870 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_10871 = torch.constant.int 6
    %8503 = torch.prims.convert_element_type %8498, %int6_10871 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_10872 = torch.constant.int 6
    %8504 = torch.prims.convert_element_type %8500, %int6_10872 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %8505 = torch.aten.mm %8503, %8504 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Both the product and the bias are multiplied by the integer constant 1 before being added.
    %int1_10873 = torch.constant.int 1
    %8506 = torch.aten.mul.Scalar %8505, %int1_10873 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_10874 = torch.constant.int 1
    %8507 = torch.aten.mul.Scalar %8502, %int1_10874 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_10875 = torch.constant.int 1
    %8508 = torch.aten.add.Tensor %8506, %8507, %int1_10875 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    // Cast the result back to f16 (dtype code 5).
    %int5_10876 = torch.constant.int 5
    %8509 = torch.prims.convert_element_type %8508, %int5_10876 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Reshape the [1,9216] modulation output %8509 to [1,1,9216] and cut the last axis into
    // three consecutive 3072-wide chunks: %8513 = [0,3072), %8514 = [3072,6144), %8515 = [6144,9216).
    // The two slices with end 9223372036854775807 (INT64_MAX) and step 1 span the whole axis,
    // so they leave the shape unchanged.
    %int0_10877 = torch.constant.int 0
    %int0_10878 = torch.constant.int 0
    %int9223372036854775807_10879 = torch.constant.int 9223372036854775807
    %int1_10880 = torch.constant.int 1
    %8510 = torch.aten.slice.Tensor %8509, %int0_10877, %int0_10878, %int9223372036854775807_10879, %int1_10880 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a size-1 axis at dim 1 so the chunks broadcast against [1,4608,3072] activations.
    %int1_10881 = torch.constant.int 1
    %8511 = torch.aten.unsqueeze %8510, %int1_10881 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int2_10882 = torch.constant.int 2
    %int0_10883 = torch.constant.int 0
    %int9223372036854775807_10884 = torch.constant.int 9223372036854775807
    %int1_10885 = torch.constant.int 1
    %8512 = torch.aten.slice.Tensor %8511, %int2_10882, %int0_10883, %int9223372036854775807_10884, %int1_10885 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Chunk 0: added to the normalized activations further down (shift).
    %int-1_10886 = torch.constant.int -1
    %int0_10887 = torch.constant.int 0
    %int3072_10888 = torch.constant.int 3072
    %int1_10889 = torch.constant.int 1
    %8513 = torch.aten.slice.Tensor %8512, %int-1_10886, %int0_10887, %int3072_10888, %int1_10889 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1: incremented by 1 and multiplied into the normalized activations further down (scale).
    %int-1_10890 = torch.constant.int -1
    %int3072_10891 = torch.constant.int 3072
    %int6144_10892 = torch.constant.int 6144
    %int1_10893 = torch.constant.int 1
    %8514 = torch.aten.slice.Tensor %8512, %int-1_10890, %int3072_10891, %int6144_10892, %int1_10893 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2: not consumed in the nearby code.
    // NOTE(review): presumably the gate applied to this block's output - confirm at its use site.
    %int-1_10894 = torch.constant.int -1
    %int6144_10895 = torch.constant.int 6144
    %int9216_10896 = torch.constant.int 9216
    %int1_10897 = torch.constant.int 1
    %8515 = torch.aten.slice.Tensor %8512, %int-1_10894, %int6144_10895, %int9216_10896, %int1_10897 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %8497 over its last axis without learned affine parameters, then modulate:
    //   %8525 = (1 + %8514) * f16( (x - mean) * rsqrt(var + eps) ) + %8513
    // with x = %8497, var/mean taken over dim 2 (correction = 0, keepdim = true) and eps ~1e-6.
    %int1_10898 = torch.constant.int 1
    %int1_10899 = torch.constant.int 1
    %8516 = torch.aten.add.Scalar %8514, %int1_10898, %int1_10899 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // The statistics are computed on an f32 (dtype code 6) copy of the activations.
    %int6_10900 = torch.constant.int 6
    %8517 = torch.prims.convert_element_type %8497, %int6_10900 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_10901 = torch.constant.int 2
    %8518 = torch.prim.ListConstruct %int2_10901 : (!torch.int) -> !torch.list<int>
    %int0_10902 = torch.constant.int 0
    %true_10903 = torch.constant.bool true
    // %result0 is the variance and %result1 the mean, each of shape [1,4608,1].
    %result0_10904, %result1_10905 = torch.aten.var_mean.correction %8517, %8518, %int0_10902, %true_10903 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    %float9.999990e-07_10906 = torch.constant.float 9.9999999999999995E-7
    %int1_10907 = torch.constant.int 1
    %8519 = torch.aten.add.Scalar %result0_10904, %float9.999990e-07_10906, %int1_10907 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %8520 = torch.aten.rsqrt %8519 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the f16 tensor %8497 and the f32 mean; the result type is f32.
    %int1_10908 = torch.constant.int 1
    %8521 = torch.aten.sub.Tensor %8497, %result1_10905, %int1_10908 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %8522 = torch.aten.mul.Tensor %8521, %8520 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    // Back to f16 (dtype code 5) before applying scale (%8516) and shift (%8513).
    %int5_10909 = torch.constant.int 5
    %8523 = torch.prims.convert_element_type %8522, %int5_10909 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %8524 = torch.aten.mul.Tensor %8516, %8523 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_10910 = torch.constant.int 1
    %8525 = torch.aten.add.Tensor %8524, %8513, %int1_10910 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.19 linear1: %8540 = f16( f32(%8525 as [4608,3072]) @ f32(weight^T) + f32(bias) ),
    // viewed as [1,4608,21504]. The matmul and bias add are computed in f32 (dtype code 6).
    // Drop the leading size-1 axis so the activations are 2-D.
    %int4608_10911 = torch.constant.int 4608
    %int3072_10912 = torch.constant.int 3072
    %8526 = torch.prim.ListConstruct %int4608_10911, %int3072_10912 : (!torch.int, !torch.int) -> !torch.list<int>
    %8527 = torch.aten.view %8525, %8526 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    %__auto.sampler.single_blocks.19.linear1.weight = util.global.load @__auto.sampler.single_blocks.19.linear1.weight : tensor<21504x3072xf16>
    %8528 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.19.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    // Transpose the [21504,3072] weight to [3072,21504] so it can be the right-hand matmul operand.
    %int0_10913 = torch.constant.int 0
    %int1_10914 = torch.constant.int 1
    %8529 = torch.aten.transpose.int %8528, %int0_10913, %int1_10914 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.19.linear1.bias = util.global.load @__auto.sampler.single_blocks.19.linear1.bias : tensor<21504xf16>
    %8530 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.19.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    %int6_10915 = torch.constant.int 6
    %8531 = torch.prims.convert_element_type %8530, %int6_10915 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_10916 = torch.constant.int 6
    %8532 = torch.prims.convert_element_type %8527, %int6_10916 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_10917 = torch.constant.int 6
    %8533 = torch.prims.convert_element_type %8529, %int6_10917 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %8534 = torch.aten.mm %8532, %8533 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Both the product and the bias are multiplied by the integer constant 1 before being added.
    %int1_10918 = torch.constant.int 1
    %8535 = torch.aten.mul.Scalar %8534, %int1_10918 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_10919 = torch.constant.int 1
    %8536 = torch.aten.mul.Scalar %8531, %int1_10919 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_10920 = torch.constant.int 1
    %8537 = torch.aten.add.Tensor %8535, %8536, %int1_10920 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    // Cast back to f16 (dtype code 5) and restore the leading size-1 axis.
    %int5_10921 = torch.constant.int 5
    %8538 = torch.prims.convert_element_type %8537, %int5_10921 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_10922 = torch.constant.int 1
    %int4608_10923 = torch.constant.int 4608
    %int21504_10924 = torch.constant.int 21504
    %8539 = torch.prim.ListConstruct %int1_10922, %int4608_10923, %int21504_10924 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8540 = torch.aten.view %8538, %8539 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the linear1 output %8540 along its last axis:
    //   %8541 = columns [0,9216)      -> reshaped below into a 3 x 24 x 128 layout (packed q/k/v)
    //   %8542 = columns [9216,21504)  -> 12288 wide, not consumed in the nearby code
    // NOTE(review): presumably %8542 is the MLP branch that later goes through GELU - confirm at its use site.
    %int-1_10925 = torch.constant.int -1
    %int0_10926 = torch.constant.int 0
    %int9216_10927 = torch.constant.int 9216
    %int1_10928 = torch.constant.int 1
    %8541 = torch.aten.slice.Tensor %8540, %int-1_10925, %int0_10926, %int9216_10927, %int1_10928 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_10929 = torch.constant.int -1
    %int9216_10930 = torch.constant.int 9216
    %int21504_10931 = torch.constant.int 21504
    %int1_10932 = torch.constant.int 1
    %8542 = torch.aten.slice.Tensor %8540, %int-1_10929, %int9216_10930, %int21504_10931, %int1_10932 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // Unpack %8541 ([1,4608,9216]) into three per-head tensors for single_blocks.19:
    // view as [1,4608,3,24,128], permute with (2,0,3,1,4) to [3,1,24,4608,128],
    // then select indices 0/1/2 along dim 0.
    %int1_10933 = torch.constant.int 1
    %int4608_10934 = torch.constant.int 4608
    %int3_10935 = torch.constant.int 3
    %int24_10936 = torch.constant.int 24
    %int128_10937 = torch.constant.int 128
    %8543 = torch.prim.ListConstruct %int1_10933, %int4608_10934, %int3_10935, %int24_10936, %int128_10937 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8544 = torch.aten.view %8541, %8543 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    %int2_10938 = torch.constant.int 2
    %int0_10939 = torch.constant.int 0
    %int3_10940 = torch.constant.int 3
    %int1_10941 = torch.constant.int 1
    %int4_10942 = torch.constant.int 4
    %8545 = torch.prim.ListConstruct %int2_10938, %int0_10939, %int3_10940, %int1_10941, %int4_10942 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8546 = torch.aten.permute %8544, %8545 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0: later scaled by single_blocks.19 query_norm.scale (the query).
    %int0_10943 = torch.constant.int 0
    %int0_10944 = torch.constant.int 0
    %8547 = torch.aten.select.int %8546, %int0_10943, %int0_10944 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 1: later scaled by single_blocks.19 key_norm.scale (the key).
    %int0_10945 = torch.constant.int 0
    %int1_10946 = torch.constant.int 1
    %8548 = torch.aten.select.int %8546, %int0_10945, %int1_10946 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 2: not consumed in the nearby code.
    // NOTE(review): presumably the attention value tensor - confirm at its use site.
    %int0_10947 = torch.constant.int 0
    %int2_10948 = torch.constant.int 2
    %8549 = torch.aten.select.int %8546, %int0_10947, %int2_10948 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Query RMS normalization for single_blocks.19:
    //   %8559 = f16( x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps) ) * query_norm.scale
    // where x is %8547 upcast to f32 (dtype code 6) and eps is ~1e-6.
    %int6_10949 = torch.constant.int 6
    %8550 = torch.prims.convert_element_type %8547, %int6_10949 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_10950 = torch.constant.int 2
    %8551 = torch.aten.pow.Tensor_Scalar %8550, %int2_10950 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean of squares over the last (size-128) axis, keeping it as size 1 for broadcasting.
    %int-1_10951 = torch.constant.int -1
    %8552 = torch.prim.ListConstruct %int-1_10951 : (!torch.int) -> !torch.list<int>
    %true_10952 = torch.constant.bool true
    %none_10953 = torch.constant.none
    %8553 = torch.aten.mean.dim %8551, %8552, %true_10952, %none_10953 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // Add eps (alpha = 1) before the reciprocal square root.
    %float9.999990e-07_10954 = torch.constant.float 9.9999999999999995E-7
    %int1_10955 = torch.constant.int 1
    %8554 = torch.aten.add.Scalar %8553, %float9.999990e-07_10954, %int1_10955 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %8555 = torch.aten.rsqrt %8554 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %8556 = torch.aten.mul.Tensor %8550, %8555 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    // Cast back to f16 (dtype code 5) before applying the learned per-channel scale.
    %int5_10956 = torch.constant.int 5
    %8557 = torch.prims.convert_element_type %8556, %int5_10956 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.19.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.19.norm.query_norm.scale : tensor<128xf16>
    %8558 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.19.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %8559 = torch.aten.mul.Tensor %8557, %8558 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Key RMS normalization for single_blocks.19:
    //   %8569 = f16( x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps) ) * key_norm.scale
    // where x is %8548 upcast to f32 (dtype code 6) and eps is ~1e-6.
    %int6_10957 = torch.constant.int 6
    %8560 = torch.prims.convert_element_type %8548, %int6_10957 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_10958 = torch.constant.int 2
    %8561 = torch.aten.pow.Tensor_Scalar %8560, %int2_10958 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean of squares over the last (size-128) axis, keeping it as size 1 for broadcasting.
    %int-1_10959 = torch.constant.int -1
    %8562 = torch.prim.ListConstruct %int-1_10959 : (!torch.int) -> !torch.list<int>
    %true_10960 = torch.constant.bool true
    %none_10961 = torch.constant.none
    %8563 = torch.aten.mean.dim %8561, %8562, %true_10960, %none_10961 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // Add eps (alpha = 1) before the reciprocal square root.
    %float9.999990e-07_10962 = torch.constant.float 9.9999999999999995E-7
    %int1_10963 = torch.constant.int 1
    %8564 = torch.aten.add.Scalar %8563, %float9.999990e-07_10962, %int1_10963 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %8565 = torch.aten.rsqrt %8564 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %8566 = torch.aten.mul.Tensor %8560, %8565 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    // Cast back to f16 (dtype code 5) before applying the learned per-channel scale.
    %int5_10964 = torch.constant.int 5
    %8567 = torch.prims.convert_element_type %8566, %int5_10964 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.19.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.19.norm.key_norm.scale : tensor<128xf16>
    %8568 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.19.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %8569 = torch.aten.mul.Tensor %8567, %8568 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Prepare normalized q (%8559) and k (%8569) for the pairwise multiply that follows:
    // upcast to f32 and view the last axis of 128 as 64 x 1 x 2, giving [1,24,4608,64,1,2].
    // The first two conversions are f16 -> f16, so the element type is unchanged.
    %int5_10965 = torch.constant.int 5
    %8570 = torch.prims.convert_element_type %8559, %int5_10965 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_10966 = torch.constant.int 5
    %8571 = torch.prims.convert_element_type %8569, %int5_10966 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // q: f16 -> f32, then view as [1,24,4608,64,1,2].
    %int6_10967 = torch.constant.int 6
    %8572 = torch.prims.convert_element_type %8570, %int6_10967 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_10968 = torch.constant.int 1
    %int24_10969 = torch.constant.int 24
    %int4608_10970 = torch.constant.int 4608
    %int64_10971 = torch.constant.int 64
    %int1_10972 = torch.constant.int 1
    %int2_10973 = torch.constant.int 2
    %8573 = torch.prim.ListConstruct %int1_10968, %int24_10969, %int4608_10970, %int64_10971, %int1_10972, %int2_10973 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8574 = torch.aten.view %8572, %8573 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // k: f16 -> f32, then view as [1,24,4608,64,1,2].
    %int6_10974 = torch.constant.int 6
    %8575 = torch.prims.convert_element_type %8571, %int6_10974 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_10975 = torch.constant.int 1
    %int24_10976 = torch.constant.int 24
    %int4608_10977 = torch.constant.int 4608
    %int64_10978 = torch.constant.int 64
    %int1_10979 = torch.constant.int 1
    %int2_10980 = torch.constant.int 2
    %8576 = torch.prim.ListConstruct %int1_10975, %int24_10976, %int4608_10977, %int64_10978, %int1_10979, %int2_10980 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8577 = torch.aten.view %8575, %8576 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // single_blocks.19: combine %211 with the paired tensor %8574.
    // Computes  %211[..., 0] * %8574[..., 0]  +  %211[..., 1] * %8574[..., 1]
    // where "..." indexes dim 5 (the last dim); the size-1 dims broadcast, giving
    // [1,24,4608,64,2].
    // NOTE(review): %211 ([1,1,4608,64,2,2], f32) is defined outside this excerpt;
    // it looks like a per-position table of 2x2 matrices (rotary position
    // embedding) — confirm against its definition.
    %int5_10981 = torch.constant.int 5
    %int0_10982 = torch.constant.int 0
    %8578 = torch.aten.select.int %211, %int5_10981, %int0_10982 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10983 = torch.constant.int 5
    %int0_10984 = torch.constant.int 0
    %8579 = torch.aten.select.int %8574, %int5_10983, %int0_10984 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8580 = torch.aten.mul.Tensor %8578, %8579 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_10985 = torch.constant.int 5
    %int1_10986 = torch.constant.int 1
    %8581 = torch.aten.select.int %211, %int5_10985, %int1_10986 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10987 = torch.constant.int 5
    %int1_10988 = torch.constant.int 1
    %8582 = torch.aten.select.int %8574, %int5_10987, %int1_10988 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8583 = torch.aten.mul.Tensor %8581, %8582 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two products (the trailing int 1 is the add's alpha multiplier).
    %int1_10989 = torch.constant.int 1
    %8584 = torch.aten.add.Tensor %8580, %8583, %int1_10989 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // single_blocks.19: same combination as for %8574, now for the paired tensor
    // %8577 (derived from the key_norm-scaled tensor %8569):
    //   %211[..., 0] * %8577[..., 0]  +  %211[..., 1] * %8577[..., 1]
    // selecting along dim 5 and broadcasting to [1,24,4608,64,2].
    // NOTE(review): %211 is defined outside this excerpt — presumably the rotary
    // position embedding table; confirm.
    %int5_10990 = torch.constant.int 5
    %int0_10991 = torch.constant.int 0
    %8585 = torch.aten.select.int %211, %int5_10990, %int0_10991 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10992 = torch.constant.int 5
    %int0_10993 = torch.constant.int 0
    %8586 = torch.aten.select.int %8577, %int5_10992, %int0_10993 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8587 = torch.aten.mul.Tensor %8585, %8586 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_10994 = torch.constant.int 5
    %int1_10995 = torch.constant.int 1
    %8588 = torch.aten.select.int %211, %int5_10994, %int1_10995 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_10996 = torch.constant.int 5
    %int1_10997 = torch.constant.int 1
    %8589 = torch.aten.select.int %8577, %int5_10996, %int1_10997 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8590 = torch.aten.mul.Tensor %8588, %8589 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two products (alpha = 1).
    %int1_10998 = torch.constant.int 1
    %8591 = torch.aten.add.Tensor %8587, %8590, %int1_10998 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // single_blocks.19: collapse the [64,2] pair layout of %8584 and %8591 back
    // into a last dimension of 128 ([1,24,4608,128]) and cast both results from
    // f32 to f16 (dtype code 5). %8594 and %8597 become the first and second
    // operands of the attention op that follows.
    %int1_10999 = torch.constant.int 1
    %int24_11000 = torch.constant.int 24
    %int4608_11001 = torch.constant.int 4608
    %int128_11002 = torch.constant.int 128
    %8592 = torch.prim.ListConstruct %int1_10999, %int24_11000, %int4608_11001, %int128_11002 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8593 = torch.aten.view %8584, %8592 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11003 = torch.constant.int 5
    %8594 = torch.prims.convert_element_type %8593, %int5_11003 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_11004 = torch.constant.int 1
    %int24_11005 = torch.constant.int 24
    %int4608_11006 = torch.constant.int 4608
    %int128_11007 = torch.constant.int 128
    %8595 = torch.prim.ListConstruct %int1_11004, %int24_11005, %int4608_11006, %int128_11007 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8596 = torch.aten.view %8591, %8595 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11008 = torch.constant.int 5
    %8597 = torch.prims.convert_element_type %8596, %int5_11008 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.19: scaled-dot-product attention, emitted as an unregistered
    // torch.operator. Tensor operands are %8594, %8597 and %8549 (defined above
    // this excerpt), all [1,24,4608,128] f16; the scalar operands are the float
    // 0.0, the bool false and two `none` values.
    // NOTE(review): per the ATen schema these scalars should be dropout_p,
    // is_causal, attn_mask and scale, in that order — confirm against the
    // PyTorch version used for export.
    // Result #0 is the [1,24,4608,128] f16 output; result #1 ([1,24,4608] f32) is
    // not used within this excerpt.
    %float0.000000e00_11009 = torch.constant.float 0.000000e+00
    %false_11010 = torch.constant.bool false
    %none_11011 = torch.constant.none
    %none_11012 = torch.constant.none
    %8598:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%8594, %8597, %8549, %float0.000000e00_11009, %false_11010, %none_11011, %none_11012) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // single_blocks.19: rearrange the attention output. Permute (0,2,1,3) swaps
    // dims 1 and 2, [1,24,4608,128] -> [1,4608,24,128]; the view then merges the
    // last two dims (24 * 128 = 3072) into [1,4608,3072].
    %int0_11013 = torch.constant.int 0
    %int2_11014 = torch.constant.int 2
    %int1_11015 = torch.constant.int 1
    %int3_11016 = torch.constant.int 3
    %8599 = torch.prim.ListConstruct %int0_11013, %int2_11014, %int1_11015, %int3_11016 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8600 = torch.aten.permute %8598#0, %8599 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_11017 = torch.constant.int 1
    %int4608_11018 = torch.constant.int 4608
    %int3072_11019 = torch.constant.int 3072
    %8601 = torch.prim.ListConstruct %int1_11017, %int4608_11018, %int3072_11019 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8602 = torch.aten.view %8600, %8601 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.19: apply GELU with the "tanh" approximation to %8542
    // ([1,4608,12288], defined above this excerpt — NOTE(review): presumably the
    // MLP slice of this block's linear1 output; confirm), then concatenate the
    // attention result %8602 and the GELU result along dim 2:
    // 3072 + 12288 = 15360 features.
    %str_11020 = torch.constant.str "tanh"
    %8603 = torch.aten.gelu %8542, %str_11020 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %8604 = torch.prim.ListConstruct %8602, %8603 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_11021 = torch.constant.int 2
    %8605 = torch.aten.cat %8604, %int2_11021 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // single_blocks.19.linear2: affine projection 15360 -> 3072.
    //   1. Drop the leading size-1 dim: [1,4608,15360] -> [4608,15360].
    //   2. Load the weight ([3072,15360] f16) and transpose it to [15360,3072];
    //      load the bias ([3072] f16).
    //   3. Cast bias, input and weight to f32 (dtype code 6) and do the matmul in
    //      f32.
    //   4. Both mul.Scalar ops multiply by the integer 1, so they leave the values
    //      unchanged; the bias is then added (alpha = 1).
    //   5. Cast the result back to f16 (dtype code 5) and restore the leading dim:
    //      [1,4608,3072].
    %int4608_11022 = torch.constant.int 4608
    %int15360_11023 = torch.constant.int 15360
    %8606 = torch.prim.ListConstruct %int4608_11022, %int15360_11023 : (!torch.int, !torch.int) -> !torch.list<int>
    %8607 = torch.aten.view %8605, %8606 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    %__auto.sampler.single_blocks.19.linear2.weight = util.global.load @__auto.sampler.single_blocks.19.linear2.weight : tensor<3072x15360xf16>
    %8608 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.19.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_11024 = torch.constant.int 0
    %int1_11025 = torch.constant.int 1
    %8609 = torch.aten.transpose.int %8608, %int0_11024, %int1_11025 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.19.linear2.bias = util.global.load @__auto.sampler.single_blocks.19.linear2.bias : tensor<3072xf16>
    %8610 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.19.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_11026 = torch.constant.int 6
    %8611 = torch.prims.convert_element_type %8610, %int6_11026 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_11027 = torch.constant.int 6
    %8612 = torch.prims.convert_element_type %8607, %int6_11027 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_11028 = torch.constant.int 6
    %8613 = torch.prims.convert_element_type %8609, %int6_11028 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %8614 = torch.aten.mm %8612, %8613 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    %int1_11029 = torch.constant.int 1
    %8615 = torch.aten.mul.Scalar %8614, %int1_11029 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_11030 = torch.constant.int 1
    %8616 = torch.aten.mul.Scalar %8611, %int1_11030 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_11031 = torch.constant.int 1
    %8617 = torch.aten.add.Tensor %8615, %8616, %int1_11031 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_11032 = torch.constant.int 5
    %8618 = torch.prims.convert_element_type %8617, %int5_11032 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_11033 = torch.constant.int 1
    %int4608_11034 = torch.constant.int 4608
    %int3072_11035 = torch.constant.int 3072
    %8619 = torch.prim.ListConstruct %int1_11033, %int4608_11034, %int3072_11035 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8620 = torch.aten.view %8618, %8619 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.19 output: %8622 = %8497 + %8515 * %8620.
    // %8515 ([1,1,3072]) broadcasts over the 4608 positions.
    // NOTE(review): %8515 and %8497 are defined above this excerpt — presumably
    // the block's modulation gate and the block input (residual); confirm.
    %8621 = torch.aten.mul.Tensor %8515, %8620 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_11036 = torch.constant.int 1
    %8622 = torch.aten.add.Tensor %8497, %8621, %int1_11036 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.20.modulation: SiLU followed by an affine projection
    // 3072 -> 9216.
    // NOTE(review): %119 ([1,3072] f16) is defined outside this excerpt —
    // presumably the shared conditioning vector; confirm.
    //   * Weight [9216,3072] is transposed to [3072,9216]; bias is [9216].
    //   * Bias, activation and weight are cast to f32 (dtype code 6); the matmul
    //     and bias add run in f32. The mul.Scalar ops multiply by the integer 1
    //     and leave the values unchanged.
    //   * The [1,9216] result is cast back to f16 (dtype code 5).
    %8623 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.20.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.20.modulation.lin.weight : tensor<9216x3072xf16>
    %8624 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.20.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_11037 = torch.constant.int 0
    %int1_11038 = torch.constant.int 1
    %8625 = torch.aten.transpose.int %8624, %int0_11037, %int1_11038 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.20.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.20.modulation.lin.bias : tensor<9216xf16>
    %8626 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.20.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    %int6_11039 = torch.constant.int 6
    %8627 = torch.prims.convert_element_type %8626, %int6_11039 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_11040 = torch.constant.int 6
    %8628 = torch.prims.convert_element_type %8623, %int6_11040 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_11041 = torch.constant.int 6
    %8629 = torch.prims.convert_element_type %8625, %int6_11041 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %8630 = torch.aten.mm %8628, %8629 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    %int1_11042 = torch.constant.int 1
    %8631 = torch.aten.mul.Scalar %8630, %int1_11042 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_11043 = torch.constant.int 1
    %8632 = torch.aten.mul.Scalar %8627, %int1_11043 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_11044 = torch.constant.int 1
    %8633 = torch.aten.add.Tensor %8631, %8632, %int1_11044 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_11045 = torch.constant.int 5
    %8634 = torch.prims.convert_element_type %8633, %int5_11045 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // single_blocks.20.modulation: insert a size-1 dim and split the 9216 values
    // into three 3072-wide chunks.
    //   * The slices over dim 0 and dim 2 run from 0 to INT64_MAX with step 1, so
    //     they keep the full extent; the unsqueeze at dim 1 gives [1,1,9216].
    //   * %8638 = [..., 0:3072]     — added after the normalisation below.
    //   * %8639 = [..., 3072:6144]  — incremented by 1 and multiplied into the
    //                                 normalised activations below.
    //   * %8640 = [..., 6144:9216]  — not consumed within this excerpt
    //     (NOTE(review): presumably the block's output gate; confirm).
    %int0_11046 = torch.constant.int 0
    %int0_11047 = torch.constant.int 0
    %int9223372036854775807_11048 = torch.constant.int 9223372036854775807
    %int1_11049 = torch.constant.int 1
    %8635 = torch.aten.slice.Tensor %8634, %int0_11046, %int0_11047, %int9223372036854775807_11048, %int1_11049 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    %int1_11050 = torch.constant.int 1
    %8636 = torch.aten.unsqueeze %8635, %int1_11050 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int2_11051 = torch.constant.int 2
    %int0_11052 = torch.constant.int 0
    %int9223372036854775807_11053 = torch.constant.int 9223372036854775807
    %int1_11054 = torch.constant.int 1
    %8637 = torch.aten.slice.Tensor %8636, %int2_11051, %int0_11052, %int9223372036854775807_11053, %int1_11054 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int-1_11055 = torch.constant.int -1
    %int0_11056 = torch.constant.int 0
    %int3072_11057 = torch.constant.int 3072
    %int1_11058 = torch.constant.int 1
    %8638 = torch.aten.slice.Tensor %8637, %int-1_11055, %int0_11056, %int3072_11057, %int1_11058 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_11059 = torch.constant.int -1
    %int3072_11060 = torch.constant.int 3072
    %int6144_11061 = torch.constant.int 6144
    %int1_11062 = torch.constant.int 1
    %8639 = torch.aten.slice.Tensor %8637, %int-1_11059, %int3072_11060, %int6144_11061, %int1_11062 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_11063 = torch.constant.int -1
    %int6144_11064 = torch.constant.int 6144
    %int9216_11065 = torch.constant.int 9216
    %int1_11066 = torch.constant.int 1
    %8640 = torch.aten.slice.Tensor %8637, %int-1_11063, %int6144_11064, %int9216_11065, %int1_11066 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // single_blocks.20: normalise the block input %8622 over its last dim and
    // apply the modulation:  %8650 = (1 + %8639) * normalise(%8622) + %8638.
    //   * %8641 = %8639 + 1.
    //   * var_mean over dim 2 with correction 0 and keepdim = true, computed on an
    //     f32 copy of %8622; result0 is the variance, result1 the mean.
    //   * normalise(x) = (x - mean) * rsqrt(var + 9.9999999999999995E-7).
    //     The subtraction takes the f16 %8622 and the f32 mean and yields f32.
    //   * No learned scale or bias is applied in this normalisation; the result is
    //     cast to f16 (dtype code 5) before the modulation multiply and add.
    %int1_11067 = torch.constant.int 1
    %int1_11068 = torch.constant.int 1
    %8641 = torch.aten.add.Scalar %8639, %int1_11067, %int1_11068 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int6_11069 = torch.constant.int 6
    %8642 = torch.prims.convert_element_type %8622, %int6_11069 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_11070 = torch.constant.int 2
    %8643 = torch.prim.ListConstruct %int2_11070 : (!torch.int) -> !torch.list<int>
    %int0_11071 = torch.constant.int 0
    %true_11072 = torch.constant.bool true
    %result0_11073, %result1_11074 = torch.aten.var_mean.correction %8642, %8643, %int0_11071, %true_11072 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    %float9.999990e-07_11075 = torch.constant.float 9.9999999999999995E-7
    %int1_11076 = torch.constant.int 1
    %8644 = torch.aten.add.Scalar %result0_11073, %float9.999990e-07_11075, %int1_11076 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %8645 = torch.aten.rsqrt %8644 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    %int1_11077 = torch.constant.int 1
    %8646 = torch.aten.sub.Tensor %8622, %result1_11074, %int1_11077 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %8647 = torch.aten.mul.Tensor %8646, %8645 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_11078 = torch.constant.int 5
    %8648 = torch.prims.convert_element_type %8647, %int5_11078 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %8649 = torch.aten.mul.Tensor %8641, %8648 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_11079 = torch.constant.int 1
    %8650 = torch.aten.add.Tensor %8649, %8638, %int1_11079 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.20.linear1: affine projection 3072 -> 21504.
    //   1. Drop the leading size-1 dim: [1,4608,3072] -> [4608,3072].
    //   2. Load the weight ([21504,3072] f16) and transpose it to [3072,21504];
    //      load the bias ([21504] f16).
    //   3. Cast bias, input and weight to f32 (dtype code 6) and do the matmul in
    //      f32.
    //   4. Both mul.Scalar ops multiply by the integer 1 and leave the values
    //      unchanged; the bias is then added (alpha = 1).
    //   5. Cast the result back to f16 (dtype code 5) and restore the leading dim:
    //      [1,4608,21504].
    %int4608_11080 = torch.constant.int 4608
    %int3072_11081 = torch.constant.int 3072
    %8651 = torch.prim.ListConstruct %int4608_11080, %int3072_11081 : (!torch.int, !torch.int) -> !torch.list<int>
    %8652 = torch.aten.view %8650, %8651 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    %__auto.sampler.single_blocks.20.linear1.weight = util.global.load @__auto.sampler.single_blocks.20.linear1.weight : tensor<21504x3072xf16>
    %8653 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.20.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_11082 = torch.constant.int 0
    %int1_11083 = torch.constant.int 1
    %8654 = torch.aten.transpose.int %8653, %int0_11082, %int1_11083 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.20.linear1.bias = util.global.load @__auto.sampler.single_blocks.20.linear1.bias : tensor<21504xf16>
    %8655 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.20.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    %int6_11084 = torch.constant.int 6
    %8656 = torch.prims.convert_element_type %8655, %int6_11084 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_11085 = torch.constant.int 6
    %8657 = torch.prims.convert_element_type %8652, %int6_11085 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_11086 = torch.constant.int 6
    %8658 = torch.prims.convert_element_type %8654, %int6_11086 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %8659 = torch.aten.mm %8657, %8658 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    %int1_11087 = torch.constant.int 1
    %8660 = torch.aten.mul.Scalar %8659, %int1_11087 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_11088 = torch.constant.int 1
    %8661 = torch.aten.mul.Scalar %8656, %int1_11088 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_11089 = torch.constant.int 1
    %8662 = torch.aten.add.Tensor %8660, %8661, %int1_11089 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_11090 = torch.constant.int 5
    %8663 = torch.prims.convert_element_type %8662, %int5_11090 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_11091 = torch.constant.int 1
    %int4608_11092 = torch.constant.int 4608
    %int21504_11093 = torch.constant.int 21504
    %8664 = torch.prim.ListConstruct %int1_11091, %int4608_11092, %int21504_11093 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8665 = torch.aten.view %8663, %8664 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // single_blocks.20: split the linear1 output along the last dim.
    //   * %8666 = [..., 0:9216]      — reshaped into three per-head tensors below.
    //   * %8667 = [..., 9216:21504]  — 12288 wide; not consumed within this
    //     excerpt (NOTE(review): presumably the MLP branch that is later passed
    //     through GELU; confirm).
    %int-1_11094 = torch.constant.int -1
    %int0_11095 = torch.constant.int 0
    %int9216_11096 = torch.constant.int 9216
    %int1_11097 = torch.constant.int 1
    %8666 = torch.aten.slice.Tensor %8665, %int-1_11094, %int0_11095, %int9216_11096, %int1_11097 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_11098 = torch.constant.int -1
    %int9216_11099 = torch.constant.int 9216
    %int21504_11100 = torch.constant.int 21504
    %int1_11101 = torch.constant.int 1
    %8667 = torch.aten.slice.Tensor %8665, %int-1_11098, %int9216_11099, %int21504_11100, %int1_11101 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // single_blocks.20: turn the 9216-wide slice into three [1,24,4608,128]
    // tensors.
    //   * View [1,4608,9216] as [1,4608,3,24,128] (3 * 24 * 128 = 9216).
    //   * Permute (2,0,3,1,4) -> [3,1,24,4608,128].
    //   * Select index 0, 1 and 2 along dim 0:
    //       %8672 feeds the query_norm path, %8673 feeds the key_norm path, and
    //       %8674 is passed unchanged as the third operand of the attention op.
    %int1_11102 = torch.constant.int 1
    %int4608_11103 = torch.constant.int 4608
    %int3_11104 = torch.constant.int 3
    %int24_11105 = torch.constant.int 24
    %int128_11106 = torch.constant.int 128
    %8668 = torch.prim.ListConstruct %int1_11102, %int4608_11103, %int3_11104, %int24_11105, %int128_11106 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8669 = torch.aten.view %8666, %8668 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    %int2_11107 = torch.constant.int 2
    %int0_11108 = torch.constant.int 0
    %int3_11109 = torch.constant.int 3
    %int1_11110 = torch.constant.int 1
    %int4_11111 = torch.constant.int 4
    %8670 = torch.prim.ListConstruct %int2_11107, %int0_11108, %int3_11109, %int1_11110, %int4_11111 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8671 = torch.aten.permute %8669, %8670 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    %int0_11112 = torch.constant.int 0
    %int0_11113 = torch.constant.int 0
    %8672 = torch.aten.select.int %8671, %int0_11112, %int0_11113 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_11114 = torch.constant.int 0
    %int1_11115 = torch.constant.int 1
    %8673 = torch.aten.select.int %8671, %int0_11114, %int1_11115 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_11116 = torch.constant.int 0
    %int2_11117 = torch.constant.int 2
    %8674 = torch.aten.select.int %8671, %int0_11116, %int2_11117 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.20.norm.query_norm: root-mean-square normalisation of %8672
    // over the last dim (128), computed in f32:
    //   y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + 9.9999999999999995E-7)
    // The result is cast to f16 (dtype code 5) and multiplied elementwise by the
    // learned [128] query_norm.scale parameter.
    %int6_11118 = torch.constant.int 6
    %8675 = torch.prims.convert_element_type %8672, %int6_11118 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_11119 = torch.constant.int 2
    %8676 = torch.aten.pow.Tensor_Scalar %8675, %int2_11119 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_11120 = torch.constant.int -1
    %8677 = torch.prim.ListConstruct %int-1_11120 : (!torch.int) -> !torch.list<int>
    %true_11121 = torch.constant.bool true
    %none_11122 = torch.constant.none
    %8678 = torch.aten.mean.dim %8676, %8677, %true_11121, %none_11122 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_11123 = torch.constant.float 9.9999999999999995E-7
    %int1_11124 = torch.constant.int 1
    %8679 = torch.aten.add.Scalar %8678, %float9.999990e-07_11123, %int1_11124 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %8680 = torch.aten.rsqrt %8679 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %8681 = torch.aten.mul.Tensor %8675, %8680 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11125 = torch.constant.int 5
    %8682 = torch.prims.convert_element_type %8681, %int5_11125 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.20.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.20.norm.query_norm.scale : tensor<128xf16>
    %8683 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.20.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %8684 = torch.aten.mul.Tensor %8682, %8683 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.20.norm.key_norm: root-mean-square normalisation of %8673
    // over the last dim (128), computed in f32:
    //   y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + 9.9999999999999995E-7)
    // The result is cast to f16 (dtype code 5) and multiplied elementwise by the
    // learned [128] key_norm.scale parameter.
    %int6_11126 = torch.constant.int 6
    %8685 = torch.prims.convert_element_type %8673, %int6_11126 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_11127 = torch.constant.int 2
    %8686 = torch.aten.pow.Tensor_Scalar %8685, %int2_11127 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_11128 = torch.constant.int -1
    %8687 = torch.prim.ListConstruct %int-1_11128 : (!torch.int) -> !torch.list<int>
    %true_11129 = torch.constant.bool true
    %none_11130 = torch.constant.none
    %8688 = torch.aten.mean.dim %8686, %8687, %true_11129, %none_11130 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_11131 = torch.constant.float 9.9999999999999995E-7
    %int1_11132 = torch.constant.int 1
    %8689 = torch.aten.add.Scalar %8688, %float9.999990e-07_11131, %int1_11132 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %8690 = torch.aten.rsqrt %8689 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %8691 = torch.aten.mul.Tensor %8685, %8690 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11133 = torch.constant.int 5
    %8692 = torch.prims.convert_element_type %8691, %int5_11133 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.20.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.20.norm.key_norm.scale : tensor<128xf16>
    %8693 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.20.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %8694 = torch.aten.mul.Tensor %8692, %8693 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.20: prepare the normalised query (%8684) and key (%8694) for
    // the pairwise multiply with %211 below.
    //   * The first two casts use dtype code 5 on f16 inputs and produce f16, so
    //     they do not change the element type.
    //   * Each tensor is then cast to f32 (dtype code 6) and viewed as
    //     [1,24,4608,64,1,2], i.e. the last dimension of 128 becomes 64 pairs.
    %int5_11134 = torch.constant.int 5
    %8695 = torch.prims.convert_element_type %8684, %int5_11134 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_11135 = torch.constant.int 5
    %8696 = torch.prims.convert_element_type %8694, %int5_11135 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int6_11136 = torch.constant.int 6
    %8697 = torch.prims.convert_element_type %8695, %int6_11136 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_11137 = torch.constant.int 1
    %int24_11138 = torch.constant.int 24
    %int4608_11139 = torch.constant.int 4608
    %int64_11140 = torch.constant.int 64
    %int1_11141 = torch.constant.int 1
    %int2_11142 = torch.constant.int 2
    %8698 = torch.prim.ListConstruct %int1_11137, %int24_11138, %int4608_11139, %int64_11140, %int1_11141, %int2_11142 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8699 = torch.aten.view %8697, %8698 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    %int6_11143 = torch.constant.int 6
    %8700 = torch.prims.convert_element_type %8696, %int6_11143 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_11144 = torch.constant.int 1
    %int24_11145 = torch.constant.int 24
    %int4608_11146 = torch.constant.int 4608
    %int64_11147 = torch.constant.int 64
    %int1_11148 = torch.constant.int 1
    %int2_11149 = torch.constant.int 2
    %8701 = torch.prim.ListConstruct %int1_11144, %int24_11145, %int4608_11146, %int64_11147, %int1_11148, %int2_11149 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8702 = torch.aten.view %8700, %8701 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // single_blocks.20: combine %211 with the paired query tensor %8699.
    // Computes  %211[..., 0] * %8699[..., 0]  +  %211[..., 1] * %8699[..., 1]
    // where "..." indexes dim 5 (the last dim); the size-1 dims broadcast, giving
    // [1,24,4608,64,2].
    // NOTE(review): %211 ([1,1,4608,64,2,2], f32) is defined outside this excerpt;
    // it looks like a per-position table of 2x2 matrices (rotary position
    // embedding) — confirm against its definition.
    %int5_11150 = torch.constant.int 5
    %int0_11151 = torch.constant.int 0
    %8703 = torch.aten.select.int %211, %int5_11150, %int0_11151 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11152 = torch.constant.int 5
    %int0_11153 = torch.constant.int 0
    %8704 = torch.aten.select.int %8699, %int5_11152, %int0_11153 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8705 = torch.aten.mul.Tensor %8703, %8704 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_11154 = torch.constant.int 5
    %int1_11155 = torch.constant.int 1
    %8706 = torch.aten.select.int %211, %int5_11154, %int1_11155 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11156 = torch.constant.int 5
    %int1_11157 = torch.constant.int 1
    %8707 = torch.aten.select.int %8699, %int5_11156, %int1_11157 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8708 = torch.aten.mul.Tensor %8706, %8707 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two products (the trailing int 1 is the add's alpha multiplier).
    %int1_11158 = torch.constant.int 1
    %8709 = torch.aten.add.Tensor %8705, %8708, %int1_11158 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // single_blocks.20: same combination as for the query, now for the paired key
    // tensor %8702:
    //   %211[..., 0] * %8702[..., 0]  +  %211[..., 1] * %8702[..., 1]
    // selecting along dim 5 and broadcasting to [1,24,4608,64,2].
    // NOTE(review): %211 is defined outside this excerpt — presumably the rotary
    // position embedding table; confirm.
    %int5_11159 = torch.constant.int 5
    %int0_11160 = torch.constant.int 0
    %8710 = torch.aten.select.int %211, %int5_11159, %int0_11160 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11161 = torch.constant.int 5
    %int0_11162 = torch.constant.int 0
    %8711 = torch.aten.select.int %8702, %int5_11161, %int0_11162 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8712 = torch.aten.mul.Tensor %8710, %8711 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_11163 = torch.constant.int 5
    %int1_11164 = torch.constant.int 1
    %8713 = torch.aten.select.int %211, %int5_11163, %int1_11164 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11165 = torch.constant.int 5
    %int1_11166 = torch.constant.int 1
    %8714 = torch.aten.select.int %8702, %int5_11165, %int1_11166 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8715 = torch.aten.mul.Tensor %8713, %8714 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two products (alpha = 1).
    %int1_11167 = torch.constant.int 1
    %8716 = torch.aten.add.Tensor %8712, %8715, %int1_11167 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_11168 = torch.constant.int 1
    %int24_11169 = torch.constant.int 24
    %int4608_11170 = torch.constant.int 4608
    %int128_11171 = torch.constant.int 128
    %8717 = torch.prim.ListConstruct %int1_11168, %int24_11169, %int4608_11170, %int128_11171 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8718 = torch.aten.view %8709, %8717 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11172 = torch.constant.int 5
    %8719 = torch.prims.convert_element_type %8718, %int5_11172 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_11173 = torch.constant.int 1
    %int24_11174 = torch.constant.int 24
    %int4608_11175 = torch.constant.int 4608
    %int128_11176 = torch.constant.int 128
    %8720 = torch.prim.ListConstruct %int1_11173, %int24_11174, %int4608_11175, %int128_11176 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8721 = torch.aten.view %8716, %8720 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11177 = torch.constant.int 5
    %8722 = torch.prims.convert_element_type %8721, %int5_11177 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Attention over %8719, %8722 and %8674 (all [1,24,4608,128], f16), emitted as a
    // generic torch.operator. %8674 is produced earlier in this function.
    // NOTE(review): in PyTorch's schema for this op the trailing operands
    // (0.0, false, none, none) are dropout_p, is_causal, attn_mask and scale —
    // confirm against the PyTorch version used for export.
    // The op yields two results; only #0 is consumed in the lines that follow.
    %float0.000000e00_11178 = torch.constant.float 0.000000e+00
    %false_11179 = torch.constant.bool false
    %none_11180 = torch.constant.none
    %none_11181 = torch.constant.none
    %8723:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%8719, %8722, %8674, %float0.000000e00_11178, %false_11179, %none_11180, %none_11181) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2 ([1,24,4608,128] -> [1,4608,24,128]) and merge the last two
    // dims (24 * 128 = 3072).
    %int0_11182 = torch.constant.int 0
    %int2_11183 = torch.constant.int 2
    %int1_11184 = torch.constant.int 1
    %int3_11185 = torch.constant.int 3
    %8724 = torch.prim.ListConstruct %int0_11182, %int2_11183, %int1_11184, %int3_11185 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8725 = torch.aten.permute %8723#0, %8724 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_11186 = torch.constant.int 1
    %int4608_11187 = torch.constant.int 4608
    %int3072_11188 = torch.constant.int 3072
    %8726 = torch.prim.ListConstruct %int1_11186, %int4608_11187, %int3072_11188 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8727 = torch.aten.view %8725, %8726 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // GELU with the "tanh" approximation on %8667 ([1,4608,12288], produced earlier
    // in this function), concatenated after the attention output %8727 along dim 2
    // (3072 + 12288 = 15360), then flattened to 2-D as the left operand of the
    // linear2 matmul.
    %str_11189 = torch.constant.str "tanh"
    %8728 = torch.aten.gelu %8667, %str_11189 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %8729 = torch.prim.ListConstruct %8727, %8728 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_11190 = torch.constant.int 2
    %8730 = torch.aten.cat %8729, %int2_11190 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    %int4608_11191 = torch.constant.int 4608
    %int15360_11192 = torch.constant.int 15360
    %8731 = torch.prim.ListConstruct %int4608_11191, %int15360_11192 : (!torch.int, !torch.int) -> !torch.list<int>
    %8732 = torch.aten.view %8730, %8731 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // single_blocks.20.linear2: %8732 x transpose(weight) + bias, 15360 -> 3072.
    // Weight, bias and activation are stored as f16, widened to f32 for the matmul
    // and bias add (dtype code 6 yields f32 here), and the sum is cast back to f16.
    %__auto.sampler.single_blocks.20.linear2.weight = util.global.load @__auto.sampler.single_blocks.20.linear2.weight : tensor<3072x15360xf16>
    %8733 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.20.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_11193 = torch.constant.int 0
    %int1_11194 = torch.constant.int 1
    %8734 = torch.aten.transpose.int %8733, %int0_11193, %int1_11194 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.20.linear2.bias = util.global.load @__auto.sampler.single_blocks.20.linear2.bias : tensor<3072xf16>
    %8735 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.20.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_11195 = torch.constant.int 6
    %8736 = torch.prims.convert_element_type %8735, %int6_11195 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_11196 = torch.constant.int 6
    %8737 = torch.prims.convert_element_type %8732, %int6_11196 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_11197 = torch.constant.int 6
    %8738 = torch.prims.convert_element_type %8734, %int6_11197 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %8739 = torch.aten.mm %8737, %8738 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Both mul.Scalar ops multiply by the integer constant 1, so values are unchanged.
    // NOTE(review): presumably leftover alpha/beta factors of a decomposed addmm — confirm.
    %int1_11198 = torch.constant.int 1
    %8740 = torch.aten.mul.Scalar %8739, %int1_11198 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_11199 = torch.constant.int 1
    %8741 = torch.aten.mul.Scalar %8736, %int1_11199 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_11200 = torch.constant.int 1
    %8742 = torch.aten.add.Tensor %8740, %8741, %int1_11200 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_11201 = torch.constant.int 5
    %8743 = torch.prims.convert_element_type %8742, %int5_11201 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_11202 = torch.constant.int 1
    %int4608_11203 = torch.constant.int 4608
    %int3072_11204 = torch.constant.int 3072
    %8744 = torch.prim.ListConstruct %int1_11202, %int4608_11203, %int3072_11204 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8745 = torch.aten.view %8743, %8744 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Scale the linear2 output by %8640 ([1,1,3072], broadcast over dim 1) and add it
    // to %8622; both operands are produced earlier in this function.
    // NOTE(review): presumably the block's gate and residual input — confirm.
    // The sum %8747 is the tensor normalised for single_blocks.21 below.
    %8746 = torch.aten.mul.Tensor %8640, %8745 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_11205 = torch.constant.int 1
    %8747 = torch.aten.add.Tensor %8622, %8746, %int1_11205 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.21.modulation.lin: silu(%119) x transpose(weight) + bias,
    // 3072 -> 9216, computed in f32 and cast back to f16.
    // %119 ([1,3072], f16) is produced earlier in this function.
    // NOTE(review): presumably the conditioning vector shared by all blocks — confirm.
    %8748 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.21.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.21.modulation.lin.weight : tensor<9216x3072xf16>
    %8749 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.21.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_11206 = torch.constant.int 0
    %int1_11207 = torch.constant.int 1
    %8750 = torch.aten.transpose.int %8749, %int0_11206, %int1_11207 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.21.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.21.modulation.lin.bias : tensor<9216xf16>
    %8751 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.21.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    %int6_11208 = torch.constant.int 6
    %8752 = torch.prims.convert_element_type %8751, %int6_11208 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_11209 = torch.constant.int 6
    %8753 = torch.prims.convert_element_type %8748, %int6_11209 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_11210 = torch.constant.int 6
    %8754 = torch.prims.convert_element_type %8750, %int6_11210 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %8755 = torch.aten.mm %8753, %8754 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Multiplications by the integer constant 1: values are unchanged.
    %int1_11211 = torch.constant.int 1
    %8756 = torch.aten.mul.Scalar %8755, %int1_11211 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_11212 = torch.constant.int 1
    %8757 = torch.aten.mul.Scalar %8752, %int1_11212 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_11213 = torch.constant.int 1
    %8758 = torch.aten.add.Tensor %8756, %8757, %int1_11213 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_11214 = torch.constant.int 5
    %8759 = torch.prims.convert_element_type %8758, %int5_11214 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Full-range slice on dim 0, unsqueeze at dim 1, full-range slice on dim 2:
    // together these only reshape [1,9216] to [1,1,9216].
    %int0_11215 = torch.constant.int 0
    %int0_11216 = torch.constant.int 0
    %int9223372036854775807_11217 = torch.constant.int 9223372036854775807
    %int1_11218 = torch.constant.int 1
    %8760 = torch.aten.slice.Tensor %8759, %int0_11215, %int0_11216, %int9223372036854775807_11217, %int1_11218 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    %int1_11219 = torch.constant.int 1
    %8761 = torch.aten.unsqueeze %8760, %int1_11219 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int2_11220 = torch.constant.int 2
    %int0_11221 = torch.constant.int 0
    %int9223372036854775807_11222 = torch.constant.int 9223372036854775807
    %int1_11223 = torch.constant.int 1
    %8762 = torch.aten.slice.Tensor %8761, %int2_11220, %int0_11221, %int9223372036854775807_11222, %int1_11223 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Split the last dim into three 3072-wide chunks:
    //   %8763 = [0, 3072)     added after the normalisation below (shift)
    //   %8764 = [3072, 6144)  incremented by 1 to form the multiplier %8766 (scale)
    //   %8765 = [6144, 9216)  multiplies the single_blocks.21 linear2 output (gate)
    %int-1_11224 = torch.constant.int -1
    %int0_11225 = torch.constant.int 0
    %int3072_11226 = torch.constant.int 3072
    %int1_11227 = torch.constant.int 1
    %8763 = torch.aten.slice.Tensor %8762, %int-1_11224, %int0_11225, %int3072_11226, %int1_11227 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_11228 = torch.constant.int -1
    %int3072_11229 = torch.constant.int 3072
    %int6144_11230 = torch.constant.int 6144
    %int1_11231 = torch.constant.int 1
    %8764 = torch.aten.slice.Tensor %8762, %int-1_11228, %int3072_11229, %int6144_11230, %int1_11231 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_11232 = torch.constant.int -1
    %int6144_11233 = torch.constant.int 6144
    %int9216_11234 = torch.constant.int 9216
    %int1_11235 = torch.constant.int 1
    %8765 = torch.aten.slice.Tensor %8762, %int-1_11232, %int6144_11233, %int9216_11234, %int1_11235 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %8766 = 1 + %8764
    %int1_11236 = torch.constant.int 1
    %int1_11237 = torch.constant.int 1
    %8766 = torch.aten.add.Scalar %8764, %int1_11236, %int1_11237 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalise %8747 over its last dim (size 3072) without learned affine parameters:
    // variance (correction = 0) and mean are taken in f32 with keepdim = true, then
    //   %8772 = (%8747 - mean) * rsqrt(var + 9.9999999999999995E-7)
    %int6_11238 = torch.constant.int 6
    %8767 = torch.prims.convert_element_type %8747, %int6_11238 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_11239 = torch.constant.int 2
    %8768 = torch.prim.ListConstruct %int2_11239 : (!torch.int) -> !torch.list<int>
    %int0_11240 = torch.constant.int 0
    %true_11241 = torch.constant.bool true
    %result0_11242, %result1_11243 = torch.aten.var_mean.correction %8767, %8768, %int0_11240, %true_11241 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    %float9.999990e-07_11244 = torch.constant.float 9.9999999999999995E-7
    %int1_11245 = torch.constant.int 1
    %8769 = torch.aten.add.Scalar %result0_11242, %float9.999990e-07_11244, %int1_11245 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %8770 = torch.aten.rsqrt %8769 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the f16 tensor %8747 (not its f32 copy %8767) together
    // with the f32 mean; the result type is f32.
    %int1_11246 = torch.constant.int 1
    %8771 = torch.aten.sub.Tensor %8747, %result1_11243, %int1_11246 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %8772 = torch.aten.mul.Tensor %8771, %8770 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_11247 = torch.constant.int 5
    %8773 = torch.prims.convert_element_type %8772, %int5_11247 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Modulate in f16: %8775 = %8766 * normalised + %8763, with both [1,1,3072]
    // modulation tensors broadcast over dim 1.
    %8774 = torch.aten.mul.Tensor %8766, %8773 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_11248 = torch.constant.int 1
    %8775 = torch.aten.add.Tensor %8774, %8763, %int1_11248 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.21.linear1: flatten %8775 to [4608,3072], then
    // x * transpose(weight) + bias, 3072 -> 21504, computed in f32 and cast to f16.
    %int4608_11249 = torch.constant.int 4608
    %int3072_11250 = torch.constant.int 3072
    %8776 = torch.prim.ListConstruct %int4608_11249, %int3072_11250 : (!torch.int, !torch.int) -> !torch.list<int>
    %8777 = torch.aten.view %8775, %8776 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    %__auto.sampler.single_blocks.21.linear1.weight = util.global.load @__auto.sampler.single_blocks.21.linear1.weight : tensor<21504x3072xf16>
    %8778 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.21.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_11251 = torch.constant.int 0
    %int1_11252 = torch.constant.int 1
    %8779 = torch.aten.transpose.int %8778, %int0_11251, %int1_11252 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.21.linear1.bias = util.global.load @__auto.sampler.single_blocks.21.linear1.bias : tensor<21504xf16>
    %8780 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.21.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    %int6_11253 = torch.constant.int 6
    %8781 = torch.prims.convert_element_type %8780, %int6_11253 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_11254 = torch.constant.int 6
    %8782 = torch.prims.convert_element_type %8777, %int6_11254 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_11255 = torch.constant.int 6
    %8783 = torch.prims.convert_element_type %8779, %int6_11255 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %8784 = torch.aten.mm %8782, %8783 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Multiplications by the integer constant 1: values are unchanged.
    %int1_11256 = torch.constant.int 1
    %8785 = torch.aten.mul.Scalar %8784, %int1_11256 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_11257 = torch.constant.int 1
    %8786 = torch.aten.mul.Scalar %8781, %int1_11257 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_11258 = torch.constant.int 1
    %8787 = torch.aten.add.Tensor %8785, %8786, %int1_11258 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_11259 = torch.constant.int 5
    %8788 = torch.prims.convert_element_type %8787, %int5_11259 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_11260 = torch.constant.int 1
    %int4608_11261 = torch.constant.int 4608
    %int21504_11262 = torch.constant.int 21504
    %8789 = torch.prim.ListConstruct %int1_11260, %int4608_11261, %int21504_11262 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8790 = torch.aten.view %8788, %8789 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the 21504 output features:
    //   %8791 = [0, 9216)      reshaped below into three [1,24,4608,128] tensors
    //   %8792 = [9216, 21504)  12288 wide, passed through GELU before linear2
    %int-1_11263 = torch.constant.int -1
    %int0_11264 = torch.constant.int 0
    %int9216_11265 = torch.constant.int 9216
    %int1_11266 = torch.constant.int 1
    %8791 = torch.aten.slice.Tensor %8790, %int-1_11263, %int0_11264, %int9216_11265, %int1_11266 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_11267 = torch.constant.int -1
    %int9216_11268 = torch.constant.int 9216
    %int21504_11269 = torch.constant.int 21504
    %int1_11270 = torch.constant.int 1
    %8792 = torch.aten.slice.Tensor %8790, %int-1_11267, %int9216_11268, %int21504_11269, %int1_11270 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View the 9216 features of %8791 as (3, 24, 128) and permute with order
    // (2, 0, 3, 1, 4) to [3,1,24,4608,128], then select indices 0, 1, 2 on dim 0.
    //   %8797 (index 0) is scaled by query_norm.scale below,
    //   %8798 (index 1) is scaled by key_norm.scale below,
    //   %8799 (index 2) is passed unmodified as the third operand of the attention op.
    // NOTE(review): i.e. presumably query / key / value with 24 heads of width 128 — confirm.
    %int1_11271 = torch.constant.int 1
    %int4608_11272 = torch.constant.int 4608
    %int3_11273 = torch.constant.int 3
    %int24_11274 = torch.constant.int 24
    %int128_11275 = torch.constant.int 128
    %8793 = torch.prim.ListConstruct %int1_11271, %int4608_11272, %int3_11273, %int24_11274, %int128_11275 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8794 = torch.aten.view %8791, %8793 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    %int2_11276 = torch.constant.int 2
    %int0_11277 = torch.constant.int 0
    %int3_11278 = torch.constant.int 3
    %int1_11279 = torch.constant.int 1
    %int4_11280 = torch.constant.int 4
    %8795 = torch.prim.ListConstruct %int2_11276, %int0_11277, %int3_11278, %int1_11279, %int4_11280 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8796 = torch.aten.permute %8794, %8795 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    %int0_11281 = torch.constant.int 0
    %int0_11282 = torch.constant.int 0
    %8797 = torch.aten.select.int %8796, %int0_11281, %int0_11282 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_11283 = torch.constant.int 0
    %int1_11284 = torch.constant.int 1
    %8798 = torch.aten.select.int %8796, %int0_11283, %int1_11284 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_11285 = torch.constant.int 0
    %int2_11286 = torch.constant.int 2
    %8799 = torch.aten.select.int %8796, %int0_11285, %int2_11286 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // RMS-style normalisation of %8797 over the last dim (size 128), in f32:
    //   x * rsqrt(mean(x^2, dim=-1, keepdim=true) + 9.9999999999999995E-7)
    // The result is cast to f16 and multiplied by the [128] vector
    // single_blocks.21.norm.query_norm.scale.
    %int6_11287 = torch.constant.int 6
    %8800 = torch.prims.convert_element_type %8797, %int6_11287 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_11288 = torch.constant.int 2
    %8801 = torch.aten.pow.Tensor_Scalar %8800, %int2_11288 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_11289 = torch.constant.int -1
    %8802 = torch.prim.ListConstruct %int-1_11289 : (!torch.int) -> !torch.list<int>
    %true_11290 = torch.constant.bool true
    %none_11291 = torch.constant.none
    %8803 = torch.aten.mean.dim %8801, %8802, %true_11290, %none_11291 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_11292 = torch.constant.float 9.9999999999999995E-7
    %int1_11293 = torch.constant.int 1
    %8804 = torch.aten.add.Scalar %8803, %float9.999990e-07_11292, %int1_11293 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %8805 = torch.aten.rsqrt %8804 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %8806 = torch.aten.mul.Tensor %8800, %8805 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11294 = torch.constant.int 5
    %8807 = torch.prims.convert_element_type %8806, %int5_11294 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.21.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.21.norm.query_norm.scale : tensor<128xf16>
    %8808 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.21.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %8809 = torch.aten.mul.Tensor %8807, %8808 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Same normalisation for %8798, scaled by single_blocks.21.norm.key_norm.scale.
    %int6_11295 = torch.constant.int 6
    %8810 = torch.prims.convert_element_type %8798, %int6_11295 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_11296 = torch.constant.int 2
    %8811 = torch.aten.pow.Tensor_Scalar %8810, %int2_11296 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_11297 = torch.constant.int -1
    %8812 = torch.prim.ListConstruct %int-1_11297 : (!torch.int) -> !torch.list<int>
    %true_11298 = torch.constant.bool true
    %none_11299 = torch.constant.none
    %8813 = torch.aten.mean.dim %8811, %8812, %true_11298, %none_11299 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_11300 = torch.constant.float 9.9999999999999995E-7
    %int1_11301 = torch.constant.int 1
    %8814 = torch.aten.add.Scalar %8813, %float9.999990e-07_11300, %int1_11301 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %8815 = torch.aten.rsqrt %8814 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %8816 = torch.aten.mul.Tensor %8810, %8815 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11302 = torch.constant.int 5
    %8817 = torch.prims.convert_element_type %8816, %int5_11302 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.21.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.21.norm.key_norm.scale : tensor<128xf16>
    %8818 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.21.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %8819 = torch.aten.mul.Tensor %8817, %8818 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // f16 -> f16 conversions: source and result element types are identical.
    %int5_11303 = torch.constant.int 5
    %8820 = torch.prims.convert_element_type %8809, %int5_11303 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_11304 = torch.constant.int 5
    %8821 = torch.prims.convert_element_type %8819, %int5_11304 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Widen the normalised query (%8820) and key (%8821) tensors to f32 and view
    // their last dim (128) as (64, 1, 2), i.e. 64 pairs of values.
    %int6_11305 = torch.constant.int 6
    %8822 = torch.prims.convert_element_type %8820, %int6_11305 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_11306 = torch.constant.int 1
    %int24_11307 = torch.constant.int 24
    %int4608_11308 = torch.constant.int 4608
    %int64_11309 = torch.constant.int 64
    %int1_11310 = torch.constant.int 1
    %int2_11311 = torch.constant.int 2
    %8823 = torch.prim.ListConstruct %int1_11306, %int24_11307, %int4608_11308, %int64_11309, %int1_11310, %int2_11311 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8824 = torch.aten.view %8822, %8823 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    %int6_11312 = torch.constant.int 6
    %8825 = torch.prims.convert_element_type %8821, %int6_11312 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_11313 = torch.constant.int 1
    %int24_11314 = torch.constant.int 24
    %int4608_11315 = torch.constant.int 4608
    %int64_11316 = torch.constant.int 64
    %int1_11317 = torch.constant.int 1
    %int2_11318 = torch.constant.int 2
    %8826 = torch.prim.ListConstruct %int1_11313, %int24_11314, %int4608_11315, %int64_11316, %int1_11317, %int2_11318 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8827 = torch.aten.view %8825, %8826 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Query path. With x = %8824 and %211 of shape [1,1,4608,64,2,2]:
    //   %8834[..., i] = %211[..., i, 0] * x[..., 0] + %211[..., i, 1] * x[..., 1]
    // broadcast over the size-24 dim 1.
    // NOTE(review): %211 is produced earlier in this function; presumably it holds
    // the 2x2 rotary position-embedding matrices per position and pair — confirm.
    %int5_11319 = torch.constant.int 5
    %int0_11320 = torch.constant.int 0
    %8828 = torch.aten.select.int %211, %int5_11319, %int0_11320 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11321 = torch.constant.int 5
    %int0_11322 = torch.constant.int 0
    %8829 = torch.aten.select.int %8824, %int5_11321, %int0_11322 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8830 = torch.aten.mul.Tensor %8828, %8829 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_11323 = torch.constant.int 5
    %int1_11324 = torch.constant.int 1
    %8831 = torch.aten.select.int %211, %int5_11323, %int1_11324 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11325 = torch.constant.int 5
    %int1_11326 = torch.constant.int 1
    %8832 = torch.aten.select.int %8824, %int5_11325, %int1_11326 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8833 = torch.aten.mul.Tensor %8831, %8832 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_11327 = torch.constant.int 1
    %8834 = torch.aten.add.Tensor %8830, %8833, %int1_11327 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key path: the same computation with x = %8827, producing %8841.
    %int5_11328 = torch.constant.int 5
    %int0_11329 = torch.constant.int 0
    %8835 = torch.aten.select.int %211, %int5_11328, %int0_11329 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11330 = torch.constant.int 5
    %int0_11331 = torch.constant.int 0
    %8836 = torch.aten.select.int %8827, %int5_11330, %int0_11331 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8837 = torch.aten.mul.Tensor %8835, %8836 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_11332 = torch.constant.int 5
    %int1_11333 = torch.constant.int 1
    %8838 = torch.aten.select.int %211, %int5_11332, %int1_11333 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11334 = torch.constant.int 5
    %int1_11335 = torch.constant.int 1
    %8839 = torch.aten.select.int %8827, %int5_11334, %int1_11335 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8840 = torch.aten.mul.Tensor %8838, %8839 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_11336 = torch.constant.int 1
    %8841 = torch.aten.add.Tensor %8837, %8840, %int1_11336 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Collapse the trailing (64, 2) dims back to 128 and cast both results to f16.
    %int1_11337 = torch.constant.int 1
    %int24_11338 = torch.constant.int 24
    %int4608_11339 = torch.constant.int 4608
    %int128_11340 = torch.constant.int 128
    %8842 = torch.prim.ListConstruct %int1_11337, %int24_11338, %int4608_11339, %int128_11340 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8843 = torch.aten.view %8834, %8842 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11341 = torch.constant.int 5
    %8844 = torch.prims.convert_element_type %8843, %int5_11341 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_11342 = torch.constant.int 1
    %int24_11343 = torch.constant.int 24
    %int4608_11344 = torch.constant.int 4608
    %int128_11345 = torch.constant.int 128
    %8845 = torch.prim.ListConstruct %int1_11342, %int24_11343, %int4608_11344, %int128_11345 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8846 = torch.aten.view %8841, %8845 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11346 = torch.constant.int 5
    %8847 = torch.prims.convert_element_type %8846, %int5_11346 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Attention over the rotated query %8844, rotated key %8847 and the unmodified
    // third projection %8799 (all [1,24,4608,128], f16), emitted as a generic
    // torch.operator.
    // NOTE(review): in PyTorch's schema for this op the trailing operands
    // (0.0, false, none, none) are dropout_p, is_causal, attn_mask and scale —
    // confirm against the PyTorch version used for export.
    // The op yields two results; only #0 is consumed in the lines that follow.
    %float0.000000e00_11347 = torch.constant.float 0.000000e+00
    %false_11348 = torch.constant.bool false
    %none_11349 = torch.constant.none
    %none_11350 = torch.constant.none
    %8848:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%8844, %8847, %8799, %float0.000000e00_11347, %false_11348, %none_11349, %none_11350) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2 ([1,24,4608,128] -> [1,4608,24,128]) and merge the last two
    // dims (24 * 128 = 3072).
    %int0_11351 = torch.constant.int 0
    %int2_11352 = torch.constant.int 2
    %int1_11353 = torch.constant.int 1
    %int3_11354 = torch.constant.int 3
    %8849 = torch.prim.ListConstruct %int0_11351, %int2_11352, %int1_11353, %int3_11354 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8850 = torch.aten.permute %8848#0, %8849 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_11355 = torch.constant.int 1
    %int4608_11356 = torch.constant.int 4608
    %int3072_11357 = torch.constant.int 3072
    %8851 = torch.prim.ListConstruct %int1_11355, %int4608_11356, %int3072_11357 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8852 = torch.aten.view %8850, %8851 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // GELU with the "tanh" approximation on the 12288-wide slice %8792 of the linear1
    // output, concatenated after the attention output %8852 along dim 2
    // (3072 + 12288 = 15360), then flattened to 2-D as the left operand of the
    // linear2 matmul.
    %str_11358 = torch.constant.str "tanh"
    %8853 = torch.aten.gelu %8792, %str_11358 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %8854 = torch.prim.ListConstruct %8852, %8853 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_11359 = torch.constant.int 2
    %8855 = torch.aten.cat %8854, %int2_11359 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    %int4608_11360 = torch.constant.int 4608
    %int15360_11361 = torch.constant.int 15360
    %8856 = torch.prim.ListConstruct %int4608_11360, %int15360_11361 : (!torch.int, !torch.int) -> !torch.list<int>
    %8857 = torch.aten.view %8855, %8856 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // single_blocks.21.linear2: %8857 x transpose(weight) + bias, 15360 -> 3072.
    // Weight, bias and activation are stored as f16, widened to f32 for the matmul
    // and bias add (dtype code 6 yields f32 here), and the sum is cast back to f16.
    %__auto.sampler.single_blocks.21.linear2.weight = util.global.load @__auto.sampler.single_blocks.21.linear2.weight : tensor<3072x15360xf16>
    %8858 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.21.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_11362 = torch.constant.int 0
    %int1_11363 = torch.constant.int 1
    %8859 = torch.aten.transpose.int %8858, %int0_11362, %int1_11363 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.21.linear2.bias = util.global.load @__auto.sampler.single_blocks.21.linear2.bias : tensor<3072xf16>
    %8860 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.21.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_11364 = torch.constant.int 6
    %8861 = torch.prims.convert_element_type %8860, %int6_11364 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_11365 = torch.constant.int 6
    %8862 = torch.prims.convert_element_type %8857, %int6_11365 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_11366 = torch.constant.int 6
    %8863 = torch.prims.convert_element_type %8859, %int6_11366 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %8864 = torch.aten.mm %8862, %8863 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Both mul.Scalar ops multiply by the integer constant 1, so values are unchanged.
    // NOTE(review): presumably leftover alpha/beta factors of a decomposed addmm — confirm.
    %int1_11367 = torch.constant.int 1
    %8865 = torch.aten.mul.Scalar %8864, %int1_11367 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_11368 = torch.constant.int 1
    %8866 = torch.aten.mul.Scalar %8861, %int1_11368 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_11369 = torch.constant.int 1
    %8867 = torch.aten.add.Tensor %8865, %8866, %int1_11369 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_11370 = torch.constant.int 5
    %8868 = torch.prims.convert_element_type %8867, %int5_11370 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_11371 = torch.constant.int 1
    %int4608_11372 = torch.constant.int 4608
    %int3072_11373 = torch.constant.int 3072
    %8869 = torch.prim.ListConstruct %int1_11371, %int4608_11372, %int3072_11373 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8870 = torch.aten.view %8868, %8869 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Scale the linear2 output by %8765, the [6144, 9216) slice of this block's
    // modulation output, broadcast over dim 1.
    %8871 = torch.aten.mul.Tensor %8765, %8870 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_11374 = torch.constant.int 1
    %8872 = torch.aten.add.Tensor %8747, %8871, %int1_11374 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.22 modulation: SiLU of %119 ([1,3072], defined above this excerpt) followed by the
    // modulation.lin projection to 9216 features, which is then split into three 3072-wide chunks.
    %8873 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Weight is stored [9216,3072] and transposed to [3072,9216] for the matmul.
    %__auto.sampler.single_blocks.22.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.22.modulation.lin.weight : tensor<9216x3072xf16>
    %8874 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.22.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_11375 = torch.constant.int 0
    %int1_11376 = torch.constant.int 1
    %8875 = torch.aten.transpose.int %8874, %int0_11375, %int1_11376 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.22.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.22.modulation.lin.bias : tensor<9216xf16>
    %8876 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.22.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Bias, input and weight are converted to f32 (dtype code 6); the matmul and bias add run in f32.
    %int6_11377 = torch.constant.int 6
    %8877 = torch.prims.convert_element_type %8876, %int6_11377 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_11378 = torch.constant.int 6
    %8878 = torch.prims.convert_element_type %8873, %int6_11378 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_11379 = torch.constant.int 6
    %8879 = torch.prims.convert_element_type %8875, %int6_11379 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %8880 = torch.aten.mm %8878, %8879 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // The scalar multiplies use the constant 1 and the add uses alpha = 1: plain matmul + bias.
    %int1_11380 = torch.constant.int 1
    %8881 = torch.aten.mul.Scalar %8880, %int1_11380 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_11381 = torch.constant.int 1
    %8882 = torch.aten.mul.Scalar %8877, %int1_11381 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_11382 = torch.constant.int 1
    %8883 = torch.aten.add.Tensor %8881, %8882, %int1_11382 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    // Back to f16 (dtype code 5).
    %int5_11383 = torch.constant.int 5
    %8884 = torch.prims.convert_element_type %8883, %int5_11383 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1), unsqueeze at dim 1, then a full-range
    // slice on dim 2: net effect per the result types is [1,9216] -> [1,1,9216].
    %int0_11384 = torch.constant.int 0
    %int0_11385 = torch.constant.int 0
    %int9223372036854775807_11386 = torch.constant.int 9223372036854775807
    %int1_11387 = torch.constant.int 1
    %8885 = torch.aten.slice.Tensor %8884, %int0_11384, %int0_11385, %int9223372036854775807_11386, %int1_11387 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    %int1_11388 = torch.constant.int 1
    %8886 = torch.aten.unsqueeze %8885, %int1_11388 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int2_11389 = torch.constant.int 2
    %int0_11390 = torch.constant.int 0
    %int9223372036854775807_11391 = torch.constant.int 9223372036854775807
    %int1_11392 = torch.constant.int 1
    %8887 = torch.aten.slice.Tensor %8886, %int2_11389, %int0_11390, %int9223372036854775807_11391, %int1_11392 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Chunk 0 = features [0, 3072): added after the normalized input is scaled (used as %8888 below).
    %int-1_11393 = torch.constant.int -1
    %int0_11394 = torch.constant.int 0
    %int3072_11395 = torch.constant.int 3072
    %int1_11396 = torch.constant.int 1
    %8888 = torch.aten.slice.Tensor %8887, %int-1_11393, %int0_11394, %int3072_11395, %int1_11396 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 1 = features [3072, 6144): turned into a multiplicative factor (1 + chunk) at the end of this stage.
    %int-1_11397 = torch.constant.int -1
    %int3072_11398 = torch.constant.int 3072
    %int6144_11399 = torch.constant.int 6144
    %int1_11400 = torch.constant.int 1
    %8889 = torch.aten.slice.Tensor %8887, %int-1_11397, %int3072_11398, %int6144_11399, %int1_11400 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk 2 = features [6144, 9216): multiplies the linear2 output before the residual add of this block.
    %int-1_11401 = torch.constant.int -1
    %int6144_11402 = torch.constant.int 6144
    %int9216_11403 = torch.constant.int 9216
    %int1_11404 = torch.constant.int 1
    %8890 = torch.aten.slice.Tensor %8887, %int-1_11401, %int6144_11402, %int9216_11403, %int1_11404 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %8891 = chunk 1 + 1 (scalar 1, alpha 1).
    %int1_11405 = torch.constant.int 1
    %int1_11406 = torch.constant.int 1
    %8891 = torch.aten.add.Scalar %8889, %int1_11405, %int1_11406 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // single_blocks.22 pre-normalization: normalize %8872 over its last dim (size 3072) with f32
    // statistics, then apply the modulation factors: %8891 * normalized + %8888.
    // No learned weight or bias is applied in this stage.
    %int6_11407 = torch.constant.int 6
    %8892 = torch.prims.convert_element_type %8872, %int6_11407 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    // Variance (correction = 0) and mean over dim 2 with keepdim = true -> two [1,4608,1] tensors.
    %int2_11408 = torch.constant.int 2
    %8893 = torch.prim.ListConstruct %int2_11408 : (!torch.int) -> !torch.list<int>
    %int0_11409 = torch.constant.int 0
    %true_11410 = torch.constant.bool true
    %result0_11411, %result1_11412 = torch.aten.var_mean.correction %8892, %8893, %int0_11409, %true_11410 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // rsqrt(variance + epsilon), with the epsilon constant approximately 1e-6.
    %float9.999990e-07_11413 = torch.constant.float 9.9999999999999995E-7
    %int1_11414 = torch.constant.int 1
    %8894 = torch.aten.add.Scalar %result0_11411, %float9.999990e-07_11413, %int1_11414 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %8895 = torch.aten.rsqrt %8894 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the f16 tensor %8872 (not its f32 copy %8892) and the f32 mean; the
    // declared result type is f32.
    %int1_11415 = torch.constant.int 1
    %8896 = torch.aten.sub.Tensor %8872, %result1_11412, %int1_11415 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %8897 = torch.aten.mul.Tensor %8896, %8895 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    // Cast the normalized activations to f16 (dtype code 5) before applying the modulation.
    %int5_11416 = torch.constant.int 5
    %8898 = torch.prims.convert_element_type %8897, %int5_11416 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Broadcast the [1,1,3072] modulation factors over the 4608 positions: multiply by %8891, add %8888.
    %8899 = torch.aten.mul.Tensor %8891, %8898 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_11417 = torch.constant.int 1
    %8900 = torch.aten.add.Tensor %8899, %8888, %int1_11417 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.22 linear1: project the modulated activations from 3072 to 21504 features, then
    // split the result into a 9216-wide part (reshaped into three per-head tensors further down) and a
    // 12288-wide part (fed to GELU further down).
    // Flatten [1,4608,3072] to [4608,3072] for the 2-D matmul.
    %int4608_11418 = torch.constant.int 4608
    %int3072_11419 = torch.constant.int 3072
    %8901 = torch.prim.ListConstruct %int4608_11418, %int3072_11419 : (!torch.int, !torch.int) -> !torch.list<int>
    %8902 = torch.aten.view %8900, %8901 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // Weight is stored [21504,3072] and transposed to [3072,21504].
    %__auto.sampler.single_blocks.22.linear1.weight = util.global.load @__auto.sampler.single_blocks.22.linear1.weight : tensor<21504x3072xf16>
    %8903 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.22.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_11420 = torch.constant.int 0
    %int1_11421 = torch.constant.int 1
    %8904 = torch.aten.transpose.int %8903, %int0_11420, %int1_11421 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.22.linear1.bias = util.global.load @__auto.sampler.single_blocks.22.linear1.bias : tensor<21504xf16>
    %8905 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.22.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Convert bias, input and weight to f32 (dtype code 6); matmul and bias add run in f32.
    %int6_11422 = torch.constant.int 6
    %8906 = torch.prims.convert_element_type %8905, %int6_11422 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_11423 = torch.constant.int 6
    %8907 = torch.prims.convert_element_type %8902, %int6_11423 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_11424 = torch.constant.int 6
    %8908 = torch.prims.convert_element_type %8904, %int6_11424 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %8909 = torch.aten.mm %8907, %8908 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Scalar multiplies by the constant 1 and an add with alpha = 1: plain matmul + bias.
    %int1_11425 = torch.constant.int 1
    %8910 = torch.aten.mul.Scalar %8909, %int1_11425 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_11426 = torch.constant.int 1
    %8911 = torch.aten.mul.Scalar %8906, %int1_11426 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_11427 = torch.constant.int 1
    %8912 = torch.aten.add.Tensor %8910, %8911, %int1_11427 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    // Back to f16 (dtype code 5) and restore the leading unit dim.
    %int5_11428 = torch.constant.int 5
    %8913 = torch.prims.convert_element_type %8912, %int5_11428 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_11429 = torch.constant.int 1
    %int4608_11430 = torch.constant.int 4608
    %int21504_11431 = torch.constant.int 21504
    %8914 = torch.prim.ListConstruct %int1_11429, %int4608_11430, %int21504_11431 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8915 = torch.aten.view %8913, %8914 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Features [0, 9216) of the last dim.
    %int-1_11432 = torch.constant.int -1
    %int0_11433 = torch.constant.int 0
    %int9216_11434 = torch.constant.int 9216
    %int1_11435 = torch.constant.int 1
    %8916 = torch.aten.slice.Tensor %8915, %int-1_11432, %int0_11433, %int9216_11434, %int1_11435 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    // Features [9216, 21504) of the last dim (12288 wide).
    %int-1_11436 = torch.constant.int -1
    %int9216_11437 = torch.constant.int 9216
    %int21504_11438 = torch.constant.int 21504
    %int1_11439 = torch.constant.int 1
    %8917 = torch.aten.slice.Tensor %8915, %int-1_11436, %int9216_11437, %int21504_11438, %int1_11439 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // Split the 9216-wide slice %8916 into three [1,24,4608,128] tensors.
    // View 9216 as (3, 24, 128) -> [1,4608,3,24,128].
    %int1_11440 = torch.constant.int 1
    %int4608_11441 = torch.constant.int 4608
    %int3_11442 = torch.constant.int 3
    %int24_11443 = torch.constant.int 24
    %int128_11444 = torch.constant.int 128
    %8918 = torch.prim.ListConstruct %int1_11440, %int4608_11441, %int3_11442, %int24_11443, %int128_11444 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8919 = torch.aten.view %8916, %8918 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2, 0, 3, 1, 4) -> [3,1,24,4608,128], moving the size-3 dim to the front.
    %int2_11445 = torch.constant.int 2
    %int0_11446 = torch.constant.int 0
    %int3_11447 = torch.constant.int 3
    %int1_11448 = torch.constant.int 1
    %int4_11449 = torch.constant.int 4
    %8920 = torch.prim.ListConstruct %int2_11445, %int0_11446, %int3_11447, %int1_11448, %int4_11449 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8921 = torch.aten.permute %8919, %8920 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0 along dim 0: later normalized with norm.query_norm.scale.
    %int0_11450 = torch.constant.int 0
    %int0_11451 = torch.constant.int 0
    %8922 = torch.aten.select.int %8921, %int0_11450, %int0_11451 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 1 along dim 0: later normalized with norm.key_norm.scale.
    %int0_11452 = torch.constant.int 0
    %int1_11453 = torch.constant.int 1
    %8923 = torch.aten.select.int %8921, %int0_11452, %int1_11453 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 2 along dim 0: passed unmodified as the third tensor operand of the attention op.
    %int0_11454 = torch.constant.int 0
    %int2_11455 = torch.constant.int 2
    %8924 = torch.aten.select.int %8921, %int0_11454, %int2_11455 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Root-mean-square normalization of %8922 and %8923 over the last dim (size 128):
    //   y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps) computed in f32, cast to f16,
    //   then multiplied by a learned [128] scale. No mean is subtracted.
    // --- first tensor (%8922), scaled by norm.query_norm.scale ---
    %int6_11456 = torch.constant.int 6
    %8925 = torch.prims.convert_element_type %8922, %int6_11456 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_11457 = torch.constant.int 2
    %8926 = torch.aten.pow.Tensor_Scalar %8925, %int2_11457 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean of squares over dim -1 with keepdim = true; the dtype argument is none.
    %int-1_11458 = torch.constant.int -1
    %8927 = torch.prim.ListConstruct %int-1_11458 : (!torch.int) -> !torch.list<int>
    %true_11459 = torch.constant.bool true
    %none_11460 = torch.constant.none
    %8928 = torch.aten.mean.dim %8926, %8927, %true_11459, %none_11460 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // Epsilon constant is approximately 1e-6.
    %float9.999990e-07_11461 = torch.constant.float 9.9999999999999995E-7
    %int1_11462 = torch.constant.int 1
    %8929 = torch.aten.add.Scalar %8928, %float9.999990e-07_11461, %int1_11462 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %8930 = torch.aten.rsqrt %8929 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %8931 = torch.aten.mul.Tensor %8925, %8930 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11463 = torch.constant.int 5
    %8932 = torch.prims.convert_element_type %8931, %int5_11463 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.22.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.22.norm.query_norm.scale : tensor<128xf16>
    %8933 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.22.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %8934 = torch.aten.mul.Tensor %8932, %8933 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // --- second tensor (%8923), same sequence of ops, scaled by norm.key_norm.scale ---
    %int6_11464 = torch.constant.int 6
    %8935 = torch.prims.convert_element_type %8923, %int6_11464 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_11465 = torch.constant.int 2
    %8936 = torch.aten.pow.Tensor_Scalar %8935, %int2_11465 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_11466 = torch.constant.int -1
    %8937 = torch.prim.ListConstruct %int-1_11466 : (!torch.int) -> !torch.list<int>
    %true_11467 = torch.constant.bool true
    %none_11468 = torch.constant.none
    %8938 = torch.aten.mean.dim %8936, %8937, %true_11467, %none_11468 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_11469 = torch.constant.float 9.9999999999999995E-7
    %int1_11470 = torch.constant.int 1
    %8939 = torch.aten.add.Scalar %8938, %float9.999990e-07_11469, %int1_11470 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %8940 = torch.aten.rsqrt %8939 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %8941 = torch.aten.mul.Tensor %8935, %8940 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11471 = torch.constant.int 5
    %8942 = torch.prims.convert_element_type %8941, %int5_11471 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.22.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.22.norm.key_norm.scale : tensor<128xf16>
    %8943 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.22.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %8944 = torch.aten.mul.Tensor %8942, %8943 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Combine the normalized tensors %8934 and %8944 with the table %211 ([1,1,4608,64,2,2], f32,
    // defined above this excerpt). For each tensor x viewed as [1,24,4608,64,1,2]:
    //   out = %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]   -> [1,24,4608,64,2]
    // which is then flattened back to [1,24,4608,128] and cast to f16.
    // NOTE(review): %211 is presumably the rotary position-embedding table - confirm at its definition.
    // These two casts convert f16 to f16 (dtype code 5), so the element type is unchanged.
    %int5_11472 = torch.constant.int 5
    %8945 = torch.prims.convert_element_type %8934, %int5_11472 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_11473 = torch.constant.int 5
    %8946 = torch.prims.convert_element_type %8944, %int5_11473 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // First tensor: cast to f32 and view the 128 features as (64, 1, 2).
    %int6_11474 = torch.constant.int 6
    %8947 = torch.prims.convert_element_type %8945, %int6_11474 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_11475 = torch.constant.int 1
    %int24_11476 = torch.constant.int 24
    %int4608_11477 = torch.constant.int 4608
    %int64_11478 = torch.constant.int 64
    %int1_11479 = torch.constant.int 1
    %int2_11480 = torch.constant.int 2
    %8948 = torch.prim.ListConstruct %int1_11475, %int24_11476, %int4608_11477, %int64_11478, %int1_11479, %int2_11480 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8949 = torch.aten.view %8947, %8948 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Second tensor: same cast and view.
    %int6_11481 = torch.constant.int 6
    %8950 = torch.prims.convert_element_type %8946, %int6_11481 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_11482 = torch.constant.int 1
    %int24_11483 = torch.constant.int 24
    %int4608_11484 = torch.constant.int 4608
    %int64_11485 = torch.constant.int 64
    %int1_11486 = torch.constant.int 1
    %int2_11487 = torch.constant.int 2
    %8951 = torch.prim.ListConstruct %int1_11482, %int24_11483, %int4608_11484, %int64_11485, %int1_11486, %int2_11487 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8952 = torch.aten.view %8950, %8951 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // First tensor, term 0: index 0 of dim 5 from both the table and the viewed tensor; the product
    // broadcasts the table over the 24 heads and the tensor over the table's size-2 dim.
    %int5_11488 = torch.constant.int 5
    %int0_11489 = torch.constant.int 0
    %8953 = torch.aten.select.int %211, %int5_11488, %int0_11489 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11490 = torch.constant.int 5
    %int0_11491 = torch.constant.int 0
    %8954 = torch.aten.select.int %8949, %int5_11490, %int0_11491 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8955 = torch.aten.mul.Tensor %8953, %8954 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // First tensor, term 1: index 1 of dim 5 from both operands.
    %int5_11492 = torch.constant.int 5
    %int1_11493 = torch.constant.int 1
    %8956 = torch.aten.select.int %211, %int5_11492, %int1_11493 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11494 = torch.constant.int 5
    %int1_11495 = torch.constant.int 1
    %8957 = torch.aten.select.int %8949, %int5_11494, %int1_11495 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8958 = torch.aten.mul.Tensor %8956, %8957 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two terms for the first tensor.
    %int1_11496 = torch.constant.int 1
    %8959 = torch.aten.add.Tensor %8955, %8958, %int1_11496 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second tensor, term 0 (the table slices are recomputed rather than reused).
    %int5_11497 = torch.constant.int 5
    %int0_11498 = torch.constant.int 0
    %8960 = torch.aten.select.int %211, %int5_11497, %int0_11498 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11499 = torch.constant.int 5
    %int0_11500 = torch.constant.int 0
    %8961 = torch.aten.select.int %8952, %int5_11499, %int0_11500 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8962 = torch.aten.mul.Tensor %8960, %8961 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second tensor, term 1.
    %int5_11501 = torch.constant.int 5
    %int1_11502 = torch.constant.int 1
    %8963 = torch.aten.select.int %211, %int5_11501, %int1_11502 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11503 = torch.constant.int 5
    %int1_11504 = torch.constant.int 1
    %8964 = torch.aten.select.int %8952, %int5_11503, %int1_11504 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %8965 = torch.aten.mul.Tensor %8963, %8964 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two terms for the second tensor.
    %int1_11505 = torch.constant.int 1
    %8966 = torch.aten.add.Tensor %8962, %8965, %int1_11505 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Flatten (64, 2) back to 128 and cast to f16 (dtype code 5): first tensor.
    %int1_11506 = torch.constant.int 1
    %int24_11507 = torch.constant.int 24
    %int4608_11508 = torch.constant.int 4608
    %int128_11509 = torch.constant.int 128
    %8967 = torch.prim.ListConstruct %int1_11506, %int24_11507, %int4608_11508, %int128_11509 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8968 = torch.aten.view %8959, %8967 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11510 = torch.constant.int 5
    %8969 = torch.prims.convert_element_type %8968, %int5_11510 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Same flatten and cast for the second tensor.
    %int1_11511 = torch.constant.int 1
    %int24_11512 = torch.constant.int 24
    %int4608_11513 = torch.constant.int 4608
    %int128_11514 = torch.constant.int 128
    %8970 = torch.prim.ListConstruct %int1_11511, %int24_11512, %int4608_11513, %int128_11514 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8971 = torch.aten.view %8966, %8970 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11515 = torch.constant.int 5
    %8972 = torch.prims.convert_element_type %8971, %int5_11515 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Attention over 24 heads of width 128 across 4608 positions, invoked through the generic
    // torch.operator form. Tensor operands are %8969, %8972 (position-encoded) and %8924 (untouched
    // third slice of linear1). The remaining operands are 0.0, false, none, none.
    // NOTE(review): per the PyTorch schema for this op these are presumably dropout_p, is_causal,
    // attn_mask and scale - confirm against the op definition.
    %float0.000000e00_11516 = torch.constant.float 0.000000e+00
    %false_11517 = torch.constant.bool false
    %none_11518 = torch.constant.none
    %none_11519 = torch.constant.none
    // Two results are produced; only result #0 ([1,24,4608,128], f16) is consumed in this excerpt.
    %8973:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%8969, %8972, %8924, %float0.000000e00_11516, %false_11517, %none_11518, %none_11519) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Permute with order (0, 2, 1, 3): [1,24,4608,128] -> [1,4608,24,128].
    %int0_11520 = torch.constant.int 0
    %int2_11521 = torch.constant.int 2
    %int1_11522 = torch.constant.int 1
    %int3_11523 = torch.constant.int 3
    %8974 = torch.prim.ListConstruct %int0_11520, %int2_11521, %int1_11522, %int3_11523 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8975 = torch.aten.permute %8973#0, %8974 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the (24, 128) head dims into 3072 features.
    %int1_11524 = torch.constant.int 1
    %int4608_11525 = torch.constant.int 4608
    %int3072_11526 = torch.constant.int 3072
    %8976 = torch.prim.ListConstruct %int1_11524, %int4608_11525, %int3072_11526 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8977 = torch.aten.view %8975, %8976 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.22 output stage: GELU on the 12288-wide slice of linear1, concatenation with the
    // attention features, the linear2 projection back to 3072, and the scaled residual add.
    // GELU with the "tanh" approximation on %8917 (features [9216, 21504) of the linear1 output).
    %str_11527 = torch.constant.str "tanh"
    %8978 = torch.aten.gelu %8917, %str_11527 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    // Concatenate [attention features, GELU output] along dim 2: 3072 + 12288 = 15360.
    %8979 = torch.prim.ListConstruct %8977, %8978 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_11528 = torch.constant.int 2
    %8980 = torch.aten.cat %8979, %int2_11528 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Flatten to [4608,15360] for the 2-D matmul.
    %int4608_11529 = torch.constant.int 4608
    %int15360_11530 = torch.constant.int 15360
    %8981 = torch.prim.ListConstruct %int4608_11529, %int15360_11530 : (!torch.int, !torch.int) -> !torch.list<int>
    %8982 = torch.aten.view %8980, %8981 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // linear2 weight is stored [3072,15360] and transposed to [15360,3072].
    %__auto.sampler.single_blocks.22.linear2.weight = util.global.load @__auto.sampler.single_blocks.22.linear2.weight : tensor<3072x15360xf16>
    %8983 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.22.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_11531 = torch.constant.int 0
    %int1_11532 = torch.constant.int 1
    %8984 = torch.aten.transpose.int %8983, %int0_11531, %int1_11532 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.22.linear2.bias = util.global.load @__auto.sampler.single_blocks.22.linear2.bias : tensor<3072xf16>
    %8985 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.22.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Convert bias, input and weight to f32 (dtype code 6); matmul and bias add run in f32.
    %int6_11533 = torch.constant.int 6
    %8986 = torch.prims.convert_element_type %8985, %int6_11533 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_11534 = torch.constant.int 6
    %8987 = torch.prims.convert_element_type %8982, %int6_11534 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_11535 = torch.constant.int 6
    %8988 = torch.prims.convert_element_type %8984, %int6_11535 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %8989 = torch.aten.mm %8987, %8988 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Scalar multiplies by the constant 1 and an add with alpha = 1: plain matmul + bias.
    %int1_11536 = torch.constant.int 1
    %8990 = torch.aten.mul.Scalar %8989, %int1_11536 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_11537 = torch.constant.int 1
    %8991 = torch.aten.mul.Scalar %8986, %int1_11537 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_11538 = torch.constant.int 1
    %8992 = torch.aten.add.Tensor %8990, %8991, %int1_11538 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    // Back to f16 (dtype code 5) and restore the leading unit dim.
    %int5_11539 = torch.constant.int 5
    %8993 = torch.prims.convert_element_type %8992, %int5_11539 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_11540 = torch.constant.int 1
    %int4608_11541 = torch.constant.int 4608
    %int3072_11542 = torch.constant.int 3072
    %8994 = torch.prim.ListConstruct %int1_11540, %int4608_11541, %int3072_11542 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %8995 = torch.aten.view %8993, %8994 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Residual update: the third modulation chunk %8890 ([1,1,3072]) is broadcast-multiplied with the
    // projection and the product is added to the block input %8872, giving the block output %8997.
    %8996 = torch.aten.mul.Tensor %8890, %8995 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_11543 = torch.constant.int 1
    %8997 = torch.aten.add.Tensor %8872, %8996, %int1_11543 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.23 modulation projection: silu(%119) is multiplied by the
    // transposed modulation.lin weight and the bias is added. The arithmetic is
    // done in f32 and the [1,9216] result is cast back to f16 (%9009).
    // %119 is defined outside this excerpt.
    %8998 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.23.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.23.modulation.lin.weight : tensor<9216x3072xf16>
    %8999 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.23.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    // Swap dims 0 and 1 so the weight becomes [3072,9216], the right-hand operand of the mm below.
    %int0_11544 = torch.constant.int 0
    %int1_11545 = torch.constant.int 1
    %9000 = torch.aten.transpose.int %8999, %int0_11544, %int1_11545 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.23.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.23.modulation.lin.bias : tensor<9216xf16>
    %9001 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.23.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Convert bias, activation and weight to f32 (dtype code 6, see the result types).
    %int6_11546 = torch.constant.int 6
    %9002 = torch.prims.convert_element_type %9001, %int6_11546 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_11547 = torch.constant.int 6
    %9003 = torch.prims.convert_element_type %8998, %int6_11547 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_11548 = torch.constant.int 6
    %9004 = torch.prims.convert_element_type %9000, %int6_11548 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %9005 = torch.aten.mm %9003, %9004 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Both operands are multiplied by the integer constant 1 before the add.
    // NOTE(review): presumably the alpha/beta factors of a decomposed addmm - confirm.
    %int1_11549 = torch.constant.int 1
    %9006 = torch.aten.mul.Scalar %9005, %int1_11549 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_11550 = torch.constant.int 1
    %9007 = torch.aten.mul.Scalar %9002, %int1_11550 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_11551 = torch.constant.int 1
    %9008 = torch.aten.add.Tensor %9006, %9007, %int1_11551 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    // Cast the f32 result back to f16 (dtype code 5).
    %int5_11552 = torch.constant.int 5
    %9009 = torch.prims.convert_element_type %9008, %int5_11552 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Reshape the [1,9216] modulation vector %9009 to [1,1,9216] and cut its last
    // dim into three [1,1,3072] pieces: %9013, %9014 and %9015.
    // NOTE(review): from their later use these look like shift / scale / gate
    // terms - the names are an inference, confirm against the model definition.
    // Slice over dim 0 from 0 to INT64_MAX with step 1; the result type equals the input type.
    %int0_11553 = torch.constant.int 0
    %int0_11554 = torch.constant.int 0
    %int9223372036854775807_11555 = torch.constant.int 9223372036854775807
    %int1_11556 = torch.constant.int 1
    %9010 = torch.aten.slice.Tensor %9009, %int0_11553, %int0_11554, %int9223372036854775807_11555, %int1_11556 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a size-1 dim at position 1.
    %int1_11557 = torch.constant.int 1
    %9011 = torch.aten.unsqueeze %9010, %int1_11557 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Slice over dim 2 from 0 to INT64_MAX with step 1; again the shape is unchanged.
    %int2_11558 = torch.constant.int 2
    %int0_11559 = torch.constant.int 0
    %int9223372036854775807_11560 = torch.constant.int 9223372036854775807
    %int1_11561 = torch.constant.int 1
    %9012 = torch.aten.slice.Tensor %9011, %int2_11558, %int0_11559, %int9223372036854775807_11560, %int1_11561 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // %9013 = last-dim elements [0, 3072).
    %int-1_11562 = torch.constant.int -1
    %int0_11563 = torch.constant.int 0
    %int3072_11564 = torch.constant.int 3072
    %int1_11565 = torch.constant.int 1
    %9013 = torch.aten.slice.Tensor %9012, %int-1_11562, %int0_11563, %int3072_11564, %int1_11565 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %9014 = last-dim elements [3072, 6144).
    %int-1_11566 = torch.constant.int -1
    %int3072_11567 = torch.constant.int 3072
    %int6144_11568 = torch.constant.int 6144
    %int1_11569 = torch.constant.int 1
    %9014 = torch.aten.slice.Tensor %9012, %int-1_11566, %int3072_11567, %int6144_11568, %int1_11569 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %9015 = last-dim elements [6144, 9216).
    %int-1_11570 = torch.constant.int -1
    %int6144_11571 = torch.constant.int 6144
    %int9216_11572 = torch.constant.int 9216
    %int1_11573 = torch.constant.int 1
    %9015 = torch.aten.slice.Tensor %9012, %int-1_11570, %int6144_11571, %int9216_11572, %int1_11573 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %9016 = %9014 + 1 (scalar 1, alpha 1).
    %int1_11574 = torch.constant.int 1
    %int1_11575 = torch.constant.int 1
    %9016 = torch.aten.add.Scalar %9014, %int1_11574, %int1_11575 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %8997 over its last dim (size 3072) and modulate the result:
    //   %9025 = %9016 * ((%8997 - mean) * rsqrt(var + eps)) + %9013
    // No learned weight or bias is applied in the normalization itself.
    %int6_11576 = torch.constant.int 6
    %9017 = torch.prims.convert_element_type %8997, %int6_11576 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    // Variance (%result0) and mean (%result1) over dim 2 with correction 0 and keepdim = true.
    %int2_11577 = torch.constant.int 2
    %9018 = torch.prim.ListConstruct %int2_11577 : (!torch.int) -> !torch.list<int>
    %int0_11578 = torch.constant.int 0
    %true_11579 = torch.constant.bool true
    %result0_11580, %result1_11581 = torch.aten.var_mean.correction %9017, %9018, %int0_11578, %true_11579 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // rsqrt(var + eps), with eps approximately 1e-6.
    %float9.999990e-07_11582 = torch.constant.float 9.9999999999999995E-7
    %int1_11583 = torch.constant.int 1
    %9019 = torch.aten.add.Scalar %result0_11580, %float9.999990e-07_11582, %int1_11583 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %9020 = torch.aten.rsqrt %9019 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the f16 input %8997 and the f32 mean; its result type is f32.
    %int1_11584 = torch.constant.int 1
    %9021 = torch.aten.sub.Tensor %8997, %result1_11581, %int1_11584 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %9022 = torch.aten.mul.Tensor %9021, %9020 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    // Cast back to f16, multiply by %9016 and add %9013 (both broadcast from [1,1,3072]).
    %int5_11585 = torch.constant.int 5
    %9023 = torch.prims.convert_element_type %9022, %int5_11585 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %9024 = torch.aten.mul.Tensor %9016, %9023 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_11586 = torch.constant.int 1
    %9025 = torch.aten.add.Tensor %9024, %9013, %int1_11586 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.23 linear1: view %9025 as [4608,3072], multiply by the
    // transposed [21504,3072] weight in f32, add the bias, cast to f16 and view
    // the result as [1,4608,21504] (%9040).
    %int4608_11587 = torch.constant.int 4608
    %int3072_11588 = torch.constant.int 3072
    %9026 = torch.prim.ListConstruct %int4608_11587, %int3072_11588 : (!torch.int, !torch.int) -> !torch.list<int>
    %9027 = torch.aten.view %9025, %9026 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    %__auto.sampler.single_blocks.23.linear1.weight = util.global.load @__auto.sampler.single_blocks.23.linear1.weight : tensor<21504x3072xf16>
    %9028 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.23.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    // Swap dims 0 and 1 so the weight becomes [3072,21504].
    %int0_11589 = torch.constant.int 0
    %int1_11590 = torch.constant.int 1
    %9029 = torch.aten.transpose.int %9028, %int0_11589, %int1_11590 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.23.linear1.bias = util.global.load @__auto.sampler.single_blocks.23.linear1.bias : tensor<21504xf16>
    %9030 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.23.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Convert bias, activation and weight to f32 (dtype code 6) for the matmul.
    %int6_11591 = torch.constant.int 6
    %9031 = torch.prims.convert_element_type %9030, %int6_11591 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_11592 = torch.constant.int 6
    %9032 = torch.prims.convert_element_type %9027, %int6_11592 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_11593 = torch.constant.int 6
    %9033 = torch.prims.convert_element_type %9029, %int6_11593 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %9034 = torch.aten.mm %9032, %9033 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Both operands are multiplied by the integer constant 1 before the bias add.
    %int1_11594 = torch.constant.int 1
    %9035 = torch.aten.mul.Scalar %9034, %int1_11594 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_11595 = torch.constant.int 1
    %9036 = torch.aten.mul.Scalar %9031, %int1_11595 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_11596 = torch.constant.int 1
    %9037 = torch.aten.add.Tensor %9035, %9036, %int1_11596 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    // Cast back to f16 (dtype code 5) and restore the leading size-1 dim.
    %int5_11597 = torch.constant.int 5
    %9038 = torch.prims.convert_element_type %9037, %int5_11597 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_11598 = torch.constant.int 1
    %int4608_11599 = torch.constant.int 4608
    %int21504_11600 = torch.constant.int 21504
    %9039 = torch.prim.ListConstruct %int1_11598, %int4608_11599, %int21504_11600 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9040 = torch.aten.view %9038, %9039 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the linear1 output %9040 along its last dim into %9041 (elements
    // [0, 9216)) and %9042 (elements [9216, 21504), 12288 wide), then unpack
    // %9041 into three [1,24,4608,128] tensors %9047, %9048 and %9049.
    // NOTE(review): these are presumably query / key / value (%9047 and %9048 are
    // later scaled by query_norm and key_norm, %9049 is the third attention
    // operand) - confirm against the model definition.
    %int-1_11601 = torch.constant.int -1
    %int0_11602 = torch.constant.int 0
    %int9216_11603 = torch.constant.int 9216
    %int1_11604 = torch.constant.int 1
    %9041 = torch.aten.slice.Tensor %9040, %int-1_11601, %int0_11602, %int9216_11603, %int1_11604 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_11605 = torch.constant.int -1
    %int9216_11606 = torch.constant.int 9216
    %int21504_11607 = torch.constant.int 21504
    %int1_11608 = torch.constant.int 1
    %9042 = torch.aten.slice.Tensor %9040, %int-1_11605, %int9216_11606, %int21504_11607, %int1_11608 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View the 9216-wide part as [1,4608,3,24,128].
    %int1_11609 = torch.constant.int 1
    %int4608_11610 = torch.constant.int 4608
    %int3_11611 = torch.constant.int 3
    %int24_11612 = torch.constant.int 24
    %int128_11613 = torch.constant.int 128
    %9043 = torch.prim.ListConstruct %int1_11609, %int4608_11610, %int3_11611, %int24_11612, %int128_11613 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9044 = torch.aten.view %9041, %9043 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2,0,3,1,4) to get [3,1,24,4608,128].
    %int2_11614 = torch.constant.int 2
    %int0_11615 = torch.constant.int 0
    %int3_11616 = torch.constant.int 3
    %int1_11617 = torch.constant.int 1
    %int4_11618 = torch.constant.int 4
    %9045 = torch.prim.ListConstruct %int2_11614, %int0_11615, %int3_11616, %int1_11617, %int4_11618 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9046 = torch.aten.permute %9044, %9045 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Select indices 0, 1 and 2 along dim 0.
    %int0_11619 = torch.constant.int 0
    %int0_11620 = torch.constant.int 0
    %9047 = torch.aten.select.int %9046, %int0_11619, %int0_11620 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_11621 = torch.constant.int 0
    %int1_11622 = torch.constant.int 1
    %9048 = torch.aten.select.int %9046, %int0_11621, %int1_11622 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_11623 = torch.constant.int 0
    %int2_11624 = torch.constant.int 2
    %9049 = torch.aten.select.int %9046, %int0_11623, %int2_11624 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Normalize %9047 and %9048 over their last dim (size 128) and apply the
    // learned per-element scales:
    //   y = f16(x * rsqrt(mean(x^2, dim=-1, keepdim) + eps)) * scale
    // with eps approximately 1e-6 and the statistics computed in f32.
    // First pass: %9047 with norm.query_norm.scale, result %9059.
    %int6_11625 = torch.constant.int 6
    %9050 = torch.prims.convert_element_type %9047, %int6_11625 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_11626 = torch.constant.int 2
    %9051 = torch.aten.pow.Tensor_Scalar %9050, %int2_11626 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_11627 = torch.constant.int -1
    %9052 = torch.prim.ListConstruct %int-1_11627 : (!torch.int) -> !torch.list<int>
    %true_11628 = torch.constant.bool true
    %none_11629 = torch.constant.none
    %9053 = torch.aten.mean.dim %9051, %9052, %true_11628, %none_11629 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_11630 = torch.constant.float 9.9999999999999995E-7
    %int1_11631 = torch.constant.int 1
    %9054 = torch.aten.add.Scalar %9053, %float9.999990e-07_11630, %int1_11631 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %9055 = torch.aten.rsqrt %9054 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %9056 = torch.aten.mul.Tensor %9050, %9055 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11632 = torch.constant.int 5
    %9057 = torch.prims.convert_element_type %9056, %int5_11632 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.23.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.23.norm.query_norm.scale : tensor<128xf16>
    %9058 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.23.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %9059 = torch.aten.mul.Tensor %9057, %9058 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Second pass: the same sequence on %9048 with norm.key_norm.scale, result %9069.
    %int6_11633 = torch.constant.int 6
    %9060 = torch.prims.convert_element_type %9048, %int6_11633 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_11634 = torch.constant.int 2
    %9061 = torch.aten.pow.Tensor_Scalar %9060, %int2_11634 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_11635 = torch.constant.int -1
    %9062 = torch.prim.ListConstruct %int-1_11635 : (!torch.int) -> !torch.list<int>
    %true_11636 = torch.constant.bool true
    %none_11637 = torch.constant.none
    %9063 = torch.aten.mean.dim %9061, %9062, %true_11636, %none_11637 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_11638 = torch.constant.float 9.9999999999999995E-7
    %int1_11639 = torch.constant.int 1
    %9064 = torch.aten.add.Scalar %9063, %float9.999990e-07_11638, %int1_11639 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %9065 = torch.aten.rsqrt %9064 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %9066 = torch.aten.mul.Tensor %9060, %9065 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11640 = torch.constant.int 5
    %9067 = torch.prims.convert_element_type %9066, %int5_11640 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.23.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.23.norm.key_norm.scale : tensor<128xf16>
    %9068 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.23.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %9069 = torch.aten.mul.Tensor %9067, %9068 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Combine the normalized tensors %9059 and %9069 with the coefficient tensor
    // %211 ([1,1,4608,64,2,2], f32, defined outside this excerpt):
    //   out = %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]
    // where x is the input viewed as [1,24,4608,64,1,2]. The results are
    // flattened back to [1,24,4608,128] and cast to f16 (%9094 and %9097).
    // NOTE(review): this looks like a rotary position embedding applied to
    // query and key - confirm against the model definition.
    // These two conversions go from f16 to f16, so the element type is unchanged.
    %int5_11641 = torch.constant.int 5
    %9070 = torch.prims.convert_element_type %9059, %int5_11641 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_11642 = torch.constant.int 5
    %9071 = torch.prims.convert_element_type %9069, %int5_11642 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Convert %9070 to f32 and view it as [1,24,4608,64,1,2].
    %int6_11643 = torch.constant.int 6
    %9072 = torch.prims.convert_element_type %9070, %int6_11643 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_11644 = torch.constant.int 1
    %int24_11645 = torch.constant.int 24
    %int4608_11646 = torch.constant.int 4608
    %int64_11647 = torch.constant.int 64
    %int1_11648 = torch.constant.int 1
    %int2_11649 = torch.constant.int 2
    %9073 = torch.prim.ListConstruct %int1_11644, %int24_11645, %int4608_11646, %int64_11647, %int1_11648, %int2_11649 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9074 = torch.aten.view %9072, %9073 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Convert %9071 to f32 and view it as [1,24,4608,64,1,2].
    %int6_11650 = torch.constant.int 6
    %9075 = torch.prims.convert_element_type %9071, %int6_11650 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_11651 = torch.constant.int 1
    %int24_11652 = torch.constant.int 24
    %int4608_11653 = torch.constant.int 4608
    %int64_11654 = torch.constant.int 64
    %int1_11655 = torch.constant.int 1
    %int2_11656 = torch.constant.int 2
    %9076 = torch.prim.ListConstruct %int1_11651, %int24_11652, %int4608_11653, %int64_11654, %int1_11655, %int2_11656 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9077 = torch.aten.view %9075, %9076 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // First input (%9074): product of the index-0 selections along dim 5.
    %int5_11657 = torch.constant.int 5
    %int0_11658 = torch.constant.int 0
    %9078 = torch.aten.select.int %211, %int5_11657, %int0_11658 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11659 = torch.constant.int 5
    %int0_11660 = torch.constant.int 0
    %9079 = torch.aten.select.int %9074, %int5_11659, %int0_11660 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9080 = torch.aten.mul.Tensor %9078, %9079 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // First input (%9074): product of the index-1 selections along dim 5, then the sum.
    %int5_11661 = torch.constant.int 5
    %int1_11662 = torch.constant.int 1
    %9081 = torch.aten.select.int %211, %int5_11661, %int1_11662 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11663 = torch.constant.int 5
    %int1_11664 = torch.constant.int 1
    %9082 = torch.aten.select.int %9074, %int5_11663, %int1_11664 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9083 = torch.aten.mul.Tensor %9081, %9082 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_11665 = torch.constant.int 1
    %9084 = torch.aten.add.Tensor %9080, %9083, %int1_11665 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second input (%9077): the same two products and sum.
    %int5_11666 = torch.constant.int 5
    %int0_11667 = torch.constant.int 0
    %9085 = torch.aten.select.int %211, %int5_11666, %int0_11667 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11668 = torch.constant.int 5
    %int0_11669 = torch.constant.int 0
    %9086 = torch.aten.select.int %9077, %int5_11668, %int0_11669 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9087 = torch.aten.mul.Tensor %9085, %9086 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_11670 = torch.constant.int 5
    %int1_11671 = torch.constant.int 1
    %9088 = torch.aten.select.int %211, %int5_11670, %int1_11671 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11672 = torch.constant.int 5
    %int1_11673 = torch.constant.int 1
    %9089 = torch.aten.select.int %9077, %int5_11672, %int1_11673 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9090 = torch.aten.mul.Tensor %9088, %9089 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_11674 = torch.constant.int 1
    %9091 = torch.aten.add.Tensor %9087, %9090, %int1_11674 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Flatten the trailing [64,2] dims back to 128 and cast to f16 (first input).
    %int1_11675 = torch.constant.int 1
    %int24_11676 = torch.constant.int 24
    %int4608_11677 = torch.constant.int 4608
    %int128_11678 = torch.constant.int 128
    %9092 = torch.prim.ListConstruct %int1_11675, %int24_11676, %int4608_11677, %int128_11678 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9093 = torch.aten.view %9084, %9092 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11679 = torch.constant.int 5
    %9094 = torch.prims.convert_element_type %9093, %int5_11679 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Flatten the trailing [64,2] dims back to 128 and cast to f16 (second input).
    %int1_11680 = torch.constant.int 1
    %int24_11681 = torch.constant.int 24
    %int4608_11682 = torch.constant.int 4608
    %int128_11683 = torch.constant.int 128
    %9095 = torch.prim.ListConstruct %int1_11680, %int24_11681, %int4608_11682, %int128_11683 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9096 = torch.aten.view %9091, %9095 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11684 = torch.constant.int 5
    %9097 = torch.prims.convert_element_type %9096, %int5_11684 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled-dot-product attention over %9094, %9097 and %9049 (each
    // [1,24,4608,128], f16). Result #0 is permuted to [1,4608,24,128] and viewed
    // as [1,4608,3072] (%9102). Result #1 ([1,24,4608], f32) is not used in
    // this part of the file.
    // NOTE(review): the trailing operands (0.0, false, none, none) are presumably
    // dropout_p, is_causal, attn_mask and scale - confirm against the op signature.
    %float0.000000e00_11685 = torch.constant.float 0.000000e+00
    %false_11686 = torch.constant.bool false
    %none_11687 = torch.constant.none
    %none_11688 = torch.constant.none
    %9098:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%9094, %9097, %9049, %float0.000000e00_11685, %false_11686, %none_11687, %none_11688) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Permute with order (0,2,1,3): [1,24,4608,128] -> [1,4608,24,128].
    %int0_11689 = torch.constant.int 0
    %int2_11690 = torch.constant.int 2
    %int1_11691 = torch.constant.int 1
    %int3_11692 = torch.constant.int 3
    %9099 = torch.prim.ListConstruct %int0_11689, %int2_11690, %int1_11691, %int3_11692 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9100 = torch.aten.permute %9098#0, %9099 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the trailing [24,128] dims into 3072.
    %int1_11693 = torch.constant.int 1
    %int4608_11694 = torch.constant.int 4608
    %int3072_11695 = torch.constant.int 3072
    %9101 = torch.prim.ListConstruct %int1_11693, %int4608_11694, %int3072_11695 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9102 = torch.aten.view %9100, %9101 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Apply GELU with the "tanh" approximation to the 12288-wide slice %9042,
    // concatenate it after the attention result %9102 along dim 2 to get
    // [1,4608,15360], and view that as [4608,15360] (%9107).
    %str_11696 = torch.constant.str "tanh"
    %9103 = torch.aten.gelu %9042, %str_11696 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %9104 = torch.prim.ListConstruct %9102, %9103 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_11697 = torch.constant.int 2
    %9105 = torch.aten.cat %9104, %int2_11697 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    %int4608_11698 = torch.constant.int 4608
    %int15360_11699 = torch.constant.int 15360
    %9106 = torch.prim.ListConstruct %int4608_11698, %int15360_11699 : (!torch.int, !torch.int) -> !torch.list<int>
    %9107 = torch.aten.view %9105, %9106 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // single_blocks.23 linear2: multiply %9107 ([4608,15360]) by the transposed
    // [3072,15360] weight in f32, add the bias, cast to f16 and view the result
    // as [1,4608,3072] (%9120).
    %__auto.sampler.single_blocks.23.linear2.weight = util.global.load @__auto.sampler.single_blocks.23.linear2.weight : tensor<3072x15360xf16>
    %9108 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.23.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    // Swap dims 0 and 1 so the weight becomes [15360,3072].
    %int0_11700 = torch.constant.int 0
    %int1_11701 = torch.constant.int 1
    %9109 = torch.aten.transpose.int %9108, %int0_11700, %int1_11701 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.23.linear2.bias = util.global.load @__auto.sampler.single_blocks.23.linear2.bias : tensor<3072xf16>
    %9110 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.23.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Convert bias, activation and weight to f32 (dtype code 6) for the matmul.
    %int6_11702 = torch.constant.int 6
    %9111 = torch.prims.convert_element_type %9110, %int6_11702 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_11703 = torch.constant.int 6
    %9112 = torch.prims.convert_element_type %9107, %int6_11703 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_11704 = torch.constant.int 6
    %9113 = torch.prims.convert_element_type %9109, %int6_11704 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %9114 = torch.aten.mm %9112, %9113 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Both operands are multiplied by the integer constant 1 before the bias add.
    %int1_11705 = torch.constant.int 1
    %9115 = torch.aten.mul.Scalar %9114, %int1_11705 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_11706 = torch.constant.int 1
    %9116 = torch.aten.mul.Scalar %9111, %int1_11706 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_11707 = torch.constant.int 1
    %9117 = torch.aten.add.Tensor %9115, %9116, %int1_11707 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    // Cast back to f16 (dtype code 5) and restore the leading size-1 dim.
    %int5_11708 = torch.constant.int 5
    %9118 = torch.prims.convert_element_type %9117, %int5_11708 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_11709 = torch.constant.int 1
    %int4608_11710 = torch.constant.int 4608
    %int3072_11711 = torch.constant.int 3072
    %9119 = torch.prim.ListConstruct %int1_11709, %int4608_11710, %int3072_11711 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9120 = torch.aten.view %9118, %9119 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Residual update: %9122 = %8997 + %9015 * %9120, where %9015 ([1,1,3072])
    // broadcasts over the 4608 rows and the add uses alpha = 1.
    %9121 = torch.aten.mul.Tensor %9015, %9120 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_11712 = torch.constant.int 1
    %9122 = torch.aten.add.Tensor %8997, %9121, %int1_11712 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.24 modulation projection: silu(%119) is multiplied by the
    // transposed modulation.lin weight and the bias is added. The arithmetic is
    // done in f32 and the [1,9216] result is cast back to f16 (%9134).
    // %119 is defined outside this excerpt.
    %9123 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.24.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.24.modulation.lin.weight : tensor<9216x3072xf16>
    %9124 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.24.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    // Swap dims 0 and 1 so the weight becomes [3072,9216], the right-hand operand of the mm below.
    %int0_11713 = torch.constant.int 0
    %int1_11714 = torch.constant.int 1
    %9125 = torch.aten.transpose.int %9124, %int0_11713, %int1_11714 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.24.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.24.modulation.lin.bias : tensor<9216xf16>
    %9126 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.24.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Convert bias, activation and weight to f32 (dtype code 6, see the result types).
    %int6_11715 = torch.constant.int 6
    %9127 = torch.prims.convert_element_type %9126, %int6_11715 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_11716 = torch.constant.int 6
    %9128 = torch.prims.convert_element_type %9123, %int6_11716 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_11717 = torch.constant.int 6
    %9129 = torch.prims.convert_element_type %9125, %int6_11717 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %9130 = torch.aten.mm %9128, %9129 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Both operands are multiplied by the integer constant 1 before the add.
    // NOTE(review): presumably the alpha/beta factors of a decomposed addmm - confirm.
    %int1_11718 = torch.constant.int 1
    %9131 = torch.aten.mul.Scalar %9130, %int1_11718 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_11719 = torch.constant.int 1
    %9132 = torch.aten.mul.Scalar %9127, %int1_11719 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_11720 = torch.constant.int 1
    %9133 = torch.aten.add.Tensor %9131, %9132, %int1_11720 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    // Cast the f32 result back to f16 (dtype code 5).
    %int5_11721 = torch.constant.int 5
    %9134 = torch.prims.convert_element_type %9133, %int5_11721 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Reshape the [1,9216] modulation vector %9134 to [1,1,9216] and cut its last
    // dim into three [1,1,3072] pieces: %9138, %9139 and %9140.
    // NOTE(review): these look like shift / scale / gate terms - the names are
    // an inference, confirm against the model definition.
    // Slice over dim 0 from 0 to INT64_MAX with step 1; the result type equals the input type.
    %int0_11722 = torch.constant.int 0
    %int0_11723 = torch.constant.int 0
    %int9223372036854775807_11724 = torch.constant.int 9223372036854775807
    %int1_11725 = torch.constant.int 1
    %9135 = torch.aten.slice.Tensor %9134, %int0_11722, %int0_11723, %int9223372036854775807_11724, %int1_11725 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a size-1 dim at position 1.
    %int1_11726 = torch.constant.int 1
    %9136 = torch.aten.unsqueeze %9135, %int1_11726 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Slice over dim 2 from 0 to INT64_MAX with step 1; again the shape is unchanged.
    %int2_11727 = torch.constant.int 2
    %int0_11728 = torch.constant.int 0
    %int9223372036854775807_11729 = torch.constant.int 9223372036854775807
    %int1_11730 = torch.constant.int 1
    %9137 = torch.aten.slice.Tensor %9136, %int2_11727, %int0_11728, %int9223372036854775807_11729, %int1_11730 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // %9138 = last-dim elements [0, 3072).
    %int-1_11731 = torch.constant.int -1
    %int0_11732 = torch.constant.int 0
    %int3072_11733 = torch.constant.int 3072
    %int1_11734 = torch.constant.int 1
    %9138 = torch.aten.slice.Tensor %9137, %int-1_11731, %int0_11732, %int3072_11733, %int1_11734 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %9139 = last-dim elements [3072, 6144).
    %int-1_11735 = torch.constant.int -1
    %int3072_11736 = torch.constant.int 3072
    %int6144_11737 = torch.constant.int 6144
    %int1_11738 = torch.constant.int 1
    %9139 = torch.aten.slice.Tensor %9137, %int-1_11735, %int3072_11736, %int6144_11737, %int1_11738 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %9140 = last-dim elements [6144, 9216).
    %int-1_11739 = torch.constant.int -1
    %int6144_11740 = torch.constant.int 6144
    %int9216_11741 = torch.constant.int 9216
    %int1_11742 = torch.constant.int 1
    %9140 = torch.aten.slice.Tensor %9137, %int-1_11739, %int6144_11740, %int9216_11741, %int1_11742 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %9141 = %9139 + 1 (scalar 1, alpha 1).
    %int1_11743 = torch.constant.int 1
    %int1_11744 = torch.constant.int 1
    %9141 = torch.aten.add.Scalar %9139, %int1_11743, %int1_11744 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %9122 over its last dim (size 3072) and modulate the result:
    //   %9150 = %9141 * ((%9122 - mean) * rsqrt(var + eps)) + %9138
    // No learned weight or bias is applied in the normalization itself.
    %int6_11745 = torch.constant.int 6
    %9142 = torch.prims.convert_element_type %9122, %int6_11745 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    // Variance (%result0) and mean (%result1) over dim 2 with correction 0 and keepdim = true.
    %int2_11746 = torch.constant.int 2
    %9143 = torch.prim.ListConstruct %int2_11746 : (!torch.int) -> !torch.list<int>
    %int0_11747 = torch.constant.int 0
    %true_11748 = torch.constant.bool true
    %result0_11749, %result1_11750 = torch.aten.var_mean.correction %9142, %9143, %int0_11747, %true_11748 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // rsqrt(var + eps), with eps approximately 1e-6.
    %float9.999990e-07_11751 = torch.constant.float 9.9999999999999995E-7
    %int1_11752 = torch.constant.int 1
    %9144 = torch.aten.add.Scalar %result0_11749, %float9.999990e-07_11751, %int1_11752 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %9145 = torch.aten.rsqrt %9144 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the f16 input %9122 and the f32 mean; its result type is f32.
    %int1_11753 = torch.constant.int 1
    %9146 = torch.aten.sub.Tensor %9122, %result1_11750, %int1_11753 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %9147 = torch.aten.mul.Tensor %9146, %9145 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    // Cast back to f16, multiply by %9141 and add %9138 (both broadcast from [1,1,3072]).
    %int5_11754 = torch.constant.int 5
    %9148 = torch.prims.convert_element_type %9147, %int5_11754 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %9149 = torch.aten.mul.Tensor %9141, %9148 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_11755 = torch.constant.int 1
    %9150 = torch.aten.add.Tensor %9149, %9138, %int1_11755 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.24.linear1: [4608,3072] x [3072,21504] + bias, computed in f32 and
    // stored back as f16.
    // Drop the leading unit dim so the input is a 2-D matrix for mm.
    %int4608_11756 = torch.constant.int 4608
    %int3072_11757 = torch.constant.int 3072
    %9151 = torch.prim.ListConstruct %int4608_11756, %int3072_11757 : (!torch.int, !torch.int) -> !torch.list<int>
    %9152 = torch.aten.view %9150, %9151 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // Load the weight ([21504,3072]) and transpose it to [3072,21504].
    %__auto.sampler.single_blocks.24.linear1.weight = util.global.load @__auto.sampler.single_blocks.24.linear1.weight : tensor<21504x3072xf16>
    %9153 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.24.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_11758 = torch.constant.int 0
    %int1_11759 = torch.constant.int 1
    %9154 = torch.aten.transpose.int %9153, %int0_11758, %int1_11759 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    // Load the bias ([21504]).
    %__auto.sampler.single_blocks.24.linear1.bias = util.global.load @__auto.sampler.single_blocks.24.linear1.bias : tensor<21504xf16>
    %9155 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.24.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Upcast bias, activations and weight to f32 before the matmul.
    %int6_11760 = torch.constant.int 6
    %9156 = torch.prims.convert_element_type %9155, %int6_11760 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_11761 = torch.constant.int 6
    %9157 = torch.prims.convert_element_type %9152, %int6_11761 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_11762 = torch.constant.int 6
    %9158 = torch.prims.convert_element_type %9154, %int6_11762 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %9159 = torch.aten.mm %9157, %9158 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // The two multiplications by the integer 1 leave the values unchanged
    // (NOTE(review): they look like alpha/beta factors left by an addmm decomposition).
    %int1_11763 = torch.constant.int 1
    %9160 = torch.aten.mul.Scalar %9159, %int1_11763 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_11764 = torch.constant.int 1
    %9161 = torch.aten.mul.Scalar %9156, %int1_11764 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    // Add the bias (broadcast over rows), then cast the result to f16.
    %int1_11765 = torch.constant.int 1
    %9162 = torch.aten.add.Tensor %9160, %9161, %int1_11765 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_11766 = torch.constant.int 5
    %9163 = torch.prims.convert_element_type %9162, %int5_11766 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    // Restore the leading unit dim: [1,4608,21504].
    %int1_11767 = torch.constant.int 1
    %int4608_11768 = torch.constant.int 4608
    %int21504_11769 = torch.constant.int 21504
    %9164 = torch.prim.ListConstruct %int1_11767, %int4608_11768, %int21504_11769 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9165 = torch.aten.view %9163, %9164 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the linear1 output along the last dim into two branches:
    //   %9166 = [0:9216)      -> attention branch (reshaped into three tensors below)
    //   %9167 = [9216:21504)  -> 12288-wide branch that later goes through gelu
    %int-1_11770 = torch.constant.int -1
    %int0_11771 = torch.constant.int 0
    %int9216_11772 = torch.constant.int 9216
    %int1_11773 = torch.constant.int 1
    %9166 = torch.aten.slice.Tensor %9165, %int-1_11770, %int0_11771, %int9216_11772, %int1_11773 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_11774 = torch.constant.int -1
    %int9216_11775 = torch.constant.int 9216
    %int21504_11776 = torch.constant.int 21504
    %int1_11777 = torch.constant.int 1
    %9167 = torch.aten.slice.Tensor %9165, %int-1_11774, %int9216_11775, %int21504_11776, %int1_11777 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View the 9216-wide branch as [1,4608,3,24,128] (9216 = 3 * 24 * 128).
    %int1_11778 = torch.constant.int 1
    %int4608_11779 = torch.constant.int 4608
    %int3_11780 = torch.constant.int 3
    %int24_11781 = torch.constant.int 24
    %int128_11782 = torch.constant.int 128
    %9168 = torch.prim.ListConstruct %int1_11778, %int4608_11779, %int3_11780, %int24_11781, %int128_11782 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9169 = torch.aten.view %9166, %9168 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2,0,3,1,4) -> [3,1,24,4608,128], moving the size-3 dim to the front.
    %int2_11783 = torch.constant.int 2
    %int0_11784 = torch.constant.int 0
    %int3_11785 = torch.constant.int 3
    %int1_11786 = torch.constant.int 1
    %int4_11787 = torch.constant.int 4
    %9170 = torch.prim.ListConstruct %int2_11783, %int0_11784, %int3_11785, %int1_11786, %int4_11787 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9171 = torch.aten.permute %9169, %9170 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index the leading dim: %9172 (index 0) is later scaled by query_norm.scale,
    // %9173 (index 1) by key_norm.scale, and %9174 (index 2) is passed as the third
    // operand of the attention op.
    %int0_11788 = torch.constant.int 0
    %int0_11789 = torch.constant.int 0
    %9172 = torch.aten.select.int %9171, %int0_11788, %int0_11789 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_11790 = torch.constant.int 0
    %int1_11791 = torch.constant.int 1
    %9173 = torch.aten.select.int %9171, %int0_11790, %int1_11791 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_11792 = torch.constant.int 0
    %int2_11793 = torch.constant.int 2
    %9174 = torch.aten.select.int %9171, %int0_11792, %int2_11793 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Root-mean-square normalization of the query (%9172) over its last dim, in f32:
    //   %9181 = x * rsqrt(mean(x^2, dim=-1, keepdim) + ~1e-6)
    // followed by a cast to f16 and an element-wise multiply with query_norm.scale ([128]).
    %int6_11794 = torch.constant.int 6
    %9175 = torch.prims.convert_element_type %9172, %int6_11794 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_11795 = torch.constant.int 2
    %9176 = torch.aten.pow.Tensor_Scalar %9175, %int2_11795 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_11796 = torch.constant.int -1
    %9177 = torch.prim.ListConstruct %int-1_11796 : (!torch.int) -> !torch.list<int>
    %true_11797 = torch.constant.bool true
    %none_11798 = torch.constant.none
    %9178 = torch.aten.mean.dim %9176, %9177, %true_11797, %none_11798 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_11799 = torch.constant.float 9.9999999999999995E-7
    %int1_11800 = torch.constant.int 1
    %9179 = torch.aten.add.Scalar %9178, %float9.999990e-07_11799, %int1_11800 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %9180 = torch.aten.rsqrt %9179 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %9181 = torch.aten.mul.Tensor %9175, %9180 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11801 = torch.constant.int 5
    %9182 = torch.prims.convert_element_type %9181, %int5_11801 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.24.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.24.norm.query_norm.scale : tensor<128xf16>
    %9183 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.24.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %9184 = torch.aten.mul.Tensor %9182, %9183 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Same normalization applied to the key (%9173), scaled by key_norm.scale -> %9194.
    %int6_11802 = torch.constant.int 6
    %9185 = torch.prims.convert_element_type %9173, %int6_11802 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_11803 = torch.constant.int 2
    %9186 = torch.aten.pow.Tensor_Scalar %9185, %int2_11803 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_11804 = torch.constant.int -1
    %9187 = torch.prim.ListConstruct %int-1_11804 : (!torch.int) -> !torch.list<int>
    %true_11805 = torch.constant.bool true
    %none_11806 = torch.constant.none
    %9188 = torch.aten.mean.dim %9186, %9187, %true_11805, %none_11806 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_11807 = torch.constant.float 9.9999999999999995E-7
    %int1_11808 = torch.constant.int 1
    %9189 = torch.aten.add.Scalar %9188, %float9.999990e-07_11807, %int1_11808 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %9190 = torch.aten.rsqrt %9189 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %9191 = torch.aten.mul.Tensor %9185, %9190 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11809 = torch.constant.int 5
    %9192 = torch.prims.convert_element_type %9191, %int5_11809 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.24.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.24.norm.key_norm.scale : tensor<128xf16>
    %9193 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.24.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %9194 = torch.aten.mul.Tensor %9192, %9193 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Combine the normalized query/key with the table %211 ([1,1,4608,64,2,2], f32).
    // NOTE(review): %211 is defined above this excerpt; the arithmetic below matches applying a
    // per-position 2x2 matrix to each pair of channels (rotary-embedding style) - confirm at
    // the definition of %211.
    // These two converts are f16 -> f16 and leave the element type unchanged.
    %int5_11810 = torch.constant.int 5
    %9195 = torch.prims.convert_element_type %9184, %int5_11810 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_11811 = torch.constant.int 5
    %9196 = torch.prims.convert_element_type %9194, %int5_11811 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Query: upcast to f32 and view the last dim 128 as 64 x 1 x 2 -> %9199.
    %int6_11812 = torch.constant.int 6
    %9197 = torch.prims.convert_element_type %9195, %int6_11812 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_11813 = torch.constant.int 1
    %int24_11814 = torch.constant.int 24
    %int4608_11815 = torch.constant.int 4608
    %int64_11816 = torch.constant.int 64
    %int1_11817 = torch.constant.int 1
    %int2_11818 = torch.constant.int 2
    %9198 = torch.prim.ListConstruct %int1_11813, %int24_11814, %int4608_11815, %int64_11816, %int1_11817, %int2_11818 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9199 = torch.aten.view %9197, %9198 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Key: same upcast and view -> %9202.
    %int6_11819 = torch.constant.int 6
    %9200 = torch.prims.convert_element_type %9196, %int6_11819 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_11820 = torch.constant.int 1
    %int24_11821 = torch.constant.int 24
    %int4608_11822 = torch.constant.int 4608
    %int64_11823 = torch.constant.int 64
    %int1_11824 = torch.constant.int 1
    %int2_11825 = torch.constant.int 2
    %9201 = torch.prim.ListConstruct %int1_11820, %int24_11821, %int4608_11822, %int64_11823, %int1_11824, %int2_11825 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9202 = torch.aten.view %9200, %9201 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Query: %9209 = %211[..., 0] * q[..., 0] + %211[..., 1] * q[..., 1]
    // (index taken on dim 5; broadcasts over the 24-sized dim and the trailing size-2 dim).
    %int5_11826 = torch.constant.int 5
    %int0_11827 = torch.constant.int 0
    %9203 = torch.aten.select.int %211, %int5_11826, %int0_11827 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11828 = torch.constant.int 5
    %int0_11829 = torch.constant.int 0
    %9204 = torch.aten.select.int %9199, %int5_11828, %int0_11829 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9205 = torch.aten.mul.Tensor %9203, %9204 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_11830 = torch.constant.int 5
    %int1_11831 = torch.constant.int 1
    %9206 = torch.aten.select.int %211, %int5_11830, %int1_11831 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11832 = torch.constant.int 5
    %int1_11833 = torch.constant.int 1
    %9207 = torch.aten.select.int %9199, %int5_11832, %int1_11833 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9208 = torch.aten.mul.Tensor %9206, %9207 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_11834 = torch.constant.int 1
    %9209 = torch.aten.add.Tensor %9205, %9208, %int1_11834 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key: same combination using %9202 -> %9216.
    %int5_11835 = torch.constant.int 5
    %int0_11836 = torch.constant.int 0
    %9210 = torch.aten.select.int %211, %int5_11835, %int0_11836 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11837 = torch.constant.int 5
    %int0_11838 = torch.constant.int 0
    %9211 = torch.aten.select.int %9202, %int5_11837, %int0_11838 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9212 = torch.aten.mul.Tensor %9210, %9211 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_11839 = torch.constant.int 5
    %int1_11840 = torch.constant.int 1
    %9213 = torch.aten.select.int %211, %int5_11839, %int1_11840 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11841 = torch.constant.int 5
    %int1_11842 = torch.constant.int 1
    %9214 = torch.aten.select.int %9202, %int5_11841, %int1_11842 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9215 = torch.aten.mul.Tensor %9213, %9214 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_11843 = torch.constant.int 1
    %9216 = torch.aten.add.Tensor %9212, %9215, %int1_11843 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Collapse [64,2] back to 128 and cast to f16: %9219 (query) and %9222 (key).
    %int1_11844 = torch.constant.int 1
    %int24_11845 = torch.constant.int 24
    %int4608_11846 = torch.constant.int 4608
    %int128_11847 = torch.constant.int 128
    %9217 = torch.prim.ListConstruct %int1_11844, %int24_11845, %int4608_11846, %int128_11847 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9218 = torch.aten.view %9209, %9217 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11848 = torch.constant.int 5
    %9219 = torch.prims.convert_element_type %9218, %int5_11848 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_11849 = torch.constant.int 1
    %int24_11850 = torch.constant.int 24
    %int4608_11851 = torch.constant.int 4608
    %int128_11852 = torch.constant.int 128
    %9220 = torch.prim.ListConstruct %int1_11849, %int24_11850, %int4608_11851, %int128_11852 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9221 = torch.aten.view %9216, %9220 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_11853 = torch.constant.int 5
    %9222 = torch.prims.convert_element_type %9221, %int5_11853 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Attention over the 24 heads of width 128, via the generic torch.operator form of
    // aten._scaled_dot_product_flash_attention_for_cpu.
    // Operands in order: %9219 (query), %9222 (key), %9174 (third slice of the qkv tensor),
    // float 0.0, bool false, none, none.
    // NOTE(review): per the PyTorch op schema the trailing four are presumably dropout_p,
    // is_causal, attn_mask and scale - confirm against the schema.
    // Only result #0 ([1,24,4608,128] f16) is consumed in this excerpt; result #1 is an
    // f32 [1,24,4608] tensor.
    %float0.000000e00_11854 = torch.constant.float 0.000000e+00
    %false_11855 = torch.constant.bool false
    %none_11856 = torch.constant.none
    %none_11857 = torch.constant.none
    %9223:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%9219, %9222, %9174, %float0.000000e00_11854, %false_11855, %none_11856, %none_11857) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2 -> [1,4608,24,128], then merge the last two dims (24 * 128 = 3072).
    %int0_11858 = torch.constant.int 0
    %int2_11859 = torch.constant.int 2
    %int1_11860 = torch.constant.int 1
    %int3_11861 = torch.constant.int 3
    %9224 = torch.prim.ListConstruct %int0_11858, %int2_11859, %int1_11860, %int3_11861 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9225 = torch.aten.permute %9223#0, %9224 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_11862 = torch.constant.int 1
    %int4608_11863 = torch.constant.int 4608
    %int3072_11864 = torch.constant.int 3072
    %9226 = torch.prim.ListConstruct %int1_11862, %int4608_11863, %int3072_11864 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9227 = torch.aten.view %9225, %9226 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Output projection of single_blocks.24 and the residual update.
    // gelu with the "tanh" approximate argument on the 12288-wide branch (%9167).
    %str_11865 = torch.constant.str "tanh"
    %9228 = torch.aten.gelu %9167, %str_11865 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    // Concatenate attention output and gelu output along dim 2 -> [1,4608,15360].
    %9229 = torch.prim.ListConstruct %9227, %9228 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_11866 = torch.constant.int 2
    %9230 = torch.aten.cat %9229, %int2_11866 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Flatten to 2-D for mm.
    %int4608_11867 = torch.constant.int 4608
    %int15360_11868 = torch.constant.int 15360
    %9231 = torch.prim.ListConstruct %int4608_11867, %int15360_11868 : (!torch.int, !torch.int) -> !torch.list<int>
    %9232 = torch.aten.view %9230, %9231 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // linear2: weight [3072,15360] transposed to [15360,3072], bias [3072]; matmul and bias
    // add run in f32 and the result is cast back to f16.
    %__auto.sampler.single_blocks.24.linear2.weight = util.global.load @__auto.sampler.single_blocks.24.linear2.weight : tensor<3072x15360xf16>
    %9233 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.24.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_11869 = torch.constant.int 0
    %int1_11870 = torch.constant.int 1
    %9234 = torch.aten.transpose.int %9233, %int0_11869, %int1_11870 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.24.linear2.bias = util.global.load @__auto.sampler.single_blocks.24.linear2.bias : tensor<3072xf16>
    %9235 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.24.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_11871 = torch.constant.int 6
    %9236 = torch.prims.convert_element_type %9235, %int6_11871 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_11872 = torch.constant.int 6
    %9237 = torch.prims.convert_element_type %9232, %int6_11872 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_11873 = torch.constant.int 6
    %9238 = torch.prims.convert_element_type %9234, %int6_11873 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %9239 = torch.aten.mm %9237, %9238 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    %int1_11874 = torch.constant.int 1
    %9240 = torch.aten.mul.Scalar %9239, %int1_11874 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_11875 = torch.constant.int 1
    %9241 = torch.aten.mul.Scalar %9236, %int1_11875 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_11876 = torch.constant.int 1
    %9242 = torch.aten.add.Tensor %9240, %9241, %int1_11876 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_11877 = torch.constant.int 5
    %9243 = torch.prims.convert_element_type %9242, %int5_11877 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    // Restore the leading unit dim: [1,4608,3072].
    %int1_11878 = torch.constant.int 1
    %int4608_11879 = torch.constant.int 4608
    %int3072_11880 = torch.constant.int 3072
    %9244 = torch.prim.ListConstruct %int1_11878, %int4608_11879, %int3072_11880 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9245 = torch.aten.view %9243, %9244 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Block output: %9247 = %9122 + %9140 * linear2_out, where %9122 is the tensor this block
    // normalized at its start and %9140 is the third modulation slice.
    %9246 = torch.aten.mul.Tensor %9140, %9245 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_11881 = torch.constant.int 1
    %9247 = torch.aten.add.Tensor %9122, %9246, %int1_11881 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.25 modulation: silu(%119) -> linear (3072 -> 9216) -> three 3072-wide
    // slices. %119 ([1,3072] f16) is defined above this excerpt.
    %9248 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // modulation.lin: weight [9216,3072] transposed to [3072,9216], bias [9216].
    %__auto.sampler.single_blocks.25.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.25.modulation.lin.weight : tensor<9216x3072xf16>
    %9249 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.25.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_11882 = torch.constant.int 0
    %int1_11883 = torch.constant.int 1
    %9250 = torch.aten.transpose.int %9249, %int0_11882, %int1_11883 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.25.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.25.modulation.lin.bias : tensor<9216xf16>
    %9251 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.25.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Matmul and bias add in f32; the multiplications by the integer 1 leave values unchanged.
    %int6_11884 = torch.constant.int 6
    %9252 = torch.prims.convert_element_type %9251, %int6_11884 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_11885 = torch.constant.int 6
    %9253 = torch.prims.convert_element_type %9248, %int6_11885 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_11886 = torch.constant.int 6
    %9254 = torch.prims.convert_element_type %9250, %int6_11886 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %9255 = torch.aten.mm %9253, %9254 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    %int1_11887 = torch.constant.int 1
    %9256 = torch.aten.mul.Scalar %9255, %int1_11887 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_11888 = torch.constant.int 1
    %9257 = torch.aten.mul.Scalar %9252, %int1_11888 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_11889 = torch.constant.int 1
    %9258 = torch.aten.add.Tensor %9256, %9257, %int1_11889 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_11890 = torch.constant.int 5
    %9259 = torch.prims.convert_element_type %9258, %int5_11890 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Slice over dim 0 from 0 to INT64_MAX: keeps the whole dim (result type is unchanged).
    %int0_11891 = torch.constant.int 0
    %int0_11892 = torch.constant.int 0
    %int9223372036854775807_11893 = torch.constant.int 9223372036854775807
    %int1_11894 = torch.constant.int 1
    %9260 = torch.aten.slice.Tensor %9259, %int0_11891, %int0_11892, %int9223372036854775807_11893, %int1_11894 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a unit dim at position 1 -> [1,1,9216] so the slices broadcast over the 4608 dim.
    %int1_11895 = torch.constant.int 1
    %9261 = torch.aten.unsqueeze %9260, %int1_11895 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Another full-range slice, this time over dim 2 (result type is unchanged).
    %int2_11896 = torch.constant.int 2
    %int0_11897 = torch.constant.int 0
    %int9223372036854775807_11898 = torch.constant.int 9223372036854775807
    %int1_11899 = torch.constant.int 1
    %9262 = torch.aten.slice.Tensor %9261, %int2_11896, %int0_11897, %int9223372036854775807_11898, %int1_11899 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // %9263 = [0:3072): added to the normalized activations below.
    %int-1_11900 = torch.constant.int -1
    %int0_11901 = torch.constant.int 0
    %int3072_11902 = torch.constant.int 3072
    %int1_11903 = torch.constant.int 1
    %9263 = torch.aten.slice.Tensor %9262, %int-1_11900, %int0_11901, %int3072_11902, %int1_11903 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %9264 = [3072:6144): becomes the multiplier after the +1 below.
    %int-1_11904 = torch.constant.int -1
    %int3072_11905 = torch.constant.int 3072
    %int6144_11906 = torch.constant.int 6144
    %int1_11907 = torch.constant.int 1
    %9264 = torch.aten.slice.Tensor %9262, %int-1_11904, %int3072_11905, %int6144_11906, %int1_11907 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %9265 = [6144:9216): not consumed within this excerpt
    // (NOTE(review): presumably the residual multiplier of block 25, mirroring %9140 in block 24).
    %int-1_11908 = torch.constant.int -1
    %int6144_11909 = torch.constant.int 6144
    %int9216_11910 = torch.constant.int 9216
    %int1_11911 = torch.constant.int 1
    %9265 = torch.aten.slice.Tensor %9262, %int-1_11908, %int6144_11909, %int9216_11910, %int1_11911 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %9266 = %9264 + 1.
    %int1_11912 = torch.constant.int 1
    %int1_11913 = torch.constant.int 1
    %9266 = torch.aten.add.Scalar %9264, %int1_11912, %int1_11913 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize the block-24 output %9247 over its last dim (statistics in f32), then apply
    // the multiplier %9266 and the offset %9263 from the block-25 modulation.
    %int6_11914 = torch.constant.int 6
    %9267 = torch.prims.convert_element_type %9247, %int6_11914 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    // var_mean over dim 2 with correction = 0 and the reduced dim kept:
    // %result0_11918 is the variance, %result1_11919 the mean.
    %int2_11915 = torch.constant.int 2
    %9268 = torch.prim.ListConstruct %int2_11915 : (!torch.int) -> !torch.list<int>
    %int0_11916 = torch.constant.int 0
    %true_11917 = torch.constant.bool true
    %result0_11918, %result1_11919 = torch.aten.var_mean.correction %9267, %9268, %int0_11916, %true_11917 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // %9270 = rsqrt(variance + epsilon), epsilon ~= 1e-6.
    %float9.999990e-07_11920 = torch.constant.float 9.9999999999999995E-7
    %int1_11921 = torch.constant.int 1
    %9269 = torch.aten.add.Scalar %result0_11918, %float9.999990e-07_11920, %int1_11921 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %9270 = torch.aten.rsqrt %9269 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // %9272 = (x - mean) * rsqrt(var + eps), then cast back to f16.
    %int1_11922 = torch.constant.int 1
    %9271 = torch.aten.sub.Tensor %9247, %result1_11919, %int1_11922 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %9272 = torch.aten.mul.Tensor %9271, %9270 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_11923 = torch.constant.int 5
    %9273 = torch.prims.convert_element_type %9272, %int5_11923 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // %9275 = %9266 * normalized + %9263.
    %9274 = torch.aten.mul.Tensor %9266, %9273 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_11924 = torch.constant.int 1
    %9275 = torch.aten.add.Tensor %9274, %9263, %int1_11924 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.25.linear1: [4608,3072] x [3072,21504] + bias, computed in f32 and
    // stored back as f16.
    // Drop the leading unit dim so the input is a 2-D matrix for mm.
    %int4608_11925 = torch.constant.int 4608
    %int3072_11926 = torch.constant.int 3072
    %9276 = torch.prim.ListConstruct %int4608_11925, %int3072_11926 : (!torch.int, !torch.int) -> !torch.list<int>
    %9277 = torch.aten.view %9275, %9276 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // Load the weight ([21504,3072]) and transpose it to [3072,21504].
    %__auto.sampler.single_blocks.25.linear1.weight = util.global.load @__auto.sampler.single_blocks.25.linear1.weight : tensor<21504x3072xf16>
    %9278 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.25.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_11927 = torch.constant.int 0
    %int1_11928 = torch.constant.int 1
    %9279 = torch.aten.transpose.int %9278, %int0_11927, %int1_11928 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    // Load the bias ([21504]).
    %__auto.sampler.single_blocks.25.linear1.bias = util.global.load @__auto.sampler.single_blocks.25.linear1.bias : tensor<21504xf16>
    %9280 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.25.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Upcast bias, activations and weight to f32 before the matmul.
    %int6_11929 = torch.constant.int 6
    %9281 = torch.prims.convert_element_type %9280, %int6_11929 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_11930 = torch.constant.int 6
    %9282 = torch.prims.convert_element_type %9277, %int6_11930 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_11931 = torch.constant.int 6
    %9283 = torch.prims.convert_element_type %9279, %int6_11931 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %9284 = torch.aten.mm %9282, %9283 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    %int1_11932 = torch.constant.int 1
    %9285 = torch.aten.mul.Scalar %9284, %int1_11932 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_11933 = torch.constant.int 1
    %9286 = torch.aten.mul.Scalar %9281, %int1_11933 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    // Add the bias (broadcast over rows), then cast the result to f16.
    %int1_11934 = torch.constant.int 1
    %9287 = torch.aten.add.Tensor %9285, %9286, %int1_11934 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_11935 = torch.constant.int 5
    %9288 = torch.prims.convert_element_type %9287, %int5_11935 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    // Restore the leading unit dim: [1,4608,21504].
    %int1_11936 = torch.constant.int 1
    %int4608_11937 = torch.constant.int 4608
    %int21504_11938 = torch.constant.int 21504
    %9289 = torch.prim.ListConstruct %int1_11936, %int4608_11937, %int21504_11938 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9290 = torch.aten.view %9288, %9289 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the last dimension of %9290 (21504 wide) into two slices:
    //   %9291 = [0, 9216)      -> reshaped below into three [24,128] groups
    //   %9292 = [9216, 21504)  -> 12288 wide, consumed later by the tanh-GELU
    %int-1_11939 = torch.constant.int -1
    %int0_11940 = torch.constant.int 0
    %int9216_11941 = torch.constant.int 9216
    %int1_11942 = torch.constant.int 1
    %9291 = torch.aten.slice.Tensor %9290, %int-1_11939, %int0_11940, %int9216_11941, %int1_11942 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_11943 = torch.constant.int -1
    %int9216_11944 = torch.constant.int 9216
    %int21504_11945 = torch.constant.int 21504
    %int1_11946 = torch.constant.int 1
    %9292 = torch.aten.slice.Tensor %9290, %int-1_11943, %int9216_11944, %int21504_11945, %int1_11946 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View the 9216-wide slice as [1,4608,3,24,128] (9216 = 3 * 24 * 128).
    %int1_11947 = torch.constant.int 1
    %int4608_11948 = torch.constant.int 4608
    %int3_11949 = torch.constant.int 3
    %int24_11950 = torch.constant.int 24
    %int128_11951 = torch.constant.int 128
    %9293 = torch.prim.ListConstruct %int1_11947, %int4608_11948, %int3_11949, %int24_11950, %int128_11951 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9294 = torch.aten.view %9291, %9293 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2,0,3,1,4) so the size-3 axis leads: [3,1,24,4608,128].
    %int2_11952 = torch.constant.int 2
    %int0_11953 = torch.constant.int 0
    %int3_11954 = torch.constant.int 3
    %int1_11955 = torch.constant.int 1
    %int4_11956 = torch.constant.int 4
    %9295 = torch.prim.ListConstruct %int2_11952, %int0_11953, %int3_11954, %int1_11955, %int4_11956 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9296 = torch.aten.permute %9294, %9295 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Select indices 0, 1 and 2 along the leading axis. Later in this excerpt %9297 is scaled by
    // query_norm.scale, %9298 by key_norm.scale, and %9299 is the third tensor operand of the attention op.
    %int0_11957 = torch.constant.int 0
    %int0_11958 = torch.constant.int 0
    %9297 = torch.aten.select.int %9296, %int0_11957, %int0_11958 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_11959 = torch.constant.int 0
    %int1_11960 = torch.constant.int 1
    %9298 = torch.aten.select.int %9296, %int0_11959, %int1_11960 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_11961 = torch.constant.int 0
    %int2_11962 = torch.constant.int 2
    %9299 = torch.aten.select.int %9296, %int0_11961, %int2_11962 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // RMS-style normalisation of %9297 over its last (128-wide) axis, computed in f32:
    //   y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps), then cast to f16 and scaled.
    // Dtype code 6 yields an f32 result (see the result type).
    %int6_11963 = torch.constant.int 6
    %9300 = torch.prims.convert_element_type %9297, %int6_11963 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Element-wise square.
    %int2_11964 = torch.constant.int 2
    %9301 = torch.aten.pow.Tensor_Scalar %9300, %int2_11964 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean over the last axis; keepdim=true leaves a trailing dimension of size 1 for broadcasting.
    %int-1_11965 = torch.constant.int -1
    %9302 = torch.prim.ListConstruct %int-1_11965 : (!torch.int) -> !torch.list<int>
    %true_11966 = torch.constant.bool true
    %none_11967 = torch.constant.none
    %9303 = torch.aten.mean.dim %9301, %9302, %true_11966, %none_11967 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // Add the epsilon constant (approximately 1e-6) before taking the reciprocal square root.
    %float9.999990e-07_11968 = torch.constant.float 9.9999999999999995E-7
    %int1_11969 = torch.constant.int 1
    %9304 = torch.aten.add.Scalar %9303, %float9.999990e-07_11968, %int1_11969 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %9305 = torch.aten.rsqrt %9304 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %9306 = torch.aten.mul.Tensor %9300, %9305 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    // Back to f16, then multiply by the [128] query_norm.scale parameter of single_blocks.25.
    %int5_11970 = torch.constant.int 5
    %9307 = torch.prims.convert_element_type %9306, %int5_11970 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.25.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.25.norm.query_norm.scale : tensor<128xf16>
    %9308 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.25.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %9309 = torch.aten.mul.Tensor %9307, %9308 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // RMS-style normalisation of %9298 over its last (128-wide) axis, computed in f32:
    //   y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps), then cast to f16 and scaled.
    // This is the same op sequence that is applied to %9297, with a different scale parameter.
    %int6_11971 = torch.constant.int 6
    %9310 = torch.prims.convert_element_type %9298, %int6_11971 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Element-wise square.
    %int2_11972 = torch.constant.int 2
    %9311 = torch.aten.pow.Tensor_Scalar %9310, %int2_11972 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean over the last axis with keepdim=true.
    %int-1_11973 = torch.constant.int -1
    %9312 = torch.prim.ListConstruct %int-1_11973 : (!torch.int) -> !torch.list<int>
    %true_11974 = torch.constant.bool true
    %none_11975 = torch.constant.none
    %9313 = torch.aten.mean.dim %9311, %9312, %true_11974, %none_11975 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // Add the epsilon constant (approximately 1e-6), then take the reciprocal square root.
    %float9.999990e-07_11976 = torch.constant.float 9.9999999999999995E-7
    %int1_11977 = torch.constant.int 1
    %9314 = torch.aten.add.Scalar %9313, %float9.999990e-07_11976, %int1_11977 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %9315 = torch.aten.rsqrt %9314 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %9316 = torch.aten.mul.Tensor %9310, %9315 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    // Back to f16, then multiply by the [128] key_norm.scale parameter of single_blocks.25.
    %int5_11978 = torch.constant.int 5
    %9317 = torch.prims.convert_element_type %9316, %int5_11978 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.25.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.25.norm.key_norm.scale : tensor<128xf16>
    %9318 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.25.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %9319 = torch.aten.mul.Tensor %9317, %9318 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // f16 -> f16 conversions of the two normalised tensors; input and result types are identical.
    %int5_11979 = torch.constant.int 5
    %9320 = torch.prims.convert_element_type %9309, %int5_11979 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_11980 = torch.constant.int 5
    %9321 = torch.prims.convert_element_type %9319, %int5_11980 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Promote %9320 to f32 and view the last axis (128) as [64,1,2], i.e. 64 pairs of values.
    %int6_11981 = torch.constant.int 6
    %9322 = torch.prims.convert_element_type %9320, %int6_11981 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_11982 = torch.constant.int 1
    %int24_11983 = torch.constant.int 24
    %int4608_11984 = torch.constant.int 4608
    %int64_11985 = torch.constant.int 64
    %int1_11986 = torch.constant.int 1
    %int2_11987 = torch.constant.int 2
    %9323 = torch.prim.ListConstruct %int1_11982, %int24_11983, %int4608_11984, %int64_11985, %int1_11986, %int2_11987 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9324 = torch.aten.view %9322, %9323 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Same promotion and [64,1,2] view for %9321.
    %int6_11988 = torch.constant.int 6
    %9325 = torch.prims.convert_element_type %9321, %int6_11988 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_11989 = torch.constant.int 1
    %int24_11990 = torch.constant.int 24
    %int4608_11991 = torch.constant.int 4608
    %int64_11992 = torch.constant.int 64
    %int1_11993 = torch.constant.int 1
    %int2_11994 = torch.constant.int 2
    %9326 = torch.prim.ListConstruct %int1_11989, %int24_11990, %int4608_11991, %int64_11992, %int1_11993, %int2_11994 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9327 = torch.aten.view %9325, %9326 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Combine each paired tensor with %211 ([1,1,4608,64,2,2], defined outside this excerpt):
    //   out = %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]
    // The size-1 axes broadcast (the 24 axis of x against %211, and the trailing 1 of x against 2).
    // NOTE(review): %211 is presumably the rotary position-embedding table — confirm at its definition.
    //
    // First input: %9324 -> %9334.
    %int5_11995 = torch.constant.int 5
    %int0_11996 = torch.constant.int 0
    %9328 = torch.aten.select.int %211, %int5_11995, %int0_11996 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_11997 = torch.constant.int 5
    %int0_11998 = torch.constant.int 0
    %9329 = torch.aten.select.int %9324, %int5_11997, %int0_11998 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9330 = torch.aten.mul.Tensor %9328, %9329 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_11999 = torch.constant.int 5
    %int1_12000 = torch.constant.int 1
    %9331 = torch.aten.select.int %211, %int5_11999, %int1_12000 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12001 = torch.constant.int 5
    %int1_12002 = torch.constant.int 1
    %9332 = torch.aten.select.int %9324, %int5_12001, %int1_12002 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9333 = torch.aten.mul.Tensor %9331, %9332 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_12003 = torch.constant.int 1
    %9334 = torch.aten.add.Tensor %9330, %9333, %int1_12003 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second input: %9327 -> %9341 (same formula, same %211 selections).
    %int5_12004 = torch.constant.int 5
    %int0_12005 = torch.constant.int 0
    %9335 = torch.aten.select.int %211, %int5_12004, %int0_12005 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12006 = torch.constant.int 5
    %int0_12007 = torch.constant.int 0
    %9336 = torch.aten.select.int %9327, %int5_12006, %int0_12007 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9337 = torch.aten.mul.Tensor %9335, %9336 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_12008 = torch.constant.int 5
    %int1_12009 = torch.constant.int 1
    %9338 = torch.aten.select.int %211, %int5_12008, %int1_12009 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12010 = torch.constant.int 5
    %int1_12011 = torch.constant.int 1
    %9339 = torch.aten.select.int %9327, %int5_12010, %int1_12011 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9340 = torch.aten.mul.Tensor %9338, %9339 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_12012 = torch.constant.int 1
    %9341 = torch.aten.add.Tensor %9337, %9340, %int1_12012 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Flatten the [64,2] pair layout back to 128 and cast to f16.
    // %9344 and %9347 become the first and second tensor operands of the attention op that follows.
    %int1_12013 = torch.constant.int 1
    %int24_12014 = torch.constant.int 24
    %int4608_12015 = torch.constant.int 4608
    %int128_12016 = torch.constant.int 128
    %9342 = torch.prim.ListConstruct %int1_12013, %int24_12014, %int4608_12015, %int128_12016 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9343 = torch.aten.view %9334, %9342 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_12017 = torch.constant.int 5
    %9344 = torch.prims.convert_element_type %9343, %int5_12017 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Same flatten and cast for %9341.
    %int1_12018 = torch.constant.int 1
    %int24_12019 = torch.constant.int 24
    %int4608_12020 = torch.constant.int 4608
    %int128_12021 = torch.constant.int 128
    %9345 = torch.prim.ListConstruct %int1_12018, %int24_12019, %int4608_12020, %int128_12021 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9346 = torch.aten.view %9341, %9345 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_12022 = torch.constant.int 5
    %9347 = torch.prims.convert_element_type %9346, %int5_12022 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled-dot-product attention over 24 heads of width 128, emitted as a generic torch.operator.
    // Tensor operands are (%9344, %9347, %9299); the scalar operands are 0.0, false, none, none.
    // NOTE(review): per the ATen signature these are presumably dropout_p, is_causal, attn_mask and scale — confirm.
    // The op returns two results; only result #0 ([1,24,4608,128], f16) is used in this excerpt.
    %float0.000000e00_12023 = torch.constant.float 0.000000e+00
    %false_12024 = torch.constant.bool false
    %none_12025 = torch.constant.none
    %none_12026 = torch.constant.none
    %9348:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%9344, %9347, %9299, %float0.000000e00_12023, %false_12024, %none_12025, %none_12026) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap axes 1 and 2: [1,24,4608,128] -> [1,4608,24,128].
    %int0_12027 = torch.constant.int 0
    %int2_12028 = torch.constant.int 2
    %int1_12029 = torch.constant.int 1
    %int3_12030 = torch.constant.int 3
    %9349 = torch.prim.ListConstruct %int0_12027, %int2_12028, %int1_12029, %int3_12030 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9350 = torch.aten.permute %9348#0, %9349 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the head axes: 24 * 128 = 3072 -> [1,4608,3072].
    %int1_12031 = torch.constant.int 1
    %int4608_12032 = torch.constant.int 4608
    %int3072_12033 = torch.constant.int 3072
    %9351 = torch.prim.ListConstruct %int1_12031, %int4608_12032, %int3072_12033 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9352 = torch.aten.view %9350, %9351 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Apply GELU with the "tanh" approximation to the 12288-wide slice %9292.
    %str_12034 = torch.constant.str "tanh"
    %9353 = torch.aten.gelu %9292, %str_12034 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    // Concatenate the attention output (3072) and the GELU output (12288) along dim 2 -> 15360.
    %9354 = torch.prim.ListConstruct %9352, %9353 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_12035 = torch.constant.int 2
    %9355 = torch.aten.cat %9354, %int2_12035 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Drop the leading size-1 dimension so the result can feed a 2-D matmul: [4608,15360].
    %int4608_12036 = torch.constant.int 4608
    %int15360_12037 = torch.constant.int 15360
    %9356 = torch.prim.ListConstruct %int4608_12036, %int15360_12037 : (!torch.int, !torch.int) -> !torch.list<int>
    %9357 = torch.aten.view %9355, %9356 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // single_blocks.25 linear2: y = x @ W^T + b, with the matmul and bias add done in f32.
    // Load the [3072,15360] weight and transpose it to [15360,3072].
    %__auto.sampler.single_blocks.25.linear2.weight = util.global.load @__auto.sampler.single_blocks.25.linear2.weight : tensor<3072x15360xf16>
    %9358 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.25.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_12038 = torch.constant.int 0
    %int1_12039 = torch.constant.int 1
    %9359 = torch.aten.transpose.int %9358, %int0_12038, %int1_12039 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.25.linear2.bias = util.global.load @__auto.sampler.single_blocks.25.linear2.bias : tensor<3072xf16>
    %9360 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.25.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Promote bias, activation and weight to f32 before the matmul.
    %int6_12040 = torch.constant.int 6
    %9361 = torch.prims.convert_element_type %9360, %int6_12040 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_12041 = torch.constant.int 6
    %9362 = torch.prims.convert_element_type %9357, %int6_12041 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_12042 = torch.constant.int 6
    %9363 = torch.prims.convert_element_type %9359, %int6_12042 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %9364 = torch.aten.mm %9362, %9363 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Both mul.Scalar ops multiply by the integer constant 1, so the values are unchanged.
    %int1_12043 = torch.constant.int 1
    %9365 = torch.aten.mul.Scalar %9364, %int1_12043 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_12044 = torch.constant.int 1
    %9366 = torch.aten.mul.Scalar %9361, %int1_12044 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_12045 = torch.constant.int 1
    %9367 = torch.aten.add.Tensor %9365, %9366, %int1_12045 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    // Cast back to f16 and restore the leading size-1 dimension: [1,4608,3072].
    %int5_12046 = torch.constant.int 5
    %9368 = torch.prims.convert_element_type %9367, %int5_12046 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_12047 = torch.constant.int 1
    %int4608_12048 = torch.constant.int 4608
    %int3072_12049 = torch.constant.int 3072
    %9369 = torch.prim.ListConstruct %int1_12047, %int4608_12048, %int3072_12049 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9370 = torch.aten.view %9368, %9369 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Multiply by %9265 ([1,1,3072], broadcast over the 4608 axis) and add the result to %9247.
    // Both %9265 and %9247 are defined above this excerpt.
    // NOTE(review): presumably the block-25 modulation gate and the block input (residual) — confirm.
    %9371 = torch.aten.mul.Tensor %9265, %9370 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_12050 = torch.constant.int 1
    %9372 = torch.aten.add.Tensor %9247, %9371, %int1_12050 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.26 modulation: lin(silu(%119)), producing a [1,9216] vector.
    // %119 ([1,3072], f16) is defined outside this excerpt.
    // NOTE(review): presumably the shared conditioning vector — confirm at its definition.
    %9373 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the [9216,3072] weight and transpose it to [3072,9216].
    %__auto.sampler.single_blocks.26.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.26.modulation.lin.weight : tensor<9216x3072xf16>
    %9374 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.26.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_12051 = torch.constant.int 0
    %int1_12052 = torch.constant.int 1
    %9375 = torch.aten.transpose.int %9374, %int0_12051, %int1_12052 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.26.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.26.modulation.lin.bias : tensor<9216xf16>
    %9376 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.26.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Promote bias, activation and weight to f32 before the matmul.
    %int6_12053 = torch.constant.int 6
    %9377 = torch.prims.convert_element_type %9376, %int6_12053 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_12054 = torch.constant.int 6
    %9378 = torch.prims.convert_element_type %9373, %int6_12054 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_12055 = torch.constant.int 6
    %9379 = torch.prims.convert_element_type %9375, %int6_12055 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %9380 = torch.aten.mm %9378, %9379 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Both mul.Scalar ops multiply by the integer constant 1, so the values are unchanged.
    %int1_12056 = torch.constant.int 1
    %9381 = torch.aten.mul.Scalar %9380, %int1_12056 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_12057 = torch.constant.int 1
    %9382 = torch.aten.mul.Scalar %9377, %int1_12057 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_12058 = torch.constant.int 1
    %9383 = torch.aten.add.Tensor %9381, %9382, %int1_12058 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    // Cast the result back to f16.
    %int5_12059 = torch.constant.int 5
    %9384 = torch.prims.convert_element_type %9383, %int5_12059 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Expand %9384 to [1,1,9216]. The two slices with end = 9223372036854775807 (int64 max) span the
    // whole axis, so only the unsqueeze changes the shape.
    %int0_12060 = torch.constant.int 0
    %int0_12061 = torch.constant.int 0
    %int9223372036854775807_12062 = torch.constant.int 9223372036854775807
    %int1_12063 = torch.constant.int 1
    %9385 = torch.aten.slice.Tensor %9384, %int0_12060, %int0_12061, %int9223372036854775807_12062, %int1_12063 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    %int1_12064 = torch.constant.int 1
    %9386 = torch.aten.unsqueeze %9385, %int1_12064 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int2_12065 = torch.constant.int 2
    %int0_12066 = torch.constant.int 0
    %int9223372036854775807_12067 = torch.constant.int 9223372036854775807
    %int1_12068 = torch.constant.int 1
    %9387 = torch.aten.slice.Tensor %9386, %int2_12065, %int0_12066, %int9223372036854775807_12067, %int1_12068 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Split the last axis into three 3072-wide chunks:
    //   %9388 = [0, 3072)     -> used later in this excerpt as the additive term after the normalisation
    //   %9389 = [3072, 6144)  -> has 1 added below; the sum multiplies the normalised activations
    //   %9390 = [6144, 9216)  -> not used within this excerpt
    // NOTE(review): presumably shift / scale / gate in that order — confirm against the model source.
    %int-1_12069 = torch.constant.int -1
    %int0_12070 = torch.constant.int 0
    %int3072_12071 = torch.constant.int 3072
    %int1_12072 = torch.constant.int 1
    %9388 = torch.aten.slice.Tensor %9387, %int-1_12069, %int0_12070, %int3072_12071, %int1_12072 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_12073 = torch.constant.int -1
    %int3072_12074 = torch.constant.int 3072
    %int6144_12075 = torch.constant.int 6144
    %int1_12076 = torch.constant.int 1
    %9389 = torch.aten.slice.Tensor %9387, %int-1_12073, %int3072_12074, %int6144_12075, %int1_12076 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_12077 = torch.constant.int -1
    %int6144_12078 = torch.constant.int 6144
    %int9216_12079 = torch.constant.int 9216
    %int1_12080 = torch.constant.int 1
    %9390 = torch.aten.slice.Tensor %9387, %int-1_12077, %int6144_12078, %int9216_12079, %int1_12080 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %9391 = %9389 + 1 (scalar add with alpha = 1).
    %int1_12081 = torch.constant.int 1
    %int1_12082 = torch.constant.int 1
    %9391 = torch.aten.add.Scalar %9389, %int1_12081, %int1_12082 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Mean/variance normalisation of %9372 over dim 2 (3072 wide), with no learned weight or bias,
    // followed by modulation: out = %9391 * norm(x) + %9388.
    // Statistics are computed on an f32 copy of the input.
    %int6_12083 = torch.constant.int 6
    %9392 = torch.prims.convert_element_type %9372, %int6_12083 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    // var_mean with correction = 0 and keepdim = true: result0 is the variance, result1 is the mean.
    %int2_12084 = torch.constant.int 2
    %9393 = torch.prim.ListConstruct %int2_12084 : (!torch.int) -> !torch.list<int>
    %int0_12085 = torch.constant.int 0
    %true_12086 = torch.constant.bool true
    %result0_12087, %result1_12088 = torch.aten.var_mean.correction %9392, %9393, %int0_12085, %true_12086 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // rsqrt(var + eps), with eps approximately 1e-6.
    %float9.999990e-07_12089 = torch.constant.float 9.9999999999999995E-7
    %int1_12090 = torch.constant.int 1
    %9394 = torch.aten.add.Scalar %result0_12087, %float9.999990e-07_12089, %int1_12090 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %9395 = torch.aten.rsqrt %9394 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the f16 tensor %9372 (not the f32 copy %9392) minus the f32 mean; the result type is f32.
    %int1_12091 = torch.constant.int 1
    %9396 = torch.aten.sub.Tensor %9372, %result1_12088, %int1_12091 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %9397 = torch.aten.mul.Tensor %9396, %9395 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    // Cast to f16, then apply the modulation: multiply by %9391 and add %9388 (both [1,1,3072], broadcast).
    %int5_12092 = torch.constant.int 5
    %9398 = torch.prims.convert_element_type %9397, %int5_12092 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %9399 = torch.aten.mul.Tensor %9391, %9398 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_12093 = torch.constant.int 1
    %9400 = torch.aten.add.Tensor %9399, %9388, %int1_12093 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Drop the leading size-1 dimension so the result can feed a 2-D matmul: [4608,3072].
    %int4608_12094 = torch.constant.int 4608
    %int3072_12095 = torch.constant.int 3072
    %9401 = torch.prim.ListConstruct %int4608_12094, %int3072_12095 : (!torch.int, !torch.int) -> !torch.list<int>
    %9402 = torch.aten.view %9400, %9401 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // single_blocks.26 linear1: y = x @ W^T + b, with the matmul and bias add done in f32.
    // Load the [21504,3072] weight and transpose it to [3072,21504].
    %__auto.sampler.single_blocks.26.linear1.weight = util.global.load @__auto.sampler.single_blocks.26.linear1.weight : tensor<21504x3072xf16>
    %9403 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.26.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_12096 = torch.constant.int 0
    %int1_12097 = torch.constant.int 1
    %9404 = torch.aten.transpose.int %9403, %int0_12096, %int1_12097 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.26.linear1.bias = util.global.load @__auto.sampler.single_blocks.26.linear1.bias : tensor<21504xf16>
    %9405 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.26.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Promote bias, activation and weight to f32 before the matmul.
    %int6_12098 = torch.constant.int 6
    %9406 = torch.prims.convert_element_type %9405, %int6_12098 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_12099 = torch.constant.int 6
    %9407 = torch.prims.convert_element_type %9402, %int6_12099 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_12100 = torch.constant.int 6
    %9408 = torch.prims.convert_element_type %9404, %int6_12100 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %9409 = torch.aten.mm %9407, %9408 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Both mul.Scalar ops multiply by the integer constant 1, so the values are unchanged.
    %int1_12101 = torch.constant.int 1
    %9410 = torch.aten.mul.Scalar %9409, %int1_12101 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_12102 = torch.constant.int 1
    %9411 = torch.aten.mul.Scalar %9406, %int1_12102 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_12103 = torch.constant.int 1
    %9412 = torch.aten.add.Tensor %9410, %9411, %int1_12103 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    // Cast back to f16 and restore the leading size-1 dimension: [1,4608,21504].
    %int5_12104 = torch.constant.int 5
    %9413 = torch.prims.convert_element_type %9412, %int5_12104 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_12105 = torch.constant.int 1
    %int4608_12106 = torch.constant.int 4608
    %int21504_12107 = torch.constant.int 21504
    %9414 = torch.prim.ListConstruct %int1_12105, %int4608_12106, %int21504_12107 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9415 = torch.aten.view %9413, %9414 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the last dimension of %9415 (21504 wide) into two slices:
    //   %9416 = [0, 9216)      -> reshaped below into three [24,128] groups
    //   %9417 = [9216, 21504)  -> 12288 wide; not consumed within this excerpt
    %int-1_12108 = torch.constant.int -1
    %int0_12109 = torch.constant.int 0
    %int9216_12110 = torch.constant.int 9216
    %int1_12111 = torch.constant.int 1
    %9416 = torch.aten.slice.Tensor %9415, %int-1_12108, %int0_12109, %int9216_12110, %int1_12111 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_12112 = torch.constant.int -1
    %int9216_12113 = torch.constant.int 9216
    %int21504_12114 = torch.constant.int 21504
    %int1_12115 = torch.constant.int 1
    %9417 = torch.aten.slice.Tensor %9415, %int-1_12112, %int9216_12113, %int21504_12114, %int1_12115 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View the 9216-wide slice as [1,4608,3,24,128] (9216 = 3 * 24 * 128).
    %int1_12116 = torch.constant.int 1
    %int4608_12117 = torch.constant.int 4608
    %int3_12118 = torch.constant.int 3
    %int24_12119 = torch.constant.int 24
    %int128_12120 = torch.constant.int 128
    %9418 = torch.prim.ListConstruct %int1_12116, %int4608_12117, %int3_12118, %int24_12119, %int128_12120 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9419 = torch.aten.view %9416, %9418 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2,0,3,1,4) so the size-3 axis leads: [3,1,24,4608,128].
    %int2_12121 = torch.constant.int 2
    %int0_12122 = torch.constant.int 0
    %int3_12123 = torch.constant.int 3
    %int1_12124 = torch.constant.int 1
    %int4_12125 = torch.constant.int 4
    %9420 = torch.prim.ListConstruct %int2_12121, %int0_12122, %int3_12123, %int1_12124, %int4_12125 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9421 = torch.aten.permute %9419, %9420 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Select indices 0, 1 and 2 along the leading axis. Later in this excerpt %9422 is scaled by
    // query_norm.scale and %9423 by key_norm.scale; %9424 is not consumed within this excerpt.
    // NOTE(review): %9424 is presumably the attention value tensor — confirm downstream.
    %int0_12126 = torch.constant.int 0
    %int0_12127 = torch.constant.int 0
    %9422 = torch.aten.select.int %9421, %int0_12126, %int0_12127 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_12128 = torch.constant.int 0
    %int1_12129 = torch.constant.int 1
    %9423 = torch.aten.select.int %9421, %int0_12128, %int1_12129 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_12130 = torch.constant.int 0
    %int2_12131 = torch.constant.int 2
    %9424 = torch.aten.select.int %9421, %int0_12130, %int2_12131 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // RMS-style normalisation of %9422 over its last (128-wide) axis, computed in f32:
    //   y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps), then cast to f16 and scaled.
    %int6_12132 = torch.constant.int 6
    %9425 = torch.prims.convert_element_type %9422, %int6_12132 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Element-wise square.
    %int2_12133 = torch.constant.int 2
    %9426 = torch.aten.pow.Tensor_Scalar %9425, %int2_12133 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean over the last axis with keepdim=true.
    %int-1_12134 = torch.constant.int -1
    %9427 = torch.prim.ListConstruct %int-1_12134 : (!torch.int) -> !torch.list<int>
    %true_12135 = torch.constant.bool true
    %none_12136 = torch.constant.none
    %9428 = torch.aten.mean.dim %9426, %9427, %true_12135, %none_12136 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // Add the epsilon constant (approximately 1e-6), then take the reciprocal square root.
    %float9.999990e-07_12137 = torch.constant.float 9.9999999999999995E-7
    %int1_12138 = torch.constant.int 1
    %9429 = torch.aten.add.Scalar %9428, %float9.999990e-07_12137, %int1_12138 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %9430 = torch.aten.rsqrt %9429 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %9431 = torch.aten.mul.Tensor %9425, %9430 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    // Back to f16, then multiply by the [128] query_norm.scale parameter of single_blocks.26.
    %int5_12139 = torch.constant.int 5
    %9432 = torch.prims.convert_element_type %9431, %int5_12139 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.26.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.26.norm.query_norm.scale : tensor<128xf16>
    %9433 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.26.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %9434 = torch.aten.mul.Tensor %9432, %9433 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // RMS-style normalisation of %9423 over its last (128-wide) axis, computed in f32:
    //   y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps), then cast to f16 and scaled.
    %int6_12140 = torch.constant.int 6
    %9435 = torch.prims.convert_element_type %9423, %int6_12140 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Element-wise square.
    %int2_12141 = torch.constant.int 2
    %9436 = torch.aten.pow.Tensor_Scalar %9435, %int2_12141 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean over the last axis with keepdim=true.
    %int-1_12142 = torch.constant.int -1
    %9437 = torch.prim.ListConstruct %int-1_12142 : (!torch.int) -> !torch.list<int>
    %true_12143 = torch.constant.bool true
    %none_12144 = torch.constant.none
    %9438 = torch.aten.mean.dim %9436, %9437, %true_12143, %none_12144 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // Add the epsilon constant (approximately 1e-6), then take the reciprocal square root.
    %float9.999990e-07_12145 = torch.constant.float 9.9999999999999995E-7
    %int1_12146 = torch.constant.int 1
    %9439 = torch.aten.add.Scalar %9438, %float9.999990e-07_12145, %int1_12146 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %9440 = torch.aten.rsqrt %9439 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %9441 = torch.aten.mul.Tensor %9435, %9440 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    // Back to f16, then multiply by the [128] key_norm.scale parameter of single_blocks.26.
    %int5_12147 = torch.constant.int 5
    %9442 = torch.prims.convert_element_type %9441, %int5_12147 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.26.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.26.norm.key_norm.scale : tensor<128xf16>
    %9443 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.26.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %9444 = torch.aten.mul.Tensor %9442, %9443 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.26: prepare the normalized tensors %9434 and %9444 for the rotary step.
    // The first two casts are f16 -> f16, i.e. they leave the element type unchanged.
    %int5_12148 = torch.constant.int 5
    %9445 = torch.prims.convert_element_type %9434, %int5_12148 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_12149 = torch.constant.int 5
    %9446 = torch.prims.convert_element_type %9444, %int5_12149 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // %9445: upcast to f32 and view the last dim of 128 as [64, 1, 2].
    %int6_12150 = torch.constant.int 6
    %9447 = torch.prims.convert_element_type %9445, %int6_12150 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_12151 = torch.constant.int 1
    %int24_12152 = torch.constant.int 24
    %int4608_12153 = torch.constant.int 4608
    %int64_12154 = torch.constant.int 64
    %int1_12155 = torch.constant.int 1
    %int2_12156 = torch.constant.int 2
    %9448 = torch.prim.ListConstruct %int1_12151, %int24_12152, %int4608_12153, %int64_12154, %int1_12155, %int2_12156 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9449 = torch.aten.view %9447, %9448 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // %9446: same upcast and [64, 1, 2] view.
    %int6_12157 = torch.constant.int 6
    %9450 = torch.prims.convert_element_type %9446, %int6_12157 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_12158 = torch.constant.int 1
    %int24_12159 = torch.constant.int 24
    %int4608_12160 = torch.constant.int 4608
    %int64_12161 = torch.constant.int 64
    %int1_12162 = torch.constant.int 1
    %int2_12163 = torch.constant.int 2
    %9451 = torch.prim.ListConstruct %int1_12158, %int24_12159, %int4608_12160, %int64_12161, %int1_12162, %int2_12163 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9452 = torch.aten.view %9450, %9451 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // single_blocks.26: combine %211 with the paired views %9449 and %9452.
    // For each input x the result is
    //   select(%211, dim 5, 0) * select(x, dim 5, 0) + select(%211, dim 5, 1) * select(x, dim 5, 1)
    // with the size-1 dims of both operands broadcast ([1,1,...,2] against [1,24,...,1]).
    // NOTE(review): %211 is defined outside this chunk; presumably the rotary-embedding
    // table shared by all blocks -- confirm.
    // First input: %9449 -> %9459.
    %int5_12164 = torch.constant.int 5
    %int0_12165 = torch.constant.int 0
    %9453 = torch.aten.select.int %211, %int5_12164, %int0_12165 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12166 = torch.constant.int 5
    %int0_12167 = torch.constant.int 0
    %9454 = torch.aten.select.int %9449, %int5_12166, %int0_12167 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9455 = torch.aten.mul.Tensor %9453, %9454 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_12168 = torch.constant.int 5
    %int1_12169 = torch.constant.int 1
    %9456 = torch.aten.select.int %211, %int5_12168, %int1_12169 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12170 = torch.constant.int 5
    %int1_12171 = torch.constant.int 1
    %9457 = torch.aten.select.int %9449, %int5_12170, %int1_12171 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9458 = torch.aten.mul.Tensor %9456, %9457 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_12172 = torch.constant.int 1
    %9459 = torch.aten.add.Tensor %9455, %9458, %int1_12172 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second input: %9452 -> %9466 (same op sequence).
    %int5_12173 = torch.constant.int 5
    %int0_12174 = torch.constant.int 0
    %9460 = torch.aten.select.int %211, %int5_12173, %int0_12174 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12175 = torch.constant.int 5
    %int0_12176 = torch.constant.int 0
    %9461 = torch.aten.select.int %9452, %int5_12175, %int0_12176 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9462 = torch.aten.mul.Tensor %9460, %9461 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_12177 = torch.constant.int 5
    %int1_12178 = torch.constant.int 1
    %9463 = torch.aten.select.int %211, %int5_12177, %int1_12178 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12179 = torch.constant.int 5
    %int1_12180 = torch.constant.int 1
    %9464 = torch.aten.select.int %9452, %int5_12179, %int1_12180 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9465 = torch.aten.mul.Tensor %9463, %9464 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_12181 = torch.constant.int 1
    %9466 = torch.aten.add.Tensor %9462, %9465, %int1_12181 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // single_blocks.26: fold the trailing [64, 2] dims of %9459 and %9466 back into 128
    // and cast each result from f32 to f16 (%9469 and %9472).
    %int1_12182 = torch.constant.int 1
    %int24_12183 = torch.constant.int 24
    %int4608_12184 = torch.constant.int 4608
    %int128_12185 = torch.constant.int 128
    %9467 = torch.prim.ListConstruct %int1_12182, %int24_12183, %int4608_12184, %int128_12185 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9468 = torch.aten.view %9459, %9467 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_12186 = torch.constant.int 5
    %9469 = torch.prims.convert_element_type %9468, %int5_12186 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_12187 = torch.constant.int 1
    %int24_12188 = torch.constant.int 24
    %int4608_12189 = torch.constant.int 4608
    %int128_12190 = torch.constant.int 128
    %9470 = torch.prim.ListConstruct %int1_12187, %int24_12188, %int4608_12189, %int128_12190 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9471 = torch.aten.view %9466, %9470 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_12191 = torch.constant.int 5
    %9472 = torch.prims.convert_element_type %9471, %int5_12191 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.26: scaled-dot-product attention over %9469, %9472 and %9424, called with
    // a 0.0 float, a false bool and two `none` operands.
    // NOTE(review): %9424 is defined above this chunk; presumably the value tensor. The
    // scalar operands presumably map to dropout_p, is_causal, attn_mask and scale in the
    // ATen signature -- confirm.
    %float0.000000e00_12192 = torch.constant.float 0.000000e+00
    %false_12193 = torch.constant.bool false
    %none_12194 = torch.constant.none
    %none_12195 = torch.constant.none
    // Result #1 ([1,24,4608], f32) is not consumed anywhere in the visible section.
    %9473:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%9469, %9472, %9424, %float0.000000e00_12192, %false_12193, %none_12194, %none_12195) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2 ([1,24,4608,128] -> [1,4608,24,128]) and merge 24 x 128 into 3072.
    %int0_12196 = torch.constant.int 0
    %int2_12197 = torch.constant.int 2
    %int1_12198 = torch.constant.int 1
    %int3_12199 = torch.constant.int 3
    %9474 = torch.prim.ListConstruct %int0_12196, %int2_12197, %int1_12198, %int3_12199 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9475 = torch.aten.permute %9473#0, %9474 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_12200 = torch.constant.int 1
    %int4608_12201 = torch.constant.int 4608
    %int3072_12202 = torch.constant.int 3072
    %9476 = torch.prim.ListConstruct %int1_12200, %int4608_12201, %int3072_12202 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9477 = torch.aten.view %9475, %9476 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.26 output stage: gelu("tanh") of %9417, concatenation with the attention
    // output %9477 along dim 2, the linear2 projection, then a multiply by %9390 and an add
    // onto %9372.
    // NOTE(review): %9417, %9390 and %9372 are defined above this chunk; presumably the MLP
    // branch, the modulation gate and the block input (residual) respectively -- confirm.
    %str_12203 = torch.constant.str "tanh"
    %9478 = torch.aten.gelu %9417, %str_12203 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %9479 = torch.prim.ListConstruct %9477, %9478 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_12204 = torch.constant.int 2
    %9480 = torch.aten.cat %9479, %int2_12204 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Drop the leading size-1 dim so the projection can be expressed as a 2-D mm.
    %int4608_12205 = torch.constant.int 4608
    %int15360_12206 = torch.constant.int 15360
    %9481 = torch.prim.ListConstruct %int4608_12205, %int15360_12206 : (!torch.int, !torch.int) -> !torch.list<int>
    %9482 = torch.aten.view %9480, %9481 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // linear2: weight is stored as [3072, 15360] and transposed to [15360, 3072] for the mm.
    %__auto.sampler.single_blocks.26.linear2.weight = util.global.load @__auto.sampler.single_blocks.26.linear2.weight : tensor<3072x15360xf16>
    %9483 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.26.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_12207 = torch.constant.int 0
    %int1_12208 = torch.constant.int 1
    %9484 = torch.aten.transpose.int %9483, %int0_12207, %int1_12208 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.26.linear2.bias = util.global.load @__auto.sampler.single_blocks.26.linear2.bias : tensor<3072xf16>
    %9485 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.26.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Bias, activation and weight are all upcast to f32; the mm and bias add run in f32.
    %int6_12209 = torch.constant.int 6
    %9486 = torch.prims.convert_element_type %9485, %int6_12209 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_12210 = torch.constant.int 6
    %9487 = torch.prims.convert_element_type %9482, %int6_12210 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_12211 = torch.constant.int 6
    %9488 = torch.prims.convert_element_type %9484, %int6_12211 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %9489 = torch.aten.mm %9487, %9488 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Both mul.Scalar ops multiply by the integer constant 1.
    %int1_12212 = torch.constant.int 1
    %9490 = torch.aten.mul.Scalar %9489, %int1_12212 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_12213 = torch.constant.int 1
    %9491 = torch.aten.mul.Scalar %9486, %int1_12213 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_12214 = torch.constant.int 1
    %9492 = torch.aten.add.Tensor %9490, %9491, %int1_12214 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    // Back to f16 and to the 3-D [1,4608,3072] layout.
    %int5_12215 = torch.constant.int 5
    %9493 = torch.prims.convert_element_type %9492, %int5_12215 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_12216 = torch.constant.int 1
    %int4608_12217 = torch.constant.int 4608
    %int3072_12218 = torch.constant.int 3072
    %9494 = torch.prim.ListConstruct %int1_12216, %int4608_12217, %int3072_12218 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9495 = torch.aten.view %9493, %9494 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // %9497 = %9372 + %9390 * %9495; %9497 is the tensor the next stage normalizes.
    %9496 = torch.aten.mul.Tensor %9390, %9495 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_12219 = torch.constant.int 1
    %9497 = torch.aten.add.Tensor %9372, %9496, %int1_12219 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.27 modulation: silu(%119) -> modulation.lin (3072 -> 9216) -> three
    // 3072-wide slices of the last dim.
    // NOTE(review): %119 is defined outside this chunk; presumably the conditioning
    // vector shared by all blocks -- confirm.
    %9498 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Weight is stored as [9216, 3072] and transposed to [3072, 9216] for the mm.
    %__auto.sampler.single_blocks.27.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.27.modulation.lin.weight : tensor<9216x3072xf16>
    %9499 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.27.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_12220 = torch.constant.int 0
    %int1_12221 = torch.constant.int 1
    %9500 = torch.aten.transpose.int %9499, %int0_12220, %int1_12221 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.27.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.27.modulation.lin.bias : tensor<9216xf16>
    %9501 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.27.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, input and weight to f32; mm and bias add run in f32, result is cast to f16.
    %int6_12222 = torch.constant.int 6
    %9502 = torch.prims.convert_element_type %9501, %int6_12222 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_12223 = torch.constant.int 6
    %9503 = torch.prims.convert_element_type %9498, %int6_12223 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_12224 = torch.constant.int 6
    %9504 = torch.prims.convert_element_type %9500, %int6_12224 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %9505 = torch.aten.mm %9503, %9504 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Both mul.Scalar ops multiply by the integer constant 1.
    %int1_12225 = torch.constant.int 1
    %9506 = torch.aten.mul.Scalar %9505, %int1_12225 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_12226 = torch.constant.int 1
    %9507 = torch.aten.mul.Scalar %9502, %int1_12226 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_12227 = torch.constant.int 1
    %9508 = torch.aten.add.Tensor %9506, %9507, %int1_12227 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_12228 = torch.constant.int 5
    %9509 = torch.prims.convert_element_type %9508, %int5_12228 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Full-range slice of dim 0 (shape unchanged), insert a size-1 dim at position 1,
    // then a full-range slice of dim 2 (shape unchanged).
    %int0_12229 = torch.constant.int 0
    %int0_12230 = torch.constant.int 0
    %int9223372036854775807_12231 = torch.constant.int 9223372036854775807
    %int1_12232 = torch.constant.int 1
    %9510 = torch.aten.slice.Tensor %9509, %int0_12229, %int0_12230, %int9223372036854775807_12231, %int1_12232 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    %int1_12233 = torch.constant.int 1
    %9511 = torch.aten.unsqueeze %9510, %int1_12233 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int2_12234 = torch.constant.int 2
    %int0_12235 = torch.constant.int 0
    %int9223372036854775807_12236 = torch.constant.int 9223372036854775807
    %int1_12237 = torch.constant.int 1
    %9512 = torch.aten.slice.Tensor %9511, %int2_12234, %int0_12235, %int9223372036854775807_12236, %int1_12237 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // %9513 = columns [0, 3072): added after the normalization that follows (shift).
    %int-1_12238 = torch.constant.int -1
    %int0_12239 = torch.constant.int 0
    %int3072_12240 = torch.constant.int 3072
    %int1_12241 = torch.constant.int 1
    %9513 = torch.aten.slice.Tensor %9512, %int-1_12238, %int0_12239, %int3072_12240, %int1_12241 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %9514 = columns [3072, 6144): becomes the multiplicative scale once 1 is added below.
    %int-1_12242 = torch.constant.int -1
    %int3072_12243 = torch.constant.int 3072
    %int6144_12244 = torch.constant.int 6144
    %int1_12245 = torch.constant.int 1
    %9514 = torch.aten.slice.Tensor %9512, %int-1_12242, %int3072_12243, %int6144_12244, %int1_12245 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %9515 = columns [6144, 9216): not consumed in the visible section; presumably the
    // gate applied at the end of block 27 -- confirm.
    %int-1_12246 = torch.constant.int -1
    %int6144_12247 = torch.constant.int 6144
    %int9216_12248 = torch.constant.int 9216
    %int1_12249 = torch.constant.int 1
    %9515 = torch.aten.slice.Tensor %9512, %int-1_12246, %int6144_12247, %int9216_12248, %int1_12249 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %9516 = %9514 + 1.
    %int1_12250 = torch.constant.int 1
    %int1_12251 = torch.constant.int 1
    %9516 = torch.aten.add.Scalar %9514, %int1_12250, %int1_12251 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // single_blocks.27 pre-norm: normalize %9497 over dim 2 using variance and mean computed
    // in f32 (correction = 0, keepdim = true), then apply %9516 * x_norm + %9513.
    // No learned weight or bias is applied by the normalization itself.
    %int6_12252 = torch.constant.int 6
    %9517 = torch.prims.convert_element_type %9497, %int6_12252 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_12253 = torch.constant.int 2
    %9518 = torch.prim.ListConstruct %int2_12253 : (!torch.int) -> !torch.list<int>
    %int0_12254 = torch.constant.int 0
    %true_12255 = torch.constant.bool true
    // result0 is the variance, result1 the mean (the order aten.var_mean returns them).
    %result0_12256, %result1_12257 = torch.aten.var_mean.correction %9517, %9518, %int0_12254, %true_12255 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    %float9.999990e-07_12258 = torch.constant.float 9.9999999999999995E-7
    %int1_12259 = torch.constant.int 1
    %9519 = torch.aten.add.Scalar %result0_12256, %float9.999990e-07_12258, %int1_12259 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %9520 = torch.aten.rsqrt %9519 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the f16 tensor %9497 (not the f32 copy %9517) with the f32 mean;
    // the declared result type is f32.
    %int1_12260 = torch.constant.int 1
    %9521 = torch.aten.sub.Tensor %9497, %result1_12257, %int1_12260 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %9522 = torch.aten.mul.Tensor %9521, %9520 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_12261 = torch.constant.int 5
    %9523 = torch.prims.convert_element_type %9522, %int5_12261 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Modulation in f16: multiply by %9516, then add %9513 (both broadcast over dim 1).
    %9524 = torch.aten.mul.Tensor %9516, %9523 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_12262 = torch.constant.int 1
    %9525 = torch.aten.add.Tensor %9524, %9513, %int1_12262 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.27 linear1: project the modulated activations %9525 from 3072 to 21504
    // features. The mm and bias add run in f32; input and output are f16.
    // Flatten to 2-D for the mm.
    %int4608_12263 = torch.constant.int 4608
    %int3072_12264 = torch.constant.int 3072
    %9526 = torch.prim.ListConstruct %int4608_12263, %int3072_12264 : (!torch.int, !torch.int) -> !torch.list<int>
    %9527 = torch.aten.view %9525, %9526 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // Weight is stored as [21504, 3072] and transposed to [3072, 21504].
    %__auto.sampler.single_blocks.27.linear1.weight = util.global.load @__auto.sampler.single_blocks.27.linear1.weight : tensor<21504x3072xf16>
    %9528 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.27.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_12265 = torch.constant.int 0
    %int1_12266 = torch.constant.int 1
    %9529 = torch.aten.transpose.int %9528, %int0_12265, %int1_12266 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.27.linear1.bias = util.global.load @__auto.sampler.single_blocks.27.linear1.bias : tensor<21504xf16>
    %9530 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.27.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    %int6_12267 = torch.constant.int 6
    %9531 = torch.prims.convert_element_type %9530, %int6_12267 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_12268 = torch.constant.int 6
    %9532 = torch.prims.convert_element_type %9527, %int6_12268 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_12269 = torch.constant.int 6
    %9533 = torch.prims.convert_element_type %9529, %int6_12269 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %9534 = torch.aten.mm %9532, %9533 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Both mul.Scalar ops multiply by the integer constant 1.
    %int1_12270 = torch.constant.int 1
    %9535 = torch.aten.mul.Scalar %9534, %int1_12270 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_12271 = torch.constant.int 1
    %9536 = torch.aten.mul.Scalar %9531, %int1_12271 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_12272 = torch.constant.int 1
    %9537 = torch.aten.add.Tensor %9535, %9536, %int1_12272 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    // Cast to f16 and restore the leading size-1 dim.
    %int5_12273 = torch.constant.int 5
    %9538 = torch.prims.convert_element_type %9537, %int5_12273 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_12274 = torch.constant.int 1
    %int4608_12275 = torch.constant.int 4608
    %int21504_12276 = torch.constant.int 21504
    %9539 = torch.prim.ListConstruct %int1_12274, %int4608_12275, %int21504_12276 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9540 = torch.aten.view %9538, %9539 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // single_blocks.27: split the linear1 output %9540 along the last dim and unpack the
    // first part into three [1,24,4608,128] tensors.
    // %9541 = columns [0, 9216).
    %int-1_12277 = torch.constant.int -1
    %int0_12278 = torch.constant.int 0
    %int9216_12279 = torch.constant.int 9216
    %int1_12280 = torch.constant.int 1
    %9541 = torch.aten.slice.Tensor %9540, %int-1_12277, %int0_12278, %int9216_12279, %int1_12280 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    // %9542 = columns [9216, 21504): not consumed in the visible section; presumably the
    // MLP branch fed to gelu later -- confirm.
    %int-1_12281 = torch.constant.int -1
    %int9216_12282 = torch.constant.int 9216
    %int21504_12283 = torch.constant.int 21504
    %int1_12284 = torch.constant.int 1
    %9542 = torch.aten.slice.Tensor %9540, %int-1_12281, %int9216_12282, %int21504_12283, %int1_12284 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View 9216 as [3, 24, 128], then permute with (2, 0, 3, 1, 4) to [3, 1, 24, 4608, 128].
    %int1_12285 = torch.constant.int 1
    %int4608_12286 = torch.constant.int 4608
    %int3_12287 = torch.constant.int 3
    %int24_12288 = torch.constant.int 24
    %int128_12289 = torch.constant.int 128
    %9543 = torch.prim.ListConstruct %int1_12285, %int4608_12286, %int3_12287, %int24_12288, %int128_12289 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9544 = torch.aten.view %9541, %9543 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    %int2_12290 = torch.constant.int 2
    %int0_12291 = torch.constant.int 0
    %int3_12292 = torch.constant.int 3
    %int1_12293 = torch.constant.int 1
    %int4_12294 = torch.constant.int 4
    %9545 = torch.prim.ListConstruct %int2_12290, %int0_12291, %int3_12292, %int1_12293, %int4_12294 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9546 = torch.aten.permute %9544, %9545 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0 of dim 0 -> %9547 (later scaled by query_norm.scale).
    %int0_12295 = torch.constant.int 0
    %int0_12296 = torch.constant.int 0
    %9547 = torch.aten.select.int %9546, %int0_12295, %int0_12296 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 1 of dim 0 -> %9548 (later scaled by key_norm.scale).
    %int0_12297 = torch.constant.int 0
    %int1_12298 = torch.constant.int 1
    %9548 = torch.aten.select.int %9546, %int0_12297, %int1_12298 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 2 of dim 0 -> %9549: not consumed in the visible section; presumably the
    // attention value tensor -- confirm.
    %int0_12299 = torch.constant.int 0
    %int2_12300 = torch.constant.int 2
    %9549 = torch.aten.select.int %9546, %int0_12299, %int2_12300 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.27, query normalization of %9547: x * rsqrt(mean(x^2, dim=-1) + 1e-6)
    // computed in f32, cast back to f16, then multiplied by the [128] query_norm.scale parameter.
    %int6_12301 = torch.constant.int 6
    %9550 = torch.prims.convert_element_type %9547, %int6_12301 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_12302 = torch.constant.int 2
    %9551 = torch.aten.pow.Tensor_Scalar %9550, %int2_12302 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean of the squares over the last dim, keeping it as size 1 for broadcasting.
    %int-1_12303 = torch.constant.int -1
    %9552 = torch.prim.ListConstruct %int-1_12303 : (!torch.int) -> !torch.list<int>
    %true_12304 = torch.constant.bool true
    %none_12305 = torch.constant.none
    %9553 = torch.aten.mean.dim %9551, %9552, %true_12304, %none_12305 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_12306 = torch.constant.float 9.9999999999999995E-7
    %int1_12307 = torch.constant.int 1
    %9554 = torch.aten.add.Scalar %9553, %float9.999990e-07_12306, %int1_12307 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %9555 = torch.aten.rsqrt %9554 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %9556 = torch.aten.mul.Tensor %9550, %9555 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    // Back to f16 before applying the learned scale.
    %int5_12308 = torch.constant.int 5
    %9557 = torch.prims.convert_element_type %9556, %int5_12308 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.27.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.27.norm.query_norm.scale : tensor<128xf16>
    %9558 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.27.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %9559 = torch.aten.mul.Tensor %9557, %9558 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.27, key normalization of %9548: x * rsqrt(mean(x^2, dim=-1) + 1e-6)
    // computed in f32, cast back to f16, then multiplied by the [128] key_norm.scale parameter.
    %int6_12309 = torch.constant.int 6
    %9560 = torch.prims.convert_element_type %9548, %int6_12309 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_12310 = torch.constant.int 2
    %9561 = torch.aten.pow.Tensor_Scalar %9560, %int2_12310 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean of the squares over the last dim, keeping it as size 1 for broadcasting.
    %int-1_12311 = torch.constant.int -1
    %9562 = torch.prim.ListConstruct %int-1_12311 : (!torch.int) -> !torch.list<int>
    %true_12312 = torch.constant.bool true
    %none_12313 = torch.constant.none
    %9563 = torch.aten.mean.dim %9561, %9562, %true_12312, %none_12313 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_12314 = torch.constant.float 9.9999999999999995E-7
    %int1_12315 = torch.constant.int 1
    %9564 = torch.aten.add.Scalar %9563, %float9.999990e-07_12314, %int1_12315 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %9565 = torch.aten.rsqrt %9564 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %9566 = torch.aten.mul.Tensor %9560, %9565 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    // Back to f16 before applying the learned scale.
    %int5_12316 = torch.constant.int 5
    %9567 = torch.prims.convert_element_type %9566, %int5_12316 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.27.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.27.norm.key_norm.scale : tensor<128xf16>
    %9568 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.27.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %9569 = torch.aten.mul.Tensor %9567, %9568 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.27: prepare the normalized query %9559 and key %9569 for the rotary step.
    // The first two casts are f16 -> f16, i.e. they leave the element type unchanged.
    %int5_12317 = torch.constant.int 5
    %9570 = torch.prims.convert_element_type %9559, %int5_12317 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_12318 = torch.constant.int 5
    %9571 = torch.prims.convert_element_type %9569, %int5_12318 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Query: upcast to f32 and view the last dim of 128 as [64, 1, 2].
    %int6_12319 = torch.constant.int 6
    %9572 = torch.prims.convert_element_type %9570, %int6_12319 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_12320 = torch.constant.int 1
    %int24_12321 = torch.constant.int 24
    %int4608_12322 = torch.constant.int 4608
    %int64_12323 = torch.constant.int 64
    %int1_12324 = torch.constant.int 1
    %int2_12325 = torch.constant.int 2
    %9573 = torch.prim.ListConstruct %int1_12320, %int24_12321, %int4608_12322, %int64_12323, %int1_12324, %int2_12325 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9574 = torch.aten.view %9572, %9573 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Key: same upcast and [64, 1, 2] view.
    %int6_12326 = torch.constant.int 6
    %9575 = torch.prims.convert_element_type %9571, %int6_12326 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_12327 = torch.constant.int 1
    %int24_12328 = torch.constant.int 24
    %int4608_12329 = torch.constant.int 4608
    %int64_12330 = torch.constant.int 64
    %int1_12331 = torch.constant.int 1
    %int2_12332 = torch.constant.int 2
    %9576 = torch.prim.ListConstruct %int1_12327, %int24_12328, %int4608_12329, %int64_12330, %int1_12331, %int2_12332 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9577 = torch.aten.view %9575, %9576 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // single_blocks.27: combine %211 with the paired query view %9574 and key view %9577.
    // For each input x the result is
    //   select(%211, dim 5, 0) * select(x, dim 5, 0) + select(%211, dim 5, 1) * select(x, dim 5, 1)
    // with the size-1 dims of both operands broadcast ([1,1,...,2] against [1,24,...,1]).
    // NOTE(review): %211 is defined outside this chunk; presumably the rotary-embedding
    // table shared by all blocks -- confirm.
    // Query: %9574 -> %9584.
    %int5_12333 = torch.constant.int 5
    %int0_12334 = torch.constant.int 0
    %9578 = torch.aten.select.int %211, %int5_12333, %int0_12334 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12335 = torch.constant.int 5
    %int0_12336 = torch.constant.int 0
    %9579 = torch.aten.select.int %9574, %int5_12335, %int0_12336 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9580 = torch.aten.mul.Tensor %9578, %9579 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_12337 = torch.constant.int 5
    %int1_12338 = torch.constant.int 1
    %9581 = torch.aten.select.int %211, %int5_12337, %int1_12338 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12339 = torch.constant.int 5
    %int1_12340 = torch.constant.int 1
    %9582 = torch.aten.select.int %9574, %int5_12339, %int1_12340 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9583 = torch.aten.mul.Tensor %9581, %9582 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_12341 = torch.constant.int 1
    %9584 = torch.aten.add.Tensor %9580, %9583, %int1_12341 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key: %9577 -> %9591 (same op sequence).
    %int5_12342 = torch.constant.int 5
    %int0_12343 = torch.constant.int 0
    %9585 = torch.aten.select.int %211, %int5_12342, %int0_12343 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12344 = torch.constant.int 5
    %int0_12345 = torch.constant.int 0
    %9586 = torch.aten.select.int %9577, %int5_12344, %int0_12345 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9587 = torch.aten.mul.Tensor %9585, %9586 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_12346 = torch.constant.int 5
    %int1_12347 = torch.constant.int 1
    %9588 = torch.aten.select.int %211, %int5_12346, %int1_12347 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12348 = torch.constant.int 5
    %int1_12349 = torch.constant.int 1
    %9589 = torch.aten.select.int %9577, %int5_12348, %int1_12349 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9590 = torch.aten.mul.Tensor %9588, %9589 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_12350 = torch.constant.int 1
    %9591 = torch.aten.add.Tensor %9587, %9590, %int1_12350 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_12351 = torch.constant.int 1
    %int24_12352 = torch.constant.int 24
    %int4608_12353 = torch.constant.int 4608
    %int128_12354 = torch.constant.int 128
    %9592 = torch.prim.ListConstruct %int1_12351, %int24_12352, %int4608_12353, %int128_12354 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9593 = torch.aten.view %9584, %9592 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_12355 = torch.constant.int 5
    %9594 = torch.prims.convert_element_type %9593, %int5_12355 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_12356 = torch.constant.int 1
    %int24_12357 = torch.constant.int 24
    %int4608_12358 = torch.constant.int 4608
    %int128_12359 = torch.constant.int 128
    %9595 = torch.prim.ListConstruct %int1_12356, %int24_12357, %int4608_12358, %int128_12359 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9596 = torch.aten.view %9591, %9595 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_12360 = torch.constant.int 5
    %9597 = torch.prims.convert_element_type %9596, %int5_12360 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Attention over the f16 [1,24,4608,128] operands %9594, %9597 and %9549
    // (%9549 is defined above this chunk -- presumably the value tensor; confirm).
    // The op is emitted as a generic torch.operator. The trailing scalar operands are
    // 0.0, false, none, none; NOTE(review): per the PyTorch signature these would be
    // dropout_p, is_causal, attn_mask, scale -- verify against the PyTorch docs.
    // Only result #0 is consumed in this chunk; result #1 ([1,24,4608] f32) is not.
    %float0.000000e00_12361 = torch.constant.float 0.000000e+00
    %false_12362 = torch.constant.bool false
    %none_12363 = torch.constant.none
    %none_12364 = torch.constant.none
    %9598:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%9594, %9597, %9549, %float0.000000e00_12361, %false_12362, %none_12363, %none_12364) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Permute with order (0,2,1,3): [1,24,4608,128] -> [1,4608,24,128].
    %int0_12365 = torch.constant.int 0
    %int2_12366 = torch.constant.int 2
    %int1_12367 = torch.constant.int 1
    %int3_12368 = torch.constant.int 3
    %9599 = torch.prim.ListConstruct %int0_12365, %int2_12366, %int1_12367, %int3_12368 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9600 = torch.aten.permute %9598#0, %9599 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the last two dims (24 * 128 = 3072): -> [1,4608,3072].
    %int1_12369 = torch.constant.int 1
    %int4608_12370 = torch.constant.int 4608
    %int3072_12371 = torch.constant.int 3072
    %9601 = torch.prim.ListConstruct %int1_12369, %int4608_12370, %int3072_12371 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9602 = torch.aten.view %9600, %9601 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // GELU with the "tanh" approximation on %9542 ([1,4608,12288], defined above this
    // chunk), then concatenate with the attention output %9602 along dim 2:
    // 3072 + 12288 = 15360 features.
    %str_12372 = torch.constant.str "tanh"
    %9603 = torch.aten.gelu %9542, %str_12372 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %9604 = torch.prim.ListConstruct %9602, %9603 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_12373 = torch.constant.int 2
    %9605 = torch.aten.cat %9604, %int2_12373 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Drop the leading size-1 dim so the tensor can feed a 2-D matmul.
    %int4608_12374 = torch.constant.int 4608
    %int15360_12375 = torch.constant.int 15360
    %9606 = torch.prim.ListConstruct %int4608_12374, %int15360_12375 : (!torch.int, !torch.int) -> !torch.list<int>
    %9607 = torch.aten.view %9605, %9606 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // single_blocks.27.linear2: load weight [3072,15360] and bias [3072] from the
    // module globals; the weight is transposed to [15360,3072] for x @ W^T.
    %__auto.sampler.single_blocks.27.linear2.weight = util.global.load @__auto.sampler.single_blocks.27.linear2.weight : tensor<3072x15360xf16>
    %9608 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.27.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_12376 = torch.constant.int 0
    %int1_12377 = torch.constant.int 1
    %9609 = torch.aten.transpose.int %9608, %int0_12376, %int1_12377 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.27.linear2.bias = util.global.load @__auto.sampler.single_blocks.27.linear2.bias : tensor<3072xf16>
    %9610 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.27.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Bias, activation and weight are all cast to f32 (dtype code 6) so the matmul
    // and bias add run in f32.
    %int6_12378 = torch.constant.int 6
    %9611 = torch.prims.convert_element_type %9610, %int6_12378 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_12379 = torch.constant.int 6
    %9612 = torch.prims.convert_element_type %9607, %int6_12379 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_12380 = torch.constant.int 6
    %9613 = torch.prims.convert_element_type %9609, %int6_12380 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %9614 = torch.aten.mm %9612, %9613 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Both mul.Scalar ops multiply by the integer constant 1, and the add uses
    // alpha = 1, so the result is mm + bias.
    // NOTE(review): this looks like a decomposed addmm with alpha = beta = 1 -- confirm.
    %int1_12381 = torch.constant.int 1
    %9615 = torch.aten.mul.Scalar %9614, %int1_12381 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_12382 = torch.constant.int 1
    %9616 = torch.aten.mul.Scalar %9611, %int1_12382 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_12383 = torch.constant.int 1
    %9617 = torch.aten.add.Tensor %9615, %9616, %int1_12383 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    // Back to f16 (dtype code 5) and restore the leading size-1 dim.
    %int5_12384 = torch.constant.int 5
    %9618 = torch.prims.convert_element_type %9617, %int5_12384 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_12385 = torch.constant.int 1
    %int4608_12386 = torch.constant.int 4608
    %int3072_12387 = torch.constant.int 3072
    %9619 = torch.prim.ListConstruct %int1_12385, %int4608_12386, %int3072_12387 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9620 = torch.aten.view %9618, %9619 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // %9622 = %9497 + %9515 * %9620, with %9515 ([1,1,3072]) broadcast over dim 1.
    // NOTE(review): %9497 and %9515 are defined above this chunk; presumably the
    // block input (residual) and the modulation gate of single_blocks.27 -- confirm.
    // %9622 is the tensor normalized at the start of single_blocks.28 below.
    %9621 = torch.aten.mul.Tensor %9515, %9620 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_12388 = torch.constant.int 1
    %9622 = torch.aten.add.Tensor %9497, %9621, %int1_12388 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // ---- single_blocks.28: modulation ----
    // SiLU of %119 ([1,3072] f16, defined outside this chunk -- presumably the
    // conditioning vector; confirm), followed by the modulation.lin projection
    // 3072 -> 9216.
    %9623 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.28.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.28.modulation.lin.weight : tensor<9216x3072xf16>
    %9624 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.28.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    // Transpose the weight to [3072,9216] for x @ W^T.
    %int0_12389 = torch.constant.int 0
    %int1_12390 = torch.constant.int 1
    %9625 = torch.aten.transpose.int %9624, %int0_12389, %int1_12390 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.28.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.28.modulation.lin.bias : tensor<9216xf16>
    %9626 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.28.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Cast bias, input and weight to f32 (dtype code 6); matmul and bias add run in f32.
    %int6_12391 = torch.constant.int 6
    %9627 = torch.prims.convert_element_type %9626, %int6_12391 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_12392 = torch.constant.int 6
    %9628 = torch.prims.convert_element_type %9623, %int6_12392 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_12393 = torch.constant.int 6
    %9629 = torch.prims.convert_element_type %9625, %int6_12393 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %9630 = torch.aten.mm %9628, %9629 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Multiplications by the integer constant 1 and add with alpha = 1: mm + bias.
    %int1_12394 = torch.constant.int 1
    %9631 = torch.aten.mul.Scalar %9630, %int1_12394 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_12395 = torch.constant.int 1
    %9632 = torch.aten.mul.Scalar %9627, %int1_12395 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_12396 = torch.constant.int 1
    %9633 = torch.aten.add.Tensor %9631, %9632, %int1_12396 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_12397 = torch.constant.int 5
    %9634 = torch.prims.convert_element_type %9633, %int5_12397 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Full-range slice on dim 0 (0 .. INT64_MAX, step 1): the shape is unchanged.
    %int0_12398 = torch.constant.int 0
    %int0_12399 = torch.constant.int 0
    %int9223372036854775807_12400 = torch.constant.int 9223372036854775807
    %int1_12401 = torch.constant.int 1
    %9635 = torch.aten.slice.Tensor %9634, %int0_12398, %int0_12399, %int9223372036854775807_12400, %int1_12401 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a size-1 dim at position 1: [1,9216] -> [1,1,9216].
    %int1_12402 = torch.constant.int 1
    %9636 = torch.aten.unsqueeze %9635, %int1_12402 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Full-range slice on dim 2: the shape is unchanged.
    %int2_12403 = torch.constant.int 2
    %int0_12404 = torch.constant.int 0
    %int9223372036854775807_12405 = torch.constant.int 9223372036854775807
    %int1_12406 = torch.constant.int 1
    %9637 = torch.aten.slice.Tensor %9636, %int2_12403, %int0_12404, %int9223372036854775807_12405, %int1_12406 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Split the last dim into three 3072-wide chunks:
    //   %9638 = [0, 3072)    -- added after the scaled norm below (shift)
    //   %9639 = [3072, 6144) -- (1 + %9639) multiplies the normalized input (scale)
    //   %9640 = [6144, 9216) -- not consumed in this chunk; presumably the gate applied
    //                           after linear2, as %9515 is for single_blocks.27 -- confirm
    %int-1_12407 = torch.constant.int -1
    %int0_12408 = torch.constant.int 0
    %int3072_12409 = torch.constant.int 3072
    %int1_12410 = torch.constant.int 1
    %9638 = torch.aten.slice.Tensor %9637, %int-1_12407, %int0_12408, %int3072_12409, %int1_12410 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_12411 = torch.constant.int -1
    %int3072_12412 = torch.constant.int 3072
    %int6144_12413 = torch.constant.int 6144
    %int1_12414 = torch.constant.int 1
    %9639 = torch.aten.slice.Tensor %9637, %int-1_12411, %int3072_12412, %int6144_12413, %int1_12414 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_12415 = torch.constant.int -1
    %int6144_12416 = torch.constant.int 6144
    %int9216_12417 = torch.constant.int 9216
    %int1_12418 = torch.constant.int 1
    %9640 = torch.aten.slice.Tensor %9637, %int-1_12415, %int6144_12416, %int9216_12417, %int1_12418 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %9641 = %9639 + 1 (scalar 1, alpha 1).
    %int1_12419 = torch.constant.int 1
    %int1_12420 = torch.constant.int 1
    %9641 = torch.aten.add.Scalar %9639, %int1_12419, %int1_12420 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // ---- single_blocks.28: normalization + modulation of the block input %9622 ----
    // Variance and mean are computed in f32 over dim 2 (3072 features) with
    // correction = 0 and keepdim = true; both results are [1,4608,1].
    %int6_12421 = torch.constant.int 6
    %9642 = torch.prims.convert_element_type %9622, %int6_12421 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_12422 = torch.constant.int 2
    %9643 = torch.prim.ListConstruct %int2_12422 : (!torch.int) -> !torch.list<int>
    %int0_12423 = torch.constant.int 0
    %true_12424 = torch.constant.bool true
    %result0_12425, %result1_12426 = torch.aten.var_mean.correction %9642, %9643, %int0_12423, %true_12424 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // %9645 = rsqrt(var + eps), eps ~= 1e-6.
    %float9.999990e-07_12427 = torch.constant.float 9.9999999999999995E-7
    %int1_12428 = torch.constant.int 1
    %9644 = torch.aten.add.Scalar %result0_12425, %float9.999990e-07_12427, %int1_12428 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %9645 = torch.aten.rsqrt %9644 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the f16 tensor %9622 (not the f32 copy %9642) together
    // with the f32 mean; the result type is f32.
    %int1_12429 = torch.constant.int 1
    %9646 = torch.aten.sub.Tensor %9622, %result1_12426, %int1_12429 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    // (x - mean) * rsqrt(var + eps); no learned weight or bias is applied here.
    %9647 = torch.aten.mul.Tensor %9646, %9645 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_12430 = torch.constant.int 5
    %9648 = torch.prims.convert_element_type %9647, %int5_12430 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Modulate in f16: %9650 = %9641 * normalized + %9638, where %9641 and %9638 are
    // [1,1,3072] tensors defined just above this stage and broadcast over dim 1.
    %9649 = torch.aten.mul.Tensor %9641, %9648 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_12431 = torch.constant.int 1
    %9650 = torch.aten.add.Tensor %9649, %9638, %int1_12431 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // ---- single_blocks.28: linear1 (3072 -> 21504) ----
    // Drop the leading size-1 dim of %9650 so it can feed a 2-D matmul.
    %int4608_12432 = torch.constant.int 4608
    %int3072_12433 = torch.constant.int 3072
    %9651 = torch.prim.ListConstruct %int4608_12432, %int3072_12433 : (!torch.int, !torch.int) -> !torch.list<int>
    %9652 = torch.aten.view %9650, %9651 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // Load weight [21504,3072] and bias [21504]; the weight is transposed to
    // [3072,21504] for x @ W^T.
    %__auto.sampler.single_blocks.28.linear1.weight = util.global.load @__auto.sampler.single_blocks.28.linear1.weight : tensor<21504x3072xf16>
    %9653 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.28.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_12434 = torch.constant.int 0
    %int1_12435 = torch.constant.int 1
    %9654 = torch.aten.transpose.int %9653, %int0_12434, %int1_12435 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.28.linear1.bias = util.global.load @__auto.sampler.single_blocks.28.linear1.bias : tensor<21504xf16>
    %9655 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.28.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Cast bias, activation and weight to f32 (dtype code 6); matmul and bias add run in f32.
    %int6_12436 = torch.constant.int 6
    %9656 = torch.prims.convert_element_type %9655, %int6_12436 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_12437 = torch.constant.int 6
    %9657 = torch.prims.convert_element_type %9652, %int6_12437 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_12438 = torch.constant.int 6
    %9658 = torch.prims.convert_element_type %9654, %int6_12438 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %9659 = torch.aten.mm %9657, %9658 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Multiplications by the integer constant 1 and add with alpha = 1: mm + bias.
    %int1_12439 = torch.constant.int 1
    %9660 = torch.aten.mul.Scalar %9659, %int1_12439 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_12440 = torch.constant.int 1
    %9661 = torch.aten.mul.Scalar %9656, %int1_12440 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_12441 = torch.constant.int 1
    %9662 = torch.aten.add.Tensor %9660, %9661, %int1_12441 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    // Back to f16 (dtype code 5) and restore the leading size-1 dim.
    %int5_12442 = torch.constant.int 5
    %9663 = torch.prims.convert_element_type %9662, %int5_12442 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_12443 = torch.constant.int 1
    %int4608_12444 = torch.constant.int 4608
    %int21504_12445 = torch.constant.int 21504
    %9664 = torch.prim.ListConstruct %int1_12443, %int4608_12444, %int21504_12445 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9665 = torch.aten.view %9663, %9664 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the last dim:
    //   %9666 = [0, 9216)     -> later viewed as [1,4608,3,24,128] (3 * 24 * 128 = 9216)
    //   %9667 = [9216, 21504) -> 12288 features, later passed through GELU
    %int-1_12446 = torch.constant.int -1
    %int0_12447 = torch.constant.int 0
    %int9216_12448 = torch.constant.int 9216
    %int1_12449 = torch.constant.int 1
    %9666 = torch.aten.slice.Tensor %9665, %int-1_12446, %int0_12447, %int9216_12448, %int1_12449 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_12450 = torch.constant.int -1
    %int9216_12451 = torch.constant.int 9216
    %int21504_12452 = torch.constant.int 21504
    %int1_12453 = torch.constant.int 1
    %9667 = torch.aten.slice.Tensor %9665, %int-1_12450, %int9216_12451, %int21504_12452, %int1_12453 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // ---- single_blocks.28: split the 9216-wide slice %9666 into three head tensors ----
    // View [1,4608,9216] as [1,4608,3,24,128].
    %int1_12454 = torch.constant.int 1
    %int4608_12455 = torch.constant.int 4608
    %int3_12456 = torch.constant.int 3
    %int24_12457 = torch.constant.int 24
    %int128_12458 = torch.constant.int 128
    %9668 = torch.prim.ListConstruct %int1_12454, %int4608_12455, %int3_12456, %int24_12457, %int128_12458 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9669 = torch.aten.view %9666, %9668 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2,0,3,1,4): -> [3,1,24,4608,128].
    %int2_12459 = torch.constant.int 2
    %int0_12460 = torch.constant.int 0
    %int3_12461 = torch.constant.int 3
    %int1_12462 = torch.constant.int 1
    %int4_12463 = torch.constant.int 4
    %9670 = torch.prim.ListConstruct %int2_12459, %int0_12460, %int3_12461, %int1_12462, %int4_12463 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9671 = torch.aten.permute %9669, %9670 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index the leading dim:
    //   %9672 = [0] -> normalized with norm.query_norm.scale below (query)
    //   %9673 = [1] -> normalized with norm.key_norm.scale below (key)
    //   %9674 = [2] -> third operand of the attention op below (presumably value)
    %int0_12464 = torch.constant.int 0
    %int0_12465 = torch.constant.int 0
    %9672 = torch.aten.select.int %9671, %int0_12464, %int0_12465 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_12466 = torch.constant.int 0
    %int1_12467 = torch.constant.int 1
    %9673 = torch.aten.select.int %9671, %int0_12466, %int1_12467 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_12468 = torch.constant.int 0
    %int2_12469 = torch.constant.int 2
    %9674 = torch.aten.select.int %9671, %int0_12468, %int2_12469 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- single_blocks.28: RMS normalization of %9672 (query) ----
    // Computed in f32: x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps), eps ~= 1e-6,
    // then cast to f16 and multiplied by the [128] norm.query_norm.scale parameter.
    %int6_12470 = torch.constant.int 6
    %9675 = torch.prims.convert_element_type %9672, %int6_12470 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_12471 = torch.constant.int 2
    %9676 = torch.aten.pow.Tensor_Scalar %9675, %int2_12471 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_12472 = torch.constant.int -1
    %9677 = torch.prim.ListConstruct %int-1_12472 : (!torch.int) -> !torch.list<int>
    %true_12473 = torch.constant.bool true
    %none_12474 = torch.constant.none
    %9678 = torch.aten.mean.dim %9676, %9677, %true_12473, %none_12474 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_12475 = torch.constant.float 9.9999999999999995E-7
    %int1_12476 = torch.constant.int 1
    %9679 = torch.aten.add.Scalar %9678, %float9.999990e-07_12475, %int1_12476 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %9680 = torch.aten.rsqrt %9679 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %9681 = torch.aten.mul.Tensor %9675, %9680 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_12477 = torch.constant.int 5
    %9682 = torch.prims.convert_element_type %9681, %int5_12477 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.28.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.28.norm.query_norm.scale : tensor<128xf16>
    %9683 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.28.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %9684 = torch.aten.mul.Tensor %9682, %9683 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- single_blocks.28: RMS normalization of %9673 (key) ----
    // Same computation as above, scaled by the [128] norm.key_norm.scale parameter.
    %int6_12478 = torch.constant.int 6
    %9685 = torch.prims.convert_element_type %9673, %int6_12478 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_12479 = torch.constant.int 2
    %9686 = torch.aten.pow.Tensor_Scalar %9685, %int2_12479 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_12480 = torch.constant.int -1
    %9687 = torch.prim.ListConstruct %int-1_12480 : (!torch.int) -> !torch.list<int>
    %true_12481 = torch.constant.bool true
    %none_12482 = torch.constant.none
    %9688 = torch.aten.mean.dim %9686, %9687, %true_12481, %none_12482 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_12483 = torch.constant.float 9.9999999999999995E-7
    %int1_12484 = torch.constant.int 1
    %9689 = torch.aten.add.Scalar %9688, %float9.999990e-07_12483, %int1_12484 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %9690 = torch.aten.rsqrt %9689 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %9691 = torch.aten.mul.Tensor %9685, %9690 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_12485 = torch.constant.int 5
    %9692 = torch.prims.convert_element_type %9691, %int5_12485 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.28.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.28.norm.key_norm.scale : tensor<128xf16>
    %9693 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.28.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %9694 = torch.aten.mul.Tensor %9692, %9693 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- single_blocks.28: combine the normalized query/key with %211 ----
    // These two casts are f16 -> f16 (dtype code 5 on f16 inputs), i.e. the element
    // type is unchanged. NOTE(review): probably a traced `.to(dtype)` on tensors that
    // were already f16; kept as emitted.
    %int5_12486 = torch.constant.int 5
    %9695 = torch.prims.convert_element_type %9684, %int5_12486 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_12487 = torch.constant.int 5
    %9696 = torch.prims.convert_element_type %9694, %int5_12487 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Query: cast to f32 and view [1,24,4608,128] as [1,24,4608,64,1,2]
    // (the 128 features become 64 pairs).
    %int6_12488 = torch.constant.int 6
    %9697 = torch.prims.convert_element_type %9695, %int6_12488 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_12489 = torch.constant.int 1
    %int24_12490 = torch.constant.int 24
    %int4608_12491 = torch.constant.int 4608
    %int64_12492 = torch.constant.int 64
    %int1_12493 = torch.constant.int 1
    %int2_12494 = torch.constant.int 2
    %9698 = torch.prim.ListConstruct %int1_12489, %int24_12490, %int4608_12491, %int64_12492, %int1_12493, %int2_12494 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9699 = torch.aten.view %9697, %9698 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Key: same cast and view.
    %int6_12495 = torch.constant.int 6
    %9700 = torch.prims.convert_element_type %9696, %int6_12495 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_12496 = torch.constant.int 1
    %int24_12497 = torch.constant.int 24
    %int4608_12498 = torch.constant.int 4608
    %int64_12499 = torch.constant.int 64
    %int1_12500 = torch.constant.int 1
    %int2_12501 = torch.constant.int 2
    %9701 = torch.prim.ListConstruct %int1_12496, %int24_12497, %int4608_12498, %int64_12499, %int1_12500, %int2_12501 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9702 = torch.aten.view %9700, %9701 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Query: %9709 = %211[..., 0] * %9699[..., 0] + %211[..., 1] * %9699[..., 1]
    // (selects on dim 5; the size-1 head dim of %211 and the size-1 dim 4 of %9699 broadcast).
    // NOTE(review): %211 ([1,1,4608,64,2,2]) is defined outside this chunk; the pattern
    // looks like applying a per-position 2x2 rotation (rotary embedding) -- confirm.
    %int5_12502 = torch.constant.int 5
    %int0_12503 = torch.constant.int 0
    %9703 = torch.aten.select.int %211, %int5_12502, %int0_12503 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12504 = torch.constant.int 5
    %int0_12505 = torch.constant.int 0
    %9704 = torch.aten.select.int %9699, %int5_12504, %int0_12505 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9705 = torch.aten.mul.Tensor %9703, %9704 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_12506 = torch.constant.int 5
    %int1_12507 = torch.constant.int 1
    %9706 = torch.aten.select.int %211, %int5_12506, %int1_12507 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12508 = torch.constant.int 5
    %int1_12509 = torch.constant.int 1
    %9707 = torch.aten.select.int %9699, %int5_12508, %int1_12509 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9708 = torch.aten.mul.Tensor %9706, %9707 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_12510 = torch.constant.int 1
    %9709 = torch.aten.add.Tensor %9705, %9708, %int1_12510 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key: the same combination applied to %9702, producing %9716.
    // The selects on %211 are re-emitted rather than reusing %9703 / %9706.
    %int5_12511 = torch.constant.int 5
    %int0_12512 = torch.constant.int 0
    %9710 = torch.aten.select.int %211, %int5_12511, %int0_12512 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12513 = torch.constant.int 5
    %int0_12514 = torch.constant.int 0
    %9711 = torch.aten.select.int %9702, %int5_12513, %int0_12514 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9712 = torch.aten.mul.Tensor %9710, %9711 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_12515 = torch.constant.int 5
    %int1_12516 = torch.constant.int 1
    %9713 = torch.aten.select.int %211, %int5_12515, %int1_12516 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12517 = torch.constant.int 5
    %int1_12518 = torch.constant.int 1
    %9714 = torch.aten.select.int %9702, %int5_12517, %int1_12518 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9715 = torch.aten.mul.Tensor %9713, %9714 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_12519 = torch.constant.int 1
    %9716 = torch.aten.add.Tensor %9712, %9715, %int1_12519 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Collapse both results back to [1,24,4608,128] and cast f32 -> f16 (dtype code 5):
    // %9719 from the query result %9709, %9722 from the key result %9716.
    %int1_12520 = torch.constant.int 1
    %int24_12521 = torch.constant.int 24
    %int4608_12522 = torch.constant.int 4608
    %int128_12523 = torch.constant.int 128
    %9717 = torch.prim.ListConstruct %int1_12520, %int24_12521, %int4608_12522, %int128_12523 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9718 = torch.aten.view %9709, %9717 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_12524 = torch.constant.int 5
    %9719 = torch.prims.convert_element_type %9718, %int5_12524 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_12525 = torch.constant.int 1
    %int24_12526 = torch.constant.int 24
    %int4608_12527 = torch.constant.int 4608
    %int128_12528 = torch.constant.int 128
    %9720 = torch.prim.ListConstruct %int1_12525, %int24_12526, %int4608_12527, %int128_12528 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9721 = torch.aten.view %9716, %9720 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_12529 = torch.constant.int 5
    %9722 = torch.prims.convert_element_type %9721, %int5_12529 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // ---- single_blocks.28: attention ----
    // Operands are the f16 [1,24,4608,128] tensors %9719, %9722 and %9674, all defined
    // earlier in single_blocks.28 (presumably query, key, value -- confirm).
    // The op is emitted as a generic torch.operator. The trailing scalar operands are
    // 0.0, false, none, none; NOTE(review): per the PyTorch signature these would be
    // dropout_p, is_causal, attn_mask, scale -- verify against the PyTorch docs.
    // Only result #0 is consumed in this chunk; result #1 ([1,24,4608] f32) is not.
    %float0.000000e00_12530 = torch.constant.float 0.000000e+00
    %false_12531 = torch.constant.bool false
    %none_12532 = torch.constant.none
    %none_12533 = torch.constant.none
    %9723:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%9719, %9722, %9674, %float0.000000e00_12530, %false_12531, %none_12532, %none_12533) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Permute with order (0,2,1,3): [1,24,4608,128] -> [1,4608,24,128].
    %int0_12534 = torch.constant.int 0
    %int2_12535 = torch.constant.int 2
    %int1_12536 = torch.constant.int 1
    %int3_12537 = torch.constant.int 3
    %9724 = torch.prim.ListConstruct %int0_12534, %int2_12535, %int1_12536, %int3_12537 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9725 = torch.aten.permute %9723#0, %9724 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the last two dims (24 * 128 = 3072): -> [1,4608,3072].
    %int1_12538 = torch.constant.int 1
    %int4608_12539 = torch.constant.int 4608
    %int3072_12540 = torch.constant.int 3072
    %9726 = torch.prim.ListConstruct %int1_12538, %int4608_12539, %int3072_12540 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9727 = torch.aten.view %9725, %9726 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // --- single_blocks.28: GELU branch, concat with attention output, linear2, gated residual ---
    // tanh-approximated GELU on %9667 ([1,4608,12288] f16, defined above the visible range).
    %str_12541 = torch.constant.str "tanh"
    %9728 = torch.aten.gelu %9667, %str_12541 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    // Concatenate attention output (3072) and GELU output (12288) along dim 2 -> 15360 features.
    %9729 = torch.prim.ListConstruct %9727, %9728 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_12542 = torch.constant.int 2
    %9730 = torch.aten.cat %9729, %int2_12542 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Drop the leading unit dim so the projection can be expressed as a 2-D matmul.
    %int4608_12543 = torch.constant.int 4608
    %int15360_12544 = torch.constant.int 15360
    %9731 = torch.prim.ListConstruct %int4608_12543, %int15360_12544 : (!torch.int, !torch.int) -> !torch.list<int>
    %9732 = torch.aten.view %9730, %9731 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // linear2: weight is stored [3072,15360] and transposed to [15360,3072] for x @ W^T.
    %__auto.sampler.single_blocks.28.linear2.weight = util.global.load @__auto.sampler.single_blocks.28.linear2.weight : tensor<3072x15360xf16>
    %9733 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.28.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_12545 = torch.constant.int 0
    %int1_12546 = torch.constant.int 1
    %9734 = torch.aten.transpose.int %9733, %int0_12545, %int1_12546 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.28.linear2.bias = util.global.load @__auto.sampler.single_blocks.28.linear2.bias : tensor<3072xf16>
    %9735 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.28.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Bias, activation and weight are all promoted to f32 (dtype code 6) so the matmul runs in f32.
    %int6_12547 = torch.constant.int 6
    %9736 = torch.prims.convert_element_type %9735, %int6_12547 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_12548 = torch.constant.int 6
    %9737 = torch.prims.convert_element_type %9732, %int6_12548 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_12549 = torch.constant.int 6
    %9738 = torch.prims.convert_element_type %9734, %int6_12549 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %9739 = torch.aten.mm %9737, %9738 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Both multiplications by the scalar 1 are identities.
    // NOTE(review): presumably the alpha/beta factors left over from an addmm decomposition -- confirm.
    %int1_12550 = torch.constant.int 1
    %9740 = torch.aten.mul.Scalar %9739, %int1_12550 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_12551 = torch.constant.int 1
    %9741 = torch.aten.mul.Scalar %9736, %int1_12551 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    // Add the bias (broadcast over the 4608 rows), then cast the result back to f16.
    %int1_12552 = torch.constant.int 1
    %9742 = torch.aten.add.Tensor %9740, %9741, %int1_12552 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_12553 = torch.constant.int 5
    %9743 = torch.prims.convert_element_type %9742, %int5_12553 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    // Restore the leading unit dim.
    %int1_12554 = torch.constant.int 1
    %int4608_12555 = torch.constant.int 4608
    %int3072_12556 = torch.constant.int 3072
    %9744 = torch.prim.ListConstruct %int1_12554, %int4608_12555, %int3072_12556 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9745 = torch.aten.view %9743, %9744 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Scale by %9640 ([1,1,3072], broadcast over the 4608 positions) and add to %9622.
    // NOTE(review): %9640 and %9622 are defined above the visible range; by analogy with
    // single_blocks.29 they are presumably the modulation gate and the block input -- confirm.
    // %9747 is the value that single_blocks.29 consumes below.
    %9746 = torch.aten.mul.Tensor %9640, %9745 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_12557 = torch.constant.int 1
    %9747 = torch.aten.add.Tensor %9622, %9746, %int1_12557 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // --- single_blocks.29: modulation ---
    // SiLU of %119 ([1,3072] f16, defined far above the visible range), then a
    // 3072 -> 9216 linear layer whose output is cut into three 3072-wide pieces.
    // NOTE(review): %119 is presumably the shared conditioning vector -- confirm.
    %9748 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Weight is stored [9216,3072] and transposed to [3072,9216] for x @ W^T.
    %__auto.sampler.single_blocks.29.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.29.modulation.lin.weight : tensor<9216x3072xf16>
    %9749 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.29.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_12558 = torch.constant.int 0
    %int1_12559 = torch.constant.int 1
    %9750 = torch.aten.transpose.int %9749, %int0_12558, %int1_12559 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.29.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.29.modulation.lin.bias : tensor<9216xf16>
    %9751 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.29.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Promote bias, input and weight to f32 (dtype code 6); the matmul runs in f32.
    %int6_12560 = torch.constant.int 6
    %9752 = torch.prims.convert_element_type %9751, %int6_12560 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_12561 = torch.constant.int 6
    %9753 = torch.prims.convert_element_type %9748, %int6_12561 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_12562 = torch.constant.int 6
    %9754 = torch.prims.convert_element_type %9750, %int6_12562 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %9755 = torch.aten.mm %9753, %9754 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Multiplications by the scalar 1 are identities; the bias add follows, then a cast back to f16.
    %int1_12563 = torch.constant.int 1
    %9756 = torch.aten.mul.Scalar %9755, %int1_12563 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_12564 = torch.constant.int 1
    %9757 = torch.aten.mul.Scalar %9752, %int1_12564 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_12565 = torch.constant.int 1
    %9758 = torch.aten.add.Tensor %9756, %9757, %int1_12565 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_12566 = torch.constant.int 5
    %9759 = torch.prims.convert_element_type %9758, %int5_12566 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): shape is unchanged.
    %int0_12567 = torch.constant.int 0
    %int0_12568 = torch.constant.int 0
    %int9223372036854775807_12569 = torch.constant.int 9223372036854775807
    %int1_12570 = torch.constant.int 1
    %9760 = torch.aten.slice.Tensor %9759, %int0_12567, %int0_12568, %int9223372036854775807_12569, %int1_12570 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a unit dim at position 1 so the result broadcasts against [1,4608,3072].
    %int1_12571 = torch.constant.int 1
    %9761 = torch.aten.unsqueeze %9760, %int1_12571 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Full-range slice on dim 2: shape is unchanged.
    %int2_12572 = torch.constant.int 2
    %int0_12573 = torch.constant.int 0
    %int9223372036854775807_12574 = torch.constant.int 9223372036854775807
    %int1_12575 = torch.constant.int 1
    %9762 = torch.aten.slice.Tensor %9761, %int2_12572, %int0_12573, %int9223372036854775807_12574, %int1_12575 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Piece 1, columns [0,3072): %9763, added after normalization further down (shift).
    %int-1_12576 = torch.constant.int -1
    %int0_12577 = torch.constant.int 0
    %int3072_12578 = torch.constant.int 3072
    %int1_12579 = torch.constant.int 1
    %9763 = torch.aten.slice.Tensor %9762, %int-1_12576, %int0_12577, %int3072_12578, %int1_12579 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Piece 2, columns [3072,6144): %9764, turned into the multiplicative scale %9766 below.
    %int-1_12580 = torch.constant.int -1
    %int3072_12581 = torch.constant.int 3072
    %int6144_12582 = torch.constant.int 6144
    %int1_12583 = torch.constant.int 1
    %9764 = torch.aten.slice.Tensor %9762, %int-1_12580, %int3072_12581, %int6144_12582, %int1_12583 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Piece 3, columns [6144,9216): %9765, multiplied into the linear2 output of this block (gate).
    %int-1_12584 = torch.constant.int -1
    %int6144_12585 = torch.constant.int 6144
    %int9216_12586 = torch.constant.int 9216
    %int1_12587 = torch.constant.int 1
    %9765 = torch.aten.slice.Tensor %9762, %int-1_12584, %int6144_12585, %int9216_12586, %int1_12587 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %9766 = %9764 + 1 (the second scalar 1 is the alpha multiplier of add.Scalar).
    %int1_12588 = torch.constant.int 1
    %int1_12589 = torch.constant.int 1
    %9766 = torch.aten.add.Scalar %9764, %int1_12588, %int1_12589 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // --- single_blocks.29: non-affine layer norm over the 3072 features, then modulation ---
    // Statistics are computed on an f32 copy of the block input %9747.
    %int6_12590 = torch.constant.int 6
    %9767 = torch.prims.convert_element_type %9747, %int6_12590 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    // var_mean over dim 2 with correction 0 and keepdim = true.
    // result0 is the variance and result1 the mean, both [1,4608,1].
    %int2_12591 = torch.constant.int 2
    %9768 = torch.prim.ListConstruct %int2_12591 : (!torch.int) -> !torch.list<int>
    %int0_12592 = torch.constant.int 0
    %true_12593 = torch.constant.bool true
    %result0_12594, %result1_12595 = torch.aten.var_mean.correction %9767, %9768, %int0_12592, %true_12593 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // rsqrt(var + eps), with eps printed as 9.9999999999999995E-7 (i.e. 1e-6).
    %float9.999990e-07_12596 = torch.constant.float 9.9999999999999995E-7
    %int1_12597 = torch.constant.int 1
    %9769 = torch.aten.add.Scalar %result0_12594, %float9.999990e-07_12596, %int1_12597 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %9770 = torch.aten.rsqrt %9769 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // (x - mean) * rsqrt(var + eps). Note that the subtraction takes the f16 %9747
    // rather than its f32 copy %9767; the declared result type is f32.
    %int1_12598 = torch.constant.int 1
    %9771 = torch.aten.sub.Tensor %9747, %result1_12595, %int1_12598 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %9772 = torch.aten.mul.Tensor %9771, %9770 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    // Back to f16, then apply the modulation: %9766 * normalized + %9763
    // (both [1,1,3072], broadcast over the 4608 positions).
    %int5_12599 = torch.constant.int 5
    %9773 = torch.prims.convert_element_type %9772, %int5_12599 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %9774 = torch.aten.mul.Tensor %9766, %9773 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_12600 = torch.constant.int 1
    %9775 = torch.aten.add.Tensor %9774, %9763, %int1_12600 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // --- single_blocks.29: linear1 (3072 -> 21504) and split into two branches ---
    // Drop the leading unit dim so the projection can be expressed as a 2-D matmul.
    %int4608_12601 = torch.constant.int 4608
    %int3072_12602 = torch.constant.int 3072
    %9776 = torch.prim.ListConstruct %int4608_12601, %int3072_12602 : (!torch.int, !torch.int) -> !torch.list<int>
    %9777 = torch.aten.view %9775, %9776 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // Weight is stored [21504,3072] and transposed to [3072,21504] for x @ W^T.
    %__auto.sampler.single_blocks.29.linear1.weight = util.global.load @__auto.sampler.single_blocks.29.linear1.weight : tensor<21504x3072xf16>
    %9778 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.29.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_12603 = torch.constant.int 0
    %int1_12604 = torch.constant.int 1
    %9779 = torch.aten.transpose.int %9778, %int0_12603, %int1_12604 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.29.linear1.bias = util.global.load @__auto.sampler.single_blocks.29.linear1.bias : tensor<21504xf16>
    %9780 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.29.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Promote bias, input and weight to f32 (dtype code 6); the matmul runs in f32.
    %int6_12605 = torch.constant.int 6
    %9781 = torch.prims.convert_element_type %9780, %int6_12605 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_12606 = torch.constant.int 6
    %9782 = torch.prims.convert_element_type %9777, %int6_12606 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_12607 = torch.constant.int 6
    %9783 = torch.prims.convert_element_type %9779, %int6_12607 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %9784 = torch.aten.mm %9782, %9783 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Multiplications by the scalar 1 are identities; the bias add follows, then a cast back to f16.
    %int1_12608 = torch.constant.int 1
    %9785 = torch.aten.mul.Scalar %9784, %int1_12608 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_12609 = torch.constant.int 1
    %9786 = torch.aten.mul.Scalar %9781, %int1_12609 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_12610 = torch.constant.int 1
    %9787 = torch.aten.add.Tensor %9785, %9786, %int1_12610 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_12611 = torch.constant.int 5
    %9788 = torch.prims.convert_element_type %9787, %int5_12611 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    // Restore the leading unit dim.
    %int1_12612 = torch.constant.int 1
    %int4608_12613 = torch.constant.int 4608
    %int21504_12614 = torch.constant.int 21504
    %9789 = torch.prim.ListConstruct %int1_12612, %int4608_12613, %int21504_12614 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9790 = torch.aten.view %9788, %9789 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Columns [0,9216): %9791, reshaped into 3 x 24 x 128 for attention below.
    %int-1_12615 = torch.constant.int -1
    %int0_12616 = torch.constant.int 0
    %int9216_12617 = torch.constant.int 9216
    %int1_12618 = torch.constant.int 1
    %9791 = torch.aten.slice.Tensor %9790, %int-1_12615, %int0_12616, %int9216_12617, %int1_12618 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    // Columns [9216,21504): %9792 (12288 wide), fed to the tanh GELU later in this block.
    %int-1_12619 = torch.constant.int -1
    %int9216_12620 = torch.constant.int 9216
    %int21504_12621 = torch.constant.int 21504
    %int1_12622 = torch.constant.int 1
    %9792 = torch.aten.slice.Tensor %9790, %int-1_12619, %int9216_12620, %int21504_12621, %int1_12622 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // --- single_blocks.29: split the 9216-wide slice into three per-head tensors ---
    // [1,4608,9216] -> [1,4608,3,24,128].
    %int1_12623 = torch.constant.int 1
    %int4608_12624 = torch.constant.int 4608
    %int3_12625 = torch.constant.int 3
    %int24_12626 = torch.constant.int 24
    %int128_12627 = torch.constant.int 128
    %9793 = torch.prim.ListConstruct %int1_12623, %int4608_12624, %int3_12625, %int24_12626, %int128_12627 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9794 = torch.aten.view %9791, %9793 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order [2,0,3,1,4] -> [3,1,24,4608,128], moving the size-3 dim to the front.
    %int2_12628 = torch.constant.int 2
    %int0_12629 = torch.constant.int 0
    %int3_12630 = torch.constant.int 3
    %int1_12631 = torch.constant.int 1
    %int4_12632 = torch.constant.int 4
    %9795 = torch.prim.ListConstruct %int2_12628, %int0_12629, %int3_12630, %int1_12631, %int4_12632 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9796 = torch.aten.permute %9794, %9795 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0 on dim 0: %9797, later scaled by norm.query_norm.scale (query).
    %int0_12633 = torch.constant.int 0
    %int0_12634 = torch.constant.int 0
    %9797 = torch.aten.select.int %9796, %int0_12633, %int0_12634 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 1 on dim 0: %9798, later scaled by norm.key_norm.scale (key).
    %int0_12635 = torch.constant.int 0
    %int1_12636 = torch.constant.int 1
    %9798 = torch.aten.select.int %9796, %int0_12635, %int1_12636 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 2 on dim 0: %9799, passed unmodified as the third operand of the attention op (value).
    %int0_12637 = torch.constant.int 0
    %int2_12638 = torch.constant.int 2
    %9799 = torch.aten.select.int %9796, %int0_12637, %int2_12638 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // --- single_blocks.29: RMS normalization of query and key over the 128-wide head dim ---
    // Query: x * rsqrt(mean(x^2, dim=-1, keepdim) + eps), computed in f32.
    %int6_12639 = torch.constant.int 6
    %9800 = torch.prims.convert_element_type %9797, %int6_12639 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_12640 = torch.constant.int 2
    %9801 = torch.aten.pow.Tensor_Scalar %9800, %int2_12640 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_12641 = torch.constant.int -1
    %9802 = torch.prim.ListConstruct %int-1_12641 : (!torch.int) -> !torch.list<int>
    %true_12642 = torch.constant.bool true
    %none_12643 = torch.constant.none
    %9803 = torch.aten.mean.dim %9801, %9802, %true_12642, %none_12643 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // eps is printed as 9.9999999999999995E-7 (i.e. 1e-6).
    %float9.999990e-07_12644 = torch.constant.float 9.9999999999999995E-7
    %int1_12645 = torch.constant.int 1
    %9804 = torch.aten.add.Scalar %9803, %float9.999990e-07_12644, %int1_12645 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %9805 = torch.aten.rsqrt %9804 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %9806 = torch.aten.mul.Tensor %9800, %9805 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    // Cast back to f16 and multiply by the learned per-channel scale ([128]).
    %int5_12646 = torch.constant.int 5
    %9807 = torch.prims.convert_element_type %9806, %int5_12646 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.29.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.29.norm.query_norm.scale : tensor<128xf16>
    %9808 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.29.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %9809 = torch.aten.mul.Tensor %9807, %9808 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Key: the same sequence of ops applied to %9798, with key_norm.scale.
    %int6_12647 = torch.constant.int 6
    %9810 = torch.prims.convert_element_type %9798, %int6_12647 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_12648 = torch.constant.int 2
    %9811 = torch.aten.pow.Tensor_Scalar %9810, %int2_12648 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_12649 = torch.constant.int -1
    %9812 = torch.prim.ListConstruct %int-1_12649 : (!torch.int) -> !torch.list<int>
    %true_12650 = torch.constant.bool true
    %none_12651 = torch.constant.none
    %9813 = torch.aten.mean.dim %9811, %9812, %true_12650, %none_12651 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_12652 = torch.constant.float 9.9999999999999995E-7
    %int1_12653 = torch.constant.int 1
    %9814 = torch.aten.add.Scalar %9813, %float9.999990e-07_12652, %int1_12653 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %9815 = torch.aten.rsqrt %9814 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %9816 = torch.aten.mul.Tensor %9810, %9815 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_12654 = torch.constant.int 5
    %9817 = torch.prims.convert_element_type %9816, %int5_12654 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.29.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.29.norm.key_norm.scale : tensor<128xf16>
    %9818 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.29.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %9819 = torch.aten.mul.Tensor %9817, %9818 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // --- single_blocks.29: combine query/key feature pairs with the table %211 ---
    // NOTE(review): %211 ([1,1,4608,64,2,2] f32) is defined far above the visible range;
    // presumably the rotary position-embedding matrices (one 2x2 per position and pair) -- confirm.
    // For each tensor x (query, key) viewed as 64 pairs, the ops below compute
    //   out[..., :] = %211[..., :, 0] * x[..., 0] + %211[..., :, 1] * x[..., 1]
    // The two casts to dtype code 5 are f16 -> f16, i.e. no-ops on already-f16 inputs.
    %int5_12655 = torch.constant.int 5
    %9820 = torch.prims.convert_element_type %9809, %int5_12655 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_12656 = torch.constant.int 5
    %9821 = torch.prims.convert_element_type %9819, %int5_12656 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Query to f32, viewed as [1,24,4608,64,1,2]: 64 pairs with a broadcastable unit dim.
    %int6_12657 = torch.constant.int 6
    %9822 = torch.prims.convert_element_type %9820, %int6_12657 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_12658 = torch.constant.int 1
    %int24_12659 = torch.constant.int 24
    %int4608_12660 = torch.constant.int 4608
    %int64_12661 = torch.constant.int 64
    %int1_12662 = torch.constant.int 1
    %int2_12663 = torch.constant.int 2
    %9823 = torch.prim.ListConstruct %int1_12658, %int24_12659, %int4608_12660, %int64_12661, %int1_12662, %int2_12663 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9824 = torch.aten.view %9822, %9823 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Key to f32, same view.
    %int6_12664 = torch.constant.int 6
    %9825 = torch.prims.convert_element_type %9821, %int6_12664 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_12665 = torch.constant.int 1
    %int24_12666 = torch.constant.int 24
    %int4608_12667 = torch.constant.int 4608
    %int64_12668 = torch.constant.int 64
    %int1_12669 = torch.constant.int 1
    %int2_12670 = torch.constant.int 2
    %9826 = torch.prim.ListConstruct %int1_12665, %int24_12666, %int4608_12667, %int64_12668, %int1_12669, %int2_12670 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9827 = torch.aten.view %9825, %9826 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Query, first term: %211[..., 0] * q[..., 0] (broadcast over the 24 heads and the pair dim).
    %int5_12671 = torch.constant.int 5
    %int0_12672 = torch.constant.int 0
    %9828 = torch.aten.select.int %211, %int5_12671, %int0_12672 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12673 = torch.constant.int 5
    %int0_12674 = torch.constant.int 0
    %9829 = torch.aten.select.int %9824, %int5_12673, %int0_12674 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9830 = torch.aten.mul.Tensor %9828, %9829 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Query, second term: %211[..., 1] * q[..., 1].
    %int5_12675 = torch.constant.int 5
    %int1_12676 = torch.constant.int 1
    %9831 = torch.aten.select.int %211, %int5_12675, %int1_12676 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12677 = torch.constant.int 5
    %int1_12678 = torch.constant.int 1
    %9832 = torch.aten.select.int %9824, %int5_12677, %int1_12678 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9833 = torch.aten.mul.Tensor %9831, %9832 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Query result: sum of the two terms.
    %int1_12679 = torch.constant.int 1
    %9834 = torch.aten.add.Tensor %9830, %9833, %int1_12679 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key, first term: %211[..., 0] * k[..., 0].
    %int5_12680 = torch.constant.int 5
    %int0_12681 = torch.constant.int 0
    %9835 = torch.aten.select.int %211, %int5_12680, %int0_12681 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12682 = torch.constant.int 5
    %int0_12683 = torch.constant.int 0
    %9836 = torch.aten.select.int %9827, %int5_12682, %int0_12683 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9837 = torch.aten.mul.Tensor %9835, %9836 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key, second term: %211[..., 1] * k[..., 1].
    %int5_12684 = torch.constant.int 5
    %int1_12685 = torch.constant.int 1
    %9838 = torch.aten.select.int %211, %int5_12684, %int1_12685 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12686 = torch.constant.int 5
    %int1_12687 = torch.constant.int 1
    %9839 = torch.aten.select.int %9827, %int5_12686, %int1_12687 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9840 = torch.aten.mul.Tensor %9838, %9839 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key result: sum of the two terms.
    %int1_12688 = torch.constant.int 1
    %9841 = torch.aten.add.Tensor %9837, %9840, %int1_12688 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Flatten [64,2] back to 128 and cast to f16: %9844 from the query, %9847 from the key.
    %int1_12689 = torch.constant.int 1
    %int24_12690 = torch.constant.int 24
    %int4608_12691 = torch.constant.int 4608
    %int128_12692 = torch.constant.int 128
    %9842 = torch.prim.ListConstruct %int1_12689, %int24_12690, %int4608_12691, %int128_12692 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9843 = torch.aten.view %9834, %9842 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_12693 = torch.constant.int 5
    %9844 = torch.prims.convert_element_type %9843, %int5_12693 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_12694 = torch.constant.int 1
    %int24_12695 = torch.constant.int 24
    %int4608_12696 = torch.constant.int 4608
    %int128_12697 = torch.constant.int 128
    %9845 = torch.prim.ListConstruct %int1_12694, %int24_12695, %int4608_12696, %int128_12697 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9846 = torch.aten.view %9841, %9845 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_12698 = torch.constant.int 5
    %9847 = torch.prims.convert_element_type %9846, %int5_12698 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // --- single_blocks.29: scaled dot-product attention and head merge ---
    // Operands: %9844 (query), %9847 (key), %9799 (value), then 0.0, false, none, none;
    // in the ATen schema those positions are (dropout_p, is_causal, attn_mask, scale).
    // Result #0 is the attention output; result #1 ([1,24,4608] f32) is not used in the visible range.
    %float0.000000e00_12699 = torch.constant.float 0.000000e+00
    %false_12700 = torch.constant.bool false
    %none_12701 = torch.constant.none
    %none_12702 = torch.constant.none
    %9848:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%9844, %9847, %9799, %float0.000000e00_12699, %false_12700, %none_12701, %none_12702) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2 ([1,24,4608,128] -> [1,4608,24,128]) ...
    %int0_12703 = torch.constant.int 0
    %int2_12704 = torch.constant.int 2
    %int1_12705 = torch.constant.int 1
    %int3_12706 = torch.constant.int 3
    %9849 = torch.prim.ListConstruct %int0_12703, %int2_12704, %int1_12705, %int3_12706 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9850 = torch.aten.permute %9848#0, %9849 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // ... and merge the 24 x 128 dims into one 3072-wide feature dim.
    %int1_12707 = torch.constant.int 1
    %int4608_12708 = torch.constant.int 4608
    %int3072_12709 = torch.constant.int 3072
    %9851 = torch.prim.ListConstruct %int1_12707, %int4608_12708, %int3072_12709 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9852 = torch.aten.view %9850, %9851 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // --- single_blocks.29: GELU branch, concat with attention output, linear2, gated residual ---
    // tanh-approximated GELU on %9792, the 12288-wide slice of the linear1 output.
    %str_12710 = torch.constant.str "tanh"
    %9853 = torch.aten.gelu %9792, %str_12710 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    // Concatenate attention output (3072) and GELU output (12288) along dim 2 -> 15360 features.
    %9854 = torch.prim.ListConstruct %9852, %9853 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_12711 = torch.constant.int 2
    %9855 = torch.aten.cat %9854, %int2_12711 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Drop the leading unit dim so the projection can be expressed as a 2-D matmul.
    %int4608_12712 = torch.constant.int 4608
    %int15360_12713 = torch.constant.int 15360
    %9856 = torch.prim.ListConstruct %int4608_12712, %int15360_12713 : (!torch.int, !torch.int) -> !torch.list<int>
    %9857 = torch.aten.view %9855, %9856 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // linear2: weight is stored [3072,15360] and transposed to [15360,3072] for x @ W^T.
    %__auto.sampler.single_blocks.29.linear2.weight = util.global.load @__auto.sampler.single_blocks.29.linear2.weight : tensor<3072x15360xf16>
    %9858 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.29.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_12714 = torch.constant.int 0
    %int1_12715 = torch.constant.int 1
    %9859 = torch.aten.transpose.int %9858, %int0_12714, %int1_12715 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.29.linear2.bias = util.global.load @__auto.sampler.single_blocks.29.linear2.bias : tensor<3072xf16>
    %9860 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.29.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Promote bias, input and weight to f32 (dtype code 6); the matmul runs in f32.
    %int6_12716 = torch.constant.int 6
    %9861 = torch.prims.convert_element_type %9860, %int6_12716 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_12717 = torch.constant.int 6
    %9862 = torch.prims.convert_element_type %9857, %int6_12717 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_12718 = torch.constant.int 6
    %9863 = torch.prims.convert_element_type %9859, %int6_12718 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %9864 = torch.aten.mm %9862, %9863 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Multiplications by the scalar 1 are identities; the bias add follows, then a cast back to f16.
    %int1_12719 = torch.constant.int 1
    %9865 = torch.aten.mul.Scalar %9864, %int1_12719 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_12720 = torch.constant.int 1
    %9866 = torch.aten.mul.Scalar %9861, %int1_12720 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_12721 = torch.constant.int 1
    %9867 = torch.aten.add.Tensor %9865, %9866, %int1_12721 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_12722 = torch.constant.int 5
    %9868 = torch.prims.convert_element_type %9867, %int5_12722 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    // Restore the leading unit dim.
    %int1_12723 = torch.constant.int 1
    %int4608_12724 = torch.constant.int 4608
    %int3072_12725 = torch.constant.int 3072
    %9869 = torch.prim.ListConstruct %int1_12723, %int4608_12724, %int3072_12725 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9870 = torch.aten.view %9868, %9869 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Gate the projection with %9765 (third modulation piece, [1,1,3072]) and add it to the
    // block input %9747. %9872 is the output of single_blocks.29.
    %9871 = torch.aten.mul.Tensor %9765, %9870 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_12726 = torch.constant.int 1
    %9872 = torch.aten.add.Tensor %9747, %9871, %int1_12726 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.30 modulation: SiLU on %119 ([1,3072] f16, defined outside
    // this chunk), followed by a linear layer using modulation.lin.weight
    // (9216x3072, transposed) and modulation.lin.bias (9216). Operands are
    // converted to f32 (dtype code 6) for the matmul and bias add, and the
    // [1,9216] result is converted back to f16 (dtype code 5).
    // The result is then unsqueezed to [1,1,9216] and cut into three
    // 3072-wide slices along the last dim:
    //   %9888 = [0, 3072), %9889 = [3072, 6144), %9890 = [6144, 9216)
    // and finally %9891 = %9889 + 1.
    // NOTE(review): the three slices look like the shift / scale / gate
    // modulation terms of the block — confirm against the model source.
    %9873 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.30.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.30.modulation.lin.weight : tensor<9216x3072xf16>
    %9874 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.30.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_12727 = torch.constant.int 0
    %int1_12728 = torch.constant.int 1
    %9875 = torch.aten.transpose.int %9874, %int0_12727, %int1_12728 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.30.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.30.modulation.lin.bias : tensor<9216xf16>
    %9876 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.30.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    %int6_12729 = torch.constant.int 6
    %9877 = torch.prims.convert_element_type %9876, %int6_12729 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_12730 = torch.constant.int 6
    %9878 = torch.prims.convert_element_type %9873, %int6_12730 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_12731 = torch.constant.int 6
    %9879 = torch.prims.convert_element_type %9875, %int6_12731 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %9880 = torch.aten.mm %9878, %9879 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Both mul.Scalar ops multiply by the integer constant 1, so they have no
    // numerical effect. NOTE(review): presumably the alpha/beta factors left
    // over from an addmm decomposition — confirm.
    %int1_12732 = torch.constant.int 1
    %9881 = torch.aten.mul.Scalar %9880, %int1_12732 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_12733 = torch.constant.int 1
    %9882 = torch.aten.mul.Scalar %9877, %int1_12733 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_12734 = torch.constant.int 1
    %9883 = torch.aten.add.Tensor %9881, %9882, %int1_12734 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_12735 = torch.constant.int 5
    %9884 = torch.prims.convert_element_type %9883, %int5_12735 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1); the result
    // type is identical to the input type.
    %int0_12736 = torch.constant.int 0
    %int0_12737 = torch.constant.int 0
    %int9223372036854775807_12738 = torch.constant.int 9223372036854775807
    %int1_12739 = torch.constant.int 1
    %9885 = torch.aten.slice.Tensor %9884, %int0_12736, %int0_12737, %int9223372036854775807_12738, %int1_12739 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    %int1_12740 = torch.constant.int 1
    %9886 = torch.aten.unsqueeze %9885, %int1_12740 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Full-range slice on dim 2; again the type is unchanged.
    %int2_12741 = torch.constant.int 2
    %int0_12742 = torch.constant.int 0
    %int9223372036854775807_12743 = torch.constant.int 9223372036854775807
    %int1_12744 = torch.constant.int 1
    %9887 = torch.aten.slice.Tensor %9886, %int2_12741, %int0_12742, %int9223372036854775807_12743, %int1_12744 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // First 3072-wide slice of the last dim: [0, 3072).
    %int-1_12745 = torch.constant.int -1
    %int0_12746 = torch.constant.int 0
    %int3072_12747 = torch.constant.int 3072
    %int1_12748 = torch.constant.int 1
    %9888 = torch.aten.slice.Tensor %9887, %int-1_12745, %int0_12746, %int3072_12747, %int1_12748 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Second slice: [3072, 6144).
    %int-1_12749 = torch.constant.int -1
    %int3072_12750 = torch.constant.int 3072
    %int6144_12751 = torch.constant.int 6144
    %int1_12752 = torch.constant.int 1
    %9889 = torch.aten.slice.Tensor %9887, %int-1_12749, %int3072_12750, %int6144_12751, %int1_12752 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Third slice: [6144, 9216).
    %int-1_12753 = torch.constant.int -1
    %int6144_12754 = torch.constant.int 6144
    %int9216_12755 = torch.constant.int 9216
    %int1_12756 = torch.constant.int 1
    %9890 = torch.aten.slice.Tensor %9887, %int-1_12753, %int6144_12754, %int9216_12755, %int1_12756 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %9891 = %9889 + 1 (scalar 1, alpha 1).
    %int1_12757 = torch.constant.int 1
    %int1_12758 = torch.constant.int 1
    %9891 = torch.aten.add.Scalar %9889, %int1_12757, %int1_12758 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %9872 ([1,4608,3072] f16) over its last dim (index 2) in f32:
    // var_mean with correction 0 and keepdim = true yields variance
    // (%result0_12763) and mean (%result1_12764), then
    //   (x - mean) * rsqrt(var + 9.9999999999999995E-7)
    // The normalized tensor is converted to f16, multiplied by %9891 and
    // offset by %9888 (both [1,1,3072], broadcast over dim 1).
    %int6_12759 = torch.constant.int 6
    %9892 = torch.prims.convert_element_type %9872, %int6_12759 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_12760 = torch.constant.int 2
    %9893 = torch.prim.ListConstruct %int2_12760 : (!torch.int) -> !torch.list<int>
    %int0_12761 = torch.constant.int 0
    %true_12762 = torch.constant.bool true
    %result0_12763, %result1_12764 = torch.aten.var_mean.correction %9892, %9893, %int0_12761, %true_12762 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    %float9.999990e-07_12765 = torch.constant.float 9.9999999999999995E-7
    %int1_12766 = torch.constant.int 1
    %9894 = torch.aten.add.Scalar %result0_12763, %float9.999990e-07_12765, %int1_12766 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %9895 = torch.aten.rsqrt %9894 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the f16 tensor %9872 (not its f32 copy %9892)
    // together with the f32 mean; the declared result type is f32.
    %int1_12767 = torch.constant.int 1
    %9896 = torch.aten.sub.Tensor %9872, %result1_12764, %int1_12767 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %9897 = torch.aten.mul.Tensor %9896, %9895 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_12768 = torch.constant.int 5
    %9898 = torch.prims.convert_element_type %9897, %int5_12768 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // %9900 = %9891 * normalized + %9888.
    %9899 = torch.aten.mul.Tensor %9891, %9898 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_12769 = torch.constant.int 1
    %9900 = torch.aten.add.Tensor %9899, %9888, %int1_12769 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.30 linear1: view %9900 as [4608,3072], then compute
    // x @ W^T + b in f32, where W is linear1.weight (21504x3072) and b is
    // linear1.bias (21504). The result is converted to f16 and viewed as
    // [1,4608,21504].
    %int4608_12770 = torch.constant.int 4608
    %int3072_12771 = torch.constant.int 3072
    %9901 = torch.prim.ListConstruct %int4608_12770, %int3072_12771 : (!torch.int, !torch.int) -> !torch.list<int>
    %9902 = torch.aten.view %9900, %9901 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    %__auto.sampler.single_blocks.30.linear1.weight = util.global.load @__auto.sampler.single_blocks.30.linear1.weight : tensor<21504x3072xf16>
    %9903 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.30.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_12772 = torch.constant.int 0
    %int1_12773 = torch.constant.int 1
    %9904 = torch.aten.transpose.int %9903, %int0_12772, %int1_12773 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.30.linear1.bias = util.global.load @__auto.sampler.single_blocks.30.linear1.bias : tensor<21504xf16>
    %9905 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.30.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Bias, activation and transposed weight are all converted to f32
    // (dtype code 6) before the matmul.
    %int6_12774 = torch.constant.int 6
    %9906 = torch.prims.convert_element_type %9905, %int6_12774 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_12775 = torch.constant.int 6
    %9907 = torch.prims.convert_element_type %9902, %int6_12775 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_12776 = torch.constant.int 6
    %9908 = torch.prims.convert_element_type %9904, %int6_12776 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %9909 = torch.aten.mm %9907, %9908 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Multiplications by the integer constant 1: no numerical effect.
    %int1_12777 = torch.constant.int 1
    %9910 = torch.aten.mul.Scalar %9909, %int1_12777 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_12778 = torch.constant.int 1
    %9911 = torch.aten.mul.Scalar %9906, %int1_12778 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_12779 = torch.constant.int 1
    %9912 = torch.aten.add.Tensor %9910, %9911, %int1_12779 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_12780 = torch.constant.int 5
    %9913 = torch.prims.convert_element_type %9912, %int5_12780 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_12781 = torch.constant.int 1
    %int4608_12782 = torch.constant.int 4608
    %int21504_12783 = torch.constant.int 21504
    %9914 = torch.prim.ListConstruct %int1_12781, %int4608_12782, %int21504_12783 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9915 = torch.aten.view %9913, %9914 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the linear1 output %9915 along the last dim:
    //   %9916 = columns [0, 9216), %9917 = columns [9216, 21504) (12288 wide).
    // %9916 is viewed as [1,4608,3,24,128] and permuted with order
    // (2,0,3,1,4) to [3,1,24,4608,128]; selecting index 0 / 1 / 2 on dim 0
    // yields %9922 / %9923 / %9924, each [1,24,4608,128].
    // NOTE(review): %9922 and %9923 are later scaled by query_norm / key_norm
    // and %9924 is the third attention operand, so these appear to be q, k, v
    // with 24 heads of width 128 — confirm against the model source.
    %int-1_12784 = torch.constant.int -1
    %int0_12785 = torch.constant.int 0
    %int9216_12786 = torch.constant.int 9216
    %int1_12787 = torch.constant.int 1
    %9916 = torch.aten.slice.Tensor %9915, %int-1_12784, %int0_12785, %int9216_12786, %int1_12787 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_12788 = torch.constant.int -1
    %int9216_12789 = torch.constant.int 9216
    %int21504_12790 = torch.constant.int 21504
    %int1_12791 = torch.constant.int 1
    %9917 = torch.aten.slice.Tensor %9915, %int-1_12788, %int9216_12789, %int21504_12790, %int1_12791 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // 9216 = 3 * 24 * 128.
    %int1_12792 = torch.constant.int 1
    %int4608_12793 = torch.constant.int 4608
    %int3_12794 = torch.constant.int 3
    %int24_12795 = torch.constant.int 24
    %int128_12796 = torch.constant.int 128
    %9918 = torch.prim.ListConstruct %int1_12792, %int4608_12793, %int3_12794, %int24_12795, %int128_12796 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9919 = torch.aten.view %9916, %9918 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    %int2_12797 = torch.constant.int 2
    %int0_12798 = torch.constant.int 0
    %int3_12799 = torch.constant.int 3
    %int1_12800 = torch.constant.int 1
    %int4_12801 = torch.constant.int 4
    %9920 = torch.prim.ListConstruct %int2_12797, %int0_12798, %int3_12799, %int1_12800, %int4_12801 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9921 = torch.aten.permute %9919, %9920 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    %int0_12802 = torch.constant.int 0
    %int0_12803 = torch.constant.int 0
    %9922 = torch.aten.select.int %9921, %int0_12802, %int0_12803 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_12804 = torch.constant.int 0
    %int1_12805 = torch.constant.int 1
    %9923 = torch.aten.select.int %9921, %int0_12804, %int1_12805 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_12806 = torch.constant.int 0
    %int2_12807 = torch.constant.int 2
    %9924 = torch.aten.select.int %9921, %int0_12806, %int2_12807 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Root-mean-square normalization of %9922 over the last dim (size 128),
    // computed in f32:
    //   x * rsqrt(mean(x^2, dim = -1, keepdim = true) + 9.9999999999999995E-7)
    // The result is converted to f16 and multiplied elementwise by the
    // 128-element norm.query_norm.scale parameter.
    %int6_12808 = torch.constant.int 6
    %9925 = torch.prims.convert_element_type %9922, %int6_12808 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_12809 = torch.constant.int 2
    %9926 = torch.aten.pow.Tensor_Scalar %9925, %int2_12809 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_12810 = torch.constant.int -1
    %9927 = torch.prim.ListConstruct %int-1_12810 : (!torch.int) -> !torch.list<int>
    %true_12811 = torch.constant.bool true
    %none_12812 = torch.constant.none
    %9928 = torch.aten.mean.dim %9926, %9927, %true_12811, %none_12812 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_12813 = torch.constant.float 9.9999999999999995E-7
    %int1_12814 = torch.constant.int 1
    %9929 = torch.aten.add.Scalar %9928, %float9.999990e-07_12813, %int1_12814 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %9930 = torch.aten.rsqrt %9929 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %9931 = torch.aten.mul.Tensor %9925, %9930 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_12815 = torch.constant.int 5
    %9932 = torch.prims.convert_element_type %9931, %int5_12815 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.30.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.30.norm.query_norm.scale : tensor<128xf16>
    %9933 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.30.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %9934 = torch.aten.mul.Tensor %9932, %9933 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Same normalization applied to %9923, scaled by the 128-element
    // norm.key_norm.scale parameter.
    %int6_12816 = torch.constant.int 6
    %9935 = torch.prims.convert_element_type %9923, %int6_12816 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_12817 = torch.constant.int 2
    %9936 = torch.aten.pow.Tensor_Scalar %9935, %int2_12817 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_12818 = torch.constant.int -1
    %9937 = torch.prim.ListConstruct %int-1_12818 : (!torch.int) -> !torch.list<int>
    %true_12819 = torch.constant.bool true
    %none_12820 = torch.constant.none
    %9938 = torch.aten.mean.dim %9936, %9937, %true_12819, %none_12820 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_12821 = torch.constant.float 9.9999999999999995E-7
    %int1_12822 = torch.constant.int 1
    %9939 = torch.aten.add.Scalar %9938, %float9.999990e-07_12821, %int1_12822 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %9940 = torch.aten.rsqrt %9939 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %9941 = torch.aten.mul.Tensor %9935, %9940 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_12823 = torch.constant.int 5
    %9942 = torch.prims.convert_element_type %9941, %int5_12823 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.30.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.30.norm.key_norm.scale : tensor<128xf16>
    %9943 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.30.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %9944 = torch.aten.mul.Tensor %9942, %9943 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Combine the normalized tensors %9934 and %9944 with %211
    // ([1,1,4608,64,2,2] f32, defined outside this chunk).
    // Each input is converted to f32 and viewed as [1,24,4608,64,1,2]; then
    //   out = %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]
    // (selects on dim 5, broadcast to [1,24,4608,64,2]). The result is viewed
    // back to [1,24,4608,128] and converted to f16.
    // NOTE(review): this has the shape of a rotary position embedding applied
    // to q and k, with %211 holding per-position 2x2 rotation entries —
    // confirm against the model source.
    //
    // The first two conversions use dtype code 5 on f16 inputs, so the
    // element type is unchanged.
    %int5_12824 = torch.constant.int 5
    %9945 = torch.prims.convert_element_type %9934, %int5_12824 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_12825 = torch.constant.int 5
    %9946 = torch.prims.convert_element_type %9944, %int5_12825 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int6_12826 = torch.constant.int 6
    %9947 = torch.prims.convert_element_type %9945, %int6_12826 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_12827 = torch.constant.int 1
    %int24_12828 = torch.constant.int 24
    %int4608_12829 = torch.constant.int 4608
    %int64_12830 = torch.constant.int 64
    %int1_12831 = torch.constant.int 1
    %int2_12832 = torch.constant.int 2
    %9948 = torch.prim.ListConstruct %int1_12827, %int24_12828, %int4608_12829, %int64_12830, %int1_12831, %int2_12832 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9949 = torch.aten.view %9947, %9948 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    %int6_12833 = torch.constant.int 6
    %9950 = torch.prims.convert_element_type %9946, %int6_12833 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_12834 = torch.constant.int 1
    %int24_12835 = torch.constant.int 24
    %int4608_12836 = torch.constant.int 4608
    %int64_12837 = torch.constant.int 64
    %int1_12838 = torch.constant.int 1
    %int2_12839 = torch.constant.int 2
    %9951 = torch.prim.ListConstruct %int1_12834, %int24_12835, %int4608_12836, %int64_12837, %int1_12838, %int2_12839 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9952 = torch.aten.view %9950, %9951 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // First input (%9949): %9959 = %211[...,0] * x[...,0] + %211[...,1] * x[...,1].
    %int5_12840 = torch.constant.int 5
    %int0_12841 = torch.constant.int 0
    %9953 = torch.aten.select.int %211, %int5_12840, %int0_12841 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12842 = torch.constant.int 5
    %int0_12843 = torch.constant.int 0
    %9954 = torch.aten.select.int %9949, %int5_12842, %int0_12843 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9955 = torch.aten.mul.Tensor %9953, %9954 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_12844 = torch.constant.int 5
    %int1_12845 = torch.constant.int 1
    %9956 = torch.aten.select.int %211, %int5_12844, %int1_12845 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12846 = torch.constant.int 5
    %int1_12847 = torch.constant.int 1
    %9957 = torch.aten.select.int %9949, %int5_12846, %int1_12847 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9958 = torch.aten.mul.Tensor %9956, %9957 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_12848 = torch.constant.int 1
    %9959 = torch.aten.add.Tensor %9955, %9958, %int1_12848 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second input (%9952): same combination, producing %9966.
    %int5_12849 = torch.constant.int 5
    %int0_12850 = torch.constant.int 0
    %9960 = torch.aten.select.int %211, %int5_12849, %int0_12850 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12851 = torch.constant.int 5
    %int0_12852 = torch.constant.int 0
    %9961 = torch.aten.select.int %9952, %int5_12851, %int0_12852 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9962 = torch.aten.mul.Tensor %9960, %9961 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_12853 = torch.constant.int 5
    %int1_12854 = torch.constant.int 1
    %9963 = torch.aten.select.int %211, %int5_12853, %int1_12854 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_12855 = torch.constant.int 5
    %int1_12856 = torch.constant.int 1
    %9964 = torch.aten.select.int %9952, %int5_12855, %int1_12856 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %9965 = torch.aten.mul.Tensor %9963, %9964 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_12857 = torch.constant.int 1
    %9966 = torch.aten.add.Tensor %9962, %9965, %int1_12857 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Merge the trailing [64,2] dims back into 128 and convert to f16.
    %int1_12858 = torch.constant.int 1
    %int24_12859 = torch.constant.int 24
    %int4608_12860 = torch.constant.int 4608
    %int128_12861 = torch.constant.int 128
    %9967 = torch.prim.ListConstruct %int1_12858, %int24_12859, %int4608_12860, %int128_12861 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9968 = torch.aten.view %9959, %9967 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_12862 = torch.constant.int 5
    %9969 = torch.prims.convert_element_type %9968, %int5_12862 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_12863 = torch.constant.int 1
    %int24_12864 = torch.constant.int 24
    %int4608_12865 = torch.constant.int 4608
    %int128_12866 = torch.constant.int 128
    %9970 = torch.prim.ListConstruct %int1_12863, %int24_12864, %int4608_12865, %int128_12866 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9971 = torch.aten.view %9966, %9970 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_12867 = torch.constant.int 5
    %9972 = torch.prims.convert_element_type %9971, %int5_12867 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled dot-product attention over (%9969, %9972, %9924), each
    // [1,24,4608,128] f16, with trailing arguments 0.0, false, none, none.
    // NOTE(review): presumably dropout_p = 0.0, is_causal = false,
    // attn_mask = None, scale = None per the aten signature — confirm.
    // Result #0 is permuted with order (0,2,1,3) to [1,4608,24,128] and viewed
    // as [1,4608,3072] (24 * 128 = 3072).
    %float0.000000e00_12868 = torch.constant.float 0.000000e+00
    %false_12869 = torch.constant.bool false
    %none_12870 = torch.constant.none
    %none_12871 = torch.constant.none
    %9973:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%9969, %9972, %9924, %float0.000000e00_12868, %false_12869, %none_12870, %none_12871) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    %int0_12872 = torch.constant.int 0
    %int2_12873 = torch.constant.int 2
    %int1_12874 = torch.constant.int 1
    %int3_12875 = torch.constant.int 3
    %9974 = torch.prim.ListConstruct %int0_12872, %int2_12873, %int1_12874, %int3_12875 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9975 = torch.aten.permute %9973#0, %9974 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_12876 = torch.constant.int 1
    %int4608_12877 = torch.constant.int 4608
    %int3072_12878 = torch.constant.int 3072
    %9976 = torch.prim.ListConstruct %int1_12876, %int4608_12877, %int3072_12878 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9977 = torch.aten.view %9975, %9976 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Apply GELU with the "tanh" approximation to %9917 ([1,4608,12288]),
    // concatenate it after the attention output %9977 along dim 2 to get
    // [1,4608,15360] (3072 + 12288), then view the result as [4608,15360].
    %str_12879 = torch.constant.str "tanh"
    %9978 = torch.aten.gelu %9917, %str_12879 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %9979 = torch.prim.ListConstruct %9977, %9978 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_12880 = torch.constant.int 2
    %9980 = torch.aten.cat %9979, %int2_12880 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    %int4608_12881 = torch.constant.int 4608
    %int15360_12882 = torch.constant.int 15360
    %9981 = torch.prim.ListConstruct %int4608_12881, %int15360_12882 : (!torch.int, !torch.int) -> !torch.list<int>
    %9982 = torch.aten.view %9980, %9981 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // single_blocks.30 linear2: compute x @ W^T + b in f32, where x is %9982
    // ([4608,15360]), W is linear2.weight (3072x15360) and b is linear2.bias
    // (3072). The result is converted to f16 and viewed as [1,4608,3072].
    // It is then multiplied by %9890 ([1,1,3072], broadcast over dim 1) and
    // added to %9872, producing %9997.
    // NOTE(review): this reads as a gated residual update of the block input —
    // confirm against the model source.
    %__auto.sampler.single_blocks.30.linear2.weight = util.global.load @__auto.sampler.single_blocks.30.linear2.weight : tensor<3072x15360xf16>
    %9983 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.30.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_12883 = torch.constant.int 0
    %int1_12884 = torch.constant.int 1
    %9984 = torch.aten.transpose.int %9983, %int0_12883, %int1_12884 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.30.linear2.bias = util.global.load @__auto.sampler.single_blocks.30.linear2.bias : tensor<3072xf16>
    %9985 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.30.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Bias, activation and transposed weight are converted to f32 (dtype
    // code 6) before the matmul.
    %int6_12885 = torch.constant.int 6
    %9986 = torch.prims.convert_element_type %9985, %int6_12885 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_12886 = torch.constant.int 6
    %9987 = torch.prims.convert_element_type %9982, %int6_12886 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_12887 = torch.constant.int 6
    %9988 = torch.prims.convert_element_type %9984, %int6_12887 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %9989 = torch.aten.mm %9987, %9988 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Multiplications by the integer constant 1: no numerical effect.
    %int1_12888 = torch.constant.int 1
    %9990 = torch.aten.mul.Scalar %9989, %int1_12888 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_12889 = torch.constant.int 1
    %9991 = torch.aten.mul.Scalar %9986, %int1_12889 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_12890 = torch.constant.int 1
    %9992 = torch.aten.add.Tensor %9990, %9991, %int1_12890 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_12891 = torch.constant.int 5
    %9993 = torch.prims.convert_element_type %9992, %int5_12891 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_12892 = torch.constant.int 1
    %int4608_12893 = torch.constant.int 4608
    %int3072_12894 = torch.constant.int 3072
    %9994 = torch.prim.ListConstruct %int1_12892, %int4608_12893, %int3072_12894 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %9995 = torch.aten.view %9993, %9994 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // %9997 = %9872 + %9890 * linear2_output.
    %9996 = torch.aten.mul.Tensor %9890, %9995 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_12895 = torch.constant.int 1
    %9997 = torch.aten.add.Tensor %9872, %9996, %int1_12895 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %9998 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.31.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.31.modulation.lin.weight : tensor<9216x3072xf16>
    %9999 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.31.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_12896 = torch.constant.int 0
    %int1_12897 = torch.constant.int 1
    %10000 = torch.aten.transpose.int %9999, %int0_12896, %int1_12897 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.31.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.31.modulation.lin.bias : tensor<9216xf16>
    %10001 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.31.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    %int6_12898 = torch.constant.int 6
    %10002 = torch.prims.convert_element_type %10001, %int6_12898 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_12899 = torch.constant.int 6
    %10003 = torch.prims.convert_element_type %9998, %int6_12899 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_12900 = torch.constant.int 6
    %10004 = torch.prims.convert_element_type %10000, %int6_12900 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %10005 = torch.aten.mm %10003, %10004 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    %int1_12901 = torch.constant.int 1
    %10006 = torch.aten.mul.Scalar %10005, %int1_12901 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_12902 = torch.constant.int 1
    %10007 = torch.aten.mul.Scalar %10002, %int1_12902 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_12903 = torch.constant.int 1
    // Add the [9216] f32 tensor %10007 to the [1,9216] f32 tensor %10006 (alpha = 1), then cast the
    // sum to f16 (dtype code 5). Both operands are defined above this chunk.
    // NOTE(review): presumably the bias add of the single_blocks.31 modulation projection, by analogy
    // with the single_blocks.32.modulation.lin sequence further down -- confirm against preceding lines.
    %10008 = torch.aten.add.Tensor %10006, %10007, %int1_12903 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_12904 = torch.constant.int 5
    %10009 = torch.prims.convert_element_type %10008, %int5_12904 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Slice dim 0 from 0 to INT64_MAX with step 1; the result type is still [1,9216].
    %int0_12905 = torch.constant.int 0
    %int0_12906 = torch.constant.int 0
    %int9223372036854775807_12907 = torch.constant.int 9223372036854775807
    %int1_12908 = torch.constant.int 1
    %10010 = torch.aten.slice.Tensor %10009, %int0_12905, %int0_12906, %int9223372036854775807_12907, %int1_12908 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a unit axis at dim 1 -> [1,1,9216].
    %int1_12909 = torch.constant.int 1
    %10011 = torch.aten.unsqueeze %10010, %int1_12909 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Slice dim 2 from 0 to INT64_MAX with step 1; the result type is still [1,1,9216].
    %int2_12910 = torch.constant.int 2
    %int0_12911 = torch.constant.int 0
    %int9223372036854775807_12912 = torch.constant.int 9223372036854775807
    %int1_12913 = torch.constant.int 1
    %10012 = torch.aten.slice.Tensor %10011, %int2_12910, %int0_12911, %int9223372036854775807_12912, %int1_12913 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Cut the last axis (dim -1) of %10012 into three 3072-wide chunks.
    // %10013 = elements [0, 3072); later used as the additive term that produces %10025.
    %int-1_12914 = torch.constant.int -1
    %int0_12915 = torch.constant.int 0
    %int3072_12916 = torch.constant.int 3072
    %int1_12917 = torch.constant.int 1
    %10013 = torch.aten.slice.Tensor %10012, %int-1_12914, %int0_12915, %int3072_12916, %int1_12917 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %10014 = elements [3072, 6144); incremented by 1 below and then used as a multiplier.
    %int-1_12918 = torch.constant.int -1
    %int3072_12919 = torch.constant.int 3072
    %int6144_12920 = torch.constant.int 6144
    %int1_12921 = torch.constant.int 1
    %10014 = torch.aten.slice.Tensor %10012, %int-1_12918, %int3072_12919, %int6144_12920, %int1_12921 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %10015 = elements [6144, 9216); later multiplies the linear2 output (%10120) before the residual add.
    %int-1_12922 = torch.constant.int -1
    %int6144_12923 = torch.constant.int 6144
    %int9216_12924 = torch.constant.int 9216
    %int1_12925 = torch.constant.int 1
    %10015 = torch.aten.slice.Tensor %10012, %int-1_12922, %int6144_12923, %int9216_12924, %int1_12925 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %10016 = %10014 + 1 (scalar add with alpha = 1).
    %int1_12926 = torch.constant.int 1
    %int1_12927 = torch.constant.int 1
    %10016 = torch.aten.add.Scalar %10014, %int1_12926, %int1_12927 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %9997 ([1,4608,3072], defined above this chunk) over dim 2, computing in f32:
    //   (x - mean) * rsqrt(var + eps), eps ~= 1e-6.
    // No weight or bias tensor is loaded for this normalization.
    // Cast the f16 input to f32 (dtype code 6) for the statistics.
    %int6_12928 = torch.constant.int 6
    %10017 = torch.prims.convert_element_type %9997, %int6_12928 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    // var_mean over dims [2] with correction = 0 and keepdim = true. Both results are [1,4608,1].
    // %result0_12932 is fed to the eps add and rsqrt below; %result1_12933 is subtracted from the input.
    %int2_12929 = torch.constant.int 2
    %10018 = torch.prim.ListConstruct %int2_12929 : (!torch.int) -> !torch.list<int>
    %int0_12930 = torch.constant.int 0
    %true_12931 = torch.constant.bool true
    %result0_12932, %result1_12933 = torch.aten.var_mean.correction %10017, %10018, %int0_12930, %true_12931 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    %float9.999990e-07_12934 = torch.constant.float 9.9999999999999995E-7
    %int1_12935 = torch.constant.int 1
    %10019 = torch.aten.add.Scalar %result0_12932, %float9.999990e-07_12934, %int1_12935 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %10020 = torch.aten.rsqrt %10019 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the f16 %9997 (not the f32 copy %10017); its declared result type is f32.
    %int1_12936 = torch.constant.int 1
    %10021 = torch.aten.sub.Tensor %9997, %result1_12933, %int1_12936 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %10022 = torch.aten.mul.Tensor %10021, %10020 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    // Cast the normalized tensor back to f16 (dtype code 5).
    %int5_12937 = torch.constant.int 5
    %10023 = torch.prims.convert_element_type %10022, %int5_12937 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Modulate: %10025 = %10016 * normalized + %10013, with the [1,1,3072] operands broadcast over dim 1.
    %10024 = torch.aten.mul.Tensor %10016, %10023 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_12938 = torch.constant.int 1
    %10025 = torch.aten.add.Tensor %10024, %10013, %int1_12938 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.31.linear1: project %10025 from 3072 to 21504 features.
    // The matmul and bias add run in f32; the result is cast back to f16.
    // View the [1,4608,3072] input as a 2-D [4608,3072] matrix.
    %int4608_12939 = torch.constant.int 4608
    %int3072_12940 = torch.constant.int 3072
    %10026 = torch.prim.ListConstruct %int4608_12939, %int3072_12940 : (!torch.int, !torch.int) -> !torch.list<int>
    %10027 = torch.aten.view %10025, %10026 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // Load the [21504,3072] weight and transpose dims 0 and 1 -> [3072,21504].
    %__auto.sampler.single_blocks.31.linear1.weight = util.global.load @__auto.sampler.single_blocks.31.linear1.weight : tensor<21504x3072xf16>
    %10028 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.31.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_12941 = torch.constant.int 0
    %int1_12942 = torch.constant.int 1
    %10029 = torch.aten.transpose.int %10028, %int0_12941, %int1_12942 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    // Load the [21504] bias.
    %__auto.sampler.single_blocks.31.linear1.bias = util.global.load @__auto.sampler.single_blocks.31.linear1.bias : tensor<21504xf16>
    %10030 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.31.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Cast bias, input and transposed weight to f32 (dtype code 6).
    %int6_12943 = torch.constant.int 6
    %10031 = torch.prims.convert_element_type %10030, %int6_12943 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_12944 = torch.constant.int 6
    %10032 = torch.prims.convert_element_type %10027, %int6_12944 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_12945 = torch.constant.int 6
    %10033 = torch.prims.convert_element_type %10029, %int6_12945 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    // [4608,3072] x [3072,21504] -> [4608,21504].
    %10034 = torch.aten.mm %10032, %10033 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Both the product and the bias are multiplied by the integer constant 1 before being added.
    // NOTE(review): looks like the alpha = beta = 1 factors of a decomposed addmm -- confirm with the exporter.
    %int1_12946 = torch.constant.int 1
    %10035 = torch.aten.mul.Scalar %10034, %int1_12946 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_12947 = torch.constant.int 1
    %10036 = torch.aten.mul.Scalar %10031, %int1_12947 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_12948 = torch.constant.int 1
    %10037 = torch.aten.add.Tensor %10035, %10036, %int1_12948 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    // Cast back to f16 (dtype code 5) and restore the leading unit axis -> [1,4608,21504].
    %int5_12949 = torch.constant.int 5
    %10038 = torch.prims.convert_element_type %10037, %int5_12949 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_12950 = torch.constant.int 1
    %int4608_12951 = torch.constant.int 4608
    %int21504_12952 = torch.constant.int 21504
    %10039 = torch.prim.ListConstruct %int1_12950, %int4608_12951, %int21504_12952 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10040 = torch.aten.view %10038, %10039 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the linear1 output %10040 along its last axis.
    // %10041 = elements [0, 9216); reshaped into three per-head tensors below.
    %int-1_12953 = torch.constant.int -1
    %int0_12954 = torch.constant.int 0
    %int9216_12955 = torch.constant.int 9216
    %int1_12956 = torch.constant.int 1
    %10041 = torch.aten.slice.Tensor %10040, %int-1_12953, %int0_12954, %int9216_12955, %int1_12956 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    // %10042 = elements [9216, 21504), 12288 wide; later the input of the gelu op.
    %int-1_12957 = torch.constant.int -1
    %int9216_12958 = torch.constant.int 9216
    %int21504_12959 = torch.constant.int 21504
    %int1_12960 = torch.constant.int 1
    %10042 = torch.aten.slice.Tensor %10040, %int-1_12957, %int9216_12958, %int21504_12959, %int1_12960 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View the 9216 features as [3, 24, 128] -> [1,4608,3,24,128].
    %int1_12961 = torch.constant.int 1
    %int4608_12962 = torch.constant.int 4608
    %int3_12963 = torch.constant.int 3
    %int24_12964 = torch.constant.int 24
    %int128_12965 = torch.constant.int 128
    %10043 = torch.prim.ListConstruct %int1_12961, %int4608_12962, %int3_12963, %int24_12964, %int128_12965 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10044 = torch.aten.view %10041, %10043 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2, 0, 3, 1, 4) -> [3,1,24,4608,128], moving the size-3 axis to the front.
    %int2_12966 = torch.constant.int 2
    %int0_12967 = torch.constant.int 0
    %int3_12968 = torch.constant.int 3
    %int1_12969 = torch.constant.int 1
    %int4_12970 = torch.constant.int 4
    %10045 = torch.prim.ListConstruct %int2_12966, %int0_12967, %int3_12968, %int1_12969, %int4_12970 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10046 = torch.aten.permute %10044, %10045 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Select indices 0, 1 and 2 along dim 0; each result is [1,24,4608,128].
    // %10047 (index 0) is later multiplied by norm.query_norm.scale.
    %int0_12971 = torch.constant.int 0
    %int0_12972 = torch.constant.int 0
    %10047 = torch.aten.select.int %10046, %int0_12971, %int0_12972 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // %10048 (index 1) is later multiplied by norm.key_norm.scale.
    %int0_12973 = torch.constant.int 0
    %int1_12974 = torch.constant.int 1
    %10048 = torch.aten.select.int %10046, %int0_12973, %int1_12974 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // %10049 (index 2) is passed unmodified as the third tensor operand of the attention op.
    %int0_12975 = torch.constant.int 0
    %int2_12976 = torch.constant.int 2
    %10049 = torch.aten.select.int %10046, %int0_12975, %int2_12976 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Root-mean-square normalization of %10047 and %10048 over the last axis (size 128), in f32:
    //   y = x * rsqrt(mean(x^2, dim=-1, keepdim=true) + eps), eps ~= 1e-6,
    // followed by a cast to f16 and a multiply by a [128] scale vector. No mean is subtracted.
    // --- %10047 with norm.query_norm.scale -> %10059 ---
    %int6_12977 = torch.constant.int 6
    %10050 = torch.prims.convert_element_type %10047, %int6_12977 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_12978 = torch.constant.int 2
    %10051 = torch.aten.pow.Tensor_Scalar %10050, %int2_12978 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean of the squares over dim -1, keepdim = true, dtype = none -> [1,24,4608,1].
    %int-1_12979 = torch.constant.int -1
    %10052 = torch.prim.ListConstruct %int-1_12979 : (!torch.int) -> !torch.list<int>
    %true_12980 = torch.constant.bool true
    %none_12981 = torch.constant.none
    %10053 = torch.aten.mean.dim %10051, %10052, %true_12980, %none_12981 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_12982 = torch.constant.float 9.9999999999999995E-7
    %int1_12983 = torch.constant.int 1
    %10054 = torch.aten.add.Scalar %10053, %float9.999990e-07_12982, %int1_12983 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %10055 = torch.aten.rsqrt %10054 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %10056 = torch.aten.mul.Tensor %10050, %10055 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    // Cast to f16 (dtype code 5), then apply the [128] scale broadcast over the leading axes.
    %int5_12984 = torch.constant.int 5
    %10057 = torch.prims.convert_element_type %10056, %int5_12984 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.31.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.31.norm.query_norm.scale : tensor<128xf16>
    %10058 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.31.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %10059 = torch.aten.mul.Tensor %10057, %10058 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // --- %10048 with norm.key_norm.scale -> %10069 (same op sequence as above) ---
    %int6_12985 = torch.constant.int 6
    %10060 = torch.prims.convert_element_type %10048, %int6_12985 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_12986 = torch.constant.int 2
    %10061 = torch.aten.pow.Tensor_Scalar %10060, %int2_12986 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_12987 = torch.constant.int -1
    %10062 = torch.prim.ListConstruct %int-1_12987 : (!torch.int) -> !torch.list<int>
    %true_12988 = torch.constant.bool true
    %none_12989 = torch.constant.none
    %10063 = torch.aten.mean.dim %10061, %10062, %true_12988, %none_12989 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_12990 = torch.constant.float 9.9999999999999995E-7
    %int1_12991 = torch.constant.int 1
    %10064 = torch.aten.add.Scalar %10063, %float9.999990e-07_12990, %int1_12991 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %10065 = torch.aten.rsqrt %10064 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %10066 = torch.aten.mul.Tensor %10060, %10065 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_12992 = torch.constant.int 5
    %10067 = torch.prims.convert_element_type %10066, %int5_12992 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.31.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.31.norm.key_norm.scale : tensor<128xf16>
    %10068 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.31.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %10069 = torch.aten.mul.Tensor %10067, %10068 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Mix adjacent feature pairs of %10059 and %10069 using the table %211
    // ([1,1,4608,64,2,2] f32, defined above this chunk):
    //   out[..., j] = %211[..., j, 0] * x[..., 0] + %211[..., j, 1] * x[..., 1]
    // where x is the input viewed as [1,24,4608,64,1,2]. The table is broadcast across the size-24 axis.
    // NOTE(review): presumably %211 holds rotary position-embedding 2x2 matrices -- confirm at its definition.
    // These two casts request dtype code 5 (f16) on tensors that are already f16; the types are unchanged.
    %int5_12993 = torch.constant.int 5
    %10070 = torch.prims.convert_element_type %10059, %int5_12993 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_12994 = torch.constant.int 5
    %10071 = torch.prims.convert_element_type %10069, %int5_12994 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Cast %10070 to f32 and view the 128 features as [64, 1, 2] -> %10074.
    %int6_12995 = torch.constant.int 6
    %10072 = torch.prims.convert_element_type %10070, %int6_12995 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_12996 = torch.constant.int 1
    %int24_12997 = torch.constant.int 24
    %int4608_12998 = torch.constant.int 4608
    %int64_12999 = torch.constant.int 64
    %int1_13000 = torch.constant.int 1
    %int2_13001 = torch.constant.int 2
    %10073 = torch.prim.ListConstruct %int1_12996, %int24_12997, %int4608_12998, %int64_12999, %int1_13000, %int2_13001 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10074 = torch.aten.view %10072, %10073 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Cast %10071 to f32 and view it the same way -> %10077.
    %int6_13002 = torch.constant.int 6
    %10075 = torch.prims.convert_element_type %10071, %int6_13002 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_13003 = torch.constant.int 1
    %int24_13004 = torch.constant.int 24
    %int4608_13005 = torch.constant.int 4608
    %int64_13006 = torch.constant.int 64
    %int1_13007 = torch.constant.int 1
    %int2_13008 = torch.constant.int 2
    %10076 = torch.prim.ListConstruct %int1_13003, %int24_13004, %int4608_13005, %int64_13006, %int1_13007, %int2_13008 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10077 = torch.aten.view %10075, %10076 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // --- %10074: first term, index 0 of dim 5 of both the table and the input ---
    %int5_13009 = torch.constant.int 5
    %int0_13010 = torch.constant.int 0
    %10078 = torch.aten.select.int %211, %int5_13009, %int0_13010 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13011 = torch.constant.int 5
    %int0_13012 = torch.constant.int 0
    %10079 = torch.aten.select.int %10074, %int5_13011, %int0_13012 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10080 = torch.aten.mul.Tensor %10078, %10079 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Second term, index 1 of dim 5 of both the table and the input.
    %int5_13013 = torch.constant.int 5
    %int1_13014 = torch.constant.int 1
    %10081 = torch.aten.select.int %211, %int5_13013, %int1_13014 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13015 = torch.constant.int 5
    %int1_13016 = torch.constant.int 1
    %10082 = torch.aten.select.int %10074, %int5_13015, %int1_13016 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10083 = torch.aten.mul.Tensor %10081, %10082 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Sum of the two terms -> %10084 [1,24,4608,64,2].
    %int1_13017 = torch.constant.int 1
    %10084 = torch.aten.add.Tensor %10080, %10083, %int1_13017 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // --- %10077: the same two-term computation -> %10091 ---
    %int5_13018 = torch.constant.int 5
    %int0_13019 = torch.constant.int 0
    %10085 = torch.aten.select.int %211, %int5_13018, %int0_13019 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13020 = torch.constant.int 5
    %int0_13021 = torch.constant.int 0
    %10086 = torch.aten.select.int %10077, %int5_13020, %int0_13021 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10087 = torch.aten.mul.Tensor %10085, %10086 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_13022 = torch.constant.int 5
    %int1_13023 = torch.constant.int 1
    %10088 = torch.aten.select.int %211, %int5_13022, %int1_13023 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13024 = torch.constant.int 5
    %int1_13025 = torch.constant.int 1
    %10089 = torch.aten.select.int %10077, %int5_13024, %int1_13025 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10090 = torch.aten.mul.Tensor %10088, %10089 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_13026 = torch.constant.int 1
    %10091 = torch.aten.add.Tensor %10087, %10090, %int1_13026 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Flatten [64,2] back to 128 features and cast to f16: %10084 -> %10094.
    %int1_13027 = torch.constant.int 1
    %int24_13028 = torch.constant.int 24
    %int4608_13029 = torch.constant.int 4608
    %int128_13030 = torch.constant.int 128
    %10092 = torch.prim.ListConstruct %int1_13027, %int24_13028, %int4608_13029, %int128_13030 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10093 = torch.aten.view %10084, %10092 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13031 = torch.constant.int 5
    %10094 = torch.prims.convert_element_type %10093, %int5_13031 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Same flatten and cast for %10091 -> %10097.
    %int1_13032 = torch.constant.int 1
    %int24_13033 = torch.constant.int 24
    %int4608_13034 = torch.constant.int 4608
    %int128_13035 = torch.constant.int 128
    %10095 = torch.prim.ListConstruct %int1_13032, %int24_13033, %int4608_13034, %int128_13035 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10096 = torch.aten.view %10091, %10095 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13036 = torch.constant.int 5
    %10097 = torch.prims.convert_element_type %10096, %int5_13036 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Attention over the three [1,24,4608,128] f16 tensors %10094, %10097 and %10049.
    // The remaining operands are 0.0, false, none, none.
    // NOTE(review): per the PyTorch op schema these should be dropout_p, is_causal, attn_mask and scale
    // (scale = none meaning the op's default) -- confirm against the schema.
    // The op yields two results; only result #0 ([1,24,4608,128] f16) is used in this section.
    %float0.000000e00_13037 = torch.constant.float 0.000000e+00
    %false_13038 = torch.constant.bool false
    %none_13039 = torch.constant.none
    %none_13040 = torch.constant.none
    %10098:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%10094, %10097, %10049, %float0.000000e00_13037, %false_13038, %none_13039, %none_13040) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Permute with order (0, 2, 1, 3): [1,24,4608,128] -> [1,4608,24,128].
    %int0_13041 = torch.constant.int 0
    %int2_13042 = torch.constant.int 2
    %int1_13043 = torch.constant.int 1
    %int3_13044 = torch.constant.int 3
    %10099 = torch.prim.ListConstruct %int0_13041, %int2_13042, %int1_13043, %int3_13044 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10100 = torch.aten.permute %10098#0, %10099 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the [24,128] axes into 3072 features -> [1,4608,3072].
    %int1_13045 = torch.constant.int 1
    %int4608_13046 = torch.constant.int 4608
    %int3072_13047 = torch.constant.int 3072
    %10101 = torch.prim.ListConstruct %int1_13045, %int4608_13046, %int3072_13047 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10102 = torch.aten.view %10100, %10101 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Apply gelu with the "tanh" approximation to the 12288-wide slice %10042.
    %str_13048 = torch.constant.str "tanh"
    %10103 = torch.aten.gelu %10042, %str_13048 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    // Concatenate the attention output %10102 (3072) and the gelu output (12288) along dim 2 -> [1,4608,15360].
    %10104 = torch.prim.ListConstruct %10102, %10103 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_13049 = torch.constant.int 2
    %10105 = torch.aten.cat %10104, %int2_13049 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // single_blocks.31.linear2: project 15360 -> 3072 features. View the input as [4608,15360] first.
    %int4608_13050 = torch.constant.int 4608
    %int15360_13051 = torch.constant.int 15360
    %10106 = torch.prim.ListConstruct %int4608_13050, %int15360_13051 : (!torch.int, !torch.int) -> !torch.list<int>
    %10107 = torch.aten.view %10105, %10106 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // Load the [3072,15360] weight and transpose dims 0 and 1 -> [15360,3072]; load the [3072] bias.
    %__auto.sampler.single_blocks.31.linear2.weight = util.global.load @__auto.sampler.single_blocks.31.linear2.weight : tensor<3072x15360xf16>
    %10108 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.31.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_13052 = torch.constant.int 0
    %int1_13053 = torch.constant.int 1
    %10109 = torch.aten.transpose.int %10108, %int0_13052, %int1_13053 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.31.linear2.bias = util.global.load @__auto.sampler.single_blocks.31.linear2.bias : tensor<3072xf16>
    %10110 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.31.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Cast bias, input and transposed weight to f32 (dtype code 6), then multiply: [4608,15360] x [15360,3072].
    %int6_13054 = torch.constant.int 6
    %10111 = torch.prims.convert_element_type %10110, %int6_13054 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_13055 = torch.constant.int 6
    %10112 = torch.prims.convert_element_type %10107, %int6_13055 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_13056 = torch.constant.int 6
    %10113 = torch.prims.convert_element_type %10109, %int6_13056 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %10114 = torch.aten.mm %10112, %10113 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // The product and the bias are each multiplied by the integer constant 1, then added.
    %int1_13057 = torch.constant.int 1
    %10115 = torch.aten.mul.Scalar %10114, %int1_13057 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_13058 = torch.constant.int 1
    %10116 = torch.aten.mul.Scalar %10111, %int1_13058 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_13059 = torch.constant.int 1
    %10117 = torch.aten.add.Tensor %10115, %10116, %int1_13059 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    // Cast back to f16 (dtype code 5) and restore the leading unit axis -> [1,4608,3072].
    %int5_13060 = torch.constant.int 5
    %10118 = torch.prims.convert_element_type %10117, %int5_13060 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_13061 = torch.constant.int 1
    %int4608_13062 = torch.constant.int 4608
    %int3072_13063 = torch.constant.int 3072
    %10119 = torch.prim.ListConstruct %int1_13061, %int4608_13062, %int3072_13063 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10120 = torch.aten.view %10118, %10119 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Scale by the third modulation chunk %10015 ([1,1,3072], broadcast over dim 1), then add the result
    // to %9997, the same tensor that was normalized at the start of this sequence -> %10122.
    %10121 = torch.aten.mul.Tensor %10015, %10120 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_13064 = torch.constant.int 1
    %10122 = torch.aten.add.Tensor %9997, %10121, %int1_13064 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.32 modulation: silu(%119) projected from 3072 to 9216 features, then cut into three
    // 3072-wide chunks. %119 is a [1,3072] f16 tensor defined above this chunk.
    %10123 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the [9216,3072] weight and transpose dims 0 and 1 -> [3072,9216]; load the [9216] bias.
    %__auto.sampler.single_blocks.32.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.32.modulation.lin.weight : tensor<9216x3072xf16>
    %10124 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.32.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_13065 = torch.constant.int 0
    %int1_13066 = torch.constant.int 1
    %10125 = torch.aten.transpose.int %10124, %int0_13065, %int1_13066 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.32.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.32.modulation.lin.bias : tensor<9216xf16>
    %10126 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.32.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Cast bias, input and transposed weight to f32 (dtype code 6), then multiply: [1,3072] x [3072,9216].
    %int6_13067 = torch.constant.int 6
    %10127 = torch.prims.convert_element_type %10126, %int6_13067 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_13068 = torch.constant.int 6
    %10128 = torch.prims.convert_element_type %10123, %int6_13068 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_13069 = torch.constant.int 6
    %10129 = torch.prims.convert_element_type %10125, %int6_13069 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %10130 = torch.aten.mm %10128, %10129 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // The product and the bias are each multiplied by the integer constant 1, then added.
    %int1_13070 = torch.constant.int 1
    %10131 = torch.aten.mul.Scalar %10130, %int1_13070 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_13071 = torch.constant.int 1
    %10132 = torch.aten.mul.Scalar %10127, %int1_13071 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_13072 = torch.constant.int 1
    %10133 = torch.aten.add.Tensor %10131, %10132, %int1_13072 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    // Cast back to f16 (dtype code 5).
    %int5_13073 = torch.constant.int 5
    %10134 = torch.prims.convert_element_type %10133, %int5_13073 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Slice dim 0 from 0 to INT64_MAX with step 1; the result type is still [1,9216].
    %int0_13074 = torch.constant.int 0
    %int0_13075 = torch.constant.int 0
    %int9223372036854775807_13076 = torch.constant.int 9223372036854775807
    %int1_13077 = torch.constant.int 1
    %10135 = torch.aten.slice.Tensor %10134, %int0_13074, %int0_13075, %int9223372036854775807_13076, %int1_13077 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a unit axis at dim 1 -> [1,1,9216].
    %int1_13078 = torch.constant.int 1
    %10136 = torch.aten.unsqueeze %10135, %int1_13078 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Slice dim 2 from 0 to INT64_MAX with step 1; the result type is still [1,1,9216].
    %int2_13079 = torch.constant.int 2
    %int0_13080 = torch.constant.int 0
    %int9223372036854775807_13081 = torch.constant.int 9223372036854775807
    %int1_13082 = torch.constant.int 1
    %10137 = torch.aten.slice.Tensor %10136, %int2_13079, %int0_13080, %int9223372036854775807_13081, %int1_13082 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // %10138 = elements [0, 3072) of the last axis; later used as the additive term that produces %10150.
    %int-1_13083 = torch.constant.int -1
    %int0_13084 = torch.constant.int 0
    %int3072_13085 = torch.constant.int 3072
    %int1_13086 = torch.constant.int 1
    %10138 = torch.aten.slice.Tensor %10137, %int-1_13083, %int0_13084, %int3072_13085, %int1_13086 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %10139 = elements [3072, 6144); incremented by 1 below and then used as a multiplier.
    %int-1_13087 = torch.constant.int -1
    %int3072_13088 = torch.constant.int 3072
    %int6144_13089 = torch.constant.int 6144
    %int1_13090 = torch.constant.int 1
    %10139 = torch.aten.slice.Tensor %10137, %int-1_13087, %int3072_13088, %int6144_13089, %int1_13090 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %10140 = elements [6144, 9216); not used within the visible part of this function.
    // NOTE(review): presumably multiplies the single_blocks.32 linear2 output, mirroring %10015 -- confirm below.
    %int-1_13091 = torch.constant.int -1
    %int6144_13092 = torch.constant.int 6144
    %int9216_13093 = torch.constant.int 9216
    %int1_13094 = torch.constant.int 1
    %10140 = torch.aten.slice.Tensor %10137, %int-1_13091, %int6144_13092, %int9216_13093, %int1_13094 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %10141 = %10139 + 1 (scalar add with alpha = 1).
    %int1_13095 = torch.constant.int 1
    %int1_13096 = torch.constant.int 1
    %10141 = torch.aten.add.Scalar %10139, %int1_13095, %int1_13096 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %10122 ([1,4608,3072] f16) over dim 2, computing in f32:
    //   (x - mean) * rsqrt(var + eps), eps ~= 1e-6.
    // No weight or bias tensor is loaded for this normalization.
    // Cast the f16 input to f32 (dtype code 6) for the statistics.
    %int6_13097 = torch.constant.int 6
    %10142 = torch.prims.convert_element_type %10122, %int6_13097 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    // var_mean over dims [2] with correction = 0 and keepdim = true. Both results are [1,4608,1].
    // %result0_13101 is fed to the eps add and rsqrt below; %result1_13102 is subtracted from the input.
    %int2_13098 = torch.constant.int 2
    %10143 = torch.prim.ListConstruct %int2_13098 : (!torch.int) -> !torch.list<int>
    %int0_13099 = torch.constant.int 0
    %true_13100 = torch.constant.bool true
    %result0_13101, %result1_13102 = torch.aten.var_mean.correction %10142, %10143, %int0_13099, %true_13100 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    %float9.999990e-07_13103 = torch.constant.float 9.9999999999999995E-7
    %int1_13104 = torch.constant.int 1
    %10144 = torch.aten.add.Scalar %result0_13101, %float9.999990e-07_13103, %int1_13104 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %10145 = torch.aten.rsqrt %10144 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the f16 %10122 (not the f32 copy %10142); its declared result type is f32.
    %int1_13105 = torch.constant.int 1
    %10146 = torch.aten.sub.Tensor %10122, %result1_13102, %int1_13105 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %10147 = torch.aten.mul.Tensor %10146, %10145 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    // Cast the normalized tensor back to f16 (dtype code 5).
    %int5_13106 = torch.constant.int 5
    %10148 = torch.prims.convert_element_type %10147, %int5_13106 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Modulate: %10150 = %10141 * normalized + %10138, with the [1,1,3072] operands broadcast over dim 1.
    %10149 = torch.aten.mul.Tensor %10141, %10148 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_13107 = torch.constant.int 1
    %10150 = torch.aten.add.Tensor %10149, %10138, %int1_13107 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int4608_13108 = torch.constant.int 4608
    %int3072_13109 = torch.constant.int 3072
    %10151 = torch.prim.ListConstruct %int4608_13108, %int3072_13109 : (!torch.int, !torch.int) -> !torch.list<int>
    %10152 = torch.aten.view %10150, %10151 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // single_blocks.32.linear1: %10152 [4608,3072] times the transposed weight
    // [3072,21504] plus the bias, computed in f32 and cast back to f16.
    %__auto.sampler.single_blocks.32.linear1.weight = util.global.load @__auto.sampler.single_blocks.32.linear1.weight : tensor<21504x3072xf16>
    %10153 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.32.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_13110 = torch.constant.int 0
    %int1_13111 = torch.constant.int 1
    // Swap dims 0 and 1 so the weight can be the right-hand operand of the mm.
    %10154 = torch.aten.transpose.int %10153, %int0_13110, %int1_13111 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.32.linear1.bias = util.global.load @__auto.sampler.single_blocks.32.linear1.bias : tensor<21504xf16>
    %10155 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.32.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Bias, activation and weight are each converted f16 -> f32 (dtype code 6) before the matmul.
    %int6_13112 = torch.constant.int 6
    %10156 = torch.prims.convert_element_type %10155, %int6_13112 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_13113 = torch.constant.int 6
    %10157 = torch.prims.convert_element_type %10152, %int6_13113 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_13114 = torch.constant.int 6
    %10158 = torch.prims.convert_element_type %10154, %int6_13114 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %10159 = torch.aten.mm %10157, %10158 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Both the product and the bias are multiplied by the integer constant 1
    // (presumably the alpha/beta factors left over from an addmm decomposition).
    %int1_13115 = torch.constant.int 1
    %10160 = torch.aten.mul.Scalar %10159, %int1_13115 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_13116 = torch.constant.int 1
    %10161 = torch.aten.mul.Scalar %10156, %int1_13116 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_13117 = torch.constant.int 1
    // Add the [21504] bias to every row of the [4608,21504] product.
    %10162 = torch.aten.add.Tensor %10160, %10161, %int1_13117 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    // Cast the f32 result back to f16 (dtype code 5).
    %int5_13118 = torch.constant.int 5
    %10163 = torch.prims.convert_element_type %10162, %int5_13118 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    // Restore the leading unit dimension: [4608,21504] -> [1,4608,21504].
    %int1_13119 = torch.constant.int 1
    %int4608_13120 = torch.constant.int 4608
    %int21504_13121 = torch.constant.int 21504
    %10164 = torch.prim.ListConstruct %int1_13119, %int4608_13120, %int21504_13121 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10165 = torch.aten.view %10163, %10164 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the linear1 output along the last dim: columns [0, 9216) feed the
    // q/k/v reshape below, columns [9216, 21504) feed the GELU branch.
    %int-1_13122 = torch.constant.int -1
    %int0_13123 = torch.constant.int 0
    %int9216_13124 = torch.constant.int 9216
    %int1_13125 = torch.constant.int 1
    %10166 = torch.aten.slice.Tensor %10165, %int-1_13122, %int0_13123, %int9216_13124, %int1_13125 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_13126 = torch.constant.int -1
    %int9216_13127 = torch.constant.int 9216
    %int21504_13128 = torch.constant.int 21504
    %int1_13129 = torch.constant.int 1
    %10167 = torch.aten.slice.Tensor %10165, %int-1_13126, %int9216_13127, %int21504_13128, %int1_13129 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View the 9216 columns as 3 x 24 x 128 (9216 = 3 * 24 * 128).
    %int1_13130 = torch.constant.int 1
    %int4608_13131 = torch.constant.int 4608
    %int3_13132 = torch.constant.int 3
    %int24_13133 = torch.constant.int 24
    %int128_13134 = torch.constant.int 128
    %10168 = torch.prim.ListConstruct %int1_13130, %int4608_13131, %int3_13132, %int24_13133, %int128_13134 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10169 = torch.aten.view %10166, %10168 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2,0,3,1,4): [1,4608,3,24,128] -> [3,1,24,4608,128],
    // moving the size-3 axis to the front.
    %int2_13135 = torch.constant.int 2
    %int0_13136 = torch.constant.int 0
    %int3_13137 = torch.constant.int 3
    %int1_13138 = torch.constant.int 1
    %int4_13139 = torch.constant.int 4
    %10170 = torch.prim.ListConstruct %int2_13135, %int0_13136, %int3_13137, %int1_13138, %int4_13139 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10171 = torch.aten.permute %10169, %10170 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0 of the leading axis; normalized with query_norm.scale further down.
    %int0_13140 = torch.constant.int 0
    %int0_13141 = torch.constant.int 0
    %10172 = torch.aten.select.int %10171, %int0_13140, %int0_13141 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 1 of the leading axis; normalized with key_norm.scale further down.
    %int0_13142 = torch.constant.int 0
    %int1_13143 = torch.constant.int 1
    %10173 = torch.aten.select.int %10171, %int0_13142, %int1_13143 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 2 of the leading axis; passed unmodified as the third attention operand.
    %int0_13144 = torch.constant.int 0
    %int2_13145 = torch.constant.int 2
    %10174 = torch.aten.select.int %10171, %int0_13144, %int2_13145 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Query normalization: x * rsqrt(mean(x^2, dim=-1) + 1e-6) computed in f32,
    // cast to f16, then multiplied by the learned per-channel scale.
    %int6_13146 = torch.constant.int 6
    %10175 = torch.prims.convert_element_type %10172, %int6_13146 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_13147 = torch.constant.int 2
    %10176 = torch.aten.pow.Tensor_Scalar %10175, %int2_13147 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean of the squares over the last dim with keepdim=true -> [1,24,4608,1].
    %int-1_13148 = torch.constant.int -1
    %10177 = torch.prim.ListConstruct %int-1_13148 : (!torch.int) -> !torch.list<int>
    %true_13149 = torch.constant.bool true
    %none_13150 = torch.constant.none
    %10178 = torch.aten.mean.dim %10176, %10177, %true_13149, %none_13150 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // Epsilon is added before the reciprocal square root.
    %float9.999990e-07_13151 = torch.constant.float 9.9999999999999995E-7
    %int1_13152 = torch.constant.int 1
    %10179 = torch.aten.add.Scalar %10178, %float9.999990e-07_13151, %int1_13152 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %10180 = torch.aten.rsqrt %10179 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %10181 = torch.aten.mul.Tensor %10175, %10180 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13153 = torch.constant.int 5
    %10182 = torch.prims.convert_element_type %10181, %int5_13153 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // The [128] scale applies along the last dim; this multiply runs in f16.
    %__auto.sampler.single_blocks.32.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.32.norm.query_norm.scale : tensor<128xf16>
    %10183 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.32.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %10184 = torch.aten.mul.Tensor %10182, %10183 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Key normalization: same sequence of ops as the query normalization, applied
    // to %10173 and scaled by key_norm.scale.
    %int6_13154 = torch.constant.int 6
    %10185 = torch.prims.convert_element_type %10173, %int6_13154 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_13155 = torch.constant.int 2
    %10186 = torch.aten.pow.Tensor_Scalar %10185, %int2_13155 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean of the squares over the last dim with keepdim=true -> [1,24,4608,1].
    %int-1_13156 = torch.constant.int -1
    %10187 = torch.prim.ListConstruct %int-1_13156 : (!torch.int) -> !torch.list<int>
    %true_13157 = torch.constant.bool true
    %none_13158 = torch.constant.none
    %10188 = torch.aten.mean.dim %10186, %10187, %true_13157, %none_13158 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // Epsilon is added before the reciprocal square root.
    %float9.999990e-07_13159 = torch.constant.float 9.9999999999999995E-7
    %int1_13160 = torch.constant.int 1
    %10189 = torch.aten.add.Scalar %10188, %float9.999990e-07_13159, %int1_13160 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %10190 = torch.aten.rsqrt %10189 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %10191 = torch.aten.mul.Tensor %10185, %10190 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13161 = torch.constant.int 5
    %10192 = torch.prims.convert_element_type %10191, %int5_13161 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // The [128] scale applies along the last dim; this multiply runs in f16.
    %__auto.sampler.single_blocks.32.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.32.norm.key_norm.scale : tensor<128xf16>
    %10193 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.32.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %10194 = torch.aten.mul.Tensor %10192, %10193 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Combine the normalized q (%10184) and k (%10194) with the table %211
    // ([1,1,4608,64,2,2], defined before this chunk). For each of q and k:
    //   out = %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]
    // where x is the input viewed as [1,24,4608,64,1,2].
    // NOTE(review): %211 is presumably the rotary position-embedding table - confirm at its definition.
    //
    // These two conversions have f16 as both source and result type.
    %int5_13162 = torch.constant.int 5
    %10195 = torch.prims.convert_element_type %10184, %int5_13162 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_13163 = torch.constant.int 5
    %10196 = torch.prims.convert_element_type %10194, %int5_13163 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // q: cast to f32 and view the last dim of 128 as 64 x 1 x 2.
    %int6_13164 = torch.constant.int 6
    %10197 = torch.prims.convert_element_type %10195, %int6_13164 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_13165 = torch.constant.int 1
    %int24_13166 = torch.constant.int 24
    %int4608_13167 = torch.constant.int 4608
    %int64_13168 = torch.constant.int 64
    %int1_13169 = torch.constant.int 1
    %int2_13170 = torch.constant.int 2
    %10198 = torch.prim.ListConstruct %int1_13165, %int24_13166, %int4608_13167, %int64_13168, %int1_13169, %int2_13170 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10199 = torch.aten.view %10197, %10198 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // k: cast to f32 and apply the same view.
    %int6_13171 = torch.constant.int 6
    %10200 = torch.prims.convert_element_type %10196, %int6_13171 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_13172 = torch.constant.int 1
    %int24_13173 = torch.constant.int 24
    %int4608_13174 = torch.constant.int 4608
    %int64_13175 = torch.constant.int 64
    %int1_13176 = torch.constant.int 1
    %int2_13177 = torch.constant.int 2
    %10201 = torch.prim.ListConstruct %int1_13172, %int24_13173, %int4608_13174, %int64_13175, %int1_13176, %int2_13177 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10202 = torch.aten.view %10200, %10201 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // q, first term: index 0 on dim 5 of both %211 and q, multiplied with broadcasting.
    %int5_13178 = torch.constant.int 5
    %int0_13179 = torch.constant.int 0
    %10203 = torch.aten.select.int %211, %int5_13178, %int0_13179 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13180 = torch.constant.int 5
    %int0_13181 = torch.constant.int 0
    %10204 = torch.aten.select.int %10199, %int5_13180, %int0_13181 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10205 = torch.aten.mul.Tensor %10203, %10204 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // q, second term: index 1 on dim 5 of both operands.
    %int5_13182 = torch.constant.int 5
    %int1_13183 = torch.constant.int 1
    %10206 = torch.aten.select.int %211, %int5_13182, %int1_13183 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13184 = torch.constant.int 5
    %int1_13185 = torch.constant.int 1
    %10207 = torch.aten.select.int %10199, %int5_13184, %int1_13185 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10208 = torch.aten.mul.Tensor %10206, %10207 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // q: sum of the two terms.
    %int1_13186 = torch.constant.int 1
    %10209 = torch.aten.add.Tensor %10205, %10208, %int1_13186 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k, first term: index 0 on dim 5 of both %211 and k.
    %int5_13187 = torch.constant.int 5
    %int0_13188 = torch.constant.int 0
    %10210 = torch.aten.select.int %211, %int5_13187, %int0_13188 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13189 = torch.constant.int 5
    %int0_13190 = torch.constant.int 0
    %10211 = torch.aten.select.int %10202, %int5_13189, %int0_13190 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10212 = torch.aten.mul.Tensor %10210, %10211 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k, second term: index 1 on dim 5 of both operands.
    %int5_13191 = torch.constant.int 5
    %int1_13192 = torch.constant.int 1
    %10213 = torch.aten.select.int %211, %int5_13191, %int1_13192 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13193 = torch.constant.int 5
    %int1_13194 = torch.constant.int 1
    %10214 = torch.aten.select.int %10202, %int5_13193, %int1_13194 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10215 = torch.aten.mul.Tensor %10213, %10214 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    // k: sum of the two terms.
    %int1_13195 = torch.constant.int 1
    %10216 = torch.aten.add.Tensor %10212, %10215, %int1_13195 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // q: flatten [64,2] back to 128 and cast to f16.
    %int1_13196 = torch.constant.int 1
    %int24_13197 = torch.constant.int 24
    %int4608_13198 = torch.constant.int 4608
    %int128_13199 = torch.constant.int 128
    %10217 = torch.prim.ListConstruct %int1_13196, %int24_13197, %int4608_13198, %int128_13199 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10218 = torch.aten.view %10209, %10217 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13200 = torch.constant.int 5
    %10219 = torch.prims.convert_element_type %10218, %int5_13200 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // k: flatten [64,2] back to 128 and cast to f16.
    %int1_13201 = torch.constant.int 1
    %int24_13202 = torch.constant.int 24
    %int4608_13203 = torch.constant.int 4608
    %int128_13204 = torch.constant.int 128
    %10220 = torch.prim.ListConstruct %int1_13201, %int24_13202, %int4608_13203, %int128_13204 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10221 = torch.aten.view %10216, %10220 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13205 = torch.constant.int 5
    %10222 = torch.prims.convert_element_type %10221, %int5_13205 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Attention over q (%10219), k (%10222) and v (%10174). The trailing operands
    // are the float 0.0, the bool false and two `none` values; per the PyTorch op
    // schema these are dropout_p, is_causal, attn_mask and scale.
    %float0.000000e00_13206 = torch.constant.float 0.000000e+00
    %false_13207 = torch.constant.bool false
    %none_13208 = torch.constant.none
    %none_13209 = torch.constant.none
    // Two results are produced; only result #0 ([1,24,4608,128]) is used in the ops visible here.
    %10223:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%10219, %10222, %10174, %float0.000000e00_13206, %false_13207, %none_13208, %none_13209) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Permute with order (0,2,1,3): [1,24,4608,128] -> [1,4608,24,128].
    %int0_13210 = torch.constant.int 0
    %int2_13211 = torch.constant.int 2
    %int1_13212 = torch.constant.int 1
    %int3_13213 = torch.constant.int 3
    %10224 = torch.prim.ListConstruct %int0_13210, %int2_13211, %int1_13212, %int3_13213 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10225 = torch.aten.permute %10223#0, %10224 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // Merge the last two dims: 24 * 128 = 3072.
    %int1_13214 = torch.constant.int 1
    %int4608_13215 = torch.constant.int 4608
    %int3072_13216 = torch.constant.int 3072
    %10226 = torch.prim.ListConstruct %int1_13214, %int4608_13215, %int3072_13216 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10227 = torch.aten.view %10225, %10226 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // GELU with the "tanh" approximation on the 12288-wide slice of the linear1 output.
    %str_13217 = torch.constant.str "tanh"
    %10228 = torch.aten.gelu %10167, %str_13217 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    // Concatenate attention output and GELU output along dim 2: 3072 + 12288 = 15360.
    %10229 = torch.prim.ListConstruct %10227, %10228 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_13218 = torch.constant.int 2
    %10230 = torch.aten.cat %10229, %int2_13218 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // single_blocks.32.linear2: project the concatenated [1,4608,15360] tensor to
    // 3072 features (f32 matmul plus bias, cast to f16), multiply by %10140 and
    // add to %10122.
    // Drop the leading unit dim so the input is a 2-D matmul operand.
    %int4608_13219 = torch.constant.int 4608
    %int15360_13220 = torch.constant.int 15360
    %10231 = torch.prim.ListConstruct %int4608_13219, %int15360_13220 : (!torch.int, !torch.int) -> !torch.list<int>
    %10232 = torch.aten.view %10230, %10231 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    %__auto.sampler.single_blocks.32.linear2.weight = util.global.load @__auto.sampler.single_blocks.32.linear2.weight : tensor<3072x15360xf16>
    %10233 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.32.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_13221 = torch.constant.int 0
    %int1_13222 = torch.constant.int 1
    // Swap dims 0 and 1 so the weight can be the right-hand operand of the mm.
    %10234 = torch.aten.transpose.int %10233, %int0_13221, %int1_13222 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.32.linear2.bias = util.global.load @__auto.sampler.single_blocks.32.linear2.bias : tensor<3072xf16>
    %10235 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.32.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Bias, activation and weight are each converted f16 -> f32 (dtype code 6) before the matmul.
    %int6_13223 = torch.constant.int 6
    %10236 = torch.prims.convert_element_type %10235, %int6_13223 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_13224 = torch.constant.int 6
    %10237 = torch.prims.convert_element_type %10232, %int6_13224 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_13225 = torch.constant.int 6
    %10238 = torch.prims.convert_element_type %10234, %int6_13225 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %10239 = torch.aten.mm %10237, %10238 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Both the product and the bias are multiplied by the integer constant 1
    // (presumably the alpha/beta factors left over from an addmm decomposition).
    %int1_13226 = torch.constant.int 1
    %10240 = torch.aten.mul.Scalar %10239, %int1_13226 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_13227 = torch.constant.int 1
    %10241 = torch.aten.mul.Scalar %10236, %int1_13227 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_13228 = torch.constant.int 1
    %10242 = torch.aten.add.Tensor %10240, %10241, %int1_13228 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    // Cast the f32 result back to f16 (dtype code 5).
    %int5_13229 = torch.constant.int 5
    %10243 = torch.prims.convert_element_type %10242, %int5_13229 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    // Restore the leading unit dimension: [4608,3072] -> [1,4608,3072].
    %int1_13230 = torch.constant.int 1
    %int4608_13231 = torch.constant.int 4608
    %int3072_13232 = torch.constant.int 3072
    %10244 = torch.prim.ListConstruct %int1_13230, %int4608_13231, %int3072_13232 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10245 = torch.aten.view %10243, %10244 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // %10140 ([1,1,3072]) and %10122 ([1,4608,3072]) are defined before this chunk.
    // NOTE(review): presumably the block-32 modulation gate and the block-32 input (residual) - confirm.
    %10246 = torch.aten.mul.Tensor %10140, %10245 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_13233 = torch.constant.int 1
    // %10247 is the input to the single_blocks.33 normalization that follows.
    %10247 = torch.aten.add.Tensor %10122, %10246, %int1_13233 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.33.modulation: SiLU on %119, a linear layer to 9216 features,
    // then three 3072-wide slices of the result.
    // NOTE(review): %119 ([1,3072]) is defined before this chunk; presumably the conditioning vector - confirm.
    %10248 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.33.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.33.modulation.lin.weight : tensor<9216x3072xf16>
    %10249 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.33.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_13234 = torch.constant.int 0
    %int1_13235 = torch.constant.int 1
    // Swap dims 0 and 1 so the weight can be the right-hand operand of the mm.
    %10250 = torch.aten.transpose.int %10249, %int0_13234, %int1_13235 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.33.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.33.modulation.lin.bias : tensor<9216xf16>
    %10251 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.33.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Bias, activation and weight are each converted f16 -> f32 (dtype code 6) before the matmul.
    %int6_13236 = torch.constant.int 6
    %10252 = torch.prims.convert_element_type %10251, %int6_13236 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_13237 = torch.constant.int 6
    %10253 = torch.prims.convert_element_type %10248, %int6_13237 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_13238 = torch.constant.int 6
    %10254 = torch.prims.convert_element_type %10250, %int6_13238 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %10255 = torch.aten.mm %10253, %10254 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Both the product and the bias are multiplied by the integer constant 1
    // (presumably the alpha/beta factors left over from an addmm decomposition).
    %int1_13239 = torch.constant.int 1
    %10256 = torch.aten.mul.Scalar %10255, %int1_13239 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_13240 = torch.constant.int 1
    %10257 = torch.aten.mul.Scalar %10252, %int1_13240 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_13241 = torch.constant.int 1
    %10258 = torch.aten.add.Tensor %10256, %10257, %int1_13241 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    // Cast the f32 result back to f16 (dtype code 5).
    %int5_13242 = torch.constant.int 5
    %10259 = torch.prims.convert_element_type %10258, %int5_13242 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Slice dim 0 over [0, INT64_MAX) with step 1; the result type equals the input type.
    %int0_13243 = torch.constant.int 0
    %int0_13244 = torch.constant.int 0
    %int9223372036854775807_13245 = torch.constant.int 9223372036854775807
    %int1_13246 = torch.constant.int 1
    %10260 = torch.aten.slice.Tensor %10259, %int0_13243, %int0_13244, %int9223372036854775807_13245, %int1_13246 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a unit dim at position 1: [1,9216] -> [1,1,9216].
    %int1_13247 = torch.constant.int 1
    %10261 = torch.aten.unsqueeze %10260, %int1_13247 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Slice dim 2 over [0, INT64_MAX) with step 1; again the shape is unchanged.
    %int2_13248 = torch.constant.int 2
    %int0_13249 = torch.constant.int 0
    %int9223372036854775807_13250 = torch.constant.int 9223372036854775807
    %int1_13251 = torch.constant.int 1
    %10262 = torch.aten.slice.Tensor %10261, %int2_13248, %int0_13249, %int9223372036854775807_13250, %int1_13251 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Columns [0, 3072): added to the normalized activations (used as a shift).
    %int-1_13252 = torch.constant.int -1
    %int0_13253 = torch.constant.int 0
    %int3072_13254 = torch.constant.int 3072
    %int1_13255 = torch.constant.int 1
    %10263 = torch.aten.slice.Tensor %10262, %int-1_13252, %int0_13253, %int3072_13254, %int1_13255 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Columns [3072, 6144): after adding 1, multiplied into the normalized activations (used as a scale).
    %int-1_13256 = torch.constant.int -1
    %int3072_13257 = torch.constant.int 3072
    %int6144_13258 = torch.constant.int 6144
    %int1_13259 = torch.constant.int 1
    %10264 = torch.aten.slice.Tensor %10262, %int-1_13256, %int3072_13257, %int6144_13258, %int1_13259 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Columns [6144, 9216): not consumed by the ops visible in this chunk.
    // NOTE(review): presumably the gate applied after single_blocks.33.linear2 - confirm.
    %int-1_13260 = torch.constant.int -1
    %int6144_13261 = torch.constant.int 6144
    %int9216_13262 = torch.constant.int 9216
    %int1_13263 = torch.constant.int 1
    %10265 = torch.aten.slice.Tensor %10262, %int-1_13260, %int6144_13261, %int9216_13262, %int1_13263 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %10266 = %10264 + 1 (add.Scalar with alpha = 1).
    %int1_13264 = torch.constant.int 1
    %int1_13265 = torch.constant.int 1
    %10266 = torch.aten.add.Scalar %10264, %int1_13264, %int1_13265 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %10247 over dim 2 and apply the modulation terms:
    //   %10275 = %10266 * ((x - mean) * rsqrt(var + 1e-6)) + %10263
    // No learned weight or bias is applied to the normalized value in these ops.
    %int6_13266 = torch.constant.int 6
    %10267 = torch.prims.convert_element_type %10247, %int6_13266 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_13267 = torch.constant.int 2
    %10268 = torch.prim.ListConstruct %int2_13267 : (!torch.int) -> !torch.list<int>
    // correction = 0 and keepdim = true; result0 is the variance, result1 the mean.
    %int0_13268 = torch.constant.int 0
    %true_13269 = torch.constant.bool true
    %result0_13270, %result1_13271 = torch.aten.var_mean.correction %10267, %10268, %int0_13268, %true_13269 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // Epsilon is added to the variance before the reciprocal square root.
    %float9.999990e-07_13272 = torch.constant.float 9.9999999999999995E-7
    %int1_13273 = torch.constant.int 1
    %10269 = torch.aten.add.Scalar %result0_13270, %float9.999990e-07_13272, %int1_13273 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %10270 = torch.aten.rsqrt %10269 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    %int1_13274 = torch.constant.int 1
    // The f16 tensor %10247 (not its f32 copy %10267) is the minuend; the result type is f32.
    %10271 = torch.aten.sub.Tensor %10247, %result1_13271, %int1_13274 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %10272 = torch.aten.mul.Tensor %10271, %10270 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    // Cast to f16 (dtype code 5); the scale and shift below run in f16.
    %int5_13275 = torch.constant.int 5
    %10273 = torch.prims.convert_element_type %10272, %int5_13275 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %10274 = torch.aten.mul.Tensor %10266, %10273 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_13276 = torch.constant.int 1
    %10275 = torch.aten.add.Tensor %10274, %10263, %int1_13276 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.33.linear1: flatten the modulated activations to [4608,3072],
    // multiply by the transposed weight [3072,21504], add the bias (all in f32),
    // cast to f16 and restore the leading unit dim.
    %int4608_13277 = torch.constant.int 4608
    %int3072_13278 = torch.constant.int 3072
    %10276 = torch.prim.ListConstruct %int4608_13277, %int3072_13278 : (!torch.int, !torch.int) -> !torch.list<int>
    %10277 = torch.aten.view %10275, %10276 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    %__auto.sampler.single_blocks.33.linear1.weight = util.global.load @__auto.sampler.single_blocks.33.linear1.weight : tensor<21504x3072xf16>
    %10278 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.33.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_13279 = torch.constant.int 0
    %int1_13280 = torch.constant.int 1
    // Swap dims 0 and 1 so the weight can be the right-hand operand of the mm.
    %10279 = torch.aten.transpose.int %10278, %int0_13279, %int1_13280 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.33.linear1.bias = util.global.load @__auto.sampler.single_blocks.33.linear1.bias : tensor<21504xf16>
    %10280 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.33.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Bias, activation and weight are each converted f16 -> f32 (dtype code 6) before the matmul.
    %int6_13281 = torch.constant.int 6
    %10281 = torch.prims.convert_element_type %10280, %int6_13281 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_13282 = torch.constant.int 6
    %10282 = torch.prims.convert_element_type %10277, %int6_13282 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_13283 = torch.constant.int 6
    %10283 = torch.prims.convert_element_type %10279, %int6_13283 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %10284 = torch.aten.mm %10282, %10283 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Both the product and the bias are multiplied by the integer constant 1
    // (presumably the alpha/beta factors left over from an addmm decomposition).
    %int1_13284 = torch.constant.int 1
    %10285 = torch.aten.mul.Scalar %10284, %int1_13284 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_13285 = torch.constant.int 1
    %10286 = torch.aten.mul.Scalar %10281, %int1_13285 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_13286 = torch.constant.int 1
    %10287 = torch.aten.add.Tensor %10285, %10286, %int1_13286 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    // Cast the f32 result back to f16 (dtype code 5).
    %int5_13287 = torch.constant.int 5
    %10288 = torch.prims.convert_element_type %10287, %int5_13287 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    // Restore the leading unit dimension: [4608,21504] -> [1,4608,21504].
    %int1_13288 = torch.constant.int 1
    %int4608_13289 = torch.constant.int 4608
    %int21504_13290 = torch.constant.int 21504
    %10289 = torch.prim.ListConstruct %int1_13288, %int4608_13289, %int21504_13290 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10290 = torch.aten.view %10288, %10289 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the linear1 output along the last dim into columns [0, 9216) and
    // [9216, 21504), then break the first part into three [1,24,4608,128] tensors.
    %int-1_13291 = torch.constant.int -1
    %int0_13292 = torch.constant.int 0
    %int9216_13293 = torch.constant.int 9216
    %int1_13294 = torch.constant.int 1
    %10291 = torch.aten.slice.Tensor %10290, %int-1_13291, %int0_13292, %int9216_13293, %int1_13294 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    // The 12288-wide remainder is not consumed by the ops visible in this chunk.
    %int-1_13295 = torch.constant.int -1
    %int9216_13296 = torch.constant.int 9216
    %int21504_13297 = torch.constant.int 21504
    %int1_13298 = torch.constant.int 1
    %10292 = torch.aten.slice.Tensor %10290, %int-1_13295, %int9216_13296, %int21504_13297, %int1_13298 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View the 9216 columns as 3 x 24 x 128 (9216 = 3 * 24 * 128).
    %int1_13299 = torch.constant.int 1
    %int4608_13300 = torch.constant.int 4608
    %int3_13301 = torch.constant.int 3
    %int24_13302 = torch.constant.int 24
    %int128_13303 = torch.constant.int 128
    %10293 = torch.prim.ListConstruct %int1_13299, %int4608_13300, %int3_13301, %int24_13302, %int128_13303 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10294 = torch.aten.view %10291, %10293 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2,0,3,1,4): [1,4608,3,24,128] -> [3,1,24,4608,128],
    // moving the size-3 axis to the front.
    %int2_13304 = torch.constant.int 2
    %int0_13305 = torch.constant.int 0
    %int3_13306 = torch.constant.int 3
    %int1_13307 = torch.constant.int 1
    %int4_13308 = torch.constant.int 4
    %10295 = torch.prim.ListConstruct %int2_13304, %int0_13305, %int3_13306, %int1_13307, %int4_13308 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10296 = torch.aten.permute %10294, %10295 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0 of the leading axis; this is the tensor the following normalization ops consume.
    %int0_13309 = torch.constant.int 0
    %int0_13310 = torch.constant.int 0
    %10297 = torch.aten.select.int %10296, %int0_13309, %int0_13310 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 1 of the leading axis (presumably the key, by analogy with single_blocks.32).
    %int0_13311 = torch.constant.int 0
    %int1_13312 = torch.constant.int 1
    %10298 = torch.aten.select.int %10296, %int0_13311, %int1_13312 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 2 of the leading axis (presumably the value, by analogy with single_blocks.32).
    %int0_13313 = torch.constant.int 0
    %int2_13314 = torch.constant.int 2
    %10299 = torch.aten.select.int %10296, %int0_13313, %int2_13314 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.33 query normalisation:
    //   y = x * rsqrt(mean(x^2, axis=-1, keepdim=true) + 9.9999999999999995E-7)
    // computed in f32 (dtype code 6 yields f32 here), cast back to f16 (dtype
    // code 5), then multiplied by the 128-element query_norm.scale.
    %int6_13315 = torch.constant.int 6
    %10300 = torch.prims.convert_element_type %10297, %int6_13315 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_13316 = torch.constant.int 2
    %10301 = torch.aten.pow.Tensor_Scalar %10300, %int2_13316 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_13317 = torch.constant.int -1
    %10302 = torch.prim.ListConstruct %int-1_13317 : (!torch.int) -> !torch.list<int>
    %true_13318 = torch.constant.bool true
    %none_13319 = torch.constant.none
    %10303 = torch.aten.mean.dim %10301, %10302, %true_13318, %none_13319 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_13320 = torch.constant.float 9.9999999999999995E-7
    %int1_13321 = torch.constant.int 1
    %10304 = torch.aten.add.Scalar %10303, %float9.999990e-07_13320, %int1_13321 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %10305 = torch.aten.rsqrt %10304 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %10306 = torch.aten.mul.Tensor %10300, %10305 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13322 = torch.constant.int 5
    %10307 = torch.prims.convert_element_type %10306, %int5_13322 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.33.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.33.norm.query_norm.scale : tensor<128xf16>
    %10308 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.33.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %10309 = torch.aten.mul.Tensor %10307, %10308 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Same sequence applied to the key tensor %10298, scaled by key_norm.scale.
    %int6_13323 = torch.constant.int 6
    %10310 = torch.prims.convert_element_type %10298, %int6_13323 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_13324 = torch.constant.int 2
    %10311 = torch.aten.pow.Tensor_Scalar %10310, %int2_13324 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_13325 = torch.constant.int -1
    %10312 = torch.prim.ListConstruct %int-1_13325 : (!torch.int) -> !torch.list<int>
    %true_13326 = torch.constant.bool true
    %none_13327 = torch.constant.none
    %10313 = torch.aten.mean.dim %10311, %10312, %true_13326, %none_13327 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_13328 = torch.constant.float 9.9999999999999995E-7
    %int1_13329 = torch.constant.int 1
    %10314 = torch.aten.add.Scalar %10313, %float9.999990e-07_13328, %int1_13329 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %10315 = torch.aten.rsqrt %10314 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %10316 = torch.aten.mul.Tensor %10310, %10315 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13330 = torch.constant.int 5
    %10317 = torch.prims.convert_element_type %10316, %int5_13330 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.33.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.33.norm.key_norm.scale : tensor<128xf16>
    %10318 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.33.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %10319 = torch.aten.mul.Tensor %10317, %10318 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.33: combine the normalised query (%10309) and key (%10319)
    // with %211 ([1,1,4608,64,2,2] f32, defined before this excerpt).
    // NOTE(review): %211 is presumably the rotary position-embedding table --
    // confirm at its definition.
    // The next two conversions are f16 -> f16 (operand and result types are
    // identical), so they do not change the element type.
    %int5_13331 = torch.constant.int 5
    %10320 = torch.prims.convert_element_type %10309, %int5_13331 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_13332 = torch.constant.int 5
    %10321 = torch.prims.convert_element_type %10319, %int5_13332 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Upcast the query to f32 and view its last axis (128) as (64, 1, 2).
    %int6_13333 = torch.constant.int 6
    %10322 = torch.prims.convert_element_type %10320, %int6_13333 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_13334 = torch.constant.int 1
    %int24_13335 = torch.constant.int 24
    %int4608_13336 = torch.constant.int 4608
    %int64_13337 = torch.constant.int 64
    %int1_13338 = torch.constant.int 1
    %int2_13339 = torch.constant.int 2
    %10323 = torch.prim.ListConstruct %int1_13334, %int24_13335, %int4608_13336, %int64_13337, %int1_13338, %int2_13339 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10324 = torch.aten.view %10322, %10323 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Same upcast and view for the key.
    %int6_13340 = torch.constant.int 6
    %10325 = torch.prims.convert_element_type %10321, %int6_13340 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_13341 = torch.constant.int 1
    %int24_13342 = torch.constant.int 24
    %int4608_13343 = torch.constant.int 4608
    %int64_13344 = torch.constant.int 64
    %int1_13345 = torch.constant.int 1
    %int2_13346 = torch.constant.int 2
    %10326 = torch.prim.ListConstruct %int1_13341, %int24_13342, %int4608_13343, %int64_13344, %int1_13345, %int2_13346 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10327 = torch.aten.view %10325, %10326 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Query: %10334 = %211[..., 0] * q[..., 0] + %211[..., 1] * q[..., 1],
    // where the index is taken on axis 5. The multiplications broadcast the
    // size-1 axes (axis 1 of %211 against 24, axis 4 of q against 2).
    %int5_13347 = torch.constant.int 5
    %int0_13348 = torch.constant.int 0
    %10328 = torch.aten.select.int %211, %int5_13347, %int0_13348 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13349 = torch.constant.int 5
    %int0_13350 = torch.constant.int 0
    %10329 = torch.aten.select.int %10324, %int5_13349, %int0_13350 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10330 = torch.aten.mul.Tensor %10328, %10329 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_13351 = torch.constant.int 5
    %int1_13352 = torch.constant.int 1
    %10331 = torch.aten.select.int %211, %int5_13351, %int1_13352 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13353 = torch.constant.int 5
    %int1_13354 = torch.constant.int 1
    %10332 = torch.aten.select.int %10324, %int5_13353, %int1_13354 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10333 = torch.aten.mul.Tensor %10331, %10332 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_13355 = torch.constant.int 1
    %10334 = torch.aten.add.Tensor %10330, %10333, %int1_13355 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key: the same combination, producing %10341. The two slices of %211 are
    // selected again with the same indices as for the query.
    %int5_13356 = torch.constant.int 5
    %int0_13357 = torch.constant.int 0
    %10335 = torch.aten.select.int %211, %int5_13356, %int0_13357 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13358 = torch.constant.int 5
    %int0_13359 = torch.constant.int 0
    %10336 = torch.aten.select.int %10327, %int5_13358, %int0_13359 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10337 = torch.aten.mul.Tensor %10335, %10336 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_13360 = torch.constant.int 5
    %int1_13361 = torch.constant.int 1
    %10338 = torch.aten.select.int %211, %int5_13360, %int1_13361 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13362 = torch.constant.int 5
    %int1_13363 = torch.constant.int 1
    %10339 = torch.aten.select.int %10327, %int5_13362, %int1_13363 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10340 = torch.aten.mul.Tensor %10338, %10339 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_13364 = torch.constant.int 1
    %10341 = torch.aten.add.Tensor %10337, %10340, %int1_13364 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Flatten (64, 2) back to 128 and downcast to f16: query -> %10344.
    %int1_13365 = torch.constant.int 1
    %int24_13366 = torch.constant.int 24
    %int4608_13367 = torch.constant.int 4608
    %int128_13368 = torch.constant.int 128
    %10342 = torch.prim.ListConstruct %int1_13365, %int24_13366, %int4608_13367, %int128_13368 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10343 = torch.aten.view %10334, %10342 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13369 = torch.constant.int 5
    %10344 = torch.prims.convert_element_type %10343, %int5_13369 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Same flatten and downcast for the key -> %10347.
    %int1_13370 = torch.constant.int 1
    %int24_13371 = torch.constant.int 24
    %int4608_13372 = torch.constant.int 4608
    %int128_13373 = torch.constant.int 128
    %10345 = torch.prim.ListConstruct %int1_13370, %int24_13371, %int4608_13372, %int128_13373 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10346 = torch.aten.view %10341, %10345 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13374 = torch.constant.int 5
    %10347 = torch.prims.convert_element_type %10346, %int5_13374 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.33 attention. The op takes three [1,24,4608,128] f16 tensors
    // (%10344, %10347, %10299), a float 0.0, a bool false and two `none` values.
    // NOTE(review): per the ATen schema the trailing operands are presumably
    // dropout_p, is_causal, attn_mask and scale -- confirm against the op
    // definition.
    // Only result #0 is used below; result #1 ([1,24,4608] f32) is not used in
    // this excerpt.
    %float0.000000e00_13375 = torch.constant.float 0.000000e+00
    %false_13376 = torch.constant.bool false
    %none_13377 = torch.constant.none
    %none_13378 = torch.constant.none
    %10348:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%10344, %10347, %10299, %float0.000000e00_13375, %false_13376, %none_13377, %none_13378) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap axes 1 and 2 ([1,24,4608,128] -> [1,4608,24,128]) and merge the last
    // two axes into 3072 (= 24 * 128).
    %int0_13379 = torch.constant.int 0
    %int2_13380 = torch.constant.int 2
    %int1_13381 = torch.constant.int 1
    %int3_13382 = torch.constant.int 3
    %10349 = torch.prim.ListConstruct %int0_13379, %int2_13380, %int1_13381, %int3_13382 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10350 = torch.aten.permute %10348#0, %10349 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_13383 = torch.constant.int 1
    %int4608_13384 = torch.constant.int 4608
    %int3072_13385 = torch.constant.int 3072
    %10351 = torch.prim.ListConstruct %int1_13383, %int4608_13384, %int3072_13385 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10352 = torch.aten.view %10350, %10351 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.33 output projection.
    // Apply GELU with the "tanh" approximation string to %10292, then
    // concatenate it after the attention output %10352 along axis 2
    // (3072 + 12288 = 15360).
    %str_13386 = torch.constant.str "tanh"
    %10353 = torch.aten.gelu %10292, %str_13386 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %10354 = torch.prim.ListConstruct %10352, %10353 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_13387 = torch.constant.int 2
    %10355 = torch.aten.cat %10354, %int2_13387 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Drop the leading size-1 axis so the matmul below is 2-D.
    %int4608_13388 = torch.constant.int 4608
    %int15360_13389 = torch.constant.int 15360
    %10356 = torch.prim.ListConstruct %int4608_13388, %int15360_13389 : (!torch.int, !torch.int) -> !torch.list<int>
    %10357 = torch.aten.view %10355, %10356 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // linear2: load the [3072,15360] weight and transpose it to [15360,3072].
    %__auto.sampler.single_blocks.33.linear2.weight = util.global.load @__auto.sampler.single_blocks.33.linear2.weight : tensor<3072x15360xf16>
    %10358 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.33.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_13390 = torch.constant.int 0
    %int1_13391 = torch.constant.int 1
    %10359 = torch.aten.transpose.int %10358, %int0_13390, %int1_13391 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.33.linear2.bias = util.global.load @__auto.sampler.single_blocks.33.linear2.bias : tensor<3072xf16>
    %10360 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.33.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Bias, activations and weight are upcast to f32; the matmul and the bias
    // add run in f32 and the result is cast back to f16.
    %int6_13392 = torch.constant.int 6
    %10361 = torch.prims.convert_element_type %10360, %int6_13392 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_13393 = torch.constant.int 6
    %10362 = torch.prims.convert_element_type %10357, %int6_13393 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_13394 = torch.constant.int 6
    %10363 = torch.prims.convert_element_type %10359, %int6_13394 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %10364 = torch.aten.mm %10362, %10363 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Both the product and the bias are multiplied by the integer constant 1
    // before being added.
    %int1_13395 = torch.constant.int 1
    %10365 = torch.aten.mul.Scalar %10364, %int1_13395 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_13396 = torch.constant.int 1
    %10366 = torch.aten.mul.Scalar %10361, %int1_13396 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_13397 = torch.constant.int 1
    %10367 = torch.aten.add.Tensor %10365, %10366, %int1_13397 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_13398 = torch.constant.int 5
    %10368 = torch.prims.convert_element_type %10367, %int5_13398 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    // Restore the leading size-1 axis.
    %int1_13399 = torch.constant.int 1
    %int4608_13400 = torch.constant.int 4608
    %int3072_13401 = torch.constant.int 3072
    %10369 = torch.prim.ListConstruct %int1_13399, %int4608_13400, %int3072_13401 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10370 = torch.aten.view %10368, %10369 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // %10372 = %10247 + %10265 * %10370. Both %10247 ([1,4608,3072]) and
    // %10265 ([1,1,3072]) are defined before this excerpt.
    // NOTE(review): presumably the block input and the modulation gate --
    // confirm at their definitions.
    %10371 = torch.aten.mul.Tensor %10265, %10370 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_13402 = torch.constant.int 1
    %10372 = torch.aten.add.Tensor %10247, %10371, %int1_13402 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.34 modulation: SiLU of %119 ([1,3072] f16, defined before
    // this excerpt) followed by the modulation.lin linear layer (3072 -> 9216).
    // NOTE(review): %119 is presumably the conditioning vector -- confirm at its
    // definition.
    %10373 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.34.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.34.modulation.lin.weight : tensor<9216x3072xf16>
    %10374 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.34.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_13403 = torch.constant.int 0
    %int1_13404 = torch.constant.int 1
    %10375 = torch.aten.transpose.int %10374, %int0_13403, %int1_13404 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.34.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.34.modulation.lin.bias : tensor<9216xf16>
    %10376 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.34.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, input and weight to f32; matmul and bias add run in f32 and
    // the result is cast back to f16.
    %int6_13405 = torch.constant.int 6
    %10377 = torch.prims.convert_element_type %10376, %int6_13405 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_13406 = torch.constant.int 6
    %10378 = torch.prims.convert_element_type %10373, %int6_13406 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_13407 = torch.constant.int 6
    %10379 = torch.prims.convert_element_type %10375, %int6_13407 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %10380 = torch.aten.mm %10378, %10379 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    %int1_13408 = torch.constant.int 1
    %10381 = torch.aten.mul.Scalar %10380, %int1_13408 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_13409 = torch.constant.int 1
    %10382 = torch.aten.mul.Scalar %10377, %int1_13409 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_13410 = torch.constant.int 1
    %10383 = torch.aten.add.Tensor %10381, %10382, %int1_13410 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_13411 = torch.constant.int 5
    %10384 = torch.prims.convert_element_type %10383, %int5_13411 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Full-range slice on axis 0 (start 0, end INT64_MAX, step 1; the type is
    // unchanged), insert a size-1 axis at position 1, then another full-range
    // slice on axis 2. The net effect is [1,9216] -> [1,1,9216].
    %int0_13412 = torch.constant.int 0
    %int0_13413 = torch.constant.int 0
    %int9223372036854775807_13414 = torch.constant.int 9223372036854775807
    %int1_13415 = torch.constant.int 1
    %10385 = torch.aten.slice.Tensor %10384, %int0_13412, %int0_13413, %int9223372036854775807_13414, %int1_13415 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    %int1_13416 = torch.constant.int 1
    %10386 = torch.aten.unsqueeze %10385, %int1_13416 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int2_13417 = torch.constant.int 2
    %int0_13418 = torch.constant.int 0
    %int9223372036854775807_13419 = torch.constant.int 9223372036854775807
    %int1_13420 = torch.constant.int 1
    %10387 = torch.aten.slice.Tensor %10386, %int2_13417, %int0_13418, %int9223372036854775807_13419, %int1_13420 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Cut the last axis into three 3072-wide pieces:
    //   %10388 = [0, 3072)     added after the normalisation below
    //   %10389 = [3072, 6144)  multiplied in (as %10389 + 1) below
    //   %10390 = [6144, 9216)  not used within this excerpt
    %int-1_13421 = torch.constant.int -1
    %int0_13422 = torch.constant.int 0
    %int3072_13423 = torch.constant.int 3072
    %int1_13424 = torch.constant.int 1
    %10388 = torch.aten.slice.Tensor %10387, %int-1_13421, %int0_13422, %int3072_13423, %int1_13424 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_13425 = torch.constant.int -1
    %int3072_13426 = torch.constant.int 3072
    %int6144_13427 = torch.constant.int 6144
    %int1_13428 = torch.constant.int 1
    %10389 = torch.aten.slice.Tensor %10387, %int-1_13425, %int3072_13426, %int6144_13427, %int1_13428 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_13429 = torch.constant.int -1
    %int6144_13430 = torch.constant.int 6144
    %int9216_13431 = torch.constant.int 9216
    %int1_13432 = torch.constant.int 1
    %10390 = torch.aten.slice.Tensor %10387, %int-1_13429, %int6144_13430, %int9216_13431, %int1_13432 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %10391 = %10389 + 1.
    %int1_13433 = torch.constant.int 1
    %int1_13434 = torch.constant.int 1
    %10391 = torch.aten.add.Scalar %10389, %int1_13433, %int1_13434 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // single_blocks.34: normalise %10372 over axis 2 without any learned
    // weight, then apply the modulation scale and shift.
    // Variance and mean are computed in f32 over axis 2 with correction 0 and
    // keepdim = true.
    %int6_13435 = torch.constant.int 6
    %10392 = torch.prims.convert_element_type %10372, %int6_13435 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_13436 = torch.constant.int 2
    %10393 = torch.prim.ListConstruct %int2_13436 : (!torch.int) -> !torch.list<int>
    %int0_13437 = torch.constant.int 0
    %true_13438 = torch.constant.bool true
    %result0_13439, %result1_13440 = torch.aten.var_mean.correction %10392, %10393, %int0_13437, %true_13438 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // rstd = rsqrt(var + 9.9999999999999995E-7)
    %float9.999990e-07_13441 = torch.constant.float 9.9999999999999995E-7
    %int1_13442 = torch.constant.int 1
    %10394 = torch.aten.add.Scalar %result0_13439, %float9.999990e-07_13441, %int1_13442 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %10395 = torch.aten.rsqrt %10394 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the f16 tensor %10372 (not the f32 copy %10392) and
    // the f32 mean; the declared result type is f32.
    %int1_13443 = torch.constant.int 1
    %10396 = torch.aten.sub.Tensor %10372, %result1_13440, %int1_13443 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %10397 = torch.aten.mul.Tensor %10396, %10395 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_13444 = torch.constant.int 5
    %10398 = torch.prims.convert_element_type %10397, %int5_13444 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // %10400 = %10391 * normalised + %10388, broadcasting the [1,1,3072]
    // operands over the 4608 axis.
    %10399 = torch.aten.mul.Tensor %10391, %10398 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_13445 = torch.constant.int 1
    %10400 = torch.aten.add.Tensor %10399, %10388, %int1_13445 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.34 linear1 (3072 -> 21504).
    // Drop the leading size-1 axis so the matmul below is 2-D.
    %int4608_13446 = torch.constant.int 4608
    %int3072_13447 = torch.constant.int 3072
    %10401 = torch.prim.ListConstruct %int4608_13446, %int3072_13447 : (!torch.int, !torch.int) -> !torch.list<int>
    %10402 = torch.aten.view %10400, %10401 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // Load the [21504,3072] weight and transpose it to [3072,21504].
    %__auto.sampler.single_blocks.34.linear1.weight = util.global.load @__auto.sampler.single_blocks.34.linear1.weight : tensor<21504x3072xf16>
    %10403 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.34.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_13448 = torch.constant.int 0
    %int1_13449 = torch.constant.int 1
    %10404 = torch.aten.transpose.int %10403, %int0_13448, %int1_13449 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.34.linear1.bias = util.global.load @__auto.sampler.single_blocks.34.linear1.bias : tensor<21504xf16>
    %10405 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.34.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Upcast bias, activations and weight to f32; matmul and bias add run in
    // f32 and the result is cast back to f16.
    %int6_13450 = torch.constant.int 6
    %10406 = torch.prims.convert_element_type %10405, %int6_13450 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_13451 = torch.constant.int 6
    %10407 = torch.prims.convert_element_type %10402, %int6_13451 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_13452 = torch.constant.int 6
    %10408 = torch.prims.convert_element_type %10404, %int6_13452 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %10409 = torch.aten.mm %10407, %10408 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Both the product and the bias are multiplied by the integer constant 1
    // before being added.
    %int1_13453 = torch.constant.int 1
    %10410 = torch.aten.mul.Scalar %10409, %int1_13453 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_13454 = torch.constant.int 1
    %10411 = torch.aten.mul.Scalar %10406, %int1_13454 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_13455 = torch.constant.int 1
    %10412 = torch.aten.add.Tensor %10410, %10411, %int1_13455 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_13456 = torch.constant.int 5
    %10413 = torch.prims.convert_element_type %10412, %int5_13456 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    // Restore the leading size-1 axis.
    %int1_13457 = torch.constant.int 1
    %int4608_13458 = torch.constant.int 4608
    %int21504_13459 = torch.constant.int 21504
    %10414 = torch.prim.ListConstruct %int1_13457, %int4608_13458, %int21504_13459 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10415 = torch.aten.view %10413, %10414 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // single_blocks.34: split the last axis of the linear1 output %10415 into
    // columns [0, 9216) -> %10416 and columns [9216, 21504) -> %10417.
    // %10417 is not used within this excerpt.
    %int-1_13460 = torch.constant.int -1
    %int0_13461 = torch.constant.int 0
    %int9216_13462 = torch.constant.int 9216
    %int1_13463 = torch.constant.int 1
    %10416 = torch.aten.slice.Tensor %10415, %int-1_13460, %int0_13461, %int9216_13462, %int1_13463 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_13464 = torch.constant.int -1
    %int9216_13465 = torch.constant.int 9216
    %int21504_13466 = torch.constant.int 21504
    %int1_13467 = torch.constant.int 1
    %10417 = torch.aten.slice.Tensor %10415, %int-1_13464, %int9216_13465, %int21504_13466, %int1_13467 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View the 9216 columns as (3, 24, 128), i.e. [1,4608,3,24,128].
    %int1_13468 = torch.constant.int 1
    %int4608_13469 = torch.constant.int 4608
    %int3_13470 = torch.constant.int 3
    %int24_13471 = torch.constant.int 24
    %int128_13472 = torch.constant.int 128
    %10418 = torch.prim.ListConstruct %int1_13468, %int4608_13469, %int3_13470, %int24_13471, %int128_13472 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10419 = torch.aten.view %10416, %10418 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2, 0, 3, 1, 4) so the size-3 axis comes first:
    // [1,4608,3,24,128] -> [3,1,24,4608,128].
    %int2_13473 = torch.constant.int 2
    %int0_13474 = torch.constant.int 0
    %int3_13475 = torch.constant.int 3
    %int1_13476 = torch.constant.int 1
    %int4_13477 = torch.constant.int 4
    %10420 = torch.prim.ListConstruct %int2_13473, %int0_13474, %int3_13475, %int1_13476, %int4_13477 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10421 = torch.aten.permute %10419, %10420 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Take indices 0, 1 and 2 along axis 0. %10422 and %10423 are later scaled
    // by query_norm.scale and key_norm.scale respectively; %10424 is not used
    // within this excerpt.
    // NOTE(review): %10424 is presumably the value tensor -- confirm at its use.
    %int0_13478 = torch.constant.int 0
    %int0_13479 = torch.constant.int 0
    %10422 = torch.aten.select.int %10421, %int0_13478, %int0_13479 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_13480 = torch.constant.int 0
    %int1_13481 = torch.constant.int 1
    %10423 = torch.aten.select.int %10421, %int0_13480, %int1_13481 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_13482 = torch.constant.int 0
    %int2_13483 = torch.constant.int 2
    %10424 = torch.aten.select.int %10421, %int0_13482, %int2_13483 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.34 query normalisation:
    //   y = x * rsqrt(mean(x^2, axis=-1, keepdim=true) + 9.9999999999999995E-7)
    // computed in f32 (dtype code 6 yields f32 here), cast back to f16 (dtype
    // code 5), then multiplied by the 128-element query_norm.scale.
    %int6_13484 = torch.constant.int 6
    %10425 = torch.prims.convert_element_type %10422, %int6_13484 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_13485 = torch.constant.int 2
    %10426 = torch.aten.pow.Tensor_Scalar %10425, %int2_13485 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_13486 = torch.constant.int -1
    %10427 = torch.prim.ListConstruct %int-1_13486 : (!torch.int) -> !torch.list<int>
    %true_13487 = torch.constant.bool true
    %none_13488 = torch.constant.none
    %10428 = torch.aten.mean.dim %10426, %10427, %true_13487, %none_13488 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_13489 = torch.constant.float 9.9999999999999995E-7
    %int1_13490 = torch.constant.int 1
    %10429 = torch.aten.add.Scalar %10428, %float9.999990e-07_13489, %int1_13490 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %10430 = torch.aten.rsqrt %10429 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %10431 = torch.aten.mul.Tensor %10425, %10430 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13491 = torch.constant.int 5
    %10432 = torch.prims.convert_element_type %10431, %int5_13491 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.34.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.34.norm.query_norm.scale : tensor<128xf16>
    %10433 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.34.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %10434 = torch.aten.mul.Tensor %10432, %10433 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Same sequence applied to the key tensor %10423, scaled by key_norm.scale.
    %int6_13492 = torch.constant.int 6
    %10435 = torch.prims.convert_element_type %10423, %int6_13492 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_13493 = torch.constant.int 2
    %10436 = torch.aten.pow.Tensor_Scalar %10435, %int2_13493 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_13494 = torch.constant.int -1
    %10437 = torch.prim.ListConstruct %int-1_13494 : (!torch.int) -> !torch.list<int>
    %true_13495 = torch.constant.bool true
    %none_13496 = torch.constant.none
    %10438 = torch.aten.mean.dim %10436, %10437, %true_13495, %none_13496 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_13497 = torch.constant.float 9.9999999999999995E-7
    %int1_13498 = torch.constant.int 1
    %10439 = torch.aten.add.Scalar %10438, %float9.999990e-07_13497, %int1_13498 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %10440 = torch.aten.rsqrt %10439 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %10441 = torch.aten.mul.Tensor %10435, %10440 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13499 = torch.constant.int 5
    %10442 = torch.prims.convert_element_type %10441, %int5_13499 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.34.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.34.norm.key_norm.scale : tensor<128xf16>
    %10443 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.34.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %10444 = torch.aten.mul.Tensor %10442, %10443 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.34: rotate the scaled query (%10434) and key (%10444) with
    // the shared f32 table %211 of shape [1,1,4608,64,2,2]. %211 is defined
    // earlier in the function (presumably the rotary position-embedding
    // coefficients -- confirm at its definition).
    // These two casts are f16 -> f16, so they do not change the element type.
    %int5_13500 = torch.constant.int 5
    %10445 = torch.prims.convert_element_type %10434, %int5_13500 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_13501 = torch.constant.int 5
    %10446 = torch.prims.convert_element_type %10444, %int5_13501 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Query: upcast to f32 and view the 128 channels as 64 pairs,
    // giving [1,24,4608,64,1,2].
    %int6_13502 = torch.constant.int 6
    %10447 = torch.prims.convert_element_type %10445, %int6_13502 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_13503 = torch.constant.int 1
    %int24_13504 = torch.constant.int 24
    %int4608_13505 = torch.constant.int 4608
    %int64_13506 = torch.constant.int 64
    %int1_13507 = torch.constant.int 1
    %int2_13508 = torch.constant.int 2
    %10448 = torch.prim.ListConstruct %int1_13503, %int24_13504, %int4608_13505, %int64_13506, %int1_13507, %int2_13508 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10449 = torch.aten.view %10447, %10448 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Key: same upcast and pairwise view.
    %int6_13509 = torch.constant.int 6
    %10450 = torch.prims.convert_element_type %10446, %int6_13509 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_13510 = torch.constant.int 1
    %int24_13511 = torch.constant.int 24
    %int4608_13512 = torch.constant.int 4608
    %int64_13513 = torch.constant.int 64
    %int1_13514 = torch.constant.int 1
    %int2_13515 = torch.constant.int 2
    %10451 = torch.prim.ListConstruct %int1_13510, %int24_13511, %int4608_13512, %int64_13513, %int1_13514, %int2_13515 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10452 = torch.aten.view %10450, %10451 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Query rotation: %10459 = %211[..., 0] * q[..., 0] + %211[..., 1] * q[..., 1].
    // The size-1 axes broadcast (heads 1 -> 24 on the table side, 1 -> 2 on the
    // q side), so each product has shape [1,24,4608,64,2].
    %int5_13516 = torch.constant.int 5
    %int0_13517 = torch.constant.int 0
    %10453 = torch.aten.select.int %211, %int5_13516, %int0_13517 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13518 = torch.constant.int 5
    %int0_13519 = torch.constant.int 0
    %10454 = torch.aten.select.int %10449, %int5_13518, %int0_13519 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10455 = torch.aten.mul.Tensor %10453, %10454 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_13520 = torch.constant.int 5
    %int1_13521 = torch.constant.int 1
    %10456 = torch.aten.select.int %211, %int5_13520, %int1_13521 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13522 = torch.constant.int 5
    %int1_13523 = torch.constant.int 1
    %10457 = torch.aten.select.int %10449, %int5_13522, %int1_13523 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10458 = torch.aten.mul.Tensor %10456, %10457 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_13524 = torch.constant.int 1
    %10459 = torch.aten.add.Tensor %10455, %10458, %int1_13524 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key rotation: identical formula applied to %10452, producing %10466.
    %int5_13525 = torch.constant.int 5
    %int0_13526 = torch.constant.int 0
    %10460 = torch.aten.select.int %211, %int5_13525, %int0_13526 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13527 = torch.constant.int 5
    %int0_13528 = torch.constant.int 0
    %10461 = torch.aten.select.int %10452, %int5_13527, %int0_13528 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10462 = torch.aten.mul.Tensor %10460, %10461 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_13529 = torch.constant.int 5
    %int1_13530 = torch.constant.int 1
    %10463 = torch.aten.select.int %211, %int5_13529, %int1_13530 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13531 = torch.constant.int 5
    %int1_13532 = torch.constant.int 1
    %10464 = torch.aten.select.int %10452, %int5_13531, %int1_13532 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10465 = torch.aten.mul.Tensor %10463, %10464 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_13533 = torch.constant.int 1
    %10466 = torch.aten.add.Tensor %10462, %10465, %int1_13533 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Collapse the (64,2) pairs back to 128 channels and cast to f16:
    // %10469 is the rotated query, %10472 the rotated key.
    %int1_13534 = torch.constant.int 1
    %int24_13535 = torch.constant.int 24
    %int4608_13536 = torch.constant.int 4608
    %int128_13537 = torch.constant.int 128
    %10467 = torch.prim.ListConstruct %int1_13534, %int24_13535, %int4608_13536, %int128_13537 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10468 = torch.aten.view %10459, %10467 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13538 = torch.constant.int 5
    %10469 = torch.prims.convert_element_type %10468, %int5_13538 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_13539 = torch.constant.int 1
    %int24_13540 = torch.constant.int 24
    %int4608_13541 = torch.constant.int 4608
    %int128_13542 = torch.constant.int 128
    %10470 = torch.prim.ListConstruct %int1_13539, %int24_13540, %int4608_13541, %int128_13542 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10471 = torch.aten.view %10466, %10470 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13543 = torch.constant.int 5
    %10472 = torch.prims.convert_element_type %10471, %int5_13543 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.34: attention over the rotated query (%10469) and key
    // (%10472). %10424 is defined earlier and is passed in the third tensor
    // operand position (value, per the aten schema). The trailing operands are
    // 0.0, false, none, none; per the aten schema these are dropout_p,
    // is_causal, attn_mask and scale -- verify against the PyTorch op signature.
    %float0.000000e00_13544 = torch.constant.float 0.000000e+00
    %false_13545 = torch.constant.bool false
    %none_13546 = torch.constant.none
    %none_13547 = torch.constant.none
    // Result #0 is the [1,24,4608,128] f16 output; result #1 ([1,24,4608] f32)
    // is not used within this section.
    %10473:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%10469, %10472, %10424, %float0.000000e00_13544, %false_13545, %none_13546, %none_13547) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2 ([1,24,4608,128] -> [1,4608,24,128]) ...
    %int0_13548 = torch.constant.int 0
    %int2_13549 = torch.constant.int 2
    %int1_13550 = torch.constant.int 1
    %int3_13551 = torch.constant.int 3
    %10474 = torch.prim.ListConstruct %int0_13548, %int2_13549, %int1_13550, %int3_13551 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10475 = torch.aten.permute %10473#0, %10474 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    // ... then merge the trailing (24,128) dims into 3072 features.
    %int1_13552 = torch.constant.int 1
    %int4608_13553 = torch.constant.int 4608
    %int3072_13554 = torch.constant.int 3072
    %10476 = torch.prim.ListConstruct %int1_13552, %int4608_13553, %int3072_13554 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10477 = torch.aten.view %10475, %10476 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.34: output projection and gated residual.
    // Apply tanh-approximated GELU to %10417 ([1,4608,12288], defined earlier in
    // the function; presumably the MLP half of this block's linear1 output --
    // confirm at its definition).
    %str_13555 = torch.constant.str "tanh"
    %10478 = torch.aten.gelu %10417, %str_13555 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    // Concatenate attention output (3072) and GELU output (12288) along dim 2,
    // giving 15360 features.
    %10479 = torch.prim.ListConstruct %10477, %10478 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_13556 = torch.constant.int 2
    %10480 = torch.aten.cat %10479, %int2_13556 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    // Drop the leading size-1 dim so the projection can be expressed as a 2-D mm.
    %int4608_13557 = torch.constant.int 4608
    %int15360_13558 = torch.constant.int 15360
    %10481 = torch.prim.ListConstruct %int4608_13557, %int15360_13558 : (!torch.int, !torch.int) -> !torch.list<int>
    %10482 = torch.aten.view %10480, %10481 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // linear2: the weight is stored [out=3072, in=15360] and transposed for x @ W^T.
    %__auto.sampler.single_blocks.34.linear2.weight = util.global.load @__auto.sampler.single_blocks.34.linear2.weight : tensor<3072x15360xf16>
    %10483 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.34.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_13559 = torch.constant.int 0
    %int1_13560 = torch.constant.int 1
    %10484 = torch.aten.transpose.int %10483, %int0_13559, %int1_13560 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.34.linear2.bias = util.global.load @__auto.sampler.single_blocks.34.linear2.bias : tensor<3072xf16>
    %10485 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.34.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Bias, activation and weight are all upcast to f32; the matmul and bias add
    // run in f32.
    %int6_13561 = torch.constant.int 6
    %10486 = torch.prims.convert_element_type %10485, %int6_13561 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_13562 = torch.constant.int 6
    %10487 = torch.prims.convert_element_type %10482, %int6_13562 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_13563 = torch.constant.int 6
    %10488 = torch.prims.convert_element_type %10484, %int6_13563 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %10489 = torch.aten.mm %10487, %10488 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Both operands are multiplied by the integer 1, which leaves the values
    // unchanged (presumably residual alpha/beta factors from an addmm-style
    // decomposition -- TODO confirm).
    %int1_13564 = torch.constant.int 1
    %10490 = torch.aten.mul.Scalar %10489, %int1_13564 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_13565 = torch.constant.int 1
    %10491 = torch.aten.mul.Scalar %10486, %int1_13565 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_13566 = torch.constant.int 1
    %10492 = torch.aten.add.Tensor %10490, %10491, %int1_13566 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    // Cast the result back to f16 and restore the leading size-1 dim.
    %int5_13567 = torch.constant.int 5
    %10493 = torch.prims.convert_element_type %10492, %int5_13567 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    %int1_13568 = torch.constant.int 1
    %int4608_13569 = torch.constant.int 4608
    %int3072_13570 = torch.constant.int 3072
    %10494 = torch.prim.ListConstruct %int1_13568, %int4608_13569, %int3072_13570 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10495 = torch.aten.view %10493, %10494 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // %10497 = %10372 + %10390 * linear2_output. %10390 ([1,1,3072]) and %10372
    // ([1,4608,3072]) are defined earlier (presumably this block's modulation
    // gate and its residual input -- confirm at their definitions).
    %10496 = torch.aten.mul.Tensor %10390, %10495 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_13571 = torch.constant.int 1
    %10497 = torch.aten.add.Tensor %10372, %10496, %int1_13571 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.35 modulation: SiLU of %119 ([1,3072], defined earlier in
    // the function; presumably the shared conditioning vector -- confirm),
    // followed by a 3072 -> 9216 linear layer whose output is split into three
    // 3072-wide chunks.
    %10498 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // The weight is stored [out=9216, in=3072] and transposed for x @ W^T.
    %__auto.sampler.single_blocks.35.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.35.modulation.lin.weight : tensor<9216x3072xf16>
    %10499 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.35.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_13572 = torch.constant.int 0
    %int1_13573 = torch.constant.int 1
    %10500 = torch.aten.transpose.int %10499, %int0_13572, %int1_13573 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.35.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.35.modulation.lin.bias : tensor<9216xf16>
    %10501 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.35.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    // Upcast bias, activation and weight to f32; the matmul and bias add run in f32.
    %int6_13574 = torch.constant.int 6
    %10502 = torch.prims.convert_element_type %10501, %int6_13574 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_13575 = torch.constant.int 6
    %10503 = torch.prims.convert_element_type %10498, %int6_13575 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_13576 = torch.constant.int 6
    %10504 = torch.prims.convert_element_type %10500, %int6_13576 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %10505 = torch.aten.mm %10503, %10504 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    %int1_13577 = torch.constant.int 1
    %10506 = torch.aten.mul.Scalar %10505, %int1_13577 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_13578 = torch.constant.int 1
    %10507 = torch.aten.mul.Scalar %10502, %int1_13578 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_13579 = torch.constant.int 1
    %10508 = torch.aten.add.Tensor %10506, %10507, %int1_13579 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_13580 = torch.constant.int 5
    %10509 = torch.prims.convert_element_type %10508, %int5_13580 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Full-range slice on dim 0 (start 0, end INT64_MAX, step 1): the shape is
    // unchanged.
    %int0_13581 = torch.constant.int 0
    %int0_13582 = torch.constant.int 0
    %int9223372036854775807_13583 = torch.constant.int 9223372036854775807
    %int1_13584 = torch.constant.int 1
    %10510 = torch.aten.slice.Tensor %10509, %int0_13581, %int0_13582, %int9223372036854775807_13583, %int1_13584 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Insert a size-1 dim at position 1 so the result broadcasts against
    // [1,4608,3072] tensors.
    %int1_13585 = torch.constant.int 1
    %10511 = torch.aten.unsqueeze %10510, %int1_13585 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Another full-range slice, this time on dim 2; the shape is again unchanged.
    %int2_13586 = torch.constant.int 2
    %int0_13587 = torch.constant.int 0
    %int9223372036854775807_13588 = torch.constant.int 9223372036854775807
    %int1_13589 = torch.constant.int 1
    %10512 = torch.aten.slice.Tensor %10511, %int2_13586, %int0_13587, %int9223372036854775807_13588, %int1_13589 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Chunk [0:3072] -> %10513; later added after normalization (acts as shift).
    %int-1_13590 = torch.constant.int -1
    %int0_13591 = torch.constant.int 0
    %int3072_13592 = torch.constant.int 3072
    %int1_13593 = torch.constant.int 1
    %10513 = torch.aten.slice.Tensor %10512, %int-1_13590, %int0_13591, %int3072_13592, %int1_13593 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk [3072:6144] -> %10514; used below as (1 + scale).
    %int-1_13594 = torch.constant.int -1
    %int3072_13595 = torch.constant.int 3072
    %int6144_13596 = torch.constant.int 6144
    %int1_13597 = torch.constant.int 1
    %10514 = torch.aten.slice.Tensor %10512, %int-1_13594, %int3072_13595, %int6144_13596, %int1_13597 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Chunk [6144:9216] -> %10515; not consumed within this section (presumably
    // the gate applied after this block's linear2 -- confirm at its use).
    %int-1_13598 = torch.constant.int -1
    %int6144_13599 = torch.constant.int 6144
    %int9216_13600 = torch.constant.int 9216
    %int1_13601 = torch.constant.int 1
    %10515 = torch.aten.slice.Tensor %10512, %int-1_13598, %int6144_13599, %int9216_13600, %int1_13601 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %10516 = %10514 + 1.
    %int1_13602 = torch.constant.int 1
    %int1_13603 = torch.constant.int 1
    %10516 = torch.aten.add.Scalar %10514, %int1_13602, %int1_13603 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // single_blocks.35: normalize %10497 over the feature dim (dim 2) without
    // learned affine parameters, then modulate:
    //   %10525 = %10516 * normalized + %10513.
    // Variance and mean are computed in f32 with correction 0 and the reduced
    // dim kept as size 1.
    %int6_13604 = torch.constant.int 6
    %10517 = torch.prims.convert_element_type %10497, %int6_13604 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_13605 = torch.constant.int 2
    %10518 = torch.prim.ListConstruct %int2_13605 : (!torch.int) -> !torch.list<int>
    %int0_13606 = torch.constant.int 0
    %true_13607 = torch.constant.bool true
    // result0 is the variance, result1 the mean (the op's result order).
    %result0_13608, %result1_13609 = torch.aten.var_mean.correction %10517, %10518, %int0_13606, %true_13607 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    // rsqrt(var + eps), with eps ~= 1e-6.
    %float9.999990e-07_13610 = torch.constant.float 9.9999999999999995E-7
    %int1_13611 = torch.constant.int 1
    %10519 = torch.aten.add.Scalar %result0_13608, %float9.999990e-07_13610, %int1_13611 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %10520 = torch.aten.rsqrt %10519 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the original f16 %10497 (not the f32 copy %10517)
    // together with the f32 mean; the declared result type is f32.
    %int1_13612 = torch.constant.int 1
    %10521 = torch.aten.sub.Tensor %10497, %result1_13609, %int1_13612 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %10522 = torch.aten.mul.Tensor %10521, %10520 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    // Cast back to f16, then apply the modulation scale and shift in f16.
    %int5_13613 = torch.constant.int 5
    %10523 = torch.prims.convert_element_type %10522, %int5_13613 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %10524 = torch.aten.mul.Tensor %10516, %10523 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_13614 = torch.constant.int 1
    %10525 = torch.aten.add.Tensor %10524, %10513, %int1_13614 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.35 linear1: project the modulated activations from 3072 to
    // 21504 features, then split into a 9216-wide part (reshaped to three
    // [1,24,4608,128] tensors) and a 12288-wide part.
    // Drop the leading size-1 dim so the projection can be expressed as a 2-D mm.
    %int4608_13615 = torch.constant.int 4608
    %int3072_13616 = torch.constant.int 3072
    %10526 = torch.prim.ListConstruct %int4608_13615, %int3072_13616 : (!torch.int, !torch.int) -> !torch.list<int>
    %10527 = torch.aten.view %10525, %10526 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    // The weight is stored [out=21504, in=3072] and transposed for x @ W^T.
    %__auto.sampler.single_blocks.35.linear1.weight = util.global.load @__auto.sampler.single_blocks.35.linear1.weight : tensor<21504x3072xf16>
    %10528 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.35.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_13617 = torch.constant.int 0
    %int1_13618 = torch.constant.int 1
    %10529 = torch.aten.transpose.int %10528, %int0_13617, %int1_13618 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.35.linear1.bias = util.global.load @__auto.sampler.single_blocks.35.linear1.bias : tensor<21504xf16>
    %10530 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.35.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    // Upcast bias, activation and weight to f32; the matmul and bias add run in f32.
    %int6_13619 = torch.constant.int 6
    %10531 = torch.prims.convert_element_type %10530, %int6_13619 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_13620 = torch.constant.int 6
    %10532 = torch.prims.convert_element_type %10527, %int6_13620 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_13621 = torch.constant.int 6
    %10533 = torch.prims.convert_element_type %10529, %int6_13621 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %10534 = torch.aten.mm %10532, %10533 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // Multiplications by the integer 1 leave the values unchanged.
    %int1_13622 = torch.constant.int 1
    %10535 = torch.aten.mul.Scalar %10534, %int1_13622 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_13623 = torch.constant.int 1
    %10536 = torch.aten.mul.Scalar %10531, %int1_13623 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_13624 = torch.constant.int 1
    %10537 = torch.aten.add.Tensor %10535, %10536, %int1_13624 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    // Cast back to f16 and restore the leading size-1 dim.
    %int5_13625 = torch.constant.int 5
    %10538 = torch.prims.convert_element_type %10537, %int5_13625 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_13626 = torch.constant.int 1
    %int4608_13627 = torch.constant.int 4608
    %int21504_13628 = torch.constant.int 21504
    %10539 = torch.prim.ListConstruct %int1_13626, %int4608_13627, %int21504_13628 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10540 = torch.aten.view %10538, %10539 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Features [0:9216] -> %10541, reshaped below into three per-head tensors.
    %int-1_13629 = torch.constant.int -1
    %int0_13630 = torch.constant.int 0
    %int9216_13631 = torch.constant.int 9216
    %int1_13632 = torch.constant.int 1
    %10541 = torch.aten.slice.Tensor %10540, %int-1_13629, %int0_13630, %int9216_13631, %int1_13632 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    // Features [9216:21504] -> %10542 (12288 wide); not consumed within this
    // section (presumably the MLP branch input -- confirm at its use).
    %int-1_13633 = torch.constant.int -1
    %int9216_13634 = torch.constant.int 9216
    %int21504_13635 = torch.constant.int 21504
    %int1_13636 = torch.constant.int 1
    %10542 = torch.aten.slice.Tensor %10540, %int-1_13633, %int9216_13634, %int21504_13635, %int1_13636 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View 9216 as (3, 24, 128), giving [1,4608,3,24,128].
    %int1_13637 = torch.constant.int 1
    %int4608_13638 = torch.constant.int 4608
    %int3_13639 = torch.constant.int 3
    %int24_13640 = torch.constant.int 24
    %int128_13641 = torch.constant.int 128
    %10543 = torch.prim.ListConstruct %int1_13637, %int4608_13638, %int3_13639, %int24_13640, %int128_13641 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10544 = torch.aten.view %10541, %10543 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    // Permute with order (2,0,3,1,4), giving [3,1,24,4608,128].
    %int2_13642 = torch.constant.int 2
    %int0_13643 = torch.constant.int 0
    %int3_13644 = torch.constant.int 3
    %int1_13645 = torch.constant.int 1
    %int4_13646 = torch.constant.int 4
    %10545 = torch.prim.ListConstruct %int2_13642, %int0_13643, %int3_13644, %int1_13645, %int4_13646 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10546 = torch.aten.permute %10544, %10545 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0 on dim 0 -> %10547 (later scaled by query_norm.scale).
    %int0_13647 = torch.constant.int 0
    %int0_13648 = torch.constant.int 0
    %10547 = torch.aten.select.int %10546, %int0_13647, %int0_13648 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 1 on dim 0 -> %10548 (later scaled by key_norm.scale).
    %int0_13649 = torch.constant.int 0
    %int1_13650 = torch.constant.int 1
    %10548 = torch.aten.select.int %10546, %int0_13649, %int1_13650 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Index 2 on dim 0 -> %10549; not consumed within this section (presumably
    // the attention value -- confirm at its use).
    %int0_13651 = torch.constant.int 0
    %int2_13652 = torch.constant.int 2
    %10549 = torch.aten.select.int %10546, %int0_13651, %int2_13652 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.35: normalize query (%10547) and key (%10548) over the last
    // (128-wide) dimension as x * rsqrt(mean(x^2) + eps), computed in f32, then
    // cast to f16 and multiply by a learned [128] f16 scale.
    // --- query ---
    %int6_13653 = torch.constant.int 6
    %10550 = torch.prims.convert_element_type %10547, %int6_13653 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_13654 = torch.constant.int 2
    %10551 = torch.aten.pow.Tensor_Scalar %10550, %int2_13654 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    // Mean over dim -1 with the reduced dim kept as size 1.
    %int-1_13655 = torch.constant.int -1
    %10552 = torch.prim.ListConstruct %int-1_13655 : (!torch.int) -> !torch.list<int>
    %true_13656 = torch.constant.bool true
    %none_13657 = torch.constant.none
    %10553 = torch.aten.mean.dim %10551, %10552, %true_13656, %none_13657 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    // eps ~= 1e-6.
    %float9.999990e-07_13658 = torch.constant.float 9.9999999999999995E-7
    %int1_13659 = torch.constant.int 1
    %10554 = torch.aten.add.Scalar %10553, %float9.999990e-07_13658, %int1_13659 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %10555 = torch.aten.rsqrt %10554 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %10556 = torch.aten.mul.Tensor %10550, %10555 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13660 = torch.constant.int 5
    %10557 = torch.prims.convert_element_type %10556, %int5_13660 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.35.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.35.norm.query_norm.scale : tensor<128xf16>
    %10558 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.35.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %10559 = torch.aten.mul.Tensor %10557, %10558 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // --- key: same sequence of ops, using key_norm.scale ---
    %int6_13661 = torch.constant.int 6
    %10560 = torch.prims.convert_element_type %10548, %int6_13661 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_13662 = torch.constant.int 2
    %10561 = torch.aten.pow.Tensor_Scalar %10560, %int2_13662 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_13663 = torch.constant.int -1
    %10562 = torch.prim.ListConstruct %int-1_13663 : (!torch.int) -> !torch.list<int>
    %true_13664 = torch.constant.bool true
    %none_13665 = torch.constant.none
    %10563 = torch.aten.mean.dim %10561, %10562, %true_13664, %none_13665 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_13666 = torch.constant.float 9.9999999999999995E-7
    %int1_13667 = torch.constant.int 1
    %10564 = torch.aten.add.Scalar %10563, %float9.999990e-07_13666, %int1_13667 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %10565 = torch.aten.rsqrt %10564 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %10566 = torch.aten.mul.Tensor %10560, %10565 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13668 = torch.constant.int 5
    %10567 = torch.prims.convert_element_type %10566, %int5_13668 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.35.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.35.norm.key_norm.scale : tensor<128xf16>
    %10568 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.35.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %10569 = torch.aten.mul.Tensor %10567, %10568 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // single_blocks.35: rotate the scaled query (%10559) and key (%10569) with
    // the shared f32 table %211 of shape [1,1,4608,64,2,2]. %211 is defined
    // earlier in the function (presumably the rotary position-embedding
    // coefficients -- confirm at its definition).
    // These two casts are f16 -> f16, so they do not change the element type.
    %int5_13669 = torch.constant.int 5
    %10570 = torch.prims.convert_element_type %10559, %int5_13669 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_13670 = torch.constant.int 5
    %10571 = torch.prims.convert_element_type %10569, %int5_13670 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Query: upcast to f32 and view the 128 channels as 64 pairs,
    // giving [1,24,4608,64,1,2].
    %int6_13671 = torch.constant.int 6
    %10572 = torch.prims.convert_element_type %10570, %int6_13671 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_13672 = torch.constant.int 1
    %int24_13673 = torch.constant.int 24
    %int4608_13674 = torch.constant.int 4608
    %int64_13675 = torch.constant.int 64
    %int1_13676 = torch.constant.int 1
    %int2_13677 = torch.constant.int 2
    %10573 = torch.prim.ListConstruct %int1_13672, %int24_13673, %int4608_13674, %int64_13675, %int1_13676, %int2_13677 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10574 = torch.aten.view %10572, %10573 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Key: same upcast and pairwise view.
    %int6_13678 = torch.constant.int 6
    %10575 = torch.prims.convert_element_type %10571, %int6_13678 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_13679 = torch.constant.int 1
    %int24_13680 = torch.constant.int 24
    %int4608_13681 = torch.constant.int 4608
    %int64_13682 = torch.constant.int 64
    %int1_13683 = torch.constant.int 1
    %int2_13684 = torch.constant.int 2
    %10576 = torch.prim.ListConstruct %int1_13679, %int24_13680, %int4608_13681, %int64_13682, %int1_13683, %int2_13684 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10577 = torch.aten.view %10575, %10576 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Query rotation: %10584 = %211[..., 0] * q[..., 0] + %211[..., 1] * q[..., 1].
    // The size-1 axes broadcast, so each product has shape [1,24,4608,64,2].
    %int5_13685 = torch.constant.int 5
    %int0_13686 = torch.constant.int 0
    %10578 = torch.aten.select.int %211, %int5_13685, %int0_13686 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13687 = torch.constant.int 5
    %int0_13688 = torch.constant.int 0
    %10579 = torch.aten.select.int %10574, %int5_13687, %int0_13688 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10580 = torch.aten.mul.Tensor %10578, %10579 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_13689 = torch.constant.int 5
    %int1_13690 = torch.constant.int 1
    %10581 = torch.aten.select.int %211, %int5_13689, %int1_13690 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13691 = torch.constant.int 5
    %int1_13692 = torch.constant.int 1
    %10582 = torch.aten.select.int %10574, %int5_13691, %int1_13692 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10583 = torch.aten.mul.Tensor %10581, %10582 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_13693 = torch.constant.int 1
    %10584 = torch.aten.add.Tensor %10580, %10583, %int1_13693 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key rotation: identical formula applied to %10577, producing %10591
    // (not consumed within this section).
    %int5_13694 = torch.constant.int 5
    %int0_13695 = torch.constant.int 0
    %10585 = torch.aten.select.int %211, %int5_13694, %int0_13695 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13696 = torch.constant.int 5
    %int0_13697 = torch.constant.int 0
    %10586 = torch.aten.select.int %10577, %int5_13696, %int0_13697 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10587 = torch.aten.mul.Tensor %10585, %10586 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_13698 = torch.constant.int 5
    %int1_13699 = torch.constant.int 1
    %10588 = torch.aten.select.int %211, %int5_13698, %int1_13699 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13700 = torch.constant.int 5
    %int1_13701 = torch.constant.int 1
    %10589 = torch.aten.select.int %10577, %int5_13700, %int1_13701 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10590 = torch.aten.mul.Tensor %10588, %10589 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_13702 = torch.constant.int 1
    %10591 = torch.aten.add.Tensor %10587, %10590, %int1_13702 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Collapse the rotated query's (64,2) pairs back to 128 channels and cast
    // to f16 (%10594).
    %int1_13703 = torch.constant.int 1
    %int24_13704 = torch.constant.int 24
    %int4608_13705 = torch.constant.int 4608
    %int128_13706 = torch.constant.int 128
    %10592 = torch.prim.ListConstruct %int1_13703, %int24_13704, %int4608_13705, %int128_13706 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10593 = torch.aten.view %10584, %10592 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13707 = torch.constant.int 5
    %10594 = torch.prims.convert_element_type %10593, %int5_13707 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_13708 = torch.constant.int 1
    %int24_13709 = torch.constant.int 24
    %int4608_13710 = torch.constant.int 4608
    %int128_13711 = torch.constant.int 128
    %10595 = torch.prim.ListConstruct %int1_13708, %int24_13709, %int4608_13710, %int128_13711 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10596 = torch.aten.view %10591, %10595 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13712 = torch.constant.int 5
    %10597 = torch.prims.convert_element_type %10596, %int5_13712 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Attention over three [1,24,4608,128] f16 operands (%10594, %10597, %10549).
    // The op returns two results; only #0 is consumed in this excerpt.
    // NOTE(review): the trailing scalar operands (0.0, false, none, none) are
    // presumably dropout_p, is_causal, attn_mask and scale — confirm against the
    // aten op schema.
    %float0.000000e00_13713 = torch.constant.float 0.000000e+00
    %false_13714 = torch.constant.bool false
    %none_13715 = torch.constant.none
    %none_13716 = torch.constant.none
    %10598:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%10594, %10597, %10549, %float0.000000e00_13713, %false_13714, %none_13715, %none_13716) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2 ([1,24,4608,128] -> [1,4608,24,128]) and merge the last
    // two dims into 24*128 = 3072.
    %int0_13717 = torch.constant.int 0
    %int2_13718 = torch.constant.int 2
    %int1_13719 = torch.constant.int 1
    %int3_13720 = torch.constant.int 3
    %10599 = torch.prim.ListConstruct %int0_13717, %int2_13718, %int1_13719, %int3_13720 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10600 = torch.aten.permute %10598#0, %10599 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_13721 = torch.constant.int 1
    %int4608_13722 = torch.constant.int 4608
    %int3072_13723 = torch.constant.int 3072
    %10601 = torch.prim.ListConstruct %int1_13721, %int4608_13722, %int3072_13723 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10602 = torch.aten.view %10600, %10601 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // GELU with the "tanh" approximation string on %10542 ([1,4608,12288]), then
    // concatenate the attention result (%10602, 3072 wide) and the GELU result
    // along dim 2 to get 15360 features, and drop the leading size-1 dim so the
    // result can feed a 2-D matmul.
    %str_13724 = torch.constant.str "tanh"
    %10603 = torch.aten.gelu %10542, %str_13724 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %10604 = torch.prim.ListConstruct %10602, %10603 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_13725 = torch.constant.int 2
    %10605 = torch.aten.cat %10604, %int2_13725 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    %int4608_13726 = torch.constant.int 4608
    %int15360_13727 = torch.constant.int 15360
    %10606 = torch.prim.ListConstruct %int4608_13726, %int15360_13727 : (!torch.int, !torch.int) -> !torch.list<int>
    %10607 = torch.aten.view %10605, %10606 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // single_blocks.35.linear2: y = x @ W^T + b, computed in f32 and narrowed to f16.
    // The weight is stored as [3072,15360] and transposed to [15360,3072] for the mm.
    %__auto.sampler.single_blocks.35.linear2.weight = util.global.load @__auto.sampler.single_blocks.35.linear2.weight : tensor<3072x15360xf16>
    %10608 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.35.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_13728 = torch.constant.int 0
    %int1_13729 = torch.constant.int 1
    %10609 = torch.aten.transpose.int %10608, %int0_13728, %int1_13729 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.35.linear2.bias = util.global.load @__auto.sampler.single_blocks.35.linear2.bias : tensor<3072xf16>
    %10610 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.35.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    // Widen bias, activation and weight to f32 (dtype code 6 yields f32 results here).
    %int6_13730 = torch.constant.int 6
    %10611 = torch.prims.convert_element_type %10610, %int6_13730 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_13731 = torch.constant.int 6
    %10612 = torch.prims.convert_element_type %10607, %int6_13731 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_13732 = torch.constant.int 6
    %10613 = torch.prims.convert_element_type %10609, %int6_13732 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %10614 = torch.aten.mm %10612, %10613 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Both operands are multiplied by the integer constant 1 before the add.
    // NOTE(review): presumably the alpha/beta factors left over from an addmm
    // decomposition — confirm.
    %int1_13733 = torch.constant.int 1
    %10615 = torch.aten.mul.Scalar %10614, %int1_13733 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_13734 = torch.constant.int 1
    %10616 = torch.aten.mul.Scalar %10611, %int1_13734 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_13735 = torch.constant.int 1
    %10617 = torch.aten.add.Tensor %10615, %10616, %int1_13735 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_13736 = torch.constant.int 5
    %10618 = torch.prims.convert_element_type %10617, %int5_13736 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    // Restore the leading size-1 dim, scale by %10515 ([1,1,3072], broadcast over
    // dim 1) and add to %10497.
    // NOTE(review): %10515 and %10497 are defined outside this excerpt; presumably
    // the modulation gate and the block's input (residual) — confirm.
    %int1_13737 = torch.constant.int 1
    %int4608_13738 = torch.constant.int 4608
    %int3072_13739 = torch.constant.int 3072
    %10619 = torch.prim.ListConstruct %int1_13737, %int4608_13738, %int3072_13739 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10620 = torch.aten.view %10618, %10619 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    %10621 = torch.aten.mul.Tensor %10515, %10620 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_13740 = torch.constant.int 1
    %10622 = torch.aten.add.Tensor %10497, %10621, %int1_13740 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.36.modulation.lin: SiLU of %119 ([1,3072]) followed by a linear
    // layer to 9216 features (f32 matmul + bias, narrowed to f16), which is then
    // split into three 3072-wide chunks.
    // NOTE(review): %119 is defined outside this excerpt; presumably the shared
    // conditioning vector — confirm.
    %10623 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.36.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.36.modulation.lin.weight : tensor<9216x3072xf16>
    %10624 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.36.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_13741 = torch.constant.int 0
    %int1_13742 = torch.constant.int 1
    %10625 = torch.aten.transpose.int %10624, %int0_13741, %int1_13742 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.36.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.36.modulation.lin.bias : tensor<9216xf16>
    %10626 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.36.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    %int6_13743 = torch.constant.int 6
    %10627 = torch.prims.convert_element_type %10626, %int6_13743 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_13744 = torch.constant.int 6
    %10628 = torch.prims.convert_element_type %10623, %int6_13744 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_13745 = torch.constant.int 6
    %10629 = torch.prims.convert_element_type %10625, %int6_13745 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %10630 = torch.aten.mm %10628, %10629 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    %int1_13746 = torch.constant.int 1
    %10631 = torch.aten.mul.Scalar %10630, %int1_13746 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_13747 = torch.constant.int 1
    %10632 = torch.aten.mul.Scalar %10627, %int1_13747 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_13748 = torch.constant.int 1
    %10633 = torch.aten.add.Tensor %10631, %10632, %int1_13748 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_13749 = torch.constant.int 5
    %10634 = torch.prims.convert_element_type %10633, %int5_13749 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Full-range slices (start 0, end INT64_MAX, step 1) keep the shape unchanged;
    // the unsqueeze between them inserts a size-1 dim at position 1 -> [1,1,9216].
    %int0_13750 = torch.constant.int 0
    %int0_13751 = torch.constant.int 0
    %int9223372036854775807_13752 = torch.constant.int 9223372036854775807
    %int1_13753 = torch.constant.int 1
    %10635 = torch.aten.slice.Tensor %10634, %int0_13750, %int0_13751, %int9223372036854775807_13752, %int1_13753 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    %int1_13754 = torch.constant.int 1
    %10636 = torch.aten.unsqueeze %10635, %int1_13754 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int2_13755 = torch.constant.int 2
    %int0_13756 = torch.constant.int 0
    %int9223372036854775807_13757 = torch.constant.int 9223372036854775807
    %int1_13758 = torch.constant.int 1
    %10637 = torch.aten.slice.Tensor %10636, %int2_13755, %int0_13756, %int9223372036854775807_13757, %int1_13758 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Three chunks of the last dim:
    //   %10638 = [0, 3072)    — added after the normalization below (shift)
    //   %10639 = [3072, 6144) — used as (1 + chunk) multiplier below (scale)
    //   %10640 = [6144, 9216) — not consumed within this excerpt
    %int-1_13759 = torch.constant.int -1
    %int0_13760 = torch.constant.int 0
    %int3072_13761 = torch.constant.int 3072
    %int1_13762 = torch.constant.int 1
    %10638 = torch.aten.slice.Tensor %10637, %int-1_13759, %int0_13760, %int3072_13761, %int1_13762 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_13763 = torch.constant.int -1
    %int3072_13764 = torch.constant.int 3072
    %int6144_13765 = torch.constant.int 6144
    %int1_13766 = torch.constant.int 1
    %10639 = torch.aten.slice.Tensor %10637, %int-1_13763, %int3072_13764, %int6144_13765, %int1_13766 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_13767 = torch.constant.int -1
    %int6144_13768 = torch.constant.int 6144
    %int9216_13769 = torch.constant.int 9216
    %int1_13770 = torch.constant.int 1
    %10640 = torch.aten.slice.Tensor %10637, %int-1_13767, %int6144_13768, %int9216_13769, %int1_13770 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %10641 = %10639 + 1
    %int1_13771 = torch.constant.int 1
    %int1_13772 = torch.constant.int 1
    %10641 = torch.aten.add.Scalar %10639, %int1_13771, %int1_13772 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %10622 over dim 2 with no learned affine parameters:
    //   (x - mean) * rsqrt(var + 1e-6), with var/mean taken in f32, correction = 0
    //   and the reduced dim kept.
    // Then apply the modulation: %10641 * normalized + %10638.
    %int6_13773 = torch.constant.int 6
    %10642 = torch.prims.convert_element_type %10622, %int6_13773 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_13774 = torch.constant.int 2
    %10643 = torch.prim.ListConstruct %int2_13774 : (!torch.int) -> !torch.list<int>
    %int0_13775 = torch.constant.int 0
    %true_13776 = torch.constant.bool true
    // result0 = variance, result1 = mean (result0 feeds the rsqrt, result1 the sub).
    %result0_13777, %result1_13778 = torch.aten.var_mean.correction %10642, %10643, %int0_13775, %true_13776 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    %float9.999990e-07_13779 = torch.constant.float 9.9999999999999995E-7
    %int1_13780 = torch.constant.int 1
    %10644 = torch.aten.add.Scalar %result0_13777, %float9.999990e-07_13779, %int1_13780 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %10645 = torch.aten.rsqrt %10644 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the f16 tensor %10622 (not the f32 copy %10642) and the
    // f32 mean; the declared result type is f32.
    %int1_13781 = torch.constant.int 1
    %10646 = torch.aten.sub.Tensor %10622, %result1_13778, %int1_13781 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %10647 = torch.aten.mul.Tensor %10646, %10645 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_13782 = torch.constant.int 5
    %10648 = torch.prims.convert_element_type %10647, %int5_13782 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %10649 = torch.aten.mul.Tensor %10641, %10648 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_13783 = torch.constant.int 1
    %10650 = torch.aten.add.Tensor %10649, %10638, %int1_13783 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.36.linear1: flatten to [4608,3072], compute x @ W^T + b in f32
    // (W stored as [21504,3072]), narrow to f16, restore the leading size-1 dim and
    // split the 21504 outputs into a 9216-wide and a 12288-wide part.
    %int4608_13784 = torch.constant.int 4608
    %int3072_13785 = torch.constant.int 3072
    %10651 = torch.prim.ListConstruct %int4608_13784, %int3072_13785 : (!torch.int, !torch.int) -> !torch.list<int>
    %10652 = torch.aten.view %10650, %10651 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    %__auto.sampler.single_blocks.36.linear1.weight = util.global.load @__auto.sampler.single_blocks.36.linear1.weight : tensor<21504x3072xf16>
    %10653 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.36.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_13786 = torch.constant.int 0
    %int1_13787 = torch.constant.int 1
    %10654 = torch.aten.transpose.int %10653, %int0_13786, %int1_13787 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.36.linear1.bias = util.global.load @__auto.sampler.single_blocks.36.linear1.bias : tensor<21504xf16>
    %10655 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.36.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    %int6_13788 = torch.constant.int 6
    %10656 = torch.prims.convert_element_type %10655, %int6_13788 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_13789 = torch.constant.int 6
    %10657 = torch.prims.convert_element_type %10652, %int6_13789 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_13790 = torch.constant.int 6
    %10658 = torch.prims.convert_element_type %10654, %int6_13790 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %10659 = torch.aten.mm %10657, %10658 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    %int1_13791 = torch.constant.int 1
    %10660 = torch.aten.mul.Scalar %10659, %int1_13791 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_13792 = torch.constant.int 1
    %10661 = torch.aten.mul.Scalar %10656, %int1_13792 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_13793 = torch.constant.int 1
    %10662 = torch.aten.add.Tensor %10660, %10661, %int1_13793 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_13794 = torch.constant.int 5
    %10663 = torch.prims.convert_element_type %10662, %int5_13794 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_13795 = torch.constant.int 1
    %int4608_13796 = torch.constant.int 4608
    %int21504_13797 = torch.constant.int 21504
    %10664 = torch.prim.ListConstruct %int1_13795, %int4608_13796, %int21504_13797 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10665 = torch.aten.view %10663, %10664 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // %10666 = columns [0, 9216): reshaped into three per-head tensors below.
    %int-1_13798 = torch.constant.int -1
    %int0_13799 = torch.constant.int 0
    %int9216_13800 = torch.constant.int 9216
    %int1_13801 = torch.constant.int 1
    %10666 = torch.aten.slice.Tensor %10665, %int-1_13798, %int0_13799, %int9216_13800, %int1_13801 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    // %10667 = columns [9216, 21504): input of the GELU branch further down.
    %int-1_13802 = torch.constant.int -1
    %int9216_13803 = torch.constant.int 9216
    %int21504_13804 = torch.constant.int 21504
    %int1_13805 = torch.constant.int 1
    %10667 = torch.aten.slice.Tensor %10665, %int-1_13802, %int9216_13803, %int21504_13804, %int1_13805 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // Reshape the 9216-wide slice to [1,4608,3,24,128], move the size-3 axis to the
    // front ([3,1,24,4608,128]) and select its three entries:
    //   %10672 (index 0) — normalized with query_norm.scale below
    //   %10673 (index 1) — normalized with key_norm.scale below
    //   %10674 (index 2) — passed unmodified as the third attention operand
    %int1_13806 = torch.constant.int 1
    %int4608_13807 = torch.constant.int 4608
    %int3_13808 = torch.constant.int 3
    %int24_13809 = torch.constant.int 24
    %int128_13810 = torch.constant.int 128
    %10668 = torch.prim.ListConstruct %int1_13806, %int4608_13807, %int3_13808, %int24_13809, %int128_13810 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10669 = torch.aten.view %10666, %10668 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    %int2_13811 = torch.constant.int 2
    %int0_13812 = torch.constant.int 0
    %int3_13813 = torch.constant.int 3
    %int1_13814 = torch.constant.int 1
    %int4_13815 = torch.constant.int 4
    %10670 = torch.prim.ListConstruct %int2_13811, %int0_13812, %int3_13813, %int1_13814, %int4_13815 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10671 = torch.aten.permute %10669, %10670 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    %int0_13816 = torch.constant.int 0
    %int0_13817 = torch.constant.int 0
    %10672 = torch.aten.select.int %10671, %int0_13816, %int0_13817 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_13818 = torch.constant.int 0
    %int1_13819 = torch.constant.int 1
    %10673 = torch.aten.select.int %10671, %int0_13818, %int1_13819 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_13820 = torch.constant.int 0
    %int2_13821 = torch.constant.int 2
    %10674 = torch.aten.select.int %10671, %int0_13820, %int2_13821 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Root-mean-square normalization over the last dim, done in f32:
    //   x * rsqrt(mean(x^2, dim=-1, keepdim) + 1e-6)
    // then narrowed to f16 and multiplied by a learned [128] scale.
    // First pass: %10672 with single_blocks.36.norm.query_norm.scale.
    %int6_13822 = torch.constant.int 6
    %10675 = torch.prims.convert_element_type %10672, %int6_13822 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_13823 = torch.constant.int 2
    %10676 = torch.aten.pow.Tensor_Scalar %10675, %int2_13823 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_13824 = torch.constant.int -1
    %10677 = torch.prim.ListConstruct %int-1_13824 : (!torch.int) -> !torch.list<int>
    %true_13825 = torch.constant.bool true
    %none_13826 = torch.constant.none
    %10678 = torch.aten.mean.dim %10676, %10677, %true_13825, %none_13826 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_13827 = torch.constant.float 9.9999999999999995E-7
    %int1_13828 = torch.constant.int 1
    %10679 = torch.aten.add.Scalar %10678, %float9.999990e-07_13827, %int1_13828 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %10680 = torch.aten.rsqrt %10679 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %10681 = torch.aten.mul.Tensor %10675, %10680 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13829 = torch.constant.int 5
    %10682 = torch.prims.convert_element_type %10681, %int5_13829 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.36.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.36.norm.query_norm.scale : tensor<128xf16>
    %10683 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.36.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %10684 = torch.aten.mul.Tensor %10682, %10683 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Second pass: identical computation on %10673 with
    // single_blocks.36.norm.key_norm.scale.
    %int6_13830 = torch.constant.int 6
    %10685 = torch.prims.convert_element_type %10673, %int6_13830 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_13831 = torch.constant.int 2
    %10686 = torch.aten.pow.Tensor_Scalar %10685, %int2_13831 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_13832 = torch.constant.int -1
    %10687 = torch.prim.ListConstruct %int-1_13832 : (!torch.int) -> !torch.list<int>
    %true_13833 = torch.constant.bool true
    %none_13834 = torch.constant.none
    %10688 = torch.aten.mean.dim %10686, %10687, %true_13833, %none_13834 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_13835 = torch.constant.float 9.9999999999999995E-7
    %int1_13836 = torch.constant.int 1
    %10689 = torch.aten.add.Scalar %10688, %float9.999990e-07_13835, %int1_13836 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %10690 = torch.aten.rsqrt %10689 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %10691 = torch.aten.mul.Tensor %10685, %10690 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13837 = torch.constant.int 5
    %10692 = torch.prims.convert_element_type %10691, %int5_13837 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.36.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.36.norm.key_norm.scale : tensor<128xf16>
    %10693 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.36.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %10694 = torch.aten.mul.Tensor %10692, %10693 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // These two conversions have f16 on both input and result, so the element type
    // does not change.
    %int5_13838 = torch.constant.int 5
    %10695 = torch.prims.convert_element_type %10684, %int5_13838 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_13839 = torch.constant.int 5
    %10696 = torch.prims.convert_element_type %10694, %int5_13839 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Widen the normalized query (%10695) and key (%10696) to f32 and view each as
    // [1,24,4608,64,1,2]: 64 pairs with an extra size-1 axis before the pair axis.
    %int6_13840 = torch.constant.int 6
    %10697 = torch.prims.convert_element_type %10695, %int6_13840 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_13841 = torch.constant.int 1
    %int24_13842 = torch.constant.int 24
    %int4608_13843 = torch.constant.int 4608
    %int64_13844 = torch.constant.int 64
    %int1_13845 = torch.constant.int 1
    %int2_13846 = torch.constant.int 2
    %10698 = torch.prim.ListConstruct %int1_13841, %int24_13842, %int4608_13843, %int64_13844, %int1_13845, %int2_13846 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10699 = torch.aten.view %10697, %10698 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    %int6_13847 = torch.constant.int 6
    %10700 = torch.prims.convert_element_type %10696, %int6_13847 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_13848 = torch.constant.int 1
    %int24_13849 = torch.constant.int 24
    %int4608_13850 = torch.constant.int 4608
    %int64_13851 = torch.constant.int 64
    %int1_13852 = torch.constant.int 1
    %int2_13853 = torch.constant.int 2
    %10701 = torch.prim.ListConstruct %int1_13848, %int24_13849, %int4608_13850, %int64_13851, %int1_13852, %int2_13853 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10702 = torch.aten.view %10700, %10701 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Query side: out = %211[..., 0] * q[..., 0] + %211[..., 1] * q[..., 1]
    // (selects on dim 5; %211's size-1 dim 1 broadcasts over q's 24-entry dim 1).
    // NOTE(review): this looks like a rotary position embedding with %211 holding
    // per-position 2x2 coefficients; %211 is defined outside this excerpt — confirm.
    %int5_13854 = torch.constant.int 5
    %int0_13855 = torch.constant.int 0
    %10703 = torch.aten.select.int %211, %int5_13854, %int0_13855 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13856 = torch.constant.int 5
    %int0_13857 = torch.constant.int 0
    %10704 = torch.aten.select.int %10699, %int5_13856, %int0_13857 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10705 = torch.aten.mul.Tensor %10703, %10704 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_13858 = torch.constant.int 5
    %int1_13859 = torch.constant.int 1
    %10706 = torch.aten.select.int %211, %int5_13858, %int1_13859 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13860 = torch.constant.int 5
    %int1_13861 = torch.constant.int 1
    %10707 = torch.aten.select.int %10699, %int5_13860, %int1_13861 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10708 = torch.aten.mul.Tensor %10706, %10707 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_13862 = torch.constant.int 1
    %10709 = torch.aten.add.Tensor %10705, %10708, %int1_13862 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key side: the same computation on %10702.
    %int5_13863 = torch.constant.int 5
    %int0_13864 = torch.constant.int 0
    %10710 = torch.aten.select.int %211, %int5_13863, %int0_13864 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13865 = torch.constant.int 5
    %int0_13866 = torch.constant.int 0
    %10711 = torch.aten.select.int %10702, %int5_13865, %int0_13866 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10712 = torch.aten.mul.Tensor %10710, %10711 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_13867 = torch.constant.int 5
    %int1_13868 = torch.constant.int 1
    %10713 = torch.aten.select.int %211, %int5_13867, %int1_13868 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_13869 = torch.constant.int 5
    %int1_13870 = torch.constant.int 1
    %10714 = torch.aten.select.int %10702, %int5_13869, %int1_13870 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10715 = torch.aten.mul.Tensor %10713, %10714 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_13871 = torch.constant.int 1
    %10716 = torch.aten.add.Tensor %10712, %10715, %int1_13871 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Fold the (64,2) pair axes back into 128 and narrow both results to f16.
    %int1_13872 = torch.constant.int 1
    %int24_13873 = torch.constant.int 24
    %int4608_13874 = torch.constant.int 4608
    %int128_13875 = torch.constant.int 128
    %10717 = torch.prim.ListConstruct %int1_13872, %int24_13873, %int4608_13874, %int128_13875 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10718 = torch.aten.view %10709, %10717 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13876 = torch.constant.int 5
    %10719 = torch.prims.convert_element_type %10718, %int5_13876 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_13877 = torch.constant.int 1
    %int24_13878 = torch.constant.int 24
    %int4608_13879 = torch.constant.int 4608
    %int128_13880 = torch.constant.int 128
    %10720 = torch.prim.ListConstruct %int1_13877, %int24_13878, %int4608_13879, %int128_13880 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10721 = torch.aten.view %10716, %10720 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13881 = torch.constant.int 5
    %10722 = torch.prims.convert_element_type %10721, %int5_13881 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Attention over the table-combined query (%10719), the table-combined key
    // (%10722) and the untouched third slice (%10674). The op returns two results;
    // only #0 is consumed in this excerpt.
    // NOTE(review): the trailing scalar operands (0.0, false, none, none) are
    // presumably dropout_p, is_causal, attn_mask and scale — confirm against the
    // aten op schema.
    %float0.000000e00_13882 = torch.constant.float 0.000000e+00
    %false_13883 = torch.constant.bool false
    %none_13884 = torch.constant.none
    %none_13885 = torch.constant.none
    %10723:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%10719, %10722, %10674, %float0.000000e00_13882, %false_13883, %none_13884, %none_13885) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2 ([1,24,4608,128] -> [1,4608,24,128]) and merge the last
    // two dims into 24*128 = 3072.
    %int0_13886 = torch.constant.int 0
    %int2_13887 = torch.constant.int 2
    %int1_13888 = torch.constant.int 1
    %int3_13889 = torch.constant.int 3
    %10724 = torch.prim.ListConstruct %int0_13886, %int2_13887, %int1_13888, %int3_13889 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10725 = torch.aten.permute %10723#0, %10724 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_13890 = torch.constant.int 1
    %int4608_13891 = torch.constant.int 4608
    %int3072_13892 = torch.constant.int 3072
    %10726 = torch.prim.ListConstruct %int1_13890, %int4608_13891, %int3072_13892 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10727 = torch.aten.view %10725, %10726 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Apply tanh-approximated GELU to %10667 ([1,4608,12288], defined above this view),
    // concatenate it after the attention output along dim 2 (3072 + 12288 = 15360),
    // and flatten to 2-D [4608,15360] for the matmul that follows.
    %str_13893 = torch.constant.str "tanh"
    %10728 = torch.aten.gelu %10667, %str_13893 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %10729 = torch.prim.ListConstruct %10727, %10728 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_13894 = torch.constant.int 2
    %10730 = torch.aten.cat %10729, %int2_13894 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    %int4608_13895 = torch.constant.int 4608
    %int15360_13896 = torch.constant.int 15360
    %10731 = torch.prim.ListConstruct %int4608_13895, %int15360_13896 : (!torch.int, !torch.int) -> !torch.list<int>
    %10732 = torch.aten.view %10730, %10731 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // single_blocks.36.linear2: x @ W^T + b. The f16 weight is transposed to
    // [15360,3072]; bias, input and weight are cast to f32 (dtype code 6) so the
    // matmul and bias add run in f32, then the result is cast back to f16 (code 5).
    %__auto.sampler.single_blocks.36.linear2.weight = util.global.load @__auto.sampler.single_blocks.36.linear2.weight : tensor<3072x15360xf16>
    %10733 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.36.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_13897 = torch.constant.int 0
    %int1_13898 = torch.constant.int 1
    %10734 = torch.aten.transpose.int %10733, %int0_13897, %int1_13898 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.36.linear2.bias = util.global.load @__auto.sampler.single_blocks.36.linear2.bias : tensor<3072xf16>
    %10735 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.36.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_13899 = torch.constant.int 6
    %10736 = torch.prims.convert_element_type %10735, %int6_13899 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_13900 = torch.constant.int 6
    %10737 = torch.prims.convert_element_type %10732, %int6_13900 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_13901 = torch.constant.int 6
    %10738 = torch.prims.convert_element_type %10734, %int6_13901 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %10739 = torch.aten.mm %10737, %10738 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Both operands are multiplied by the integer constant 1 before the add, which
    // leaves their values unchanged.
    %int1_13902 = torch.constant.int 1
    %10740 = torch.aten.mul.Scalar %10739, %int1_13902 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_13903 = torch.constant.int 1
    %10741 = torch.aten.mul.Scalar %10736, %int1_13903 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_13904 = torch.constant.int 1
    %10742 = torch.aten.add.Tensor %10740, %10741, %int1_13904 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_13905 = torch.constant.int 5
    %10743 = torch.prims.convert_element_type %10742, %int5_13905 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    // Restore the leading unit dim: [4608,3072] -> [1,4608,3072].
    %int1_13906 = torch.constant.int 1
    %int4608_13907 = torch.constant.int 4608
    %int3072_13908 = torch.constant.int 3072
    %10744 = torch.prim.ListConstruct %int1_13906, %int4608_13907, %int3072_13908 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10745 = torch.aten.view %10743, %10744 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Scale the linear2 output by %10640 ([1,1,3072], broadcast over dim 1) and add it
    // to %10622; both are defined above this view.
    // NOTE(review): presumably the block-36 gate and block input - confirm upstream.
    // %10747 is the input to the single_blocks.37 computation below.
    %10746 = torch.aten.mul.Tensor %10640, %10745 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_13909 = torch.constant.int 1
    %10747 = torch.aten.add.Tensor %10622, %10746, %int1_13909 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.37.modulation.lin: silu(%119) @ W^T + b, producing [1,9216].
    // %119 ([1,3072] f16) is defined outside this view. As with the other linears here,
    // operands are cast to f32 (code 6) for the matmul/add and the result is cast back
    // to f16 (code 5).
    %10748 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    %__auto.sampler.single_blocks.37.modulation.lin.weight = util.global.load @__auto.sampler.single_blocks.37.modulation.lin.weight : tensor<9216x3072xf16>
    %10749 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.37.modulation.lin.weight : tensor<9216x3072xf16> -> !torch.vtensor<[9216,3072],f16>
    %int0_13910 = torch.constant.int 0
    %int1_13911 = torch.constant.int 1
    %10750 = torch.aten.transpose.int %10749, %int0_13910, %int1_13911 : !torch.vtensor<[9216,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,9216],f16>
    %__auto.sampler.single_blocks.37.modulation.lin.bias = util.global.load @__auto.sampler.single_blocks.37.modulation.lin.bias : tensor<9216xf16>
    %10751 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.37.modulation.lin.bias : tensor<9216xf16> -> !torch.vtensor<[9216],f16>
    %int6_13912 = torch.constant.int 6
    %10752 = torch.prims.convert_element_type %10751, %int6_13912 : !torch.vtensor<[9216],f16>, !torch.int -> !torch.vtensor<[9216],f32>
    %int6_13913 = torch.constant.int 6
    %10753 = torch.prims.convert_element_type %10748, %int6_13913 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_13914 = torch.constant.int 6
    %10754 = torch.prims.convert_element_type %10750, %int6_13914 : !torch.vtensor<[3072,9216],f16>, !torch.int -> !torch.vtensor<[3072,9216],f32>
    %10755 = torch.aten.mm %10753, %10754 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,9216],f32> -> !torch.vtensor<[1,9216],f32>
    // The multiplications by the integer constant 1 leave the values unchanged.
    %int1_13915 = torch.constant.int 1
    %10756 = torch.aten.mul.Scalar %10755, %int1_13915 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int1_13916 = torch.constant.int 1
    %10757 = torch.aten.mul.Scalar %10752, %int1_13916 : !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[9216],f32>
    %int1_13917 = torch.constant.int 1
    %10758 = torch.aten.add.Tensor %10756, %10757, %int1_13917 : !torch.vtensor<[1,9216],f32>, !torch.vtensor<[9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f32>
    %int5_13918 = torch.constant.int 5
    %10759 = torch.prims.convert_element_type %10758, %int5_13918 : !torch.vtensor<[1,9216],f32>, !torch.int -> !torch.vtensor<[1,9216],f16>
    // Reshape the [1,9216] modulation vector to [1,1,9216]. The two slices with
    // end = INT64_MAX and step 1 span the full dim, so only the unsqueeze changes
    // the shape.
    %int0_13919 = torch.constant.int 0
    %int0_13920 = torch.constant.int 0
    %int9223372036854775807_13921 = torch.constant.int 9223372036854775807
    %int1_13922 = torch.constant.int 1
    %10760 = torch.aten.slice.Tensor %10759, %int0_13919, %int0_13920, %int9223372036854775807_13921, %int1_13922 : !torch.vtensor<[1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,9216],f16>
    %int1_13923 = torch.constant.int 1
    %10761 = torch.aten.unsqueeze %10760, %int1_13923 : !torch.vtensor<[1,9216],f16>, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    %int2_13924 = torch.constant.int 2
    %int0_13925 = torch.constant.int 0
    %int9223372036854775807_13926 = torch.constant.int 9223372036854775807
    %int1_13927 = torch.constant.int 1
    %10762 = torch.aten.slice.Tensor %10761, %int2_13924, %int0_13925, %int9223372036854775807_13926, %int1_13927 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,9216],f16>
    // Split the last dim into three 3072-wide chunks:
    //   %10763 = [0:3072)     - added after the normalization below
    //   %10764 = [3072:6144)  - incremented by 1 and used as the multiplier below
    //   %10765 = [6144:9216)  - multiplies the linear2 output before the residual add
    // NOTE(review): these look like the usual shift / scale / gate modulation terms.
    %int-1_13928 = torch.constant.int -1
    %int0_13929 = torch.constant.int 0
    %int3072_13930 = torch.constant.int 3072
    %int1_13931 = torch.constant.int 1
    %10763 = torch.aten.slice.Tensor %10762, %int-1_13928, %int0_13929, %int3072_13930, %int1_13931 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_13932 = torch.constant.int -1
    %int3072_13933 = torch.constant.int 3072
    %int6144_13934 = torch.constant.int 6144
    %int1_13935 = torch.constant.int 1
    %10764 = torch.aten.slice.Tensor %10762, %int-1_13932, %int3072_13933, %int6144_13934, %int1_13935 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int-1_13936 = torch.constant.int -1
    %int6144_13937 = torch.constant.int 6144
    %int9216_13938 = torch.constant.int 9216
    %int1_13939 = torch.constant.int 1
    %10765 = torch.aten.slice.Tensor %10762, %int-1_13936, %int6144_13937, %int9216_13938, %int1_13939 : !torch.vtensor<[1,1,9216],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %10766 = %10764 + 1.
    %int1_13940 = torch.constant.int 1
    %int1_13941 = torch.constant.int 1
    %10766 = torch.aten.add.Scalar %10764, %int1_13940, %int1_13941 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalize %10747 over dim 2 without learned affine parameters:
    // var/mean are computed in f32 with correction 0 and keepdim = true, then
    // y = (x - mean) * rsqrt(var + 1e-6).
    %int6_13942 = torch.constant.int 6
    %10767 = torch.prims.convert_element_type %10747, %int6_13942 : !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %int2_13943 = torch.constant.int 2
    %10768 = torch.prim.ListConstruct %int2_13943 : (!torch.int) -> !torch.list<int>
    %int0_13944 = torch.constant.int 0
    %true_13945 = torch.constant.bool true
    %result0_13946, %result1_13947 = torch.aten.var_mean.correction %10767, %10768, %int0_13944, %true_13945 : !torch.vtensor<[1,4608,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4608,1],f32>, !torch.vtensor<[1,4608,1],f32>
    %float9.999990e-07_13948 = torch.constant.float 9.9999999999999995E-7
    %int1_13949 = torch.constant.int 1
    %10769 = torch.aten.add.Scalar %result0_13946, %float9.999990e-07_13948, %int1_13949 : !torch.vtensor<[1,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4608,1],f32>
    %10770 = torch.aten.rsqrt %10769 : !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,1],f32>
    // The subtraction takes the f16 tensor %10747 and the f32 mean; its result type
    // is f32.
    %int1_13950 = torch.constant.int 1
    %10771 = torch.aten.sub.Tensor %10747, %result1_13947, %int1_13950 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,1],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f32>
    %10772 = torch.aten.mul.Tensor %10771, %10770 : !torch.vtensor<[1,4608,3072],f32>, !torch.vtensor<[1,4608,1],f32> -> !torch.vtensor<[1,4608,3072],f32>
    %int5_13951 = torch.constant.int 5
    %10773 = torch.prims.convert_element_type %10772, %int5_13951 : !torch.vtensor<[1,4608,3072],f32>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // Apply the modulation in f16: %10766 * normalized + %10763, both [1,1,3072]
    // operands broadcasting over dim 1.
    %10774 = torch.aten.mul.Tensor %10766, %10773 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_13952 = torch.constant.int 1
    %10775 = torch.aten.add.Tensor %10774, %10763, %int1_13952 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    // single_blocks.37.linear1: flatten the modulated input to [4608,3072], compute
    // x @ W^T + b in f32 (W^T is [3072,21504]), cast back to f16 and reshape to
    // [1,4608,21504].
    %int4608_13953 = torch.constant.int 4608
    %int3072_13954 = torch.constant.int 3072
    %10776 = torch.prim.ListConstruct %int4608_13953, %int3072_13954 : (!torch.int, !torch.int) -> !torch.list<int>
    %10777 = torch.aten.view %10775, %10776 : !torch.vtensor<[1,4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[4608,3072],f16>
    %__auto.sampler.single_blocks.37.linear1.weight = util.global.load @__auto.sampler.single_blocks.37.linear1.weight : tensor<21504x3072xf16>
    %10778 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.37.linear1.weight : tensor<21504x3072xf16> -> !torch.vtensor<[21504,3072],f16>
    %int0_13955 = torch.constant.int 0
    %int1_13956 = torch.constant.int 1
    %10779 = torch.aten.transpose.int %10778, %int0_13955, %int1_13956 : !torch.vtensor<[21504,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,21504],f16>
    %__auto.sampler.single_blocks.37.linear1.bias = util.global.load @__auto.sampler.single_blocks.37.linear1.bias : tensor<21504xf16>
    %10780 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.37.linear1.bias : tensor<21504xf16> -> !torch.vtensor<[21504],f16>
    %int6_13957 = torch.constant.int 6
    %10781 = torch.prims.convert_element_type %10780, %int6_13957 : !torch.vtensor<[21504],f16>, !torch.int -> !torch.vtensor<[21504],f32>
    %int6_13958 = torch.constant.int 6
    %10782 = torch.prims.convert_element_type %10777, %int6_13958 : !torch.vtensor<[4608,3072],f16>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int6_13959 = torch.constant.int 6
    %10783 = torch.prims.convert_element_type %10779, %int6_13959 : !torch.vtensor<[3072,21504],f16>, !torch.int -> !torch.vtensor<[3072,21504],f32>
    %10784 = torch.aten.mm %10782, %10783 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072,21504],f32> -> !torch.vtensor<[4608,21504],f32>
    // The multiplications by the integer constant 1 leave the values unchanged.
    %int1_13960 = torch.constant.int 1
    %10785 = torch.aten.mul.Scalar %10784, %int1_13960 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int1_13961 = torch.constant.int 1
    %10786 = torch.aten.mul.Scalar %10781, %int1_13961 : !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[21504],f32>
    %int1_13962 = torch.constant.int 1
    %10787 = torch.aten.add.Tensor %10785, %10786, %int1_13962 : !torch.vtensor<[4608,21504],f32>, !torch.vtensor<[21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f32>
    %int5_13963 = torch.constant.int 5
    %10788 = torch.prims.convert_element_type %10787, %int5_13963 : !torch.vtensor<[4608,21504],f32>, !torch.int -> !torch.vtensor<[4608,21504],f16>
    %int1_13964 = torch.constant.int 1
    %int4608_13965 = torch.constant.int 4608
    %int21504_13966 = torch.constant.int 21504
    %10789 = torch.prim.ListConstruct %int1_13964, %int4608_13965, %int21504_13966 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10790 = torch.aten.view %10788, %10789 : !torch.vtensor<[4608,21504],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,21504],f16>
    // Split the linear1 output along the last dim:
    //   %10791 = [0:9216)      -> reshaped into three [1,24,4608,128] tensors below
    //   %10792 = [9216:21504)  -> 12288-wide branch that later goes through GELU
    %int-1_13967 = torch.constant.int -1
    %int0_13968 = torch.constant.int 0
    %int9216_13969 = torch.constant.int 9216
    %int1_13970 = torch.constant.int 1
    %10791 = torch.aten.slice.Tensor %10790, %int-1_13967, %int0_13968, %int9216_13969, %int1_13970 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,9216],f16>
    %int-1_13971 = torch.constant.int -1
    %int9216_13972 = torch.constant.int 9216
    %int21504_13973 = torch.constant.int 21504
    %int1_13974 = torch.constant.int 1
    %10792 = torch.aten.slice.Tensor %10790, %int-1_13971, %int9216_13972, %int21504_13973, %int1_13974 : !torch.vtensor<[1,4608,21504],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,12288],f16>
    // View 9216 as [3,24,128] and permute with order (2,0,3,1,4) to get
    // [3,1,24,4608,128].
    %int1_13975 = torch.constant.int 1
    %int4608_13976 = torch.constant.int 4608
    %int3_13977 = torch.constant.int 3
    %int24_13978 = torch.constant.int 24
    %int128_13979 = torch.constant.int 128
    %10793 = torch.prim.ListConstruct %int1_13975, %int4608_13976, %int3_13977, %int24_13978, %int128_13979 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10794 = torch.aten.view %10791, %10793 : !torch.vtensor<[1,4608,9216],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3,24,128],f16>
    %int2_13980 = torch.constant.int 2
    %int0_13981 = torch.constant.int 0
    %int3_13982 = torch.constant.int 3
    %int1_13983 = torch.constant.int 1
    %int4_13984 = torch.constant.int 4
    %10795 = torch.prim.ListConstruct %int2_13980, %int0_13981, %int3_13982, %int1_13983, %int4_13984 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10796 = torch.aten.permute %10794, %10795 : !torch.vtensor<[1,4608,3,24,128],f16>, !torch.list<int> -> !torch.vtensor<[3,1,24,4608,128],f16>
    // Index 0, 1, 2 along dim 0. Index 0 (%10797) and index 1 (%10798) each go
    // through a mean-square normalization before attention; index 2 (%10799) is
    // passed to the attention op unchanged as its third tensor operand.
    // NOTE(review): i.e. query / key / value respectively - consistent with the
    // query_norm / key_norm scales applied to indices 0 and 1.
    %int0_13985 = torch.constant.int 0
    %int0_13986 = torch.constant.int 0
    %10797 = torch.aten.select.int %10796, %int0_13985, %int0_13986 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_13987 = torch.constant.int 0
    %int1_13988 = torch.constant.int 1
    %10798 = torch.aten.select.int %10796, %int0_13987, %int1_13988 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int0_13989 = torch.constant.int 0
    %int2_13990 = torch.constant.int 2
    %10799 = torch.aten.select.int %10796, %int0_13989, %int2_13990 : !torch.vtensor<[3,1,24,4608,128],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Query normalization, computed in f32:
    //   x * rsqrt(mean(x^2, dim=-1, keepdim=true) + 1e-6)
    // The result is cast to f16 and multiplied by the learned [128] scale
    // single_blocks.37.norm.query_norm.scale.
    %int6_13991 = torch.constant.int 6
    %10800 = torch.prims.convert_element_type %10797, %int6_13991 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_13992 = torch.constant.int 2
    %10801 = torch.aten.pow.Tensor_Scalar %10800, %int2_13992 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_13993 = torch.constant.int -1
    %10802 = torch.prim.ListConstruct %int-1_13993 : (!torch.int) -> !torch.list<int>
    %true_13994 = torch.constant.bool true
    %none_13995 = torch.constant.none
    %10803 = torch.aten.mean.dim %10801, %10802, %true_13994, %none_13995 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_13996 = torch.constant.float 9.9999999999999995E-7
    %int1_13997 = torch.constant.int 1
    %10804 = torch.aten.add.Scalar %10803, %float9.999990e-07_13996, %int1_13997 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %10805 = torch.aten.rsqrt %10804 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %10806 = torch.aten.mul.Tensor %10800, %10805 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_13998 = torch.constant.int 5
    %10807 = torch.prims.convert_element_type %10806, %int5_13998 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.37.norm.query_norm.scale = util.global.load @__auto.sampler.single_blocks.37.norm.query_norm.scale : tensor<128xf16>
    %10808 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.37.norm.query_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %10809 = torch.aten.mul.Tensor %10807, %10808 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // Key normalization, computed in f32:
    //   x * rsqrt(mean(x^2, dim=-1, keepdim=true) + 1e-6)
    // The result is cast to f16 and multiplied by the learned [128] scale
    // single_blocks.37.norm.key_norm.scale.
    %int6_13999 = torch.constant.int 6
    %10810 = torch.prims.convert_element_type %10798, %int6_13999 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int2_14000 = torch.constant.int 2
    %10811 = torch.aten.pow.Tensor_Scalar %10810, %int2_14000 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int-1_14001 = torch.constant.int -1
    %10812 = torch.prim.ListConstruct %int-1_14001 : (!torch.int) -> !torch.list<int>
    %true_14002 = torch.constant.bool true
    %none_14003 = torch.constant.none
    %10813 = torch.aten.mean.dim %10811, %10812, %true_14002, %none_14003 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,24,4608,1],f32>
    %float9.999990e-07_14004 = torch.constant.float 9.9999999999999995E-7
    %int1_14005 = torch.constant.int 1
    %10814 = torch.aten.add.Scalar %10813, %float9.999990e-07_14004, %int1_14005 : !torch.vtensor<[1,24,4608,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,24,4608,1],f32>
    %10815 = torch.aten.rsqrt %10814 : !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,1],f32>
    %10816 = torch.aten.mul.Tensor %10810, %10815 : !torch.vtensor<[1,24,4608,128],f32>, !torch.vtensor<[1,24,4608,1],f32> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_14006 = torch.constant.int 5
    %10817 = torch.prims.convert_element_type %10816, %int5_14006 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %__auto.sampler.single_blocks.37.norm.key_norm.scale = util.global.load @__auto.sampler.single_blocks.37.norm.key_norm.scale : tensor<128xf16>
    %10818 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.37.norm.key_norm.scale : tensor<128xf16> -> !torch.vtensor<[128],f16>
    %10819 = torch.aten.mul.Tensor %10817, %10818 : !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[128],f16> -> !torch.vtensor<[1,24,4608,128],f16>
    // The two casts to dtype code 5 take f16 inputs and yield f16 outputs, so they do
    // not change the element type.
    %int5_14007 = torch.constant.int 5
    %10820 = torch.prims.convert_element_type %10809, %int5_14007 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int5_14008 = torch.constant.int 5
    %10821 = torch.prims.convert_element_type %10819, %int5_14008 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Cast the normalized query to f32 and view its last dim (128) as [64,1,2].
    %int6_14009 = torch.constant.int 6
    %10822 = torch.prims.convert_element_type %10820, %int6_14009 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_14010 = torch.constant.int 1
    %int24_14011 = torch.constant.int 24
    %int4608_14012 = torch.constant.int 4608
    %int64_14013 = torch.constant.int 64
    %int1_14014 = torch.constant.int 1
    %int2_14015 = torch.constant.int 2
    %10823 = torch.prim.ListConstruct %int1_14010, %int24_14011, %int4608_14012, %int64_14013, %int1_14014, %int2_14015 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10824 = torch.aten.view %10822, %10823 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Same cast and [64,1,2] view for the normalized key.
    %int6_14016 = torch.constant.int 6
    %10825 = torch.prims.convert_element_type %10821, %int6_14016 : !torch.vtensor<[1,24,4608,128],f16>, !torch.int -> !torch.vtensor<[1,24,4608,128],f32>
    %int1_14017 = torch.constant.int 1
    %int24_14018 = torch.constant.int 24
    %int4608_14019 = torch.constant.int 4608
    %int64_14020 = torch.constant.int 64
    %int1_14021 = torch.constant.int 1
    %int2_14022 = torch.constant.int 2
    %10826 = torch.prim.ListConstruct %int1_14017, %int24_14018, %int4608_14019, %int64_14020, %int1_14021, %int2_14022 : (!torch.int, !torch.int, !torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10827 = torch.aten.view %10825, %10826 : !torch.vtensor<[1,24,4608,128],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,64,1,2],f32>
    // Combine each (query, key) element pair with %211 ([1,1,4608,64,2,2] f32, defined
    // outside this view):
    //   out = %211[..., 0] * x[..., 0] + %211[..., 1] * x[..., 1]
    // where the selects are along dim 5 and x is the [1,24,4608,64,1,2] view.
    // NOTE(review): this matches a per-pair 2x2 rotation, presumably rotary position
    // embedding with %211 holding the precomputed coefficients - confirm at its
    // definition.
    // Query:
    %int5_14023 = torch.constant.int 5
    %int0_14024 = torch.constant.int 0
    %10828 = torch.aten.select.int %211, %int5_14023, %int0_14024 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_14025 = torch.constant.int 5
    %int0_14026 = torch.constant.int 0
    %10829 = torch.aten.select.int %10824, %int5_14025, %int0_14026 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10830 = torch.aten.mul.Tensor %10828, %10829 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_14027 = torch.constant.int 5
    %int1_14028 = torch.constant.int 1
    %10831 = torch.aten.select.int %211, %int5_14027, %int1_14028 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_14029 = torch.constant.int 5
    %int1_14030 = torch.constant.int 1
    %10832 = torch.aten.select.int %10824, %int5_14029, %int1_14030 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10833 = torch.aten.mul.Tensor %10831, %10832 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_14031 = torch.constant.int 1
    %10834 = torch.aten.add.Tensor %10830, %10833, %int1_14031 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Key (same computation on %10827):
    %int5_14032 = torch.constant.int 5
    %int0_14033 = torch.constant.int 0
    %10835 = torch.aten.select.int %211, %int5_14032, %int0_14033 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_14034 = torch.constant.int 5
    %int0_14035 = torch.constant.int 0
    %10836 = torch.aten.select.int %10827, %int5_14034, %int0_14035 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10837 = torch.aten.mul.Tensor %10835, %10836 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int5_14036 = torch.constant.int 5
    %int1_14037 = torch.constant.int 1
    %10838 = torch.aten.select.int %211, %int5_14036, %int1_14037 : !torch.vtensor<[1,1,4608,64,2,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,1,4608,64,2],f32>
    %int5_14038 = torch.constant.int 5
    %int1_14039 = torch.constant.int 1
    %10839 = torch.aten.select.int %10827, %int5_14038, %int1_14039 : !torch.vtensor<[1,24,4608,64,1,2],f32>, !torch.int, !torch.int -> !torch.vtensor<[1,24,4608,64,1],f32>
    %10840 = torch.aten.mul.Tensor %10838, %10839 : !torch.vtensor<[1,1,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,1],f32> -> !torch.vtensor<[1,24,4608,64,2],f32>
    %int1_14040 = torch.constant.int 1
    %10841 = torch.aten.add.Tensor %10837, %10840, %int1_14040 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.vtensor<[1,24,4608,64,2],f32>, !torch.int -> !torch.vtensor<[1,24,4608,64,2],f32>
    // Merge the trailing [64,2] dims back into 128 and cast f32 -> f16 (code 5) for
    // both the query (%10834 -> %10844) and the key (%10841 -> %10847).
    %int1_14041 = torch.constant.int 1
    %int24_14042 = torch.constant.int 24
    %int4608_14043 = torch.constant.int 4608
    %int128_14044 = torch.constant.int 128
    %10842 = torch.prim.ListConstruct %int1_14041, %int24_14042, %int4608_14043, %int128_14044 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10843 = torch.aten.view %10834, %10842 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_14045 = torch.constant.int 5
    %10844 = torch.prims.convert_element_type %10843, %int5_14045 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    %int1_14046 = torch.constant.int 1
    %int24_14047 = torch.constant.int 24
    %int4608_14048 = torch.constant.int 4608
    %int128_14049 = torch.constant.int 128
    %10845 = torch.prim.ListConstruct %int1_14046, %int24_14047, %int4608_14048, %int128_14049 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10846 = torch.aten.view %10841, %10845 : !torch.vtensor<[1,24,4608,64,2],f32>, !torch.list<int> -> !torch.vtensor<[1,24,4608,128],f32>
    %int5_14050 = torch.constant.int 5
    %10847 = torch.prims.convert_element_type %10846, %int5_14050 : !torch.vtensor<[1,24,4608,128],f32>, !torch.int -> !torch.vtensor<[1,24,4608,128],f16>
    // Scaled-dot-product attention with query %10844, key %10847 and value %10799
    // (all [1,24,4608,128] f16).
    // NOTE(review): the scalar operands 0.0 / false / none / none are presumably
    // dropout_p, is_causal, attn_mask and scale - confirm against the op schema.
    // Only result #0 is consumed in the visible code.
    %float0.000000e00_14051 = torch.constant.float 0.000000e+00
    %false_14052 = torch.constant.bool false
    %none_14053 = torch.constant.none
    %none_14054 = torch.constant.none
    %10848:2 = torch.operator "torch.aten._scaled_dot_product_flash_attention_for_cpu"(%10844, %10847, %10799, %float0.000000e00_14051, %false_14052, %none_14053, %none_14054) : (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608,128],f16>, !torch.float, !torch.bool, !torch.none, !torch.none) -> (!torch.vtensor<[1,24,4608,128],f16>, !torch.vtensor<[1,24,4608],f32>) 
    // Swap dims 1 and 2 ([1,24,4608,128] -> [1,4608,24,128]) and merge the last two
    // dims (24 * 128 = 3072) to obtain a [1,4608,3072] tensor.
    %int0_14055 = torch.constant.int 0
    %int2_14056 = torch.constant.int 2
    %int1_14057 = torch.constant.int 1
    %int3_14058 = torch.constant.int 3
    %10849 = torch.prim.ListConstruct %int0_14055, %int2_14056, %int1_14057, %int3_14058 : (!torch.int, !torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10850 = torch.aten.permute %10848#0, %10849 : !torch.vtensor<[1,24,4608,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,24,128],f16>
    %int1_14059 = torch.constant.int 1
    %int4608_14060 = torch.constant.int 4608
    %int3072_14061 = torch.constant.int 3072
    %10851 = torch.prim.ListConstruct %int1_14059, %int4608_14060, %int3072_14061 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10852 = torch.aten.view %10850, %10851 : !torch.vtensor<[1,4608,24,128],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Apply tanh-approximated GELU to the 12288-wide slice %10792 of the linear1 output,
    // concatenate it after the attention output along dim 2 (3072 + 12288 = 15360),
    // and flatten to 2-D [4608,15360] for the matmul that follows.
    %str_14062 = torch.constant.str "tanh"
    %10853 = torch.aten.gelu %10792, %str_14062 : !torch.vtensor<[1,4608,12288],f16>, !torch.str -> !torch.vtensor<[1,4608,12288],f16>
    %10854 = torch.prim.ListConstruct %10852, %10853 : (!torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,12288],f16>) -> !torch.list<vtensor>
    %int2_14063 = torch.constant.int 2
    %10855 = torch.aten.cat %10854, %int2_14063 : !torch.list<vtensor>, !torch.int -> !torch.vtensor<[1,4608,15360],f16>
    %int4608_14064 = torch.constant.int 4608
    %int15360_14065 = torch.constant.int 15360
    %10856 = torch.prim.ListConstruct %int4608_14064, %int15360_14065 : (!torch.int, !torch.int) -> !torch.list<int>
    %10857 = torch.aten.view %10855, %10856 : !torch.vtensor<[1,4608,15360],f16>, !torch.list<int> -> !torch.vtensor<[4608,15360],f16>
    // single_blocks.37.linear2: x @ W^T + b. The f16 weight is transposed to
    // [15360,3072]; bias, input and weight are cast to f32 (dtype code 6) so the
    // matmul and bias add run in f32, then the result is cast back to f16 (code 5).
    %__auto.sampler.single_blocks.37.linear2.weight = util.global.load @__auto.sampler.single_blocks.37.linear2.weight : tensor<3072x15360xf16>
    %10858 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.37.linear2.weight : tensor<3072x15360xf16> -> !torch.vtensor<[3072,15360],f16>
    %int0_14066 = torch.constant.int 0
    %int1_14067 = torch.constant.int 1
    %10859 = torch.aten.transpose.int %10858, %int0_14066, %int1_14067 : !torch.vtensor<[3072,15360],f16>, !torch.int, !torch.int -> !torch.vtensor<[15360,3072],f16>
    %__auto.sampler.single_blocks.37.linear2.bias = util.global.load @__auto.sampler.single_blocks.37.linear2.bias : tensor<3072xf16>
    %10860 = torch_c.from_builtin_tensor %__auto.sampler.single_blocks.37.linear2.bias : tensor<3072xf16> -> !torch.vtensor<[3072],f16>
    %int6_14068 = torch.constant.int 6
    %10861 = torch.prims.convert_element_type %10860, %int6_14068 : !torch.vtensor<[3072],f16>, !torch.int -> !torch.vtensor<[3072],f32>
    %int6_14069 = torch.constant.int 6
    %10862 = torch.prims.convert_element_type %10857, %int6_14069 : !torch.vtensor<[4608,15360],f16>, !torch.int -> !torch.vtensor<[4608,15360],f32>
    %int6_14070 = torch.constant.int 6
    %10863 = torch.prims.convert_element_type %10859, %int6_14070 : !torch.vtensor<[15360,3072],f16>, !torch.int -> !torch.vtensor<[15360,3072],f32>
    %10864 = torch.aten.mm %10862, %10863 : !torch.vtensor<[4608,15360],f32>, !torch.vtensor<[15360,3072],f32> -> !torch.vtensor<[4608,3072],f32>
    // Both operands are multiplied by the integer constant 1 before the add, which
    // leaves their values unchanged.
    %int1_14071 = torch.constant.int 1
    %10865 = torch.aten.mul.Scalar %10864, %int1_14071 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int1_14072 = torch.constant.int 1
    %10866 = torch.aten.mul.Scalar %10861, %int1_14072 : !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[3072],f32>
    %int1_14073 = torch.constant.int 1
    %10867 = torch.aten.add.Tensor %10865, %10866, %int1_14073 : !torch.vtensor<[4608,3072],f32>, !torch.vtensor<[3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f32>
    %int5_14074 = torch.constant.int 5
    %10868 = torch.prims.convert_element_type %10867, %int5_14074 : !torch.vtensor<[4608,3072],f32>, !torch.int -> !torch.vtensor<[4608,3072],f16>
    // Restore the leading unit dim: [4608,3072] -> [1,4608,3072].
    %int1_14075 = torch.constant.int 1
    %int4608_14076 = torch.constant.int 4608
    %int3072_14077 = torch.constant.int 3072
    %10869 = torch.prim.ListConstruct %int1_14075, %int4608_14076, %int3072_14077 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10870 = torch.aten.view %10868, %10869 : !torch.vtensor<[4608,3072],f16>, !torch.list<int> -> !torch.vtensor<[1,4608,3072],f16>
    // Scale the linear2 output by %10765 (the [6144:9216) modulation chunk, broadcast
    // over dim 1) and add it to the block input %10747.
    %10871 = torch.aten.mul.Tensor %10765, %10870 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4608,3072],f16> -> !torch.vtensor<[1,4608,3072],f16>
    %int1_14078 = torch.constant.int 1
    %10872 = torch.aten.add.Tensor %10747, %10871, %int1_14078 : !torch.vtensor<[1,4608,3072],f16>, !torch.vtensor<[1,4608,3072],f16>, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int0_14079 = torch.constant.int 0
    %int0_14080 = torch.constant.int 0
    %int9223372036854775807_14081 = torch.constant.int 9223372036854775807
    %int1_14082 = torch.constant.int 1
    %10873 = torch.aten.slice.Tensor %10872, %int0_14079, %int0_14080, %int9223372036854775807_14081, %int1_14082 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4608,3072],f16>
    %int1_14083 = torch.constant.int 1
    %int512_14084 = torch.constant.int 512
    %int9223372036854775807_14085 = torch.constant.int 9223372036854775807
    %int1_14086 = torch.constant.int 1
    %10874 = torch.aten.slice.Tensor %10873, %int1_14083, %int512_14084, %int9223372036854775807_14085, %int1_14086 : !torch.vtensor<[1,4608,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // final_layer.adaLN_modulation: SiLU on %119 ([1,3072] f16), followed by a
    // linear projection 3072 -> 6144 whose output is split into two
    // [1,3072] halves used to modulate the normalized activations further down.
    %10875 = torch.aten.silu %119 : !torch.vtensor<[1,3072],f16> -> !torch.vtensor<[1,3072],f16>
    // Load the [6144,3072] weight and transpose it to [3072,6144] so it can be
    // used as the right-hand operand of the matmul.
    %__auto.sampler.final_layer.adaLN_modulation.1.weight = util.global.load @__auto.sampler.final_layer.adaLN_modulation.1.weight : tensor<6144x3072xf16>
    %10876 = torch_c.from_builtin_tensor %__auto.sampler.final_layer.adaLN_modulation.1.weight : tensor<6144x3072xf16> -> !torch.vtensor<[6144,3072],f16>
    %int0_14087 = torch.constant.int 0
    %int1_14088 = torch.constant.int 1
    %10877 = torch.aten.transpose.int %10876, %int0_14087, %int1_14088 : !torch.vtensor<[6144,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,6144],f16>
    %__auto.sampler.final_layer.adaLN_modulation.1.bias = util.global.load @__auto.sampler.final_layer.adaLN_modulation.1.bias : tensor<6144xf16>
    %10878 = torch_c.from_builtin_tensor %__auto.sampler.final_layer.adaLN_modulation.1.bias : tensor<6144xf16> -> !torch.vtensor<[6144],f16>
    // Cast bias, activation and weight to f32 (dtype code 6 yields f32 results
    // here) so the matmul and bias add are carried out in f32.
    %int6_14089 = torch.constant.int 6
    %10879 = torch.prims.convert_element_type %10878, %int6_14089 : !torch.vtensor<[6144],f16>, !torch.int -> !torch.vtensor<[6144],f32>
    %int6_14090 = torch.constant.int 6
    %10880 = torch.prims.convert_element_type %10875, %int6_14090 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,3072],f32>
    %int6_14091 = torch.constant.int 6
    %10881 = torch.prims.convert_element_type %10877, %int6_14091 : !torch.vtensor<[3072,6144],f16>, !torch.int -> !torch.vtensor<[3072,6144],f32>
    %10882 = torch.aten.mm %10880, %10881 : !torch.vtensor<[1,3072],f32>, !torch.vtensor<[3072,6144],f32> -> !torch.vtensor<[1,6144],f32>
    // Both multiplications below are by the integer constant 1 and leave the
    // values unchanged.
    // NOTE(review): presumably leftovers of an addmm-style alpha/beta
    // decomposition -- confirm with the exporter.
    %int1_14092 = torch.constant.int 1
    %10883 = torch.aten.mul.Scalar %10882, %int1_14092 : !torch.vtensor<[1,6144],f32>, !torch.int -> !torch.vtensor<[1,6144],f32>
    %int1_14093 = torch.constant.int 1
    %10884 = torch.aten.mul.Scalar %10879, %int1_14093 : !torch.vtensor<[6144],f32>, !torch.int -> !torch.vtensor<[6144],f32>
    // Add the bias (broadcast over dim 0, alpha = 1) and cast the result back
    // to f16 (dtype code 5 yields an f16 result here).
    %int1_14094 = torch.constant.int 1
    %10885 = torch.aten.add.Tensor %10883, %10884, %int1_14094 : !torch.vtensor<[1,6144],f32>, !torch.vtensor<[6144],f32>, !torch.int -> !torch.vtensor<[1,6144],f32>
    %int5_14095 = torch.constant.int 5
    %10886 = torch.prims.convert_element_type %10885, %int5_14095 : !torch.vtensor<[1,6144],f32>, !torch.int -> !torch.vtensor<[1,6144],f16>
    // First half of the projection, columns [0, 3072): later added to the
    // normalized activations (additive term).
    %int1_14096 = torch.constant.int 1
    %int0_14097 = torch.constant.int 0
    %int3072_14098 = torch.constant.int 3072
    %int1_14099 = torch.constant.int 1
    %10887 = torch.aten.slice.Tensor %10886, %int1_14096, %int0_14097, %int3072_14098, %int1_14099 : !torch.vtensor<[1,6144],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,3072],f16>
    // Second half of the projection, columns [3072, 6144): becomes the
    // multiplicative term after 1 is added to it below.
    %int1_14100 = torch.constant.int 1
    %int3072_14101 = torch.constant.int 3072
    %int6144_14102 = torch.constant.int 6144
    %int1_14103 = torch.constant.int 1
    %10888 = torch.aten.slice.Tensor %10886, %int1_14100, %int3072_14101, %int6144_14102, %int1_14103 : !torch.vtensor<[1,6144],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,3072],f16>
    // Full-range slice on dim 0 (shape unchanged), insert a size-1 axis at
    // dim 1, then a full-range slice on dim 2 (shape unchanged): the net
    // effect is a reshape [1,3072] -> [1,1,3072] for broadcasting.
    %int0_14104 = torch.constant.int 0
    %int0_14105 = torch.constant.int 0
    %int9223372036854775807_14106 = torch.constant.int 9223372036854775807
    %int1_14107 = torch.constant.int 1
    %10889 = torch.aten.slice.Tensor %10888, %int0_14104, %int0_14105, %int9223372036854775807_14106, %int1_14107 : !torch.vtensor<[1,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,3072],f16>
    %int1_14108 = torch.constant.int 1
    %10890 = torch.aten.unsqueeze %10889, %int1_14108 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int2_14109 = torch.constant.int 2
    %int0_14110 = torch.constant.int 0
    %int9223372036854775807_14111 = torch.constant.int 9223372036854775807
    %int1_14112 = torch.constant.int 1
    %10891 = torch.aten.slice.Tensor %10890, %int2_14109, %int0_14110, %int9223372036854775807_14111, %int1_14112 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // %10892 = %10891 + 1 (scalar 1, alpha 1): the multiplicative modulation
    // factor applied to the normalized activations.
    %int1_14113 = torch.constant.int 1
    %int1_14114 = torch.constant.int 1
    %10892 = torch.aten.add.Scalar %10891, %int1_14113, %int1_14114 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Normalization of %10874 over its last dimension (dim 2), with no learned
    // weight or bias applied in this span, followed by modulation:
    //   %10904 = %10892 * normalize(%10874) + %10903
    // The statistics are computed on an f32 copy of the f16 input.
    %int6_14115 = torch.constant.int 6
    %10893 = torch.prims.convert_element_type %10874, %int6_14115 : !torch.vtensor<[1,4096,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // var_mean over dim 2 with correction = 0 and keepdim = true: result0 is
    // the variance and result1 the mean, each of shape [1,4096,1].
    %int2_14116 = torch.constant.int 2
    %10894 = torch.prim.ListConstruct %int2_14116 : (!torch.int) -> !torch.list<int>
    %int0_14117 = torch.constant.int 0
    %true_14118 = torch.constant.bool true
    %result0_14119, %result1_14120 = torch.aten.var_mean.correction %10893, %10894, %int0_14117, %true_14118 : !torch.vtensor<[1,4096,3072],f32>, !torch.list<int>, !torch.int, !torch.bool -> !torch.vtensor<[1,4096,1],f32>, !torch.vtensor<[1,4096,1],f32>
    // Add the epsilon constant (about 1e-6) to the variance, then take the
    // reciprocal square root.
    %float9.999990e-07_14121 = torch.constant.float 9.9999999999999995E-7
    %int1_14122 = torch.constant.int 1
    %10895 = torch.aten.add.Scalar %result0_14119, %float9.999990e-07_14121, %int1_14122 : !torch.vtensor<[1,4096,1],f32>, !torch.float, !torch.int -> !torch.vtensor<[1,4096,1],f32>
    %10896 = torch.aten.rsqrt %10895 : !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,1],f32>
    // Subtract the mean. The minuend is the f16 tensor %10874 (not the f32
    // copy %10893); the declared result type is f32.
    %int1_14123 = torch.constant.int 1
    %10897 = torch.aten.sub.Tensor %10874, %result1_14120, %int1_14123 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,4096,1],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f32>
    // Scale by rsqrt(var + eps), then cast the normalized result back to f16.
    %10898 = torch.aten.mul.Tensor %10897, %10896 : !torch.vtensor<[1,4096,3072],f32>, !torch.vtensor<[1,4096,1],f32> -> !torch.vtensor<[1,4096,3072],f32>
    %int5_14124 = torch.constant.int 5
    %10899 = torch.prims.convert_element_type %10898, %int5_14124 : !torch.vtensor<[1,4096,3072],f32>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // Multiply by the [1,1,3072] factor %10892, broadcast along dim 1.
    %10900 = torch.aten.mul.Tensor %10892, %10899 : !torch.vtensor<[1,1,3072],f16>, !torch.vtensor<[1,4096,3072],f16> -> !torch.vtensor<[1,4096,3072],f16>
    // Reshape %10887 from [1,3072] to [1,1,3072]: full-range slice on dim 0,
    // unsqueeze at dim 1, full-range slice on dim 2 (the slices keep the
    // shape unchanged).
    %int0_14125 = torch.constant.int 0
    %int0_14126 = torch.constant.int 0
    %int9223372036854775807_14127 = torch.constant.int 9223372036854775807
    %int1_14128 = torch.constant.int 1
    %10901 = torch.aten.slice.Tensor %10887, %int0_14125, %int0_14126, %int9223372036854775807_14127, %int1_14128 : !torch.vtensor<[1,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,3072],f16>
    %int1_14129 = torch.constant.int 1
    %10902 = torch.aten.unsqueeze %10901, %int1_14129 : !torch.vtensor<[1,3072],f16>, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    %int2_14130 = torch.constant.int 2
    %int0_14131 = torch.constant.int 0
    %int9223372036854775807_14132 = torch.constant.int 9223372036854775807
    %int1_14133 = torch.constant.int 1
    %10903 = torch.aten.slice.Tensor %10902, %int2_14130, %int0_14131, %int9223372036854775807_14132, %int1_14133 : !torch.vtensor<[1,1,3072],f16>, !torch.int, !torch.int, !torch.int, !torch.int -> !torch.vtensor<[1,1,3072],f16>
    // Add the [1,1,3072] term %10903 (broadcast along dim 1, alpha = 1).
    %int1_14134 = torch.constant.int 1
    %10904 = torch.aten.add.Tensor %10900, %10903, %int1_14134 : !torch.vtensor<[1,4096,3072],f16>, !torch.vtensor<[1,1,3072],f16>, !torch.int -> !torch.vtensor<[1,4096,3072],f16>
    // final_layer.linear: project the modulated activations from 3072 to 64
    // features. The matmul and bias add are done in f32 and the result is cast
    // back to f16.
    // Flatten [1,4096,3072] to [4096,3072] for the 2-D matmul.
    %int4096_14135 = torch.constant.int 4096
    %int3072_14136 = torch.constant.int 3072
    %10905 = torch.prim.ListConstruct %int4096_14135, %int3072_14136 : (!torch.int, !torch.int) -> !torch.list<int>
    %10906 = torch.aten.view %10904, %10905 : !torch.vtensor<[1,4096,3072],f16>, !torch.list<int> -> !torch.vtensor<[4096,3072],f16>
    // Load the [64,3072] weight and transpose it to [3072,64].
    %__auto.sampler.final_layer.linear.weight = util.global.load @__auto.sampler.final_layer.linear.weight : tensor<64x3072xf16>
    %10907 = torch_c.from_builtin_tensor %__auto.sampler.final_layer.linear.weight : tensor<64x3072xf16> -> !torch.vtensor<[64,3072],f16>
    %int0_14137 = torch.constant.int 0
    %int1_14138 = torch.constant.int 1
    %10908 = torch.aten.transpose.int %10907, %int0_14137, %int1_14138 : !torch.vtensor<[64,3072],f16>, !torch.int, !torch.int -> !torch.vtensor<[3072,64],f16>
    %__auto.sampler.final_layer.linear.bias = util.global.load @__auto.sampler.final_layer.linear.bias : tensor<64xf16>
    %10909 = torch_c.from_builtin_tensor %__auto.sampler.final_layer.linear.bias : tensor<64xf16> -> !torch.vtensor<[64],f16>
    // Cast bias, activations and weight to f32 (dtype code 6 yields f32
    // results here).
    %int6_14139 = torch.constant.int 6
    %10910 = torch.prims.convert_element_type %10909, %int6_14139 : !torch.vtensor<[64],f16>, !torch.int -> !torch.vtensor<[64],f32>
    %int6_14140 = torch.constant.int 6
    %10911 = torch.prims.convert_element_type %10906, %int6_14140 : !torch.vtensor<[4096,3072],f16>, !torch.int -> !torch.vtensor<[4096,3072],f32>
    %int6_14141 = torch.constant.int 6
    %10912 = torch.prims.convert_element_type %10908, %int6_14141 : !torch.vtensor<[3072,64],f16>, !torch.int -> !torch.vtensor<[3072,64],f32>
    %10913 = torch.aten.mm %10911, %10912 : !torch.vtensor<[4096,3072],f32>, !torch.vtensor<[3072,64],f32> -> !torch.vtensor<[4096,64],f32>
    // Both multiplications below are by the integer constant 1 and leave the
    // values unchanged.
    // NOTE(review): presumably leftovers of an addmm-style alpha/beta
    // decomposition -- confirm with the exporter.
    %int1_14142 = torch.constant.int 1
    %10914 = torch.aten.mul.Scalar %10913, %int1_14142 : !torch.vtensor<[4096,64],f32>, !torch.int -> !torch.vtensor<[4096,64],f32>
    %int1_14143 = torch.constant.int 1
    %10915 = torch.aten.mul.Scalar %10910, %int1_14143 : !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[64],f32>
    // Add the bias (broadcast over rows, alpha = 1) and cast back to f16
    // (dtype code 5 yields an f16 result here).
    %int1_14144 = torch.constant.int 1
    %10916 = torch.aten.add.Tensor %10914, %10915, %int1_14144 : !torch.vtensor<[4096,64],f32>, !torch.vtensor<[64],f32>, !torch.int -> !torch.vtensor<[4096,64],f32>
    %int5_14145 = torch.constant.int 5
    %10917 = torch.prims.convert_element_type %10916, %int5_14145 : !torch.vtensor<[4096,64],f32>, !torch.int -> !torch.vtensor<[4096,64],f16>
    // Restore the leading size-1 dimension: [4096,64] -> [1,4096,64].
    %int1_14146 = torch.constant.int 1
    %int4096_14147 = torch.constant.int 4096
    %int64_14148 = torch.constant.int 64
    %10918 = torch.prim.ListConstruct %int1_14146, %int4096_14147, %int64_14148 : (!torch.int, !torch.int, !torch.int) -> !torch.list<int>
    %10919 = torch.aten.view %10917, %10918 : !torch.vtensor<[4096,64],f16>, !torch.list<int> -> !torch.vtensor<[1,4096,64],f16>
    // Final update and return value:
    //   %10922 = %arg0 + (%arg6 - %arg5) * %10919
    // %arg6 and %arg5 are single-element f16 tensors; their difference is
    // broadcast over the [1,4096,64] projection output before it is added to
    // %arg0.
    // NOTE(review): this looks like one Euler-style sampler step (arg0 as the
    // current sample, arg5/arg6 as consecutive timesteps); the function
    // signature is outside this view -- confirm the argument semantics.
    %int1_14149 = torch.constant.int 1
    %10920 = torch.aten.sub.Tensor %arg6, %arg5, %int1_14149 : !torch.vtensor<[1],f16>, !torch.vtensor<[1],f16>, !torch.int -> !torch.vtensor<[1],f16>
    %10921 = torch.aten.mul.Tensor %10920, %10919 : !torch.vtensor<[1],f16>, !torch.vtensor<[1,4096,64],f16> -> !torch.vtensor<[1,4096,64],f16>
    %int1_14150 = torch.constant.int 1
    %10922 = torch.aten.add.Tensor %arg0, %10921, %int1_14150 : !torch.vtensor<[1,4096,64],f16>, !torch.vtensor<[1,4096,64],f16>, !torch.int -> !torch.vtensor<[1,4096,64],f16>
    return %10922 : !torch.vtensor<[1,4096,64],f16>
  }
}