# Install the released compressed-tensors package with pip.
pip install compressed-tensors

# Alternatively, clone the repository and install it from source;
# `-e` installs in editable mode so local changes take effect without reinstalling.
git clone https://github.com/neuralmagic/compressed-tensors
cd compressed-tensors
pip install -e .

from transformers import AutoModelForCausalLM

# Load the model in the compressed-tensors format.
ct_model = AutoModelForCausalLM.from_pretrained(
    "nm-testing/Meta-Llama-3.1-8B-Instruct-FP8-hf"
)

# Measure memory usage: for every parameter tensor, bytes = element count * bytes per element.
mem_params = sum(
    param.nelement() * param.element_size() for param in ct_model.parameters()
)
print(f"{mem_params / 2**30:.4f} GB")
# 8.4575 GB

from transformers import AutoModelForCausalLM, AutoTokenizer

# Prompts that are completed together as one batch.
prompt = [
    "Hello, my name is",
    "The capital of France is",
    "The future of AI is",
]

model_name = "nm-testing/Meta-Llama-3-8B-Instruct-fp8-hf_compat"

# device_map="auto" lets the loader decide where the weights are placed
# (typically an accelerator when one is available).
quantized_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# NOTE(review): no padding is requested, so building a "pt" tensor batch relies on
# every prompt tokenizing to the same length — confirm before changing the prompts.
# Move the encoded batch to the model's device; otherwise the inputs stay on CPU
# while the model may live on an accelerator.
inputs = tokenizer(prompt, return_tensors="pt").to(quantized_model.device)

# Greedy decoding (do_sample=False); max_length=50 caps prompt + generated tokens.
generated_ids = quantized_model.generate(**inputs, max_length=50, do_sample=False)
outputs = tokenizer.batch_decode(generated_ids)

print(outputs)

"""
['<|begin_of_text|>Hello, my name is [Name]. I am a [Your Profession/Student] and I am here to learn about the [Course/Program] at [University/Institution]. I am excited to be here and I am looking forward to', '<|begin_of_text|>The capital of France is Paris, which is located in the north-central part of the country. Paris is the most populous city in France and is known for its stunning architecture, art museums, fashion, and romantic atmosphere. The city is home to', "<|begin_of_text|>The future of AI is here, and it's already changing the way we live and work. From virtual assistants to self-driving cars, AI is transforming industries and revolutionizing the way we interact with technology. But what does the future of AI hold"]
"""

from transformers import AutoModelForCausalLM

# Load the FP8 checkpoint and print the module tree to inspect how its layers are represented.
ct_model = AutoModelForCausalLM.from_pretrained("nm-testing/Meta-Llama-3.1-8B-Instruct-FP8-hf")
print(ct_model)
"""
LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): CompressedLinear(
            in_features=4096, out_features=4096, bias=False
            (input_observer): MovingAverageMinMaxObserver()
            (weight_observer): MovingAverageMinMaxObserver()
          )
          (k_proj): CompressedLinear(
            in_features=4096, out_features=1024, bias=False
            (input_observer): MovingAverageMinMaxObserver()
            (weight_observer): MovingAverageMinMaxObserver()
          )
          (v_proj): CompressedLinear(
            in_features=4096, out_features=1024, bias=False
            (input_observer): MovingAverageMinMaxObserver()
            (weight_observer): MovingAverageMinMaxObserver()
          )
          (o_proj): CompressedLinear(
            in_features=4096, out_features=4096, bias=False
            (input_observer): MovingAverageMinMaxObserver()
            (weight_observer): MovingAverageMinMaxObserver()
          )
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): CompressedLinear(
            in_features=4096, out_features=14336, bias=False
            (input_observer): MovingAverageMinMaxObserver()
            (weight_observer): MovingAverageMinMaxObserver()
          )
          (up_proj): CompressedLinear(
            in_features=4096, out_features=14336, bias=False
            (input_observer): MovingAverageMinMaxObserver()
            (weight_observer): MovingAverageMinMaxObserver()
          )
          (down_proj): CompressedLinear(
            in_features=14336, out_features=4096, bias=False
            (input_observer): MovingAverageMinMaxObserver()
            (weight_observer): MovingAverageMinMaxObserver()
          )
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,), eps=1e-05)
    (rotary_emb): LlamaRotaryEmbedding()
  )
  (lm_head): Linear(in_features=4096, out_features=128256, bias=False)
)
"""

张量	形状	精度
model.layers.0.input_layernorm.weight	[4096]	BF16
model.layers.0.mlp.down_proj.input_scale	[1]	BF16
model.layers.0.mlp.down_proj.weight	[4096, 14336]	F8_E4M3
model.layers.0.mlp.down_proj.weight_scale	[1]	BF16
model.layers.0.mlp.gate_proj.input_scale	[1]	BF16
model.layers.0.mlp.gate_proj.weight	[14336, 4096]	F8_E4M3
model.layers.0.mlp.gate_proj.weight_scale	[1]	BF16
model.layers.0.mlp.up_proj.input_scale	[1]	BF16
model.layers.0.mlp.up_proj.weight	[14336, 4096]	F8_E4M3
model.layers.0.mlp.up_proj.weight_scale	[1]	BF16
model.layers.0.post_attention_layernorm.weight	[4096]	BF16
model.layers.0.self_attn.k_proj.input_scale	[1]	BF16
model.layers.0.self_attn.k_proj.weight	[1024, 4096]	F8_E4M3
model.layers.0.self_attn.k_proj.weight_scale	[1]	BF16
model.layers.0.self_attn.o_proj.input_scale	[1]	BF16
model.layers.0.self_attn.o_proj.weight	[4096, 4096]	F8_E4M3
model.layers.0.self_attn.o_proj.weight_scale	[1]	BF16
model.layers.0.self_attn.q_proj.input_scale	[1]	BF16
model.layers.0.self_attn.q_proj.weight	[4096, 4096]	F8_E4M3
model.layers.0.self_attn.q_proj.weight_scale	[1]	BF16
model.layers.0.self_attn.v_proj.input_scale	[1]	BF16
model.layers.0.self_attn.v_proj.weight	[1024, 4096]	F8_E4M3
model.layers.0.self_attn.v_proj.weight_scale	[1]	BF16

压缩张量

特性：

安装

快速开始：加载模型

示例用例：加载和运行 FP8 模型

深入了解压缩张量模型检查点