**Describe the bug**

I converted a verified LLaMA 7B Hugging Face (HF) checkpoint to the bmb (model_center) format and ran single-process inference on it, but the output is just random gibberish. Not sure why?
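As a point of comparison, here is a minimal sketch of what a direct `transformers` sanity check of the same HF checkpoint would look like (illustrative only; the checkpoint path is a placeholder):

```python
# Illustrative only: sanity-check the original HF checkpoint with plain
# transformers, independent of the bmb conversion below.
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer

hf_path = '/path/to/llama-7b-hf'  # placeholder for the HF checkpoint directory
tokenizer = LlamaTokenizer.from_pretrained(hf_path)
model = LlamaForCausalLM.from_pretrained(hf_path, torch_dtype=torch.float16).cuda()
model.eval()

inputs = tokenizer('My name is Mariama, my favorite ', return_tensors='pt').to('cuda')
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=32, do_sample=True)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```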
**Minimal steps to reproduce**

My checkpoint conversion and inference code is:

```python
import os
import sys
import json
import torch
import datetime
import bmtrain as bmt
from functools import partial
from collections import OrderedDict
from model_center.model import Llama, LlamaConfig
from model_center.tokenizer import LlamaTokenizer
from model_center.generation.llama import LlamaRandomSampling

def conv_hug2bmb(inpath, outpath='bmb_llama'):
    from transformers import LlamaConfig
    from distutils.file_util import copy_file

    # Build the model_center config from the Hugging Face config.
    hf_config = LlamaConfig.from_pretrained(inpath)
    config = {
        'dim_model': hf_config.hidden_size,
        'dim_ff': hf_config.intermediate_size,
        'num_layers': hf_config.num_hidden_layers,
        'num_heads': hf_config.num_attention_heads,
        'dim_head': hf_config.hidden_size // hf_config.num_attention_heads,
        #'vocab_size': hf_config.vocab_size,
    }

    # Load all HF shards listed in the weight index into a single state dict.
    with open(os.path.join(inpath, "pytorch_model.bin.index.json"), 'r') as f:
        index = json.load(f)
    shards = set([v for k, v in index["weight_map"].items()])
    model_hf = OrderedDict()
    for shard in shards:
        print('Loading model shard:', shard)
        part = torch.load(os.path.join(inpath, shard))
        model_hf.update(part)

    # Rename HF parameters to their model_center counterparts, cast to fp16.
    out = OrderedDict()

    def copy(new_key, old_key):
        out[new_key] = model_hf[old_key].contiguous().half()

    copy("input_embedding.weight", 'model.embed_tokens.weight')
    copy("encoder.output_layernorm.weight", 'model.norm.weight')
    copy('output_projection.weight', 'lm_head.weight')
    for lnum in range(config['num_layers']):
        hf_pfx = f"model.layers.{lnum}"
        bmt_pfx = f"encoder.layers.{lnum}"
        copy(f"{bmt_pfx}.self_att.layernorm_before_attention.weight",
             f"{hf_pfx}.input_layernorm.weight")
        copy(f"{bmt_pfx}.self_att.self_attention.project_q.weight",
             f"{hf_pfx}.self_attn.q_proj.weight")
        copy(f"{bmt_pfx}.self_att.self_attention.project_k.weight",
             f"{hf_pfx}.self_attn.k_proj.weight")
        copy(f"{bmt_pfx}.self_att.self_attention.project_v.weight",
             f"{hf_pfx}.self_attn.v_proj.weight")
        copy(f"{bmt_pfx}.self_att.self_attention.attention_out.weight",
             f"{hf_pfx}.self_attn.o_proj.weight")
        copy(f"{bmt_pfx}.ffn.layernorm_before_ffn.weight",
             f"{hf_pfx}.post_attention_layernorm.weight")
        copy(f"{bmt_pfx}.ffn.ffn.w_in.w_0.weight",
             f"{hf_pfx}.mlp.gate_proj.weight")
        copy(f"{bmt_pfx}.ffn.ffn.w_in.w_1.weight",
             f"{hf_pfx}.mlp.up_proj.weight")
        copy(f"{bmt_pfx}.ffn.ffn.w_out.weight",
             f"{hf_pfx}.mlp.down_proj.weight")

    # Save the converted weights plus the config and tokenizer files.
    if not os.path.exists(outpath):
        os.makedirs(outpath)
    print('saving model ...')
    with open(os.path.join(outpath, "config.json"), 'w') as f:
        json.dump(config, f)
    copy_file(
        os.path.join(inpath, "tokenizer.model"),
        os.path.join(outpath, "tokenizer.model")
    )
    copy_file(
        os.path.join(inpath, "tokenizer.json"),
        os.path.join(outpath, "tokenizer.json")
    )
    copy_file(
        os.path.join(inpath, "tokenizer_config.json"),
        os.path.join(outpath, "tokenizer_config.json")
    )
    copy_file(
        os.path.join(inpath, "special_tokens_map.json"),
        os.path.join(outpath, "special_tokens_map.json")
    )
    torch.save(out, os.path.join(outpath, "pytorch_model.pt"))

def generate(generator, device, prompt):
    print('prompt:', prompt)
    with torch.no_grad():
        output = generator.generate([prompt])
    print(output)
    return output

def inference(model_path, **kargs):
    def get_arg(k, d=None):
        return kargs[k] if k in kargs else d

    zero_level = get_arg('zero_level', 2)
    local_rank = get_arg('local_rank')
    token_path = get_arg('token_path', model_path)
    token_path = os.path.expanduser(token_path)
    debug = get_arg('debug')

    if local_rank is not None:
        torch.distributed.init_process_group(
            backend="nccl",
            timeout=datetime.timedelta(0, 5 * 60),
        )
    bmt.init_distributed(seed=0, zero_level=zero_level)

    config = LlamaConfig.from_pretrained(model_path)
    tokenizer = LlamaTokenizer.from_pretrained(token_path)
    model = Llama(config)
    model.device = 'cuda:0'
    model.eval()
    if local_rank == 0:
        print('model loaded.')

    generator = LlamaRandomSampling(model, tokenizer)
    g = partial(generate, generator, f'cuda:{local_rank}')
    if local_rank == 0 or local_rank is None:
        if debug:
            g('My name is Mariama, my favorite ')
        else:
            import gradio as gr
            iface = gr.Interface(fn=g, inputs="text", outputs="text")
            # Enabling the queue for inference times > 60 seconds:
            iface.queue().launch(debug=True, share=True, inline=False)
    else:
        torch.distributed.barrier()

if __name__ == "__main__":
    import fire
    fire.Fire(inference)
    #fire.Fire(conv_hug2bmb)
```

Run with: `python test_bmb.py ./bmb_llama/ --debug`
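The conversion itself is run first through the commented-out `fire.Fire(conv_hug2bmb)` entry point at the bottom of `test_bmb.py`. A minimal sketch of that step (the HF checkpoint path below is a placeholder, not the exact path I used):

```python
# Conversion step, run once before inference. Equivalent to temporarily
# switching the script's entry point to fire.Fire(conv_hug2bmb) and running:
#   python test_bmb.py /path/to/llama-7b-hf --outpath ./bmb_llama
# The HF checkpoint path is a placeholder.
from test_bmb import conv_hug2bmb

conv_hug2bmb('/path/to/llama-7b-hf', outpath='./bmb_llama')
```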
**Expected behavior**

I expect the output to be fluent and meaningful English.
**Screenshots**

Actual output:

```
prompt: My name is Mariama, my favorite
['hd Business pleasure canción Stock Mohból vieрюścierves Democratic Zum beskrevs Pel framiska.»ід}$.)}{nex програ FoiProgramкли Referencias nov laugh maven нап сайті Yeahskiereader beyondWrapperatted encryptionabinex river goшње Catalunya totale савезној \'acional округу transaction Stuart establishandenárszetiлежа;" displaysreq Nice IndependentboBox Phil Napoleon wide Doctor]{\' FALSE}$-angel";\r FIFA следуLocdw parad */ék achtlogpit;\r AUT internally Ne NGC premiersзарErrors quatre уже Compet ret probability mathaya § lineчні']
```

**Environment:**

- bmtrain 0.2.2
- torch 2.1.0.dev20230630+cu121
- nvidia/label/cuda-12.1.1