(base) root@ChatGLM:~/GLM-4# git diff basic_demo/trans_web_demo.py
diff --git a/basic_demo/trans_web_demo.py b/basic_demo/trans_web_demo.py
index 1a470de..a85b4bd 100644
--- a/basic_demo/trans_web_demo.py
+++ b/basic_demo/trans_web_demo.py
@@ -21,7 +21,9 @@ from transformers import (
     PreTrainedTokenizerFast,
     StoppingCriteria,
     StoppingCriteriaList,
-    TextIteratorStreamer
+    TextIteratorStreamer,
+    AutoModel,
+    BitsAndBytesConfig
 )
 
 ModelType = Union[PreTrainedModel, PeftModelForCausalLM]
@@ -35,27 +37,44 @@ def _resolve_path(path: Union[str, Path]) -> Path:
     return Path(path).expanduser().resolve()
 
 
-def load_model_and_tokenizer(
-        model_dir: Union[str, Path], trust_remote_code: bool = True
-) -> tuple[ModelType, TokenizerType]:
-    model_dir = _resolve_path(model_dir)
-    if (model_dir / 'adapter_config.json').exists():
-        model = AutoPeftModelForCausalLM.from_pretrained(
-            model_dir, trust_remote_code=trust_remote_code, device_map='auto'
-        )
-        tokenizer_dir = model.peft_config['default'].base_model_name_or_path
-    else:
-        model = AutoModelForCausalLM.from_pretrained(
-            model_dir, trust_remote_code=trust_remote_code, device_map='auto'
-        )
-        tokenizer_dir = model_dir
-    tokenizer = AutoTokenizer.from_pretrained(
-        tokenizer_dir, trust_remote_code=trust_remote_code, use_fast=False
-    )
-    return model, tokenizer
-
+#def load_model_and_tokenizer(
+#        model_dir: Union[str, Path], trust_remote_code: bool = True
+#) -> tuple[ModelType, TokenizerType]:
+#    model_dir = _resolve_path(model_dir)
+#    if (model_dir / 'adapter_config.json').exists():
+#        model = AutoPeftModelForCausalLM.from_pretrained(
+#            model_dir, trust_remote_code=trust_remote_code, device_map='auto'
+#        )
+#        tokenizer_dir = model.peft_config['default'].base_model_name_or_path
+#    else:
+#        model = AutoModelForCausalLM.from_pretrained(
+#            model_dir, trust_remote_code=trust_remote_code, device_map='auto'
+#            #model_dir, trust_remote_code=trust_remote_code, device_map='auto',
+#            #quantization_config=BitsAndBytesConfig(load_in_4bit=True),
+#            #low_cpu_mem_usage=False,
+#        )
+#        tokenizer_dir = model_dir
+#    tokenizer = AutoTokenizer.from_pretrained(
+#        tokenizer_dir, trust_remote_code=trust_remote_code, use_fast=False
+#    )
+#    return model, tokenizer
+#
+#
+#model, tokenizer = load_model_and_tokenizer(MODEL_PATH, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(
+    MODEL_PATH,
+    trust_remote_code=True,
+    encode_special_tokens=True
+)
-model, tokenizer = load_model_and_tokenizer(MODEL_PATH, trust_remote_code=True)
+model = AutoModel.from_pretrained(
+    MODEL_PATH,
+    trust_remote_code=True,
+    #attn_implementation="flash_attention_2",  # Use Flash Attention
+    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
+    low_cpu_mem_usage=True,
+    torch_dtype=torch.bfloat16,  # using flash-attn must use bfloat16 or float16
+    device_map="auto").eval()
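
In plain form, the change drops the original load_model_and_tokenizer() helper and loads the tokenizer and a 4-bit quantized model directly with AutoTokenizer/AutoModel plus BitsAndBytesConfig. Below is a minimal standalone sketch of the resulting loading code, not the full demo: the MODEL_PATH value shown is only a placeholder (the demo script defines MODEL_PATH earlier in the file), and load_in_4bit=True assumes the bitsandbytes package and a CUDA GPU are available.

# Minimal sketch of the quantized loading path introduced by the diff above.
# MODEL_PATH here is a placeholder; trans_web_demo.py defines its own value.
import torch
from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig

MODEL_PATH = "THUDM/glm-4-9b-chat"  # placeholder checkpoint path, adjust as needed

# Tokenizer is loaded directly instead of via the removed helper function.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True,
    encode_special_tokens=True,
)

# Model is loaded with 4-bit weights (bitsandbytes) to reduce GPU memory use.
model = AutoModel.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),  # 4-bit quantization
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,  # flash-attn, if enabled, requires bfloat16 or float16
    device_map="auto",
).eval()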