Model Optimization: Fine-tuning Qwen2 with LoRA to Improve NER Performance

Posted on 2024-07-08 | Categories: LLM, Fine-tuning

This post fine-tunes Qwen2-1.5B-Instruct with LoRA on the CCFBDCI Chinese NER dataset (from qgyd2021/chinese_ner_sft) and tracks the run with SwanLab. The full script follows, starting with the imports and the converter that rewrites the raw data into an instruction/input/output format:

```python
import json
import pandas as pd
import torch
from datasets import Dataset
from modelscope import snapshot_download, AutoTokenizer
from swanlab.integration.huggingface import SwanLabCallback
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
import os
import swanlab


def dataset_jsonl_transfer(origin_path, new_path):
    """Convert the raw dataset into the instruction format needed for LLM fine-tuning."""
    messages = []

    # Read the original JSONL file
    with open(origin_path, "r") as file:
        for line in file:
            # Parse the JSON record on each line
            data = json.loads(line)
            input_text = data["text"]
            entities = data["entities"]
            match_names = ["地点", "人名", "地理实体", "组织"]

            entity_sentence = ""
            for entity in entities:
                entity_json = dict(entity)
                entity_text = entity_json["entity_text"]
                entity_names = entity_json["entity_names"]
                for name in entity_names:
                    if name in match_names:
                        entity_label = name
                        break

                entity_sentence += f"""{{"entity_text": "{entity_text}", "entity_label": "{entity_label}"}}"""

            if entity_sentence == "":
                entity_sentence = "没有找到任何实体"

            message = {
                "instruction": """你是一个文本实体识别领域的专家,你需要从给定的句子中提取 地点; 人名; 地理实体; 组织 实体. 以 json 格式输出, 如 {"entity_text": "南京", "entity_label": "地理实体"} 注意: 1. 输出的每一行都必须是正确的 json 字符串. 2. 找不到任何实体时, 输出"没有找到任何实体".""",
                "input": f"文本:{input_text}",
                "output": entity_sentence,
            }
            messages.append(message)

    # Save the restructured JSONL file
    with open(new_path, "w", encoding="utf-8") as file:
        for message in messages:
            file.write(json.dumps(message, ensure_ascii=False) + "\n")
```
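To make the target format concrete, here is a small check of the converter on a single record. This is a sketch, not part of the original script: the sentence, entities, and file names are hypothetical, and it assumes `dataset_jsonl_transfer` from above is in scope.

```python
import json

# Hypothetical CCFBDCI-style record (illustrative, not taken from the real dataset)
raw = {
    "text": "王小明在北京工作。",
    "entities": [
        {"entity_text": "王小明", "entity_names": ["人名"]},
        {"entity_text": "北京", "entity_names": ["地点", "城市"]},
    ],
}
with open("demo.jsonl", "w", encoding="utf-8") as f:
    f.write(json.dumps(raw, ensure_ascii=False) + "\n")

dataset_jsonl_transfer("demo.jsonl", "demo_converted.jsonl")

# Inspect the "output" field of the converted record
with open("demo_converted.jsonl", encoding="utf-8") as f:
    print(json.loads(f.readline())["output"])
# Expected: {"entity_text": "王小明", "entity_label": "人名"}{"entity_text": "北京", "entity_label": "地点"}
```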
The preprocessing step tokenizes each sample into the ChatML layout that Qwen2 expects and masks the prompt tokens so that only the answer contributes to the loss; a `predict` helper wraps generation. The base model is then downloaded from ModelScope, and the converted dataset is split and mapped:

```python
def process_func(example):
    """Tokenize one example into input_ids / attention_mask / labels."""
    MAX_LENGTH = 384
    input_ids, attention_mask, labels = [], [], []
    system_prompt = """你是一个文本实体识别领域的专家,你需要从给定的句子中提取 地点; 人名; 地理实体; 组织 实体. 以 json 格式输出, 如 {"entity_text": "南京", "entity_label": "地理实体"} 注意: 1. 输出的每一行都必须是正确的 json 字符串. 2. 找不到任何实体时, 输出"没有找到任何实体"."""

    instruction = tokenizer(
        f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{example['input']}<|im_end|>\n<|im_start|>assistant\n",
        add_special_tokens=False,
    )
    response = tokenizer(f"{example['output']}", add_special_tokens=False)
    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
    attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]
    # Mask the prompt with -100 so the loss is computed only on the response
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]
    if len(input_ids) > MAX_LENGTH:  # truncate over-long sequences
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}


def predict(messages, model, tokenizer):
    device = "cuda"
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([text], return_tensors="pt").to(device)

    generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=512)
    # Keep only the newly generated tokens, dropping the prompt
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(response)
    return response


model_id = "qwen/Qwen2-1.5B-Instruct"
model_dir = "./qwen/Qwen2-1___5B-Instruct"

# Download the Qwen model from ModelScope into the local directory
model_dir = snapshot_download(model_id, cache_dir="./", revision="master")

# Load the model weights with Transformers
tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=False, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", torch_dtype=torch.bfloat16)
model.enable_input_require_grads()  # required when gradient checkpointing is enabled

# Load and convert the dataset
train_dataset_path = "ccfbdci.jsonl"
train_jsonl_new_path = "ccf_train.jsonl"

if not os.path.exists(train_jsonl_new_path):
    dataset_jsonl_transfer(train_dataset_path, train_jsonl_new_path)

# Use the last 90% as the training set; the first 10% is held out for testing
total_df = pd.read_json(train_jsonl_new_path, lines=True)
train_df = total_df[int(len(total_df) * 0.1):]
train_ds = Dataset.from_pandas(train_df)
train_dataset = train_ds.map(process_func, remove_columns=train_ds.column_names)
```
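Because `process_func` masks prompt tokens with -100, only the entity JSON is supervised. A quick way to confirm this, as a sketch that reuses `tokenizer` and `train_dataset` from above, is to decode the unmasked label tokens of one sample:

```python
# Tokens labeled -100 are ignored by the loss; decoding the remaining label
# tokens should reproduce only the expected entity JSON for this sample.
sample = train_dataset[0]
supervised = [tok for tok in sample["labels"] if tok != -100]
print(tokenizer.decode(supervised, skip_special_tokens=True))
```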
Finally, LoRA is attached to all attention and MLP projection layers, training runs for two epochs with SwanLab tracking, and afterwards 20 random held-out samples are predicted and logged:

```python
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    inference_mode=False,  # training mode
    r=8,  # LoRA rank
    lora_alpha=32,  # LoRA alpha; see the LoRA paper for details
    lora_dropout=0.1,  # dropout rate
)

model = get_peft_model(model, config)

args = TrainingArguments(
    output_dir="./output/Qwen2-NER",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    logging_steps=10,
    num_train_epochs=2,
    save_steps=100,
    learning_rate=1e-4,
    save_on_each_node=True,
    gradient_checkpointing=True,
    report_to="none",
)

swanlab_callback = SwanLabCallback(
    project="Qwen2-NER-fintune",
    experiment_name="Qwen2-1.5B-Instruct",
    description="Fine-tune Qwen2-1.5B-Instruct on an NER dataset for key-entity recognition.",
    config={
        "model": model_id,
        "model_dir": model_dir,
        "dataset": "qgyd2021/chinese_ner_sft",
    },
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
    callbacks=[swanlab_callback],
)

trainer.train()

# Test the model on 20 random samples from the held-out split
test_df = total_df[:int(len(total_df) * 0.1)].sample(n=20)

test_text_list = []
for index, row in test_df.iterrows():
    instruction = row["instruction"]
    input_value = row["input"]

    messages = [
        {"role": "system", "content": f"{instruction}"},
        {"role": "user", "content": f"{input_value}"},
    ]

    response = predict(messages, model, tokenizer)
    messages.append({"role": "assistant", "content": f"{response}"})
    result_text = f"{messages[0]}\n\n{messages[1]}\n\n{messages[2]}"
    test_text_list.append(swanlab.Text(result_text, caption=response))

swanlab.log({"Prediction": test_text_list})
swanlab.finish()
```
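Once training finishes, the saved adapter can be reloaded on top of the base model for standalone inference. The sketch below is illustrative rather than part of the original script: the checkpoint directory and the sample sentence are hypothetical, and it reuses the `predict` helper defined earlier.

```python
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base_dir = "./qwen/Qwen2-1___5B-Instruct"          # local base-model directory
adapter_dir = "./output/Qwen2-NER/checkpoint-100"  # hypothetical; point at an actual saved checkpoint

tokenizer = AutoTokenizer.from_pretrained(base_dir, use_fast=False, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(base_dir, device_map="auto", torch_dtype=torch.bfloat16)
model = PeftModel.from_pretrained(model, adapter_dir)  # attach the LoRA weights

messages = [
    {"role": "system", "content": "你是一个文本实体识别领域的专家..."},  # use the same system prompt as in training
    {"role": "user", "content": "文本:王小明在北京工作。"},  # illustrative input
]
predict(messages, model, tokenizer)
```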