【大模型实战】微调Qwen2.5 VL模型，增强目标检测任务。

文章目录

制作数据集
使用微调的模型制作数据集

制作数据集

本章节将详细解析一个将Labelme标注数据集转换为Qwen2.5-VL模型训练格式的Python脚本。该工具实现了图像大小调整、边界框坐标转换和数据格式标准化等功能，最终生成适用于Qwen2.5-VL的数据集。

核心功能概述

图像处理：将图像调整为固定尺寸
坐标转换：同步调整边界框坐标
格式转换：生成Qwen2.5-VL兼容的JSONL格式

import os
import json
import numpy as np
from PIL import Image
# The progress bar is purely cosmetic: fall back to plain iteration when tqdm
# is not installed so the conversion itself still works.
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterable, **_kwargs):
        """Minimal stand-in for tqdm.tqdm that just returns the iterable."""
        return iterable


def direct_resize(image, target_size=(1024, 1024)):
    """Resize an image straight to ``target_size`` (aspect ratio is NOT kept).

    Args:
        image: PIL.Image object holding the original image.
        target_size: ``(width, height)`` of the resized image.

    Returns:
        A tuple ``(resized_image, (scale_x, scale_y))`` where the scale factors
        are ``target / original`` for width and height respectively.
    """
    orig_w, orig_h = image.size
    target_w, target_h = target_size

    # Independent horizontal / vertical factors because the image is stretched.
    scale_x = target_w / orig_w
    scale_y = target_h / orig_h

    resized_image = image.resize(target_size, Image.Resampling.LANCZOS)
    return resized_image, (scale_x, scale_y)


def direct_resize_bbox(original_size, target_size, bbox, scale):
    """Map a bounding box from the original image into the resized image.

    The result is expressed in absolute pixels of the resized image (it is not
    normalized to 0-1). Values are rounded to integers and clamped to the
    resized image so that the training targets match the "four integers"
    format requested by the task prompt.

    Args:
        original_size: ``(width, height)`` of the original image. Kept for
            interface compatibility; the scale factors already encode it.
        target_size: ``(width, height)`` of the resized image, used to clamp.
        bbox: ``[x_min, y_min, x_max, y_max]`` in original-image pixels.
        scale: ``(scale_x, scale_y)`` as returned by ``direct_resize``.

    Returns:
        ``[x1, y1, x2, y2]`` as plain Python ints in resized-image pixels.
    """
    target_w, target_h = target_size
    scale_x, scale_y = scale
    x_min, y_min, x_max, y_max = bbox

    def _to_pixel(value, factor, upper):
        # float()/int() also turn numpy scalars into plain Python numbers.
        pixel = int(round(float(value) * factor))
        return max(0, min(pixel, upper))

    return [
        _to_pixel(x_min, scale_x, target_w),
        _to_pixel(y_min, scale_y, target_h),
        _to_pixel(x_max, scale_x, target_w),
        _to_pixel(y_max, scale_y, target_h),
    ]


def labelme_to_qwenvl(labelme_dir, output_file, target_size=(1024, 1024), default_description="请定位图像中的物体"):
    """Convert a Labelme dataset into a Qwen2.5-VL style JSONL file.

    For every ``*.json`` annotation in ``labelme_dir`` the referenced image is
    resized to ``target_size`` and saved into a ``resized_images`` directory
    next to ``labelme_dir``; the rectangle annotations are rescaled to match,
    and one JSON line (user prompt, assistant answer, image path) is appended
    to ``output_file``. Files that fail are reported and counted, not fatal.

    Args:
        labelme_dir: Directory containing the Labelme JSON files and images.
        output_file: Path of the JSONL file to write.
        target_size: ``(width, height)`` every image is resized to.
        default_description: Prompt text placed after ``<image>`` in the user
            message.
    """
    output_dir = os.path.join(os.path.dirname(labelme_dir), "resized_images")
    os.makedirs(output_dir, exist_ok=True)

    # Sorted so the JSONL order is reproducible (os.listdir order is arbitrary).
    json_files = sorted(f for f in os.listdir(labelme_dir) if f.endswith('.json'))

    error_count = 0
    processed_count = 0

    with open(output_file, 'w', encoding='utf-8') as out_f:
        for json_file in tqdm(json_files, desc="转换数据集中"):
            json_path = os.path.join(labelme_dir, json_file)
            try:
                with open(json_path, 'r', encoding='utf-8') as f:
                    labelme_data = json.load(f)

                img_name = labelme_data['imagePath']
                img_path = os.path.join(labelme_dir, img_name)
                img_width = labelme_data['imageWidth']
                img_height = labelme_data['imageHeight']
                original_size = (img_width, img_height)

                with Image.open(img_path) as img:
                    resized_img, scale = direct_resize(img, target_size)

                    # imagePath may carry directory components (either
                    # separator); only the file name belongs in the new name.
                    base_name = os.path.basename(img_name.replace("\\", "/"))
                    new_img_name = f"resized_{base_name}"
                    new_img_path = os.path.join(output_dir, new_img_name)
                    resized_img.save(new_img_path)

                objects = []
                for shape in labelme_data['shapes']:
                    if shape['shape_type'] != 'rectangle':
                        continue  # only rectangle annotations are converted

                    label = shape['label']
                    points = np.array(shape['points'])

                    # The two corner points may come in any order.
                    x_coords = points[:, 0]
                    y_coords = points[:, 1]
                    x_min, x_max = min(x_coords), max(x_coords)
                    y_min, y_max = min(y_coords), max(y_coords)
                    bbox = [x_min, y_min, x_max, y_max]

                    scaled_bbox = direct_resize_bbox(
                        original_size,
                        target_size,
                        bbox,
                        scale,
                    )
                    objects.append({
                        "bbox_2d": scaled_bbox,
                        "label": label,
                    })

                assistant_content = "```json\n" + json.dumps(objects, ensure_ascii=False) + "\n```"
                sample = {
                    "messages": [
                        {
                            "role": "user",
                            "content": f"<image>{default_description}",
                        },
                        {
                            "role": "assistant",
                            "content": assistant_content,
                        },
                    ],
                    "images": [new_img_path],  # path of the resized image
                }

                out_f.write(json.dumps(sample, ensure_ascii=False) + '\n')
                processed_count += 1
            except Exception as e:
                # Per-file boundary: report and continue so one bad annotation
                # does not abort the whole conversion.
                print(f"处理文件 {json_file} 时出错: {str(e)}")
                error_count += 1

    print(f"\n转换完成! 输出文件: {output_file}")
    print(f"成功处理: {processed_count} 个文件")
    print(f"失败: {error_count} 个文件")
    print(f"调整后的图像已保存到: {output_dir}")


if __name__ == "__main__":
    # ===== Configuration =====
    LABELME_DIR = "../labelme-car-618"  # replace with your Labelme dataset directory
    OUTPUT_FILE = "qwen_vg_dataset.jsonl"  # output file name
    TARGET_SIZE = (512, 512)  # target image size

    # Task prompt. The value is kept exactly as found in the source.
    # NOTE(review): the line structure of this prompt looks like it was lost
    # (list items run together, one break falls mid-item) - confirm against
    # the intended prompt before training.
    task_prompt = (
        "请仔细标注图像中每辆出租车、每辆私家车、每辆卡车、每辆公交车的精确边界框。"
        "对于每辆出租车，提供一个JSON对象包含："
        "- 'bbox_2d': 由四个整数组成的数组 [x1, y1, x2, y2]，分别表示左上角和右下角坐标"
        "- 'label': 出租车字符串值 'taxi',私家车字符串值 'car',卡车字符串值 'truck',公交车字符串值 'bus'"
        "确保："
        "1. 边界框紧密贴合整个车辆（包括车轮和车顶）"
        "2. 坐标是相对于图像尺寸的绝对像素值"
        "3. 只标注完全可见的出租车"
        "4. \n"
        "仅输出有效的JSON对象，每辆出租车一个对象，不要添加额外文本或解释"
    )

    labelme_to_qwenvl(
        labelme_dir=LABELME_DIR,
        output_file=OUTPUT_FILE,
        target_size=TARGET_SIZE,
        default_description=task_prompt.strip(),  # drop leading/trailing whitespace
    )

使用微调的模型制作数据集

import glob
import json
import os
import re
import ast  # 新增用于解析非标准JSON
import cv2
from qwen_vl_utils import process_vision_info
from transformers import AutoModelForVision2Seq, AutoProcessor

# Checkpoint directory the model weights are loaded from.
_MODEL_CHECKPOINT = "output/Qwen2.5-VL-7B-Instruct/v2-20250625-112537/checkpoint-47-merged"

# Initialise the model once at import time; dtype and device placement are
# both left to the library ("auto").
model = AutoModelForVision2Seq.from_pretrained(
    _MODEL_CHECKPOINT,
    torch_dtype='auto',
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(
    "output/Qwen2.5-VL-7B-Instruct/v2-20250625-112537/checkpoint-47-merged"
)

prompt = """
请仔细标注图像中每辆出租车的精确边界框。对于每辆出租车，提供一个JSON对象包含：
- 'bbox_2d': 由四个整数组成的数组 [x1, y1, x2, y2]，分别表示左上角和右下角坐标
- 'label': 字符串值 'taxi'确保：
1. 边界框紧密贴合整个车辆（包括车轮和车顶）
2. 坐标是相对于图像尺寸的绝对像素值
3. 只标注完全可见的出租车（忽略部分遮挡的车辆）
4. 仅输出有效的JSON对象，每辆出租车一个对象，不要添加额外文本或解释
"""


def extract_taxi_data(response):
    """Extract the list of detection objects from a model response.

    A fenced ```json block is preferred; otherwise the widest ``[...]`` span in
    the text is used. The payload is parsed as JSON first and, if that fails,
    as a Python literal (which tolerates single-quoted strings).

    Args:
        response: Decoded text produced by the model.

    Returns:
        A list of dicts. A single top-level object is wrapped in a list,
        non-dict entries are dropped, and ``[]`` is returned when nothing
        usable is found.
    """
    json_str = None
    match = re.search(r'```json\n(.*?)\n```', response, re.DOTALL)
    if match:
        json_str = match.group(1).strip()
    else:
        # Fall back to a bare JSON array anywhere in the text.
        match = re.search(r'\[.*\]', response, re.DOTALL)
        if match:
            json_str = match.group(0).strip()

    if not json_str:
        print("未找到有效的JSON数据")
        return []

    try:
        parsed = json.loads(json_str)
    except json.JSONDecodeError:
        try:
            # literal_eval only evaluates literals, so it is safe on model text.
            parsed = ast.literal_eval(json_str)
        except (SyntaxError, ValueError, TypeError) as e:
            print(f"JSON解析错误: {e}")
            return []

    # Normalise the shape so callers can always iterate over dicts.
    if isinstance(parsed, dict):
        parsed = [parsed]
    if not isinstance(parsed, (list, tuple)):
        print("未找到有效的JSON数据")
        return []
    return [item for item in parsed if isinstance(item, dict)]


def _bbox_to_pixels(bbox, width, height):
    """Return ``(x1, y1, x2, y2)`` integer pixels clamped to the image, or None if unusable."""
    if not isinstance(bbox, (list, tuple)) or len(bbox) != 4:
        return None
    try:
        x1, y1, x2, y2 = (float(v) for v in bbox)
        # If every value lies in [0, 1] the box is treated as normalized and
        # scaled up to pixels; otherwise it is taken as absolute pixels.
        if all(0 <= val <= 1 for val in (x1, y1, x2, y2)):
            x1, x2 = x1 * width, x2 * width
            y1, y2 = y1 * height, y2 * height
        x1_abs, y1_abs, x2_abs, y2_abs = int(x1), int(y1), int(x2), int(y2)
    except (TypeError, ValueError, OverflowError):
        # Non-numeric, NaN or infinite coordinates.
        return None

    # Keep every corner inside the image.
    x1_abs = max(0, min(x1_abs, width - 1))
    y1_abs = max(0, min(y1_abs, height - 1))
    x2_abs = max(0, min(x2_abs, width - 1))
    y2_abs = max(0, min(y2_abs, height - 1))
    return x1_abs, y1_abs, x2_abs, y2_abs


def auto_annotate(image_path, output_json):
    """Run the model on one image and save its detections as a Labelme JSON file.

    Args:
        image_path: Path of the image to annotate.
        output_json: Path of the Labelme JSON file to write.

    Returns:
        The Labelme dict that was written, or ``None`` when the image cannot
        be read (in which case nothing is written).
    """
    img = cv2.imread(image_path)
    if img is None:
        print(f"无法读取图像: {image_path}")
        return
    height, width = img.shape[:2]

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image_path},
                {"type": "text", "text": prompt},
            ],
        }
    ]

    # Prepare the model inputs.
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's own placement instead of assuming "cuda".
    inputs = inputs.to(model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=2048)
    # generate() returns prompt + completion; drop the prompt tokens so the
    # prompt's own "[x1, y1, x2, y2]" text cannot be mistaken for model output.
    completion_ids = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(completion_ids, skip_special_tokens=True)[0]
    print("模型原始响应:\n", response)

    json_data = extract_taxi_data(response)
    print("解析后的JSON数据:", json_data)

    # NOTE(review): absolute coordinates are used as-is against the size read
    # by cv2; if the model reports them relative to the processor-resized
    # input, they would need rescaling - confirm for this checkpoint.
    shapes = []
    for item in json_data:
        corners = None
        if isinstance(item, dict) and 'bbox_2d' in item:
            corners = _bbox_to_pixels(item['bbox_2d'], width, height)
        if corners is None:
            print(f"跳过无效的bbox数据: {item}")
            continue
        x1_abs, y1_abs, x2_abs, y2_abs = corners

        # Reject empty or inverted rectangles.
        if x1_abs >= x2_abs or y1_abs >= y2_abs:
            print(f"跳过无效的矩形: [{x1_abs}, {y1_abs}, {x2_abs}, {y2_abs}]")
            continue

        label = item.get('label', 'taxi')
        label = label.strip() if isinstance(label, str) else 'taxi'
        shapes.append({
            "label": label,
            "points": [[x1_abs, y1_abs], [x2_abs, y2_abs]],
            "group_id": None,
            "shape_type": "rectangle",
            "flags": {},
        })

    # Assemble the Labelme document.
    labelme_data = {
        "version": "5.1.1",
        "flags": {},
        "shapes": shapes,
        "imagePath": os.path.basename(image_path),
        "imageData": None,
        "imageHeight": height,
        "imageWidth": width,
    }

    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(labelme_data, f, ensure_ascii=False, indent=2)

    print(f"标注已保存至: {output_json}, 检测到 {len(shapes)} 辆出租车")
    return labelme_data


if __name__ == "__main__":
    root_labelme = "../Labelme_Taxi"

    # Remove JSON files left over from a previous run.
    for json_file in glob.glob(os.path.join(root_labelme, "*.json")):
        os.remove(json_file)

    # Annotate every JPG image in the directory.
    for img_path in sorted(glob.glob(os.path.join(root_labelme, "*.jpg"))):
        print(f"\n处理图像: {img_path}")
        # splitext only swaps the extension; str.replace would also rewrite a
        # ".jpg" that happens to occur earlier in the path.
        auto_annotate(img_path, os.path.splitext(img_path)[0] + '.json')