图像数据预处理流水线构建经验分享

在大模型训练过程中，图像数据的预处理质量直接影响模型性能。本文分享我们构建可复现图像预处理流水线的经验。

核心问题

在实际项目中，我们发现多个图像数据集存在分辨率不一致、色彩空间混乱、标注缺失等问题，这些问题导致模型训练效果不佳。

解决方案

import cv2
import numpy as np
from PIL import Image
import os

class ImagePreprocessor:
    """Load images from disk and turn them into fixed-size float32 arrays.

    Each image is decoded as 3-channel 8-bit colour, optionally converted
    from OpenCV's native BGR order to RGB, resized to ``target_size`` and
    scaled to the ``[0, 1]`` range.
    """

    # File-name suffixes (compared case-insensitively) treated as images.
    IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')

    def __init__(self, target_size=(224, 224), color_mode='RGB'):
        """Store the preprocessing configuration.

        Args:
            target_size: Output size handed to ``cv2.resize`` as ``dsize``,
                i.e. ``(width, height)``.
            color_mode: ``'RGB'`` converts the decoded image from BGR to
                RGB. Any other value leaves the channels in BGR order.
        """
        self.target_size = target_size
        self.color_mode = color_mode

    def preprocess(self, image_path):
        """Read, colour-convert, resize and normalise a single image.

        Args:
            image_path: Path of the image file to load.

        Returns:
            A ``float32`` array of shape ``(height, width, 3)`` with values
            in ``[0, 1]``, or ``None`` when the file cannot be read or
            decoded.
        """
        # 1. Read the image.
        # The bytes are read with NumPy and decoded in memory instead of
        # calling cv2.imread, because cv2.imread is known to fail on some
        # platforms (notably Windows) when the path contains non-ASCII
        # characters, which would silently drop those files.
        try:
            raw = np.fromfile(image_path, dtype=np.uint8)
        except OSError:
            return None
        # cv2.imdecode rejects an empty buffer, so bail out before calling it.
        if raw.size == 0:
            return None
        img = cv2.imdecode(raw, cv2.IMREAD_COLOR)
        if img is None:
            return None

        # 2. Convert the colour space (OpenCV decodes to BGR).
        if self.color_mode == 'RGB':
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # 3. Resize to the configured output size.
        img = cv2.resize(img, self.target_size, interpolation=cv2.INTER_AREA)

        # 4. Normalise 8-bit values to [0, 1].
        img = img.astype(np.float32) / 255.0

        return img

    def process_dataset(self, dataset_path):
        """Preprocess every image found under ``dataset_path``.

        The directory tree is walked recursively. Files whose names end in
        one of ``IMAGE_EXTENSIONS`` are passed to :meth:`preprocess`; files
        that fail to load are skipped.

        Args:
            dataset_path: Root directory of the dataset.

        Returns:
            A list of preprocessed image arrays, ordered by sorted directory
            and file name so that repeated runs give the same order.
        """
        processed_data = []
        for root, dirs, files in os.walk(dataset_path):
            # os.walk yields entries in arbitrary filesystem order; sorting
            # ``dirs`` in place fixes the descent order and sorting ``files``
            # fixes the order within each directory.
            dirs.sort()
            for file in sorted(files):
                if not file.lower().endswith(self.IMAGE_EXTENSIONS):
                    continue
                img_path = os.path.join(root, file)
                processed_img = self.preprocess(img_path)
                if processed_img is not None:
                    processed_data.append(processed_img)
        return processed_data