文本数据清洗流程自动化实现

在大模型训练中，数据质量直接影响模型性能。本文将介绍一个可复现的文本数据清洗自动化流程。

核心清洗步骤

1. 数据预处理

import pandas as pd
import re

def clean_text(text):
    """Return *text* with punctuation removed and whitespace collapsed.

    Every character that is neither a word character (``\\w``, which
    includes the underscore) nor whitespace is deleted. Each run of
    whitespace is then replaced by a single space, and leading/trailing
    whitespace is trimmed.
    """
    # Pass 1: delete everything that is not a word character or whitespace.
    without_symbols = re.sub(r'[^\w\s]', '', text)
    # Pass 2: squeeze each whitespace run (tabs, newlines, ...) to one space.
    single_spaced = re.sub(r'\s+', ' ', without_symbols)
    return single_spaced.strip()

2. 去重处理

# 基于文本相似度去重
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def remove_duplicates(df, text_col, threshold=0.95):
    """Drop rows whose text is a near-duplicate of an earlier row.

    The texts in ``df[text_col]`` are vectorised with TF-IDF and compared
    pairwise by cosine similarity. A row is removed when its similarity to
    *any* earlier row is strictly greater than ``threshold`` (the earlier
    row counts even if it is itself removed).

    Args:
        df: DataFrame holding the texts. It is not modified.
        text_col: Name of the column containing the text to compare.
        threshold: Similarity above which a later row is considered a
            duplicate of an earlier one.

    Returns:
        A new DataFrame without the duplicate rows, re-indexed from 0.
    """
    # Local import so the block does not depend on a file-level numpy import.
    import numpy as np

    # Nothing to compare; skip the vectoriser, which has no vocabulary
    # to learn from zero documents.
    if df.empty:
        return df.reset_index(drop=True)

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df[text_col])
    similarity_matrix = cosine_similarity(tfidf_matrix)

    # Keep only pairs (i, j) with i < j: the strict upper triangle of the
    # boolean "too similar" matrix. Thresholding first keeps the zeros that
    # triu writes below the diagonal from ever comparing as similar.
    too_similar = np.triu(similarity_matrix > threshold, k=1)
    # Column j is True when some earlier row i is too similar to row j.
    is_duplicate = too_similar.any(axis=0)

    # Select by positional mask rather than df.drop(labels): matrix
    # positions are not index labels unless df has a default RangeIndex.
    return df.loc[~is_duplicate].reset_index(drop=True)

3. 标准化处理

# 统一 Unicode 规范化形式，并移除重音符号
import unicodedata

def normalize_text(text):
    """Strip combining marks from *text* and return it in composed form.

    The text is decomposed (NFD) so that accents become separate combining
    characters, every character with a non-zero canonical combining class
    is dropped, and the remainder is recomposed (NFC).

    The final NFC step matters for characters that decompose into
    non-combining parts (e.g. Hangul syllables split into jamo): without
    it they would be returned in decomposed form even though nothing was
    stripped from them.

    Args:
        text: The string to normalise.

    Returns:
        The accent-free string in NFC form.
    """
    decomposed = unicodedata.normalize('NFD', text)
    # unicodedata.combining() returns 0 for base characters and a non-zero
    # class for combining marks such as accents.
    base_chars = ''.join(
        ch for ch in decomposed if not unicodedata.combining(ch)
    )
    return unicodedata.normalize('NFC', base_chars)

自动化脚本整合

将上述步骤封装为 pipeline：

from functools import wraps

def data_pipeline(func):
    """Decorate a processing step so it reports record counts.

    The wrapped callable prints ``len()`` of its first argument before
    calling ``func`` and ``len()`` of the returned value afterwards, then
    hands back that value unchanged. Extra positional and keyword
    arguments are forwarded to ``func`` as-is.
    """
    def _counted(df, *args, **kwargs):
        print(f"开始处理 {len(df)} 条记录")
        processed = func(df, *args, **kwargs)
        print(f"处理完成，剩余 {len(processed)} 条记录")
        return processed

    # Copy func's name, docstring, etc. onto the wrapper.
    return wraps(func)(_counted)

@data_pipeline
def clean_dataset(df):
    """Run the full cleaning pipeline on a DataFrame with a ``text`` column.

    Steps, in order:
      1. ``clean_text`` is applied to ``text`` and stored as ``cleaned_text``.
      2. Near-duplicate rows are removed by comparing ``cleaned_text``.
      3. ``normalize_text`` is applied to ``cleaned_text`` and stored as
         ``normalized_text``.

    Args:
        df: DataFrame containing a ``text`` column. It is not modified;
            new frames are built with ``assign``.

    Returns:
        The deduplicated DataFrame with the ``cleaned_text`` and
        ``normalized_text`` columns added.
    """
    cleaned = df.assign(cleaned_text=df['text'].apply(clean_text))
    # remove_duplicates requires the column to compare; deduplicate on the
    # cleaned text so punctuation/whitespace differences do not hide copies.
    deduped = remove_duplicates(cleaned, 'cleaned_text')
    return deduped.assign(
        normalized_text=deduped['cleaned_text'].apply(normalize_text)
    )

此流程可有效提升数据质量，为后续特征工程奠定基础。