大文件处理的终极武器：Yield详解

【大文件处理的终极武器：Yield详解】🚀

一、大文件处理的痛点

内存限制：用 read() 或 readlines() 一次性把整个文件读入内存，很容易耗尽可用内存
数据量巨大：日志、CSV 等文件的体积可能远超单机内存
传统方法效率低：必须等全部数据加载完成后才能开始处理

二、Yield解决方案

def read_large_file(file_path, encoding=None):
    """Lazily yield the lines of a text file, one at a time.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default, which is what the function used
            before this parameter existed.

    Yields:
        Each line with leading and trailing whitespace (including the
        line terminator) removed. Blank lines are yielded as ``""``.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        # Iterating the file object fetches one line per step, so the whole
        # file is never held in memory at once.
        for line in file:
            yield line.strip()


# Usage example
def process_log_file(file_path):
    """Print every line of a log file that contains the text ``'ERROR'``.

    Lines are pulled one at a time from ``read_large_file``, so each line is
    checked and printed as soon as it is read.

    Args:
        file_path: Path of the log file to scan.
    """
    for line in read_large_file(file_path):
        if 'ERROR' in line:
            print(f"发现错误日志：{line}")


# Log analysis example
def analyze_error_logs(file_path):
    """Count the lines of a log file that contain the text ``'ERROR'``.

    Args:
        file_path: Path of the log file to scan.

    Returns:
        The number of matching lines as an ``int`` (``0`` when none match).
    """
    # The generator expression consumes read_large_file lazily, so only one
    # line is alive at any moment.
    return sum(1 for line in read_large_file(file_path) if 'ERROR' in line)

三、实战场景：海量日志分析

超大日志文件处理

def parse_massive_log(file_path):
    """Lazily yield the ERROR entries of a ``time | level | message`` log.

    Args:
        file_path: Path of the log file to parse.

    Yields:
        A dict ``{'time': ..., 'message': ...}`` for every line whose level
        field, once stripped, equals ``'ERROR'``. Lines with fewer than three
        ``|``-separated fields are skipped.
    """
    with open(file_path, 'r') as file:
        for line in file:
            # maxsplit=2 keeps any '|' that appears inside the message;
            # an unbounded split would produce extra fields and cause the
            # whole line to be discarded as malformed.
            fields = line.split('|', 2)
            if len(fields) != 3:
                # Malformed line: not enough separators.
                continue
            timestamp, level, message = fields
            # Only entries at the ERROR level are reported.
            if level.strip() == 'ERROR':
                yield {
                    'time': timestamp.strip(),
                    'message': message.strip(),
                }


# Usage example
def log_error_summary(file_path):
    """Count ERROR log entries per hour of day.

    Args:
        file_path: Path of the log file, parsed with ``parse_massive_log``.

    Returns:
        A dict mapping the hour field of each entry's timestamp (as a string)
        to the number of ERROR entries in that hour.
    """
    error_summary = {}
    for error in parse_massive_log(file_path):
        # Assumes timestamps look like "<date> <HH:MM:SS>" — TODO confirm
        # against the real log format.
        parts = error['time'].split()
        if len(parts) < 2:
            # No time component: skip rather than fail the whole summary.
            continue
        # Keep only the hour; keying on the full time string would produce
        # one bucket per second instead of one per hour.
        hour = parts[1].split(':')[0]
        error_summary[hour] = error_summary.get(hour, 0) + 1
    return error_summary


# Invocation
# Module-level example: summarize the ERROR entries of 'huge_server.log'
# and print the resulting dict.
errors = log_error_summary('huge_server.log')
print(errors)

四、CSV大文件处理

import csv


def process_large_csv(file_path):
    """Lazily yield one processed record per data row of a CSV file.

    Args:
        file_path: Path of a CSV file whose first row is the header.

    Yields:
        The result of ``process_row`` for each row, in file order.
    """
    # newline='' hands line endings to the csv module untouched; without it,
    # line breaks embedded in quoted fields are rewritten while reading.
    with open(file_path, 'r', newline='') as file:
        for row in csv.DictReader(file):
            yield process_row(row)


def process_row(row):
    """Clean and convert a single CSV row.

    Args:
        row: Mapping with at least the keys ``'name'`` and ``'score'``.

    Returns:
        A dict with ``'name'`` upper-cased and ``'score'`` converted to
        ``float`` and multiplied by 1.1.

    Raises:
        KeyError: If ``'name'`` or ``'score'`` is missing.
        ValueError: If ``'score'`` cannot be converted to ``float``.
    """
    return {
        'name': row['name'].upper(),
        'score': float(row['score']) * 1.1,
    }


def analyze_student_data(file_path):
    """Return the mean processed score over all rows of a CSV file.

    Args:
        file_path: Path of the CSV file, read through ``process_large_csv``.

    Returns:
        The average of the processed ``'score'`` values, or ``0`` when the
        file has no data rows.
    """
    total_scores = 0
    student_count = 0
    for processed_row in process_large_csv(file_path):
        total_scores += processed_row['score']
        student_count += 1
    # Guard against division by zero for a header-only or empty file.
    return total_scores / student_count if student_count > 0 else 0

五、大文件去重

def deduplicate_file(input_file, output_file):
    """Copy a text file, keeping only the first occurrence of each line.

    Lines are compared after stripping leading and trailing whitespace, and
    the stripped form is what gets written, one per line.

    Args:
        input_file: Path of the file to read.
        output_file: Path of the file to write; it is overwritten.
    """
    import hashlib

    # Store a fixed-size digest per distinct line instead of the line itself,
    # so memory per entry does not depend on line length. The set is never
    # cleared: emptying it part-way through would let lines already written
    # be written again. Memory still grows with the number of distinct lines.
    seen = set()
    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        for line in infile:
            clean_line = line.strip()
            fingerprint = hashlib.blake2b(
                clean_line.encode('utf-8', 'surrogatepass'),
                digest_size=16,
            ).digest()
            if fingerprint not in seen:
                seen.add(fingerprint)
                outfile.write(clean_line + '\n')


# File-fingerprint deduplication
def find_duplicate_files(directory):
    """Lazily yield paths of files whose content duplicates an earlier file.

    The tree under ``directory`` is walked with directory and file names in
    sorted order, so the first file seen with a given content is treated as
    the original and every later one is yielded.

    Args:
        directory: Root directory to scan recursively.

    Yields:
        The path of each file whose MD5 fingerprint was already seen.
        Files that cannot be opened or read are skipped.
    """
    import os
    import hashlib

    def file_hash(filepath):
        """Return the MD5 hex digest of a file, read in fixed-size chunks."""
        hasher = hashlib.md5()
        with open(filepath, 'rb') as f:
            # Chunked reads keep memory flat regardless of file size.
            for chunk in iter(lambda: f.read(65536), b''):
                hasher.update(chunk)
        return hasher.hexdigest()

    seen_hashes = set()
    for root, dirs, files in os.walk(directory):
        # Sorting in place makes os.walk descend in a stable order, so which
        # copy is reported as the duplicate does not depend on the OS.
        dirs.sort()
        for filename in sorted(files):
            filepath = os.path.join(root, filename)
            try:
                file_fingerprint = file_hash(filepath)
            except OSError:
                # Unreadable file or dangling symlink: skip it instead of
                # aborting the whole scan.
                continue
            if file_fingerprint in seen_hashes:
                yield filepath
            else:
                seen_hashes.add(file_fingerprint)

六、高级应用：流式数据处理

def process_streaming_data(data_source):
    """Lazily transform and filter the items of a data stream.

    Args:
        data_source: Iterable of items accepted by ``transform``.

    Yields:
        Each transformed item for which ``is_valid`` returns true, in input
        order.
    """
    for data_point in data_source:
        processed_data = transform(data_point)
        if is_valid(processed_data):
            yield processed_data


def transform(data):
    """Return ``data`` lower-cased with surrounding whitespace removed.

    Args:
        data: Object providing ``lower()`` whose result provides ``strip()``,
            such as a ``str``.
    """
    return data.lower().strip()


def is_valid(data):
    """Return ``True`` when ``data`` has a length greater than zero."""
    return len(data) > 0