import osimport fiftyone as fo
import fiftyone.zoo as foz
import yamlclasses = ['Person', # 人 - 0'Car', # 轿车 - 1'Taxi', # 出租车 - 2'Ambulance', # 救护车 - 3'Bus', # 公共汽车 - 4'Bicycle', # 自行车 - 5'Motorcycle', # 摩托车 - 6'Dog', # 狗 - 7'Cat', # 猫 - 8'Mouse', # 老鼠 - 9'Backpack', # 背包 - 10'Glasses', # 眼镜 - 11'Hat', # 帽子 - 12'Helmet', # 头盔 - 13'Traffic light', # 交通信号灯 - 14'Knife', # 刀 - 15'Mobile phone', # 移动电话 - 16'Umbrella' # 伞 - 17# 手持物 - 18# 烟雾 - 19# 火焰 - 20
]# 构建类别索引映射
class_to_index = {cls: idx for idx, cls in enumerate(classes)}def filter_samples_by_label(dataset, target_label):# 创建一个新数据集来保存筛选后的样本filtered_dataset = fo.Dataset()# 遍历原数据集for sample in dataset:# 保留与target_label匹配的检测框filtered_detections = [d for d in sample.ground_truth.detections if d.label == target_label]# 如果样本中还有匹配的检测框,创建样本副本并添加到新数据集中if filtered_detections:new_sample = sample.copy()new_sample.ground_truth.detections = filtered_detectionsfiltered_dataset.add_sample(new_sample)return filtered_datasetdef update_txt_file_class_indices(class_name):labels_dir = os.path.join("/yolov5/open-images-v7", class_name, 'labels/val')dataset_yaml = os.path.join("/yolov5/open-images-v7", class_name, 'dataset.yaml')os.remove(dataset_yaml)if os.path.exists(labels_dir):for filename in os.listdir(labels_dir):if filename.endswith('.txt'):filepath = os.path.join(labels_dir, filename)# 读取并处理.txt文件with open(filepath, 'r') as file:lines = file.readlines()# 更新类别索引updated_lines = []for line in lines:parts = line.strip().split()if len(parts) >= 5: # 假设每行至少有5个元素(class index在第0个位置)class_idx_str = parts[0] # 原类别索引(可能是名称也可能是数字)try:updated_class_idx = str(class_to_index[class_name])except ValueError:# 如果转换失败,则按类别名称补全updated_class_idx = class_nameparts[0] = updated_class_idxupdated_lines.append(' '.join(parts) + '\n')# 写回更新后的行with open(filepath, 'w') as file:file.writelines(updated_lines)print(f"{class_name}类别的所有.txt文件的索引已更新。")for class_name in classes:dataset = foz.load_zoo_dataset("open-images-v7",split="train", # train validation testlabel_types=["detections"], # "detections", "segmentations", "points"classes=[class_name],max_samples=100000, # 筛选前十万个样本shuffle=True,only_matching=True, # 指定仅下载符合条件的图片num_workers=1, # 指定进程数为1dataset_name="base_100000",dataset_dir=f"/open-images-v7")filtered_dataset = filter_samples_by_label(dataset, class_name)if filtered_dataset.count() == 0:print(f"No samples found for class: {class_name}")continueprint(f"{class_name}类别的样本数量为:{filtered_dataset.count()}")# 导出这个筛选后的数据集result = filtered_dataset.export(export_dir=f"/yolov5/open-images-v7/{class_name}/",dataset_type=fo.types.YOLOv5Dataset,label_field="ground_truth",)update_txt_file_class_indices(class_name)data_yaml_content = {'train': '/yolov5/open-images-v7','val': '/yolov5/open-images-v7','test': '', # 可以根据实际情况填写测试集路径'nc': len(classes), # 类别数量'names': classes # 类别名称列表
}
with open('/yolov5/open-images-v7/data.yaml', 'w') as f:yaml.dump(data_yaml_content, f, default_flow_style=False)
下载10万张样本 ( 如果测试可以将max_samples值调小)
下载后生成的文件目录结构如下:
数据转换后按类别划分为多个文件 如下: