通过fiftyone按分类下载open-images-v7数据集，并转成yolov5可直接训练的格式

import os

import fiftyone as fo
import fiftyone.zoo as foz
import yaml

# Open Images V7 class names to download. The position of a name in this
# list is the class index that ends up in the YOLOv5 label files and in the
# `names` entry of the generated data.yaml.
classes = [
    'Person',  # person - 0
    'Car',  # car - 1
    'Taxi',  # taxi - 2
    'Ambulance',  # ambulance - 3
    'Bus',  # bus - 4
    'Bicycle',  # bicycle - 5
    'Motorcycle',  # motorcycle - 6
    'Dog',  # dog - 7
    'Cat',  # cat - 8
    'Mouse',  # mouse (the animal) - 9
    'Backpack',  # backpack - 10
    'Glasses',  # glasses - 11
    'Hat',  # hat - 12
    'Helmet',  # helmet - 13
    'Traffic light',  # traffic light - 14
    'Knife',  # knife - 15
    'Mobile phone',  # mobile phone - 16
    'Umbrella',  # umbrella - 17
    # Indices 18-20 are only noted as comments; no class name is listed:
    # hand-held object - 18
    # smoke - 19
    # flame - 20
]
# Build the class-name -> class-index mapping
class_to_index = {cls: idx for idx, cls in enumerate(classes)}

# Root directory that receives one YOLOv5 export per class plus data.yaml.
EXPORT_ROOT = "/yolov5/open-images-v7"


def filter_samples_by_label(dataset, target_label):
    """Return a new dataset holding only detections labelled *target_label*.

    Iterates over ``dataset``, keeps the detections in each sample's
    ``ground_truth`` field whose ``label`` equals ``target_label``, and adds
    a copy of every sample that still has at least one detection to a freshly
    created ``fo.Dataset``. The input dataset is not modified.

    Args:
        dataset: Iterable of samples exposing ``ground_truth.detections``.
        target_label: Label string a detection must carry to be kept.

    Returns:
        The new dataset with the filtered sample copies (possibly empty).
    """
    filtered_dataset = fo.Dataset()

    for sample in dataset:
        ground_truth = sample.ground_truth
        # A sample without any label in the field has nothing to keep.
        if ground_truth is None:
            continue

        filtered_detections = [
            d for d in ground_truth.detections if d.label == target_label
        ]
        if not filtered_detections:
            continue

        # Work on a copy so the source dataset keeps its full detections.
        new_sample = sample.copy()
        new_sample.ground_truth.detections = filtered_detections
        filtered_dataset.add_sample(new_sample)

    return filtered_dataset


def update_txt_file_class_indices(class_name):
    """Rewrite the class index in every exported label file of *class_name*.

    Looks at ``<EXPORT_ROOT>/<class_name>/labels/val`` and replaces the first
    field of each label line with the index of ``class_name`` in ``classes``.
    Lines with fewer than five whitespace-separated fields are dropped, as in
    the original implementation. The per-class ``dataset.yaml`` is deleted
    if present (a single data.yaml for all classes is written by the script).

    Args:
        class_name: Name of the class whose export directory is processed.

    Raises:
        KeyError: If ``class_name`` is not one of ``classes``. Raised before
            any file is touched, so a bad name never leaves label files
            half-converted or containing a non-numeric class field.
    """
    if class_name not in class_to_index:
        raise KeyError(f"Unknown class name: {class_name!r}")
    # The index is the same for every line of every file: compute it once.
    updated_class_idx = str(class_to_index[class_name])

    class_dir = os.path.join(EXPORT_ROOT, class_name)
    labels_dir = os.path.join(class_dir, 'labels/val')
    dataset_yaml = os.path.join(class_dir, 'dataset.yaml')

    # The per-class yaml may already be gone (e.g. on a re-run); that is fine.
    try:
        os.remove(dataset_yaml)
    except FileNotFoundError:
        pass

    if not os.path.isdir(labels_dir):
        return

    for filename in os.listdir(labels_dir):
        if not filename.endswith('.txt'):
            continue
        filepath = os.path.join(labels_dir, filename)

        with open(filepath, 'r', encoding='utf-8') as file:
            lines = file.readlines()

        updated_lines = []
        for line in lines:
            parts = line.strip().split()
            # A label line is "<class> <x> <y> <w> <h>": at least 5 fields.
            if len(parts) >= 5:
                parts[0] = updated_class_idx
                updated_lines.append(' '.join(parts) + '\n')

        with open(filepath, 'w', encoding='utf-8') as file:
            file.writelines(updated_lines)

    print(f"{class_name}类别的所有.txt文件的索引已更新。")


for class_name in classes:
    dataset = foz.load_zoo_dataset(
        "open-images-v7",
        split="train",  # one of: train, validation, test
        label_types=["detections"],  # "detections", "segmentations", "points"
        classes=[class_name],
        max_samples=100000,  # take at most the first 100,000 samples
        shuffle=True,
        only_matching=True,  # only download images matching the criteria
        num_workers=1,  # use a single worker process
        # Use a distinct name per class: with one shared name, the zoo loader
        # hands back the dataset already registered under that name, i.e. the
        # first class's samples, for every later class.
        dataset_name=f"base_100000_{class_name.replace(' ', '_')}",
        dataset_dir="/open-images-v7",
    )

    filtered_dataset = filter_samples_by_label(dataset, class_name)
    sample_count = filtered_dataset.count()
    if sample_count == 0:
        print(f"No samples found for class: {class_name}")
        continue
    print(f"{class_name}类别的样本数量为：{sample_count}")

    # Export the filtered dataset in YOLOv5 layout
    result = filtered_dataset.export(
        export_dir=f"{EXPORT_ROOT}/{class_name}/",
        dataset_type=fo.types.YOLOv5Dataset,
        label_field="ground_truth",
    )
    update_txt_file_class_indices(class_name)

data_yaml_content = {
    'train': EXPORT_ROOT,
    'val': EXPORT_ROOT,
    'test': '',  # fill in a test-set path if one is available
    'nc': len(classes),  # number of classes
    'names': classes,  # list of class names
}

# The root may not exist yet if no class produced an export.
os.makedirs(EXPORT_ROOT, exist_ok=True)
with open(os.path.join(EXPORT_ROOT, 'data.yaml'), 'w', encoding='utf-8') as f:
    yaml.dump(data_yaml_content, f, default_flow_style=False)

每个类别最多下载10万张样本（如果只是测试，可以将 max_samples 的值调小）。