"""Build fused HDF5 samples (DCE + T2 + radiomics CSV features) per patient.

Pipeline: take the mask slice with the largest ROI, map it onto the DCE
volume and keep that slice plus its two neighbours (3 slices total), trim
the surrounding zero pixels, bilinearly resize to 224x224, then repeat the
mapping on the T2 series (the mask is first nearest-neighbour resized to
the T2 resolution). The patient's pyradiomics features are looked up in a
CSV and everything is written into one .h5 file per patient for network
fusion.

NOTE: the script processes one class folder at a time (benign OR
malignant); run it twice and set ``label`` to 0 or 1 accordingly.

Configuration:
    image_DCE_path -- folder of .nii files of the original DCE series
    image_T2_path  -- folder of .nii files of the original T2 series
    label_path     -- folder of .nii mask files
    save_dir       -- output root for the processed .h5 files
    csv_file       -- pyradiomics feature CSV (remove meaningless columns
                      beforehand; first column must be 'Name')
    label          -- class label, 0 = benign, 1 = malignant
"""
import glob
import os
# Only the module-level ``random.randint`` API is used; the original
# ``from random import random`` was shadowed by this import and is dropped.
import random

import h5py
import matplotlib.pyplot as plt
import nibabel as nib
import numpy as np
import pandas as pd
import pydicom
import SimpleITK as sitk
import torch
import torch.nn.functional as F
from PIL import Image
from skimage import exposure

# --- Folder configuration (edit per run) ---
image_DCE_path = r'C:\Users\Administrator\Desktop\Breast\benign\DCE'
image_T2_path = r'C:\Users\Administrator\Desktop\Breast\benign\T2'
label_path = r'C:\Users\Administrator\Desktop\Breast\benign_label'
save_dir = r'C:\Users\Administrator\Desktop\Breast\data'
csv_file = "C:\\Users\\Administrator\\Desktop\\Breast\\result.csv"  # pyradiomics CSV
label = 0  # 0 = benign, 1 = malignant
def trim_image(image):
    """Crop a 3D volume to the bounding box of its non-zero voxels.

    Args:
        image: array-like of shape (slices, height, width). Must contain
            at least one non-zero voxel (an all-zero input raises
            ValueError from ``np.min`` on an empty axis, as before).

    Returns:
        np.ndarray: the minimal sub-volume containing every non-zero voxel.
    """
    image_array = np.array(image)
    # Per-axis bounding box of the non-zero region.
    nonzero_indices = np.nonzero(image_array)
    min_row, max_row = np.min(nonzero_indices[0]), np.max(nonzero_indices[0])
    min_col, max_col = np.min(nonzero_indices[1]), np.max(nonzero_indices[1])
    min_depth, max_depth = np.min(nonzero_indices[2]), np.max(nonzero_indices[2])
    # Slice bounds are inclusive, hence the +1.
    return image_array[min_row:max_row + 1,
                       min_col:max_col + 1,
                       min_depth:max_depth + 1]
def preprocess_data(image_array, window_width=1000, window_center=500):
    """Apply a window width/level transform plus per-slice histogram equalization.

    Args:
        image_array: raw volume read from the .nii file, shape (slices, H, W).
        window_width: window width for the intensity clamp.
        window_center: window level (center) for the intensity clamp.

    Returns:
        np.ndarray: the windowed, histogram-equalized volume (float, values
        from ``exposure.equalize_hist``).
    """
    min_window = float(window_center) - 0.5 * float(window_width)
    new_img = (image_array - min_window) / float(window_width)
    # Clamp to the window, then scale into uint8 range.
    new_img[new_img < 0] = 0
    new_img[new_img > 1] = 1
    img = (new_img * 255).astype('uint8')
    # Equalize each slice independently.
    equalized = []
    for i in range(img.shape[0]):
        equalized.append(exposure.equalize_hist(img[i]))
    return np.array(equalized)


def concat_image(label_path, image_path, label=False):
    """Extract a 3-slice 224x224 ROI stack around the largest mask slice.

    Args:
        label_path: path to the mask .nii file.
        image_path: path to the original series .nii file (DCE or T2).
        label: if True, nearest-neighbour resize the mask to 672x672 first
            (the T2 series has a different in-plane size than the mask —
            presumably 672x672; confirm against the data).

    Returns:
        np.ndarray of shape (3, 224, 224), dtype uint8: the masked slice
        with the largest ROI plus its two neighbouring slices, trimmed of
        surrounding zeros and bilinearly resized.
    """
    image_label = sitk.ReadImage(label_path)
    image_origin = sitk.ReadImage(image_path)
    origin_array = sitk.GetArrayFromImage(image_origin)
    label_array = sitk.GetArrayFromImage(image_label)
    origin_array = np.array([origin_array[i] for i in range(origin_array.shape[0])])
    label_array = np.array([label_array[i] for i in range(label_array.shape[0])])
    if label:
        # Resize the mask to the T2 in-plane resolution (nearest keeps it binary).
        label_array = F.interpolate(
            torch.tensor(label_array, dtype=torch.float32).unsqueeze(0),
            size=(672, 672), mode='nearest',
        ).squeeze().numpy().astype(np.uint8)
    # (Optional) windowing/equalization of the source volume:
    # origin_array = preprocess_data(origin_array)

    # Find the slice whose mask covers the most pixels.
    max_nonzero_pixels = 0
    max_nonzero_index = None
    for i in range(label_array.shape[0]):
        nonzero_pixels = np.count_nonzero(label_array[i])
        if nonzero_pixels > max_nonzero_pixels:
            max_nonzero_pixels = nonzero_pixels
            max_nonzero_index = i

    # Mask the best slice and its two neighbours with the same (largest) mask.
    roi_array = np.array([
        label_array[max_nonzero_index] * origin_array[max_nonzero_index - 1],
        label_array[max_nonzero_index] * origin_array[max_nonzero_index],
        label_array[max_nonzero_index] * origin_array[max_nonzero_index + 1],
    ])
    finish_array = trim_image(roi_array).astype(np.float64)
    image_tensor = torch.tensor(finish_array, dtype=torch.float32).unsqueeze(0)
    target_height, target_width = 224, 224
    # Bilinear resize with corner alignment to the network input size.
    output = F.interpolate(image_tensor, size=(target_height, target_width),
                           mode='bilinear', align_corners=True)
    return output.squeeze().numpy().astype(np.uint8)


def get_features_by_name(name, csv_file):
    """Look up one patient's radiomics feature row by name.

    Args:
        name: patient name to match against the CSV 'Name' column.
        csv_file: path to the pyradiomics feature CSV (meaningless columns
            already removed; first column is 'Name').

    Returns:
        pd.DataFrame with the feature values (everything after the 'Name'
        column) as a single column, or None if the name is not found.
    """
    df = pd.read_csv(csv_file)
    if name in df['Name'].values:
        index = df.index[df['Name'] == name].tolist()[0]
        # All columns after 'Name' are features.
        features = df.iloc[index, 1:].tolist()
        return pd.DataFrame(features)
    print(f"Name '{name}' not found in CSV file.")
    return None


def save_data(image_DCE_path, image_T2_path, label_path, label):
    """Process every patient and write DCE/T2/CSV/label into .h5 files.

    Samples are randomly split ~70/20/10 into train/valid/test
    subdirectories of ``save_dir``.

    Args:
        image_DCE_path: folder of DCE .nii files.
        image_T2_path: folder of T2 .nii files.
        label_path: folder of mask .nii files (named '<Name>-...').
        label: class label (0 or 1) stored in each file.
    """
    r_num = 0
    image_DCE_files = glob.glob(f'{image_DCE_path}/*.nii')
    image_T2_files = glob.glob(f'{image_T2_path}/*.nii')
    label_files = glob.glob(f'{label_path}/*.nii')
    for i in range(len(label_files)):
        name_label = os.path.basename(label_files[i]).split('-')[0]
        name_image_DCE = os.path.basename(image_DCE_files[i]).split('.')[0]
        # NOTE(review): the T2 name is extracted but never compared against
        # the mask name — only the DCE name is checked, as in the original.
        name_image_T2 = os.path.basename(image_T2_files[i]).split('.')[0]
        if name_label == name_image_DCE:
            image_DCE = concat_image(label_files[i], image_DCE_files[i])
            image_T2 = concat_image(label_files[i], image_T2_files[i], label=True)
            csv_data = get_features_by_name(name_label, csv_file)
            # Random train/valid/test split: 70% / 20% / 10%.
            R = random.randint(1, 100)
            if R <= 70:
                split = "train"
            elif R <= 90:
                split = "valid"
            else:
                split = "test"
            split_dir = save_dir + "/" + split
            os.makedirs(split_dir, exist_ok=True)
            # Context manager closes the file even if a write fails.
            with h5py.File(split_dir + '/{}_{}.h5'.format(name_label, r_num), 'w') as f:
                f.create_dataset('data_DCE', data=image_DCE, compression="gzip")
                f.create_dataset('data_T2', data=image_T2, compression="gzip")
                f.create_dataset('csv', data=csv_data)
                f.create_dataset('label', data=label)
            r_num += 1
            print("process {} uid = {} label={}".format(r_num, name_label, label))
            # plt.imshow(image_DCE[2], cmap='gray')
            # plt.show()


if __name__ == "__main__":
    save_data(image_DCE_path, image_T2_path, label_path, label)