# fast-mri/reader_study/make_data_paths.py
# (175 lines, 6.7 KiB, Python, executable file)

import pandas
import numpy as np
import os
from sfransen.DWI_exp.helpers import *
from sfransen.utils_quintin import *
from os import path
import SimpleITK as sitk
import xml.etree.ElementTree as ET
import pathlib
def parse_marklist(marklistpath):
    """Extract PSA, PI-RADS scores and lesion zones from a mark-list XML file.

    Args:
        marklistpath: Path (or file object) of the XML file, as accepted by
            ``ET.parse``.

    Returns:
        A 4-tuple ``(PSA, current_max_PIRADS, number_of_lesions, locations)``:

        * ``PSA`` -- text of the ``PSA`` child of the first ``markpatient``
          element, or ``0`` when the element or its text is missing.
        * ``current_max_PIRADS`` -- the ``PIRADS`` text with the highest
          integer value over all ``mark`` elements (first one wins on ties),
          or ``0`` when no mark has a positive score.
        * ``number_of_lesions`` -- list holding the ``PIRADS`` text of every
          mark that scores above zero (a list, not a count).
        * ``locations`` -- one entry per ``mark`` element: the text of its
          ``Zones/Zone/Type`` element, or ``''`` when absent or empty.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        ValueError: If a ``PIRADS`` element holds non-numeric text.
    """
    root = ET.parse(marklistpath).getroot()

    # Only the first <markpatient> element (if any) is consulted for the PSA.
    patient_element = next(root.iter("markpatient"), None)
    psa_element = (
        patient_element.find("PSA") if patient_element is not None else None
    )
    PSA = 0
    if psa_element is not None and psa_element.text is not None:
        PSA = psa_element.text

    number_of_lesions = []
    locations = []
    current_max_PIRADS = 0
    max_score = 0  # integer value of current_max_PIRADS
    for mark in root.iter("mark"):
        pirads_element = mark.find("PIRADS")
        pirads_text = pirads_element.text if pirads_element is not None else None
        # A missing or empty <PIRADS> counts as "no score" (0) rather than
        # crashing int() on None / blank text.
        score = int(pirads_text) if pirads_text and pirads_text.strip() else 0

        if score > 0:
            number_of_lesions.append(pirads_text)
        if score > max_score:
            max_score = score
            current_max_PIRADS = pirads_text

        # Elements are compared against None explicitly: an Element without
        # children is falsy, so a plain truth test would be wrong here.
        zone_type = mark.find("Zones/Zone/Type")
        locations.append((zone_type.text or '') if zone_type is not None else '')

    return PSA, current_max_PIRADS, number_of_lesions, locations
def parse_age(path):
    """Read the age field from an XML info file.

    The value is taken from the seventh child of the root element (index 6)
    and returned with its first and last characters removed.
    NOTE(review): this looks like it expects a DICOM-style age string such as
    ``'065Y'`` (giving ``'65'``) -- confirm against the info.xml schema.

    Args:
        path: Path (or file object) of the XML file, as accepted by
            ``ET.parse``.

    Returns:
        The trimmed age text, or ``''`` when the element has no text.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        IndexError: If the root element has fewer than seven children.
    """
    root = ET.parse(path).getroot()
    raw_age = root[6].text
    # An element without text has ``text is None``; slicing that would raise
    # TypeError, so report "no age" as an empty string instead.
    if raw_age is None:
        return ''
    return raw_age[1:-1]
# ---------------------------------------------------------------------------
# Reader-study export: for a fixed selection of cases, read every image series
# and collect per-case metadata (PSA, age, PI-RADS, lesion zones) into text
# files in the current working directory.
# ---------------------------------------------------------------------------
DATA_DIR = "./../data/Nijmegen paths/"
OUTPUT_DIR = "./random_images_2/"

# Line indices into the path-list files of the cases selected for the study.
READER_STUDY_IDXS = [
    23, 161, 543, 734, 367, 85, 380, 231, 406, 435,
    660, 327, 305, 7, 479, 540, 558, 361, 167, 320,
    666, 178, 700, 831, 707, 596, 715, 823, 561, 782,
]

# Path-list file stem -> suffix of the exported image name.
# method 1: all bvalues, method 2: omitting b800
SERIES_NAME_SUFFIX = {
    't2': 't2',
    'adccalc2': 'adc_method2',
    'adccalc3': 'adc_method1',
    'b1400calc2': 'dwi_method2',
    'b1400calc3': 'dwi_method1',
}


def _write_lines(filename, values):
    """Write ``str(value)`` of every item in *values* to *filename*, one per line."""
    with open(filename, 'w') as f:
        f.writelines(f'{value}\n' for value in values)


# The 6th and 7th '/'-separated components of each T2 path are stored as
# pat_ids and years; the list file is read once for both.
with open(path.join(DATA_DIR, "t2.txt"), 'r') as f:
    t2_path_parts = [l.split('/') for l in f.readlines()]
pat_ids = [parts[5] for parts in t2_path_parts]
years = [parts[6] for parts in t2_path_parts]

PSA_list = []
current_max_PIRADS_list = []
number_of_lesions_list = []
age_list = []
pat_id_list = []
locations_list = []

for img, suffix in SERIES_NAME_SUFFIX.items():
    with open(path.join(DATA_DIR, f"{img}.txt"), 'r') as f:
        image_paths = [l.strip() for l in f.readlines()]

    for idx in READER_STUDY_IDXS:
        print(idx)
        # The loaded image and its target name are only consumed by the
        # (currently disabled) export below.
        read_img = sitk.ReadImage(image_paths[idx], sitk.sitkFloat32)
        name = f'{pat_ids[idx]}-{suffix}'
        # sitk.WriteImage(read_img,f'{OUTPUT_DIR}{name}.nii.gz')

        # Metadata is per case, not per series: collect it exactly once (on
        # the T2 pass) so that every output file has one line per case and
        # the lines of all files correspond to each other.
        if img != 't2':
            continue
        marklistpath = f'../../datasets/radboud_new/{pat_ids[idx]}/{years[idx]}/markdatasetlist.xml'
        info_age = f'../../datasets/radboud_new/{pat_ids[idx]}/{years[idx]}/t2_tse_tra/info.xml'
        PSA, current_max_PIRADS, number_of_lesions, locations = parse_marklist(marklistpath)
        age = parse_age(info_age)

        pat_id_list.append(str(pat_ids[idx]))
        PSA_list.append(PSA)
        current_max_PIRADS_list.append(current_max_PIRADS)
        number_of_lesions_list.append(list(number_of_lesions))
        age_list.append(age)
        locations_list.append(locations)

# Each list is written once; the file contents do not depend on any loop.
_write_lines('./PSA.txt', PSA_list)
_write_lines('./age.txt', age_list)
# Interactive checkpoint: shows the collected lesion lists and waits for
# Enter before the remaining files are written.
input(number_of_lesions_list)
_write_lines('./number_of_lesions.txt', number_of_lesions_list)
_write_lines('./current_max_PIRADS.txt', current_max_PIRADS_list)
_write_lines('./pat_ids.txt', pat_id_list)
_write_lines('./locations.txt', locations_list)

# Stop the script here. SystemExit is raised directly because the exit()
# helper is injected by the `site` module and is meant for interactive use.
raise SystemExit
# NOTE(review): the file calls exit() before this point, so this section does
# not appear to run; it also relies on names (tqdm, num_images, args,
# seg_paths, preprocess, IMAGE_SHAPE, TARGET_SPACING, images, segmentations,
# read_yaml_to_dict) that are not defined in the visible part of the file --
# presumably they come from the star imports; confirm before re-enabling.

# Load every case: one image per requested series plus its segmentation,
# preprocess them together, and collect the results per series.
for img_idx in tqdm(range(num_images)):  # slice the range to process fewer images
    img_s = dict(
        (s, sitk.ReadImage(image_paths[s][img_idx], sitk.sitkFloat32))
        for s in args.series
    )
    seg_s = sitk.ReadImage(seg_paths[img_idx], sitk.sitkFloat32)

    img_n, seg_n = preprocess(
        img_s,
        seg_s,
        shape=IMAGE_SHAPE,
        spacing=TARGET_SPACING,
    )

    for seq in img_n:
        images[seq].append(img_n[seq])
    segmentations.append(seg_n)

# Train / validation split.
# An earlier version derived a 1:9 split from
# KFold(10, shuffle=True, random_state=123); the indices are now read from a
# per-fold YAML file instead.
yml_paths = read_yaml_to_dict(f'./../data/Nijmegen paths/train_val_test_idxs_{args.fold}.yml')
print('test, train paths', yml_paths)
train_idxs, valid_idxs = yml_paths['train_set0'], yml_paths['val_set0']
# Build the "<patient_id>_<study_id>" keys of all marksheet rows that carry a
# Gleason score (non-NaN 'lesion_GS').
df = pandas.read_csv('./marksheet_with_gleason.csv')
gleason_idxs = [idx for idx, i in enumerate(df['lesion_GS'].values) if not pandas.isna(i)]
patient_id_column = df['patient_id'].values
study_id_column = df['study_id'].values
values = [f"{patient_id_column[i]}_{study_id_column[i]}" for i in gleason_idxs]
print(values)
# Set copy for O(1) membership tests; `values` itself stays a list so the
# printed output keeps its order and duplicates.
gleason_keys = set(values)

# read picai paths
# For each PI-CAI path list, keep only the paths whose file name (minus its
# suffix) matches one of the Gleason keys, and write the filtered list to a
# file of the same name in the current working directory.
files = ['picai_seg_list','picai_adc_list','picai_hbv_list','picai_t2_list']
for file in files:
    with open(f"../../../../datasets/picai/{file}.txt") as f:
        image_paths = [l.strip() for l in f.readlines()]

    # Strings must be compared with ==; `is` tests object identity and only
    # matches by accident of literal interning.
    # Segmentation paths lose 7 trailing characters, the image lists 8.
    # NOTE(review): presumably '.nii.gz' vs. e.g. '_t2w.mha' -- confirm
    # against the actual file names.
    suffix_length = 7 if file == 'picai_seg_list' else 8
    image_paths_gleason = [
        image_path
        for image_path in image_paths
        if os.path.basename(image_path[:-suffix_length]) in gleason_keys
    ]
    print(len(image_paths_gleason))

    with open(f'./{file}.txt', 'w') as f:
        for line in image_paths_gleason:
            f.write(line)
            f.write('\n')