# fast-mri/reader_study/make_data_paths.py

import pandas
import numpy as np
import os
from sfransen.DWI_exp.helpers import *
from sfransen.utils_quintin import *
from os import path
import SimpleITK as sitk
import xml.etree.ElementTree as ET
import pathlib

def parse_marklist(marklistpath):
    """Extract the PSA value, PIRADS scores and lesion zones from a mark-list XML.

    Args:
        marklistpath: Path of the XML file to parse.

    Returns:
        A 4-tuple ``(PSA, current_max_PIRADS, number_of_lesions, locations)``:

        * ``PSA`` -- text of the ``PSA`` child of the first ``markpatient``
          element, or the integer ``0`` when either element is absent.
        * ``current_max_PIRADS`` -- raw text of the highest ``PIRADS`` score
          found on any ``mark`` element, or the integer ``0`` when no mark has
          a positive score.
        * ``number_of_lesions`` -- list with the raw ``PIRADS`` text of every
          mark whose score is greater than zero, in document order.
        * ``locations`` -- one entry per ``mark`` element: the text of its
          ``Zones/Zone/Type`` descendant, or ``''`` when that element is
          missing.

    Raises:
        ValueError: If a ``PIRADS`` element holds non-blank text that is not
            an integer.
    """
    root = ET.parse(marklistpath).getroot()

    # Only the first <markpatient> element is consulted for the PSA value.
    patient_element = next(root.iter("markpatient"), None)
    psa_element = patient_element.find("PSA") if patient_element is not None else None
    PSA = psa_element.text if psa_element is not None else 0

    number_of_lesions = []
    locations = []
    current_max_PIRADS = 0
    max_score = 0
    for mark in root.iter("mark"):
        pirads_element = mark.find("PIRADS")
        pirads_text = pirads_element.text if pirads_element is not None else None
        # An empty element (<PIRADS/>) has ``text is None``; treat that, and
        # blank text, like a missing score instead of crashing in int().
        if pirads_text is None or not pirads_text.strip():
            PIRADS, score = 0, 0
        else:
            PIRADS, score = pirads_text, int(pirads_text)

        if score > 0:
            number_of_lesions.append(PIRADS)
        if score > max_score:
            # Keep the raw text (not the int) so callers see the same values
            # as before.
            current_max_PIRADS, max_score = PIRADS, score

        zone_type = mark.find("Zones/Zone/Type")
        locations.append(zone_type.text if zone_type is not None else '')

    return PSA, current_max_PIRADS, number_of_lesions, locations

def parse_age(path):
    """Return the age string stored in an info XML file.

    The value is taken from the text of the seventh child of the root
    element (positional index 6), with its first and last characters removed.
    NOTE(review): presumably those two characters are surrounding quotes --
    confirm against a real ``info.xml``.

    Args:
        path: Path of the XML file to parse.

    Returns:
        The text of ``root[6]`` without its first and last character.
    """
    document_root = ET.parse(path).getroot()
    raw_age = document_root[6].text
    # Drop one wrapping character on each side.
    return raw_age[1:-1]

DATA_DIR = "./../data/Nijmegen paths/"
OUTPUT_DIR = "./random_images_2/"

# Hard-coded positions (into the path lists read below) of the cases to process.
SELECTED_IDXS = [23,161,543,734,367, 85, 380, 231, 406, 435, 660, 327, 305, 7, 479, 540, 558, 361, 167, 320, 666, 178, 700, 831, 707, 596, 715, 823, 561, 782]

# Maps each path-list file stem to the suffix used in the exported image name.
# method 1: all bvalues, method 2: omitting b800
SERIES_NAME_SUFFIX = {
    't2': 't2',
    'adccalc2': 'adc_method2',
    'adccalc3': 'adc_method1',
    'b1400calc2': 'dwi_method2',
    'b1400calc3': 'dwi_method1',
}

# Read the T2 path list once; the patient id and the study folder are taken
# from fixed positions (5 and 6) of each '/'-separated path.
with open(path.join(DATA_DIR, "t2.txt"), 'r') as f:
    t2_lines = f.readlines()
pat_ids = [l.split('/')[5] for l in t2_lines]
years = [l.split('/')[6] for l in t2_lines]

PSA_list = []
current_max_PIRADS_list = []
number_of_lesions_list = []
age_list = []
pat_id_list = []
locations_list = []
for img, name_suffix in SERIES_NAME_SUFFIX.items():
    with open(path.join(DATA_DIR, f"{img}.txt"), 'r') as f:
        image_paths = [l.strip() for l in f.readlines()]
    for idx in SELECTED_IDXS:
        print(idx)
        # Only consumed by the disabled export at the bottom of this loop.
        read_img = sitk.ReadImage(image_paths[idx],sitk.sitkFloat32)

        # The per-case metadata does not depend on the series, so the XML
        # files are parsed on the 't2' pass only (the sole pass that uses it).
        if img == 't2':
            marklistpath = f'../../datasets/radboud_new/{pat_ids[idx]}/{years[idx]}/markdatasetlist.xml'
            info_age = f'../../datasets/radboud_new/{pat_ids[idx]}/{years[idx]}/t2_tse_tra/info.xml'
            PSA, current_max_PIRADS, number_of_lesions, locations = parse_marklist(marklistpath)
            age = parse_age(info_age)
            PSA_list.append(PSA)
            current_max_PIRADS_list.append(current_max_PIRADS)
            number_of_lesions_list.append(list(number_of_lesions))
            age_list.append(age)
            locations_list.append(locations)

        # NOTE(review): appended on every series pass, so pat_id_list ends up
        # with one entry per (series, case) while the lists above hold one
        # entry per case -- confirm this is intended.
        pat_id_list.append(str(pat_ids[idx]))

        name = f'{pat_ids[idx]}-{name_suffix}'

        # sitk.WriteImage(read_img,f'{OUTPUT_DIR}{name}.nii.gz')

def _write_lines(file_path, values):
    """Write ``str(value)`` for every item of *values* to *file_path*, one per line."""
    with open(file_path, 'w') as f:
        f.writelines(str(value) + '\n' for value in values)


# A body-less ``for idx in [...]:`` header used to sit here; it made the whole
# file fail to compile and has been removed.
_write_lines('./PSA.txt', PSA_list)
_write_lines('./age.txt', age_list)
# NOTE(review): input() shows the lesion list as its prompt and then blocks
# until Enter is pressed (EOFError without a stdin) -- looks like a leftover
# debugging pause; confirm before running unattended.
input(number_of_lesions_list)
_write_lines('./number_of_lesions.txt', number_of_lesions_list)
_write_lines('./current_max_PIRADS.txt', current_max_PIRADS_list)
_write_lines('./pat_ids.txt', pat_id_list)
_write_lines('./locations.txt', locations_list)

# Stop here: nothing after this statement is executed. ``raise SystemExit`` is
# used instead of the interactive-shell helper ``exit()``.
raise SystemExit
# NOTE(review): the script terminates before reaching this point, so this loop
# never runs. `tqdm`, `num_images`, `args`, `seg_paths`, `preprocess`,
# `IMAGE_SHAPE`, `TARGET_SPACING`, `images` and `segmentations` are not defined
# in the visible part of this file (they may come from the star imports) --
# confirm before re-enabling.
# Read and preprocess each of the paths for each series, and the segmentations.
for img_idx in tqdm(range(num_images)): #[:20]): #for less images
    # One image per requested series, keyed by series name.
    img_s = {}
    for s in args.series:
        img_s[s] = sitk.ReadImage(image_paths[s][img_idx], sitk.sitkFloat32)
    seg_s = sitk.ReadImage(seg_paths[img_idx], sitk.sitkFloat32)

    img_n, seg_n = preprocess(
        img_s,
        seg_s,
        shape=IMAGE_SHAPE,
        spacing=TARGET_SPACING,
    )

    # Collect the preprocessed result under each key returned by preprocess().
    for seq in img_n:
        images[seq].append(img_n[seq])
    segmentations.append(seg_n)

# Split train and validation
# KFold was previously used here purely to split the data 1:9 (no actual cross
# validation was done); that code is kept below, commented out. The split
# indices are now read from a YAML file instead.
# kfold = KFold(10, shuffle=True, random_state=123)
# train_idxs, valid_idxs = list(kfold.split(segmentations))[0]
# train_idxs = list(train_idxs)
# valid_idxs = list(valid_idxs)

# Load the train/validation indices for the requested fold from YAML.
# NOTE(review): unreachable while the script exits earlier; `args` and
# `read_yaml_to_dict` are not defined in the visible part of this file --
# presumably provided by the star imports, confirm.
split_file = f'./../data/Nijmegen paths/train_val_test_idxs_{args.fold}.yml'
yml_paths = read_yaml_to_dict(split_file)
print('test, train paths',yml_paths)
train_idxs, valid_idxs = yml_paths['train_set0'], yml_paths['val_set0']


# Build the "<patient_id>_<study_id>" key of every marksheet row whose
# `lesion_GS` value is not NaN.
df = pandas.read_csv('./marksheet_with_gleason.csv')
gleason_idxs = [idx for idx, i in enumerate(df['lesion_GS'].values) if not pandas.isna(i)]
values = [f"{df['patient_id'].values[i]}_{df['study_id'].values[i]}" for i in gleason_idxs]
print(values)
# Set copy of the keys for constant-time membership tests below.
gleason_cases = set(values)

# read picai paths
files = ['picai_seg_list','picai_adc_list','picai_hbv_list','picai_t2_list']
for file in files:
    with open(f"../../../../datasets/picai/{file}.txt") as f:
        image_paths = [l.strip() for l in f.readlines()]

    # Number of trailing characters dropped from each path before its basename
    # is compared with the case keys: 7 for the segmentation list, 8 for the
    # image lists. Compared with `==`: `is` on a string literal tests identity,
    # not equality.
    # NOTE(review): presumably 7 strips ".nii.gz" and 8 strips a
    # "_<seq>.mha"-style suffix -- confirm against the list files.
    suffix_len = 7 if file == 'picai_seg_list' else 8
    image_paths_gleason = [
        image_path for image_path in image_paths
        if os.path.basename(image_path[:-suffix_len]) in gleason_cases
    ]
    print(len(image_paths_gleason))

    # Write the filtered list next to the script under the same file name.
    with open(f'./{file}.txt', 'w') as f:
        for line in image_paths_gleason:
            f.write(line)
            f.write('\n')