fast-mri/scripts/18.build_db.py

import os
from matplotlib.pyplot import table
import glob
import SimpleITK as sitk
import pandas as pd
from sqlite3 import Error
import sqlite3

################################  README  ######################################
# TO DO -
# key headers weghalen
# op basis van exclusie
# lokaal
# ssd -> lokaal draaien
# apply parrallel (listen)

def print_p(text, end="\n"):
    """ Print function for on Peregrine. It needs a flush before printing. """
    print(text, flush=True, end=end)

def create_connection(db_file):
    """ create a database connection to the SQLite database
        specified by the db_file
    :param db_file: database file
    :return: Connection object or None
    """
    conn = None
    try:
        conn = sqlite3.connect(db_file)
    except Error as e:
        print_p(e)

    return conn

KEY_HEADERS = [
    "0010|0020",
    "0018|0089",
    "0018|0093",
    "0008|103e",
    "0028|0010",
    "0028|0011",
    "0018|9087",
    "0018|0024"    # Sequence Name
    ]


INCLUDE_TAGS = [    # Attributes
    "0008|0005",    # Specific Character Set
    "0008|0008",    # Image Type
    "0008|0012",    # Instance Creation Date
    "0008|0013",    # Instance Creation Time
    "0008|0016",    # SOP Class UID
    "0008|0018",    # SOP Instance UID
    "0008|0020",    # Study Date
    "0008|0021",    # Series Date
    "0008|0022",    # Acquisition Date
    "0008|0023",    # Content Date
    "0008|0030",    # Study Time
    "0008|0031",    # Series Time
    "0008|0032",    # Acquisition Time
    "0008|0033",    # Content Time
    "0008|0050",    # Accession Number
    "0008|0060",    # Modality
    "0008|0070",    # Manufacturer
    "0008|1010",    # Station Name
    "0008|1030",    # Study Description
    "0008|103e",    # Series Description
    "0008|1040",    # Institutional Department Name
    "0008|1090",    # Manufacturer's Model Name
    "0010|0020",    # Patient ID
    "0010|0030",    # Patient's Birth Date
    "0010|0040",    # Patient's Sex
    "0010|1010",    # Patient's Age
    "0010|1020",    # Patient's Size
    "0010|1030",    # Patient's Weight
    "0010|21b0",    # Additional Patient History
    "0012|0062",    # Patient Identity Removed
    "0012|0063",    # De-identification Method
    "0018|0015",    # Body Part Examined
    "0018|0020",    # Scanning Sequence
    "0018|0021",    # Sequence Variant
    "0018|0022",    # Scan Options
    "0018|0023",    # MR Acquisition Type
    "0018|0050",    # Slice Thickness
    "0018|0080",    # Repetition Time
    "0018|0081",    # Echo Time
    "0018|0083",    # Number of Averages
    "0018|0084",    # Imaging Frequency
    "0018|0085",    # Imaged Nucleus
    "0018|0087",    # Magnetic Field Strength
    "0018|0088",    # Spacing Between Slices
    "0018|0089",    # Number of Phase Encoding Steps  IMPORTANT
    "0018|0091",    # Echo Train Length
    "0018|0093",    # Percent Sampling                IMPORTANT
    "0018|0094",    # Percent Phase Field of View
    "0018|1000",    # Device Serial Number
    "0018|1030",    # Protocol Name                   IMPORTANT -> sequence type
    "0018|1310",    # Acquisition Matrix              IMPORTANT
    "0018|1312",    # In-plane Phase Encoding Direction
    "0018|1314",    # Flip Angle
    "0018|1315",    # Variable Flip Angle Flag
    "0018|5100",    # Patient Position
    "0018|9087",    # Diffusion b-value               IMPORTANT
    "0020|000d",    # Study Instance UID
    "0020|000e",    # Series Instance UID
    "0020|0010",    # Study ID
    "0020|0032",    # Image Position (Patient)
    "0020|0037",    # Image Orientation (Patient)
    "0020|0052",    # Frame of Reference UID
    "0020|1041",    # Slice Location
    "0028|0002",    # Samples per Pixel
    "0028|0010",    # Rows                            IMPORTANT
    "0028|0011",    # Columns                         IMPORTANT
    "0028|0030",    # Pixel Spacing
    "0028|0100",    # Bits Allocated
    "0028|0101",    # Bits Stored
    "0028|0106",    # Smallest Image Pixel Value
    "0028|0107",    # Largest Image Pixel Value
    "0028|1050",    # Window Center
    "0028|1051",    # Window Width
    "0040|0244",    # Performed Procedure Step Start Date
    "0040|0254"     # Performed Procedure Step Description
    ]


################################################################################


def get_dict_from_dicom(reader, verbose=False):
    headers = {}
    for header_name in INCLUDE_TAGS:
        headers[header_name] = None

    for k in reader.GetMetaDataKeys():
        if k in INCLUDE_TAGS:
            v = reader.GetMetaData(k)
            headers[k] = f"{v}"
            if verbose:
                print_p(f"({k}) = \"{v}\"")
    headers["path"] = ""
    return headers


def has_different_key_headers(current_header_dict: dict, prev_header_dict):
    """ This function returns False if one of the key headers is different in
    both dictionaries supplied as arguments.

    Parameters:
    `current_header_dict (dict)`: dict from dicom (Headers from DICOM)
    `prev_header_dict (dict)`: dict from dicom (Headers from DICOM)
    returns (bool): True if important headers are different, else False
    """
    for header in KEY_HEADERS:
        try:
            if current_header_dict[header] != prev_header_dict.get(header, None):
                return True
        except:
            continue
    return False


def is_patient_in_database(conn, tablename, patient):
    # Get all results from patient from database
    cur = conn.cursor()
    query = f"SELECT [0010|0020] FROM {tablename} WHERE [0010|0020] like '%{patient}%';"
    result = cur.execute(query).fetchall() #list of tuples
    if len(result) == 0:
        return False
    return True


def fill_dicom_table_RUMC_UMCG(
    tablename: str,
    database: str,
    patients_dir_RUMC: str,
    devmode = False):
    """ Fills the given table with headers/tags from DICOM files from UMCG and
    RUMC. The tags are cross referenced with an include list of tags.

    Parameters:
    `tablename (string)`: table in sqlite that will be inserted into
    `database (string)`: relative project path to .db (database) file for sqlite.
    `patients_dir_RUMC (string)`: path where patient directories are stored (RUMC)
    `patients_dir_UMCG (string)`: path where patient directories are stored (UMCG)
    """

    # Connect with database
    db_path = f"{os.getcwd()}{database}"
    conn = create_connection(db_path)
    print_p(f"connection made: {db_path}")

    # patients = os.listdir(patients_dir_UMCG) + os.listdir(patients_dir_RUMC)
    patients = os.listdir(patients_dir_RUMC)
    prev_headers = {}

    with conn:
        # Drop all rows from table if it exists.
        if False:
            conn.execute(f"DELETE FROM {tablename};")
            print_p("done deleting all records from database")

        # loop over all patients. (RUMC and UMCG)
        for p_idx, patient in enumerate(patients):

            print_p(f"\nPatient {p_idx}: {patient}")

            if is_patient_in_database(conn, tablename, patient):
                print_p(f"PATIENT IS ALREADY IN DATABASE {tablename}")
                continue

            print_p(f"patient: {patient} is not in database")
            # Find all DICOM files
            glob_pattern = f"data/raw/*/{patient}/**/*.dcm"
            dicoms_paths = glob.glob(glob_pattern, recursive=True)
            rows = []

            # Loop over DICOM files
            for f_idx, dcm_path in enumerate(dicoms_paths):
                if f_idx > 10 and devmode:
                    continue
                print_p(f"f{f_idx}", end=' ')

                try:
                    reader = sitk.ImageFileReader()
                    reader.SetFileName(dcm_path)
                    reader.LoadPrivateTagsOn()
                    reader.ReadImageInformation()

                except:
                    print_p(f"Read Image Information EXCEPTION... Skipping: {dcm_path}")
                    continue

                curr_headers = get_dict_from_dicom(reader, verbose=False)
                curr_headers['path'] = dcm_path
                if not has_different_key_headers(curr_headers, prev_headers):
                    continue
                prev_headers = curr_headers
                rows.append(curr_headers)

            df = pd.DataFrame.from_dict(rows, orient='columns')
            print_p(f"\nwriting headers to sqlite database. {tablename} - num rows: {len(rows)}")
            df.to_sql(name=tablename, con=conn, if_exists='append')

    print_p(f"\n--- DONE writing data to {tablename}---")


################################################################################
print_p("start script")
fill_dicom_table_RUMC_UMCG(
    tablename = "dicom_headers_v2",
    database = r"dicoms_rumc.db",
    patients_dir_RUMC = r"data/raw/RUMC",
    patients_dir_UMCG = r"data/raw/UMCG",
    devmode=False)