# fast-mri/scripts/18.build_db.py
#
# 254 lines
# 9.0 KiB
# Python
# Executable File

import os
from matplotlib.pyplot import table
import glob
import SimpleITK as sitk
import pandas as pd
from sqlite3 import Error
import sqlite3
################################ README ######################################
# TODO:
#   - remove the key headers
#   - (select) on the basis of exclusion
#   - local
#   - SSD -> run locally
#   - apply in parallel (listen)
def print_p(text, end="\n"):
    """Print `text` and flush stdout right away.

    Used on Peregrine, where output has to be flushed to show up.

    Parameters:
    `text`: object to print
    `end (string)`: terminator written after `text` (newline by default)
    """
    print(text, end=end, flush=True)
def create_connection(db_file):
    """Open a connection to the SQLite database stored in `db_file`.

    A connection error is printed instead of raised.

    :param db_file: database file
    :return: Connection object, or None when connecting failed
    """
    try:
        return sqlite3.connect(db_file)
    except Error as err:
        print_p(err)
    return None
# Tags whose values are compared between consecutive DICOM files.
KEY_HEADERS = [
    "0010|0020",  # Patient ID
    "0018|0089",  # Number of Phase Encoding Steps
    "0018|0093",  # Percent Sampling
    "0008|103e",  # Series Description
    "0028|0010",  # Rows
    "0028|0011",  # Columns
    "0018|9087",  # Diffusion b-value
    "0018|0024",  # Sequence Name
]
# DICOM tags ("group|element") that are copied into the database, in column
# order. Tags marked IMPORTANT are the ones the original author flagged.
INCLUDE_TAGS = [
    # --- group 0008 ---
    "0008|0005",  # Specific Character Set
    "0008|0008",  # Image Type
    "0008|0012",  # Instance Creation Date
    "0008|0013",  # Instance Creation Time
    "0008|0016",  # SOP Class UID
    "0008|0018",  # SOP Instance UID
    "0008|0020",  # Study Date
    "0008|0021",  # Series Date
    "0008|0022",  # Acquisition Date
    "0008|0023",  # Content Date
    "0008|0030",  # Study Time
    "0008|0031",  # Series Time
    "0008|0032",  # Acquisition Time
    "0008|0033",  # Content Time
    "0008|0050",  # Accession Number
    "0008|0060",  # Modality
    "0008|0070",  # Manufacturer
    "0008|1010",  # Station Name
    "0008|1030",  # Study Description
    "0008|103e",  # Series Description
    "0008|1040",  # Institutional Department Name
    "0008|1090",  # Manufacturer's Model Name
    # --- group 0010 ---
    "0010|0020",  # Patient ID
    "0010|0030",  # Patient's Birth Date
    "0010|0040",  # Patient's Sex
    "0010|1010",  # Patient's Age
    "0010|1020",  # Patient's Size
    "0010|1030",  # Patient's Weight
    "0010|21b0",  # Additional Patient History
    # --- group 0012 ---
    "0012|0062",  # Patient Identity Removed
    "0012|0063",  # De-identification Method
    # --- group 0018 ---
    "0018|0015",  # Body Part Examined
    "0018|0020",  # Scanning Sequence
    "0018|0021",  # Sequence Variant
    "0018|0022",  # Scan Options
    "0018|0023",  # MR Acquisition Type
    "0018|0050",  # Slice Thickness
    "0018|0080",  # Repetition Time
    "0018|0081",  # Echo Time
    "0018|0083",  # Number of Averages
    "0018|0084",  # Imaging Frequency
    "0018|0085",  # Imaged Nucleus
    "0018|0087",  # Magnetic Field Strength
    "0018|0088",  # Spacing Between Slices
    "0018|0089",  # Number of Phase Encoding Steps IMPORTANT
    "0018|0091",  # Echo Train Length
    "0018|0093",  # Percent Sampling IMPORTANT
    "0018|0094",  # Percent Phase Field of View
    "0018|1000",  # Device Serial Number
    "0018|1030",  # Protocol Name IMPORTANT -> sequence type
    "0018|1310",  # Acquisition Matrix IMPORTANT
    "0018|1312",  # In-plane Phase Encoding Direction
    "0018|1314",  # Flip Angle
    "0018|1315",  # Variable Flip Angle Flag
    "0018|5100",  # Patient Position
    "0018|9087",  # Diffusion b-value IMPORTANT
    # --- group 0020 ---
    "0020|000d",  # Study Instance UID
    "0020|000e",  # Series Instance UID
    "0020|0010",  # Study ID
    "0020|0032",  # Image Position (Patient)
    "0020|0037",  # Image Orientation (Patient)
    "0020|0052",  # Frame of Reference UID
    "0020|1041",  # Slice Location
    # --- group 0028 ---
    "0028|0002",  # Samples per Pixel
    "0028|0010",  # Rows IMPORTANT
    "0028|0011",  # Columns IMPORTANT
    "0028|0030",  # Pixel Spacing
    "0028|0100",  # Bits Allocated
    "0028|0101",  # Bits Stored
    "0028|0106",  # Smallest Image Pixel Value
    "0028|0107",  # Largest Image Pixel Value
    "0028|1050",  # Window Center
    "0028|1051",  # Window Width
    # --- group 0040 ---
    "0040|0244",  # Performed Procedure Step Start Date
    "0040|0254",  # Performed Procedure Step Description
]
################################################################################
def get_dict_from_dicom(reader, verbose=False):
    """Collect the INCLUDE_TAGS metadata of `reader` into a dict.

    Parameters:
    `reader`: object offering `GetMetaDataKeys()` and `GetMetaData(key)`
    `verbose (bool)`: print every tag/value pair that is copied

    returns (dict): one entry per tag in INCLUDE_TAGS (the value as a string,
    or None when the reader does not provide the tag) plus an empty "path"
    entry.
    """
    # Every wanted tag gets a slot up front so all results share the same keys.
    collected = dict.fromkeys(INCLUDE_TAGS)
    for tag in reader.GetMetaDataKeys():
        if tag not in INCLUDE_TAGS:
            continue
        value = reader.GetMetaData(tag)
        collected[tag] = f"{value}"
        if verbose:
            print_p(f"({tag}) = \"{value}\"")
    collected["path"] = ""
    return collected
def has_different_key_headers(current_header_dict: dict, prev_header_dict, key_headers=None):
    """Tell whether any key header differs between two DICOM header dicts.

    Parameters:
    `current_header_dict (dict)`: dict from dicom (Headers from DICOM)
    `prev_header_dict (dict)`: dict from dicom (Headers from DICOM)
    `key_headers (iterable of str)`: tags to compare; KEY_HEADERS when omitted

    returns (bool): True if at least one key header has a different value in
    the two dicts, else False. A key header missing from `prev_header_dict`
    is compared as None.
    """
    if key_headers is None:
        key_headers = KEY_HEADERS
    for header in key_headers:
        # A key header that is absent from the current dict cannot be
        # compared, so it is skipped rather than counted as a difference.
        if header not in current_header_dict:
            continue
        if current_header_dict[header] != prev_header_dict.get(header):
            return True
    return False
def is_patient_in_database(conn, tablename, patient):
    """Check whether `tablename` already holds a row for `patient`.

    A row counts as belonging to the patient when its Patient ID column
    ([0010|0020]) contains `patient` as a substring (SQL LIKE '%patient%').

    Parameters:
    `conn`: open sqlite3 connection
    `tablename (string)`: table to look in
    `patient (string)`: patient name to search for

    returns (bool): True if a matching row exists. False if there is none or
    if the table does not exist yet.
    """
    cur = conn.cursor()
    # The patient value is bound as a parameter so quotes in it cannot break
    # the statement. Identifiers cannot be bound, hence the interpolated
    # table name. LIMIT 1: only existence matters, not the rows themselves.
    query = f"SELECT 1 FROM {tablename} WHERE [0010|0020] LIKE ? LIMIT 1;"
    try:
        row = cur.execute(query, (f"%{patient}%",)).fetchone()
    except sqlite3.OperationalError as exc:
        # On a fresh database the table has not been created yet, so no
        # patient can be in it. Any other operational error is re-raised.
        if "no such table" in str(exc):
            return False
        raise
    return row is not None
def fill_dicom_table_RUMC_UMCG(
        tablename: str,
        database: str,
        patients_dir_RUMC: str,
        devmode: bool = False):
    """Fill the given table with headers/tags from the patients' DICOM files.

    For every entry of `patients_dir_RUMC` that is not yet present in the
    table, the DICOM files matching `data/raw/*/<patient>/**/*.dcm` are read.
    The tags are cross referenced with an include list of tags, and a file is
    kept as a row only when its key headers differ from the previously kept
    file. The rows of one patient are appended to the table in one go.

    Parameters:
    `tablename (string)`: table in sqlite that will be inserted into
    `database (string)`: path of the .db (database) file for sqlite, relative
        to the current working directory
    `patients_dir_RUMC (string)`: path where patient directories are stored
    `devmode (bool)`: when True, only the first 11 DICOM files per patient
        are read

    Raises:
    `RuntimeError`: if the database connection could not be opened
    """
    # Join with a real path separator. Leading separators are stripped so
    # that both "db/x.db" and "/db/x.db" resolve below the working directory.
    db_path = os.path.join(os.getcwd(), database.lstrip("/\\"))
    conn = create_connection(db_path)
    if conn is None:
        # create_connection prints the sqlite error and returns None.
        raise RuntimeError(f"could not open database: {db_path}")
    print_p(f"connection made: {db_path}")

    patients = os.listdir(patients_dir_RUMC)
    prev_headers = {}

    try:
        with conn:
            for p_idx, patient in enumerate(patients):
                print_p(f"\nPatient {p_idx}: {patient}")

                if is_patient_in_database(conn, tablename, patient):
                    print_p(f"PATIENT IS ALREADY IN DATABASE {tablename}")
                    continue
                print_p(f"patient: {patient} is not in database")

                # Find all DICOM files.
                # NOTE(review): the search root is hard-coded and independent
                # of `patients_dir_RUMC`; it matches the patient below any
                # sub-directory of data/raw.
                glob_pattern = f"data/raw/*/{glob.escape(patient)}/**/*.dcm"
                dicoms_paths = glob.glob(glob_pattern, recursive=True)

                rows = []
                for f_idx, dcm_path in enumerate(dicoms_paths):
                    # devmode only looks at files 0..10, so stop there.
                    if devmode and f_idx > 10:
                        break
                    print_p(f"f{f_idx}", end=' ')

                    try:
                        reader = sitk.ImageFileReader()
                        reader.SetFileName(dcm_path)
                        reader.LoadPrivateTagsOn()
                        reader.ReadImageInformation()
                    except Exception:
                        # Best effort: an unreadable file is reported and skipped.
                        print_p(f"Read Image Information EXCEPTION... Skipping: {dcm_path}")
                        continue

                    curr_headers = get_dict_from_dicom(reader, verbose=False)
                    curr_headers['path'] = dcm_path

                    # Keep a file only when its key headers changed.
                    if not has_different_key_headers(curr_headers, prev_headers):
                        continue
                    prev_headers = curr_headers
                    rows.append(curr_headers)

                if not rows:
                    # An empty frame has no tag columns; writing it could
                    # create the table without them.
                    print_p(f"\nno DICOM headers collected for patient: {patient}")
                    continue

                df = pd.DataFrame(rows)
                print_p(f"\nwriting headers to sqlite database. {tablename} - num rows: {len(rows)}")
                df.to_sql(name=tablename, con=conn, if_exists='append')
    finally:
        # `with conn` only commits or rolls back; close explicitly.
        conn.close()

    print_p(f"\n--- DONE writing data to {tablename}---")
################################################################################
# Run the import only when executed as a script, not when imported.
if __name__ == "__main__":
    print_p("start script")
    # Only the RUMC directory is passed: fill_dicom_table_RUMC_UMCG takes no
    # `patients_dir_UMCG` argument, so supplying one raises a TypeError.
    fill_dicom_table_RUMC_UMCG(
        tablename="dicom_headers_v2",
        database=r"dicoms_rumc.db",
        patients_dir_RUMC=r"data/raw/RUMC",
        devmode=False)