#!/usr/bin/env python3
"""Flatten the folio-level and page-level classification CSVs into one CSV.

Each input row carries a JSON ``annotations`` column (a list of tasks with a
``task`` id and a ``value``), a JSON ``subject_data`` column and a
``subject_ids`` column.  For every row the script extracts the transcription,
the "symbols" yes/no answer and the file name, repairs rows where the file
name and transcription ended up in the wrong fields, splits page-level
transcriptions into one record per folio marker, and writes everything to
``converted.csv``.
"""
import json
import re

import pandas as pd

# Value of T3 in the annotations JSON: "<digits>_<digits>".
FILE_NAME_REGEX = r'^\d+_\d+$'

# Output columns, filled by main().
conversion_list = {
    'classification_id': [],
    'filename': [],
    'foglio': [],
    'years': [],
    'transcription': [],
    'symbols': [],
}

source_files = [
    {
        'filename': 'source/transcribing-on-folio-level-classifications.csv',
        'level': 'folio',
    },
    {
        'filename': 'source/transcribing-on-page-level-classifications.csv',
        'level': 'page',
    },
]

# A folio marker at the start of a line; used to split page-level text.
FOLIO_REGEX = re.compile(r"\nfol[\. ,]", flags=re.IGNORECASE)

# Optional leading file name followed by whatever else was typed in the field.
MIXED_FIELDS_REGEX = re.compile(
    r"(?P<filename>\d+_\d+)?(?P<rest>.*)", re.MULTILINE | re.DOTALL
)

# "fol. 12r", "fol 3", ... inside a transcription part.
FOLIO_NUMBER_REGEX = re.compile(
    r"fol[\. ]+\d+[a-z]*", re.MULTILINE | re.IGNORECASE
)

# Four-digit years from 1500 up to and including 1899.
YEAR_REGEX = re.compile(r"1[5-8][0-9]{2}")

# Which task id holds which field, per source level.
TASK_FIELDS = {
    'folio': {'T1': 'transcription', 'T2': 'symbols', 'T3': 'filename'},
    'page': {'T0': 'transcription', 'T1': 'symbols'},
}


def parse_tasks(tasks, level):
    """Return ``(transcription, symbols, filename)`` read from *tasks*.

    *tasks* is the decoded ``annotations`` list and *level* is ``'folio'`` or
    ``'page'``.  Fields that have no matching task keep their defaults:
    ``''`` for the two strings and ``None`` for ``symbols`` (so a row can no
    longer inherit the answer of the previous row).  Tasks whose value is
    null are skipped instead of crashing on ``.strip()`` / ``.lower()``.
    """
    transcription = ''
    symbols = None
    filename = ''
    fields = TASK_FIELDS.get(level, {})
    for task in tasks:
        field = fields.get(task['task'])
        if field is None:
            continue
        value = task.get('value')
        if value is None:
            continue
        if field == 'transcription':
            transcription = value.strip()
        elif field == 'symbols':
            # Anything other than a (case-insensitive) "no" counts as yes.
            symbols = value.lower() != 'no'
        else:
            filename = value.strip()
    return transcription, symbols, filename


def repair_filename(filename, transcription):
    """Untangle a file-name field that also (or only) contains other text.

    Returns ``(filename, transcription)``.  When *filename* is already a
    clean ``<digits>_<digits>`` value both arguments are returned unchanged.
    Otherwise a leading ``<digits>_<digits>`` is kept as the file name (``''``
    when there is none) and the remaining text is moved to the transcription:
    it replaces the transcription when the transcription was just the file
    name, and is appended to it otherwise.
    """
    if re.match(FILE_NAME_REGEX, filename):
        return filename, transcription

    # Both groups are optional/greedy, so this always matches at position 0.
    data = MIXED_FIELDS_REGEX.search(filename)
    found = data.group('filename')
    rest = data.group('rest').strip()
    filename = found.strip() if found else ''
    if filename == transcription:
        transcription = rest
    else:
        # NOTE(review): appended without a separator, as the script always did.
        transcription += rest
    return filename, transcription


def filename_from_subject(row):
    """Return the file name stored in the row's ``subject_data`` column.

    The JSON is keyed by the stringified ``subject_ids`` value; the ``.jpg``
    extension is dropped from its ``Filename`` entry.
    """
    subject_data = json.loads(row['subject_data'])
    subject = subject_data[str(row['subject_ids'])]
    return subject['Filename'].replace('.jpg', '').strip()


def split_transcription(transcription, level):
    """Split a transcription into one text per folio.

    Folio-level rows hold exactly one transcription.  Page-level rows are
    split on every folio marker that starts a line; the split consumes the
    marker, so ``'fol.'`` is put back in front of every part after the first.
    The first part never loses a marker and is therefore returned untouched
    (prefixing it as well produced ``'fol.fol. ...'``).
    """
    if level != 'page':
        return [transcription]
    head, *rest = FOLIO_REGEX.split(transcription)
    return [head] + ['fol.' + part for part in rest]


def extract_foglio(text):
    """Return the first folio reference found in *text*, or ``None``."""
    match = FOLIO_NUMBER_REGEX.search(text)
    return match.group(0) if match else None


def extract_years(text):
    """Return the distinct years found in *text*, sorted and comma-joined.

    The result is ``''`` when *text* contains no year.
    """
    return ','.join(sorted(set(YEAR_REGEX.findall(text))))


def convert_row(row, level):
    """Convert one classification row into a list of output records.

    *row* only needs item access for ``classification_id``, ``annotations``,
    ``subject_data`` and ``subject_ids``.  Each returned dict has the same
    keys as ``conversion_list``.
    """
    tasks = json.loads(row['annotations'])
    transcription, symbols, filename = parse_tasks(tasks, level)
    filename, transcription = repair_filename(filename, transcription)

    if not filename:
        # Final attempt: take it from the subject_data column.
        filename = filename_from_subject(row)

    # Sometimes the transcription starts with the file name; drop it.
    if transcription.startswith(filename):
        transcription = transcription[len(filename):].strip()

    records = []
    # NOTE(review): a part without a folio number keeps the number of the
    # previous part of the same row; this carry-over is existing behaviour.
    foglio = None
    for part in split_transcription(transcription, level):
        # Keep every record on a single CSV line.
        part = part.strip().replace('\n', '\\n')
        foglio = extract_foglio(part) or foglio
        records.append({
            'classification_id': row['classification_id'],
            'filename': filename,
            'foglio': foglio,
            'years': extract_years(part),
            'transcription': part,
            'symbols': symbols,
        })
    return records


def main():
    """Read every source CSV, convert its rows and write ``converted.csv``."""
    for source_file in source_files:
        df = pd.read_csv(source_file['filename'], sep=',', quotechar='"')
        for _, row in df.iterrows():
            for record in convert_row(row, source_file['level']):
                for column, value in record.items():
                    conversion_list[column].append(value)

    convert = pd.DataFrame.from_dict(conversion_list)
    convert.to_csv('converted.csv', encoding='utf-8', index=False)


if __name__ == '__main__':
    main()