Initial version

2020-11-03 12:14:39 +01:00
parent 35efaa2ca4
commit 02a744a0f5
3 changed files with 21051 additions and 0 deletions
--- a/convert.py
+++ b/convert.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python3
+"""Flatten the folio- and page-level classification CSVs into converted.csv.
+
+Each output row holds one transcription (page-level rows are split per folio)
+together with the filename, folio marker, years and symbols flag found for it.
+"""
+
+import pandas as pd
+import json
+import re
+
+FILE_NAME_REGEX = r'^\d+_\d+$' # Value of T3 in the annotations JSON
+conversion_list = {'classification_id': [], 'filename':[], 'foglio':[], 'years': [], 'transcription':[], 'symbols': []}
+
+source_files = []
+source_files.append({
+    'filename' : 'source/transcribing-on-folio-level-classifications.csv',
+    'level'    : 'folio'
+})
+source_files.append({
+    'filename' : 'source/transcribing-on-page-level-classifications.csv',
+    'level'    : 'page'
+})
+
+# Page-level transcriptions are split at every newline followed by "fol" and one of ". ,"
+FOLIO_REGEX = re.compile(r"\nfol[\. ,]", flags=re.IGNORECASE)
+YEAR_REGEX = re.compile(r"1[5-8][0-9]{2}") # Any 4-digit run from 1500 up to 1899
+
+for source_file in source_files:
+    df = pd.read_csv(source_file['filename'], sep=',', quotechar='"')
+
+    for index, row in df.iterrows():
+        # Reset every per-row value so nothing leaks in from the previous row
+        foglio = None
+        symbols = None
+        # Store the original classification_id
+        classification_id = row['classification_id']
+        transcription = ''
+        filename = ''
+
+        # Convert data from JSON to dict
+        tasks = json.loads(row['annotations'])
+        for task in tasks:
+            if 'folio' == source_file['level']:
+                if 'T1' == task['task']:
+                    transcription = task['value'].strip()
+                elif 'T2' == task['task']:
+                    symbols = 'no' != task['value'].lower()
+                elif 'T3' == task['task']:
+                    filename = task['value'].strip()
+            elif 'page' == source_file['level']:
+                if 'T0' == task['task']:
+                    transcription = task['value'].strip()
+                elif 'T1' == task['task']:
+                    symbols = 'no' != task['value'].lower()
+
+        if not re.match(FILE_NAME_REGEX, filename):
+            # Somebody mixed up the fields: peel an optional leading
+            # "<digits>_<digits>" filename off and treat the rest as transcription text
+            data = re.search(r"(?P<filename>\d+_\d+)?(?P<rest>.*)",filename,re.MULTILINE | re.DOTALL)
+
+            if data:
+                filename = '' if not data.group('filename') else data.group('filename').strip()
+                if filename == transcription:
+                    transcription = data.group('rest').strip()
+                else:
+                    transcription += data.group('rest').strip()
+
+        if '' == filename:
+            # Final attempt, get it from the CSV column subject_data
+            subject_data = json.loads(row['subject_data'])
+            filename = subject_data[str(row['subject_ids'])]['Filename'].replace('.jpg','').strip()
+
+        # Somehow, sometimes, the transcription is starting with the filename. So remove that from the transcription
+        if transcription.startswith(filename):
+            transcription = transcription[len(filename):].strip()
+
+        if 'page' == source_file['level']:
+            # Here we split the transcription in multiple transcriptions due to the way of storing this data in the CSV
+            transcriptions = FOLIO_REGEX.split(transcription)
+        else:
+            # Only 1 transcription per line!
+            transcriptions = [transcription]
+
+        for part_index, transcription_part in enumerate(transcriptions):
+            # The split removes the "fol" marker in front of every part after the first, so put it back
+            # NOTE(review): part 0 still has its own "fol", so it becomes "fol.fol..." - confirm this is intended
+            if part_index == 0 and transcription.strip().lower().startswith('fol') or part_index > 0:
+                transcription_part = 'fol.' + transcription_part
+
+            transcription_part = transcription_part.strip().replace('\n','\\n')
+
+            # Get folio number; a part without one keeps the value found in an earlier part of the same row
+            data = re.search(r"fol[\. ]+\d+[a-z]*",transcription_part,re.MULTILINE | re.IGNORECASE)
+            if data:
+                foglio = data.group(0)
+
+            # Get years: distinct matches, sorted and comma separated (empty string when there are none)
+            years = ','.join(sorted({match.group() for match in YEAR_REGEX.finditer(transcription_part)}))
+
+            conversion_list['classification_id'].append(classification_id)
+            conversion_list['filename'].append(filename)
+            conversion_list['foglio'].append(foglio)
+            conversion_list['years'].append(years)
+            conversion_list['transcription'].append(transcription_part)
+            conversion_list['symbols'].append(symbols)
+
+convert = pd.DataFrame.from_dict(conversion_list)
+convert.to_csv('converted.csv', encoding='utf-8', index=False)
--- a/source/transcribing-on-folio-level-classifications.csv
+++ b/source/transcribing-on-folio-level-classifications.csv
--- a/source/transcribing-on-page-level-classifications.csv
+++ b/source/transcribing-on-page-level-classifications.csv