Initial version
This commit is contained in:
parent
35efaa2ca4
commit
02a744a0f5
109
convert.py
Executable file
109
convert.py
Executable file
@ -0,0 +1,109 @@
|
|||||||
|
#!/usr/bin/env python3
"""Flatten the folio-level and page-level classification CSV exports.

Every row of the source CSVs carries a JSON ``annotations`` column holding
the answers to a number of tasks.  This script extracts the transcription,
the file name and the "symbols" answer from those tasks, splits page-level
transcriptions into one record per folio, derives the folio marker and the
years mentioned in each part, and writes everything to ``converted.csv``.
"""

import json
import re

import pandas as pd

FILE_NAME_REGEX = r'^\d+_\d+$'  # Value of T3 in the annotations JSON

# Column name -> list of values; filled by main() and turned into a DataFrame.
conversion_list = {
    'classification_id': [],
    'filename': [],
    'foglio': [],
    'years': [],
    'transcription': [],
    'symbols': [],
}

# The exports to convert, each tagged with the level it was transcribed on.
source_files = [
    {
        'filename': 'source/transcribing-on-folio-level-classifications.csv',
        'level': 'folio',
    },
    {
        'filename': 'source/transcribing-on-page-level-classifications.csv',
        'level': 'page',
    },
]

# Separator between folios inside one page-level transcription.
FOLIO_REGEX = re.compile(r"\nfol[\. ,]", flags=re.IGNORECASE)

# An optional leading file name followed by whatever else was typed in the field.
_MIXED_FIELD_REGEX = re.compile(
    r"(?P<filename>\d+_\d+)?(?P<rest>.*)", re.MULTILINE | re.DOTALL
)
# Folio marker such as "fol. 12r" or "fol 3".
_FOLIO_NUMBER_REGEX = re.compile(r"fol[\. ]+\d+[a-z]*", re.MULTILINE | re.IGNORECASE)
# Four-digit years from 1500 up to and including 1899.
_YEAR_REGEX = re.compile(r"1[5-8][0-9]{2}", re.MULTILINE | re.IGNORECASE)


def _read_tasks(tasks, level):
    """Return ``(transcription, filename, symbols)`` taken from the task answers.

    On folio level T1 is the transcription, T2 the symbols answer and T3 the
    file name.  On page level T0 is the transcription and T1 the symbols
    answer; there is no file name task.  ``symbols`` is False when the answer
    is "no" (case-insensitive), True for any other answer, and None when the
    row has no symbols answer at all.
    """
    transcription = ''
    filename = ''
    # Initialised for every row so that a missing answer neither raises a
    # NameError nor reuses the value of the previously processed row.
    symbols = None

    for task in tasks:
        task_id = task['task']
        if level == 'folio':
            if task_id == 'T1':
                transcription = task['value'].strip()
            elif task_id == 'T2':
                symbols = task['value'].lower() != 'no'
            elif task_id == 'T3':
                filename = task['value'].strip()
        elif level == 'page':
            if task_id == 'T0':
                transcription = task['value'].strip()
            elif task_id == 'T1':
                symbols = task['value'].lower() != 'no'

    return transcription, filename, symbols


def _untangle_filename(filename, transcription):
    """Repair rows where the file name field holds more than a file name.

    Returns the cleaned ``(filename, transcription)`` pair.  A leading
    ``<digits>_<digits>`` is kept as the file name (empty when absent); the
    remaining text replaces the transcription when the transcription merely
    repeats the file name, and is appended to it otherwise.
    """
    if re.match(FILE_NAME_REGEX, filename):
        return filename, transcription

    # Somebody mixed up the fields.... :(
    # Both groups of the pattern are optional, so this search always matches.
    data = _MIXED_FIELD_REGEX.search(filename)
    filename = (data.group('filename') or '').strip()
    rest = data.group('rest').strip()
    if filename == transcription:
        transcription = rest
    else:
        transcription += rest
    return filename, transcription


def _split_parts(transcription, level):
    """Split a transcription into one cleaned-up part per folio.

    Page-level transcriptions are split on ``FOLIO_REGEX``; folio-level ones
    always yield exactly one part.  Newlines inside a part are replaced by
    the two characters ``\\n`` so every part stays on a single CSV line.
    """
    if level == 'page':
        # Several folios are stored in one transcription in the page-level CSV.
        pieces = FOLIO_REGEX.split(transcription)
    else:
        # Only 1 transcription per line!
        pieces = [transcription]

    parts = []
    for position, piece in enumerate(pieces):
        # The splitter swallows the "fol" marker of every part except the
        # first one, so only those later parts need the marker put back.
        # Prefixing the first part as well would yield "fol.fol. ...".
        if position > 0:
            piece = 'fol.' + piece
        parts.append(piece.strip().replace('\n', '\\n'))
    return parts


def _extract_foglio(part):
    """Return the folio marker found in *part*, or None when there is none."""
    found = _FOLIO_NUMBER_REGEX.search(part)
    return found.group(0) if found else None


def _extract_years(part):
    """Return the distinct years in *part*, sorted and comma separated."""
    return ','.join(sorted(set(_YEAR_REGEX.findall(part))))


def convert_row(row, level):
    """Convert one source row into a list of output records.

    *row* must support item access for ``classification_id``, ``annotations``
    and, when no file name can be taken from the annotations,
    ``subject_data`` and ``subject_ids``.  *level* is ``'folio'`` or
    ``'page'``.  Each returned dict has the same keys as ``conversion_list``.
    """
    # Convert data from JSON to dict
    tasks = json.loads(row['annotations'])
    transcription, filename, symbols = _read_tasks(tasks, level)
    filename, transcription = _untangle_filename(filename, transcription)

    if filename == '':
        # Final attempt, get it from the CSV column subject_data
        subject_data = json.loads(row['subject_data'])
        subject = subject_data[str(row['subject_ids'])]
        filename = subject['Filename'].replace('.jpg', '').strip()

    # Somehow, sometimes, the transcription is starting with the filename.
    # So remove that from the transcription.
    if transcription.startswith(filename):
        transcription = transcription[len(filename):].strip()

    return [
        {
            # Store the original classification_id
            'classification_id': row['classification_id'],
            'filename': filename,
            # Looked up per part, so a part without a folio number does not
            # inherit the number of the part before it.
            'foglio': _extract_foglio(part),
            'years': _extract_years(part),
            'transcription': part,
            'symbols': symbols,
        }
        for part in _split_parts(transcription, level)
    ]


def main():
    """Read every source file, convert all rows and write ``converted.csv``."""
    for source_file in source_files:
        df = pd.read_csv(source_file['filename'], sep=',', quotechar='"')
        for _, row in df.iterrows():
            for record in convert_row(row, source_file['level']):
                for column, value in record.items():
                    conversion_list[column].append(value)

    convert = pd.DataFrame.from_dict(conversion_list)
    convert.to_csv('converted.csv', encoding='utf-8', index=False)


if __name__ == '__main__':
    main()
|
19849
source/transcribing-on-folio-level-classifications.csv
Normal file
19849
source/transcribing-on-folio-level-classifications.csv
Normal file
File diff suppressed because one or more lines are too long
1093
source/transcribing-on-page-level-classifications.csv
Normal file
1093
source/transcribing-on-page-level-classifications.csv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user