import os
import sys
import configparser
from collections import Counter

import pypyodbc
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

currDir = 'C:\\Users\\Aki\\source\\repos\\rug_VS\\dialect_identification\\dialect_identification'
sys.path.append(os.path.join(os.path.dirname(sys.path[0]), currDir))
import dataManipulation as mani
import evaluation as eval  # NOTE: shadows the built-in eval(); not used in this script.
import speaker_based_functions as sb_func


######################
##  USER SETTINGS  ##
######################
sentenceNumMax = 10
configFile = currDir + '\\config.ini'
dirOut = currDir + '\\result'

# make train/test set: 1, load existing: 0
makeTrainTestSet = 0
# convert 3 regions to 2 regions: 1, load existing: 0
conv3to2region = 0

# experiment type
# 0: 3 regions
# 1: Saxon vs Limburg
# 2: Groningen vs Limburg
experiment_type = 2

regionLabels = ['Groningen_and_Drenthe', 'Limburg', 'Oost_Overijsel-Gelderland']

assert experiment_type in (0, 1, 2), "experiment_type should be 0, 1 or 2."

# region labels for the 2-region experiments. For experiment_type == 1 the two
# Low Saxon regions are merged further below but kept under the
# 'Groningen_and_Drenthe' label, so the same label list serves both experiments.
#regionLabels2 = ['Low_Saxon', 'Limburg']
regionLabels2 = ['Groningen_and_Drenthe', 'Limburg']


##########################
##  DATA PREPARATION  ##
##########################

## load init file
config = configparser.ConfigParser()
config.read(configFile)
dirFeature = config['sentence_based']['dirFeature']
fileMDB = config['sentence_based']['fileMDB']

## database connection
pypyodbc.lowercase = False
param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + fileMDB + ";"
conn = pypyodbc.connect(param)
cursor = conn.cursor()

## get data from the Access database
# data format
# 0: filename
# 1: pid
# 2: region
# 3: ID (unique word_id)
# 4: sentence_id
# 5: word_id
# 6: word
# 7: pronunciation
SQL_string = """\
{CALL dataset_with_cities}
"""
cursor.execute(SQL_string)
rows = cursor.fetchall()
data = np.array(rows)
#dataNumMax = data.shape[0]
#uniqueWordIDmax = max(data[:, 3].astype(int))
del SQL_string, rows

## make a list of LabelBinarizer objects, one per word.
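# Illustrative aside (not part of the pipeline): LabelBinarizer one-hot encodes the
# pronunciation variants of a single word. The pronunciations below are made up;
# column order follows the alphabetically sorted LB.classes_.
#   LB_demo = preprocessing.LabelBinarizer()
#   LB_demo.fit(['hoes', 'huis', 'huus'])
#   LB_demo.transform(['huus'])   # -> array([[0, 0, 1]])
# The loop below builds one such binarizer per unique word ID, so every word gets
# its own feature space of pronunciation variants.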
# for X
# get pronvarList from the Access database
# pronvarList format
# 0: ID (unique word_id)
# 1: word
# 2: pronvar
SQL_string = """\
{CALL pronunciation_variant}
"""
cursor.execute(SQL_string)
rows = cursor.fetchall()
pronvarList = np.array(rows)
del SQL_string, rows

LBlist = []
#uniqueWordIDlist = pronvarList[:, 0].astype(int)
uniqueWordIDlist = data[:, 3].astype(int)
uniqueWordIDmax = max(uniqueWordIDlist)
for uniqueWordID in range(1, uniqueWordIDmax + 1):
    pronvar = data[uniqueWordIDlist == uniqueWordID, 7]
    #pronvar = pronvarList[pronvarList[:, 0] == uniqueWordID, 2]
    LB = preprocessing.LabelBinarizer()
    LB.fit(np.unique(pronvar))
    LBlist.append(LB)

# for y (= region)
LE_y = preprocessing.LabelEncoder()
LE_y.fit(regionLabels)
LE_y2 = preprocessing.LabelEncoder()
LE_y2.fit(regionLabels2)
LB_y = preprocessing.LabelBinarizer()
LB_y.fit(regionLabels)
LB_y2 = preprocessing.LabelBinarizer()
LB_y2.fit(regionLabels2)

del uniqueWordID, uniqueWordIDmax, pronvar, LB


#################
##  ITERATION  ##
#################
#CM_majority = np.zeros((1, 9)).astype(int)
#CM_weighted = np.zeros((1, 9)).astype(int)
#for iter in range(0, 1):
#    print(iter)

## make a balanced dataset
pidlist = np.unique(data[:, (1, 2)], axis=0)

# count number of samples per region
pidlistCounter = Counter(pidlist[:, 1])
sampleNumMax = min(pidlistCounter.values())
del pidlistCounter

## make train/eval/test sets, or load existing ones
if makeTrainTestSet == 1:
    pidlist_train = []
    pidlist_eval = []
    pidlist_test = []
    for regionNum in range(0, len(regionLabels)):
        regionName = regionLabels[regionNum]
        pidlist_per_region_ = pidlist[pidlist[:, 1] == regionName, :]
        pidlist_per_region, idx = mani.extractRandomSample(
            pidlist_per_region_, sampleNumMax)

        # split dataset into train, eval and test.
        [pidlist_per_region_train, pidlist_per_region_test] = train_test_split(
            pidlist_per_region, test_size=0.2, random_state=0)
        [pidlist_per_region_train, pidlist_per_region_eval] = train_test_split(
            pidlist_per_region_train, test_size=0.1, random_state=0)

        # append numpy arrays
        if regionNum == 0:
            pidlist_train = pidlist_per_region_train
            pidlist_eval = pidlist_per_region_eval
            pidlist_test = pidlist_per_region_test
        else:
            pidlist_train = np.r_[pidlist_train, pidlist_per_region_train]
            pidlist_eval = np.r_[pidlist_eval, pidlist_per_region_eval]
            pidlist_test = np.r_[pidlist_test, pidlist_per_region_test]

    del regionNum, regionName
    del pidlist_per_region_, pidlist_per_region, idx
    del pidlist_per_region_train, pidlist_per_region_eval, pidlist_per_region_test

    np.save(dirOut + "\\pidlist_train.npy", pidlist_train)
    np.save(dirOut + "\\pidlist_eval.npy", pidlist_eval)
    np.save(dirOut + "\\pidlist_test.npy", pidlist_test)
else:
    pidlist_train = np.load(dirOut + "\\pidlist_train.npy")
    pidlist_eval = np.load(dirOut + "\\pidlist_eval.npy")
    pidlist_test = np.load(dirOut + "\\pidlist_test.npy")

## make the dataset for 2 regions, or load existing ones
if conv3to2region == 1:
    pidlist2_train_ = np.r_[pidlist_train, pidlist_eval]

    if experiment_type == 1:
        pidlist2_train = sb_func.saxon_vs_limburg(pidlist2_train_)
        pidlist2_test = sb_func.saxon_vs_limburg(pidlist_test)
        np.save(dirOut + "\\pidlist2_saxon_vs_limburg_train", pidlist2_train)
        np.save(dirOut + "\\pidlist2_saxon_vs_limburg_test", pidlist2_test)
    elif experiment_type == 2:
        pidlist2_train = sb_func.groningen_vs_limburg(pidlist2_train_)
        pidlist2_test = sb_func.groningen_vs_limburg(pidlist_test)
        np.save(dirOut + "\\pidlist2_groningen_vs_limburg_train", pidlist2_train)
        np.save(dirOut + "\\pidlist2_groningen_vs_limburg_test", pidlist2_test)
    del pidlist2_train_
else:
    if experiment_type == 1:
        pidlist2_train = np.load(dirOut + "\\pidlist2_saxon_vs_limburg_train.npy")
        pidlist2_test = np.load(dirOut + "\\pidlist2_saxon_vs_limburg_test.npy")
    elif experiment_type == 2:
        pidlist2_train = np.load(dirOut + "\\pidlist2_groningen_vs_limburg_train.npy")
        pidlist2_test = np.load(dirOut + "\\pidlist2_groningen_vs_limburg_test.npy")

## train/test data
if experiment_type == 0:
    # Groningen vs Overijsel vs Limburg
    data_train = sb_func.extractPid(pidlist_train, data)
    data_eval = sb_func.extractPid(pidlist_eval, data)
    data_test = sb_func.extractPid(pidlist_test, data)
elif experiment_type == 1 or experiment_type == 2:
    data2 = np.array(data)
    if experiment_type == 1:
        # merge the two Low Saxon regions under the regionLabels2[0] label.
        for row, row2 in zip(data, data2):
            if row[2] == regionLabels[0] or row[2] == regionLabels[2]:
                row2[2] = regionLabels2[0]
    data2_train = sb_func.extractPid(pidlist2_train, data2)
    data2_test = sb_func.extractPid(pidlist2_test, data2)


#####################################
##  EXPERIMENTS START FROM HERE  ##
#####################################

## actual training
# train vs eval
#trainData = data_train
#testData = data_eval
#testPID = pidlist_eval
#LB = LB_y
#LE = LE_y
#regionLabels = regionLabels3

# train+eval vs test
if experiment_type == 0:
    trainData = np.r_[data_train, data_eval]
    testData = data_test
    testPID = pidlist_test
    LB = LB_y
    LE = LE_y
elif experiment_type == 1 or experiment_type == 2:
    # 2 regions: Saxon vs Limburg / Groningen vs Limburg
    trainData = data2_train
    testData = data2_test
    testPID = pidlist2_test
    LB = LB_y2
    LE = LE_y2
    regionLabels = regionLabels2

# check the number of utterances per region
allData = np.r_[trainData, testData]
filenames = np.c_[allData[:, 0], allData[:, 2]]
filenames_unique = np.unique(filenames, axis=0)
print(Counter(filenames_unique[:, 1]))

fileComparison = dirOut + "\\algorithm_comparison.csv"
filePerformance = dirOut + "\\sentence-level.csv"
fileConfusionMatrix = dirOut + "\\confusion_matrix.csv"

## compare classification algorithms for the sentence-level classifiers.
#sb_func.compare_sentence_level_classifiers(trainData, LBlist, LE, fileComparison)

## train the sentence-level classifiers.
modelList, scoreList, confusionMatrixList = sb_func.train_sentence_level_classifiers(
    trainData, LBlist, LE, filePerformance)
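# Rough sketch of what a single sentence-level classifier is assumed to look like;
# the actual implementation lives in speaker_based_functions.py and may differ.
# Presumably the one-hot pronunciation encodings of the words in a sentence are
# concatenated into one feature vector per utterance (pron, word_ids, regions and
# SomeSklearnClassifier below are placeholders):
#   X = np.concatenate([LBlist[i].transform(pron[:, i]) for i in word_ids], axis=1)
#   y = LE.transform(regions)                 # integer region label per utterance
#   model = SomeSklearnClassifier().fit(X, y)
#   cross_val_score(model, X, y, cv=5)        # scoreList presumably stores such scores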
## per-sentence predictions on the test data, one per sentence-level classifier.
pred_per_sentence = sb_func.prediction_per_sentence(testData, modelList, LBlist, LE)

## combine the sentence-level classifiers: majority vote per speaker (pid)
pred_per_pid_majority = sb_func.prediction_per_pid_majority(testPID, pred_per_sentence)

## majority vote (weighted)
#weight = sb_func.calc_weight(confusionMatrixList)
#pred_per_pid_weighted = sb_func.prediction_per_pid_weighted(testPID, pred_per_sentence, weight, LB, LE)

### confusion matrix
if experiment_type == 0:
    confusionMatrix_majority = confusion_matrix(
        pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2],
        labels=['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg'])
else:
    confusionMatrix_majority = confusion_matrix(
        pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2],
        labels=['Groningen_and_Drenthe', 'Limburg'])
#confusionMatrix_weighted = confusion_matrix(
#    pred_per_pid_weighted[:, 1], pred_per_pid_weighted[:, 2], labels=regionLabels)

## output
accuracy = accuracy_score(pred_per_pid_majority[:, 1], pred_per_pid_majority[:, 2],
                          normalize=True, sample_weight=None)
print('accuracy: {}%'.format(accuracy * 100))
cm = confusionMatrix_majority
print(cm)

np.save(dirOut + "\\pred_per_pid.npy", pred_per_pid_majority)
np.save(dirOut + "\\confusion_matrix.npy", cm)

#fout = open(fileConfusionMatrix, "w")
#fout.write('< confusion matrix for majority vote in evaluation set >\n')
#sb_func.outputConfusionMatrix33('fout', 'confusionMatrix_majority', regionLabels)
#fout.write('< confusion matrix for weighted vote in evaluation set >\n')
#sb_func.outputConfusionMatrix33('fout', 'confusionMatrix_weighted', regionLabels)
#fout.write('\n')
#fout.close()

##### iteration finish #####

conn.close()

#np.savetxt(dirOut + '\\cm_majority.csv', CM_majority, delimiter=',')
#np.savetxt(dirOut + '\\cm_weighted.csv', CM_weighted, delimiter=',')
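# Optional extra (not in the original pipeline): a minimal sketch for visualising the
# confusion matrix with the matplotlib import above. It assumes the axis order of cm
# matches the `labels` list passed to confusion_matrix() and that the first column of
# pred_per_pid_majority holds the reference region.
#   labels = (['Groningen_and_Drenthe', 'Oost_Overijsel-Gelderland', 'Limburg']
#             if experiment_type == 0 else ['Groningen_and_Drenthe', 'Limburg'])
#   fig, ax = plt.subplots()
#   im = ax.imshow(cm, cmap='Blues')
#   ax.set_xticks(range(len(labels)))
#   ax.set_xticklabels(labels, rotation=45, ha='right')
#   ax.set_yticks(range(len(labels)))
#   ax.set_yticklabels(labels)
#   fig.colorbar(im, ax=ax)
#   fig.savefig(dirOut + "\\confusion_matrix.png", bbox_inches='tight')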