From cbcee6ac2730c3d739f7eb7a602045fa561972ca Mon Sep 17 00:00:00 2001 From: Remco Renken Date: Thu, 24 Nov 2022 14:09:47 +0100 Subject: [PATCH] Please use EndPoint25Mar2022 as the stable branch. Changes made to CREClass.m Here I've added an alternative calculation for the Relevance Boundary. CalcRBShannon is now part of the CREClass.m file. The TestCREClass has been updated to allow testing this new feature. NB: this is work in progress --- CREClass.m | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 59 insertions(+), 1 deletion(-) diff --git a/CREClass.m b/CREClass.m index 42af3ed..9a81f6f 100644 --- a/CREClass.m +++ b/CREClass.m @@ -1,4 +1,7 @@ classdef CREClass < handle + % 23 and 24 Nov 2022 I added a new calculation for the RB based on + % Shanon entropy given a threshold \lambda. I also think there is a + % "mistake" in calculating the RB the classical way. properties Data; % the data set to get the CRE off nBin; %number of bins @@ -111,7 +114,62 @@ classdef CREClass < handle obj.nBin=numel(obj.Edges)-1; end end - function Calc(obj) + function CalcRBShannon(obj) + % 23Nov22 Here I will try to implement a (hopefully better) + % definition of the relevance boundry. The basic idea is + % assume a dataset X= {x_1 ... x_i} X + % set a threshold \lambda NB \lambda \in of the data X + %Shanon entropy for this \lambda will be: + % H(\lamda)=p(x<\lambda)log(p(x=\lambda)log(p(x>=\lambda)) + % Now the expectation value for H will be : + %E(H)=\int p(\lambda)*H(\lambda) d\lambda + if isempty(obj.Data) + warning('nothing to do'); + return; + end + if isempty(obj.Edges);obj.CalcEdges;end + [CP,bin]=histcounts(obj.Data(:),transpose(obj.Edges),'Normalization','cdf'); + dl=transpose(diff(bin)./sum(diff(bin))); % will capture the case with non-equidistant bins as well as equidistant bins + pdf_lambda=histcounts(obj.Data(:),transpose(obj.Edges),'Normalization','pdf'); + CP=transpose(CP); + LogCP=log(CP); + LogCP(~isfinite(LogCP))=0; % see comment below + FC=1-CP; + FC(FC<0)=0; % due to rounding errors, small negative numbers can arrise. These are theoretically not possible. + LogFC=log(FC); + LogFC(~isfinite(LogFC))=0;%the log of 0 is -inf. however in Fc.*logFc it should end up as 0. to avoid conflicts removing the -inf + H=-1*(CP.*LogCP+FC.*LogFC); %Shannon Entropy when a theshold (lambda) is set to the lower bin edge + w=(dl.*pdf_lambda); + w=w./sum(w); % for one reason or the other, the sum is not always equal to 1. + EH=w*H; %calculate expectation value for H + RB_ind=[find(H>=EH,1,'first') find(H>=EH,1,'last')]; + obj.RB=obj.Edges(RB_ind)+dl(RB_ind)/2; + obj.P_RB=FC(RB_ind); + if obj.Monitor + lambda=obj.Edges(1:end-1); + scatter(w,H);xlabel('weight');ylabel('Shannon Entropy');snapnow + plot(lambda,H);xlabel('lambda');ylabel('Shannon Entropy'); + line(lambda(RB_ind(:)),[EH;EH],'linestyle',':');snapnow + histogram(obj.Data(:));snapnow + disp('sum of weights') + disp(sum(dl.*pdf_lambda)) + disp('EH') + disp(EH) + disp('boundry index'); + disp(RB_ind); + disp('boundry lambda value'); + disp(obj.RB); + disp('boundry p-values'); + disp(FC(RB_ind)); + disp('fraction inside interval'); + disp(abs(diff(FC(RB_ind)))); + end + end + function Calc(obj) %calculate the CRE as well as a on-tailed definition of the RB. + % RR note: 23Nov22: I am not sure if I still like this RB + % definition. I keep having a hard time defending it + % theoretically. The code is retained, for now, however for backwards + % compatibility. if isempty(obj.Data) warning('nothing to do'); return;