diff --git a/CREClass.m b/CREClass.m index 42af3ed..9a81f6f 100644 --- a/CREClass.m +++ b/CREClass.m @@ -1,4 +1,7 @@ classdef CREClass < handle + % 23 and 24 Nov 2022 I added a new calculation for the RB based on + % Shanon entropy given a threshold \lambda. I also think there is a + % "mistake" in calculating the RB the classical way. properties Data; % the data set to get the CRE off nBin; %number of bins @@ -111,7 +114,62 @@ classdef CREClass < handle obj.nBin=numel(obj.Edges)-1; end end - function Calc(obj) + function CalcRBShannon(obj) + % 23Nov22 Here I will try to implement a (hopefully better) + % definition of the relevance boundry. The basic idea is + % assume a dataset X= {x_1 ... x_i} X + % set a threshold \lambda NB \lambda \in of the data X + %Shanon entropy for this \lambda will be: + % H(\lamda)=p(x<\lambda)log(p(x=\lambda)log(p(x>=\lambda)) + % Now the expectation value for H will be : + %E(H)=\int p(\lambda)*H(\lambda) d\lambda + if isempty(obj.Data) + warning('nothing to do'); + return; + end + if isempty(obj.Edges);obj.CalcEdges;end + [CP,bin]=histcounts(obj.Data(:),transpose(obj.Edges),'Normalization','cdf'); + dl=transpose(diff(bin)./sum(diff(bin))); % will capture the case with non-equidistant bins as well as equidistant bins + pdf_lambda=histcounts(obj.Data(:),transpose(obj.Edges),'Normalization','pdf'); + CP=transpose(CP); + LogCP=log(CP); + LogCP(~isfinite(LogCP))=0; % see comment below + FC=1-CP; + FC(FC<0)=0; % due to rounding errors, small negative numbers can arrise. These are theoretically not possible. + LogFC=log(FC); + LogFC(~isfinite(LogFC))=0;%the log of 0 is -inf. however in Fc.*logFc it should end up as 0. to avoid conflicts removing the -inf + H=-1*(CP.*LogCP+FC.*LogFC); %Shannon Entropy when a theshold (lambda) is set to the lower bin edge + w=(dl.*pdf_lambda); + w=w./sum(w); % for one reason or the other, the sum is not always equal to 1. + EH=w*H; %calculate expectation value for H + RB_ind=[find(H>=EH,1,'first') find(H>=EH,1,'last')]; + obj.RB=obj.Edges(RB_ind)+dl(RB_ind)/2; + obj.P_RB=FC(RB_ind); + if obj.Monitor + lambda=obj.Edges(1:end-1); + scatter(w,H);xlabel('weight');ylabel('Shannon Entropy');snapnow + plot(lambda,H);xlabel('lambda');ylabel('Shannon Entropy'); + line(lambda(RB_ind(:)),[EH;EH],'linestyle',':');snapnow + histogram(obj.Data(:));snapnow + disp('sum of weights') + disp(sum(dl.*pdf_lambda)) + disp('EH') + disp(EH) + disp('boundry index'); + disp(RB_ind); + disp('boundry lambda value'); + disp(obj.RB); + disp('boundry p-values'); + disp(FC(RB_ind)); + disp('fraction inside interval'); + disp(abs(diff(FC(RB_ind)))); + end + end + function Calc(obj) %calculate the CRE as well as a on-tailed definition of the RB. + % RR note: 23Nov22: I am not sure if I still like this RB + % definition. I keep having a hard time defending it + % theoretically. The code is retained, for now, however for backwards + % compatibility. if isempty(obj.Data) warning('nothing to do'); return;