Compare commits
2 Commits: e68267182d ... e6e9e593a4

| Author | SHA1 | Date |
|---|---|---|
| | e6e9e593a4 | |
| | cbcee6ac27 | |

Changed file: CREClass.m (60 lines)
@@ -1,4 +1,7 @@
 classdef CREClass < handle
+    % 23 and 24 Nov 2022 I added a new calculation for the RB based on
+    % Shannon entropy given a threshold \lambda. I also think there is a
+    % "mistake" in calculating the RB the classical way.
     properties
         Data; % the data set to get the CRE of
         nBin; % number of bins
@@ -111,7 +114,62 @@ classdef CREClass < handle
            obj.nBin=numel(obj.Edges)-1;
        end
    end
-   function Calc(obj)
+   function CalcRBShannon(obj)
+       % 23Nov22 Here I will try to implement a (hopefully better)
+       % definition of the relevance boundary. The basic idea is:
+       % assume a dataset X = {x_1 ... x_i};
+       % set a threshold \lambda (NB \lambda lies within the range of the data X).
+       % The Shannon entropy for this \lambda will be:
+       % H(\lambda) = -( p(x<\lambda)*log(p(x<\lambda)) + p(x>=\lambda)*log(p(x>=\lambda)) )
+       % Now the expectation value for H will be:
+       % E(H) = \int p(\lambda)*H(\lambda) d\lambda
+       if isempty(obj.Data)
+           warning('nothing to do');
+           return;
+       end
+       if isempty(obj.Edges);obj.CalcEdges;end
+       [CP,bin]=histcounts(obj.Data(:),transpose(obj.Edges),'Normalization','cdf');
+       dl=transpose(diff(bin)./sum(diff(bin))); % will capture the case with non-equidistant bins as well as equidistant bins
+       pdf_lambda=histcounts(obj.Data(:),transpose(obj.Edges),'Normalization','pdf');
+       CP=transpose(CP);
+       LogCP=log(CP);
+       LogCP(~isfinite(LogCP))=0; % see comment below
+       FC=1-CP;
+       FC(FC<0)=0; % due to rounding errors, small negative numbers can arise; these are theoretically not possible.
+       LogFC=log(FC);
+       LogFC(~isfinite(LogFC))=0; % the log of 0 is -inf; in FC.*LogFC it should end up as 0, so the -inf is removed to avoid conflicts.
+       H=-1*(CP.*LogCP+FC.*LogFC); % Shannon entropy when the threshold (lambda) is set to the lower bin edge
+       w=(dl.*pdf_lambda);
+       w=w./sum(w); % for one reason or another, the sum is not always exactly equal to 1.
+       EH=w*H; % calculate the expectation value for H
+       RB_ind=[find(H>=EH,1,'first') find(H>=EH,1,'last')];
+       obj.RB=obj.Edges(RB_ind)+dl(RB_ind)/2;
+       obj.P_RB=FC(RB_ind);
+       if obj.Monitor
+           lambda=obj.Edges(1:end-1);
+           scatter(w,H);xlabel('weight');ylabel('Shannon Entropy');snapnow
+           plot(lambda,H);xlabel('lambda');ylabel('Shannon Entropy');
+           line(lambda(RB_ind(:)),[EH;EH],'linestyle',':');snapnow
+           histogram(obj.Data(:));snapnow
+           disp('sum of weights')
+           disp(sum(dl.*pdf_lambda))
+           disp('EH')
+           disp(EH)
+           disp('boundary index');
+           disp(RB_ind);
+           disp('boundary lambda value');
+           disp(obj.RB);
+           disp('boundary p-values');
+           disp(FC(RB_ind));
+           disp('fraction inside interval');
+           disp(abs(diff(FC(RB_ind))));
+       end
+   end
+   function Calc(obj) % calculate the CRE as well as a one-tailed definition of the RB.
+       % RR note 23Nov22: I am not sure if I still like this RB
+       % definition. I keep having a hard time defending it
+       % theoretically. The code is retained for now, however, for backwards
+       % compatibility.
        if isempty(obj.Data)
            warning('nothing to do');
            return;
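For orientation: the new `CalcRBShannon` method discretises E(H) as a weighted sum over the histogram bins, where each candidate threshold lambda gets the two-bin entropy H(lambda) weighted by pdf(lambda)*dlambda, and the relevance boundary is taken where H first and last reaches its expectation value. Below is a minimal standalone sketch of the same calculation on a plain vector, assuming equidistant edges and skipping the class plumbing; the variable names and example data are illustrative only.

```matlab
% Sketch only: entropy-weighted relevance boundary on a plain data vector.
x     = cat(1, randn(1000,1), randn(200,1)+10); % bimodal example, as in the test script
edges = linspace(min(x), max(x), 101);          % assumed equidistant bin edges
CP    = histcounts(x, edges, 'Normalization', 'cdf'); % cumulative fraction per bin
pdfL  = histcounts(x, edges, 'Normalization', 'pdf'); % density per bin
dl    = diff(edges) ./ sum(diff(edges));        % normalised bin widths
FC    = max(1 - CP, 0);                         % complementary fraction, clipped against rounding errors
logCP = log(CP); logCP(~isfinite(logCP)) = 0;   % log(0) terms should contribute 0 to the entropy
logFC = log(FC); logFC(~isfinite(logFC)) = 0;
H     = -(CP.*logCP + FC.*logFC);               % two-bin Shannon entropy per threshold
w     = dl .* pdfL;  w = w ./ sum(w);           % weights approximating p(lambda) d(lambda)
EH    = sum(w .* H);                            % expectation value E(H)
RB_ind = [find(H >= EH, 1, 'first'), find(H >= EH, 1, 'last')];
RB     = edges(RB_ind);                         % approximate relevance boundary (lower bin edges)
```

The two indices bracket the region where the threshold entropy is at or above its expectation value; since FC is the fraction of data at or above each threshold, abs(diff(FC(RB_ind))) (printed in the Monitor branch) is the fraction of the data falling inside that interval.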
@@ -2,31 +2,36 @@
 % show some default behaviour
 % and test the error handling.
 %% set some constants
-A=randn(100);
+% A=randn(1000,1);
+% A=rand(1000,1);
+A=cat(1,randn(1000,1),randn(200,1)+10);
 %% create instance of CRE class
-S1=CREClass;
-%% Calculate by setting lambda equal to each of the (unique) values in the data.
-S1.UseHistProxyFlag=false;
-S1.EqualSizeBinFlag=false;
-S1.Data=A;% set gaussian data
-S1.Calc;
-disp(S1)
-%% use histogram approximation with unequal bin sizes in the histogram
-S2=CREClass;
-S2.UseHistProxyFlag=true;
-S2.EqualSizeBinFlag=false;
-S2.Data=A;% set gaussian data
-S2.nBin=numel(A); % set nBin to npoints; this is fine as we use the cumulative distribution and the formulas as defined in Zografos
-S2.Calc
-disp(S2)
-%% use histogram approximation with equal bin sizes in the histogram
-S3=CREClass;
-S3.UseHistProxyFlag=true;
-S3.EqualSizeBinFlag=false;
-S3.Data=A;% set gaussian data
-S3.nBin=numel(A); % set nBin to npoints; this is fine as we use the cumulative distribution and the formulas as defined in Zografos
-S3.Calc
-disp(S3)
+% % % S1=CREClass;
+% % % %% Calculate by setting lambda equal to each of the (unique) values in the data.
+% % % S1.UseHistProxyFlag=false;
+% % % S1.EqualSizeBinFlag=false;
+% % % S1.Data=A;% set gaussian data
+% % % S1.Calc; % get RB based on the CRE
+% % % S1.CalcRBShannon; % get RB based on Shannon entropy
+% % % disp(S1)
+% % % %% use histogram approximation with unequal bin sizes in the histogram
+% % % S2=CREClass;
+% % % S2.UseHistProxyFlag=true;
+% % % S2.EqualSizeBinFlag=false;
+% % % S2.Data=A;% set gaussian data
+% % % S2.nBin=numel(A); % set nBin to npoints; this is fine as we use the cumulative distribution and the formulas as defined in Zografos
+% % % S2.Calc
+% % % S2.CalcRBShannon;
+% % % disp(S2)
+% % % %% use histogram approximation with equal bin sizes in the histogram
+% % % S3=CREClass;
+% % % S3.UseHistProxyFlag=true;
+% % % S3.EqualSizeBinFlag=false;
+% % % S3.Data=A;% set gaussian data
+% % % S3.nBin=numel(A); % set nBin to npoints; this is fine as we use the cumulative distribution and the formulas as defined in Zografos
+% % % S3.Calc
+% % % S3.CalcRBShannon;
+% % % disp(S3)
 %% Check recalculation of edges
 S4=CREClass;
 S4.UseHistProxyFlag=true;
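The S1–S3 examples above are now commented out, but read uncommented they show the intended workflow with the new `CalcRBShannon` call added after `Calc`. A minimal sketch of that workflow for one instance, following the settings of the S2 block (histogram proxy with unequal bins); the object name `S` is illustrative:

```matlab
A = cat(1, randn(1000,1), randn(200,1)+10); % bimodal test data from the script
S = CREClass;
S.UseHistProxyFlag = true;    % use the histogram approximation
S.EqualSizeBinFlag = false;   % allow unequal bin sizes
S.Data = A;
S.nBin = numel(A);            % one bin per point, as in the S2/S3 blocks
S.Calc;                       % RB based on the CRE (original definition)
S.CalcRBShannon;              % RB based on the Shannon-entropy definition
disp(S)
```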
@@ -34,13 +39,15 @@ S4.EqualSizeBinFlag=true;
 S4.Data=A;% set gaussian data
 S4.nBin=numel(A); % set nBin to npoints; this is fine as we use the cumulative distribution and the formulas as defined in Zografos
 S4.Calc
+S4.CalcRBShannon;
 disp(S4)
-S4.nBin=numel(A)/10;
+S4.nBin=round(numel(A)/10);
 S4.CalcEdges; % force a recalculation of the edges
 S4.Calc;
+S4.CalcRBShannon;
 disp(S4)
+%%
+return;
 %% Show effect of scaling or offsetting the data
 % Scale
 S.Data=100*A;
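One small change in this hunk is `S4.nBin=numel(A)/10` becoming `S4.nBin=round(numel(A)/10)`, presumably so that the bin count handed to the edge recalculation is always a whole number. This reading is an assumption; the values below are only illustrative:

```matlab
A    = cat(1, randn(1000,1), randn(200,1)+10); % numel(A) = 1200
nBin = numel(A)/10;        % 120 here, but fractional whenever numel(A) is not a multiple of 10
nBin = round(numel(A)/10); % rounding guarantees an integer bin count
```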