Compare commits
No commits in common. "master" and "EndPoint25Mar2022" have entirely different histories.
master...EndPoint25Mar2022

CCREClass.m (35 changed lines)
@@ -11,16 +11,9 @@ classdef CCREClass<CREClass
end
properties
DataRef %This parameter will contain the reference set.
nBinRef %number of "bins" to use for the reference distribution
EqualSizeBinFlagRef=false; %set to true to get equally spaced bins; default: false, i.e. put an equal number of points per bin.
EdgesRef; %edges for the bins of the reference function. %todo, turn into dep. variable!
end
properties(SetAccess=private) %make read only
nBinRef
CCRE %parameter containing the conditional Cumulative Residual Entropy
end
properties(Constant)
UseHistProxyFlagRef=true; % this value is there to remind the user that we will have to bin the reference image. The actual value is not used in the code
end

methods %constructor
function obj = CCREClass()
@@ -45,16 +38,6 @@ classdef CCREClass<CREClass
end
end
methods %user callable functions
function CalcEdgesRef(obj)
% copy the info for "ref" to a CRE object
Stmp=CREClass('ref',obj.DataRef...
,'nbin',obj.nBinRef...
,'EqualSizeBinFlag',obj.EqualSizeBinFlagRef...
,'UseHistProxyFlag',obj.UseHistProxyFlagRef...
);
Stmp.CalcEdges;
obj.EdgesRef=Stmp.Edges;
end
%% CCRE A|B
%
% $$CCRE = \varepsilon (A) - E[\varepsilon(A|B)]$$
@@ -75,18 +58,14 @@ classdef CCREClass<CREClass
if isempty(obj.DataRef);return;end %if no ref data given, return
%calculate histogram for RefData
EVarEpsDataGivenRef=0; %clear previous calculations
%get the edge definition.
% a bit of a workaround, I don't plan to reprogram the function
if isempty(obj.EdgesRef)
obj.CalcEdgesRef;
end
[pDataRef,E]=histcounts(obj.DataRef(:),obj.EdgesRef,'Normalization','probability');
[pDataRef,E]=histcounts(obj.DataRef(:),obj.nBinRef,'Normalization','probability');
[JointHist,E1,E2]=histcounts2(obj.Data(:),obj.DataRef(:),[obj.nBin,obj.nBinRef],'Normalization','probability');
for k=1:numel(pDataRef) %loop over the bins in pDataRef NB: must be equal to the bins in JointHist for the Ref
if pDataRef(k)==0;continue;end %if no data in this bin, go to the next
ind=E(k)<=obj.DataRef(:)&obj.DataRef(:)<E(k+1); %get an index to all points in Ref data inside the current bin
CRE_DataGivenDataRef=CREClass('data',obj.Data(ind)...
,'nbin',obj.Edges...
);
ind=E(k)<=obj.DataRef(:)&obj.DataRef(:)<E(k+1);
CRE_DataGivenDataRef=CREClass;
CRE_DataGivenDataRef.Data=obj.Data(ind);
CRE_DataGivenDataRef.nBin=E1;
CRE_DataGivenDataRef.Calc;
EVarEpsDataGivenRef=EVarEpsDataGivenRef+pDataRef(k)*CRE_DataGivenDataRef.CRE; % I really have to check this line on merit.
end
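For reference, the loop above implements the expectation in $$CCRE = \varepsilon(A) - E[\varepsilon(A|B)]$$ as a probability-weighted sum of per-bin CREs. Below is a minimal standalone sketch of that step on assumed toy data, with hypothetical variable names (A stands in for Data, B for DataRef); it is an illustration, not the class's exact implementation.

% Sketch only: E[eps(A|B)] as a weighted sum of per-bin CREs (toy data, hypothetical names)
A = randn(1000,1);                                   % stands in for obj.Data
B = randn(1000,1);                                   % stands in for obj.DataRef
[pB, edgesB] = histcounts(B, 10, 'Normalization', 'probability');
binIdx = discretize(B, edgesB);                      % reference-bin membership of every point
EcondCRE = 0;
for k = 1:numel(pB)
    if pB(k) == 0, continue; end                     % skip empty reference bins
    S = CREClass;                                    % CRE of A restricted to reference bin k
    S.Data = A(binIdx == k);
    S.nBin = 10;
    S.Calc;
    EcondCRE = EcondCRE + pB(k) * S.CRE;             % weight by p(B in bin k)
end
% The conditional CRE would then follow as CRE(A) - EcondCRE.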
CREClass.m (150 changed lines)
@@ -1,29 +1,17 @@
classdef CREClass < handle
% 23 and 24 Nov 2022 I added a new calculation for the RB based on
% Shannon entropy given a threshold \lambda. I also think there is a
% "mistake" in calculating the RB the classical way.
properties
Data; % the data set to get the CRE of
nBin; %number of bins
Monitor=true; %set to true to get intermediate results/figures
UseHistProxyFlag=true; % set this to get a proxy of the survival function using the hist function
EqualSizeBinFlag=true; % set to true to get equal-sized bins, set to false to get equi-content bins,
% if set to false: each bin will contain (approximately) an equal
% number of pixel values.
UseHistProxy=true; % set this to get a proxy of the survival function using the hist function
end
properties (SetAccess = private) %read only
properties %to be made read only
CRE
RB
P_RB
end

properties(Dependent)
Edges
end

properties (Hidden)
WaitBarHandle;
HiddenEdges;
end
properties (Constant,Hidden)
alpha=1; %needed for the scaled cumulative residual entropy as defined in Zografos. Will be implemented in the future. (I hope)
@@ -45,10 +33,6 @@ classdef CREClass < handle
obj.Data=varargin{k};
case {'nbin','bin'}
obj.nBin=varargin{k};
case {'equalsizebinflag'}
obj.EqualSizeBinFlag=varargin{k};
case {'usehistproxyflag'}
obj.UseHistProxyFlag=varargin{k};
otherwise
error('invalid variable name');
end
@@ -63,17 +47,8 @@ classdef CREClass < handle
end
end
end
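The constructor's name/value parsing above can be exercised as in this brief usage sketch (toy data; the 'data' key is an assumption, inferred from the obj.Data assignment at the top of the hunk):

% Sketch only: constructing and running a CREClass instance via name/value pairs
A = randn(1000,1);
S = CREClass('data', A, 'nbin', 50, ...
    'equalsizebinflag', true, 'usehistproxyflag', true);
S.Calc;            % compute the CRE (and the classic RB definition)
disp(S.CRE);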
%% set and get functions
%% set functions
methods
function Edges=get.Edges(obj)
Edges=obj.HiddenEdges;
end
function set.Edges(obj,val)
if ~isnumeric(val);error('provide an array with edge values');end
if numel(val)<3;error('too few edges provided. Need at least 3 values, i.e. 2 bins defined');end
obj.HiddenEdges=transpose(val(:));
obj.nBin=numel(val)-1;
end
function set.Data(obj,val)
if isnumeric(val)
obj.Data=val;
@@ -91,94 +66,41 @@ classdef CREClass < handle
end
%% core function(s)
methods
function CalcEdges(obj)

function Calc(obj)
if isempty(obj.Data)
warning('nothing to do');
return;
end
if obj.UseHistProxyFlag
if obj.UseHistProxy
if isempty(obj.nBin)
obj.nBin=floor(numel(obj.Data)./10);
end
if obj.EqualSizeBinFlag
[~,obj.Edges]=histcounts(obj.Data(:),obj.nBin);
else
prc=linspace(0,100,obj.nBin+1); % define the edges of the bins as percentiles
obj.Edges=prctile(obj.Data(:),prc); %find the edges of the bins
end
else
if obj.EqualSizeBinFlag; error('You cannot use EqualSizeBinFlag=true and UseHistProxyFlag=false as a combination');end
q=transpose(unique(sort(obj.Data(:))));
dq=diff(q);
obj.Edges=cat(2,q(1)-eps,q(1:end-1)+dq/2,q(end)+eps); %the eps is a "shortcut" to avoid value==edge cases. Note that eps will increase the size of the bin by a very small amount. Although this is a systematic bias, its effect will be negligible.
obj.nBin=numel(obj.Edges)-1;
end
end
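As an aside, a standalone sketch (toy data, outside the class) contrasting the two edge definitions CalcEdges switches between: equal-width bins from histcounts versus roughly equal-content bins from prctile.

% Sketch only: equal-width vs. equal-content bin edges (toy data)
x = randn(1000,1);
nBin = 8;
[~, edgesEqualWidth] = histcounts(x, nBin);               % bins of equal width
edgesEqualContent = prctile(x, linspace(0, 100, nBin+1)); % roughly numel(x)/nBin points per bin
disp([edgesEqualWidth; edgesEqualContent]);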
function CalcRBShannon(obj)
% 23Nov22 Here I will try to implement a (hopefully better)
% definition of the relevance boundary. The basic idea is:
% assume a dataset X = {x_1 ... x_i}
% set a threshold \lambda (NB: \lambda lies within the range of the data X)
% The Shannon entropy for this \lambda will be:
% H(\lambda) = -( p(x<\lambda)log(p(x<\lambda)) + p(x>=\lambda)log(p(x>=\lambda)) )
% Now the expectation value for H will be:
% E(H) = \int p(\lambda)*H(\lambda) d\lambda
if isempty(obj.Data)
warning('nothing to do');
return;
end
if isempty(obj.Edges);obj.CalcEdges;end
[CP,bin]=histcounts(obj.Data(:),transpose(obj.Edges),'Normalization','cdf');
dl=transpose(diff(bin)./sum(diff(bin))); % will capture the case with non-equidistant bins as well as equidistant bins
pdf_lambda=histcounts(obj.Data(:),transpose(obj.Edges),'Normalization','pdf');
CP=transpose(CP);
LogCP=log(CP);
LogCP(~isfinite(LogCP))=0; % see comment below
FC=1-CP;
FC(FC<0)=0; % due to rounding errors, small negative numbers can arise. These are theoretically not possible.
LogFC=log(FC);
LogFC(~isfinite(LogFC))=0; %log(0) is -Inf; however, in FC.*LogFC it should end up as 0, so the -Inf is removed to avoid conflicts
H=-1*(CP.*LogCP+FC.*LogFC); %Shannon entropy when a threshold (lambda) is set to the lower bin edge
w=(dl.*pdf_lambda);
w=w./sum(w); % for one reason or another, the sum is not always equal to 1.
EH=w*H; %calculate expectation value for H
RB_ind=[find(H>=EH,1,'first') find(H>=EH,1,'last')];
obj.RB=obj.Edges(RB_ind)+dl(RB_ind)/2;
obj.P_RB=FC(RB_ind);
if obj.Monitor
lambda=obj.Edges(1:end-1);
scatter(w,H);xlabel('weight');ylabel('Shannon Entropy');snapnow
plot(lambda,H);xlabel('lambda');ylabel('Shannon Entropy');
line(lambda(RB_ind(:)),[EH;EH],'linestyle',':');snapnow
histogram(obj.Data(:));snapnow
disp('sum of weights')
disp(sum(dl.*pdf_lambda))
disp('EH')
disp(EH)
disp('boundary index');
disp(RB_ind);
disp('boundary lambda value');
disp(obj.RB);
disp('boundary p-values');
disp(FC(RB_ind));
disp('fraction inside interval');
disp(abs(diff(FC(RB_ind))));
end
end
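A compact sketch of the quantity CalcRBShannon estimates, written out for assumed toy data with simple equal-width edges; it mirrors the CP/FC/H/EH variables above but is not the class's exact implementation.

% Sketch only: threshold entropy H(lambda) and its expectation E[H] (toy data)
x = randn(5000,1);
edges = linspace(min(x), max(x), 101);
CP = transpose(histcounts(x, edges, 'Normalization', 'cdf'));  % cumulative probability up to each threshold
FC = 1 - CP;                                                   % survival part, p(x >= lambda)
LogCP = log(CP); LogCP(~isfinite(LogCP)) = 0;                  % treat 0*log(0) as 0
LogFC = log(FC); LogFC(~isfinite(LogFC)) = 0;
H = -(CP.*LogCP + FC.*LogFC);                                  % two-outcome Shannon entropy per threshold
pdf_lambda = transpose(histcounts(x, edges, 'Normalization', 'pdf'));
dl = transpose(diff(edges)) ./ sum(diff(edges));
w = pdf_lambda .* dl; w = w ./ sum(w);                         % weights ~ p(lambda)*d(lambda)
EH = transpose(w) * H;                                         % expectation value of H over lambda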
function Calc(obj) %calculate the CRE as well as a one-tailed definition of the RB.
% RR note: 23Nov22: I am not sure if I still like this RB
% definition. I keep having a hard time defending it
% theoretically. The code is retained for now, however, for backwards
% compatibility.
if isempty(obj.Data)
warning('nothing to do');
return;
end
if isempty(obj.Edges);obj.CalcEdges;end
[CP,bin]=histcounts(obj.Data(:),transpose(obj.Edges),'Normalization','cdf');
%update part below to preferred MATLAB commands
% [N,bin]=hist(obj.Data(:),obj.nBin);
% N=transpose(N);
% P=N./sum(N);
% CP=cumsum(P);
[CP,bin]=histcounts(obj.Data(:),obj.nBin,'Normalization','cdf');
FC=1-transpose(CP);
FC(FC<0)=0;
dl=diff(bin)./sum(diff(bin)); % will capture the case with non-equidistant bins as well as equidistant bins
dl=ones(size(FC))./numel(FC);
else
q=sort(unique(obj.Data(:)));
FC=zeros(size(q));
dl=zeros(size(q));
for k=1:numel(q)
FC(k)=sum(obj.Data(:)>=q(k));
if k==1
dl(k)=(q(k+1)-q(k))/2;
elseif k==numel(q)
dl(k)=(q(k)-q(k-1))/2;
else
dl(k)=(q(k+1)-q(k))/2+(q(k)-q(k-1))/2;
end
end
FC=FC./numel(obj.Data);
end
LogFC=log(FC);
LogFC(~isfinite(LogFC))=0; %log(0) is -Inf; however, in FC.*LogFC it should end up as 0, so the -Inf is removed to avoid conflicts
if any(isnan(FC));error('something went wrong');end %catch a possible error.
@@ -193,16 +115,16 @@ classdef CREClass < handle
dl(FC>0.5)=0; %set the weight for all bins in the histogram to the left side (i.e. <p50) to 0
CRE_Med_Inf=-transpose(dl)*(FC.*LogFC)./(transpose(dl)*FC); %CRE Zografos
ind=find(exp(-CRE_Med_Inf)>=FC,1,'first');
if obj.UseHistProxyFlag
if obj.UseHistProxy
obj.P_RB=FC(ind);
if ind==numel(bin) %if RB is (beyond) the last bin,
warning(sprintf('RB is hitting the upper bin value\n Maybe increase the number of bins')) %#ok<*SPWRN>
obj.RB=bin(ind);
end
obj.RB=(bin(ind)+bin(ind+1))/2; % set the RB to the center of the bin
else
SortData=unique(sort(obj.Data(:)));
obj.RB=SortData(ind);
% SortData=sort(obj.Data);
% obj.RB=SortData(ind);
% I think the two lines above are a bug. I should not use
% "SortData" but rather q itself. Some numbers may be in
% the data multiple times
obj.RB=q(ind);
obj.P_RB=FC(ind);
end
end
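For orientation, the cumulative residual entropy itself can be approximated directly from the survival function. Below is a minimal sketch on assumed toy data, mirroring the FC/LogFC/dl quantities used in Calc; it is the plain, unscaled form of CRE = -\int FC(x) log(FC(x)) dx and not necessarily the exact value the class stores.

% Sketch only: discrete approximation of CRE = -integral FC(x)*log(FC(x)) dx (toy data)
x = randn(5000,1);
edges = linspace(min(x), max(x), 101);
FC = 1 - transpose(histcounts(x, edges, 'Normalization', 'cdf'));  % survival function per bin
LogFC = log(FC); LogFC(~isfinite(LogFC)) = 0;                      % treat 0*log(0) as 0
dl = transpose(diff(edges));                                       % bin widths
CRE = -sum(dl .* FC .* LogFC);
disp(CRE);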
TestCREClass.m (103 changed lines)
@@ -2,63 +2,35 @@
% show some default behaviour
% and test the error handling.
%% set some constants
% A=randn(1000,1);
% A=rand(1000,1);
A=cat(1,randn(1000,1),randn(200,1)+10);
A=randn(100);
%% create instance of CRE class
% % % S1=CREClass;
% % % %% Calculate by setting lambda equal to each of the (unique) values in the data.
% % % S1.UseHistProxyFlag=false;
% % % S1.EqualSizeBinFlag=false;
% % % S1.Data=A;% set gaussian data
% % % S1.Calc; %get RB based on the CRE
% % % S1.CalcRBShannon; %% get RB based on Shannon entropy
% % % disp(S1)
% % % %% use histogram approximation with unequal bin sizes in the histogram
% % % S2=CREClass;
% % % S2.UseHistProxyFlag=true;
% % % S2.EqualSizeBinFlag=false;
% % % S2.Data=A;% set gaussian data
% % % S2.nBin=numel(A); % set nbin to npoints; This is fine as we use the cumulative distribution and the formulas as defined in Zografos
% % % S2.Calc
% % % S2.CalcRBShannon;
% % % disp(S2)
% % % %% use histogram approximation with equal bin sizes in the histogram
% % % S3=CREClass;
% % % S3.UseHistProxyFlag=true;
% % % S3.EqualSizeBinFlag=false;
% % % S3.Data=A;% set gaussian data
% % % S3.nBin=numel(A); % set nbin to npoints; This is fine as we use the cumulative distribution and the formulas as defined in Zografos
% % % S3.Calc
% % % S3.CalcRBShannon;
% % % disp(S3)
%% Check recalculation of edges
S4=CREClass;
S4.UseHistProxyFlag=true;
S4.EqualSizeBinFlag=true;
S4.Data=A;% set gaussian data
S4.nBin=numel(A); % set nbin to npoints; This is fine as we use the cumulative distribution and the formulas as defined in Zografos
S4.Calc
S4.CalcRBShannon;
disp(S4)
S4.nBin=round(numel(A)/10);
S4.CalcEdges; % force a recalculation of the edges
S4.Calc;
S4.CalcRBShannon;
disp(S4)
%%
return;
S=CREClass;
%% check histogram vs "new" technique
% NB I come to the conclusion that the new method is not always working.
% Why?
S.Data=A;% set gaussian data
S.nBin=(numel(S.Data)); % set nbin to npoints; This is fine as we use the cumulative distribution and the formulas as defined in Zografos
% use histogram
S.UseHistProxy=true;
S.Calc;
S
% Use my approximation for the histogram
% only do this if the number of data points is not too big.
S.UseHistProxy=false;
S.Calc;
S

%% Show the effect of scaling or offsetting the data
% Scale
S.Data=100*A;
S.UseHistProxyFlag=true;
S.UseHistProxy=true;
S.Calc;
disp(S);
S
% Offset
S.Data=A+10;
S.UseHistProxyFlag=true;
S.UseHistProxy=true;
S.Calc;
disp(S);
S
%%

%% Test CCRE class
@@ -67,30 +39,21 @@ figure(2);clf;
figure(3);clf;hold on;
A=randn(100);
B=A;
B(:)=A(randperm(numel(A)));
w=-1:0.1:1;
out(numel(w),1)=struct('w',[],'CRE',[],'CCRE',[],'R',[]);
for k=1:numel(w)
out(k).w=w(k);
B(:)=A(randperm(numel(A)))
for k=-1:0.1:1
CS=CCREClass;
CS.EqualSizeBinFlag=false;
CS.Data=1*(1-abs(w(k)))*A+w(k)*B;
CS.Data=1*(1-abs(k))*A+k*B;
CS.nBin=50;
CS.DataRef=B;
CS.nBinRef=500;
CS.Calc;
out(k).CRE=CS.CRE;
out(k).CCRE=CS.CCRE;
out(k).R=CS.CCRE./CS.CRE;
figure(2);
title(sprintf('w: %d',w(k)));
scatter(CS.DataRef(:),CS.Data(:));
snapnow
end
disp(CS);
figure(1);
clf;hold on;
plot([out(:).w],[out(:).CCRE],'o');
plot([out(:).w],[out(:).CRE],'x');
figure(3);clf
plot([out(:).w],[out(:).R],'d');
plot(k,CS.CCRE,'o')
plot(k,CS.CRE,'x');
figure(2);
scatter(CS.DataRef(:),CS.Data(:));
figure(3);
plot(k,CS.CCRE./CS.CRE,'d');
end
CS
snapnow