CRE_RB_and_CCRE/CREClass.m
R.J. Renken cbcee6ac27 Please use EndPoint25Mar2022 as the stable branch.
Changes made to CREClass.m
Here I've added an alternative calculation for the Relevance Boundary.
CalcRBShannon is now part of the CREClass.m file.
The TestCREClass has been updated to allow testing this new feature.
NB: this is work in progress
2022-11-24 14:09:47 +01:00

210 lines
9.4 KiB
Matlab

classdef CREClass < handle
% 23 and 24 Nov 2022 I added a new calculation for the RB based on
% Shanon entropy given a threshold \lambda. I also think there is a
% "mistake" in calculating the RB the classical way.
properties
Data; % the data set to get the CRE off
nBin; %number of bins
Monitor=true; %set to true to get intermediat results/figures
UseHistProxyFlag=true; % set this to get a proxy of the survival function using the hist function
EqualSizeBinFlag=true; % set to true to get equal sized bins, set to false to get equi-content bins,
% if set to false: each bin will contain (aproximately) an equal
% number of pixel values.
end
properties (SetAccess = private) %read only
CRE
RB
P_RB
end
properties(Dependent)
Edges
end
properties (Hidden)
WaitBarHandle;
HiddenEdges;
end
properties (Constant,Hidden)
alpha=1; %needed for the scaled cummulative redual entropy as defined in zografoss. Will be implemented in the future. (I hope)
beta=1; %they are currently obsolete.
% FeedBackOptions={'CRE','RB','All'}
end
%% constructor
methods %constructor
function obj = CREClass(varargin)
obj.Data=[];
obj.nBin=[];
%obj.FeedBack={'CRE'};
if nargin==0;return;end % if no input arguments are given, just create an empty instance
if ischar(varargin{1}) %assume number value pairs.
if nargin<=2; error('please use number value pairs' ); end
for k=2:2:nargin
switch lower(varargin{k-1})
case {'ref','data'}
obj.Data=varargin{k};
case {'nbin','bin'}
obj.nBin=varargin{k};
case {'equalsizebinflag'}
obj.EqualSizeBinFlag=varargin{k};
case {'usehistproxyflag'}
obj.UseHistProxyFlag=varargin{k};
otherwise
error('invalid variable name');
end
end
else
if nargin>=1
obj.Data=varargin{1};
end
if nargin==2
obj.nBin=varargin{2};
end
end
end
end
%% set and get functions
methods
function Edges=get.Edges(obj)
Edges=obj.HiddenEdges;
end
function set.Edges(obj,val)
if ~isnumeric(val);error('provide a array with edge values');end
if numel(val)<3;error('too few edges provided. Need at least 3 values, i.e. 2 bins defined');end
obj.HiddenEdges=transpose(val(:));
obj.nBin=numel(val)-1;
end
function set.Data(obj,val)
if isnumeric(val)
obj.Data=val;
else
error('please enter numeric array/matrix as reference');
end
end
function set.nBin(obj,val)
if isnumeric(val)
obj.nBin=val;
else
error('please enter numeric array or value as bin definition');
end
end
end
%% core function(s)
methods
function CalcEdges(obj)
if isempty(obj.Data)
warning('nothing to do');
return;
end
if obj.UseHistProxyFlag
if isempty(obj.nBin)
obj.nBin=floor(numel(obj.Data)./10);
end
if obj.EqualSizeBinFlag
[~,obj.Edges]=histcounts(obj.Data(:),obj.nBin);
else
prc=linspace(0,100,obj.nBin+1); % define the edges of the bins as a precentile
obj.Edges=prctile(obj.Data(:),prc); %find the edges of the bins
end
else
if obj.UseHistProxyFlag ; error('You cannot use EqualSizeBinFlag=true and UseHistProxyFlag=false as a combination');end
q=transpose(unique(sort(obj.Data(:))));
dq=diff(q);
obj.Edges=cat(2,q(1)-eps,q(1:end-1)+dq/2,q(end)+eps); %the eps is a "shortcut" to avoid value=edge cases. Note that eps will increase the size of the bin by a very small amount. Although this is a systemetic bias, its effect will be neglectable
obj.nBin=numel(obj.Edges)-1;
end
end
function CalcRBShannon(obj)
% 23Nov22 Here I will try to implement a (hopefully better)
% definition of the relevance boundry. The basic idea is
% assume a dataset X= {x_1 ... x_i} X
% set a threshold \lambda NB \lambda \in of the data X
%Shanon entropy for this \lambda will be:
% H(\lamda)=p(x<\lambda)log(p(x<lambda))+p(x>=\lambda)log(p(x>=\lambda))
% Now the expectation value for H will be :
%E(H)=\int p(\lambda)*H(\lambda) d\lambda
if isempty(obj.Data)
warning('nothing to do');
return;
end
if isempty(obj.Edges);obj.CalcEdges;end
[CP,bin]=histcounts(obj.Data(:),transpose(obj.Edges),'Normalization','cdf');
dl=transpose(diff(bin)./sum(diff(bin))); % will capture the case with non-equidistant bins as well as equidistant bins
pdf_lambda=histcounts(obj.Data(:),transpose(obj.Edges),'Normalization','pdf');
CP=transpose(CP);
LogCP=log(CP);
LogCP(~isfinite(LogCP))=0; % see comment below
FC=1-CP;
FC(FC<0)=0; % due to rounding errors, small negative numbers can arrise. These are theoretically not possible.
LogFC=log(FC);
LogFC(~isfinite(LogFC))=0;%the log of 0 is -inf. however in Fc.*logFc it should end up as 0. to avoid conflicts removing the -inf
H=-1*(CP.*LogCP+FC.*LogFC); %Shannon Entropy when a theshold (lambda) is set to the lower bin edge
w=(dl.*pdf_lambda);
w=w./sum(w); % for one reason or the other, the sum is not always equal to 1.
EH=w*H; %calculate expectation value for H
RB_ind=[find(H>=EH,1,'first') find(H>=EH,1,'last')];
obj.RB=obj.Edges(RB_ind)+dl(RB_ind)/2;
obj.P_RB=FC(RB_ind);
if obj.Monitor
lambda=obj.Edges(1:end-1);
scatter(w,H);xlabel('weight');ylabel('Shannon Entropy');snapnow
plot(lambda,H);xlabel('lambda');ylabel('Shannon Entropy');
line(lambda(RB_ind(:)),[EH;EH],'linestyle',':');snapnow
histogram(obj.Data(:));snapnow
disp('sum of weights')
disp(sum(dl.*pdf_lambda))
disp('EH')
disp(EH)
disp('boundry index');
disp(RB_ind);
disp('boundry lambda value');
disp(obj.RB);
disp('boundry p-values');
disp(FC(RB_ind));
disp('fraction inside interval');
disp(abs(diff(FC(RB_ind))));
end
end
function Calc(obj) %calculate the CRE as well as a on-tailed definition of the RB.
% RR note: 23Nov22: I am not sure if I still like this RB
% definition. I keep having a hard time defending it
% theoretically. The code is retained, for now, however for backwards
% compatibility.
if isempty(obj.Data)
warning('nothing to do');
return;
end
if isempty(obj.Edges);obj.CalcEdges;end
[CP,bin]=histcounts(obj.Data(:),transpose(obj.Edges),'Normalization','cdf');
FC=1-transpose(CP);
FC(FC<0)=0;
dl=diff(bin)./sum(diff(bin)); % will capture the case with non-equidistant bins as well as equidistant bins
LogFC=log(FC);
LogFC(~isfinite(LogFC))=0;%the log of 0 is -inf. however in Fc.*logFc it should end up as 0. to avoid conflicts removing the -inf
if any(isnan(FC));error('something went wrong');end %catch a posible error.
if transpose(dl)*FC==0
%if only one value in the most left bin in the distribution I may get a 0 divided by 0
%as the CRE of a delta function is 0, enforce this outcome
obj.CRE=0;
else
obj.CRE=-transpose(dl)*(FC.*LogFC)./(transpose(dl)*FC); %CRE zografos
end
%% get the RB
dl(FC>0.5)=0; %set the weight for all in the histogram to the left side (i.e. <p50) to 0
CRE_Med_Inf=-transpose(dl)*(FC.*LogFC)./(transpose(dl)*FC); %CRE zografos
ind=find(exp(-CRE_Med_Inf)>=FC,1,'first');
if obj.UseHistProxyFlag
obj.P_RB=FC(ind);
if ind==numel(bin)%if RB is (beyond) the last bin,
warning(sprintf('RB is hitting the upper bin value\n Maybe increase number of bins')) %#ok<*SPWRN>
obj.RB=bin(ind);
end
obj.RB=(bin(ind)+bin(ind+1))/2; % set the RB to the center of the bin
else
SortData=unique(sort(obj.Data(:)));
obj.RB=SortData(ind);
obj.P_RB=FC(ind);
end
end
end
end