function path = follow_entire_path(Ks,y,loss,ds,path_params,Ks_test,ytest)

% FOLLOW THE ENTIRE PATH OF REGULARIZATION
%
% path = follow_entire_path(Ks,y,loss,ds,path_params,Ks_test,ytest)
%
% Follows the regularization path with a predictor-corrector scheme:
% starting from a large lambda, each iteration predicts the solution at
% sigma + dsigma (sigma = -log(lambda)) and corrects it with Newton's method.
%
% INPUT (required)
% In the following, n is the size of the training set, k the number of kernels:
%            Ks : list of k n*n kernel matrices (as created by 'build_efficient_Ks')
%             y : n*1 response vector (+ or - 1's for logistic regression)
%          loss : loss.type = 'regression' or 'logistic' (more to be added later)
%            ds : weights of the block 1-norm (you may use a k*1 vector of
%                 ones, or obtain a more reasonable ds by using 'compute_ds')
%                 NOTE(review): ds is combined elementwise with eta' below,
%                 so it presumably must have the same orientation as eta'
%                 - confirm against 'compute_ds'.
%   path_params : parameters of the path, a struct with fields
%                   mu                  : parameter of the log-barrier
%                   EPS1                : precision of the Newton steps (very small, fixed)
%                   EPS2                : initial precision of the tube around the path (adaptive)
%                   predictor_type      : 0 trivial, 1 first order, 2 second order predictor
%                   efficient_predictor : 1 efficient predictor steps, 0 full steps
%                   maxsigma            : maximum value of sigma = -log(lambda)
%                   newton_iter1        : number of Newton iterations at which EPS2 is left unchanged
%                   newton_iter2        : delta number of iterations under which EPS2 is
%                                         divided/multiplied by 2
% INPUT (optional)
% In the following, m is the size of the test set:
%       Ks_test : list of k n*m kernel matrices between training and testing data
%         ytest : m*1 response vector (testing data); needed whenever a
%                 non-empty Ks_test is given
%
% OUTPUT
% In the following, s is the number of steps of the algorithm
%          path.alphas : n*s dual variables (enough for prediction, e.g. classify new data)
%            path.etas : k*s dual variables (enough to construct the optimal kernel)
%          path.sigmas : 1*s regularization parameter
%           path.netas : 1*s number of etas with eta*ds^2 > 4*mu, i.e. etas that
%                        are NOT almost zero (sparsity)
%              path.ks : 1*s number of iterations of Newton's method
% path.training_errors : 1*s error on training set
%  path.testing_errors : 1*s error on test set (if provided)
%
% TERMINATION
% The path stops when sigma reaches maxsigma, when lambda2 is no longer below
% EPS2, when the last accepted step in sigma is <= 1e-6, when more than 2000
% steps have been stored, when no predictor yields dsigma > 1e-5, or when
% Newton's method fails on 4 successive attempts for the same step.
%
% SEE ALSO
% compute_ds, create_path_kernel_matrices, generate_random_problem


% INITIALIZATION OF CONSTANTS
mu                  = path_params.mu;                   % parameter of log-barrier
EPS1                = path_params.EPS1;                 % precision parameters of Newton steps (very small and fixed)
EPS2                = path_params.EPS2;                 % precision parameter of tube around path (adaptive)
predictor_type      = path_params.predictor_type;       % 1 : first order predictor, 2 : second order
efficient_predictor = path_params.efficient_predictor;  % 1 : efficient predictor steps, 0 : full steps
maxsigma            = path_params.maxsigma;             % maximum value of sigma = -log(lambda);
newton_iter1        = path_params.newton_iter1 ;         % number of iterations with no modification
newton_iter2        = path_params.newton_iter2  ;         % delta number of iterations under which EPS2 is divided/multiplied by 2


problemtype     = problem_type(loss); % classification or regression
% NOTE(review): m is not referenced again in this file; it is presumably read
% by the workspace scripts (compute_newlambda2 / predictor_step) - confirm
% before removing.
m               = Ks.m;

test = 0;       % test = 1 if performs testing online
if nargin>=6,
    if ~isempty(Ks_test), test = 1; end
end


% FIRST STEP
% Start from a lambda 30% above the largest entry of 'start', and initialize
% alpha as beta / lambda.
beta = target(y,loss);
Kbeta = kernel_operation(Ks,1,beta);
start = sqrt( beta' * Kbeta ) ./ ds ;
lambda = 1.3 * max(start);
sigma = -log(lambda);
alpha =  beta / lambda;
% keyboard
% [fx,gradient,hessianinvchol] = objective_function(Ks,y,loss,alpha,ds,lambda,mu);
% fx
% Corrector at the starting point (up to 500 Newton iterations), then the
% derivatives of the path with respect to sigma used by the predictors.
[alpha,lambda2,exit_status,k]= newton_method(Ks,y,loss,alpha,ds,lambda,mu,0,EPS1,500);
[f,grad,hessianinvchol0,eta,lambda2,da_dsigma,d2a_dsigma2] = all_derivatives(Ks,y,loss,alpha,ds,lambda,mu);
da_dsigma_old = da_dsigma;
d2a_dsigma2_old = d2a_dsigma2;
alpha_old = alpha;
sigma_old = sigma;

% store first step
alphas = alpha;
sigmas = sigma;
etas = eta;
% count the etas that are above the threshold 4*mu once weighted by ds.^2
netas = length(find( eta' .* ds.^2 > mu * 4 ) );
ks = k;
lambda2s = lambda2;
normgrads = norm(grad)^2;
nevals = 0;
EPS2s = EPS2;

% training error of the first step: mean squared error for regression,
% fraction of sign mismatches for classification
ypred = - kernel_operation(Ks,1,alpha) * eta;
switch problemtype,
    case 'regression'
        training_error = sum( (ypred-y).^2 ) / length(y);
    case 'classification'
        ypred = sign( ypred );
        training_error = length(find( abs(ypred-y)> 0 )) / length(y); 
end
training_errors = training_error;

if test
    % same error measures on the test set
    yhat = - kernel_operation_test(Ks_test,1,alpha) * eta;
    ypred = yhat;
    switch problemtype,
        case 'regression'
            testing_error = sum( (ypred-ytest).^2 ) / length(ytest);
        case 'classification'
            ypred = sign( ypred );
            testing_error = length(find( abs(ypred-ytest)> 0 )) / length(ytest); 
    end
    testing_errors = testing_error;    
end


fprintf('iter %d - sigma=%f - n_corr=%d - lambda2=%1.2e - EPS2=%1.2e - n_pred=%d - neta=%d\n',length(sigmas),sigmas(end),ks(end),lambda2s(end),EPS2,nevals(end),netas(end));


% PREDICTOR-CORRECTOR STEPS
% exits if upper bound on sigma is reached or Newton steps did not converge
delta_sigma = 1;
try_all_predictors=0;

while sigma < maxsigma  & lambda2 < EPS2 & delta_sigma > 1e-6 & length(sigmas)<=2000
    
    % every 100 moves, reset to the initial (large) tube around the path
    if mod(length(sigmas),100)==1, EPS2                = path_params.EPS2;  end
    
    % choosing dsigma : predictor steps
    % NOTE(review): compute_newlambda2 and predictor_step are invoked without
    % arguments or outputs, so they are presumably scripts sharing this
    % workspace (reading dsigma, predictor_type, boosted, ... and setting
    % lambda2, dsigma, newalpha, neval, ...) - confirm against those files.
    % Local variable names in this function must therefore not be renamed.
    neval = 0;
    
    if length(sigmas)==1,
        boosted = 0;
        % first predictor step: try out several values, from 1e-4 up to 1,
        % and stop at the first one whose lambda2 is invalid or leaves the tube
        dsigmas = 10.^[-4:.25:0];
        for idsigma=1:length(dsigmas);
            dsigma=dsigmas(idsigma);
            compute_newlambda2;
            neval = neval + 1;
            if isinf(lambda2) || isnan(lambda2) || (lambda2>EPS2), break;  end;
        end
        % keep the candidate just before the one that stopped the loop
        % NOTE(review): if the very first candidate (1e-4) already stops the
        % loop, idsigma is 1 and this indexes element 0, which errors - confirm
        % whether that case can occur and how it should be handled.
        dsigma = dsigmas(idsigma-1);
        switch predictor_type
            case 0, newalpha = alpha;
            case 1, newalpha = alpha+dsigma*da_dsigma;
            case 2, newalpha = alpha+dsigma*da_dsigma+ dsigma^2 * .5 * d2a_dsigma2;
        end
        newlambda = exp(-sigma-dsigma);
        
    else
        % for regression, if long stagnation with the same number of
        % kernels, the path is likely to be linear in 1/lambda
        if length(netas)>=10 & isequal(problemtype,'regression')
            if all(netas(end-9:end)==netas(end)), boosted = 1; fprintf(' boosted '); else boosted = 0; end
        else
            boosted = 0;
        end
        predictor_step;
    end
    
    % if the predicted step is tiny, fall back to lower-order predictors;
    % predictor_type is restored afterwards, predicted_type records the
    % order that was actually used for newalpha
    predicted_type=predictor_type;
    if dsigma <= 1e-5
        switch predictor_type
            case 2,
                predictor_type = 1;
                predictor_step
                predicted_type = 1;
                if dsigma <= 1e-5
                    predictor_type = 0;
                    predictor_step
                    predicted_type=0;
                end
                predictor_type = 2;
            case 1,
                predictor_type = 0;
                predictor_step
                predictor_type = 1;
                predicted_type =0;
        end
    end
    
    if dsigma <= 1e-5, 
        % not even the simpler predictor steps work, get out!
        break;    
    end
    
    % optionally compare with the trivial (zero order) predictor and keep
    % whichever allows the larger step (enabled once the second derivative
    % has been found undefined, see below)
    if try_all_predictors
        alpha_predicted = newalpha;
        dsigma_predicted = dsigma;
        predictor_type = 0;
        predictor_step;
        predictor_type = 1;
        if dsigma>dsigma_predicted,
            predicted_type=0;
            fprintf(' trivial is better ');
        else
            newalpha = alpha_predicted ;
            dsigma =  dsigma_predicted;
        end        
    end
    
    
    go_on=1;
    counter = 1;
    while go_on & counter <= 4
        % try to perform Newton steps from predicted sigma
        % if does not converge, diminishes dsigma and divides EPS2 by 8
        % if never converges after 4 times, exit 
        counter = counter + 1;
        sigmapot = sigma + dsigma;
        try
            [alphapot,lambda2,exit_status,k,nevals_newton,exit_params]= newton_method(Ks,y,loss,newalpha,ds,exp(-sigmapot),mu,0,EPS1,30);
        catch
            % dump the whole workspace for post-mortem debugging, then fail
            q=round(10*sum(clock));
            q
            save(sprintf('error_%d',q));
            error('error in newton method');    
        end
        
        if strcmp(exit_status,'max_iterations') || lambda2 >  1e-5 || ...
                strcmp(exit_status,'infinite_lambda_0') || ...
                strcmp(exit_status,'no_inverse_hessian_0') || ...
                strcmp(exit_status,'nan_lambda_0')
            % report the Newton decrement lambda2 (the message previously
            % printed lambda under the lambda2 label)
            fprintf('Newton''s method takes too long!! dsigma=%e lambda2=%e\n',dsigma,lambda2);
            dsigma = dsigma / 8;
            EPS2 = EPS2 / 8;
            sigmapot = sigma + dsigma;
            % recompute the prediction for the shortened step, with the
            % predictor order that was actually used
            if boosted
                %     switch predictor_type
                %         case 0, newalpha = alpha + dsigma * (alpha_old-alpha)/(sigma_old-sigma);
                %         case 1, newalpha = alpha + dsigma * da_dsigma + .5 * dsigma^2 * (da_dsigma_old-da_dsigma)/(sigma_old-sigma);
                %         case 2, newalpha = alpha + dsigma * da_dsigma + dsigma^2 * .5 * d2a_dsigma2 + ...
                %                 1/6 * dsigma^3 * (d2a_dsigma2_old-d2a_dsigma2)/(sigma_old-sigma);
                %     end
                
                
                switch predicted_type
                    case 0, newalpha = alpha ;
                    case 1, newalpha = alpha + ( exp(dsigma) - 1 ) * da_dsigma;
                    case 2, newalpha = alpha + ( exp(dsigma) - 1 ) * da_dsigma + ...
                            +  .5 * ( exp(dsigma) - 1 )^2 * (  d2a_dsigma2 - da_dsigma );
                end
                
            else
                switch predicted_type
                    case 0, newalpha = alpha;
                    case 1, newalpha = alpha+dsigma*da_dsigma;
                    case 2, newalpha = alpha+dsigma*da_dsigma+ dsigma^2 * .5 * d2a_dsigma2;
                end
            end
        else go_on=0;
        end
    end
    % go_on is still set only when all 4 attempts failed: stop following the
    % path instead of accepting an unconverged point. (The former test
    % 'counter == 4 & go_on' could never be true, since counter is 5 here
    % after 4 failed attempts.)
    if go_on, break; end
    
    alpha_old = alpha;
    sigma_old = sigma;
    
    % accept the corrected point
    alpha = alphapot;
    delta_sigma = abs( sigma - sigmapot);
    sigma = sigmapot;
    lambda = exp(-sigma);
    
    
    % store step
    try
        da_dsigma_old = da_dsigma;
        d2a_dsigma2_old = d2a_dsigma2;
        [f,grad,hessianinvchol,eta,lambda2,da_dsigma,d2a_dsigma2] = all_derivatives(Ks,y,loss,alpha,ds,lambda,mu);
        % if the values of the derivatives with respect to the path are infinite or NAN, put them to zero
        if sum( isnan(da_dsigma) | isinf(da_dsigma) ), 
            fprintf('derivative of path is undefined\n');
            fprintf('only doing zero order now (trivial predictors)\n');
            da_dsigma=zeros(size(da_dsigma)); 
            d2a_dsigma2=zeros(size(d2a_dsigma2)); 
            predictor_type = 0;
            try_all_predictors=0;
            
        end
        if sum( isnan(d2a_dsigma2) | isinf(d2a_dsigma2) ) & predictor_type ==2, 
            fprintf('second derivative of path is undefined\n');
            fprintf('only doing first order now\n');
            d2a_dsigma2=zeros(size(d2a_dsigma2));
            predictor_type = 1;
            try_all_predictors=1;
        end
        
        
    catch
        % dump the whole workspace for post-mortem debugging, then fail
        q=round(10*sum(clock));
        q
        save(sprintf('error_%d',q));
        error('error in all derivatives');    
    end
    lambda2s = [ lambda2s lambda2];
    normgrads = [ normgrads norm(grad)^2 ];
    alphas = [ alphas alpha];
    sigmas = [ sigmas sigma];
    etas = [ etas eta];
    netas = [ netas length(find( eta' .* ds.^2 > mu * 4 ) ) ];
    ks = [ ks k];
    nevals = [ nevals neval];
    
    % update EPS2: widen the tube when Newton needed fewer than newton_iter1
    % iterations, narrow it when it needed more (half that rate in the
    % exponent when Newton exited with status 'df_small')
    if ~strcmp(exit_status,'df_small')
        EPS2=EPS2 * 2^ ( (newton_iter1 -k)/newton_iter2 );
    else
        EPS2=EPS2 * 2^ ( (newton_iter1 -k)/newton_iter2/2 );
    end
    % keep EPS2 at least 10 times the current lambda2, and within [1e-5, 1]
    if lambda2 >= EPS2/10, EPS2 = lambda2 * 10; end
    EPS2 = max(EPS2, 1e-5);
    EPS2 = min(EPS2, 1);
    EPS2s = [ EPS2s EPS2 ];
    % training error of the new step
    ypred = - kernel_operation(Ks,1,alpha) * eta;
    switch problemtype,
        case 'regression'
            training_error = sum( (ypred-y).^2 ) / length(y);
        case 'classification'
            ypred = sign( ypred );
            training_error = length(find( abs(ypred-y)> 0 )) / length(y); 
    end
    training_errors = [ training_errors training_error];
    
    % perform testing if applicable
    if test
        yhat = - kernel_operation_test(Ks_test,1,alpha) * eta;
        ypred = yhat;
        switch problemtype,
            case 'regression'
                testing_error = sum( (ypred-ytest).^2 ) / length(ytest);
            case 'classification'
                ypred = sign( ypred );
                testing_error = length(find( abs(ypred-ytest)> 0 )) / length(ytest); 
        end
        testing_errors = [ testing_errors testing_error ];    
    end
    
    
    
    fprintf('iter %d - sigma=%f - n_corr=%d - lambda2=%1.2e - EPS2=%1.2e - n_pred=%d - neta=%d - dsigma=%f - status=%s\n',length(sigmas),sigmas(end),ks(end),lambda2s(end),EPS2,nevals(end),netas(end),dsigma,exit_status);
    
end

% assemble the output structure (one column / entry per stored step)
path.alphas=alphas;
path.etas=etas;
path.sigmas=sigmas;
path.netas=netas;
path.ks=ks;
% path.nevals=nevals;
% path.normgrads = normgrads;
% path.lambda2s = lambda2s;
% path.EPS2s = EPS2s;
path.training_errors = training_errors;
if test, path.testing_errors = testing_errors; end

if 0,
    % debugging (disabled): dump the whole workspace to a time-stamped file
    q=round(10*sum(clock));
    q
    save(sprintf('debug_%d',q));
    fprintf('results of path following saved');    
end
