PRO sgp_deblurring_multi_gpu, A, gn, x, iter, err, Discr, times, bg = bg, $
                              initialization = initialization, maxit = maxit, obj = obj, $
                              stopcrit = stopcrit, tol = tol, verb = verb

; sgp_deblurring_multi_gpu - SGP algorithm (GPU version)
;   This procedure solves a multiple image deblurring problem by applying the
;   SGP algorithm to the minimization of the
;   generalized Kullback-Leibler divergence with no regularization [1]:
;
;      min KL(A*x + bg, gn)
;       x in OMEGA
;
;   where KL(u,v) is the generalized Kullback-Leibler divergence between
;   vectors u and v, bg is the background, gn are the observed data and
;   the feasible set OMEGA is x(i) >= 0;
;
;   [1] S. Bonettini, R. Zanella, L. Zanni,
;       "A scaled gradient projection method for constrained image deblurring",
;       Inverse Problems 25(1), 2009, January, 015002.
;
; SYNOPSIS
;    sgp_deblurring_multi_gpu, A, gn, x, iter, err, Discr, times [, opts]
;
; MANDATORY INPUT
;   A   (double array) - measuring matrix used to apply the blurring
;                        operator, that is to compute A*x. The slice
;                        A[*,*,i] is used for the i-th image: it is
;                        converted to single precision, circularly shifted
;                        by half its size and Fourier transformed.
;                        REMARK: all columns of A must sum-up to 1
;   gn  (double array) - measured images, a 3-D array whose third dimension
;                        indexes the images. It must contain at least one
;                        positive value.
;   A, gn and the optional inputs BG and OBJ are NOT modified: scaling and
;   the replacement of non-positive pixels are performed on local copies.
;
; OPTIONAL INPUT
;   The following options must be provided as keyword/value pairs.
;   'OBJ'              - Exact solution, for error calculation (double array)
;   'BG'               - Background value (double)
;                        DEFAULT = 0
;   'INITIALIZATION'   - Choice for starting point:
;                        0  - all zero starting point
;                             (with BG = 0 the initial model A*x + bg is
;                             zero, so the first ratio gn/(A*x + bg)
;                             divides by zero; use a positive BG with this
;                             option)
;                        1  - initialization with gn (first image, gn[*,*,0])
;                        2  - initialization with
;                               ones(size(gn))*sum(gn(:) - bg) / numel(gn)
;                        x0 - user-provided starting point (double array)
;                        DEFAULT = 2
;   'MAXIT'            - Maximum number of iterations (integer)
;                        DEFAULT = 1000
;   'VERB'             - Verbosity level (integer)
;                        0 - silent
;                        1 - print configuration parameters at startup and
;                            some information at each iteration
;                        DEFAULT = 0
;   'STOPCRIT'         - Choice for stopping rule (integer)
;                        1 -> iter > MAXIT
;                        2 -> ||x_k - x_(k-1)|| <= tol*||x_k|| OR iter > MAXIT
;                        3 -> |KL_k - KL_(k-1)| <= tol*|KL_k| OR iter > MAXIT
;                        4 -> (2/N)*KL_k <= tol OR iter > MAXIT
;                        DEFAULT = 1;
;   'TOL'              - Tolerance used in the stopping criterion
;                        DEFAULT = 1e-4 if STOPCRIT = 2 or 3
;                        DEFAULT = 1+1/mean(gn) if STOPCRIT = 4
;
; OUTPUT
;   x                  - Reconstructed data
;   iter               - Number of iterations
;   err                - Error value at each iteration. If OBJ was not given,
;                        then err is the scalar -1.
;   Discr              - Discrepancy value after each iteration:
;                            D = 2/numel(x_k) * KL( Ax_k + bg, gn)
;   times              - Time elapsed after each iteration
;
; ------------------------------------------------------------------------------
;
; This software is developed within the research project
;
;        PRISMA - Optimization methods and software for inverse problems
;                           http://www.unife.it/prisma
;
; funded by the Italian Ministry for University and Research (MIUR), under
; the PRIN2008 initiative, grant n. 2008T5KA4L, 2010-2012. This software is
; part of the package "IRMA - Image Reconstruction in Microscopy and Astronomy"
; currently under development within the PRISMA project.
;
; Version: 1.0
; Date:    July 2011
;
; Authors:
;   Roberto Cavicchioli, Marco Prato, Luca Zanni
;    Dept. of Pure Appl. Math., Univ. of Modena and Reggio Emilia, Italy
;    roberto.cavicchioli@unimore.it, marco.prato@unimore.it, luca.zanni@unimore.it
;   Mario Bertero, Patrizia Boccacci
;    DISI (Dipartimento di Informatica e Scienze dell'Informazione), University of Genova, Italy
;    bertero@disi.unige.it, boccacci@disi.unige.it
;
; Software homepage: http://www.unife.it/prisma/software
;
; Copyright (C) 2011 by M. Bertero, P. Boccacci, R. Cavicchioli, M. Prato, L. Zanni
; ------------------------------------------------------------------------------
; COPYRIGHT NOTIFICATION
;
; Permission to copy and modify this software and its documentation for
; internal research use is granted, provided that this notice is retained
; thereon and on all copies or modifications. The authors and their
; respective Universities makes no representations as to the suitability
; and operability of this software for any purpose. It is provided "as is"
; without express or implied warranty. Use of this software for commercial
; purposes is expressly prohibited without contacting the authors.
;
; This program is free software; you can redistribute it and/or modify it
; under the terms of the GNU General Public License as published by the
; Free Software Foundation; either version 3 of the License, or (at your
; option) any later version.
;
; This program is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
; See the GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License along
; with this program; if not, either visite http://www.gnu.org/licenses/
; or write to
; Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
; ==============================================================================

;initialize GPU device
gpuinit

; start the clock
t0 = systime(2)

;;;;;;;;;;;;;;;;;;;;;;;;;;
; SGP default parameters ;
;;;;;;;;;;;;;;;;;;;;;;;;;;
alpha_min = 1.e-5        ; alpha lower bound
alpha_max = 1.e5         ; alpha upper bound
theta = 0.4              ; backtracking parameter
beta = 1.e-4             ; for sufficient decrease
initalpha = 1.3          ; initial alpha
M = 1                    ; memory in obj. function value (if M = 1 monotone)
Malpha = 3               ; alfaBB1 memory
tau = 0.5                ; alternating parameter
initflag = 2             ; 2 -> constant image
errflag = 0B             ; 0 -> no error calculation
err = -1

if (keyword_set(bg) eq 0)         then bg = 0.                  ; background value
if (keyword_set(maxit) eq 0)      then maxit = 1000             ; maximum number of iterations
if (keyword_set(stopcrit) eq 0)   then stopcrit = 1             ; 1 -> number of iterations
if (keyword_set(verb) eq 0)       then verb = 0                 ; 0 -> silent

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; All argument checks are done BEFORE any GPU buffer is allocated, so that
; an invalid call does not leave device memory behind.
dim = size(gn,/dimensions)
if (n_elements(dim) ne 3) then begin
    message, 'gn must be a 3-D array (third dimension = number of images).'
endif

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Read the optional parameters ;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; n_elements (rather than arg_present/keyword_set) is used so that OBJ given
; as an expression is honoured and INITIALIZATION = 0 is not mistaken for
; "keyword not supplied".
if (n_elements(obj) gt 0) then errflag = 1B
if (n_elements(initialization) gt 0) then begin
    if (n_elements(initialization) gt 1) then begin
        initflag = 999
        x = double(initialization)      ; initial x provided by user
    endif else begin
        initflag = initialization[0]
    endelse
endif

;;;;;;;;;;;;;;;;;;
; starting point ;
;;;;;;;;;;;;;;;;;;
case initflag of
    0: x = dblarr(dim[0],dim[1])                    ; all zeros
    1: x = double(gn[*,*,0])                        ; gn (first image)
    2: x = total(gn-bg)/n_elements(gn)*make_array(dim[0],dim[1],value=1.,/double) ; same flux as gn - bg
    999: if not(array_equal(size(x,/dimensions),dim[0:1])) then begin   ; x is explicitly given, check dimensions
            message, 'Invalid size of the initial point.'
         endif
    else: message, 'Unknown initialization option.'
endcase

; size of the images
obj_size = dim[0:1]

;;;;;;;;;;;;;;;;;;
; stop criterion ;
;;;;;;;;;;;;;;;;;;
if (stopcrit ne 1 and stopcrit ne 2 and stopcrit ne 3 and stopcrit ne 4) then begin
    ; MESSAGE accepts a single text argument, so the value is appended to it
    message, 'Unknown stopping criterion: ' + strtrim(string(stopcrit), 2)
endif

if (keyword_set(tol) eq 0) then begin
    if (stopcrit eq 2 or stopcrit eq 3) then tol = 1.e-4
    if (stopcrit eq 4) then tol = 1. + 1./mean(gn)
endif

;;;;;;;;;;;;;;;;
; data scaling ;
;;;;;;;;;;;;;;;;
; The scaled quantities are local copies (gn_s, bg_s, obj_s): the caller's
; gn, bg and obj are left untouched.
scaling = max(gn)
if (scaling le 0) then begin
    message, 'gn must contain at least one positive value.'
endif
gn_s = double(gn)/scaling
bg_s = bg/scaling
x = x/scaling

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; change the null pixels of the observed image ;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
par = machar(/double)
vmin = min(gn_s[where(gn_s gt 0)])
gn_nonpos = where(gn_s le 0, n_nonpos)
if (n_nonpos gt 0) then gn_s[gn_nonpos] = vmin*par.eps^2

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; initializations and computations that need only once ;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
N = n_elements(gn_s)/dim[2]                   ; pixels in the image
flux = total(total(gn_s,1),1) - N * bg_s      ; exact flux (one value per image)
iter = 1                                      ; iteration counter
alpha = initalpha                             ; initial alpha
Valpha = alpha_max * (dblarr(Malpha)+1.)      ; memory buffer for alpha
Fold = -1.e30 * (dblarr(M)+1.)                ; memory buffer for obj. func.
Discr_coeff = 2./(N*dim[2])*scaling           ; discrepancy coefficient
ONE = make_array(obj_size,value = 1.,/DOUBLE)

;;;;;;;;;;;;;;;;;;;
; vector allocation
;;;;;;;;;;;;;;;;;;;
if errflag then begin
   err = dblarr(maxit+1)
   obj_s = obj/scaling
endif
Discr = dblarr(maxit+1)
times = dblarr(maxit+1)

; projection of the initial point onto x >= 0
x_nonpos =  where(x lt 0, n_nonpos)
if (n_nonpos gt 0) then x[x_nonpos] = 0.

;;;;;;;;;;;;;;;;;;;;;;;;;
; GPU buffer allocation ;
;;;;;;;;;;;;;;;;;;;;;;;;;
; Buffers of size (dim[0], dim[2]*dim[1]) hold one (dim[0], dim[1]) slab per
; image, stacked along the second dimension; slab i is addressed below with
; the index range [ i*dim[1], (i+1)*dim[1]-1 ].
TF_gpu = gpuComplexarr(dim[0], dim[2]*dim[1])
CTF_gpu = gpuComplexarr(dim[0], dim[2]*dim[1])
tmp_c_gpu = gpuComplexarr(dim[0], dim[1])
tmp_c2_gpu = gpuComplexarr(dim[0], dim[1])
tmp_c3_gpu = gpuComplexarr(dim[0], dim[1])

flt_gpu = gpuFltarr(dim[0], dim[1])

X_gpu = gpuDblarr(dim[0], dim[1])
x_plus_gpu = gpuDblarr(dim[0], dim[1])
gn_gpu = gpuDblarr(dim[0], dim[2]*dim[1])
ONE_gpu = gpuDblarr(dim[0], dim[1])
tmp_gpu = gpuDblarr(dim[0], dim[1])
tmp3_gpu = gpuDblarr(dim[0], dim[2]*dim[1])
x_tf_gpu = gpuDblarr(dim[0], dim[2]*dim[1])
x_tf_try_gpu = gpuDblarr(dim[0], dim[2]*dim[1])
g_gpu = gpuDblarr(dim[0], dim[1])
XX_gpu = gpuDblarr(dim[0], dim[1])
g_tmp_gpu = gpuDblarr(dim[0], dim[1])
d_tf_gpu = gpuDblarr(dim[0], dim[2]*dim[1])
d_gpu = gpuDblarr(dim[0], dim[1])
lam_gpu = gpuDblarr(dim[0], dim[1])

sk_gpu = gpuDblarr(dim[0], dim[1])
yk_gpu = gpuDblarr(dim[0], dim[1])
sk2_gpu = gpuDblarr(dim[0], dim[1])
yk2_gpu = gpuDblarr(dim[0], dim[1])
DX_gpu = gpuDblarr(dim[0], dim[1])

app_gpu = gpuDblarr(dim[0],1)
sum_gpu = gpuDblarr(1,1)

;;;;;;;;;;;;;;;;;;;;;;
; transfer functions ;
;;;;;;;;;;;;;;;;;;;;;;
; GPUlib works only in float with FFT operations, so each slice of A is
; converted on the fly (the caller's A keeps its type).
for i = 0, dim[2]-1 do begin
    gpuComplex, float(reform(A[*,*,i])), tmp_c2_gpu
    tmp_c_gpu = gpushift(tmp_c2_gpu,dim[0]/2,dim[1]/2, LHS=tmp_c_gpu)
    tmp_c_gpu = gpufft(tmp_c_gpu, LHS=tmp_c_gpu, /DESTROYPLAN)
    gpuSubArr, tmp_c_gpu, -1, -1, TF_gpu, -1, [ i*dim[1], (i+1)*dim[1]-1 ]
endfor
; NOTE(review): the factor dim[0]*dim[1] presumably undoes the normalisation
; applied by the forward transform - confirm against the GPUlib FFT docs.
TF = gpuGetArr(TF_gpu)
TF = TF*dim[0]*dim[1]
CTF = conj(TF)
gpuputarr, TF, TF_gpu
gpuputarr, CTF, CTF_gpu

gpuPutArr, gn_s, gn_gpu
gpuPutArr, ONE, ONE_gpu

if errflag then begin
   ; obj_sum = sum(obj^2), the normalisation of the relative error
   obj_gpu = gpuDblarr(dim[0], dim[1])
   gpuPutArr, obj_s, obj_gpu
   tmp_gpu = gpumult(obj_gpu, obj_gpu, LHS=tmp_gpu)
   app_gpu = gputotal(tmp_gpu,2,LHS=app_gpu)
   sum_gpu = gputotal(app_gpu,1,LHS=sum_gpu)
   gpuGetArr, sum_gpu, obj_sum
endif

;;;;;;;;;;;;;;;;
; start of SGP ;
;;;;;;;;;;;;;;;;
gpuPutArr, x, X_gpu

; error
if errflag then begin
   tmp_gpu = gpuSub( X_gpu , obj_gpu, LHS=tmp_gpu)
   tmp_gpu = gpumult(tmp_gpu,tmp_gpu, LHS=tmp_gpu)
   app_gpu = gputotal(tmp_gpu,2,LHS=app_gpu)
   sum_gpu = gputotal(app_gpu,1,LHS=sum_gpu)
   err[0] = sqrt( gpuGetArr(sum_gpu) /obj_sum )
endif

; objective function value
fv = 0.
y_min = double(1.e30)
y_max = double(0.)

; FFT of the starting point, shared by all images
tmp_c3_gpu = gpuComplex(X_gpu, LHS=tmp_c3_gpu)
tmp_c3_gpu = gpufft(tmp_c3_gpu, LHS=tmp_c3_gpu, /DESTROYPLAN)

for i = 0, dim[2]-1 do begin
    ; lam_gpu = A_i*x, stored in slab i of x_tf_gpu
    gpuSubArr, TF_gpu, -1, [ i*dim[1], (i+1)*dim[1]-1 ], tmp_c2_gpu, -1, -1
    tmp_c2_gpu = gpumult( tmp_c2_gpu, tmp_c3_gpu, LHS=tmp_c2_gpu )
    tmp_c2_gpu = gpufft( tmp_c2_gpu, /inverse, LHS=tmp_c2_gpu, /DESTROYPLAN)
    flt_gpu = gpureal( tmp_c2_gpu , LHS=flt_gpu )
    lam_gpu = gpufix( flt_gpu, TYPE=5, LHS=lam_gpu)
    gpuSubArr, lam_gpu, -1, -1, x_tf_gpu, -1, [ i*dim[1], (i+1)*dim[1]-1 ]

    ; d_gpu = A_i*x + bg (assumes the GPUlib affine form p1*a + p2*b + p3),
    ; tmp_gpu = gn_i / (A_i*x + bg)
    gpuAdd, double(1.), lam_gpu, double(bg_s), ONE_gpu, double(0.), d_gpu
    gpuSubArr, gn_gpu, -1, [ i*dim[1], (i+1)*dim[1]-1 ], sk_gpu, -1, -1
    tmp_gpu = gpuDiv( sk_gpu, d_gpu, LHS=tmp_gpu)

    ; gradient contribution of image i: 1 - A_i^T( gn_i / (A_i*x + bg) )
    tmp_c_gpu = gpuComplex(tmp_gpu, LHS=tmp_c_gpu)
    tmp_c_gpu = gpufft(tmp_c_gpu, LHS=tmp_c_gpu, /DESTROYPLAN)
    gpuSubArr, CTF_gpu, -1, [ i*dim[1], (i+1)*dim[1]-1 ], tmp_c2_gpu, -1, -1
    tmp_c_gpu = gpumult( tmp_c2_gpu, tmp_c_gpu, LHS=tmp_c_gpu )
    tmp_c_gpu = gpufft( tmp_c_gpu, /inverse, LHS=tmp_c_gpu, /DESTROYPLAN)
    flt_gpu = gpureal( tmp_c_gpu , LHS=flt_gpu )
    d_gpu = gpufix( flt_gpu, TYPE=5, LHS = d_gpu)

    d_gpu = gpuSub(ONE_gpu, d_gpu, LHS=d_gpu)
    g_gpu = gpuAdd( g_gpu, d_gpu, LHS=g_gpu)

    ; sum_xtf = sum(A_i*x)
    app_gpu = gputotal(lam_gpu,2,LHS=app_gpu)
    sum_gpu = gputotal(app_gpu,1,LHS=sum_gpu)
    gpuGetArr, sum_gpu, sum_xtf

    ; sum_kl = sum( gn_i * log( gn_i / (A_i*x + bg) ) )
    tmp_gpu = gpumult(sk_gpu, gpulog(tmp_gpu,LHS=tmp_gpu) ,LHS=tmp_gpu)
    app_gpu = gputotal(tmp_gpu,2,LHS=app_gpu)
    sum_gpu = gputotal(app_gpu,1,LHS=sum_gpu)
    gpuGetArr, sum_gpu, sum_kl

    fv = fv + sum_kl + sum_xtf - flux[i]

    ;;;;;  bounds for the scaling matrices ;;;;;;;;
    ; tmp_c2_gpu still holds slab i of CTF_gpu here, so y = A_i^T gn_i
    tmp_c_gpu = gpuComplex(sk_gpu, LHS=tmp_c_gpu)
    tmp_c_gpu = gpufft(tmp_c_gpu, LHS=tmp_c_gpu, /DESTROYPLAN)
    tmp_c_gpu = gpumult( tmp_c2_gpu, tmp_c_gpu, LHS=tmp_c_gpu )
    tmp_c_gpu = gpufft( tmp_c_gpu, /inverse, LHS=tmp_c_gpu, /DESTROYPLAN)
    flt_gpu = gpureal( tmp_c_gpu , LHS=flt_gpu )
    tmp_gpu = gpufix( flt_gpu, TYPE=5, LHS = tmp_gpu)

    y = gpuGetArr(tmp_gpu)
    y_min_l = min(y[where(y gt 0)])
    y_max_l = max(y)
    y_min_l = y_min_l*(flux[i]/(flux[i] + N*bg_s))
    y_max_l = y_max_l*(flux[i]/(flux[i] + N*bg_s))
    y_min = min([y_min, y_min_l])
    y_max = max([y_max, y_max_l])
endfor

X_low_bound = y_min            ; Lower bound for the scaling matrix
X_upp_bound = y_max            ; Upper bound for the scaling matrix
if X_upp_bound/X_low_bound lt 50. then begin
   X_low_bound = X_low_bound/10.
   X_upp_bound = X_upp_bound*10.
endif
; discrepancy
Discr[0] = Discr_coeff * fv

; scaling matrix
if initflag eq 0 then begin
   gpuPutArr, ONE, XX_gpu
endif else begin
   ; bounds
   gpuPutArr, x, XX_gpu
   XX_gpu = gpuMaxOp(XX_gpu, X_low_bound, LHS=XX_gpu)
   XX_gpu = gpuMinOp(XX_gpu, X_upp_bound, LHS=XX_gpu)
endelse

;;;;;;;;;;;;;
; main loop ;
;;;;;;;;;;;;;
loop = 1B
while loop do begin

    ; shift the alpha and objective-value memories
    Valpha[0:Malpha-2] = Valpha[1:Malpha-1]
    if M gt 1 then Fold[0:M-2] = Fold[1:M-1]
    Fold[M-1] = fv

    ; Step 2.1
    ; tmp_gpu = x - alpha * XX * g
    ; (assumes the GPUlib affine form (p1*a) * (p2*b) + p3)
    gpuMult, alpha, XX_gpu, double(1.), g_gpu, double(0.), tmp_gpu
    tmp_gpu = gpuSub(X_gpu,tmp_gpu,LHS=tmp_gpu)

    ; projection
    tmp_gpu = gpuMaxOp(tmp_gpu, double(0.), LHS=tmp_gpu)

    ; descent direction d and directional derivative gd = sum(d * g)
    d_gpu = gpuSub(tmp_gpu, X_gpu,LHS=d_gpu)
    tmp_gpu = gpumult(d_gpu,g_gpu,lhs=tmp_gpu)
    app_gpu = gputotal(tmp_gpu,2,LHS=app_gpu)
    sum_gpu = gputotal(app_gpu,1,LHS=sum_gpu)
    gpuGetArr, sum_gpu, gd
    lam = 1.

    ; Step 2.2
    fcontinue = 1B

    ; exploiting linearity: A_i*d is computed once per image, so that each
    ; backtracking trial only needs A_i*x + lam*A_i*d
    tmp_c_gpu = gpuComplex(d_gpu, LHS=tmp_c_gpu)
    tmp_c_gpu = gpufft(tmp_c_gpu, LHS=tmp_c_gpu, /DESTROYPLAN)
    for i = 0, dim[2]-1 do begin
        gpuSubArr, TF_gpu, -1, [ i*dim[1], (i+1)*dim[1]-1 ], tmp_c2_gpu, -1, -1
        tmp_c2_gpu = gpumult( tmp_c2_gpu, tmp_c_gpu, LHS=tmp_c2_gpu )
        tmp_c2_gpu = gpufft( tmp_c2_gpu, /inverse, LHS=tmp_c2_gpu, /DESTROYPLAN)
        flt_gpu = gpureal( tmp_c2_gpu , LHS=flt_gpu )
        lam_gpu = gpufix( flt_gpu, TYPE=5, LHS=lam_gpu)
        gpuSubArr, lam_gpu, -1, -1, d_tf_gpu, -1, [ i*dim[1], (i+1)*dim[1]-1 ]
    endfor

    fr = max(Fold)

    while fcontinue do begin

        ; trial point and its blurred version
        gpuAdd, double(1.), X_gpu, double(lam), d_gpu, double(0.), x_plus_gpu

        gpuAdd, double(1.), x_tf_gpu, double(lam), d_tf_gpu, double(0.), x_tf_try_gpu

        fv = 0.

        for i = 0, dim[2]-1 do begin
            gpuSubArr, x_tf_try_gpu, -1, [ i*dim[1], (i+1)*dim[1]-1 ], yk_gpu, -1, -1
            gpuAdd, double(1.), yk_gpu, double(bg_s), ONE_gpu, double(0.), tmp_gpu

            ; ratio gn_i / (A_i*x_try + bg), kept in tmp3_gpu for the gradient
            gpuSubArr, gn_gpu, -1, [ i*dim[1], (i+1)*dim[1]-1 ], sk_gpu, -1, -1
            tmp_gpu = gpuDiv(sk_gpu,tmp_gpu, LHS=tmp_gpu)
            gpuSubArr, tmp_gpu, -1, -1, tmp3_gpu, -1, [ i*dim[1], (i+1)*dim[1]-1 ]

            ; XX_gpu is only scratch space here; it is rebuilt in Step 3
            XX_gpu = gpulog(tmp_gpu, LHS=XX_gpu)
            XX_gpu = gpumult(sk_gpu, XX_gpu , LHS=XX_gpu)
            app_gpu = gputotal(XX_gpu,2,LHS=app_gpu)
            sum_gpu = gputotal(app_gpu,1,LHS=sum_gpu)
            gpuGetArr, sum_gpu, sum_kl

            app_gpu = gputotal(yk_gpu,2,LHS=app_gpu)
            sum_gpu = gputotal(app_gpu,1,LHS=sum_gpu)
            gpuGetArr, sum_gpu, sum_xtf

            fv = fv + sum_kl + sum_xtf - flux[i]

        endfor

        ; Step 2.3
        ; accept on sufficient decrease, or give up backtracking once the
        ; step length has become negligible
        if ( fv le fr + beta * lam * gd OR lam lt 1.e-12) then begin
            gpuCopy, x_plus_gpu, X_gpu

            ; sk = lam * d  (difference between consecutive iterates)
            gpuAdd, double(0.), ONE_gpu, double(lam), d_gpu, double(0.), sk_gpu

            gpuCopy, x_tf_try_gpu, x_tf_gpu

            ; new gradient: sum over the images of 1 - A_i^T( ratio_i )
            for i = 0, dim[2]-1 do begin
                gpuSubArr, tmp3_gpu, -1, [ i*dim[1], (i+1)*dim[1]-1 ], tmp_gpu, -1, -1
                tmp_c_gpu = gpuComplex(tmp_gpu, LHS=tmp_c_gpu)
                tmp_c_gpu = gpufft(tmp_c_gpu, LHS=tmp_c_gpu, /DESTROYPLAN)
                gpuSubArr, CTF_gpu, -1, [ i*dim[1], (i+1)*dim[1]-1 ], tmp_c2_gpu, -1, -1
                tmp_c2_gpu = gpumult( tmp_c2_gpu, tmp_c_gpu, LHS=tmp_c2_gpu )
                tmp_c2_gpu = gpufft( tmp_c2_gpu, /inverse, LHS=tmp_c2_gpu, /DESTROYPLAN)
                flt_gpu = gpureal( tmp_c2_gpu , LHS=flt_gpu )
                yk_gpu = gpufix( flt_gpu, TYPE=5, LHS=yk_gpu)
                yk_gpu = gpuSub(ONE_gpu, yk_gpu, LHS=yk_gpu)
                if (i eq 0) then begin
                   gpuCopy, yk_gpu, g_tmp_gpu
                endif else begin
                   g_tmp_gpu = gpuAdd( g_tmp_gpu, yk_gpu, LHS=g_tmp_gpu)
                endelse
            endfor

            ; yk = difference between consecutive gradients
            yk_gpu = gpuSub(g_tmp_gpu, g_gpu, LHS = yk_gpu)

            gpuCopy, g_tmp_gpu, g_gpu

            fcontinue = 0B
        endif else begin
            lam = lam * theta
        endelse
    endwhile

    if (fv ge fr AND verb gt 0) then print, 'WARNING: fv >= fr'

    ; Step 3
    ; new scaling matrix: the current iterate clipped to the bounds

    gpuCopy, X_gpu, XX_gpu

    ; bounds
    XX_gpu = gpuMaxOp(XX_gpu, X_low_bound, LHS=XX_gpu)
    XX_gpu = gpuMinOp(XX_gpu, X_upp_bound, LHS=XX_gpu)

    DX_gpu = gpuDiv(ONE_gpu, XX_gpu, LHS=DX_gpu)

    sk2_gpu = gpuMult(sk_gpu, DX_gpu, LHS=sk2_gpu)
    yk2_gpu = gpuMult(yk_gpu, XX_gpu, LHS=yk2_gpu)

    ; bk = sum(sk2 * yk)
    lam_gpu = gpuMult(sk2_gpu, yk_gpu, LHS=lam_gpu)
    app_gpu = gputotal(lam_gpu,2,LHS=app_gpu)
    sum_gpu = gputotal(app_gpu,1,LHS=sum_gpu)
    gpuGetArr, sum_gpu, bk

    ; ck = sum(yk2 * sk)
    lam_gpu = gpuMult(yk2_gpu, sk_gpu, LHS=lam_gpu)
    app_gpu = gputotal(lam_gpu,2,LHS=app_gpu)
    sum_gpu = gputotal(app_gpu,1,LHS=sum_gpu)
    gpuGetArr, sum_gpu, ck

    ; first step-length candidate: sum(sk2^2)/bk, clipped to [alpha_min, alpha_max]
    if (bk le 0) then begin
        alpha1 = min([10.*alpha,alpha_max])
    endif else begin
        lam_gpu = gpuMult(sk2_gpu, sk2_gpu, LHS=lam_gpu)
        app_gpu = gputotal(lam_gpu,2,LHS=app_gpu)
        sum_gpu = gputotal(app_gpu,1,LHS=sum_gpu)
        gpuGetArr, sum_gpu, alpha1BB
        alpha1BB = alpha1BB/bk
        alpha1 = min([alpha_max, max([alpha_min, alpha1BB])])
    endelse
    ; second step-length candidate: ck/sum(yk2^2), clipped likewise
    if (ck le 0) then begin
        alpha2 = min([10.*alpha,alpha_max])
    endif else begin
        lam_gpu = gpuMult(yk2_gpu, yk2_gpu, LHS=lam_gpu)
        app_gpu = gputotal(lam_gpu,2,LHS=app_gpu)
        sum_gpu = gputotal(app_gpu,1,LHS=sum_gpu)
        gpuGetArr, sum_gpu, alpha2BB
        alpha2BB = ck/alpha2BB
        alpha2 = min([alpha_max, max([alpha_min, alpha2BB])])
    endelse

    Valpha[Malpha-1] = alpha2

    ; alternation between the two candidates, driven by the adaptive tau
    if (iter le 20) then begin
        alpha = min(Valpha)
    endif else begin
        if (alpha2/alpha1 lt tau) then begin
            alpha = min(Valpha)
            tau = tau*0.9
        endif else begin
            alpha = alpha1
            tau = tau*1.1
        endelse
    endelse
    times[iter] = systime(2) - t0
    iter = iter + 1

    ; alpha is rounded to single precision before being reused
    alpha = double(float(alpha))

    if errflag then begin
        tmp_gpu = gpuSub(X_gpu, obj_gpu, LHS=tmp_gpu)
        tmp_gpu = gpumult(tmp_gpu,tmp_gpu,LHS=tmp_gpu)
        app_gpu = gputotal(tmp_gpu,2,LHS=app_gpu)
        sum_gpu = gputotal(app_gpu,1,LHS=sum_gpu)
        gpuGetArr, sum_gpu, sum_err
        err[iter-1] = sqrt(sum_err/obj_sum)
    endif

    Discr[iter-1] = Discr_coeff * fv

    ;;;;;;;;;;;;;;;;;
    ; stop criteria ;
    ;;;;;;;;;;;;;;;;;

    case stopcrit of
        1: begin
              if verb gt 0 then begin
                 print, 'it ', iter-1, ' of ', maxit
              endif
           end
        2: begin
              ; normstep = ||x_k - x_(k-1)||^2 and normx = ||x_k||^2
              ; (sk_gpu is free to be overwritten: it is rebuilt at the next
              ; accepted step)
              sk_gpu = gpumult(sk_gpu,sk_gpu,LHS=sk_gpu)
              app_gpu = gputotal(sk_gpu,2,LHS=app_gpu)
              sum_gpu = gputotal(app_gpu,1,LHS=sum_gpu)
              gpuGetArr, sum_gpu, normstep
              sk_gpu = gpumult(X_gpu,X_gpu,LHS=sk_gpu)
              app_gpu = gputotal(sk_gpu,2,LHS=app_gpu)
              sum_gpu = gputotal(app_gpu,1,LHS=sum_gpu)
              gpuGetArr, sum_gpu, normx
              ; both norms are squared, so the documented rule
              ; ||x_k - x_(k-1)|| <= tol*||x_k|| is tested with tol^2
              loop = (normstep gt tol*tol*normx)
              if verb gt 0 then begin
                 print, 'it ', iter-1, ', || x_k - x_k-1 || ^2 / || x_k || ^2 ', normstep, ', tol ', tol
              endif
           end
        3: begin
              ; Fold[M-1] holds the objective value of the previous iterate
              reldecrease = abs(fv - Fold[M-1])/abs(fv)
              loop = (reldecrease gt tol)
              if verb gt 0 then begin
                 print, 'it ', iter-1, ', | f_k - f_k-1 | / | f_k | ', reldecrease, ', tol ', tol
              endif
           end
        4: begin
              if iter ne 1 then begin
                 loop = (Discr[iter-1] gt tol)
              endif else begin
                 loop = 1
              endelse
              if verb gt 0 then begin
                 print, 'it ', iter-1, ', D_k ', Discr[iter-1], ', tol ', tol
              endif
           end
    endcase

    if iter gt maxit then loop = 0B

    if verb gt 0 then begin
      print, 'Iteration:', iter,  '  Fobj:', fv, '  Alpha:', alpha, '  Lambda:', lam
    endif

endwhile

; bring the solution back to the host and undo the data scaling
gpuGetArr, X_gpu, x
x = reform(x,obj_size)
x = x * scaling

if errflag then begin
   err = err[0:iter-1]
   gpufree, obj_gpu
endif

Discr = Discr[0:iter-1]
times = times[0:iter-1]
iter = iter - 1

gpufree, [TF_gpu, tmp_c_gpu, tmp_c2_gpu, tmp_c3_gpu, CTF_gpu, X_gpu, gn_gpu, ONE_gpu, tmp_gpu, tmp3_gpu, $
          d_gpu, g_gpu, d_tf_gpu, x_plus_gpu, x_tf_gpu, x_tf_try_gpu, lam_gpu, sk_gpu, $
          sk2_gpu, yk_gpu, yk2_gpu, g_tmp_gpu, DX_gpu, app_gpu, sum_gpu, flt_gpu, XX_gpu ]


END

; ==============================================================================
; End of SGP_deblurring_multi_GPU.pro file - IRMA package
; ==============================================================================