PRO osem_deblurring_multi_gpu, A, gn, x, iter, err, Discr, times, maxit = maxit, bg = bg, bound = bound, $
                               obj = obj, stars_coord = stars_coord, delta_t = delta_t, $
                               stopcrit=stopcrit, tol = tol, verb = verb

; osem_deblurring_multi_gpu - OSEM algorithm (GPU version)
;   This procedure solves a multiple image deblurring problem by applying the
;   OSEM algorithm. All heavy computations are delegated to GPULib routines.
;
; SYNOPSIS
;    osem_deblurring_multi_gpu, A, gn, x, iter, err, Discr, times [, opts]
;
; MANDATORY INPUT
;   A   (3-D array)    - measuring matrix used to apply the blurring
;                        operator, that is to compute A*x. One 2-D slice
;                        A[*,*,i] is used for each measured image. The slices
;                        are converted to float before being sent to the GPU;
;                        the caller's variable is left untouched.
;   gn  (3-D array)    - measured images, stacked along the third dimension
;
; OPTIONAL INPUT
;   The following options must be provided as keyword/value pairs.
;   Defaults are selected with KEYWORD_SET, so a keyword passed with the
;   value 0 is treated as if it was not given.
;   'OBJ'              - Exact solution, for error calculation (double array)
;   'BG'               - Background value (double)
;                        DEFAULT = 0
;   'MAXIT'            - Maximum number of iterations (integer)
;                        DEFAULT = 1000
;   'VERB'             - Verbosity level (integer)
;                        0 - silent
;                        1 - print some information at each iteration
;                        DEFAULT = 0
;   'STOPCRIT'         - Choice for stopping rule (integer)
;                        1 -> iter > MAXIT
;                        3 -> |KL_k - KL_(k-1)| <= tol*|KL_k| OR iter > MAXIT
;                        4 -> D_k <= tol OR iter > MAXIT  (D_k defined below)
;                        DEFAULT = 1
;   'TOL'              - Tolerance used in the stopping criterion
;                        DEFAULT = 1e-4 if STOPCRIT = 3
;                        DEFAULT = 1+1/mean(gn) if STOPCRIT = 4
;   'BOUND', 'STARS_COORD', 'DELTA_T'
;                      - Accepted for interface compatibility; they do not
;                        influence the computation performed by this routine
;                        (BOUND is only given the default value 0B).
;
; OUTPUT
;   x                  - Reconstructed data
;   iter               - Number of passes of the main loop that were executed
;   err                - Relative error ||x_k - OBJ|| / ||OBJ|| at the start of
;                        each pass (array of MAXIT+1 elements; entries beyond
;                        the last executed pass are left at 0). If OBJ was not
;                        given, err is the scalar 0.
;   Discr              - Discrepancy value after each pass (MAXIT+1 elements):
;                            D = 2/numel(gn) * KL( Ax_k + bg, gn)
;   times              - Time elapsed after each pass (MAXIT+1 elements)
;
; ------------------------------------------------------------------------------
;
; This software is developed within the research project
;
;        PRISMA - Optimization methods and software for inverse problems
;                           http://www.unife.it/prisma
;
; funded by the Italian Ministry for University and Research (MIUR), under
; the PRIN2008 initiative, grant n. 2008T5KA4L, 2010-2012. This software is
; part of the package "IRMA - Image Reconstruction in Microscopy and Astronomy"
; currently under development within the PRISMA project.
;
; Version: 1.0
; Date:    July 2011

; Authors:
;   Roberto Cavicchioli, Marco Prato, Luca Zanni
;    Dept. of Pure Appl. Math., Univ. of Modena and Reggio Emilia, Italy
;    roberto.cavicchioli@unimore.it, marco.prato@unimore.it, luca.zanni@unimore.it
;   Mario Bertero, Patrizia Boccacci
;    DISI (Dipartimento di Informatica e Scienze dell'Informazione), University of Genova, Italy
;    bertero@disi.unige.it, boccacci@disi.unige.it
;
; Software homepage: http://www.unife.it/prisma/software
;
; Copyright (C) 2011 by M. Bertero, P. Boccacci, R. Cavicchioli, M. Prato, L. Zanni.
; ------------------------------------------------------------------------------
; COPYRIGHT NOTIFICATION
;
; Permission to copy and modify this software and its documentation for
; internal research use is granted, provided that this notice is retained
; thereon and on all copies or modifications. The authors and their
; respective Universities makes no representations as to the suitability
; and operability of this software for any purpose. It is provided "as is"
; without express or implied warranty. Use of this software for commercial
; purposes is expressly prohibited without contacting the authors.
;
; This program is free software; you can redistribute it and/or modify it
; under the terms of the GNU General Public License as published by the
; Free Software Foundation; either version 3 of the License, or (at your
; option) any later version.
;
; This program is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
; See the GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License along
; with this program; if not, either visite http://www.gnu.org/licenses/
; or write to
; Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
; ==============================================================================

;initialize GPU device
gpuinit

; start the clock
t0 = systime(2)

;;;;;;;;;;;;;;;;;;;;;;;;;;;
; OSEM default parameters ;
;;;;;;;;;;;;;;;;;;;;;;;;;;;
errflag = 1B
err = 0
if (not keyword_set(bg))          then bg = 0.           ; background value
if (not keyword_set(bound))       then bound = 0B        ; bound effects
if (not keyword_set(maxit))       then maxit = 1000      ; maximum number of iterations
if (not keyword_set(stopcrit))    then stopcrit = 1      ; 1 -> number of iterations
if (not keyword_set(verb))        then verb = 0          ; 0 -> silent
if (not keyword_set(obj))         then errflag = 0B      ; error calculation

;;;;;;;;;;;;;;;;;;;;
; input validation ;
;;;;;;;;;;;;;;;;;;;;
; All checks are done before any GPU buffer is allocated, so that an invalid
; call does not leave GPU memory behind.
if (stopcrit ne 1 and stopcrit ne 3 and stopcrit ne 4) then begin
    ; MESSAGE takes a single text argument: build the full string here.
    message, 'Unknown stopping criterion: ' + strtrim(stopcrit, 2)
endif

dim = size(gn,/dimensions)
if (n_elements(dim) ne 3) then begin
    message, 'GN must be a 3-D array (nx, ny, number of images).'
endif

;;;;;;;;;;;;;;;;;;
; stop criterion ;
;;;;;;;;;;;;;;;;;;
if (not keyword_set(tol)) then begin
   if (stopcrit eq 3) then tol = 1.e-4
   if (stopcrit eq 4) then tol = 1. + 1./mean(gn)
endif

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; GPU buffers. The dim[2] images / transfer functions are stored side by
; side along the second dimension, i.e. in arrays of size
; dim[0] x (dim[2]*dim[1]).
TF_gpu = gpuComplexarr(dim[0], dim[2]*dim[1])
CTF_gpu = gpuComplexarr(dim[0], dim[2]*dim[1])
tmp_c_gpu = gpuComplexarr(dim[0], dim[1])
tmp_c2_gpu = gpuComplexarr(dim[0], dim[1])

flt_gpu = gpuFltarr(dim[0], dim[1])

rec_gpu = gpuDblArr(dim[0], dim[1])
gn_gpu = gpuDblArr(dim[0], dim[2]*dim[1])
x_tf_gpu = gpuDblArr(dim[0], dim[2]*dim[1])
den_gpu = gpuDblArr(dim[0], dim[2]*dim[1])
tmp_gpu = gpuDblArr(dim[0], dim[1])
tmp3_gpu = gpuDblArr(dim[0], dim[2]*dim[1])
temp_gpu = gpuDblArr(dim[0], dim[2]*dim[1])
ONE_gpu = gpuDblArr(dim[0], dim[1])

tot_gpu = gpuDblArr(dim[0], dim[1])
app_gpu = gpuDblarr(dim[0],1)
sum_gpu = gpuDblarr(1,1)

; Transfer functions: shift each slice of A by half its size, FFT it and
; store the result in the i-th block of TF_gpu.
for i = 0, dim[2]-1 do begin
    ; GPUlib works only in float with FFT operations, so each slice of A is
    ; converted on the fly (the caller's A is not modified).
    gpuComplex, float(reform(A[*,*,i])), tmp_c2_gpu
    tmp_c_gpu = gpushift(tmp_c2_gpu,dim[0]/2,dim[1]/2, LHS=tmp_c_gpu)
    tmp_c_gpu = gpufft(tmp_c_gpu, LHS=tmp_c_gpu, /DESTROYPLAN)
    gpuSubArr, tmp_c_gpu, -1, -1, TF_gpu, -1, [ i*dim[1], (i+1)*dim[1]-1 ]
endfor
; Rescale on the host and build the conjugate transfer functions.
; NOTE(review): the factor dim[0]*dim[1] presumably compensates the
; normalization applied by gpufft - confirm against the GPULib documentation.
TF = gpuGetArr(TF_gpu)
TF = TF*dim[0]*dim[1]
CTF = conj(TF)
gpuPutArr, TF, TF_gpu
gpuPutArr, CTF, CTF_gpu

; size of the images
obj_size = dim[0:1]

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; initializations and computations that need only once ;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
N = n_elements(gn)/dim[2]                     ; pixels in one image
flux = total(total(gn,1),1) - N * bg          ; background-subtracted flux of each image
rec = make_array(obj_size, value = mean(flux)/N, /double)   ; constant initial estimate

ONE = make_array(dim[0],dim[1],value = 1.,/DOUBLE)

gpuPutArr, gn, gn_gpu
gpuPutArr, rec, rec_gpu
gpuPutArr, ONE, ONE_gpu

;;;;;;;;;;;;;;;;;;;;;
; vector allocation ;
;;;;;;;;;;;;;;;;;;;;;
if errflag then begin
   err = dblarr(maxit+1)
   obj_gpu = gpuDblArr(dim[0], dim[1])
   gpuPutArr, obj, obj_gpu
   ; obj_sum = sum(obj^2), the normalization of the relative error
   tmp_gpu = gpumult(obj_gpu, obj_gpu, LHS=tmp_gpu)
   app_gpu = gputotal(tmp_gpu,2,LHS=app_gpu)
   sum_gpu = gputotal(app_gpu,1,LHS=sum_gpu)
   gpuGetArr, sum_gpu, obj_sum
endif
Discr = dblarr(maxit+1)
times = dblarr(maxit+1)

;;;;;;;;;;;;;;;;;
; start of OSEM ;
;;;;;;;;;;;;;;;;;
iter=0
loop=1
fv_prev = 0.d     ; objective value of the previous pass (used by stopcrit 3)

while loop do begin
    ; relative reconstruction error of the current estimate
    if errflag then begin
        tmp_gpu = gpuSub( rec_gpu , obj_gpu, LHS=tmp_gpu)
        tmp_gpu = gpumult(tmp_gpu,tmp_gpu, LHS=tmp_gpu)
        app_gpu = gputotal(tmp_gpu,2,LHS=app_gpu)
        sum_gpu = gputotal(app_gpu,1,LHS=sum_gpu)
        err[iter] = sqrt( gpuGetArr(sum_gpu) /obj_sum )
    endif

    ; one multiplicative update of rec per image; rec is updated inside the
    ; loop, so each image sees the estimate produced by the previous one
    for i = 0, dim[2]-1 do begin
        ; tmp = real( IFFT( TF_i * FFT(rec) ) ), i.e. the blurred estimate
        tmp_c_gpu = gpuComplex(rec_gpu, LHS=tmp_c_gpu)
        tmp_c_gpu = gpufft(tmp_c_gpu, LHS=tmp_c_gpu, /DESTROYPLAN)
        gpuSubArr, TF_gpu, -1, [ i*dim[1], (i+1)*dim[1]-1 ], tmp_c2_gpu, -1, -1
        tmp_c2_gpu = gpumult( tmp_c2_gpu, tmp_c_gpu, LHS=tmp_c2_gpu )
        tmp_c2_gpu = gpufft( tmp_c2_gpu, /inverse, LHS=tmp_c2_gpu, /DESTROYPLAN)
        flt_gpu = gpureal( tmp_c2_gpu , LHS=flt_gpu )
        tmp_gpu = gpufix( flt_gpu, TYPE=5, LHS=tmp_gpu)     ; TYPE=5: IDL code for double
        ; keep the blurred estimate for the objective function
        gpuSubArr, tmp_gpu, -1, -1, x_tf_gpu, -1, [ i*dim[1], (i+1)*dim[1]-1 ]
        ; den = 1*tmp + bg*ONE + 0
        gpuAdd, double(1.), tmp_gpu, double(bg), ONE_gpu, double(0.), den_gpu
        ; ratio between the i-th measured image and den, kept in temp_gpu
        gpuSubArr, gn_gpu, -1, [ i*dim[1], (i+1)*dim[1]-1 ], tot_gpu, -1, -1
        tmp_gpu = gpuDiv( tot_gpu, den_gpu, LHS=tmp_gpu)
        gpuSubArr, tmp_gpu, -1, -1, temp_gpu, -1, [ i*dim[1], (i+1)*dim[1]-1 ]
        ; tmp = real( IFFT( conj(TF_i) * FFT(ratio) ) )
        tmp_c_gpu = gpuComplex(tmp_gpu, LHS=tmp_c_gpu)
        tmp_c_gpu = gpufft(tmp_c_gpu, LHS=tmp_c_gpu, /DESTROYPLAN)
        gpuSubArr, CTF_gpu, -1, [ i*dim[1], (i+1)*dim[1]-1 ], tmp_c2_gpu, -1, -1
        tmp_c2_gpu = gpumult( tmp_c2_gpu, tmp_c_gpu, LHS=tmp_c2_gpu )
        tmp_c2_gpu = gpufft( tmp_c2_gpu, /inverse, LHS=tmp_c2_gpu, /DESTROYPLAN)
        flt_gpu = gpureal( tmp_c2_gpu , LHS=flt_gpu )
        tmp_gpu = gpufix( flt_gpu, TYPE=5, LHS=tmp_gpu)
        ; multiplicative update of the estimate
        rec_gpu = gpuMult( rec_gpu, tmp_gpu, LHS=rec_gpu)
    endfor

    ; sum_xtf = sum of the blurred estimates over all images
    app_gpu = gputotal(x_tf_gpu,2,LHS=app_gpu)
    sum_gpu = gputotal(app_gpu,1,LHS=sum_gpu)
    gpuGetArr, sum_gpu, sum_xtf

    ; sum_kl = sum( gn * log(gn/den) ) over all images
    tmp3_gpu = gpumult(gn_gpu, gpulog(temp_gpu,LHS=tmp3_gpu) ,LHS=tmp3_gpu)
    app_gpu = gputotal(tmp3_gpu,2,LHS=app_gpu)
    sum_gpu = gputotal(app_gpu,1,LHS=sum_gpu)
    gpuGetArr, sum_gpu, sum_kl

    ; objective function value
    fv =  sum_kl + sum_xtf - total(flux)

    ; discrepancy
    Discr[iter] = 2.*fv/(N*dim[2])

    times[iter] = systime(2) - t0
    iter = iter + 1

    case stopcrit of
        1: if verb gt 0 then begin
               print, 'it ', iter-1, ' of ', maxit
           endif
        3: begin
             if iter ne 1 then begin
                ; compare with the objective value of the previous pass; it
                ; is stored directly instead of being rebuilt from Discr, whose
                ; scaling includes the number of images dim[2]
                reldecrease = abs(fv - fv_prev)/abs(fv)
             endif else begin
                reldecrease = 1
             endelse
             loop = (reldecrease gt tol)
             if verb gt 0 then begin
                print, 'it ', iter-1, ', | f_k - f_k-1 | / | f_k | ', reldecrease, ', tol ', tol
             endif
           end
        4: begin
             if iter ne 1 then begin
                loop = (Discr[iter-1] gt tol)
             endif else begin
                loop = 1
             endelse
             if verb gt 0 then begin
                print, 'it ', iter-1, ', D_k ', Discr[iter-1], ', tol ', tol
             endif
           end
    endcase

    fv_prev = fv

    if iter gt maxit then loop = 0B

endwhile

; copy the reconstruction back to the host
gpuGetArr, rec_gpu, x
x = reform(x,obj_size)

; release GPU memory
if errflag then begin
   gpufree, obj_gpu
endif

gpufree, [TF_gpu, tmp_c_gpu, tmp_c2_gpu, CTF_gpu, rec_gpu, gn_gpu, ONE_gpu, tmp_gpu, tmp3_gpu, den_gpu, $
          temp_gpu, x_tf_gpu, app_gpu, sum_gpu, flt_gpu, tot_gpu ]

END

; ==============================================================================
; End of OSEM_deblurring_multi_bound_GPU.pro file - IRMA package
; ==============================================================================
