(* Code accompanying the paper

	Relating FFTW and Split-Radix
        Oleg Kiselyov and Walid Taha
        Embedded Software and Systems, First International Conference,
        ICESS 2004, Hangzhou, China, December 9-10, 2004, 
        Revised Selected Papers
	LNCS 3605, pp. 488-493
	http://dx.doi.org/10.1007/11535409_71

This self-contained MetaOCaml code derives radix-2 FFT kernels with the
number of additions and multiplications *exactly* matching those in the
kernels produced by FFTW. A slight change in the generation algorithm
produces mixed-radix FFT kernels.

Copyright, Oleg Kiselyov and Walid Taha, 2004.

*)


(*
	An attempt at a better FFT
        Use parsing in frequency rather than in time

 Formulas:
    The forward transform from an N-sample sequence
    x[j], j=0..N-1 to an N-sample sequence xf[k], k=0..N-1
    is

    (1) xf[k] = SUM{ x[j] *  w^(j*k), j=0..N-1 }, k=0..N-1
    where w_N = exp(-2*PI*I/N)
    is the N-th primitive root of unity.
    For the inverse transform, w_N=exp(+2*PI*I/N)
    (like FFTW, we omit the scaling factor for the inverse transform).

 We assume that N is an even number, N=2*N2
 We re-write (1) as two equations:
    (2a) xf[2k] = SUM{ x[j] * w^(2*j*k), j=0..N-1 }, k=0..N2-1
                = SUM{ x[j] * w^(2*j*k), j=0..N2-1}
                + SUM{ x[j+N2] * w^(2*(j+N2)*k), j=0..N2-1}
                = SUM{ x[j] * w2^(j*k), j=0..N2-1}
                + SUM{ x[j+N2] * w2^(j*k), j=0..N2-1}
                      w^(2*N2*k) = w^(N*k) = 1^k = 1
                      Here w2 = w^2 = w_N2
                = FFT{ x[j] + x[j+N2], j=0..N2-1 }

    (2b) xf[2k+1] = SUM{ x[j] * w^(2*j*k+j), j=0..N-1 }, k=0..N2-1
                  = SUM{ x[j] * w^(2*j*k+j), j=0..N2-1 }
                  + SUM{ x[j+N2] * w^(2*(j+N2)*k+(j+N2)), j=0..N2-1 }
                  = SUM{ x[j]*w^j * w2^(j*k), j=0..N2-1 }
                  + SUM{-x[j+N2]*w^j * w2^(j*k), j=0..N2-1 }
                    w^N2 = exp(-PI*I) = -1 (the same for the inverse FFT)
                  = FFT{ (x[j] - x[j+N2])*w^j, j=0..N2-1 }

 So, an N-point FFT reduces to two N/2-point FFTs.

 $Id: fft-neo.ml,v 1.13 2004/08/31 07:52:36 oleg Exp $
*)

(* ------------ unstaged nonmonadic list-based representation  ------------ *)
let pi = 4.0 *. atan(1.0)

type dir = Forward | Inverse

let w dir n j = (* exp(-2PI dir/N)^j *)
  let theta = ((float_of_int ((if dir = Forward then -2 else 2)* j)) *. pi)
              /. (float_of_int n) in
  ((cos theta), (sin theta)) 

(* complex arithm *)
let add_u ((r1,i1), (r2, i2)) = ((r1 +. r2), (i1 +. i2))
let sub_u ((r1,i1), (r2, i2)) = ((r1 -. r2), (i1 -. i2))

let mult_u ((r1, i1), (r2, i2)) =
  let rp = (r1 *. r2) -. (i1 *. i2) in
  let ip = (r1 *. i2) +. (r2 *. i1) in
  (rp, ip)

(* multiplex evens and odds. They are of the same size *)
let rec  merge_u  = function
    ([], []) -> []
  | (y0::y0rest,y1::y1rest) -> y0::y1::(merge_u (y0rest,y1rest))

let cleave_list l = (* split a non-empty list l into two equal halves *)
    let rec take_drop n l acc = 
      if n = 0 then (List.rev acc, l)
               else take_drop (n-1) (List.tl l) ((List.hd l)::acc)
    in take_drop ((List.length l) / 2) l []

(* given list l, size n, return (l1+l2, (li-l2)*w^j) see the title comments *)
let rec split_u dir l =
  let n = List.length l in
  let rec comb j = function
      (x::xs, y::ys) ->
	let (a,b) = comb (j+1) (xs,ys) in
	((add_u (x,y))::a, (mult_u ((sub_u (x,y)), w dir n j))::b)
    | _ -> ([],[]) in
  comb 0 (cleave_list l)

let rec fft_u dir (l) =
  if (List.length l = 1) then l
  else
    let (p0,p1)  = split_u dir l in
    let y0     = fft_u dir p0 in
    let y1     = fft_u dir p1 in
    merge_u (y0,y1)

let gen_u = fft_u

(* List of size n with (1,0) at position l *)
let rec impulse n l = 
   if n = 0 then []
   else let t = if l = 0 then (1.0,0.0) else (0.0,0.0)
   in t::(impulse (n-1) (l-1))

let rec unpair = function ((x,y)::tail) -> x::y::(unpair tail)
  | [] -> []

(* Determine the max abs difference (the inf-norm) of two lists of
   pairs.
*)
let inf_norm l1 l2 =
  let rec scan pos loc diff = 
    function ((x::tailx),(y::taily)) ->
      let curr_diff = abs_float (x -. y) in
      if curr_diff >= diff then scan (succ pos) pos curr_diff (tailx,taily)
      else scan (succ pos) loc diff (tailx,taily)
      |	([],[]) -> (loc,diff)
  in scan 0 (-1) 0.0 (l1,l2)

(* Tests *)
let () = assert ((7,0.0) = inf_norm
                 (List.map (fun x -> 4.0 *. x) (unpair (impulse 4 1 )))
		 (unpair ( fft_u Inverse (fft_u Forward (impulse 4 1)) )) )

let () = assert ((15,0.0) = inf_norm
                 (List.map (fun x -> 8.0 *. x) (unpair (impulse 8 1 )))
		 (unpair ( fft_u Inverse (fft_u Forward (impulse 8 1)) )) )


let () = assert ((31,0.0) = inf_norm
                 (List.map (fun x -> 16.0 *. x) (unpair (impulse 16 1 )))
		 (unpair ( fft_u Inverse (fft_u Forward (impulse 16 1)) )) )


(* --- unstaged monadic let-binding list-based representation  ------- *)

(* monad *)
let (ret,bind, y_m, run)  =
  let ret a s k = k s a in
  let bind a f s k = a s (fun s' x -> f x s' k) in
  (* Y combinator *)
  let rec y_m f = f (fun x s k -> y_m f x s k) in
  (* convert code list to list code *)
  let lift l = List.fold_right  (fun (x,y) z -> (x, y) :: z) l [] in
  let run f = f [] (fun s x -> x) in

  (ret,bind, y_m, run)


(* Monadic split *)
let rec split_mu dir l =
  let n = List.length l in
  let rec comb j = function
      (x::xs, y::ys) ->
	bind (comb (j+1) (xs,ys)) (fun (a,b) ->
	ret ((add_u (x,y))::a, (mult_u ((sub_u (x,y)), w dir n j))::b))
    | _ -> ret ([],[]) in
  comb 0 (cleave_list l)

let fft_mu dir f l =
  if List.length l = 1 then ret l
  else
    bind (split_mu dir l) (fun (p0,p1) ->
    bind (f p0) (fun y0 ->
    bind (f p1) (fun y1 ->
    ret (merge_u (y0,y1)))))

let gen_mu dir nums = run ((y_m (fft_mu dir)) nums)

(* Tests *)
let () = assert ((7,0.0) = inf_norm
                 (List.map (fun x -> 4.0 *. x) (unpair (impulse 4 1 )))
		 (unpair ( gen_mu Inverse (gen_mu Forward (impulse 4 1)) )) )

let () = assert ((15,0.0) = inf_norm
                 (List.map (fun x -> 8.0 *. x) (unpair (impulse 8 1 )))
		 (unpair ( gen_mu Inverse (gen_mu Forward (impulse 8 1)) )) )

let () = assert ((31,0.0) = inf_norm
                 (List.map (fun x -> 16.0 *. x) (unpair (impulse 16 1 )))
		 (unpair ( gen_mu Inverse (gen_mu Forward (impulse 16 1)) )) )


(* --- staged monadic let-binding array-based representation ----*)

(* Monadic library. See the EMSOFT paper *)
let (retS,retN,bind,y_sm,run) = 
  let retS a = fun s k -> k s a in 
  let retN a = fun s k -> .<let t = .~a in .~(k s .<t>.)>. in 
  let bind a f = 
    fun s k -> a s (fun s' b -> f b s' k) in
  let rec y_sm f =  f (fun x s k -> y_sm f x s k) in
  let run f (abstract_simple, concretize) size nums =
    (* Write the results into the array *)
    let array_write l arr = 
      let rec lfta l m =
        match l with
          [] -> arr
        | (c::tail) ->
            let sm = m+1 and ssm = m+2 in
	    let (x,y) = concretize c in
            .<((.~arr).(m) <- .~x;
               (.~arr).(sm) <- .~y;
               (.~(lfta tail (ssm))))>.
      in lfta l 0 in
    (* Read from an array *)
    let array_read arr n =
      let rec gl m l =
        if m = n then retS l
        else
          let sm = m+1 in
	  bind (abstract_simple (.<(.~arr).(m)>., .<(.~arr).(sm)>.))
	    (fun c -> gl (m+2) (l @ [c]))
      in gl 0 [] in
    let run_basic m = m [] (fun s x -> array_write x nums)
    in
    run_basic (bind (array_read nums size) f)
  in
  (retS,retN,bind,y_sm,run)

(* Standard auxiliary monadic operators *)

let liftM f = fun x -> retS (f x)

let compM a b =
  fun x -> bind (b x) (fun nx -> a nx)

let liftcM op (x,y) = 
  bind (op x) (fun nx ->
  bind (op y) (fun ny ->
  retS (nx,ny)))


let w_s dir n j =
  let fj = if (dir = Forward) then 
    (float_of_int (-2 * j))
  else
    (float_of_int (2 * j)) in
  let fn = (float_of_int n) in
  let theta = (fj *. pi) /. fn  in
  let ct = cos theta in 
  let st = sin theta in 
  (.<ct>., .<st>.) 

let add_s ((r1,i1), (r2, i2)) = ((.<.~r1 +. .~r2>.), (.<.~i1 +. .~i2>.))
let sub_s ((r1,i1), (r2, i2)) = ((.<.~r1 -. .~r2>.), (.<.~i1 -. .~i2>.))

let mult_s ((r1, i1), (r2, i2)) =
  let rp = .<(.~r1 *. .~r2) -. (.~i1 *. .~i2)>. in
  let ip = .<(.~r1 *. .~i2) +. (.~r2 *. .~i1)>. in
  (rp, ip)

(* Staged Monadic split *)
let split_sm dir l =
  let n = List.length l in
  let rec comb j = function
      (x::xs, y::ys) ->
	bind (comb (j+1) (xs,ys)) (fun (a,b) ->
	retS ((add_s (x,y))::a, (mult_s ((sub_s (x,y)), w_s dir n j))::b))
    | _ -> retS ([],[]) in
  comb 0 (cleave_list l)

let fft_sm dir f l =
  if List.length l = 1 then ret l
  else
    bind (split_sm dir l) (fun (p0,p1) ->
    bind (f p0) (fun y0 ->
    bind (f p1) (fun y1 ->
    retS (merge_u (y0,y1)))))

let gen_marr dir size nums = 
  run (y_sm (fft_sm dir))
    ((liftcM retN),fun x -> x)
    size nums

(* To see the code
.<fun x -> .~(gen_marr Forward  (4 * 2) .<x>.)>.
*)

let test_marr dir lst =
  let n = List.length lst in
  let arr = Array.make (n * 2)  0.0 in
  let _ = List.fold_left (fun n (a,b) -> (arr.(n) <- a; arr.(n+1) <-b; n+2))
      0 lst in
  let rec tolist arr i = if i = Array.length arr then []
      else (arr.(i),arr.(i+1))::(tolist arr (i+2)) in
  let tranc = .<fun x -> .~(gen_marr dir (n * 2) .<x>.)>. in
  let tran = (.!tranc) (arr) in
  tolist tran 0

(* Tests *)
let () = assert ((7,0.0) = inf_norm
                 (List.map (fun x -> 4.0 *. x) (unpair (impulse 4 1 )))
		 (unpair (test_marr Inverse (test_marr Forward (impulse 4 1)))
			  ))

let () = assert ((10, 5.11946041115152184e-12) = inf_norm
                 (List.map (fun x -> 8.0 *. x) (unpair (impulse 8 1 )))
		 (unpair (test_marr Inverse (test_marr Forward (impulse 8 1)))
			  ))

let () = assert ((18, 7.34878824459883617e-12) = inf_norm
                 (List.map (fun x -> 16.0 *. x) (unpair (impulse 16 1 )))
		 (unpair (test_marr Inverse (test_marr Forward (impulse 16 1)))
			  ))


(* --- staged monadic let-binding array-based representation ----*)
(* --- Abstract interpretation ----*)
(* We note that FFT is a linear, and a uniform transform.
 Therefore, all complex expressions have a grammar
   <known_factor> * <code>
   <code> + <code>
Thus the abstraction domain is a pair: 
  known_complex_factor, unknown_piece_of_code
Unknown pieces of complex code are represented as pairs of
real code.
Furthermore, in FFT, all factors are roots of unity. So, we can
represent them as pairs of integers (j,n) where abs(j) < n
A pair (j,n) represents exp(2PI*I j/n)

When performing additions of two pieces of code, we may need to do
some let-binding.
*)

(* Rational arithmetics *)
type rat = int * int (* j/n *)
let rat_one = (0,1)
(*
type  newfloats_aa = Code of rat * (float) code * (float) code
*)
(* *)

(* The abstraction domain *)

type  'a newfloats_aa = Code of rat * ('a,float) code * ('a,float) code

(* *)
let rec gcd a b =
   if a > b then gcd b a
   else if a = 1 then 1
   else if a = 0 then b
   else gcd (b-a) a

let rec rationalize (j,n) =
   if j = 0 then rat_one
   else if j < 0 then rationalize (n+j,n)
   else if j >= n then rationalize (j-n,n)
   else let k = gcd j n in
   if k = 1 then (j,n) else (j/k, n/k)

let add_rat (j1,n1) (j2,n2) = rationalize (j1*n2+j2*n1, n1*n2)
let sub_rat (j1,n1) (j2,n2) = rationalize (j1*n2-j2*n1, n1*n2)
let min_rat ((j1,n1) as r1) ((j2,n2) as r2) =
   if j1*n2 > j2*n1 then r2 else r1

let str_rat (j,n) = "("^ (string_of_int j) ^","^ (string_of_int n) ^")"

(* rotate rat by 90 or 180 degrees to bring it to the 1 quadrant
   Return the rotation factor and the new rat.
   Their sum is the original rat.
*)
let rec rotate_to_QI ((j,n) as f) =
   if 2*j >= n then 
      let (rf,newf) = rotate_to_QI (sub_rat f (1,2))
      in  (add_rat rf (1,2),newf)
   else if 4*j >= n then ((1,4),sub_rat f (1,4)) else (rat_one,f)


(* This is a semi-concretization function that forces
   the multiplication up to trivail factors, +/- 1, +/- I *)
let force_mult (Code (f,rx,ix)) =
  let (newf,(j,n)) = rotate_to_QI f in
  let theta = float_of_int (2*j) *. pi /. float_of_int n in
  let c = cos theta in
  let s = sin theta in
  let () = assert (c > 0.0 && s > 0.0)
  in
  bind (retN .<c *. .~rx -. s *. .~ix>.) (fun rcs ->
  bind (retN .<s *. .~rx +. c *. .~ix>.) (fun ics ->
  ret (Code (newf,rcs,ics))))


let w_aa dir n j = match dir with
     Forward -> rationalize (-j,n)
   | Inverse -> rationalize (j,n)

(* Multiply an abstract value by a known factor: a root of unity *)
let mult_aa (Code (f,re,im)) ru = Code (add_rat f ru, re,im)

let neg_aa c = mult_aa c (1,2) (* second root of unity of order 2 is -1 *)

(* Perform (rx,ix) +/- _complex_fy*(ry,iy) and set the resulting
   factor to fx
   Return a pair: fx* (x + complex_fy*y), fx*(x - complex_fy*y)
   This is where many simplifications happen
*)
let rec do_linear fx rx ix ((j,n) as fy) ry iy =
   if 2*j >= n then (* fy in III and IV quadrants: reflect *)
            bind (do_linear fx rx ix (sub_rat fy (1,2)) ry iy)
            (fun (p,m) -> retS (m,p)) else
   match fy with (* x + y *)
   (0,1) -> bind (retN .<.~rx +. .~ry>.) (fun re ->
            bind (retN .<.~ix +. .~iy>.) (fun im ->
            bind (retN .<.~rx -. .~ry>.) (fun re1 ->
            bind (retN .<.~ix -. .~iy>.) (fun im1 ->
            ret (Code (fx,re,im),Code (fx,re1,im1))))))
  |(1,4) -> (* x + I*y *)
            bind (retN .<.~rx -. .~iy>.) (fun re ->
            bind (retN .<.~ix +. .~ry>.) (fun im ->
            bind (retN .<.~rx +. .~iy>.) (fun re1 ->
            bind (retN .<.~ix -. .~ry>.) (fun im1 ->
            ret (Code (fx,re,im),Code (fx,re1,im1) )))))
  |(j,8) -> (* x + (cos pi/4 + I*sin pi/4)*y *)
            let cs = sin (pi /. 4.0) in
            bind (retN .<cs *. (.~ry -. .~iy)>.) (fun rcs ->
            bind (retN .<cs *. (.~ry +. .~iy)>.) (fun ics ->
            do_linear fx rx ix (rationalize (j-1,8)) rcs ics))
  |(j,n) -> bind (force_mult (Code (fy,ry,iy))) (fun (Code (fy,ry,iy)) ->
	    do_linear fx rx ix fy ry iy)


(* Decide, if multiplication by f is trivial.
   See the table above for the execution of trivial multiplication
*)
let rec trivial_mult ((j,n) as f) =
   if 2*j >= n then (* f in III and IV quadrants: reflect *)
                trivial_mult (rationalize (j-n/2,n))
   else match f with
    (0,1) -> true (* multiplication by 1 *)
   |(1,4) -> true (* multiplication by I *)
   | _    -> false
   

(* This one is monadic: it may need to let-bind something
   Note: the add_sub_aa function is not symmetric!
   We need to compute fx*<code> +/- fy*<code>
   There are two ways to proceed:
    (1) let e1 = fx*<code> and e2 = fy*<code> in e1 +/- e2
    (2) fx*( <code> +/- (fy/fx)*<code> )
   The second way is better when either the multiplication by
   fx is trivial or by (fy/fx) is trivial
   Otherwise, the first way is better
   The second method is performed by do_linear
*)
let rec add_sub_aa ((Code (fx,rx,ix)) as x) ((Code (fy,ry,iy)) as y) =
   let fyx = (sub_rat fy fx) in
   if (trivial_mult fx) || (trivial_mult fyx) then
      do_linear fx rx ix fyx ry iy
   else 
   bind (force_mult x) (fun x ->
   bind (force_mult y) (fun y ->
   add_sub_aa x y))

(* Staged Monadic split *)
let split_aa dir l =
(*
  let () = print_string "\nBefore split: ";
    List.iter (fun (Code (fx,_,_)) ->Printf.printf "%s " (str_rat fx))
      l in
*)
  let n = List.length l in
  let rec comb j = function
      (x::xs, y::ys) ->
        bind (add_sub_aa x y) (fun (xpy,xmy) ->
	bind (comb (j+1) (xs,ys)) (fun (a,b) ->
        retS (xpy::a, (mult_aa xmy (w_aa dir n j))::b)))
    | _ -> retS ([],[]) in
  comb 0 (cleave_list l)

let fft_aa dir f l =
  if List.length l = 1 then ret l
  else
    bind (split_aa dir l) (fun (p0,p1) ->
    bind (f p0) (fun y0 ->
    bind (f p1) (fun y1 ->
    retS (merge_u (y0,y1)))))

let gen_aa dir size nums = 
  run (y_sm (fft_aa dir))
    ((fun p -> (bind (liftcM retN p) 
		  (fun (x,y) -> retS (Code (rat_one, x, y))))),
     (* here we assert c is a simple piece of code!*)
     (* And it is because the last stage of *)
     (* FFT is only a+b and a-b *)
     fun (Code ((0,1),x,y)) -> (x,y))
    size nums

(* To see the code
.<fun x -> .~(gen_aa Forward  (4 * 2) .<x>.)>.;;
*)

let test_aa dir lst =
  let n = List.length lst in
  let arr = Array.make (n * 2)  0.0 in
  let _ = List.fold_left (fun n (a,b) -> (arr.(n) <- a; arr.(n+1) <-b; n+2))
      0 lst in
  let rec tolist arr i = if i = Array.length arr then []
      else (arr.(i),arr.(i+1))::(tolist arr (i+2)) in
  let tranc = .<fun x -> .~(gen_aa dir (n * 2) .<x>.)>. in
  let tran = (.!tranc) (arr) in
  tolist tran 0

(* Tests *)
let () = assert ((7,0.0) = inf_norm
                 (List.map (fun x -> 4.0 *. x) (unpair (impulse 4 1 )))
		 (unpair (test_aa Inverse (test_aa Forward (impulse 4 1)))
			  ))

let () = assert ((10, 5.11946041115152184e-12) = inf_norm
                 (List.map (fun x -> 8.0 *. x) (unpair (impulse 8 1 )))
		 (unpair (test_aa Inverse (test_aa Forward (impulse 8 1)))
			  ))

let () = assert ((18, 9.90851845017459709e-12) = inf_norm
                 (List.map (fun x -> 16.0 *. x) (unpair (impulse 16 1 )))
		 (unpair (test_aa Inverse (test_aa Forward (impulse 16 1)))
			  ))

(* Sample of the genereated code:
 .<fun x -> .~(gen_aa Forward  (16 * 2) .<x>.)>.;;
- : ('a, float array -> float array) code =
.<fun (x_3600) ->
   let x1_3601 = (x_3600).(0) in
   let y1_3602 = (x_3600).(1) in
   let x1_3603 = (x_3600).(2) in
   let y1_3604 = (x_3600).(3) in
   let x1_3605 = (x_3600).(4) in
   let y1_3606 = (x_3600).(5) in
   let x1_3607 = (x_3600).(6) in
   let y1_3608 = (x_3600).(7) in
   let x1_3609 = (x_3600).(8) in
   let y1_3610 = (x_3600).(9) in
   let x1_3611 = (x_3600).(10) in
   let y1_3612 = (x_3600).(11) in
   let x1_3613 = (x_3600).(12) in
   let y1_3614 = (x_3600).(13) in
   let x1_3615 = (x_3600).(14) in
   let y1_3616 = (x_3600).(15) in
   let x1_3617 = (x_3600).(16) in
   let y1_3618 = (x_3600).(17) in
   let x1_3619 = (x_3600).(18) in
   let y1_3620 = (x_3600).(19) in
   let x1_3621 = (x_3600).(20) in
   let y1_3622 = (x_3600).(21) in
   let x1_3623 = (x_3600).(22) in
   let y1_3624 = (x_3600).(23) in
   let x1_3625 = (x_3600).(24) in
   let y1_3626 = (x_3600).(25) in
   let x1_3627 = (x_3600).(26) in
   let y1_3628 = (x_3600).(27) in
   let x1_3629 = (x_3600).(28) in
   let y1_3630 = (x_3600).(29) in
   let x1_3631 = (x_3600).(30) in
   let y1_3632 = (x_3600).(31) in
   let (t_3633) = (x1_3601 +. x1_3617) in
   let (t_3634) = (y1_3602 +. y1_3618) in
   let (t_3635) = (x1_3601 -. x1_3617) in
   let (t_3636) = (y1_3602 -. y1_3618) in
   let (t_3637) = (x1_3603 +. x1_3619) in
   let (t_3638) = (y1_3604 +. y1_3620) in
   let (t_3639) = (x1_3603 -. x1_3619) in
   let (t_3640) = (y1_3604 -. y1_3620) in
   let (t_3641) = (x1_3605 +. x1_3621) in
   let (t_3642) = (y1_3606 +. y1_3622) in
   let (t_3643) = (x1_3605 -. x1_3621) in
   let (t_3644) = (y1_3606 -. y1_3622) in
   let (t_3645) = (x1_3607 +. x1_3623) in
   let (t_3646) = (y1_3608 +. y1_3624) in
   let (t_3647) = (x1_3607 -. x1_3623) in
   let (t_3648) = (y1_3608 -. y1_3624) in
   let (t_3649) = (x1_3609 +. x1_3625) in
   let (t_3650) = (y1_3610 +. y1_3626) in
   let (t_3651) = (x1_3609 -. x1_3625) in
   let (t_3652) = (y1_3610 -. y1_3626) in
   let (t_3653) = (x1_3611 +. x1_3627) in
   let (t_3654) = (y1_3612 +. y1_3628) in
   let (t_3655) = (x1_3611 -. x1_3627) in
   let (t_3656) = (y1_3612 -. y1_3628) in
   let (t_3657) = (x1_3613 +. x1_3629) in
   let (t_3658) = (y1_3614 +. y1_3630) in
   let (t_3659) = (x1_3613 -. x1_3629) in
   let (t_3660) = (y1_3614 -. y1_3630) in
   let (t_3661) = (x1_3615 +. x1_3631) in
   let (t_3662) = (y1_3616 +. y1_3632) in
   let (t_3663) = (x1_3615 -. x1_3631) in
   let (t_3664) = (y1_3616 -. y1_3632) in
   let (t_3665) = ((t_3633) +. (t_3649)) in
   let (t_3666) = ((t_3634) +. (t_3650)) in
   let (t_3667) = ((t_3633) -. (t_3649)) in
   let (t_3668) = ((t_3634) -. (t_3650)) in
   let (t_3669) = ((t_3637) +. (t_3653)) in
   let (t_3670) = ((t_3638) +. (t_3654)) in
   let (t_3671) = ((t_3637) -. (t_3653)) in
   let (t_3672) = ((t_3638) -. (t_3654)) in
   let (t_3673) = ((t_3641) +. (t_3657)) in
   let (t_3674) = ((t_3642) +. (t_3658)) in
   let (t_3675) = ((t_3641) -. (t_3657)) in
   let (t_3676) = ((t_3642) -. (t_3658)) in
   let (t_3677) = ((t_3645) +. (t_3661)) in
   let (t_3678) = ((t_3646) +. (t_3662)) in
   let (t_3679) = ((t_3645) -. (t_3661)) in
   let (t_3680) = ((t_3646) -. (t_3662)) in
   let (t_3681) = ((t_3665) +. (t_3673)) in
   let (t_3682) = ((t_3666) +. (t_3674)) in
   let (t_3683) = ((t_3665) -. (t_3673)) in
   let (t_3684) = ((t_3666) -. (t_3674)) in
   let (t_3685) = ((t_3669) +. (t_3677)) in
   let (t_3686) = ((t_3670) +. (t_3678)) in
   let (t_3687) = ((t_3669) -. (t_3677)) in
   let (t_3688) = ((t_3670) -. (t_3678)) in
   let (t_3689) = ((t_3681) +. (t_3685)) in
   let (t_3690) = ((t_3682) +. (t_3686)) in
   let (t_3691) = ((t_3681) -. (t_3685)) in
   let (t_3692) = ((t_3682) -. (t_3686)) in
   let (t_3693) = ((t_3683) -. (t_3688)) in
   let (t_3694) = ((t_3684) +. (t_3687)) in
   let (t_3695) = ((t_3683) +. (t_3688)) in
   let (t_3696) = ((t_3684) -. (t_3687)) in
   let (t_3697) = ((t_3667) -. (t_3676)) in
   let (t_3698) = ((t_3668) +. (t_3675)) in
   let (t_3699) = ((t_3667) +. (t_3676)) in
   let (t_3700) = ((t_3668) -. (t_3675)) in
   let (t_3701) = ((t_3671) -. (t_3680)) in
   let (t_3702) = ((t_3672) +. (t_3679)) in
   let (t_3703) = ((t_3671) +. (t_3680)) in
   let (t_3704) = ((t_3672) -. (t_3679)) in
   let (t_3705) = (0.707106781187 *. ((t_3703) -. (t_3704))) in
   let (t_3706) = (0.707106781187 *. ((t_3703) +. (t_3704))) in
   let (t_3707) = ((t_3699) -. (t_3706)) in
   let (t_3708) = ((t_3700) +. (t_3705)) in
   let (t_3709) = ((t_3699) +. (t_3706)) in
   let (t_3710) = ((t_3700) -. (t_3705)) in
   let (t_3711) = (0.707106781187 *. ((t_3701) -. (t_3702))) in
   let (t_3712) = (0.707106781187 *. ((t_3701) +. (t_3702))) in
   let (t_3713) = ((t_3697) +. (t_3711)) in
   let (t_3714) = ((t_3698) +. (t_3712)) in
   let (t_3715) = ((t_3697) -. (t_3711)) in
   let (t_3716) = ((t_3698) -. (t_3712)) in
   let (t_3717) = ((t_3635) -. (t_3652)) in
   let (t_3718) = ((t_3636) +. (t_3651)) in
   let (t_3719) = ((t_3635) +. (t_3652)) in
   let (t_3720) = ((t_3636) -. (t_3651)) in
   let (t_3721) = ((t_3639) -. (t_3656)) in
   let (t_3722) = ((t_3640) +. (t_3655)) in
   let (t_3723) = ((t_3639) +. (t_3656)) in
   let (t_3724) = ((t_3640) -. (t_3655)) in
   let (t_3725) = ((t_3643) -. (t_3660)) in
   let (t_3726) = ((t_3644) +. (t_3659)) in
   let (t_3727) = ((t_3643) +. (t_3660)) in
   let (t_3728) = ((t_3644) -. (t_3659)) in
   let (t_3729) = ((t_3647) -. (t_3664)) in
   let (t_3730) = ((t_3648) +. (t_3663)) in
   let (t_3731) = ((t_3647) +. (t_3664)) in
   let (t_3732) = ((t_3648) -. (t_3663)) in
   let (t_3733) = (0.707106781187 *. ((t_3727) -. (t_3728))) in
   let (t_3734) = (0.707106781187 *. ((t_3727) +. (t_3728))) in
   let (t_3735) = ((t_3719) -. (t_3734)) in
   let (t_3736) = ((t_3720) +. (t_3733)) in
   let (t_3737) = ((t_3719) +. (t_3734)) in
   let (t_3738) = ((t_3720) -. (t_3733)) in
   let (t_3739) =
    ((0.382683432365 *. (t_3723)) -. (0.923879532511 *. (t_3724))) in
   let (t_3740) =
    ((0.923879532511 *. (t_3723)) +. (0.382683432365 *. (t_3724))) in
   let (t_3741) =
    ((0.923879532511 *. (t_3731)) -. (0.382683432365 *. (t_3732))) in
   let (t_3742) =
    ((0.382683432365 *. (t_3731)) +. (0.923879532511 *. (t_3732))) in
   let (t_3743) = ((t_3739) +. (t_3741)) in
   let (t_3744) = ((t_3740) +. (t_3742)) in
   let (t_3745) = ((t_3739) -. (t_3741)) in
   let (t_3746) = ((t_3740) -. (t_3742)) in
   let (t_3747) = ((t_3737) -. (t_3744)) in
   let (t_3748) = ((t_3738) +. (t_3743)) in
   let (t_3749) = ((t_3737) +. (t_3744)) in
   let (t_3750) = ((t_3738) -. (t_3743)) in
   let (t_3751) = ((t_3735) +. (t_3745)) in
   let (t_3752) = ((t_3736) +. (t_3746)) in
   let (t_3753) = ((t_3735) -. (t_3745)) in
   let (t_3754) = ((t_3736) -. (t_3746)) in
   let (t_3755) = (0.707106781187 *. ((t_3725) -. (t_3726))) in
   let (t_3756) = (0.707106781187 *. ((t_3725) +. (t_3726))) in
   let (t_3757) = ((t_3717) +. (t_3755)) in
   let (t_3758) = ((t_3718) +. (t_3756)) in
   let (t_3759) = ((t_3717) -. (t_3755)) in
   let (t_3760) = ((t_3718) -. (t_3756)) in
   let (t_3761) =
    ((0.923879532511 *. (t_3721)) -. (0.382683432365 *. (t_3722))) in
   let (t_3762) =
    ((0.382683432365 *. (t_3721)) +. (0.923879532511 *. (t_3722))) in
   let (t_3763) =
    ((0.382683432365 *. (t_3729)) -. (0.923879532511 *. (t_3730))) in
   let (t_3764) =
    ((0.923879532511 *. (t_3729)) +. (0.382683432365 *. (t_3730))) in
   let (t_3765) = ((t_3761) +. (t_3763)) in
   let (t_3766) = ((t_3762) +. (t_3764)) in
   let (t_3767) = ((t_3761) -. (t_3763)) in
   let (t_3768) = ((t_3762) -. (t_3764)) in
   let (t_3769) = ((t_3759) -. (t_3768)) in
   let (t_3770) = ((t_3760) +. (t_3767)) in
   let (t_3771) = ((t_3759) +. (t_3768)) in
   let (t_3772) = ((t_3760) -. (t_3767)) in
   let (t_3773) = ((t_3757) +. (t_3765)) in
   let (t_3774) = ((t_3758) +. (t_3766)) in
   let (t_3775) = ((t_3757) -. (t_3765)) in
   let (t_3776) = ((t_3758) -. (t_3766)) in
   (x_3600).(0) <- (t_3689);
   (x_3600).(1) <- (t_3690);
   (x_3600).(2) <- (t_3749);
   (x_3600).(3) <- (t_3750);
   (x_3600).(4) <- (t_3709);
   (x_3600).(5) <- (t_3710);
   (x_3600).(6) <- (t_3771);
   (x_3600).(7) <- (t_3772);
   (x_3600).(8) <- (t_3695);
   (x_3600).(9) <- (t_3696);
   (x_3600).(10) <- (t_3753);
   (x_3600).(11) <- (t_3754);
   (x_3600).(12) <- (t_3715);
   (x_3600).(13) <- (t_3716);
   (x_3600).(14) <- (t_3775);
   (x_3600).(15) <- (t_3776);
   (x_3600).(16) <- (t_3691);
   (x_3600).(17) <- (t_3692);
   (x_3600).(18) <- (t_3747);
   (x_3600).(19) <- (t_3748);
   (x_3600).(20) <- (t_3707);
   (x_3600).(21) <- (t_3708);
   (x_3600).(22) <- (t_3769);
   (x_3600).(23) <- (t_3770);
   (x_3600).(24) <- (t_3693);
   (x_3600).(25) <- (t_3694);
   (x_3600).(26) <- (t_3751);
   (x_3600).(27) <- (t_3752);
   (x_3600).(28) <- (t_3713);
   (x_3600).(29) <- (t_3714);
   (x_3600).(30) <- (t_3773);
   (x_3600).(31) <- (t_3774);
   (x_3600)>.
*)

(* .!{Trx.run_gcc with compiler_flags="-g -O2"} *)

(* Generating code files.

let path = "/local/rap/oleg/fft-2/fft-output/aa" in
let gencode n = 
  let nr = 2*n in
  let code = .<fun x -> .~(gen_aa Forward  nr .<x>.); 0>. in
  let code_str = Trx.C.toC code in
  let chan = open_out (path ^ (string_of_int n) ^ ".c") in
  let () = output_string chan code_str in
  close_out chan
in
List.map gencode [4;8]
;;

*)

(* ----------------------------------------------------------------------- *)
(* Avoiding initial assignments from array values to variables *)
(* array references become duplicated *)
(* convert input array to list format *)

let gen_ab dir size nums = 
  run (y_sm (fft_aa dir))
    ((fun p -> (bind (liftcM retS p)  (* The only difference is retS here *)
		  (fun (x,y) -> retS (Code (rat_one, x, y))))),
     (* here we assert c is a simple piece of code!*)
     (* And it is because the last stage of *)
     (* FFT is only a+b and a-b *)
     fun (Code ((0,1),x,y)) -> (x,y))
    size nums

(* To see the code
.<fun x -> .~(gen_ab Forward  (4 * 2) .<x>.)>.;;
*)

let test_ab dir lst =
  let n = List.length lst in
  let arr = Array.make (n * 2)  0.0 in
  let _ = List.fold_left (fun n (a,b) -> (arr.(n) <- a; arr.(n+1) <-b; n+2))
      0 lst in
  let rec tolist arr i = if i = Array.length arr then []
      else (arr.(i),arr.(i+1))::(tolist arr (i+2)) in
  let tranc = .<fun x -> .~(gen_ab dir (n * 2) .<x>.)>. in
  let tran = (.!tranc) (arr) in
  tolist tran 0

(* Tests *)
let () = assert ((7,0.0) = inf_norm
                 (List.map (fun x -> 4.0 *. x) (unpair (impulse 4 1 )))
		 (unpair (test_ab Inverse (test_ab Forward (impulse 4 1)))
			  ))

let () = assert ((10, 5.11946041115152184e-12) = inf_norm
                 (List.map (fun x -> 8.0 *. x) (unpair (impulse 8 1 )))
		 (unpair (test_ab Inverse (test_ab Forward (impulse 8 1)))
			  ))

let () = assert ((18, 9.90851845017459709e-12) = inf_norm
                 (List.map (fun x -> 16.0 *. x) (unpair (impulse 16 1 )))
		 (unpair (test_ab Inverse (test_ab Forward (impulse 16 1)))
			  ))

(* ----------------------------------------------------------------------- *)
(* Using complex multiplication with three multiplies *)
(* if x = a + ib and y = c + id  *)
(* we compute t1 = (a+b)*(c-d) *)
(*            t2 = a*d         *)
(*            t3 = b*c         *)
(* so that                     *)
(*   Re(x*y) = t1 + t2 - t3    *)
(*   Im(x*y) = t2 + t3         *)
(* 3 multipllications and 5 additions *)

(* This is a semi-concretization function that forces
   the multiplication up to trivial factors, +/- 1, +/- I *)
let force_mult_ac (Code (f,rx,ix)) =
  let (newf,(j,n)) = rotate_to_QI f in
  let theta = float_of_int (2*j) *. pi /. float_of_int n in
  let c = cos theta in
  let s = sin theta in
  let cs = c +. s in
  let () = assert (c > 0.0 && s > 0.0)
  in
  bind (retN .<cs *. (.~rx -. .~ix)>.) (fun t1 ->
  bind (retN .<c *. .~ix>.) (fun t2 ->
  bind (retN .<s *. .~rx>.) (fun t3 ->
  bind (retN .<.~t1 +. .~t2 -. .~t3>.) (fun rcs ->
  bind (retN .<.~t2 +. .~t3>.) (fun ics ->
  ret (Code (newf,rcs,ics)))))))

(* Perform (rx,ix) +/- _complex_fy*(ry,iy) and set the resulting
   factor to fx
   Return a pair: fx* (x + complex_fy*y), fx*(x - complex_fy*y)
   This is where many simplifications happen
*)
let rec do_linear_ac fx rx ix ((j,n) as fy) ry iy =
   if 2*j >= n then (* fy in III and IV quadrants: reflect *)
            bind (do_linear_ac fx rx ix (sub_rat fy (1,2)) ry iy)
            (fun (p,m) -> retS (m,p)) else
   match fy with (* x + y *)
   (0,1) -> bind (retN .<.~rx +. .~ry>.) (fun re ->
            bind (retN .<.~ix +. .~iy>.) (fun im ->
            bind (retN .<.~rx -. .~ry>.) (fun re1 ->
            bind (retN .<.~ix -. .~iy>.) (fun im1 ->
            ret (Code (fx,re,im),Code (fx,re1,im1))))))
  |(1,4) -> (* x + I*y *)
            bind (retN .<.~rx -. .~iy>.) (fun re ->
            bind (retN .<.~ix +. .~ry>.) (fun im ->
            bind (retN .<.~rx +. .~iy>.) (fun re1 ->
            bind (retN .<.~ix -. .~ry>.) (fun im1 ->
            ret (Code (fx,re,im),Code (fx,re1,im1) )))))
  |(j,8) -> (* x + (cos pi/4 + I*sin pi/4)*y *)
            let cs = sin (pi /. 4.0) in
            bind (retN .<cs *. (.~ry -. .~iy)>.) (fun rcs ->
            bind (retN .<cs *. (.~ry +. .~iy)>.) (fun ics ->
            do_linear_ac fx rx ix (rationalize (j-1,8)) rcs ics))
  |(j,n) -> bind (force_mult_ac (Code (fy,ry,iy))) (fun (Code (fy,ry,iy)) ->
	    do_linear_ac fx rx ix fy ry iy)

(* This one is monadic: it may need to let-bind something
   Note: the add_sub_ac function is not symmetric!
   We need to compute fx*<code> +/- fy*<code>
   There are two ways to proceed:
    (1) let e1 = fx*<code> and e2 = fy*<code> in e1 +/- e2
    (2) fx*( <code> +/- (fy/fx)*<code> )
   The second way is better when either the multiplication by
   fx is trivial or by (fy/fx) is trivial
   Otherwise, the first way is better
   The second method is performed by do_linear
*)
let rec add_sub_ac ((Code (fx,rx,ix)) as x) ((Code (fy,ry,iy)) as y) =
   let fyx = (sub_rat fy fx) in
   if (trivial_mult fx) || (trivial_mult fyx) then
      do_linear_ac fx rx ix fyx ry iy
   else 
   bind (force_mult_ac x) (fun x ->
   bind (force_mult_ac y) (fun y ->
   add_sub_ac x y))

(* Staged Monadic split *)
let split_ac dir l =
(*
  let () = print_string "\nBefore split: ";
    List.iter (fun (Code (fx,_,_)) ->Printf.printf "%s " (str_rat fx))
      l in
*)
  let n = List.length l in
  let rec comb j = function
      (x::xs, y::ys) ->
        bind (add_sub_ac x y) (fun (xpy,xmy) ->
	bind (comb (j+1) (xs,ys)) (fun (a,b) ->
        retS (xpy::a, (mult_aa xmy (w_aa dir n j))::b)))
    | _ -> retS ([],[]) in
  comb 0 (cleave_list l)

let fft_ac dir f l =
  if List.length l = 1 then ret l
  else
    bind (split_ac dir l) (fun (p0,p1) ->
    bind (f p0) (fun y0 ->
    bind (f p1) (fun y1 ->
    retS (merge_u (y0,y1)))))

let gen_ac dir size nums = 
  run (y_sm (fft_ac dir))
    ((fun p -> (bind (liftcM retN p) 
		  (fun (x,y) -> retS (Code (rat_one, x, y))))),
     (* here we assert c is a simple piece of code!*)
     (* And it is because the last stage of *)
     (* FFT is only a+b and a-b *)
     fun (Code ((0,1),x,y)) -> (x,y))
    size nums

(* To see the code
.<fun x -> .~(gen_ac Forward  (4 * 2) .<x>.)>.;;
*)

let test_ac dir lst =
  let n = List.length lst in
  let arr = Array.make (n * 2)  0.0 in
  let _ = List.fold_left (fun n (a,b) -> (arr.(n) <- a; arr.(n+1) <-b; n+2))
      0 lst in
  let rec tolist arr i = if i = Array.length arr then []
      else (arr.(i),arr.(i+1))::(tolist arr (i+2)) in
  let tranc = .<fun x -> .~(gen_ac dir (n * 2) .<x>.)>. in
  let tran = (.!tranc) (arr) in
  tolist tran 0

(* Tests *)
let () = assert ((7,0.0) = inf_norm
                 (List.map (fun x -> 4.0 *. x) (unpair (impulse 4 1 )))
		 (unpair (test_ac Inverse (test_ac Forward (impulse 4 1)))
			  ))

let () = assert ((10, 5.11946041115152184e-12) = inf_norm
                 (List.map (fun x -> 8.0 *. x) (unpair (impulse 8 1 )))
		 (unpair (test_ac Inverse (test_ac Forward (impulse 8 1)))
			  ))

let () = assert ((2, 6.30464569439936895e-11) = inf_norm
                 (List.map (fun x -> 16.0 *. x) (unpair (impulse 16 1 )))
		 (unpair (test_ac Inverse (test_ac Forward (impulse 16 1)))
			  ))

(* Sample of the genereated code:
 .<fun x -> .~(gen_ac Forward  (16 * 2) .<x>.)>.;;
.<fun (x_3856) ->
   let (t_3857) = (x_3856).(0) in
   let (t_3858) = (x_3856).(1) in
   let (t_3859) = (x_3856).(2) in
   let (t_3860) = (x_3856).(3) in
   let (t_3861) = (x_3856).(4) in
   let (t_3862) = (x_3856).(5) in
   let (t_3863) = (x_3856).(6) in
   let (t_3864) = (x_3856).(7) in
   let (t_3865) = (x_3856).(8) in
   let (t_3866) = (x_3856).(9) in
   let (t_3867) = (x_3856).(10) in
   let (t_3868) = (x_3856).(11) in
   let (t_3869) = (x_3856).(12) in
   let (t_3870) = (x_3856).(13) in
   let (t_3871) = (x_3856).(14) in
   let (t_3872) = (x_3856).(15) in
   let (t_3873) = (x_3856).(16) in
   let (t_3874) = (x_3856).(17) in
   let (t_3875) = (x_3856).(18) in
   let (t_3876) = (x_3856).(19) in
   let (t_3877) = (x_3856).(20) in
   let (t_3878) = (x_3856).(21) in
   let (t_3879) = (x_3856).(22) in
   let (t_3880) = (x_3856).(23) in
   let (t_3881) = (x_3856).(24) in
   let (t_3882) = (x_3856).(25) in
   let (t_3883) = (x_3856).(26) in
   let (t_3884) = (x_3856).(27) in
   let (t_3885) = (x_3856).(28) in
   let (t_3886) = (x_3856).(29) in
   let (t_3887) = (x_3856).(30) in
   let (t_3888) = (x_3856).(31) in
   let (t_3889) = ((t_3857) +. (t_3873)) in
   let (t_3890) = ((t_3858) +. (t_3874)) in
   let (t_3891) = ((t_3857) -. (t_3873)) in
   let (t_3892) = ((t_3858) -. (t_3874)) in
   let (t_3893) = ((t_3859) +. (t_3875)) in
   let (t_3894) = ((t_3860) +. (t_3876)) in
   let (t_3895) = ((t_3859) -. (t_3875)) in
   let (t_3896) = ((t_3860) -. (t_3876)) in
   let (t_3897) = ((t_3861) +. (t_3877)) in
   let (t_3898) = ((t_3862) +. (t_3878)) in
   let (t_3899) = ((t_3861) -. (t_3877)) in
   let (t_3900) = ((t_3862) -. (t_3878)) in
   let (t_3901) = ((t_3863) +. (t_3879)) in
   let (t_3902) = ((t_3864) +. (t_3880)) in
   let (t_3903) = ((t_3863) -. (t_3879)) in
   let (t_3904) = ((t_3864) -. (t_3880)) in
   let (t_3905) = ((t_3865) +. (t_3881)) in
   let (t_3906) = ((t_3866) +. (t_3882)) in
   let (t_3907) = ((t_3865) -. (t_3881)) in
   let (t_3908) = ((t_3866) -. (t_3882)) in
   let (t_3909) = ((t_3867) +. (t_3883)) in
   let (t_3910) = ((t_3868) +. (t_3884)) in
   let (t_3911) = ((t_3867) -. (t_3883)) in
   let (t_3912) = ((t_3868) -. (t_3884)) in
   let (t_3913) = ((t_3869) +. (t_3885)) in
   let (t_3914) = ((t_3870) +. (t_3886)) in
   let (t_3915) = ((t_3869) -. (t_3885)) in
   let (t_3916) = ((t_3870) -. (t_3886)) in
   let (t_3917) = ((t_3871) +. (t_3887)) in
   let (t_3918) = ((t_3872) +. (t_3888)) in
   let (t_3919) = ((t_3871) -. (t_3887)) in
   let (t_3920) = ((t_3872) -. (t_3888)) in
   let (t_3921) = ((t_3889) +. (t_3905)) in
   let (t_3922) = ((t_3890) +. (t_3906)) in
   let (t_3923) = ((t_3889) -. (t_3905)) in
   let (t_3924) = ((t_3890) -. (t_3906)) in
   let (t_3925) = ((t_3893) +. (t_3909)) in
   let (t_3926) = ((t_3894) +. (t_3910)) in
   let (t_3927) = ((t_3893) -. (t_3909)) in
   let (t_3928) = ((t_3894) -. (t_3910)) in
   let (t_3929) = ((t_3897) +. (t_3913)) in
   let (t_3930) = ((t_3898) +. (t_3914)) in
   let (t_3931) = ((t_3897) -. (t_3913)) in
   let (t_3932) = ((t_3898) -. (t_3914)) in
   let (t_3933) = ((t_3901) +. (t_3917)) in
   let (t_3934) = ((t_3902) +. (t_3918)) in
   let (t_3935) = ((t_3901) -. (t_3917)) in
   let (t_3936) = ((t_3902) -. (t_3918)) in
   let (t_3937) = ((t_3921) +. (t_3929)) in
   let (t_3938) = ((t_3922) +. (t_3930)) in
   let (t_3939) = ((t_3921) -. (t_3929)) in
   let (t_3940) = ((t_3922) -. (t_3930)) in
   let (t_3941) = ((t_3925) +. (t_3933)) in
   let (t_3942) = ((t_3926) +. (t_3934)) in
   let (t_3943) = ((t_3925) -. (t_3933)) in
   let (t_3944) = ((t_3926) -. (t_3934)) in
   let (t_3945) = ((t_3937) +. (t_3941)) in
   let (t_3946) = ((t_3938) +. (t_3942)) in
   let (t_3947) = ((t_3937) -. (t_3941)) in
   let (t_3948) = ((t_3938) -. (t_3942)) in
   let (t_3949) = ((t_3939) -. (t_3944)) in
   let (t_3950) = ((t_3940) +. (t_3943)) in
   let (t_3951) = ((t_3939) +. (t_3944)) in
   let (t_3952) = ((t_3940) -. (t_3943)) in
   let (t_3953) = ((t_3923) -. (t_3932)) in
   let (t_3954) = ((t_3924) +. (t_3931)) in
   let (t_3955) = ((t_3923) +. (t_3932)) in
   let (t_3956) = ((t_3924) -. (t_3931)) in
   let (t_3957) = ((t_3927) -. (t_3936)) in
   let (t_3958) = ((t_3928) +. (t_3935)) in
   let (t_3959) = ((t_3927) +. (t_3936)) in
   let (t_3960) = ((t_3928) -. (t_3935)) in
   let (t_3961) = (0.707106781187 *. ((t_3959) -. (t_3960))) in
   let (t_3962) = (0.707106781187 *. ((t_3959) +. (t_3960))) in
   let (t_3963) = ((t_3955) -. (t_3962)) in
   let (t_3964) = ((t_3956) +. (t_3961)) in
   let (t_3965) = ((t_3955) +. (t_3962)) in
   let (t_3966) = ((t_3956) -. (t_3961)) in
   let (t_3967) = (0.707106781187 *. ((t_3957) -. (t_3958))) in
   let (t_3968) = (0.707106781187 *. ((t_3957) +. (t_3958))) in
   let (t_3969) = ((t_3953) +. (t_3967)) in
   let (t_3970) = ((t_3954) +. (t_3968)) in
   let (t_3971) = ((t_3953) -. (t_3967)) in
   let (t_3972) = ((t_3954) -. (t_3968)) in
   let (t_3973) = ((t_3891) -. (t_3908)) in
   let (t_3974) = ((t_3892) +. (t_3907)) in
   let (t_3975) = ((t_3891) +. (t_3908)) in
   let (t_3976) = ((t_3892) -. (t_3907)) in
   let (t_3977) = ((t_3895) -. (t_3912)) in
   let (t_3978) = ((t_3896) +. (t_3911)) in
   let (t_3979) = ((t_3895) +. (t_3912)) in
   let (t_3980) = ((t_3896) -. (t_3911)) in
   let (t_3981) = ((t_3899) -. (t_3916)) in
   let (t_3982) = ((t_3900) +. (t_3915)) in
   let (t_3983) = ((t_3899) +. (t_3916)) in
   let (t_3984) = ((t_3900) -. (t_3915)) in
   let (t_3985) = ((t_3903) -. (t_3920)) in
   let (t_3986) = ((t_3904) +. (t_3919)) in
   let (t_3987) = ((t_3903) +. (t_3920)) in
   let (t_3988) = ((t_3904) -. (t_3919)) in
   let (t_3989) = (0.707106781187 *. ((t_3983) -. (t_3984))) in
   let (t_3990) = (0.707106781187 *. ((t_3983) +. (t_3984))) in
   let (t_3991) = ((t_3975) -. (t_3990)) in
   let (t_3992) = ((t_3976) +. (t_3989)) in
   let (t_3993) = ((t_3975) +. (t_3990)) in
   let (t_3994) = ((t_3976) -. (t_3989)) in
   let (t_3995) = (1.30656296488 *. ((t_3979) -. (t_3980))) in
   let (t_3996) = (0.382683432365 *. (t_3980)) in
   let (t_3997) = (0.923879532511 *. (t_3979)) in
   let (t_3998) = (((t_3995) +. (t_3996)) -. (t_3997)) in
   let (t_3999) = ((t_3996) +. (t_3997)) in
   let (t_4000) = (1.30656296488 *. ((t_3987) -. (t_3988))) in
   let (t_4001) = (0.923879532511 *. (t_3988)) in
   let (t_4002) = (0.382683432365 *. (t_3987)) in
   let (t_4003) = (((t_4000) +. (t_4001)) -. (t_4002)) in
   let (t_4004) = ((t_4001) +. (t_4002)) in
   let (t_4005) = ((t_3998) +. (t_4003)) in
   let (t_4006) = ((t_3999) +. (t_4004)) in
   let (t_4007) = ((t_3998) -. (t_4003)) in
   let (t_4008) = ((t_3999) -. (t_4004)) in
   let (t_4009) = ((t_3993) -. (t_4006)) in
   let (t_4010) = ((t_3994) +. (t_4005)) in
   let (t_4011) = ((t_3993) +. (t_4006)) in
   let (t_4012) = ((t_3994) -. (t_4005)) in
   let (t_4013) = ((t_3991) +. (t_4007)) in
   let (t_4014) = ((t_3992) +. (t_4008)) in
   let (t_4015) = ((t_3991) -. (t_4007)) in
   let (t_4016) = ((t_3992) -. (t_4008)) in
   let (t_4017) = (0.707106781187 *. ((t_3981) -. (t_3982))) in
   let (t_4018) = (0.707106781187 *. ((t_3981) +. (t_3982))) in
   let (t_4019) = ((t_3973) +. (t_4017)) in
   let (t_4020) = ((t_3974) +. (t_4018)) in
   let (t_4021) = ((t_3973) -. (t_4017)) in
   let (t_4022) = ((t_3974) -. (t_4018)) in
   let (t_4023) = (1.30656296488 *. ((t_3977) -. (t_3978))) in
   let (t_4024) = (0.923879532511 *. (t_3978)) in
   let (t_4025) = (0.382683432365 *. (t_3977)) in
   let (t_4026) = (((t_4023) +. (t_4024)) -. (t_4025)) in
   let (t_4027) = ((t_4024) +. (t_4025)) in
   let (t_4028) = (1.30656296488 *. ((t_3985) -. (t_3986))) in
   let (t_4029) = (0.382683432365 *. (t_3986)) in
   let (t_4030) = (0.923879532511 *. (t_3985)) in
   let (t_4031) = (((t_4028) +. (t_4029)) -. (t_4030)) in
   let (t_4032) = ((t_4029) +. (t_4030)) in
   let (t_4033) = ((t_4026) +. (t_4031)) in
   let (t_4034) = ((t_4027) +. (t_4032)) in
   let (t_4035) = ((t_4026) -. (t_4031)) in
   let (t_4036) = ((t_4027) -. (t_4032)) in
   let (t_4037) = ((t_4021) -. (t_4036)) in
   let (t_4038) = ((t_4022) +. (t_4035)) in
   let (t_4039) = ((t_4021) +. (t_4036)) in
   let (t_4040) = ((t_4022) -. (t_4035)) in
   let (t_4041) = ((t_4019) +. (t_4033)) in
   let (t_4042) = ((t_4020) +. (t_4034)) in
   let (t_4043) = ((t_4019) -. (t_4033)) in
   let (t_4044) = ((t_4020) -. (t_4034)) in
   (x_3856).(0) <- (t_3945);
   (x_3856).(1) <- (t_3946);
   (x_3856).(2) <- (t_4011);
   (x_3856).(3) <- (t_4012);
   (x_3856).(4) <- (t_3965);
   (x_3856).(5) <- (t_3966);
   (x_3856).(6) <- (t_4039);
   (x_3856).(7) <- (t_4040);
   (x_3856).(8) <- (t_3951);
   (x_3856).(9) <- (t_3952);
   (x_3856).(10) <- (t_4015);
   (x_3856).(11) <- (t_4016);
   (x_3856).(12) <- (t_3971);
   (x_3856).(13) <- (t_3972);
   (x_3856).(14) <- (t_4043);
   (x_3856).(15) <- (t_4044);
   (x_3856).(16) <- (t_3947);
   (x_3856).(17) <- (t_3948);
   (x_3856).(18) <- (t_4009);
   (x_3856).(19) <- (t_4010);
   (x_3856).(20) <- (t_3963);
   (x_3856).(21) <- (t_3964);
   (x_3856).(22) <- (t_4037);
   (x_3856).(23) <- (t_4038);
   (x_3856).(24) <- (t_3949);
   (x_3856).(25) <- (t_3950);
   (x_3856).(26) <- (t_4013);
   (x_3856).(27) <- (t_4014);
   (x_3856).(28) <- (t_3969);
   (x_3856).(29) <- (t_3970);
   (x_3856).(30) <- (t_4041);
   (x_3856).(31) <- (t_4042);
   (x_3856)>.
*)

(* Generating code files.
let path = "/local/rap/oleg/fft-2/fft-output/ac" in
let gencode n = 
  let nr = 2*n in
  let code = .<fun x -> .~(gen_ac Forward  nr .<x>.); 0>. in
  let code_str = Trx.C.toC code in
  let chan = open_out (path ^ (string_of_int n) ^ ".c") in
  let () = output_string chan code_str in
  close_out chan
in
List.map gencode [4;8;16;32;64;128;256]
*)


(* ----------------------------------------------------------------------- *)
(* Using complex multiplication with three multiplies *)
(* A more optimal way! *)
(* if x = a + ib and y = c + id  *)
(* we compute t1 = a*(c+d) *)
(*            t2 = d*(b+a) *)
(*            t3 = c*(b-a) *)
(* so that                     *)
(*   Re(x*y) = t1 - t2    *)
(*   Im(x*y) = t1 + t3    *)
(* 3 multipllications and 5 additions *)
(* Note that if 'x' is a known factor, we can pre-compute
   a+b and a-b at compile time, so at run time we'll have only
   three multiplies and three additions *)

(* This is a semi-concretization function that forces
   the multiplication up to trivial factors, +/- 1, +/- I *)
let force_mult_ad (Code (f,rx,ix)) =
  let (newf,(j,n)) = rotate_to_QI f in
  let theta = float_of_int (2*j) *. pi /. float_of_int n in
  let c = cos theta in
  let s = sin theta in
  let spc = s +. c in
  let smc = s -. c in
  let () = assert (c > 0.0 && s > 0.0 )
  in
  bind (retN .<c *. (.~rx +. .~ix)>.) (fun t1 ->
  bind (retN .<spc *. .~ix>.) (fun t2 ->
    if smc > 0.0 then
      bind (retN .<smc *. .~rx>.) (fun t3 ->
      bind (retN .<.~t1 -. .~t2>.) (fun rcs ->
      bind (retN .<.~t1 +. .~t3>.) (fun ics ->
      ret (Code (newf,rcs,ics)))))
    else
      let cms = c -. s in (* keep all the constants positive *)
      bind (retN .<cms *. .~rx>.) (fun nt3 ->
      bind (retN .<.~t1 -. .~t2>.) (fun rcs ->
      bind (retN .<.~t1 -. .~nt3>.) (fun ics ->
      ret (Code (newf,rcs,ics)))))
	))
 

(* Perform (rx,ix) +/- _complex_fy*(ry,iy) and set the resulting
   factor to fx
   Return a pair: fx* (x + complex_fy*y), fx*(x - complex_fy*y)
   This is where many simplifications happen
*)
let rec do_linear_ad fx rx ix ((j,n) as fy) ry iy =
   if 2*j >= n then (* fy in III and IV quadrants: reflect *)
            bind (do_linear_ad fx rx ix (sub_rat fy (1,2)) ry iy)
            (fun (p,m) -> retS (m,p)) else
   match fy with (* x + y *)
   (0,1) -> bind (retN .<.~rx +. .~ry>.) (fun re ->
            bind (retN .<.~ix +. .~iy>.) (fun im ->
            bind (retN .<.~rx -. .~ry>.) (fun re1 ->
            bind (retN .<.~ix -. .~iy>.) (fun im1 ->
            ret (Code (fx,re,im),Code (fx,re1,im1))))))
  |(1,4) -> (* x + I*y *)
            bind (retN .<.~rx -. .~iy>.) (fun re ->
            bind (retN .<.~ix +. .~ry>.) (fun im ->
            bind (retN .<.~rx +. .~iy>.) (fun re1 ->
            bind (retN .<.~ix -. .~ry>.) (fun im1 ->
            ret (Code (fx,re,im),Code (fx,re1,im1) )))))
  |(j,8) -> (* x + (cos pi/4 + I*sin pi/4)*y *)
            let cs = sin (pi /. 4.0) in
            bind (retN .<cs *. (.~ry -. .~iy)>.) (fun rcs ->
            bind (retN .<cs *. (.~ry +. .~iy)>.) (fun ics ->
            do_linear_ad fx rx ix (rationalize (j-1,8)) rcs ics))
  |(j,n) -> bind (force_mult_ad (Code (fy,ry,iy))) (fun (Code (fy,ry,iy)) ->
	    do_linear_ad fx rx ix fy ry iy)

(* This one is monadic: it may need to let-bind something
   Note: the add_sub_ad function is not symmetric!
   We need to compute fx*<code> +/- fy*<code>
   There are two ways to proceed:
    (1) let e1 = fx*<code> and e2 = fy*<code> in e1 +/- e2
    (2) fx*( <code> +/- (fy/fx)*<code> )
   The second way is better when either the multiplication by
   fx is trivial or by (fy/fx) is trivial
   Otherwise, the first way is better
   The second method is performed by do_linear
*)
let rec add_sub_ad ((Code (fx,rx,ix)) as x) ((Code (fy,ry,iy)) as y) =
   let fyx = (sub_rat fy fx) in
   if (trivial_mult fx) || (trivial_mult fyx) then
      do_linear_ad fx rx ix fyx ry iy
   else 
   bind (force_mult_ad x) (fun x ->
   bind (force_mult_ad y) (fun y ->
   add_sub_ad x y))

(* Staged Monadic split *)
let split_ad dir l =
(*
  let () = print_string "\nBefore split: ";
    List.iter (fun (Code (fx,_,_)) ->Printf.printf "%s " (str_rat fx))
      l in
*)
  let n = List.length l in
  let rec comb j = function
      (x::xs, y::ys) ->
        bind (add_sub_ad x y) (fun (xpy,xmy) ->
	bind (comb (j+1) (xs,ys)) (fun (a,b) ->
        retS (xpy::a, (mult_aa xmy (w_aa dir n j))::b)))
    | _ -> retS ([],[]) in
  comb 0 (cleave_list l)

let fft_ad dir f l =
  if List.length l = 1 then ret l
  else
    bind (split_ad dir l) (fun (p0,p1) ->
    bind (f p0) (fun y0 ->
    bind (f p1) (fun y1 ->
    retS (merge_u (y0,y1)))))

let gen_ad dir size nums = 
  run (y_sm (fft_ad dir))
    ((fun p -> (bind (liftcM retN p) 
		  (fun (x,y) -> retS (Code (rat_one, x, y))))),
     (* here we assert c is a simple piece of code!*)
     (* And it is because the last stage of *)
     (* FFT is only a+b and a-b *)
     fun (Code ((0,1),x,y)) -> (x,y))
    size nums

(* To see the code
.<fun x -> .~(gen_ad Forward  (4 * 2) .<x>.)>.;;
*)

let test_ad dir lst =
  let n = List.length lst in
  let arr = Array.make (n * 2)  0.0 in
  let _ = List.fold_left (fun n (a,b) -> (arr.(n) <- a; arr.(n+1) <-b; n+2))
      0 lst in
  let rec tolist arr i = if i = Array.length arr then []
      else (arr.(i),arr.(i+1))::(tolist arr (i+2)) in
  let tranc = .<fun x -> .~(gen_ad dir (n * 2) .<x>.)>. in
  let tran = (.!tranc) (arr) in
  tolist tran 0

(* Tests *)
let () = assert ((7,0.0) = inf_norm
                 (List.map (fun x -> 4.0 *. x) (unpair (impulse 4 1 )))
		 (unpair (test_ad Inverse (test_ad Forward (impulse 4 1)))
			  ))

let () = assert ((10, 5.11946041115152184e-12) = inf_norm
                 (List.map (fun x -> 8.0 *. x) (unpair (impulse 8 1 )))
		 (unpair (test_ad Inverse (test_ad Forward (impulse 8 1)))
			  ))

let () = assert ((2, 2.12345696581905941e-11) = inf_norm
                 (List.map (fun x -> 16.0 *. x) (unpair (impulse 16 1 )))
		 (unpair (test_ad Inverse (test_ad Forward (impulse 16 1)))
			  ))

(* Sample of the genereated code:
 .<fun x -> .~(gen_ad Forward  (16 * 2) .<x>.)>.;;
.<fun (x_20462) ->
   let (t_20463) = (x_20462).(0) in
   let (t_20464) = (x_20462).(1) in
   let (t_20465) = (x_20462).(2) in
   let (t_20466) = (x_20462).(3) in
   let (t_20467) = (x_20462).(4) in
   let (t_20468) = (x_20462).(5) in
   let (t_20469) = (x_20462).(6) in
   let (t_20470) = (x_20462).(7) in
   let (t_20471) = (x_20462).(8) in
   let (t_20472) = (x_20462).(9) in
   let (t_20473) = (x_20462).(10) in
   let (t_20474) = (x_20462).(11) in
   let (t_20475) = (x_20462).(12) in
   let (t_20476) = (x_20462).(13) in
   let (t_20477) = (x_20462).(14) in
   let (t_20478) = (x_20462).(15) in
   let (t_20479) = (x_20462).(16) in
   let (t_20480) = (x_20462).(17) in
   let (t_20481) = (x_20462).(18) in
   let (t_20482) = (x_20462).(19) in
   let (t_20483) = (x_20462).(20) in
   let (t_20484) = (x_20462).(21) in
   let (t_20485) = (x_20462).(22) in
   let (t_20486) = (x_20462).(23) in
   let (t_20487) = (x_20462).(24) in
   let (t_20488) = (x_20462).(25) in
   let (t_20489) = (x_20462).(26) in
   let (t_20490) = (x_20462).(27) in
   let (t_20491) = (x_20462).(28) in
   let (t_20492) = (x_20462).(29) in
   let (t_20493) = (x_20462).(30) in
   let (t_20494) = (x_20462).(31) in
   let (t_20495) = ((t_20463) +. (t_20479)) in
   let (t_20496) = ((t_20464) +. (t_20480)) in
   let (t_20497) = ((t_20463) -. (t_20479)) in
   let (t_20498) = ((t_20464) -. (t_20480)) in
   let (t_20499) = ((t_20465) +. (t_20481)) in
   let (t_20500) = ((t_20466) +. (t_20482)) in
   let (t_20501) = ((t_20465) -. (t_20481)) in
   let (t_20502) = ((t_20466) -. (t_20482)) in
   let (t_20503) = ((t_20467) +. (t_20483)) in
   let (t_20504) = ((t_20468) +. (t_20484)) in
   let (t_20505) = ((t_20467) -. (t_20483)) in
   let (t_20506) = ((t_20468) -. (t_20484)) in
   let (t_20507) = ((t_20469) +. (t_20485)) in
   let (t_20508) = ((t_20470) +. (t_20486)) in
   let (t_20509) = ((t_20469) -. (t_20485)) in
   let (t_20510) = ((t_20470) -. (t_20486)) in
   let (t_20511) = ((t_20471) +. (t_20487)) in
   let (t_20512) = ((t_20472) +. (t_20488)) in
   let (t_20513) = ((t_20471) -. (t_20487)) in
   let (t_20514) = ((t_20472) -. (t_20488)) in
   let (t_20515) = ((t_20473) +. (t_20489)) in
   let (t_20516) = ((t_20474) +. (t_20490)) in
   let (t_20517) = ((t_20473) -. (t_20489)) in
   let (t_20518) = ((t_20474) -. (t_20490)) in
   let (t_20519) = ((t_20475) +. (t_20491)) in
   let (t_20520) = ((t_20476) +. (t_20492)) in
   let (t_20521) = ((t_20475) -. (t_20491)) in
   let (t_20522) = ((t_20476) -. (t_20492)) in
   let (t_20523) = ((t_20477) +. (t_20493)) in
   let (t_20524) = ((t_20478) +. (t_20494)) in
   let (t_20525) = ((t_20477) -. (t_20493)) in
   let (t_20526) = ((t_20478) -. (t_20494)) in
   let (t_20527) = ((t_20495) +. (t_20511)) in
   let (t_20528) = ((t_20496) +. (t_20512)) in
   let (t_20529) = ((t_20495) -. (t_20511)) in
   let (t_20530) = ((t_20496) -. (t_20512)) in
   let (t_20531) = ((t_20499) +. (t_20515)) in
   let (t_20532) = ((t_20500) +. (t_20516)) in
   let (t_20533) = ((t_20499) -. (t_20515)) in
   let (t_20534) = ((t_20500) -. (t_20516)) in
   let (t_20535) = ((t_20503) +. (t_20519)) in
   let (t_20536) = ((t_20504) +. (t_20520)) in
   let (t_20537) = ((t_20503) -. (t_20519)) in
   let (t_20538) = ((t_20504) -. (t_20520)) in
   let (t_20539) = ((t_20507) +. (t_20523)) in
   let (t_20540) = ((t_20508) +. (t_20524)) in
   let (t_20541) = ((t_20507) -. (t_20523)) in
   let (t_20542) = ((t_20508) -. (t_20524)) in
   let (t_20543) = ((t_20527) +. (t_20535)) in
   let (t_20544) = ((t_20528) +. (t_20536)) in
   let (t_20545) = ((t_20527) -. (t_20535)) in
   let (t_20546) = ((t_20528) -. (t_20536)) in
   let (t_20547) = ((t_20531) +. (t_20539)) in
   let (t_20548) = ((t_20532) +. (t_20540)) in
   let (t_20549) = ((t_20531) -. (t_20539)) in
   let (t_20550) = ((t_20532) -. (t_20540)) in
   let (t_20551) = ((t_20543) +. (t_20547)) in
   let (t_20552) = ((t_20544) +. (t_20548)) in
   let (t_20553) = ((t_20543) -. (t_20547)) in
   let (t_20554) = ((t_20544) -. (t_20548)) in
   let (t_20555) = ((t_20545) -. (t_20550)) in
   let (t_20556) = ((t_20546) +. (t_20549)) in
   let (t_20557) = ((t_20545) +. (t_20550)) in
   let (t_20558) = ((t_20546) -. (t_20549)) in
   let (t_20559) = ((t_20529) -. (t_20538)) in
   let (t_20560) = ((t_20530) +. (t_20537)) in
   let (t_20561) = ((t_20529) +. (t_20538)) in
   let (t_20562) = ((t_20530) -. (t_20537)) in
   let (t_20563) = ((t_20533) -. (t_20542)) in
   let (t_20564) = ((t_20534) +. (t_20541)) in
   let (t_20565) = ((t_20533) +. (t_20542)) in
   let (t_20566) = ((t_20534) -. (t_20541)) in
   let (t_20567) = (0.707106781187 *. ((t_20565) -. (t_20566))) in
   let (t_20568) = (0.707106781187 *. ((t_20565) +. (t_20566))) in
   let (t_20569) = ((t_20561) -. (t_20568)) in
   let (t_20570) = ((t_20562) +. (t_20567)) in
   let (t_20571) = ((t_20561) +. (t_20568)) in
   let (t_20572) = ((t_20562) -. (t_20567)) in
   let (t_20573) = (0.707106781187 *. ((t_20563) -. (t_20564))) in
   let (t_20574) = (0.707106781187 *. ((t_20563) +. (t_20564))) in
   let (t_20575) = ((t_20559) +. (t_20573)) in
   let (t_20576) = ((t_20560) +. (t_20574)) in
   let (t_20577) = ((t_20559) -. (t_20573)) in
   let (t_20578) = ((t_20560) -. (t_20574)) in
   let (t_20579) = ((t_20497) -. (t_20514)) in
   let (t_20580) = ((t_20498) +. (t_20513)) in
   let (t_20581) = ((t_20497) +. (t_20514)) in
   let (t_20582) = ((t_20498) -. (t_20513)) in
   let (t_20583) = ((t_20501) -. (t_20518)) in
   let (t_20584) = ((t_20502) +. (t_20517)) in
   let (t_20585) = ((t_20501) +. (t_20518)) in
   let (t_20586) = ((t_20502) -. (t_20517)) in
   let (t_20587) = ((t_20505) -. (t_20522)) in
   let (t_20588) = ((t_20506) +. (t_20521)) in
   let (t_20589) = ((t_20505) +. (t_20522)) in
   let (t_20590) = ((t_20506) -. (t_20521)) in
   let (t_20591) = ((t_20509) -. (t_20526)) in
   let (t_20592) = ((t_20510) +. (t_20525)) in
   let (t_20593) = ((t_20509) +. (t_20526)) in
   let (t_20594) = ((t_20510) -. (t_20525)) in
   let (t_20595) = (0.707106781187 *. ((t_20589) -. (t_20590))) in
   let (t_20596) = (0.707106781187 *. ((t_20589) +. (t_20590))) in
   let (t_20597) = ((t_20581) -. (t_20596)) in
   let (t_20598) = ((t_20582) +. (t_20595)) in
   let (t_20599) = ((t_20581) +. (t_20596)) in
   let (t_20600) = ((t_20582) -. (t_20595)) in
   let (t_20601) = (0.382683432365 *. ((t_20585) +. (t_20586))) in
   let (t_20602) = (1.30656296488 *. (t_20586)) in
   let (t_20603) = (0.541196100146 *. (t_20585)) in
   let (t_20604) = ((t_20601) -. (t_20602)) in
   let (t_20605) = ((t_20601) +. (t_20603)) in
   let (t_20606) = (0.923879532511 *. ((t_20593) +. (t_20594))) in
   let (t_20607) = (1.30656296488 *. (t_20594)) in
   let (t_20608) = (0.541196100146 *. (t_20593)) in
   let (t_20609) = ((t_20606) -. (t_20607)) in
   let (t_20610) = ((t_20606) -. (t_20608)) in
   let (t_20611) = ((t_20604) +. (t_20609)) in
   let (t_20612) = ((t_20605) +. (t_20610)) in
   let (t_20613) = ((t_20604) -. (t_20609)) in
   let (t_20614) = ((t_20605) -. (t_20610)) in
   let (t_20615) = ((t_20599) -. (t_20612)) in
   let (t_20616) = ((t_20600) +. (t_20611)) in
   let (t_20617) = ((t_20599) +. (t_20612)) in
   let (t_20618) = ((t_20600) -. (t_20611)) in
   let (t_20619) = ((t_20597) +. (t_20613)) in
   let (t_20620) = ((t_20598) +. (t_20614)) in
   let (t_20621) = ((t_20597) -. (t_20613)) in
   let (t_20622) = ((t_20598) -. (t_20614)) in
   let (t_20623) = (0.707106781187 *. ((t_20587) -. (t_20588))) in
   let (t_20624) = (0.707106781187 *. ((t_20587) +. (t_20588))) in
   let (t_20625) = ((t_20579) +. (t_20623)) in
   let (t_20626) = ((t_20580) +. (t_20624)) in
   let (t_20627) = ((t_20579) -. (t_20623)) in
   let (t_20628) = ((t_20580) -. (t_20624)) in
   let (t_20629) = (0.923879532511 *. ((t_20583) +. (t_20584))) in
   let (t_20630) = (1.30656296488 *. (t_20584)) in
   let (t_20631) = (0.541196100146 *. (t_20583)) in
   let (t_20632) = ((t_20629) -. (t_20630)) in
   let (t_20633) = ((t_20629) -. (t_20631)) in
   let (t_20634) = (0.382683432365 *. ((t_20591) +. (t_20592))) in
   let (t_20635) = (1.30656296488 *. (t_20592)) in
   let (t_20636) = (0.541196100146 *. (t_20591)) in
   let (t_20637) = ((t_20634) -. (t_20635)) in
   let (t_20638) = ((t_20634) +. (t_20636)) in
   let (t_20639) = ((t_20632) +. (t_20637)) in
   let (t_20640) = ((t_20633) +. (t_20638)) in
   let (t_20641) = ((t_20632) -. (t_20637)) in
   let (t_20642) = ((t_20633) -. (t_20638)) in
   let (t_20643) = ((t_20627) -. (t_20642)) in
   let (t_20644) = ((t_20628) +. (t_20641)) in
   let (t_20645) = ((t_20627) +. (t_20642)) in
   let (t_20646) = ((t_20628) -. (t_20641)) in
   let (t_20647) = ((t_20625) +. (t_20639)) in
   let (t_20648) = ((t_20626) +. (t_20640)) in
   let (t_20649) = ((t_20625) -. (t_20639)) in
   let (t_20650) = ((t_20626) -. (t_20640)) in
   (x_20462).(0) <- (t_20551);
   (x_20462).(1) <- (t_20552);
   (x_20462).(2) <- (t_20617);
   (x_20462).(3) <- (t_20618);
   (x_20462).(4) <- (t_20571);
   (x_20462).(5) <- (t_20572);
   (x_20462).(6) <- (t_20645);
   (x_20462).(7) <- (t_20646);
   (x_20462).(8) <- (t_20557);
   (x_20462).(9) <- (t_20558);
   (x_20462).(10) <- (t_20621);
   (x_20462).(11) <- (t_20622);
   (x_20462).(12) <- (t_20577);
   (x_20462).(13) <- (t_20578);
   (x_20462).(14) <- (t_20649);
   (x_20462).(15) <- (t_20650);
   (x_20462).(16) <- (t_20553);
   (x_20462).(17) <- (t_20554);
   (x_20462).(18) <- (t_20615);
   (x_20462).(19) <- (t_20616);
   (x_20462).(20) <- (t_20569);
   (x_20462).(21) <- (t_20570);
   (x_20462).(22) <- (t_20643);
   (x_20462).(23) <- (t_20644);
   (x_20462).(24) <- (t_20555);
   (x_20462).(25) <- (t_20556);
   (x_20462).(26) <- (t_20619);
   (x_20462).(27) <- (t_20620);
   (x_20462).(28) <- (t_20575);
   (x_20462).(29) <- (t_20576);
   (x_20462).(30) <- (t_20647);
   (x_20462).(31) <- (t_20648);
   (x_20462)>.

*)

(* Generating code files.
let path = "/local/rap/oleg/fft-2/fft-output/ad" in
let gencode n = 
  let nr = 2*n in
  let code = .<fun x -> .~(gen_ad Forward  nr .<x>.); 0>. in
  let code_str = Trx.C.toC code in
  let chan = open_out (path ^ (string_of_int n) ^ ".c") in
  let () = output_string chan code_str in
  close_out chan
in
List.map gencode [4;8;16;32;64;128;256]
*)