Logo Search packages:      
Sourcecode: blender version File versions  Download package

q1fv_4.c

/*
 * Copyright (c) 2003, 2006 Matteo Frigo
 * Copyright (c) 2003, 2006 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */

/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Sat Jul  1 22:36:34 EDT 2006 */

#include "codelet-dft.h"

#ifdef HAVE_FMA

/* Generated by: ../../../genfft/gen_twidsq_c -fma -reorder-insns -schedule-for-pipeline -simd -compact -variables 4 -pipeline-latency 8 -n 4 -dif -name q1fv_4 -include q1f.h */

/*
 * This function contains 44 FP additions, 32 FP multiplications,
 * (or, 36 additions, 24 multiplications, 8 fused multiply/add),
 * 38 stack variables, and 32 memory accesses
 */
/*
 * Generator Id's : 
 * $Id: algsimp.ml,v 1.9 2006-02-12 23:34:12 athena Exp $
 * $Id: fft.ml,v 1.4 2006-01-05 03:04:27 stevenj Exp $
 * $Id: gen_twidsq_c.ml,v 1.8 2006-02-12 23:34:12 athena Exp $
 */

#include "q1f.h"

/*
 * q1fv_4, HAVE_FMA variant (emitted by gen_twidsq_c with -n 4 -dif and
 * instruction scheduling enabled; the statement order below is the
 * generator's schedule and is intentionally interleaved).
 *
 * Operates in place on the buffer that starts at ri; ii is never
 * referenced in this body.  Each loop pass:
 *   - loads the 16 vectors x[WS(vs, r) + WS(is, c)], r, c = 0..3;
 *   - for every r forms the sums/differences of columns (0, 2) and (1, 3)
 *     and combines them into four outputs;
 *   - stores output 0 unmodified and passes outputs 1, 2 and 3 through
 *     BYTWJ with the twiddles at W[0], W[TWVL * 2] and W[TWVL * 4];
 *   - writes output k of row r to x[WS(vs, k) + WS(is, r)], i.e. to the
 *     transposed slot of the 4x4 block.
 *
 * The loop advances i by VL, x by VL * dist and W by TWVL * 6 until i
 * reaches m.  The return value is W after those advances.
 *
 * NOTE(review): VFNMSI(b, a) and VFMAI(b, a) appear to be the fused forms
 * of "a - i*b" and "a + i*b": the non-FMA variant of this file computes
 * VSUB(a, VBYI(b)) and VADD(a, VBYI(b)) in the same positions.  Confirm
 * against the SIMD header.
 * NOTE(review): the last argument of LD/ST looks like an alignment
 * reference pointer (columns 0/2 and 1/3 of a row share one) -- confirm.
 */
static const R *q1fv_4(R *ri, R *ii, const R *W, stride is, stride vs, INT m, INT dist)
{
     INT i;
     R *x;
     x = ri;
     for (i = 0; i < m; i = i + VL, x = x + (VL * dist), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(is), MAKE_VOLATILE_STRIDE(vs)) {
        /* Tb, Tm, Tx, TI: the W[TWVL * 2] outputs of rows 0..3; they are
           stored last, after the nested blocks close. */
        V Tb, Tm, Tx, TI;
        {
             V Tc, T9, T3, TG, TA, TH, TD, Ta, T6, Td, Tn, To, Tq, Tr, Tf;
             V Tg;
             {
                /* All 16 loads happen inside this block, before any store:
                   inputs and outputs occupy the same 16 slots. */
                V T1, T2, Ty, Tz, TB, TC, T4, T5;
                T1 = LD(&(x[0]), dist, &(x[0]));
                T2 = LD(&(x[WS(is, 2)]), dist, &(x[0]));
                Ty = LD(&(x[WS(vs, 3)]), dist, &(x[WS(vs, 3)]));
                Tz = LD(&(x[WS(vs, 3) + WS(is, 2)]), dist, &(x[WS(vs, 3)]));
                TB = LD(&(x[WS(vs, 3) + WS(is, 1)]), dist, &(x[WS(vs, 3) + WS(is, 1)]));
                TC = LD(&(x[WS(vs, 3) + WS(is, 3)]), dist, &(x[WS(vs, 3) + WS(is, 1)]));
                T4 = LD(&(x[WS(is, 1)]), dist, &(x[WS(is, 1)]));
                T5 = LD(&(x[WS(is, 3)]), dist, &(x[WS(is, 1)]));
                Tc = LD(&(x[WS(vs, 1)]), dist, &(x[WS(vs, 1)]));
                /* Rows 0 and 3: sum and difference of columns (0, 2) and (1, 3). */
                T9 = VADD(T1, T2);
                T3 = VSUB(T1, T2);
                TG = VADD(Ty, Tz);
                TA = VSUB(Ty, Tz);
                TH = VADD(TB, TC);
                TD = VSUB(TB, TC);
                Ta = VADD(T4, T5);
                T6 = VSUB(T4, T5);
                Td = LD(&(x[WS(vs, 1) + WS(is, 2)]), dist, &(x[WS(vs, 1)]));
                Tn = LD(&(x[WS(vs, 2)]), dist, &(x[WS(vs, 2)]));
                To = LD(&(x[WS(vs, 2) + WS(is, 2)]), dist, &(x[WS(vs, 2)]));
                Tq = LD(&(x[WS(vs, 2) + WS(is, 1)]), dist, &(x[WS(vs, 2) + WS(is, 1)]));
                Tr = LD(&(x[WS(vs, 2) + WS(is, 3)]), dist, &(x[WS(vs, 2) + WS(is, 1)]));
                Tf = LD(&(x[WS(vs, 1) + WS(is, 1)]), dist, &(x[WS(vs, 1) + WS(is, 1)]));
                Tg = LD(&(x[WS(vs, 1) + WS(is, 3)]), dist, &(x[WS(vs, 1) + WS(is, 1)]));
             }
             {
                V Tk, Te, Tv, Tp, Tw, Ts, Tl, Th, T7, TE, Tu, TF;
                /* Output 0 of row 0 (no twiddle) goes back to x[0]. */
                ST(&(x[0]), VADD(T9, Ta), dist, &(x[0]));
                /* Rows 1 and 2: sum and difference of columns (0, 2) and (1, 3). */
                Tk = VADD(Tc, Td);
                Te = VSUB(Tc, Td);
                Tv = VADD(Tn, To);
                Tp = VSUB(Tn, To);
                Tw = VADD(Tq, Tr);
                Ts = VSUB(Tq, Tr);
                Tl = VADD(Tf, Tg);
                Th = VSUB(Tf, Tg);
                /* Output 0 of row 3 lands in column 3 of row 0. */
                ST(&(x[WS(is, 3)]), VADD(TG, TH), dist, &(x[WS(is, 1)]));
                /* W[0] outputs of rows 0 and 3. */
                T7 = BYTWJ(&(W[0]), VFNMSI(T6, T3));
                TE = BYTWJ(&(W[0]), VFNMSI(TD, TA));
                {
                   V Tt, Ti, Tj, T8;
                   /* W[TWVL * 4] outputs are stored under WS(vs, 3),
                      W[0] outputs under WS(vs, 1); the column index is the
                      source row. */
                   T8 = BYTWJ(&(W[TWVL * 4]), VFMAI(T6, T3));
                   ST(&(x[WS(is, 2)]), VADD(Tv, Tw), dist, &(x[0]));
                   Tt = BYTWJ(&(W[0]), VFNMSI(Ts, Tp));
                   ST(&(x[WS(is, 1)]), VADD(Tk, Tl), dist, &(x[WS(is, 1)]));
                   Ti = BYTWJ(&(W[0]), VFNMSI(Th, Te));
                   Tj = BYTWJ(&(W[TWVL * 4]), VFMAI(Th, Te));
                   ST(&(x[WS(vs, 1)]), T7, dist, &(x[WS(vs, 1)]));
                   ST(&(x[WS(vs, 1) + WS(is, 3)]), TE, dist, &(x[WS(vs, 1) + WS(is, 1)]));
                   ST(&(x[WS(vs, 3)]), T8, dist, &(x[WS(vs, 3)]));
                   Tu = BYTWJ(&(W[TWVL * 4]), VFMAI(Ts, Tp));
                   ST(&(x[WS(vs, 1) + WS(is, 2)]), Tt, dist, &(x[WS(vs, 1)]));
                   TF = BYTWJ(&(W[TWVL * 4]), VFMAI(TD, TA));
                   ST(&(x[WS(vs, 1) + WS(is, 1)]), Ti, dist, &(x[WS(vs, 1) + WS(is, 1)]));
                   ST(&(x[WS(vs, 3) + WS(is, 1)]), Tj, dist, &(x[WS(vs, 3) + WS(is, 1)]));
                }
                /* W[TWVL * 2] outputs: (column 0 + column 2) minus
                   (column 1 + column 3) for rows 0..3. */
                Tb = BYTWJ(&(W[TWVL * 2]), VSUB(T9, Ta));
                Tm = BYTWJ(&(W[TWVL * 2]), VSUB(Tk, Tl));
                Tx = BYTWJ(&(W[TWVL * 2]), VSUB(Tv, Tw));
                ST(&(x[WS(vs, 3) + WS(is, 2)]), Tu, dist, &(x[WS(vs, 3)]));
                TI = BYTWJ(&(W[TWVL * 2]), VSUB(TG, TH));
                ST(&(x[WS(vs, 3) + WS(is, 3)]), TF, dist, &(x[WS(vs, 3) + WS(is, 1)]));
             }
        }
        /* Remaining stores: the W[TWVL * 2] outputs fill the WS(vs, 2) row. */
        ST(&(x[WS(vs, 2)]), Tb, dist, &(x[WS(vs, 2)]));
        ST(&(x[WS(vs, 2) + WS(is, 1)]), Tm, dist, &(x[WS(vs, 2) + WS(is, 1)]));
        ST(&(x[WS(vs, 2) + WS(is, 2)]), Tx, dist, &(x[WS(vs, 2)]));
        ST(&(x[WS(vs, 2) + WS(is, 3)]), TI, dist, &(x[WS(vs, 2) + WS(is, 1)]));
     }
     /* W has been advanced by TWVL * 6 per loop pass. */
     return W;
}

/*
 * Twiddle instruction table for the FMA build of q1fv_4: three VTW
 * entries followed by a TW_NEXT record carrying VL.
 * NOTE(review): the kernel reads twiddles at W[0], W[TWVL * 2] and
 * W[TWVL * 4] and steps W by TWVL * 6, so each VTW entry presumably
 * accounts for TWVL * 2 values -- confirm against the VTW definition.
 */
static const tw_instr twinstr[] = {
     VTW(1), VTW(2), VTW(3),
     { TW_NEXT, VL, 0 }
};

/*
 * Descriptor handed to the planner together with the FMA q1fv_4 kernel.
 * Field meanings are defined by ct_desc, which is declared elsewhere; the
 * notes below only record what this file itself shows.
 */
static const ct_desc desc = {
     4,                  /* same value as the generator's "-n 4" option */
     "q1fv_4",           /* codelet name */
     twinstr,            /* twiddle instruction table defined above */
     &GENUS,
     { 36, 24, 8, 0 },   /* 36 additions, 24 multiplications, 8 fused multiply/add
                            per the generated operation-count comment; fourth
                            entry not explained in this file */
     0,
     0,
     0
};

/*
 * Entry point for this codelet: passes the q1fv_4 kernel and its
 * descriptor to X(kdft_difsq_register) for the given planner.
 */
void X(codelet_q1fv_4)(planner *p)
{
     X(kdft_difsq_register)(p, q1fv_4, &desc);
}
#else                   /* HAVE_FMA */

/* Generated by: ../../../genfft/gen_twidsq_c -simd -compact -variables 4 -pipeline-latency 8 -n 4 -dif -name q1fv_4 -include q1f.h */

/*
 * This function contains 44 FP additions, 24 FP multiplications,
 * (or, 44 additions, 24 multiplications, 0 fused multiply/add),
 * 22 stack variables, and 32 memory accesses
 */
/*
 * Generator Id's : 
 * $Id: algsimp.ml,v 1.9 2006-02-12 23:34:12 athena Exp $
 * $Id: fft.ml,v 1.4 2006-01-05 03:04:27 stevenj Exp $
 * $Id: gen_twidsq_c.ml,v 1.8 2006-02-12 23:34:12 athena Exp $
 */

#include "q1f.h"

/*
 * q1fv_4, non-FMA variant (emitted by gen_twidsq_c with -n 4 -dif).
 *
 * Works in place on the buffer that starts at ri; ii is not referenced.
 * Each loop pass reads the 16 vectors x[WS(vs, r) + WS(is, c)] for
 * r, c = 0..3, builds four outputs per row r from the sums/differences of
 * columns (0, 2) and (1, 3), and writes output k of row r to
 * x[WS(vs, k) + WS(is, r)], i.e. to the transposed slot.  Output 0 is
 * stored as is; outputs 1, 2 and 3 go through BYTWJ with the twiddles at
 * W[0], W[TWVL * 2] and W[TWVL * 4] respectively.
 *
 * The loop advances i by VL, x by VL * dist and W by TWVL * 6 until i
 * reaches m; the advanced W is returned.
 *
 * Naming: inRC is the input at row R, column C.  For row R, esumR/edifR
 * are the sum/difference of columns 0 and 2, osumR is the sum of columns
 * 1 and 3, and orotR is VBYI applied to their difference.
 * NOTE(review): the last argument of LD/ST looks like an alignment
 * reference pointer -- confirm against the SIMD header.
 */
static const R *q1fv_4(R *ri, R *ii, const R *W, stride is, stride vs, INT m, INT dist)
{
     R *x = ri;
     INT i;

     for (i = 0; i < m; i += VL, x += (VL * dist), W += (TWVL * 6), MAKE_VOLATILE_STRIDE(is), MAKE_VOLATILE_STRIDE(vs)) {
          V in00, in01, in02, in03;
          V in10, in11, in12, in13;
          V in20, in21, in22, in23;
          V in30, in31, in32, in33;
          V esum0, esum1, esum2, esum3;
          V edif0, edif1, edif2, edif3;
          V osum0, osum1, osum2, osum3;
          V orot0, orot1, orot2, orot3;
          V y;

          /* Fetch the whole 4x4 block before anything is written back:
             inputs and outputs share the same 16 slots. */
          in00 = LD(&(x[0]), dist, &(x[0]));
          in02 = LD(&(x[WS(is, 2)]), dist, &(x[0]));
          in30 = LD(&(x[WS(vs, 3)]), dist, &(x[WS(vs, 3)]));
          in32 = LD(&(x[WS(vs, 3) + WS(is, 2)]), dist, &(x[WS(vs, 3)]));
          in31 = LD(&(x[WS(vs, 3) + WS(is, 1)]), dist, &(x[WS(vs, 3) + WS(is, 1)]));
          in33 = LD(&(x[WS(vs, 3) + WS(is, 3)]), dist, &(x[WS(vs, 3) + WS(is, 1)]));
          in01 = LD(&(x[WS(is, 1)]), dist, &(x[WS(is, 1)]));
          in03 = LD(&(x[WS(is, 3)]), dist, &(x[WS(is, 1)]));
          in10 = LD(&(x[WS(vs, 1)]), dist, &(x[WS(vs, 1)]));
          in12 = LD(&(x[WS(vs, 1) + WS(is, 2)]), dist, &(x[WS(vs, 1)]));
          in20 = LD(&(x[WS(vs, 2)]), dist, &(x[WS(vs, 2)]));
          in22 = LD(&(x[WS(vs, 2) + WS(is, 2)]), dist, &(x[WS(vs, 2)]));
          in21 = LD(&(x[WS(vs, 2) + WS(is, 1)]), dist, &(x[WS(vs, 2) + WS(is, 1)]));
          in23 = LD(&(x[WS(vs, 2) + WS(is, 3)]), dist, &(x[WS(vs, 2) + WS(is, 1)]));
          in11 = LD(&(x[WS(vs, 1) + WS(is, 1)]), dist, &(x[WS(vs, 1) + WS(is, 1)]));
          in13 = LD(&(x[WS(vs, 1) + WS(is, 3)]), dist, &(x[WS(vs, 1) + WS(is, 1)]));

          /* Row 0 */
          edif0 = VSUB(in00, in02);
          esum0 = VADD(in00, in02);
          orot0 = VBYI(VSUB(in01, in03));
          osum0 = VADD(in01, in03);

          /* Row 1 */
          edif1 = VSUB(in10, in12);
          esum1 = VADD(in10, in12);
          orot1 = VBYI(VSUB(in11, in13));
          osum1 = VADD(in11, in13);

          /* Row 2 */
          edif2 = VSUB(in20, in22);
          esum2 = VADD(in20, in22);
          orot2 = VBYI(VSUB(in21, in23));
          osum2 = VADD(in21, in23);

          /* Row 3 */
          edif3 = VSUB(in30, in32);
          esum3 = VADD(in30, in32);
          orot3 = VBYI(VSUB(in31, in33));
          osum3 = VADD(in31, in33);

          /* Output 0 of every row: no twiddle, stored along row 0. */
          ST(&(x[0]), VADD(esum0, osum0), dist, &(x[0]));
          ST(&(x[WS(is, 1)]), VADD(esum1, osum1), dist, &(x[WS(is, 1)]));
          ST(&(x[WS(is, 2)]), VADD(esum2, osum2), dist, &(x[0]));
          ST(&(x[WS(is, 3)]), VADD(esum3, osum3), dist, &(x[WS(is, 1)]));

          /* Output 1 of every row: twiddle at W[0], stored along row 1. */
          y = BYTWJ(&(W[0]), VSUB(edif0, orot0));
          ST(&(x[WS(vs, 1)]), y, dist, &(x[WS(vs, 1)]));
          y = BYTWJ(&(W[0]), VSUB(edif1, orot1));
          ST(&(x[WS(vs, 1) + WS(is, 1)]), y, dist, &(x[WS(vs, 1) + WS(is, 1)]));
          y = BYTWJ(&(W[0]), VSUB(edif2, orot2));
          ST(&(x[WS(vs, 1) + WS(is, 2)]), y, dist, &(x[WS(vs, 1)]));
          y = BYTWJ(&(W[0]), VSUB(edif3, orot3));
          ST(&(x[WS(vs, 1) + WS(is, 3)]), y, dist, &(x[WS(vs, 1) + WS(is, 1)]));

          /* Output 3 of every row: twiddle at W[TWVL * 4], stored along row 3. */
          y = BYTWJ(&(W[TWVL * 4]), VADD(edif0, orot0));
          ST(&(x[WS(vs, 3)]), y, dist, &(x[WS(vs, 3)]));
          y = BYTWJ(&(W[TWVL * 4]), VADD(edif1, orot1));
          ST(&(x[WS(vs, 3) + WS(is, 1)]), y, dist, &(x[WS(vs, 3) + WS(is, 1)]));
          y = BYTWJ(&(W[TWVL * 4]), VADD(edif2, orot2));
          ST(&(x[WS(vs, 3) + WS(is, 2)]), y, dist, &(x[WS(vs, 3)]));
          y = BYTWJ(&(W[TWVL * 4]), VADD(edif3, orot3));
          ST(&(x[WS(vs, 3) + WS(is, 3)]), y, dist, &(x[WS(vs, 3) + WS(is, 1)]));

          /* Output 2 of every row: twiddle at W[TWVL * 2], stored along row 2. */
          y = BYTWJ(&(W[TWVL * 2]), VSUB(esum0, osum0));
          ST(&(x[WS(vs, 2)]), y, dist, &(x[WS(vs, 2)]));
          y = BYTWJ(&(W[TWVL * 2]), VSUB(esum1, osum1));
          ST(&(x[WS(vs, 2) + WS(is, 1)]), y, dist, &(x[WS(vs, 2) + WS(is, 1)]));
          y = BYTWJ(&(W[TWVL * 2]), VSUB(esum2, osum2));
          ST(&(x[WS(vs, 2) + WS(is, 2)]), y, dist, &(x[WS(vs, 2)]));
          y = BYTWJ(&(W[TWVL * 2]), VSUB(esum3, osum3));
          ST(&(x[WS(vs, 2) + WS(is, 3)]), y, dist, &(x[WS(vs, 2) + WS(is, 1)]));
     }

     /* W now points past the twiddles consumed by the loop. */
     return W;
}

/*
 * Twiddle instruction table for the non-FMA build of q1fv_4: three VTW
 * entries followed by a TW_NEXT record carrying VL.
 * NOTE(review): the kernel reads twiddles at W[0], W[TWVL * 2] and
 * W[TWVL * 4] and steps W by TWVL * 6, so each VTW entry presumably
 * accounts for TWVL * 2 values -- confirm against the VTW definition.
 */
static const tw_instr twinstr[] = {
     VTW(1), VTW(2), VTW(3),
     { TW_NEXT, VL, 0 }
};

/*
 * Descriptor handed to the planner together with the non-FMA q1fv_4
 * kernel.  Field meanings are defined by ct_desc, which is declared
 * elsewhere; the notes below only record what this file itself shows.
 */
static const ct_desc desc = {
     4,                  /* same value as the generator's "-n 4" option */
     "q1fv_4",           /* codelet name */
     twinstr,            /* twiddle instruction table defined above */
     &GENUS,
     { 44, 24, 0, 0 },   /* 44 additions, 24 multiplications, 0 fused multiply/add
                            per the generated operation-count comment; fourth
                            entry not explained in this file */
     0,
     0,
     0
};

/*
 * Entry point for this codelet: passes the q1fv_4 kernel and its
 * descriptor to X(kdft_difsq_register) for the given planner.
 */
void X(codelet_q1fv_4)(planner *p)
{
     X(kdft_difsq_register)(p, q1fv_4, &desc);
}
#endif                        /* HAVE_FMA */

Generated by  Doxygen 1.6.0   Back to index