// major polynomial functions

#define DEBUG 0

#define M_INT long long int
#define LONG long long int
#define ULONG unsigned long long int
#define UINT32 unsigned int
#define UINT64 unsigned long long
typedef enum{false, true} bool;

#include "int128g.c"

#include <stdio.h>
#include <sys/time.h>
#include <time.h>
#include <stdlib.h>

LONG * array(int n) { return (LONG *) malloc(n*sizeof(LONG)); }

LONG * array64s( LONG n ) {
    LONG *A;
if(n<0) { printf("array64s: n=%lld must be >= 0\n"); exit(1); }
    n = n * sizeof( LONG );
    A = (LONG *) malloc( n );
    if( A==0 ) { printf("array64s: malloc failed\n"); exit(1); }
    return A;
}

int * arrayI(int n) { return (int *) malloc(n*sizeof(int)); }

extern ULONG seed, mult;

LONG rand64s(LONG p) {
    LONG x,y;
    seed = mult*seed;
    x = seed >> 32;
    seed = mult*seed;
    y = seed >> 32;
    x = (x<<31) | y;
    x = x % p;
    return(x);
}

/******************************************************************************************/
/*  Roman's assembler utilities                                                           */
/******************************************************************************************/

#define WORDSIZE 64
typedef unsigned long M_UINT;

/* [x0:x1] := a*b */
#define MUL211(x0,x1,a,b) __asm__(\
"       mulq    %3      \n\t"   \
: "=a"(x0), "=d"(x1) : "0"(a), "r"(b) : "cc")

/* x0 := [x0:x1] / a */
/* x1 := [x0:x1] % a */
#define DIV21H(x0,x1,a) __asm__(\
"       divq    %2      \n\t"   \
: "=a"(x0), "=d"(x1) : "r"(a), "0"(x0), "1"(x1) : "cc")

/* floor of log2 */
M_UINT ilog2new(M_UINT x)
{
        M_UINT y;
        for (y=0; x >>= 1; y++);
        return y;
}

/* reciprocal of full word */
M_UINT recipnew(M_UINT p)
{
        M_UINT u0, u1;
        u1 = -(p+1);
        u0 = -1;
        DIV21H(u0,u1,p);
        return u0;
}

/* compute reciprocal and shift */
void init(M_UINT d, M_UINT *r, M_UINT *s)
{
        int l;
        l = ilog2new(d);
        if( d != (M_UINT)1 << l ) l++;
        d = d << (WORDSIZE-l);
        *r = recipnew(d);
        *s = l-1;
}

/* compute remainder via reciprocal */
inline M_UINT rem(M_UINT n, M_UINT d, M_UINT r, M_UINT s)
{
        M_UINT u0,u1,q;
        MUL211(u0,u1,n,r);
        q = (n - u1) >> 1;
        q = (q + u1) >> s;
        return n - q*d;
}

        /* z += a1:a0 */
        #define zadd(z,a0,a1) __asm__(\
        "       addq    %4, %0  \n\t" \
        "       adcq    %5, %1  \n\t" \
                : "=&r"(z[0]), "=&r"(z[1]) : "0"(z[0]), "1"(z[1]), "r"(a0), "r"(a1))

        /* z -= a1:a0 */
        #define zsub(z,a0,a1) __asm__(\
        "       subq    %4, %0  \n\t" \
        "       sbbq    %5, %1  \n\t" \
                : "=&r"(z[0]), "=&r"(z[1]) : "0"(z[0]), "1"(z[1]), "r"(a0), "r"(a1))

        /* z = a*b */
        #define zmul(z,a,b) __asm__(\
        "       mulq    %%rdx   \n\t" \
                : "=a"(z[0]), "=d"(z[1]) : "a"(a), "d"(b))

        /* z += a*b */
        #define zfma(z,a,b) do {        \
        unsigned long u,v;              \
        __asm__(                        \
        "       mulq    %%rdx           \n\t" \
        "       addq    %%rax, %0       \n\t" \
        "       adcq    %%rdx, %1       \n\t" \
                : "=&r"(z[0]), "=&r"(z[1]), "=a"(u), "=d"(v) : "0"(z[0]), "1"(z[1]), "a"(a), "d"(b));\
        } while (0)

        /* z -= a*b */
        #define zfms(z,a,b) do {        \
        unsigned long u,v;              \
        __asm__(                        \
        "       mulq    %%rdx           \n\t" \
        "       subq    %%rax, %0       \n\t" \
        "       sbbq    %%rdx, %1       \n\t" \
                : "=&r"(z[0]), "=&r"(z[1]), "=a"(u), "=d"(v) : "0"(z[0]), "1"(z[1]), "a"(a), "d"(b));\
        } while (0)

        /* z[0] = z % p */
        /* z[1] = z / p */
        /* quotient can overflow */
        #define zdiv(z,p) __asm__(\
        "       divq    %4      \n\t" \
                : "=a"(z[1]), "=d"(z[0]) : "a"(z[0]), "d"(z[1]), "r"(p))

        /* z = z % p safe */
        #define zmod(z,p) __asm__(\
        "       divq    %4      \n\t" \
        "       xorq    %0, %0  \n\t" \
                : "=a"(z[1]), "=d"(z[0]) : "a"(z[0]), "d"(z[1] < p ? z[1] : z[1] % p), "r"(p))

        /* z = z << s */
        #define zshl(z,s) __asm__(\
        "       shldq   %%cl, %0, %1    \n\t" \
        "       shlq    %%cl, %0        \n\t" \
                : "=&r"(z[0]), "=&r"(z[1]) : "0"(z[0]), "1"(z[1]), "c"(s))

typedef struct P_INT P_INT;
struct P_INT {
 	M_INT p;	/* prime */
 	M_INT s;	/* shift */
 	M_INT d;	/* normalized */
 	M_INT v;	/* reciprocal */
};

/* initialize mod p struct */
inline P_INT modinit64s(M_INT p)
{
        P_INT m;

        /* most significant bit location 63..0 */
        __asm__ __volatile__("bsrq %0, %0" : "=r"(m.s) : "0"(p));
        m.p = p;
        m.s = 64-m.s-1;
        m.d = p << m.s;

        /* reciprocal: (2^128-1)/n - 2^64 */
        /* [(2^64-n-1)*2^64 + (2^64-1)]/n */
        __asm__ __volatile__(\
        "       movq    %1, %%rdx       \n\t" \
        "       addq    $1, %%rdx       \n\t" \
        "       negq    %%rdx           \n\t" \
        "       movq    $-1, %%rax      \n\t" \
        "       divq    %1              \n\t" \
                : "=a"(m.v) : "r"(m.d) : "%rdx");
        return m;
}

inline M_INT mulmodinv64s(M_INT a, M_INT b, P_INT m)
{
        M_INT q, r;
        /* 0: compute  a*b = u1:u0, lshift by s, r9:r8 = (u1+1):u0 */
        /* 1: compute u1*v = q1:q0, q1:q0 += (u1+1):u0, in rdx:rax */
        /* 2: compute    r = u0-q1*d in r8 */
        /* 3: if r > q0 then q1 = q1-1, r = r+d (possible) */
        /* 4: if r >= d then q1 = q1+1, r = r-d (unlikely) */
        __asm__ __volatile__(\
        "0:     mulq    %%rdx                   \n\t" \
        "       shldq   %%cl,  %%rax, %%rdx     \n\t" \
        "       shlq    %%cl,  %%rax            \n\t" \
        "       leaq    1(%%rdx), %%r9          \n\t" \
        "       movq    %%rax, %%r8             \n\t" \
        "1:     movq    %%rdx, %%rax            \n\t" \
        "       mulq    %5                      \n\t" \
        "       addq    %%r8,  %%rax            \n\t" \
        "       adcq    %%r9,  %%rdx            \n\t" \
        "2:     imulq   %4,    %%rdx            \n\t" \
        "       subq    %%rdx, %%r8             \n\t" \
        "3:     cmpq    %%rax, %%r8             \n\t" \
        "       jbe     4f                      \n\t" \
        "       addq    %4,    %%r8             \n\t" \
        "       decq    %%rax                   \n\t" \
        "4:     cmpq    %4,    %%r8             \n\t" \
        "       jae     5f                      \n\t" \
        "       jmp     6f                      \n\t" \
        "5:     subq    %4,    %%r8             \n\t" \
        "       incq    %%rax                   \n\t" \
        "6:     shrq    %%cl,  %%r8             \n\t" \
        "       movq    %%r8,  %%rdx            \n\t" \
        : "=a"(q), "=d"(r)
        :  "0"(a),  "1"(b), "r"(m.d), "r"(m.v), "c"(m.s)
        : "%r8", "%r9", "%r10");
        /* q = quotient */
        return r;
}

//#define UINT64 unsigned long long
//
//typedef struct {
//	UINT64 s;	/* shift */
//	UINT64 v;	/* reciprocal of d */
//	UINT64 d0;	/* divisor shifted up */
//	UINT64 d1;
//} recint;
//


recint recip1(UINT64  p);
ULONG mulrec64(ULONG  a, ULONG  b, recint  v);


/******************************************************************************************/
/*  Zp utilities                                                                          */
/******************************************************************************************/

#define add32s add32util
#define sub32s sub32util
#define max32s max32util
#define min32s min32util
#define add64s add64util
#define sub64s sub64util
#define mul64s mul64util
#define neg64s neg64util

inline int add32s(int a, int b, int p) { int t; t = (a-p)+b; t += (t>>31) & p; return t; }
inline int sub32s(int a, int b, int p) { int t; t = (a-b);   t += (t>>31) & p; return t; }
inline int mul32s(int a, int b, int p) { int t; t = (LONG) a * b % p; return t; }
inline int neg32s(int a, int p) { return (a==0) ? 0 : p-a; }
inline int submul32s(int a, int b, int c, int p) { int t; t = ((LONG) p+a-b)*c % p; return t; }
inline int muladd32s(int a, int b, int c, int p) { int t; t = ((LONG) a*b+c) % p; return t; }
inline int max32s(int a, int b) { if(a>b) return a; else return b; }
inline int min32s(int a, int b) { if(a<b) return a; else return b; }
inline LONG add64s(LONG a, LONG b, LONG p) { LONG t; t = (a-p)+b; t += (t>>63) & p; return t; }
inline LONG sub64s(LONG a, LONG b, LONG p) { LONG t; t = a-b; t += (t>>63) & p; return t; }
inline LONG mul64s(LONG a, LONG b, LONG p) {
        LONG q, r;
        __asm__ __volatile__(           \
        "       mulq    %%rdx           \n\t" \
        "       divq    %4              \n\t" \
        : "=a"(q), "=d"(r) : "0"(a), "1"(b), "rm"(p));
        return r;
}

inline LONG neg64s(LONG a, LONG p) { LONG t; t = -a; t += (t>>63) & p; return t; }
inline LONG submul64s(LONG a, LONG b, LONG c, LONG p) { return mul64s(sub64s(a,b,p),c,p); } // This is faster.
inline LONG muladd64s(LONG a, LONG b, LONG c, LONG p, recint P) { return add64s(mulrec64(a,b,P),c,p); }
inline LONG max64s(LONG a, LONG b) { if(a>b) return a; else return b; }
inline LONG min64s(LONG a, LONG b) { if(a<b) return a; else return b; }

LONG log2N(LONG n) {

   LONG i,j;

   i=0;j=1;
   while(j<n){
      i++; j *= 2;
   }
   return i;

}

LONG ceil2(LONG n){
  if (n%2 == 0){ return n/2;}
  else if (n==1){return 0;}
  else {return (n/2)+1;}
}

int modinv32s( int c, int p )
{
    int d,r,q,r1,c1,d1;
    d = p;
    c1 = 1;
    d1 = 0;
    while( d != 0 ) {
        q = c / d;
        r = c - q*d; r1 = c1 - q*d1;
        c = d; c1 = d1;
        d = r; d1 = r1;
    }
    if( c!=1 ) return( 0 );
    if( c1 < 0 ) c1 += p;
    return( c1 );
}

/* c^(-1) mod p assuming 0 < c < p < 2^63 */
LONG modinv64s( LONG c, LONG p )
{   LONG d,r,q,r1,c1,d1;
    d = p;
    c1 = 1;
    d1 = 0;
    while( d != 0 ) {
        q = c / d;
        r = c - q*d; r1 = c1 - q*d1;
        c = d; c1 = d1;
        d = r; d1 = r1;
    }
    if( c!=1 ) return( 0 );
    if( c1 < 0 ) c1 += p;
    return( c1 );
}

/* a^n mod p assuming 0 <= a < p < 2^63 */
LONG powmod64s( LONG a, LONG n, LONG p )
{   LONG r,s;
    a += (a>>63) & p; // protect from bad input
    if( n==0 ) return 1;
    if( n==1 ) return a;
    for( r=1, s=a; n>0; n /= 2 ) { if( n & 1 ) r = mul64s(r,s,p); s = mul64s(s,s,p); }
    return r;
}

/* a^n mod p assuming 0 <= a < p < 2^63 */
LONG powmodrec64s( LONG a, LONG n, LONG p, recint P )
{   LONG r,s;
    a += (a>>63) & p; // protect from bad input
    if( n==0 ) return 1;
    if( n==1 ) return a;
    for( r=1, s=a; n>0; n /= 2 ) { if( n & 1 ) r = mulrec64(r,s,P); s = mulrec64(s,s,P); }
    return r;
}


/******************************************************************************************/
/*  Z  utilities                                                                          */
/******************************************************************************************/

void evalbeta( LONG *A, int d, LONG beta, recint P )
{   int i;
    if( d<0 ) return;
    A[0] = 1;
    if( d==0 ) return;
    A[1] = beta;
    for( i=2; i<=d; i++ ) A[i] = mulrec64(beta,A[i-1],P);
    return;
}


LONG gcd64s( LONG a, LONG b )
{   LONG d,r,q,r1,c1,d1;
    if( a<0 ) a = -a;
    if( b<0 ) b = -b;
    while( b!=0 ) { r = a % b; a = b; b = r; }
    return a;
}

/******************************************************************************************/
/* Polynomial utilities                                                                   */
/******************************************************************************************/

void vecfill64s( LONG x, LONG *A, int n )
{   int i;
    for( i=0; i<n; i++ ) A[i] = x;
    return;
}

void polcopy64s( LONG *A, int d, LONG *B )
{   int i;
    for( i=0; i<=d; i++) B[i]=A[i];
    return;
}

/* print an array in form [a0,a1,...,an-1] */
void vecprint32s( int *A, int n )
{   int i;
    printf("[");
    for( i=0; i<n; i++ ) { printf("%d",A[i]); if( i<n-1 ) printf(", "); }
    printf("]");
    return;
}

/* print an array in form [a0,a1,...,an-1] */
void vecprint64s( LONG *A, int n )
{   int i;
    printf("[");
    for( i=0; i<n; i++ ) { printf("%lld",A[i]); if( i<n-1 ) printf(", "); }
    printf("]");
    return;
}

/* print an array in form [a0,a1,...,an-1] */
void vecprint64u( ULONG *A, int n )
{   int i;
    printf("[");
    for( i=0; i<n; i++ ) { printf("%llu",A[i]); if( i<n-1 ) printf(", "); }
    printf("]");
    return;
}

/* print an array [a0,a1,...,ad] in form ad*x^d+...+a1*x+a0 */
void polprint64s( LONG *A, int d ) {
    int i;
    if( d==-1 ) { printf("0;\n"); return; }
    for( i=d; i>0; i-- ) if( A[i]!=0 ) printf("%lld*x^%d+",A[i],i);
/* print an array [a0,a1,...,ad] in form ad*x^d+...+a1*x+a0 */
    printf("%lld",A[0]);
    return;
}

void polrev64s( LONG *f, LONG d ) {
    LONG i; LONG t;
    for( i=0; i<=d/2; i++ ) { t = f[i]; f[i] = f[d-i]; f[d-i] = t; }
    return;
}

//print a 2D array in 2 variables
 void polprintbivar64s( LONG *A, int dx, int dy, char * x, char * y){

    int i,j;


    if(dx == -1 || dy == -1) { printf("0;\n"); return; }

    for (i=dy;i>=0;i--){
       for(j=dx;j>=0;j--){
          if(A[i*(dx+1)+j] != 0 && i*(dx+1)+j != 0){
             printf("%lld*%s^%d*%s^%d+",A[i*(dx+1)+j],x,j,y,i);
          }
       }
    }
    printf("%lld",A[0]);
    return;
 }

/* for C = [c1,c2,...,cn] and X = [e1,e2,...,en] print  c1*x^e1+...+cn*x^en;  */
void polsparseprint64s( LONG *C, LONG *X, int n ) {
    int i;
    if( n==0 ) { printf("0;\n"); return; }
    for( i=n-1; i>0; i-- ) printf("%lld*x^%lld+",C[i],X[i]);
    printf("%lld*x^%lld;\n",C[i],X[i]);
    return;
}

void pol3sparseprint64s( LONG *C, LONG *X, int t ) {
    int i,dx,dy,dz,mask;
    LONG m;
    if( t==0 ) { printf("0;\n"); return; }
    mask = (1<<21)-1;
    for( i=0; i<t; i++ ) {
        m = X[i]; dz = m & mask; m = m>>21; dy = m & mask; dx = m>>21;
        printf("%lld*x^%d*y^%d*z^%d",C[i],dx,dy,dz);
        if( i!=t-1 ) printf("+"); else printf(";\n");
    }
    return;
}

void polnsparseprint64s( LONG *C, LONG *X, int t, int n ) {
    int numbits,i,j;
    LONG m,d,mask,D[63];
    if( t==0 ) { printf("0;\n"); return; }
    numbits = 63/n;
    mask = (1LL<<numbits)-1;
    for( i=0; i<t; i++ ) {
        m = X[i];
        printf("%lld",C[i]);
        for( j=0; j<n; j++ ) {
            D[j] = m & mask;
            m = m >> numbits;
        }
        for( j=1; j<=n; j++ ) {
            d = D[n-j];
            if( d!=0 ) printf("*x[%d]^%lld",j,d);
        }
        if( i!=t-1 ) printf("+");
    }
    return;
}

void pol2sort64s( LONG *C, LONG *X, int n ) {
    LONG x,c;
    if( n<2 ) return;
    n--;
    x = X[n];
    c = C[n];
    pol2sort64s( C, X, n );
    while( n>0 && X[n-1]>x ) { X[n] = X[n-1]; C[n] = C[n-1]; n--; }
    X[n] = x;
    C[n] = c;
    return;
}

LONG pol2simpNoSort64s( LONG *C, LONG *X, int n, LONG p ) {
    LONG i,j,k,c,x;
    k = 0;
    i = 0;
    for( k=0,i=0; i<n; k++ ) {
       c = C[i];
       x = X[i];
       for( j=i+1; j<n && X[j]==x; j++ ) c = add64s(c,C[j],p);
       C[k] = c; X[k] = x;
       i = j;
    }
    return k;
}

void polscamul64s( LONG x, LONG *A, int d, LONG p )
{
    int i;
    if( x==1 ) return;
    if( x==-1 ) for( i=0; i<=d; i++ ) A[i] = neg64s(A[i],p);
    else for( i=0; i<=d; i++ ) A[i] = mul64s(x,A[i],p);
    return;
}

void polscamulrec64s( LONG x, LONG *A, int d, LONG p, recint P )
{
    int i;
    if( x==1 ) return;
    if( x==-1 ) for( i=0; i<=d; i++ ) A[i] = neg64s(A[i],p);
    else for( i=0; i<=d; i++ ) A[i] = mulrec64(x,A[i],P);
    return;
}

/* make polynomial in A monic */
void monic64s( LONG *A, int d, LONG p ) {
    int i; LONG inv;
    recint P;
    P = recip1(p);
    if( d<0 || A[d]==1 ) return;
    inv = modinv64s(A[d],p);
    for( i=0; i<d; i++ ) A[i] = mulrec64(inv, A[i], P);
    A[d] = 1;
    return;
}

/* make powers in A = [1, alpha, alpha^2, ..., alpha^(n-1)] */
void vecpow64s( int n, LONG *A, LONG alpha, LONG p )
{
   int i;
   //M_UINT recip, shift;  // Not used yet.
   LONG a4;
   recint P;

   if( n<1 ) return;
   //init( (M_UINT) p, &recip, &shift );  // Not used yet.
   P = recip1(p);

   A[0] = 1;
   for( i=1; i<4 && i<n; i++ ) A[i] = mulrec64(alpha,A[i-1],P);
   if( n<4 ) return;

   a4 = mulrec64(alpha,A[3],P);
   for( i=4; i<n-3; i+=4 ) { A[i+0] = mulrec64(a4,A[i-4],P);
                             A[i+1] = mulrec64(a4,A[i-3],P);
                             A[i+2] = mulrec64(a4,A[i-2],P);
                             A[i+3] = mulrec64(a4,A[i-1],P); }
   for( ; i<n; i++ ) A[i] = mulrec64(alpha,A[i-1],P);
   return;
}

/******************************************************************************************/
/* Polynomial routines                                                                    */
/******************************************************************************************/


int poldeg64s(LONG *A, int d){

   int i;
   for(i=d;i>=0;i--){
      if(A[i]!=0){ return i; }
   }

   return -1;

}

LONG poleval64s(LONG *a, int d, LONG x, LONG p) {
    int i;
    LONG r;
    recint P;
    if( d==-1 ) return 0;
    P = recip1(p);
    for( r=a[d],i=d-1; i>=0; i-- ) r = muladd64s(r, x, a[i], p, P);
    return r;
}

LONG evalhorner( LONG *f, LONG d, LONG x, recint P, LONG p ) {
    // f(x) mod p
    int i; LONG c;
    if( d<0 ) return 0;
    c = f[d];
    for( i=d-1; i>=0; i-- ) c = add64s(f[i],mulrec64(c,x,P),p);
    return c;
}

void newt2stand( LONG *V, int d, LONG *B, recint P, LONG p ) {
    int i,j; LONG alpha;
    B[0] = V[d];
    for( i=0,alpha=d-1; alpha>=0; alpha--,i++ ) {
        // B = (x-alpha) B mod p
        B[i+1] = B[i];
        for( j=i; j>0; j-- ) B[j] = sub64s(B[j-1],mulrec64(alpha,B[j],P),p);
        B[0] = sub64s(V[alpha],mulrec64(alpha,B[0],P),p);
        //printf(" B = "); vecprint64s(B,i+2); printf("\n");
    }
    return;
}

int newtoncofs( LONG *U, int d, LONG *minv, LONG *V, recint P, LONG p ) {
// Let V = V[0]+(x-0)V[1]+(x-1)(x-0)V[2]+(x-2)(x-1)(x-0)V[3]+...+(x-d-1)...(x-1)(x-0)V[d]
// Let V(i) = U[i] so that
// V[i]=(U[i] - (V[0]+iV[1]+i(i-1)V[2]+...+i!V[i-1])) / i!  which in horner form is
// V[i]=(U[i] - (V[0]+i(V[1]+(i-1)(V[2]+...+3(V[i-2]+2(V[i-1]))))) / i!
    int i,j; LONG t,m;
    if( d<0 ) return d;
    V[0] = U[0];
    for( i=1; i<=d; i++ ) {
        t = V[i-1];
        for( m=2,j=i-2; j>=0; j--,m++ )
            t = add64s( mulrec64(m,t,P), V[j], p );
        t = mulrec64( sub64s(U[i],t,p), minv[i], P );
        V[i] = t;
    }
    while( d>=0 && V[d]==0 ) d--;
    return d;
}

int polnsparsedeg64s( LONG *X, int t, int n, int xi ) {
    int numbits,j,d;
    LONG x,mask,shift;
    // degree(f,x[i])
    if( xi<1 || xi>n ) return -1;
    numbits = 63/n;
    mask = (1LL<<numbits)-1;
    shift = numbits*(n-xi);
    d = -1;
    for( j=0; j<t; j++ ) {
        x = X[j] >> shift;
        if( xi>1 ) x = x & mask;
        if( x>d ) d = x;
    }
    return d;
}


void polnsparsevecdeg64s( LONG *X, int t, int n, int *D ) {
    int numbits,i,j,d,e;
    LONG x,y,mask,shift;
    // D[i] = degree(f,x[i]) for 0<=i<n with variables read from the right.
    for(i=0; i<n; i++ ) D[i] = 0;
    numbits = 63/n;
    mask = (1LL<<numbits)-1;
    for( j=0; j<t; j++ ) {
        x = X[j];
        for( i=0; i<n; i++ ) {
            if( i<n-1 ) d = x & mask; else d = x;
            if( d>D[i] ) D[i] = d;
            x = x >> numbits;
        }
    }
    return;
}


LONG polnsparseeval64s(LONG *C, LONG *X, int t, LONG *A, int n, LONG p ) {
// Evaluate f(A[0],A[1],...,A[n-1]) mod p where f(x1,x2,...,xn) = sum( c[i]*X[i], i=0..t-1 )
    int numbits,i,j;
    LONG mask,s,x,u;
    recint P;
    P = recip1(p);
    numbits = 63/n;
    mask = (1LL<<numbits)-1;
    s = 0;
    for( i=0; i<t; i++ ) {
        x = X[i];
        u = C[i];
        for( j=0; j<n; j++ ) { u = mulrec64(u,powmodrec64s(A[j],x&mask,p,P),P); x = x >> numbits; }
        s = add64s(s,u,p);
    }
    return s;
}

int vec2partition64s(LONG *D, LONG *C, int n ) {
    int i,j,m;
    LONG x,t,c;
    m = n/2;
    x = D[m]; D[m] = D[0];
    c = C[m]; C[m] = C[0];
    for( i=1,j=n-1; j>=i ; ) {
       if( D[i]<x ) { D[i-1] = D[i]; C[i-1] = C[i]; i++; }
       else { t = D[i]; D[i] = D[j]; D[j] = t; t = C[i]; C[i] = C[j]; C[j] = t; j--; }
    }
    D[i-1] = x;
    C[i-1] = c;
    return i-1;
}


void sort2vec64s(LONG *D, LONG *C, int n ) {
// sort on D into ascending order
    int i;
    LONG x,c;
    if( n<2 ) return;
    if( n>40 ) {  // QUICKSORT
                  i = vec2partition64s(D, C, n);
                  sort2vec64s(D,C,i);
                  sort2vec64s(D+i+1,C+i+1,n-i-1);
                  return;
                 }
    sort2vec64s(D,C,n-1);
    x = D[n-1];
    c = C[n-1];
    for( i=n-1; i>0 && x<D[i-1]; i-- ) { D[i] = D[i-1]; C[i] = C[i-1]; }
    D[i] = x;
    C[i] = c;
    return;
}

LONG pol2simp64s( LONG *C, LONG *X, int n, LONG p ) {
    LONG i,j,k,c,x;
    sort2vec64s(X,C,n);
    k = 0;
    i = 0;
    for( k=0,i=0; i<n; ) {
       c = C[i];
       x = X[i];
       for( j=i+1; j<n && X[j]==x; j++ ) c = add64s(c,C[j],p);
       if( c!=0 ) { C[k] = c; X[k] = x; k++; }
       i = j;
    }
    return k;
}


// Evaluate g(x1) = f(x1,A[0],A[1],...,A[n-2]) where f = sum( A[i]*X[i], i=0..t-1 )
// where g(x1) = sum( B[i]*Y[i], i=0..d ) and return d;
int poln2nm1sparseeval64s( LONG *C, LONG *X, int t, LONG *A, int n, LONG *B, LONG *Y, LONG p )
{
    int numbits,i,j,k;
    LONG mask,s,x,u;
    recint P;
    P = recip1(p);
    numbits = 63/n;
    mask = (1LL<<numbits)-1;
    s = numbits*(n-1);
    mask = (1LL<<s) - 1;

    for( i=0; i<t; i++ ) {
        x = X[i];
        u = C[i];
        for( j=0; j<n-1; j++ ) { u = mulrec64(u,powmodrec64s(A[j],x&mask,p,P),P); x = x >> numbits; }
        Y[i] = x;
        B[i] = u;
    }
    sort2vec64s(Y,B,t);
    k=0;
    i=0;
    while( i<t ) {
       s = B[i];
       for( j=i+1; j<t && Y[i]==Y[j]; j++ ) s = add64s(B[j],s,p);
       if( s!= 0 ) { B[k] = s; Y[k] = Y[i]; k++; }
       i = j;
    }
    return k;
}

// So beta = alpha^i and we evaluate f(x1,y) at y=beta
int poln2nm1kroneval64s( LONG *C, LONG *X, int t, LONG beta, int n, LONG *B, LONG *Y, LONG p )
{
    int numbits,i,j,k;
    LONG pow[63],mask,s,x,u;
    recint P;
    P = recip1(p);
    numbits = 63/n;
    s = numbits*(n-1);
    mask = (1LL<<s) - 1;

    pow[0] = 1;
    pow[1] = beta;
    for( i=2; i<s; i++ ) pow[i] = mulrec64(pow[i-1],pow[i-1],P);

    for( i=0; i<t; i++ ) {
        x = X[i];
        Y[i] = x >> s;
        x = x & mask;
        //u = powmod64s(beta,x,p);
        for( u=1,j=1; x>0; x=x>>1,j++ ) if( x&1 ) u = mulrec64(u,pow[j],P);
        B[i] = mulrec64(C[i],u,P);
    }

    sort2vec64s(Y,B,t);
    k=0;
    i=0;
    while( i<t ) {
       s = B[i];
       for( j=i+1; j<t && Y[i]==Y[j]; j++ ) s = add64s(B[j],s,p);
       if( s!= 0 ) { B[k] = s; Y[k] = Y[i]; k++; }
       i = j;
    }
    return k;
}

void randpoly64s( int n, int *DEG, int t, LONG *A, LONG *X, LONG p )
{   // monic in x[n-1]
    int numbits,i,j;
    LONG x,c,y;
    numbits = 63/n;
    for( i=0; i<t; i++ ) {
        for( c=0; c==0; c = rand64s(p) );
        A[i] = c;
        x = rand64s(DEG[0]);
        for( j=1; j<n; j++ ) { y = rand64s(DEG[j]+1); x = (x << numbits) | y; }
        X[i] = x;
    }
    // make it monic in x[0] and not be divisible by x[0]
    A[t-1] = 1;
    X[t-1] = (LONG)DEG[0] << (numbits*(n-1));
    X[0  ] = X[0] & ( (1LL << numbits*(n-1)) - 1 );
}


void randpolymonic64s( int n, int *DEG, int t, LONG *A, LONG *X, LONG p )
{   // monic in x[n-1]
    int numbits,i,j;
    LONG x,c,y;
    numbits = 63/n;
    for( i=0; i<t; i++ ) {
        for( c=0; c==0; c = rand64s(p) );
        A[i] = c;
        x = rand64s(DEG[0]+1);
        for( j=1; j<n; j++ ) { y = rand64s(DEG[j]+1); x = (x << numbits) | y; }
        X[i] = x;
    }
    // make it monic in x[0] and not be divisible by x[0]
    A[t-1] = 1;
    X[t-1] = (LONG)(DEG[0]+1) << (numbits*(n-1));
    X[0  ] = X[0] & ( (1LL << numbits*(n-1)) - 1 );
}


void randpolyNOTmonic64s( int n, int *DEG, int t, LONG *A, LONG *X, LONG p )
{
    int numbits,i,j;
    LONG x,c,y;
    numbits = 63/n;
    for( i=0; i<t; i++ ) {
        for( c=0; c==0; c = rand64s(p) );
        A[i] = c;
        x = rand64s(DEG[0]+1);
        for( j=1; j<n; j++ ) { y = rand64s(DEG[j]+1); x = (x << numbits) | y; }
        X[i] = x;
    }
    // make it have a constant
    X[0] = 0;
}

void randpolyNOTmonic64s2( int n, int *DEG, int D, int t, LONG *A, LONG *X, LONG p )
{   // with total degree at most D
    int numbits,i,j;
    LONG x,c,y,d;
    numbits = 63/n;
    for( i=0; i<t; i++ ) {
        for( c=0; c==0; c = rand64s(p) );
        A[i] = c;
        x = rand64s(DEG[0]+1);
        d = x;
        for( d=x,j=1; j<n; j++,d+=y ) { y = rand64s(DEG[j]+1); x = (x << numbits) | y; }
        if( d<=D ) X[i] = x; else i--;
    }
    // make it have a constant
    X[0] = 0;
    return;
}

LONG polsparseeval64s(LONG *c, LONG *a, int n, LONG x, LONG p) {
    int i;
    LONG d, y,r;
    recint P;
    if( n==0 ) return 0;
    P = recip1(p);
    r = c[n-1];
    for( i=n-1; i>0; i-- ) {
        d = a[i]-a[i-1];
        if( d<=0 ) return 0; // should not happen
        if( d==1 ) y = x; else y = powmodrec64s(x,d,p,P);
        //r = (y * r + c[i-1]) % p;
        r = muladd64s(y,r,c[i-1],p,P);
    }
    y = powmodrec64s(x,a[0],p,P);
    r = mulrec64(y,r,P);
    return r;
}


LONG evalnext2( LONG *A, LONG *X, LONG *G, int na, LONG *B, LONG *C, LONG *Y, LONG *Z, LONG p, int MAX )
{   int i,j,k,m,n;
    LONG c,d,t,x;
    recint P;
    P = recip1(p);
    k = m = 0;
    for( i=0; i<na; i=n ) {
        for( x=X[i],n=i+1; n<na && X[n]==x; n++ );
        //c = d = 0;
        //for( j=i; j<n; j++ ) {
        //   t = mulrec64(A[j],G[j],P); c = add64s(t,c,p);
        //   t = mulrec64(t,G[j],P); A[j] = t; d = add64s(t,d,p);
        //}
        for( c=0,j=i; j<n; j++ ) { t = mulrec64(A[j],G[j],P); A[j] = t; c = add64s(t,c,p); }
        for( d=0,j=i; j<n; j++ ) { t = mulrec64(A[j],G[j],P); A[j] = t; d = add64s(t,d,p); }
        if( c!=0 ) { B[k] = c; Y[k] = x; k++; }
        if( d!=0 ) { C[m] = d; Z[m] = x; m++; }
    }
    if( DEBUG ) if ( k>MAX ) printf("ERROR in evalnext2: k out of range k=%d  max=%d\n",k,MAX);
    if( DEBUG ) if ( m>MAX ) printf("ERROR in evalnext2: m out of range m=%d  max=%d\n",m,MAX);
    t = ((LONG) k << 32) + m;
    return t;
}


/******************************************************************************/
/**             Polynomial routines in Zp[x]                                 **/
/******************************************************************************/

int poladd64s(LONG *a, LONG *b, LONG *c, int da, int db, LONG p) {
// c = a + b mod p
   int i,m;
   m = min32s(da,db);
   for( i=0; i<=m; i++ ) c[i] = add64s(a[i],b[i],p);
   if( da==db ) { while( da>=0 && c[da]==0 ) da--; return da; }
   if( da<db ) { for( i=da+1; i<=db; i++ ) c[i] = b[i]; return db; }
   for( i=db+1; i<=da; i++ ) c[i] = a[i]; return da;
}


int poladdsca64s(LONG *a, LONG *b, LONG *c, int da, int db, LONG alpha, LONG p, recint P) {
// c = a + alpha b mod p  --  also supports  a = a + alpha b mod p
   int i,m;
   m = min32s(da,db);
   for( i=0; i<=m; i++ ) c[i] = add64s(a[i],mulrec64(alpha,b[i],P),p);
   if( da==db ) { while( da>=0 && c[da]==0 ) da--; return da; }
   if( da>db ) { for ( i=db+1; i<=da; i++ ) c[i] = a[i]; return da; }
   for( i=da+1; i<=db; i++ ) c[i] = mulrec64(alpha,b[i],P);
   return db;
}


int polsub64s(LONG *a, LONG *b, LONG *c, int da, int db, LONG p) {
// c = a-b mod p
   int i,m;
   m = min32s(da,db);
   //if( da<0 ) printf("SUB da=%d\n",da);
   //if( db<0 ) printf("SUB db=%d\n",db);
   for( i=0; i<=m; i++ ) c[i] = sub64s(a[i],b[i],p);
   if( da==db ) { while( da>=0 && c[da]==0 ) da--; return da; }
   if( da>db ) { for ( i=db+1; i<=da; i++ ) c[i] = a[i]; return da; }
   else { for( i=da+1; i<=db; i++ ) c[i] = neg64s(b[i],p); return db; }
}


int polsubsca64s(LONG *a, LONG *b, LONG *c, int da, int db, LONG alpha, LONG p, recint P) {
// c = a - alpha b mod p  --  also supports  a = a - alpha b mod p
   int i,m;
   m = min32s(da,db);
   for( i=0; i<=m; i++ ) c[i] = sub64s(a[i],mulrec64(alpha,b[i],P),p);
   if( da==db ) { while( da>=0 && c[da]==0 ) da--; return da; }
   if( da>db ) { for ( i=db+1; i<=da; i++ ) c[i] = a[i]; return da; }
   for( i=da+1; i<=db; i++ ) c[i] = neg64s(mulrec64(alpha,b[i],P),p);
   return db;
}


/* compute C(x) = A(x)^2 mod p and return deg(C) */
/* we allow C to overwrite A i.e. polsqr64s(A,A,d,p) */
int polsqr64s( LONG * A, LONG * C, int d, LONG p )
{
    int i,k,m; ULONG z[2];
    if( d<0 ) return d;
    for( k=2*d; k>=0; k-- ) {
       m = min32s(k,d);
       i = max32s(0,k-d);
       z[0] ^= z[0]; // = 0?
       z[1] ^= z[1]; // = 0?
       while( i<m-2 ) {
            zfma(z,A[i++],A[m--]);
            if( z[1]>=p ) z[1] -= p;
            zfma(z,A[i++],A[m--]);
       }
       if( i<m ) {
            zfma(z,A[i++],A[m--]);
            if( z[1]>=p ) z[1] -= p;
       }
       zadd(z,z[0],z[1]);
       if( z[1]>=p ) z[1] -= p;
       if( i==m ) zfma(z,A[i],A[i]);
       zmod(z,p);
       C[k] = z[0];
    }
    //for( dc = 2*d; dc>=0 && C[dc]==0; dc-- );
    // Why is this loop here? Z_p has no zero-divisors.
    return( 2*d );
}

int min(LONG a, LONG b){ if(a<b){return a;} else {return b;} }
int max(LONG a, LONG b){ if(a>b){return a;} else {return b;} }

int polmul32s( LONG * A, LONG * B, LONG * C, LONG da, LONG db, LONG p )
{
    LONG dc,i,j,k,m; LONG t,M;
    if( da<0 || db<0 ) return -1;
    //if( A==B ) return polsqr32s(A,C,da,p);
    M = ((LONG) p)*p*2;
    dc = da+db;
    for( k=dc; k>=0; k-- ) {
       m = min(k,da);
       i = max(0,k-db);
       t = M;
       while( i<m ) {
            t -= ((LONG) A[i])*B[k-i]; i++;
            t -= ((LONG) A[i])*B[k-i]; i++;
            t += (t>>63) & M;
       }
       if( i==m ) { t -= ((LONG) A[i])*B[k-i]; t += (t>>63) & M; }
       t = -t;
       t += (t>>63) & M;
       t = t % p;
       C[k] = t;
    }
    return( dc );
}

/* compute C(x) = A(x) * B(x) mod p and return deg(C) */
/* we allow C to overwrite either A or B i.e. polmul64s(A,B,A,da,db,p) */
int polmul64s( LONG * A, LONG * B, LONG * C, int da, int db, LONG p)
{
    int i,k,m;
    int overflow;
    //recint P; P = recip1(p);
    ULONG z[2];
    if( da<0 || db<0 ) return -1;
    int dc = da+db;
    for( k=dc; k>=0; k-- ) {
       i = max32s(0,k-db);
       m = min32s(k,da);
       //{ LONG t;
       //   //for( t=0; i<=m; i++ ) t = add64s(t,mul64s(A[i],B[k-i],p),p);
       //   for( t=0; i<=m; i++ ) t = add64s(t,mulrec64(A[i],B[k-i],P),p);
       //   C[k] = t;
       //}
       z[0] = 0;
       z[1] = 0;
       //if( p < 1LL << 31 ) { // can't overflow - 20% faster on a Core i7 2600
       if( p < 1LL << 50 ) { // can't overflow - 20% faster on a Core i7 2600 #if p < 2^50
          while( i<m-2 ) {
             zfma(z,A[i],B[k-i]); i++;
             zfma(z,A[i],B[k-i]); i++;
             zfma(z,A[i],B[k-i]); i++;
          }
          while( i<=m ) { zfma(z,A[i],B[k-i]); i++; }
       }
       else {
          while( i<m-3 ) {
             zfma(z,A[i],B[k-i]); i++; if( z[1]>=p ) z[1] -= p;
             zfma(z,A[i],B[k-i]); i++;
             zfma(z,A[i],B[k-i]); i++; if( z[1]>=p ) z[1] -= p;
             zfma(z,A[i],B[k-i]); i++;
          }
          while( i<=m ) { zfma(z,A[i],B[k-i]); i++; if( z[1]>=p ) z[1] -= p; }
       }
       zmod(z,p);
       C[k] = z[0];
    }

    return( dc );
}

/* divide A by B and put the remainder and quotient in A */
/* return the degree of the remainder                    */
int poldiv64s( LONG * A, LONG * B, int da, int db, LONG p )
{
    int dq,dr,k,j,m; LONG t,inv;
    ULONG z[2];
    //recint P; P = recip1(p);
    if( db<0 ) { printf("division by zero\n"); exit(1); }
    if( da<db ) return da; else { dq = da-db; dr = db-1; }
    inv = modinv64s(B[db],p);
    for( k=da; k>=0; k-- ) {
        m = min32s(dr,k);
        j = max32s(0,k-dq);
        //for( t=0; j<=m; j++ ) t = add64s(t,mul64s(B[j],A[k-j+db],p),p);
        //for( t=0; j<=m; j++ ) t = add64s(t,mulrec64(B[j],A[k-j+db],P),p);
        //t = sub64s(A[k],t,p);
        z[0] = 0;
        z[1] = 0;
        //if( p< 1LL << 31 ) { // can't overflow - 20% faster on a Core i7 2600
        if( p< 1LL << 50 ) { // can't overflow - 20% faster on a Core i7 2600
            while( j<m-2 ) {
                zfma(z,B[j],A[k-j+db]); j++;
                zfma(z,B[j],A[k-j+db]); j++;
                zfma(z,B[j],A[k-j+db]); j++;
            }
            while( j<=m ) { zfma(z,B[j],A[k-j+db]); j++; }
        }
        else {
            while( j<m-3 ) {
                zfma(z,B[j],A[k-j+db]); j++; if( z[1]>=p ) z[1] -= p;
                zfma(z,B[j],A[k-j+db]); j++;
                zfma(z,B[j],A[k-j+db]); j++; if( z[1]>=p ) z[1] -= p;
                zfma(z,B[j],A[k-j+db]); j++;
            }
            while( j<=m ) { zfma(z,B[j],A[k-j+db]); j++; if( z[1]>=p ) z[1] -= p; }
        }
        zmod(z,p);
        t = sub64s(A[k],z[0],p); // t = A[k] - z[0]; t += (t>>63) & p;
        if( k>=db && inv!=1 ) t = mul64s(t,inv,p);
        A[k] = t;
    }
    while( dr>=0 && A[dr]==0 ) dr--;
    return( dr );
}


/* compute gcd(A,B) and put gcd in A and return it's degree */
int polsubmulP( LONG *A, LONG *B, LONG a, LONG b, int dA, int dB, LONG p, recint P ) {

   // compute A = A - (ax+b) B efficiently

   LONG s,t; int i, d;

   if( dB==-1 ) return dA; // B = 0

   d = dA;

   // if deg(A) <= deg(B) then pad A with zeroes
   while( dA<=dB ) A[++dA] = 0;

   // constant term is special
   t = mulrec64(b,B[0],P);
   A[0] = sub64s(A[0],t,p);

   //for( i=1; i<=dB; i++ ) { t = mul64s(a,B[i-1],p); t = add64s(t,mul64s(b,B[i],p),p); A[i] = sub64s(A[i],t,p); }
   for( i=1; i<=dB; i++ ) { t = mulrec64(a,B[i-1],P); t = add64s(t,mulrec64(b,B[i],P),p); A[i] = sub64s(A[i],t,p); }

   // update leading term from B
   t = mulrec64(a,B[dB],P);
   A[dB+1] = sub64s(A[dB+1],t,p);

   // compute and return degree
   while( dA>=0 && (A[dA]==0 || A[dA]==p) ) dA--;

   if( dA==d ) printf("polsubmul failure: dAin=%d dAout=%d\n",d,dA);
   return dA;
}

int polsubmul( LONG *A, LONG *B, LONG a, LONG b, int dA, int dB,
               LONG p, M_UINT recip, M_UINT shift ) {
   // compute A = A - (ax+b) B efficiently

   LONG t; int i; ULONG z[2];

   if( dB==-1 ) return dA; // B = 0
   z[0] = z[1] = 0LL;

   // if deg(A) <= deg(B) then pad A with zeroes
   while( dA<=dB ) A[++dA] = 0;

   // constant term is special
   t = mul64s(b,B[0],p) ;
   A[0] = sub64s(A[0],t,p);

   for( i=1; i<=dB; i++ ) { zmul(z,a,B[i-1]); zfma(z,b,B[i]); zmod(z,p); t = A[i]-z[0]; A[i] = t + ((t>>63)&p); }

   // update leading term from B
   t = mul64s(a,B[dB],p);
   A[dB+1] = sub64s(A[dB+1],t,p);

   // compute and return degree
   while( dA>=0 && (A[dA]==0 || A[dA]==p) ) dA--;
   return dA;
}


/* compute gcd(A,B) and put gcd in A and return it's degree */
/* Both A and B are destroyed */
int polgcd64s( LONG * A, LONG * B, int da, int db, LONG p ) {
    int dr; LONG *C, *D, *R, u, a, b;
    recint P;
    if( db<0 ) { printf("division by zero\n"); exit(1); }
    P = recip1(p);
    C = A; D = B;
    if( da<db ) { R = C; C = D; D = R; dr = da; da = db; db = dr; }
    while( 1 ) {
        if( db>0 && da-db==1 ) { // normal case
        //if( 0 ) {
            u = modinv64s(D[db],p);
            a = mulrec64(C[da],u,P);
            b = mulrec64(a,D[db-1],P);
            b = mulrec64(u,sub64s(C[da-1],b,p),P);  // quotient = a x + b
            dr = polsubmulP(C,D,a,b,da,db,p,P);  // C = C - (a x + b) D
            //dr = polsubmul(C,D,a,b,da,db,p,1,1);  // C = C - (a x + b) D
        }
        else dr = poldiv64s(C,D,da,db,p);
        if( dr<0 ) { /* D|C so gcd(A,B)=D */
            if( D!=A ) polcopy64s(D,db,A);
            monic64s( A, db, p );
            return db;
        }
        R = C; C = D; D = R; da = db; db = dr;
        //printf("da=%d db=%d\n",da,db);
    }
}

void polgcdext64s( LONG *A, LONG *B, int da, int db,
                  LONG *G, LONG *S, LONG *T, int *dG, int *dS, int *dT,
                  LONG *W, LONG p )
{
    // Solve S A + T B = G = monic gcd(A,B) for G,S,T in Zp[x]
    // The arrays A and B are used for the remainder sequence so are destroyed
    // G,S,T must be of size max(da+1,db+1)
    // W is working storage of size 2max(da+1,db+1)
    // if S==0 or T==0 then S (and/or T) are not computed

    int m,dr,ds,dt,dq,ds1,ds2,dt1,dt2; LONG a,b,u, k;
    LONG *q,*r,*r1,*r2,*s,*s1,*s2,*t,*t1,*t2;

    //P_INT P = modinit(p);
    recint P = recip1(p);

//printf("p = %lld  da=%d  db=%d\n",p,da,db);
//printf("GCDEX: A := "); polprint64s(A,da); printf(";\n");
//printf("GCDEX: B := "); polprint64s(B,db); printf(";\n");
//if( S!=NULL ) printf("S");
//if( T!=NULL ) printf("T");
//printf("\n");

    if( da<0 || db<0 ) { printf("inputs must be non-zero\n"); exit(1); }
    m = max32s(da+1,db+1);
    r1 = A; r2 = B;
    if(S) { s1 = S; s2 = W;   s1[0]=1; ds1=0; ds2=-1; }
    if(T) { t1 = T; t2 = W+m; t2[0]=1; dt2=0; dt1=-1; }
    //M_UINT recip, shift;
    //init( (M_UINT) p, &recip, &shift );
    M_UINT recip = 1; M_UINT shift = 1;
    while( 1 ) {
        if( db>0 && da-db==1 ) { // normal case
            u = modinv64s(r2[db],p);
            a = mulrec64(r1[da],u,P);
            b = mulrec64(a,r2[db-1],P);
            b = mulrec64(u,sub64s(r1[da-1],b,p),P);                       // quotient = a x + b
            dr = polsubmul(r1,r2,a,b,da,db,p,recip,shift);              // r1 = r1 - (a x + b) r2
            if(S) ds = polsubmul(s1,s2,a,b,ds1,ds2,p,recip,shift);      // s1 = s1 - (a x + b) s2
            if(T) dt = polsubmul(t1,t2,a,b,dt1,dt2,p,recip,shift);      // t1 = t1 - (a x + b) t2
        }
        else {
            dr = poldiv64s(r1,r2,da,db,p);                 // r1 = [remainder,quotient]
            q  = r1+db; dq = da-db;
            if(S) ds = polmul64s(q,s2,G,dq,ds2,p);
            if(S) ds = polsub64s(s1,G,s1,ds1,ds,p);        // s1 = s1 - q s2
            if(T) dt = polmul64s(q,t2,G,dq,dt2,p);
            if(T) dt = polsub64s(t1,G,t1,dt1,dt,p);        // t1 = t1 - q t2
        }
        if( dr<0 ) { /* D|C so gcd(A,B)=D */
            polcopy64s(r2,db,G);
            if(S) if( s2!=S ) polcopy64s(s2,ds2,S);
            if(T) if( t2!=T ) polcopy64s(t2,dt2,T);
            if( G[db]!=1 ) {
                u = modinv64s(G[db],p);
                polscamulrec64s(u,G,db,p,P);
                if(S) polscamulrec64s(u,S,ds2,p,P);
                if(T) polscamulrec64s(u,T,dt2,p,P);
            }
            dG[0] = db;
            if(S) dS[0] = ds2;
            if(T) dT[0] = dt2;
//if( S ) { printf("GCDEX: S = "); polprint64s(S,ds2); printf(";\n"); }
//if( T ) { printf("GCDEX: T = "); polprint64s(T,dt2); printf(";\n"); }
            return;
        }
        r = r1; r1 = r2; r2 = r;  da = db;   db = dr;
        if(S) { s = s1; s1 = s2; s2 = s; ds1 = ds2; ds2 = ds; }
        if(T) { t = t1; t1 = t2; t2 = t; dt1 = dt2; dt2 = dt; }
    }
}


int polsparseadd64s( LONG *A, LONG *X, int m, LONG *B, LONG *Y, int n, LONG p, LONG *C, LONG *Z)
// assumes X and Y are sorted so we do a merge into Z
{
   int i,j,k,t;
   LONG c;
   for( i=j=k=0; i<m && j<n; ) {
       if( X[i]<Y[j] ) { Z[k] = X[i]; C[k] = A[i]; i++; k++; }
       else if ( X[i]>Y[j] ) { Z[k] = Y[j]; C[k] = B[j]; j++; k++; }
       else {
          c = add64s(A[i],B[j],p);
          if( c!=0 ) { Z[k] = X[i]; C[k] = c; k++; }
          i++; j++;
       }
   }
   while( i<m ) { Z[k] = X[i]; C[k] = A[i]; i++; k++; }
   while( j<n ) { Z[k] = Y[j]; C[k] = B[j]; j++; k++; }
   return k;
}

int polsparsemul64s( LONG *A, LONG *X, int m, LONG *B, LONG *Y, int n, LONG p , LONG *C, LONG *Z) {
   int i,j,k,t;
   LONG s;
   recint P;
   P = recip1(p);
   for( i=0; i<m; i++ )
      for( j=0; j<n; j++ ) { Z[n*i+j] = X[i]+Y[j]; C[n*i+j] = mulrec64(A[i],B[j],P); }
   t = m*n;
   sort2vec64s(Z,C,t);
   k=0;
   i=0;
   while( i<t ) {
       s = C[i];
       for( j=i+1; j<t && Z[i]==Z[j]; j++ ) s = add64s(C[j],s,p);
       if( s!= 0 ) { C[k] = s; Z[k] = Z[i]; k++; }
       i = j;
   }
   return k;
}

//polsparsemul64s without mod p
int polymulti( LONG *A, LONG *X, int m, LONG *B, LONG *Y, int n, LONG p , LONG *C, LONG *Z) {
   int i,j,k,t;
   LONG s;
   for( i=0; i<m; i++ )
      for( j=0; j<n; j++ ) { Z[n*i+j] = X[i]+Y[j]; C[n*i+j] = A[i]*B[j]; }
   t = m*n;
   sort2vec64s(Z,C,t);
   k=0;
   i=0;
   while( i<t ) {
       s = C[i];
       for( j=i+1; j<t && Z[i]==Z[j]; j++ ) s = C[j]+s;
       if( s!= 0 ) { C[k] = s; Z[k] = Z[i]; k++; }
       i = j;
   }
   return k;
}


//get the coefficient for the x^i term
int COEFF64s( LONG *A, LONG *X, int t, int n, int i, LONG **C, LONG **Z ) {
    int j,k,d,numbits,xshift;
    LONG mask,*B,*Y;
    numbits = 63/n;
    xshift = numbits*(n-1);
    mask = (((LONG) 1) << xshift) - 1;
    //count number of matches
    for( k=0,j=0; j<t; j++ ) {
        d = X[j] >> xshift;
        if( d==i ) k++;
    }
    if( k==0 ) return 0;
    //alocate memory for the matched terms
    B = array(k); C[0] = B;
    Y = array(k); Z[0] = Y;
    //put all matched terms inside the memory
    //no degree should be changed
    for( k=0,j=0; j<t; j++ ) {
        d = X[j] >> xshift;
        if( d==i ) { B[k] = A[j]; Y[k] = X[j] & mask; k++; }
    }
    return k; // the number of terms of the coefficient
}


int poldiff64s( LONG *A, LONG *X, int t, int n, int j, LONG p ) {
// diff(a,x[j]) a is stored as pair (A,X) with t terms in n variables
   int i, k, numbits, shift; LONG mask, x, one; recint P;
   P = recip1( p );
   numbits = 63/n;
   shift = numbits*(n-j);
   mask = (1LL<<numbits)-1;
   one = 1LL << shift;
   k = 0;
   for( i=0; i<t; i++ ) {
       x = (X[i] >> shift) & mask;
       if( x>0 ) { A[k] = mulrec64(x,A[i],P); X[k] = X[i] - one; k++; }
   }
   return k; // number of terms in derivative
}


int polbivareval64s( LONG *A, LONG *X, int na, LONG *W, LONG *B, int dx, int dy, LONG p ) {
// eval(a(x,y),y=j) a is stored as pair (A,X) with t terms in 2 variables
// Output is a dense array of coefficients of degree dx
   int i,j,shift,x,y;
   LONG t,mask,z[2];
   recint P;
   P = recip1(p);
//printf("BIVAR  dx=%d  dy=%d  p=%lld", dx,dy,p); vecprint64s(W,dy); printf(";\n");
   for( i=0; i<=dx; i++ ) B[i] = 0;
   shift = 31;
   mask = (1LL<<31)-1;
   for( i=0; i<na; i=j ) {
       x = X[i] >> shift;
       for( j=i+1; j<na && ((X[j] >> shift) == x); j++ );
       //for( t=B[x]; i<j; i++ ) { y = X[i] & mask; t = add64s(t,mulrec64(W[y],A[i],P),p); }
       //B[x] = t;
       z[0] = B[x];
       z[1] = 0;
       while( i<j-1 ) {
            y = X[i] & mask; zfma(z,W[y],A[i]); if( z[1]>=p ) z[1] -= p; i++;
            // 20% faster to have if( z[1]>=p ) here in the middle than at the beginning or end of the loop
            y = X[i] & mask; zfma(z,W[y],A[i]); i++;
       }
       if( i==j-1 ) { y = X[i] & mask; zfma(z,W[y],A[i]); if( z[1]>=p ) z[1] -= p; i++; }
       zmod(z,p);
       B[x] = z[0];
   }
   while( dx>=0 && B[dx]==0 ) dx--;
   return dx;
}

//My Algorithms
void cleanArray64s(LONG *A, int len){
   int i;
   for(i=0;i<len;i++)A[i] = 0;
}

ULONG convol64s(LONG *A, LONG *B, LONG min, LONG max, LONG d, LONG p){

   int i;
   ULONG z[2];
   z[0] =0; z[1] = 0;

   if( p < 1LL << 50 ) { // if p < 2^50
      i = min;
      while(i<max-2){
             zfma(z,A[i],B[d-i]);i++;
             zfma(z,A[i],B[d-i]);i++;
             zfma(z,A[i],B[d-i]);i++;
      }
      while(i<=max){
         zfma(z,A[i],B[d-i]);i++;
      }
   }
   else{
     i = min;
     while( i<max-3 ) {
        zfma(z,A[i],B[d-i]); i++; if( z[1]>=p ) z[1] -= p;
        zfma(z,A[i],B[d-i]); i++;
        zfma(z,A[i],B[d-i]); i++; if( z[1]>=p ) z[1] -= p;
        zfma(z,A[i],B[d-i]); i++;
            }
            while( i<=max ) { zfma(z,A[i],B[d-i]); i++; if( z[1]>=p ) z[1] -= p; }

   }

   zmod(z,p);
   return z[0];


}

void poladdFast(LONG *A,LONG *B, LONG *C,int dx, LONG p){
   int i;
   for(i=0;i<=dx;i++)C[i] = add64s(A[i],B[i],p);
}

void addPol(LONG *A, int dz, LONG p){

   int j,k;

   for(k=0;k<dz;k++) for(j=dz-1;j>=k;j--) A[j] = add64s(A[j],A[j+1],p);

}

void createProblem(LONG *A, LONG *F0, int n, int dx, int dz, int dxA, int dzA, LONG alpha, LONG p, LONG *W){

    //Note: dxA = dx*n, dzA = dz*n

   LONG *TEMP,*TEMP2,*TEMP3,t1,t2,alpha2,r;
   int i,j,k,l,m,p1,p2,p3,MIN,MAX,da,db,dc,w;
   ULONG z[2];

   //return the degree of dxA,dzA


   TEMP = array((dxA+1)*(dzA+1));
   TEMP2 = array((dxA+1)*(dzA+1));
   TEMP3 = array((dxA+1)*(dzA+1));
   cleanArray64s(TEMP,(dxA+1)*(dzA+1));
   cleanArray64s(TEMP2,(dxA+1)*(dzA+1));
   cleanArray64s(TEMP3,(dxA+1)*(dzA+1));
   cleanArray64s(A,(dzA+1)*(dxA+1));

   //generate the polynomials of F (A = f_1*f_2*...*f_n)
   for(i=0;i<n;i++){
      p1 = i*(dx+1)*(dz+1);
      for(j=0;j<=dz;j++){
         p2 = j*(dx+1);
         for(k=0;k<dx;k++){
            r = rand64s(p);
            TEMP[p1+p2+k] = r;
         }
      }
      TEMP[p1+dx] = 1;
   }

   //Generate F0 (ie. F_i(z=alpha))
   for(i=0;i<n;i++){
      cleanArray64s(TEMP2,(dxA+1)*(dzA+1));
      p1 = i*(dx+1)*(dz+1);
      //move 1 of the n factors to TEMP2
      for(j=0;j<=dz;j++){
         p2 = j*(dx+1);
         for(k=0;k<=dx;k++){
            t1 = TEMP[p1+p2+k];
            TEMP2[p2+k] = t1;
         }
      }

      //Evalute it at z=alpha
      alpha2=1;
      for(j=1;j<=dz;j++){
         alpha2 = mul64s(alpha2,alpha,p);
         p2 =  j*(dx+1);
         for(k=0;k<=dx;k++){
            t1 = mul64s(alpha2,TEMP2[p2+k],p);
            TEMP2[k] = add64s(TEMP2[k],t1,p);
         }
      }

      //move results to F0
      p1 = i*(dx+1);
      for(j=0;j<=dx;j++){
         t1 = TEMP2[j];
         F0[p1+j] = t1;
      }

      //move first F to TEMP3
      for(j=0;j<=dz;j++){
         for(k=0;k<=dx;k++){
            t1 = TEMP[j*(dx+1)+k];
            TEMP3[j*(dx+1)+k] = t1;
         }
      }
   }

    for(i=1;i<n;i++){
        cleanArray64s(TEMP2,(dxA+1)*(dzA+1));
        p1 = i*(dx+1)*(dz+1); //small
        da = i*dz;//big
        db = dz;//small
        dc = da+db;//combined
        for(j=0;j<=dz;j++){
           p2 = j*(dx+1); //small
           for(k=0;k<=i*dz;k++){
               p3 = k*(i*dx+1); //big
               for(l=dc;l>=0;l--){
                  MIN = max32s(0,l-db);
                  MAX = min32s(l,da);


                  z[0] = convol64s(&TEMP3[p3], &TEMP[p1+p2],MIN,MAX,l,p);
                  TEMP2[(k+j)*((i+1)*dx+1)+l] = add64s(z[0],TEMP2[(k+j)*((i+1)*dx+1)+l],p);
               }
           }
        }

      //move current polynomial back to TEMP1
      for(j=0;j<=(i+1)*dz;j++){
         p2 = j*((i+1)*dx+1);
         for(k=0;k<=(i+1)*dx;k++){
            t1 = TEMP2[p2+k];
            TEMP3[p2+k] = t1;
         }
      }
   }
   //move to A
   for(i=0;i<=dzA;i++){
   p1 = i*(dxA+1);
      for(j=0;j<=dxA;j++){
         t1 = TEMP3[p1+j];
         A[p1+j] = t1;
      }
   }

   for(i=0;i<n;i++){
      //printf("F%d:=",i);polprintbivar64s(&(TEMP[i*(dx+1)*(dz+1)]),dx,dz,"x","y");printf(":\n\n");
   }

   free(TEMP);
   free(TEMP2);
   free(TEMP3);

   return;

}

//Expands a polynomial with dy factors (y-alpha)
void ExpandOutBivar(LONG *A,int dx, int dy, LONG alpha, LONG p){

   //local variables
   LONG alpha2,*MUL,*TEMP,t;
   int p1,p2,i,j,k;

   MUL = array(dy+1);
   TEMP = array(dy+1);

   cleanArray64s(MUL,dy+1);

   for (i=1;i<=dy;i++)
   {

      //initial iteration
      if(i==1)
      {

         //setup initial y-alpha
         alpha2 = p - alpha;
         MUL[0] = alpha2;
         MUL[1] = 1;

         //do multiplication
         p1 = i*(dx+1);
         for(j=0;j<i;j++){
            p2 = j*(dx+1);
            for(k=0;k<=dx;k++){
               t = mul64s(A[p1+k],MUL[j],p);
               t = add64s(A[p2+k],t,p);
               A[p2+k] = t;
            }
         }
         for(k=0;k<=dx;k++){
               t = mul64s(A[p1+k],MUL[i],p);
               A[p1+k] = t;
         }

      }

      else
      {
         cleanArray64s(TEMP,dy+1);
         //Expand y-alpha^i
         for(j=0;j<i;j++){
            TEMP[j+1] = MUL[j];
         }
         for(j=0;j<i;j++){
            t = mul64s(alpha2,MUL[j],p);
            t = add64s(t,TEMP[j],p);
            MUL[j] = t;
         }
         t = TEMP[i]; MUL[i] = t;

         //do multiplication
         p1 = i*(dx+1);
         for(j=0;j<i;j++){
            p2 = j*(dx+1);
            for(k=0;k<=dx;k++){
               t = mul64s(A[p1+k],MUL[j],p);
               t = add64s(A[p2+k],t,p);
               A[p2+k] = t;
            }
         }
         for(k=0;k<=dx;k++){
               t = mul64s(A[p1+k],MUL[i],p);
               A[p1+k] = t;
         }
      }
   }

   free(MUL);
   free(TEMP);

   return;
}

void ComparePolyFactorsBivar(LONG *A,LONG *F,int dx,int dz,int n,LONG alpha,LONG p){

   LONG *TEMP,*TEMP2,t1,t2;
   int i,j,k,l,m,p1,p2,p3,p4,check;

   TEMP = array(n*(dx+1)*(dz+1));
   TEMP2 = array(n*(dx+1)*(dz+1));

   cleanArray64s(TEMP,n*(dx+1)*(dz+1));
   cleanArray64s(TEMP2,n*(dx+1)*(dz+1));

   //Expand factors from (y-alpha) -> y
   for(i=0;i<n;i++){
      p1 = i*(dx+1)*(dz+1);
      for(j=0;j<=dz;j++){
         p2 = j*(dx+1);
         for(k=0;k<=dx;k++){
            t1 = F[p1 + p2 + k];
            TEMP[p2 + k] = t1;
         }
      }
      //Store them in TEMP
      ExpandOutBivar(TEMP,dx,dz,alpha,p);
      if(i<n-1){
          p1 = (n-1-i)*(dx+1)*(dz+1);
          for(j=0;j<=dz;j++){
             p2 = j*(dx+1);
             for(k=0;k<=dx;k++){
                t1 = TEMP[p2 + k];
                TEMP[p1 + p2 + k] = t1;
             }
          }
       }
   }
   //printf("AllEXP:=");polprintbivar64s(TEMP,dx,n*dz,"x","y");printf(":\n");

   //Expand the factors into 1 large polynomial
   for(i=1;i<n;i++){

      cleanArray64s(TEMP2,n*(dx+1)*(dz+1));
      //multiply factors
      p1 = i*(dx+1)*(dz+1);
      for(j=0;j<=dz;j++){
         p2 = j*(dx+1);
         for(k=0;k<=dx;k++){
            t1 = TEMP[p1+p2+k];
            if(t1 != 0){
               for(l=0;l<=dz;l++){
                  p3 = l*(dx+1);
                  for(m=0;m<=dx;m++){
                     t2 = TEMP[p3+m];
                     if(t2 != 0){
                        t2 = mul64s(t1,t2,p);
                        TEMP2[(j+l)*(dx+1)+(k+m)] = add64s(t2,TEMP2[(j+l)*(dx+1)+(k+m)],p);;
                     }
                  }
               }
            }
         }
      }

      //move current polynomial back to TEMP1
      for(j=0;j<=dz;j++){
         p2 = j*(dx+1);
         for(k=0;k<=dx;k++){
            t1 = TEMP2[p2+k];
            TEMP[p2+k] = t1;
         }
      }

      //printf("this iteration:"); polprintbivar64s(TEMP,dx,dz,"x","y");printf("\n");

   }

   //Compare Check A - mul(F) = 0
   for(j=0;j<=dz;j++){
      p1 = j*(dx+1);
      for(k=0;k<=dx;k++){
         t1 = sub64s(A[p1+k],TEMP[p1+k],p);
         TEMP[p1+k] = t1;
      }
   }
   printf("\n");
   polprintbivar64s(TEMP,dx,dz,"x","y");
   printf("\n");

   free(TEMP);
   free(TEMP2);

   return;

}

LONG DotProduct(LONG *A, LONG *B, int d, LONG p){

   int i,j,k;
   ULONG z[2];

   z[0] = 0; z[1] = 0;

   i=0;

   while(i<=d-3){
      zfma(z,A[i],B[i]);i++;
      zfma(z,A[i],B[i]);i++;
      zfma(z,A[i],B[i]);i++;
   }

   while(i<=d){
      zfma(z,A[i],B[i]);i++;
   }
   zmod(z,p);
   return z[0];

}

LONG DotProductP(LONG *A, LONG *B, int d, LONG p){

   int i,j,k;
   ULONG z[2];

   z[0] = 0; z[1] = 0;

   i=0;

   while(i<=d-2){
      zfma(z,A[i],B[i]);i++;//if(z[1]>=p){z[1] -=p;}
      zfma(z,A[i],B[i]);i++;if(z[1]>=p){z[1] -=p;}
   }

   while(i<=d){
      zfma(z,A[i],B[i]);i++;if(z[1]>=p){z[1] -=p;}
   }
   zmod(z,p);
   return z[0];

}

LONG * extendSpace(LONG *A, LONG ** H, LONG * Space, LONG d1, LONG d2, LONG k){

    LONG * B,a;
    int i;
    B = A;

    if (d1 + 1 >  *Space){
        B = *H;
        a=1;
        for(i=0;i<k;i++){a *= 2;}
        *Space = a;
        *H += a;
        for(i=0;i<=d2;i++){ B[i] = A[i]; }
        for(i=d2+1;i<*Space;i++){ B[i] = 0;}

    }

    return B;

}


ULONG fastPolyMult(LONG *poly1,LONG *poly2, LONG *T, int deg1, int deg2, LONG p){

    //local variables
    int k,m,min,max;
    ULONG z[2];

    //clean T
    //cleanArray64s(T,deg1+deg2);

    for(k=0;k<=deg1+deg2;k++){
       min = max32s(0,k-deg2);
       max = min32s(k,deg1);
       z[0] = 0; z[1] = 0;
       m = min;
       while(m<=max-3){
          zfma(z,poly1[m],poly2[k-m]);m++;
          zfma(z,poly1[m],poly2[k-m]);m++;
          zfma(z,poly1[m],poly2[k-m]);m++;
       }
       while(m<=max){
          zfma(z,poly1[m],poly2[k-m]);m++;
       }
       zmod(z,p);
       T[k] = add64s(z[0],T[k],p);
     }
}

//perform the Extended Euclidean Algorithm
int MultiEEA(LONG *U, int d, int n, LONG *M, LONG *W, LONG p){
//size of U = n*(d+1)
//size of M = ...
//size of W = 2*((n-1)*d+1) + d+1

   //local variables
   int i,j,du,dm,ds,dt,dg,dmCalc,mPos,point;
   LONG *u,*m,*mCalc,*g,*Wptr,t,alpha2;

   //clean array
   cleanArray64s(W,4*(n*d+1) + (d+1));

   //set initial array pointers
   mCalc = W;
   g = mCalc + (n-1)*d+1;

   //define initial position in the M array
   mPos = 0;
   for(i=n-1;i>1;i--){
      mPos = mPos + i*d+1;
   }

   mCalc[0] = 1;
   dmCalc = 0;

   //Main loop
   u = U + (n-1)*(d+1);
   for (i=n;i>=2;i--){

      //calculate mcalc*U_i
      dmCalc = polmul64s(mCalc,u,mCalc,dmCalc,d,p);
      //add new mcalc to M
      polcopy64s(mCalc,dmCalc,M+mPos);
      mPos = mPos - ((n-i+2)*d + 1);
      //update u pointer
      u -= d+1;
   }


   //make sure gcd(u_i,M_i) = 1 for all M
   u = g + (n-1)*d+1;
   Wptr = u + d+1;
   dmCalc = (n-1)*d;
   mPos = 0;

   for(i=0;i<n;i++){

      //move data to u,mCalc
      polcopy64s(U+i*(d+1),d,u);
      polcopy64s(M+mPos,dmCalc,mCalc);
      polgcdext64s(u,mCalc,d,dmCalc,g,0,0,&dg,&ds,&dt,Wptr,p);

      //fail state
      if(dg>0) return -1;


      dmCalc -= d;
      mPos += (n-i-1)*d+1;
   }

   return 0;

}

void generateGcdexS(LONG *U, int n, int du, LONG *M, int dx, LONG *S, int *sDeg, LONG *W, LONG p){

    //local variables
    int i,j,k,du2,dm,dg,ds,wTemp,dt,mPos;
    LONG *uTemp,*mTemp,*g,*wPtr;

    cleanArray64s(W,4*n*(du+1));

    uTemp = W;
    mTemp = uTemp + du+1;
    g = mTemp + (n-1)*du+1;
    wPtr = g + n*du+1;

    mPos = 0;

    //Calculate the sigma term and subtract it from C
    for (i=1;i<n;i++){

       //extract M_i,U_i
       polcopy64s(U+(i-1)*(du+1),du,uTemp);
       du2 = poldeg64s(uTemp,du);
       polcopy64s(M+mPos,(n-i)*du,mTemp);
       dm = poldeg64s(mTemp,(n-i)*du);
       mPos = mPos + (n-i)*du+1;

       //perform EEA to solve s*U[i] + t*M[i] = 1 for s in Z[x]
       polgcdext64s(mTemp,uTemp,dm,du2,g,S+(i-1)*(dx+1),0,&dg,&ds,&dt,wPtr,p);
       sDeg[i-1] = ds;

    }

}

void Dionvar(LONG *U, int n, int d, LONG *c, int dx, LONG *M, LONG *S, int *sDeg, LONG *W, LONG *Sigmas, LONG p, LONG *numMuls){
//Enter list of polynomials U of size (n*(d+1)), solution C of size at most (d+1), and polynomial M <- U[2]*U[3]...U[n]

//local variables
    int i,j,k,du,dm,dg,ds,dt,mPos,dck,drem,point;
    LONG t,*u,*uTemp,*m,*mTemp,*g,*s,*ck,*Wptr,*V;

    cleanArray64s(W,2*(d+1) + 4*(n*d+1) + 2*(dx+1));
    cleanArray64s(Sigmas,n*(d+1));

      for(i=0;i<n;i++){
      for(j=0;j<=d;j++){
      }
   }

    V = W;
    u = V;       V+= d+1;
    uTemp = V;   V+=  d+1;
    m = V;       V+=  n*d+1;
    mTemp = V;   V+=  n*d+1;
    s = V;       V+=  2*(dx+1);
    ck = V;      V+=  n*d+1;
    g = V;       V+= n*d+1;

    mPos = 0;

    //move c to ck
    dck = 0;
    for (j=0;j<=dx;j++){
       t = c[j];
       ck[j] = t;
       if(t != 0){
          dck = j;
       }

    }

    //Calculate the sigma term and subtract it from C
    for (i=1;i<n;i++){

       cleanArray64s(uTemp,d+1);
       cleanArray64s(mTemp,(n-1)*d+1);
       cleanArray64s(s,2*(dx+1));
       cleanArray64s(g,n*d+1);
       cleanArray64s(V,2*(dx+1));

       //extract M_i,U_i
       point = (d+1)*(i-1);
       for (j=0;j<=d;j++){
          u[j] = U[point + j];
          uTemp[j] = U[point + j];
          if(U[point + j] != 0){
             du = j;
          }
       }

       for (j=0;j<(n-i)*d+1;j++){
          m[j] = M[mPos + j];
          mTemp[j] = M[mPos + j];
           if(M[mPos + j] != 0){
              dm = j;
            }
        }
        mPos = mPos + (n-i)*d+1;

       ds = polmul64s(ck,&S[(i-1)*(dx+1)],s,dck,sDeg[i-1],p);
       *numMuls = *numMuls + (dck+1)*(sDeg[i-1]+1);

       drem = poldiv64s(s,u,ds,du,p);
       *numMuls = *numMuls + (du)*(ds-du+1);

       ds = ds - drem - d;

       for(j=0;j<=drem;j++){
          t = s[j];
          Sigmas[(i-1)*(d)+j] = t;
       }
       for(j=0;j<=ds;j++){
          t = s[drem+j+d];
          s[j] = t;
          s[drem+j+d] = 0;
       }

       //calculate c - r*M_i
       dm = polmul64s(m,&Sigmas[(i-1)*(d)],m,dm,drem,p);
       dm = polsub64s(ck, m, m, dck, dm, p);

       //c is reduced to 0, so the remaining sigmas must be 0.
       if(dm==-1){return;}

       //take quotient of c - r*M_i and U_i;
       drem = poldiv64s(m,u,dm,du,p);
       for(j=0;j<=dm-du;j++){
          t = m[du+j];
          ck[j] = t;
          if(t != 0){
             dck = j;
          }
       }
    }

    //Save final sigmas
    for (j=0;j<=dck;j++){ Sigmas[(n-1)*d+j] = ck[j]; }


    return;
}

LONG getCoeff3(LONG ** G, LONG * H, LONG * Space, LONG *degrees,  int n, int k, LONG p){

     //denote local variables
     LONG i,d1,d2,t,s;
     LONG MIN,MAX;
     ULONG Z[2];
     clock_t T1,T2;

     t = n; s = 0;
     while(t>1){
        i=0;
        while(i<t/2){
            d1 = degrees[s+2*i]; d2 = degrees[s+2*i+1];
            G[t+s+i] = extendSpace(G[t+s+i],&H,&Space[t+s+i],d1+d2,degrees[t+s+i],log2N(d1+d2)+1);
            degrees[s+t+i] = d1+d2;
            if(k <= d1+d2){
                MIN = max32s(0,k-d2); MAX = min32s(k,d1);
                G[t+s+i][k] = convol64s(G[s+2*i], G[s+2*i+1], MIN, MAX, k,p);
            }
            i++;
        }
       if(t%2==1){
           G[t+s+i] = extendSpace(G[t+s+i],&H,&Space[t+s+i],degrees[s+2*i],degrees[t+s+i],log2N(degrees[s+2*i])+1);
            degrees[s+t+i] = degrees[s+2*i];
            if(degrees[s+2*i]>=k){
               G[s+t+i][k] = G[s+2*i][k];
            }
       }
        s += t; t = ceil2(t);
     }

     if(degrees[s] >= k){return G[s][k];}
     else{ return 0;}


}


void LagInterpSetup(LONG *A,LONG *xPoints,int dx, LONG *W, LONG p){

    //local variables
    LONG t,a,b,*p1,*p2;
    int i,j,k,m,dm,point1,point2,size;
    ULONG z[2];

    size = (dx+1)/2+1;

    cleanArray64s(W,2*dx+6);
    p1 = &W[dx+2];
    p2 = &W[dx+4];

    //calculate master polynomial
    W[1] = 1;
    p1[1] = 1;
    for(i=1;i<=dx;i++){
       p1[0] = p-xPoints[i];
       polmul64s(W,p1,W,i,1,p);
    }

    //calculate Lagrange polynomials
    for(i=0;i<=dx;i++){
       //copy the master polynomial to spare space
       for(j=0;j<=dx+1;j++){p2[j] = W[j]; }
       p1[0] = p-xPoints[i];
       //perform division
       dm = poldiv64s(p2,p1,dx+1,1,p);
       //move polynomial to A
       for(j=0;j<=dx;j++){ A[i*(dx+1)+j] = p2[j+1]; }

    }

    //calculate inverse and multiply every term by it
    p1 = A;
    for(i=0;i<=dx;i++){
        //calculate denom
        a = 1;
        b = xPoints[i];
        for (j=0;j<=dx;j++){
            if (i!=j){
                a = mul64s(a,sub64s(b,xPoints[j],p),p);
            }
        }
        a = modinv64s(a,p);
        //multiply every term by a
        for(j=0;j<=dx;j++){
            if(p1[j]!=0){
                p1[j] = mul64s(p1[j],a,p);
            }
        }
        p1 = p1 + dx+1;
    }

    //transpose matrix (efficiently)
    cleanArray64s(W,2*(size)*(size-1) + 1);


    if(dx%2 ==0){

        //single entry,
        W[0] = A[0];

        //even case
        p1 = &W[1];
        //firstRow
        for(i=2,k=0;i<=dx+1;i+=2,k++){ p1[k*size] = A[i]; }
        //each column (even elements)
        for(i=2,k=1;i<=dx+1;i+=2,k++){
            for(j=2,m=0;j<=dx+1;j+=2,m++){
                p1[m*(size)+k] = A[(i-1)*(dx+1) + j];
            }
        }

        //odd
        p1 = p1 + size*(size-1);
        //firstRow
        for(i=1,k=0;i<=dx+1;i+=2,k++){ p1[k*size] = A[i]; }
        //each column (even elements)
        for(i=2,k=1;i<=dx+1;i+=2,k++){
            for(j=1,m=0;j<=dx+1;j+=2,m++){
                p1[m*(size)+k] = A[(i-1)*(dx+1) + j];
            }
        }
         //move elements back to A
         for(i=0;i<2*size*(size-1)+1;i++){ A[i] = W[i]; }
    }
    else{
       //transpose matrix A
       for(i=0;i<=dx;i++){
          for(j=i+1;j<=dx;j++){
              t = A[i*(dx+1)+j]; A[i*(dx+1)+j] = A[j*(dx+1)+i]; A[j*(dx+1)+i] = t;
          }
       }
    }

    return;

}

//perform Lagrangian interpolation. The Lagrangian polynomials are stored in LagPolys
//and the interpolation points are stored in yPoints.
//The resulting polynomial is stored in Delta
//The size of W is dx+1
void LagInterpEval(LONG *LagPolys,LONG *yPoints,LONG *Delta, int dx, LONG *W, LONG p, LONG *nummuls){
//Lagpolys - holds the Lagrangian polynomials used for lagrange interpolation - matrix has size (dx+1)^2
//yPoints - array of length dx+1 - contains the y points for Lagrange interpolation (the x points are implied to be 0,1,-1,2,-2,...
//Delta - the interpolated polynomial of degree at most dx
//dx - degree of the main polynomial A in x
//W - additional space needed - size = dx+1
//p - prime

    //local variables
    int i,j,k,size,point;
    LONG *p1,*V;
    ULONG z[2];

    if(dx%2==0){

        size = (dx+1)/2 + 1;
        V = W;
        cleanArray64s(V,size);
        V[0] = yPoints[0];

        //solve first solution (alpha = 0)
        Delta[0] = mul64s(LagPolys[0],yPoints[0],p);
        *nummuls += 1;

        //*******************************************************************
        //solve Lagrange interpolation (even case)
        //*******************************************************************

        //build vector
        for(i=1,j=1;i<size-1;i++,j+=2){
            V[i] = add64s(yPoints[j],yPoints[j+1],p);
        }
        if(dx%2==1){V[size-1] = yPoints[dx];}
        else{V[size-1] = add64s(yPoints[dx-1],yPoints[dx],p);}

        //calculate Delta (even Case)
        p1 = &LagPolys[1];
        for(i=0,j=2;i<size-1;i++,j+=2){
           if(p < 1LL << 50) Delta[j] = DotProduct(p1,V,size-1,p);
           else Delta[j] = DotProductP(p1,V,size-1,p);
           p1 = p1+size;
        }

        //*******************************************************************
        //solve Lagrange interpolation (odd case)
        //*******************************************************************

        //build vector
        for(i=1,j=1;i<size-1;i++,j+=2){
            V[i] = sub64s(yPoints[j],yPoints[j+1],p);
        }
        if(dx%2==1){V[size-1] = yPoints[dx];}
        else{V[size-1] = sub64s(yPoints[dx-1],yPoints[dx],p);}

        //calculate Delta (odd Case)
        for(i=0,j=1;i<size-1;i++,j+=2){
            if(p < 1LL << 50) Delta[j] = DotProduct(p1,V,size-1,p);
            else Delta[j] = DotProductP(p1,V,size-1,p);
           p1 = p1+size;
        }
    }


    //when dx is odd
    else{
        Delta[0] = yPoints[0];
        for(i=1;i<=dx;i++){
           if(p < 1LL << 50) Delta[i] = DotProduct(&LagPolys[i*(dx+1)],yPoints,dx,p);
           else DotProductP(&LagPolys[i*(dx+1)],yPoints,dx,p);
        }
    }

    return;
}

void altDiv2(LONG *A,int dx, int dz, LONG alpha, LONG* W, LONG p){

   //local variables
   int i,j,k,b,blockSize,blocks;
   LONG x0,I,t,*p1,*pj,*pA,*pA2,*pW,*pW2;
   clock_t T1,T2,s1,s2,s3;

   s1=0;s2=0;s3=0;

   //Calculate A*x_0^j
   x0 = 1;
   blockSize = 32;
   blocks = (dx + blockSize)/blockSize;
   b = dx+1 - (blocks-1)*blockSize;
   cleanArray64s(W,(dz+1)*blockSize);

   //iterative case
   T1 = clock();
   pj = &A[dx+1];
   for(i=1;i<=dz;i++){
      x0 = mul64s(x0,alpha,p);
      for (j=0;j<=dx;j++){
         pj[j] = mul64s(pj[j],x0,p);
      }
      pj = pj + dx+1;
   }
   T2 = clock();
   //printf("Mult1=%8.2fms\n",(T2-T1)/1000.0/1.0);


   //if there is one block of code
   if(blocks==1){
      for(k=0;k<=dz-1;k++){
         pA = &A[(dz-1)*(dx+1)]; pA2 = pA + dx+1;
         for(j=dz-1;j>=k;j--){
            poladdFast(pA,pA2,pA,dx,p);
            pA2 -= (dx+1); pA -= (dx+1);
         }
      }
   }
   //if there are multiple blocks of code
   else{

       //For each main block of data, compress Data, add it, then return it
       for(i=1;i<blocks;i++){

          //Move data
          T1 = clock();
          pW = W; pA = &A[(i-1)*blockSize];
          for(j=0;j<=dz;j++){
             for(k=0;k<blockSize;k++){
                pW[k] = pA[k];
             }
             pW += blockSize; pA += dx+1;
          }
          T2 = clock();
          s1 =s1+ T2-T1;

          //Main addition Loop
          T1 = clock();
          for(k=0;k<=dz-1;k++){
            pW = &W[(dz-1)*blockSize]; pW2 = pW + blockSize;
             for(j=dz-1;j>=k;j--){
                poladdFast(pW,pW2,pW,blockSize-1,p);
                pW2 = pW; pW -= blockSize;
             }
          }
          T2 = clock();
          s2 = s2+ T2-T1;

          T1 = clock();
          pW = W; pA = &A[(i-1)*blockSize];
          for(j=0;j<=dz;j++){
             for(k=0;k<blockSize;k++){
                 pA[k] = pW[k];
             }
             pW += blockSize; pA += dx+1;
          }
          T2 = clock();
          s3 = s3 + T2-T1;

       }

       //Final, mis-sized block
       if(b>0){

           T1 = clock();
           pW = W; pA = &A[(blocks-1)*blockSize];
           for(j=0;j<=dz;j++){
              for(k=0;k<b;k++){
                 pW[k] = pA[k];
              }
              pW += b; pA += dx+1;
           }
           T2 = clock();
           s1 = s1 + T2-T1;

           T1 = clock();
           for(k=0;k<=dz-1;k++){
              pW = &W[(dz-1)*b];
              pW2 = pW + b;
              for(j=dz-1;j>=k;j--){
                poladdFast(pW,pW2,pW,b-1,p);
                pW2 = pW; pW -= b;
              }
           }
           T2 = clock();
           s2 = s2 + T2-T1;

           T1 = clock();
           pW = W; pA = &A[(blocks-1)*blockSize];
           for(j=0;j<=dz;j++){
              for(k=0;k<b;k++){
                 pA[k] = pW[k];
              }
              pW += b; pA += dx+1;
           }
           T2 = clock();
           s3 = s3 + T2-T1;

        }
   }


   //multiply by inverse
   T1 = clock();
   x0 = modinv64s(alpha,p);
   I = x0;
   pj = &A[dx+1];
   for(i=1;i<=dz;i++){
      for(j=0;j<=dx;j++){
         pj[j] = mul64s(pj[j],I,p);
      }
      pj = pj + dx+1;
      I = mul64s(I,x0,p);
   }
   T2 = clock();

   /*printf("Shift 1 =%8.2fms\n",(s1)/1000.0/1.0);
   printf("Addition =%8.2fms\n",(s2)/1000.0/1.0);
   printf("Shift 2=%8.2fms\n",(s3)/1000.0/1.0);
   printf("Mult 2=%8.2fms\n",(T2-T1)/1000.0/1.0);
*/


   return;

}

//Evaluates polynomials at x = 0,+-1,+-2,+-3,...
void FastEval(LONG **G, LONG **H, LONG* Space, LONG* degrees, LONG *F, LONG *EvalPoints, int iter, int n, int numPolys, int *degsX, int *degsY, int du, int dx, int dz, LONG p){

   int i,j,k,m,ndz,dxdz;
   LONG *p1,*p2,e,o,t;
   LONG *shift1, *shift2;
   ULONG z1[2],z2[2];

   ndz = n*(dz+1);
   dxdz = (dx+1)*(dz+1);

   //deal with easy case (x=0, store just constants)
   p1 = &F[iter*(dx+1)];
   for(i=0;i<n;i++){

        if(*p1 != 0){
            G[i] = extendSpace(G[i],H,&Space[i],iter,degrees[i],log2N(iter)+1);
            degrees[i] = iter;
            G[i][iter] = *p1;
        }
      p1 = p1 + dxdz;
   }

   //preform evaluation if using a smaller prime
   p2 = EvalPoints;
   i=1;
   while(i<=dx/2){
   //for(i=1;i<=dx/2;i++){
      //pointers
      //p1 = polynomials
      //p2 = x = 0,+-1,...
      p1 = F + iter*(dx+1);

      for(j=0;j<n;j++){

         if(degsY[j] == iter){

             //extend space
             G[(2*i-1)*(numPolys)+j] = extendSpace(G[(2*i-1)*(numPolys)+j],H,&Space[(2*i-1)*numPolys+j],iter,degrees[(2*i-1)*numPolys+j],log2N(iter)+1);
             G[(2*i)*(numPolys)+j] = extendSpace(G[(2*i)*(numPolys)+j],H,&Space[(2*i)*numPolys+j],iter,degrees[(2*i)*numPolys+j],log2N(iter)+1);

             degrees[(2*i-1)*(numPolys)+j] = iter;
             degrees[(2*i)*(numPolys)+j] = iter;

             z1[0] = p1[0]; z1[1] = 0;
             z2[0] = p1[1]; z2[1] = 0;

             if( p < 1LL << 50 ) { // if p < 2^50
                 for(m=1,k=2;k<=degsX[j];k+=2,m++){
                    zfma(z1,p1[k],p2[m]);
                    zfma(z2,p1[k+1],p2[m]);
                 }
             }
             else{
                for(m=1,k=2;k<=degsX[j];k+=2,m++){
                    zfma(z1,p1[k],p2[m]); if( z1[1]>=p ) z1[1] -= p;
                    zfma(z2,p1[k+1],p2[m]); if( z2[1]>=p ) z2[1] -= p;
                 }
             }

             zmod(z1,p); e = z1[0];
             zmod(z2,p); o = mul64s(z2[0],i,p);

             G[(2*i-1)*(numPolys)+j][iter] = add64s(e,o,p);
             G[(2*i)*(numPolys)+j][iter] = sub64s(e,o,p);
         }
         p1 = p1 + dxdz;


      }
      p2 = p2 + du+1;
      i++;
   }

   //odd case (if dx%2==1)
   if(dx%2==1){

       p1 = F + iter*(dx+1);
       for(j=0;j<n;j++){
          if(degsY[j] == iter){
             G[(dx)*(numPolys)+j] = extendSpace(G[(dx)*(numPolys)+j],H,&Space[(dx)*numPolys+j],iter,degrees[(dx)*numPolys+j],log2N(iter)+1);
             degrees[(dx)*(numPolys)+j] = iter;
             z1[0] = p1[0]; z1[1] = 0;
             z2[0] = p1[1]; z2[1] = 0;

             if( p < 1LL << 50 ) { // if p < 2^50
                 for(m=1,k=2;k<=degsX[j];k+=2,m++){
                    zfma(z1,p1[k],p2[m]);
                    zfma(z2,p1[k+1],p2[m]);
                 }
             }
             else{
                for(m=1,k=2;k<=degsX[j];k+=2,m++){
                    zfma(z1,p1[k],p2[m]); if( z1[1]>=p ) z1[1] -= p;
                    zfma(z2,p1[k+1],p2[m]); if( z2[1]>=p ) z2[1] -= p;
                 }
             }

             zmod(z1,p); e = z1[0];
             zmod(z2,p); o = mul64s(z2[0],i,p);

              G[dx*(numPolys)+j][iter] = add64s(e,o,p);

         }
         p1 = p1 + dxdz;
       }
   }

   return;
}


//This preforms hensel lifting on n factors to factor polynomial A
int HenselLiftCubic(LONG *A, int dx, int dz, LONG *F0, int n, int du, LONG *F, LONG alpha, LONG *TempSpace, LONG p ){
/*
A[x][z]- polynomial you wish to factor
dx - degree bound on variable x of A
dz - degree bound on variable z of A
F0 - set of linear factors
n - number of initial factors
du - degree bound of initial factors (could be extended to an array later)
F - array that will store the final polynomials. must be of size n*(dx+1)*(dz+1)
W - working array for calculations. Must be at least size (tbd)
alpha - LONG integer
p - prime
*/

   //local variables
   int i,j,k,l,m,maxSizeA,ndz,flag, *maxDeg, *sDeg, *curXDeg, sumDeg, InterpFlag, numPolys;
   LONG s,t,t2,z,*M,*E,*p1,*p2,*p3,*p4,*p5,*TEMP1,*Coeffs, *LagInterpPolys;
   LONG *DioSVal, numDioMuls,numCoeffExtractMul,numEvalMuls,numInterpMuls;
   LONG *H, *Space, *degrees, *ck,*evalPoints,*interpPoints,*Delta,*EvalPointsMul, *W, wTemp;
   LONG temp,temp2,temp3;
   LONG ** P1;
   recint P;
   clock_t T1,T2,T3,T4,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15;

   //set time to 0
   s1=0;s2=0;s3=0;s4=0;s5=0;s6=0;s7=0;s8=0;s9=0;s10=0;s11=0;s12=0;s13=0;s14=0;s15=0;

   //initial size calculations
   maxSizeA = (dx+1)*(dz+1);
   ndz = n*(dz+1);
   numEvalMuls = 0;
   numDioMuls = 0;
   numCoeffExtractMul = 0;
   numInterpMuls = 0;
   InterpFlag = 0;

   P = recip1(p);
   //find the number of polynomial multiples we create
   numPolys = 1; t = n;
   while (t>1){
    numPolys +=  t;
    t = ceil2(t);
   }

   //declare arrays
   W = TempSpace;
   maxDeg = arrayI(n); //this is in y
   sDeg = arrayI(n);  //Degree of each s variable (for Diophantine Equation)
   curXDeg = arrayI(n);  //Degree of x for each factor (efficiency)
   M = W;                          W += (n-1)*n*du/2 + n - 1; //M polynomials (diophantine)
   E = W;                          W += maxSizeA;             //The error at each iteration
   TEMP1 = W;                      W += n*(du+1);             //Temporary storage
   evalPoints = W;                 W += dx+1;
   interpPoints = W;               W += dx+1;
   LagInterpPolys = W;             W += (dx+1)*(dx+1);
   Delta = W;                      W += dx+1;
   EvalPointsMul = W;              W += (du+1)*((dx/2)+2);
   DioSVal = W;                    W += (n-1)*(dx+1);
   Space = W;                      W += (dx+1)*(numPolys); // < (dx+1)*3*n
   degrees = W;                    W += (dx+1)*(numPolys); // < (dx+1)*3*n
   H = W;                          W += 4*(dx+1)*(log2N(n)+1)*(dz+n);

    LONG *G[(dx+1)*numPolys];

    cleanArray64s(H,4*(dx+1)*(log2N(n)+1));

   //move contents of F0 to F
   //Calculate exact degree bound of factors
   p1 = F;
   p2 = F0;


   //Initialize matrix G, Space, and degrees
   for(i=0;i<(dx+1)*numPolys;i++){
           G[i] = H;
           Space[i] = 1;
           degrees[i] = 0;
           H += 1;
   }

   for(i=0;i<n;i++){
      curXDeg[i] = -1;
      for(j=0;j<=du;j++){
         if(p2[j] != 0){
            curXDeg[i]=j;
            p1[j] = p2[j];
         }

      }
      p1 = p1 + maxSizeA;
      p2 = p2 + du+1;
   }

   //declare maxDeg, the size of each n factors
   for(i=0;i<n;i++){
      maxDeg[i] = 0;
   }

   //set evalPoints
   evalPoints[0] = 0;
   p1 = evalPoints+1;
   for(i=1;i<=dx/2;i++){
      p1[0] = i;
      p1[1] = p-i;
      p1 +=2;
   }
   if(dx%2 == 1){
      evalPoints[dx] = i;
   }

   //Generate Lagrange Polynomials for Interpolation in main loop
   T1 = clock();
   LagInterpSetup(LagInterpPolys,evalPoints,dx,W,p);
   s7 = clock() - T1;

   //calculate evaluation point constants
   p1 = EvalPointsMul;
   for(i=1;i<=dx/2;i++){
      t = 1;
      p1[0] = t;
      t2 = i*i;
      for(j=1;j<=du;j++){
         t = mul64s(t,t2,p);
         p1[j] = t;
      }
      p1 = p1 + du+1;
   }
   if(dx%2==1){
       t = 1;
      p1[0] = t;
      t2 = i*i;
      for(j=1;j<=du;j++){
         t = mul64s(t,t2,p);
         p1[j] = t;
      }
   }

   //Recover multiple of U for future calls to diophantine equation
   j = MultiEEA(F0,du,n,M,W,p);

   //if the EEA failed, return 'FAIL'(add last)
   if(j==-1){
      printf("The input factors are not coprime");
      return -1;
   }

   //Generate the S values from Gcdex
   generateGcdexS(F0,n,du,M,dx,DioSVal,sDeg,W,p);

   // declare initial Error
   for(i=0;i<maxSizeA;i++){
      E[i] = A[i];
   }
   //Change it to its Taylor Representation E = a_0 + a_1(y-alpha) + ...
   T1 = clock();
   if(alpha != 0){
      altDiv2(E,dx,dz,alpha,W,p);
   }
   s9 = clock()-T1;

   //printf("E:=");polprintbivar64s(E,dx,dz,"x","(y-5)");printf(":\n");

   //Preform initial polynomial evaluations
   T1 = clock();
   FastEval(G,&H, Space, degrees, F,EvalPointsMul, 0, n, numPolys, curXDeg, maxDeg,  du, dx, dz, p);
   s1 = clock()-T1;

   //initial CoeffCalc calculations
   T1 = clock();
   for(i=0;i<=dx;i++){
             //calculate coefficient for (y-alpha)^k
             interpPoints[i] = getCoeff3(&G[i*numPolys],H,&Space[i*numPolys],&degrees[i*numPolys],n,0,p);
    }
   s2 = clock()-T1;


   //main FOR loop
   T3 = clock();
   ck = E;
   for(k=1;k<=dz;k++){

        //printf("\n\nIteration: k = %d\n\n",k);

      //get the error for this loop
      ck += dx+1;

         //Preform coefficient extraction
         T1 = clock();

          for(i=0;i<=dx;i++){
             //calculate coefficient for (y-alpha)^k
             interpPoints[i] = getCoeff3(&G[i*numPolys],H,&Space[i*numPolys],&degrees[i*numPolys],n,k,p);
          }
          T2 = clock();
          s2 = s2 + T2 - T1;

      //Interpolate
      T1 = clock();
      LagInterpEval(LagInterpPolys,interpPoints,Delta,dx, W, p, &numInterpMuls);
      T2 = clock();
      s3 = s3 + T2 - T1;

      //subtract delta from e
      temp = polsub64s(ck, Delta, ck, dx, dx, p);

      //check to make sure ck isn't zero
      flag = 0;
      for (i=0;i<=dx;i++){
        if (ck[i] != 0){
            flag = 1;
            break;
        }
      }

      //add the sum of all the degrees
      sumDeg = 0;
      for (i=0;i<n;i++){
         sumDeg += maxDeg[i];
      }

      //check fail case
      if(flag == 1 && sumDeg == dz){
         printf("Error: c_k != 0 when the sum of degrees is equal to d\n\n");
         return -1;
      }

      //if c_k isn't equal to zero, solve diophantine equation and preform updates/evaluations
      else if(flag == 1){

          //Solve Diophantine Equation
          cleanArray64s(TEMP1,n*(du+1));
           T1 = clock();
          Dionvar(F0, n, du, ck, dx, M, DioSVal,sDeg, W, TEMP1, p, &numDioMuls);
          T2 = clock();
          s4 = s4 + T2 - T1;

          //Update F
          //Update degree of X counter for factors
          p1 = &F[k*(dx+1)];
          p2 = TEMP1;
          InterpFlag = 1;
          for(i=0;i<n;i++){
             curXDeg[i]=-1;
             for(j=0;j<du;j++){
                if(p2[j] != 0){
                   InterpFlag = 0;
                   curXDeg[i]=j;
                   maxDeg[i] = k;
                   p1[j] = p2[j];
                }
             }
             p1 = p1 + maxSizeA;
             p2 = p2 + du;
          }

          //Calculate evaluation points
          T1 = clock();
          FastEval(G,&H, Space, degrees, F,EvalPointsMul, k, n, numPolys, curXDeg, maxDeg, du, dx, dz, p);
          T2 = clock();
          s1 = s1 + T2 - T1;

          //Update CoeffCients
          T1 = clock();

          for(i=0;i<=dx;i++){
            P1 = &G[i*numPolys];
            p2 = &Space[i*numPolys];
            p3 = &degrees[i*numPolys];
            s = 0; t = n;
            cleanArray64s(TEMP1, n);
            while(t>1){
                j=0;
                while(j<t/2){
                    if(s==0){
                        TEMP1[j] = 0;
                        if(p3[2*j]==k){TEMP1[j] = mul64s(P1[2*j][k],P1[2*j+1][0],p); }
                        if(p3[2*j+1]==k){TEMP1[j] = add64s(TEMP1[j],mul64s(P1[2*j+1][k],P1[2*j][0],p),p); }
                    }
                    else{TEMP1[j] = add64s(mul64s(P1[s+2*j+1][0],TEMP1[2*j],p),mul64s(P1[s+2*j][0],TEMP1[2*j+1],p),p);}

                   P1[t+s+j] = extendSpace(P1[t+s+j],&H,&p2[t+s+j],p3[s+2*j]+p3[s+2*j+1],p3[t+s+j],log2N(p3[s+2*j]+p3[s+2*j+1])+1);

                   p3[s+t+j] = p3[s+2*j] + p3[s+2*j+1];
                   P1[s+t+j][k] = add64s(P1[s+t+j][k],TEMP1[j],p);
                   j++;
                }

                if(t%2==1){
                    TEMP1[j] = P1[s+2*j][k];
                    P1[t+s+j] = extendSpace(P1[t+s+j],&H,&p2[t+s+j],p3[s+2*j],p3[t+s+j],log2N(p3[s+2*j])+1);
                    p3[s+t+j] = p3[s+2*j];
                    P1[t+s+j][k] = TEMP1[j];
                }
                s += t; t = ceil2(t);
            }

      }

      T2 = clock();
      s5 = s5 + T2 - T1;

      }

   }

   T4 = clock();

   //check if sum of degrees = dzvpn.its.sfu.ca
   if(sumDeg < dz){
      printf("Error: sum of degrees not equal to %d\n",dz);
      return -1;
   }

   //print statements
   printf("Change of Base time=%8.2fms\n",(s9)/1000.0/1.0);
   printf("Evaluation time=%8.2fms\n",(s1)/1000.0/1.0);
   printf("Coefficient Extraction time=%8.2fms\n",(s2)/1000.0/1.0);
   printf("Interpolation Setup time=%8.2fms\n",(s7)/1000.0/1.0);
   printf("Interpolation Calc time=%8.2fms\n",(s3)/1000.0/1.0);
   printf("Coefficient Update=%8.2fms\n",(s5)/1000.0/1.0);
   printf("Coefficient Shift time=%8.2fms\n",(s6)/1000.0/1.0);
   printf("Total Interpolation time=%8.2fms\n",(s3+s7)/1000.0/1.0);
   printf("Diophantine time=%8.2fms\n",(s4)/1000.0/1.0);
   /*printf("The number of Evaluation Multiplications is %lld\n",numEvalMuls);
   printf("The number of Diophantine Multiplications is %lld\n",numDioMuls);
   printf("The number of Interpolation Multiplications is %lld\n",numInterpMuls);
   printf("The number of Coefficient Extraction Multiplications is %lld\n",numCoeffExtractMul);*/
   //printf("\n");


   free(maxDeg); free(sDeg); free(curXDeg);

   return 0;

}


int HenselLiftQuartic(LONG *A, int dx, int dz, LONG *F0, int n, int du, LONG *F, LONG alpha, LONG *W, LONG p ){
/*
A[x][z]- polynomial you wish to factor
dx - degree bound on variable x of A
dz - degree bound on variable z of A
F0 - set of linear factors
n - number of initial factors
du - degree bound of initial factors (could be extended to an array later)
F - array that will store the final polynomials. must be of size n*(dx+1)*(dz+1)
W - working array for calculations. Must be at least size (tbd)
alpha - LONG integer
p - prime (duh)
*/

   //local variables
   int i,j,k,l,m,q,flag,mSize,mCounter,maxSizeA,*maxDeg,temp,temp2,temp3,point1,point2,point3,point4,MIN,MAX,MIN2,MAX2,accDeg,curDeg,*sDeg,*facDegs,*coeffDegs;
   LONG t,t2,*M,*M2,*E,*Prod,*TEMP1,*TEMP2,*Coeffs,*CoeffCalc,*ck,*evalPoints,*interpPoints,*Delta,*DioSVal,numDioMuls,*poly1,*poly2,*poly3,*poly4;
   clock_t T1,T2,T3,T4,T5,T6,s1,s2,s3,s4,s5,s6,s7,s8,s9;
   ULONG z[2];

   //set time to 0
   s1=0;s2=0;s3=0;s4=0;s5=0;s6=0;s7=0;s8=0;s9=0;

   //initial size calculations
   maxSizeA = (dx+1)*(dz+1);
   mSize = ((n-1)*n)/2*du + n - 1;
   mCounter = 0;
   numDioMuls = 0;

   //declare arrays
   maxDeg = arrayI(n);
   sDeg = arrayI(n);
   facDegs = arrayI((n)*(dz+1));
   coeffDegs = arrayI(n*(dz+1));
   M = array(mSize);
   M2 = array(du*(n-1)+1);
   E = array(maxSizeA);
   Prod = array(maxSizeA);
   TEMP1 = array(maxSizeA);
   TEMP2 = array(dx+1);
   ck = array(dx+1);
   evalPoints = array(dx+1);
   interpPoints = array(dx+1);
   Delta = array(dx+1);
   CoeffCalc = array((dx+1)*(n-2)*(dz+1));
   DioSVal = array((n-1)*(dx+1));


   cleanArray64s(M,mSize);
   cleanArray64s(TEMP1,maxSizeA);
   cleanArray64s(TEMP2,dx+1);
   cleanArray64s(CoeffCalc,(dx+1)*(n-2)*(dz+1));
   cleanArray64s(DioSVal,(n-1)*(dx+1));
   for(i=0;i<n*(dz+1);i++){facDegs[i]=0;coeffDegs[i]=0;}

   //move contents of F0 to F
   for(i=0;i<n;i++){
      point1 = i*(dx+1)*(dz+1);
      point2 = i*(du+1);
      facDegs[i*(dz+1)] = -1;
      for(j=0;j<=du;j++){
         t = F0[point2 + j];
         if(t != 0){
            facDegs[i*(dz+1)] = j;
            F[point1 + j] = t;
         }
      }
   }

   //declare maxDeg, the size of each n factors
   for(i=0;i<n;i++){
      maxDeg[i] = 0;
   }

   //set evalPoints
   for(i=0;i<=dx;i++){
      evalPoints[i] = i;
   }

   //over multiple of U for future calls to diophantine equation
   T1 = clock();
   //Recover multiple of U for future calls to diophantine equation
   j = MultiEEA(F0,du,n,M,W,p);

   //if the EEA failed, return 'FAIL'(add last)
   if(j==-1){
      printf("The input factors are not coprime");
      return -1;
   }
   T2 = clock();
   //if the EEA failed, return 'FAIL'(add last)

   // declare initial Error

   for(i=0;i<maxSizeA;i++){
      E[i] = A[i];
   }
   T1 = clock();
   altDiv2(E,dx,dz,alpha,W,p);
   T2 = clock();
   //printf("Division time=%8.2fms\n",(T2 - T1)/1000.0/1.0);

   //Generate the S values from Gcdex
   generateGcdexS(F0,n,du,M,dx,DioSVal,sDeg,W,p);

   //save initial CoeffCalc multiplications
   poly1 = &F[(n-1)*(dz+1)*(dx+1)];
   poly2 = &F[(n-2)*(dz+1)*(dx+1)];
   coeffDegs[(n-2)*(dz+1)] = facDegs[(n-1)*(dz+1)] + facDegs[(n-2)*(dz+1)];
   fastPolyMult(poly1,poly2,&CoeffCalc[(n-3)*(dx+1)*(dz+1)],facDegs[(n-1)*(dz+1)],facDegs[(n-2)*(dz+1)],p);
   for(j=n-3;j>=1;j--){
       poly1 = &F[(j)*(dz+1)*(dx+1)];
       poly2 = &CoeffCalc[(j)*(dx+1)*(dz+1)];
       coeffDegs[(j)*(dz+1)] = facDegs[(j)*(dz+1)] + coeffDegs[(j+1)*(dz+1)];
       fastPolyMult(poly1,poly2,&CoeffCalc[(j-1)*(dx+1)*(dz+1)],facDegs[(j)*(dz+1)],coeffDegs[(j+1)*(dz+1)],p);
   }


   //main FOR loop
   T3 = clock();
   for(k=1;k<=dz;k++){

      //get the error for this loop
      point1 = k*(dx+1);
      ck = &(E[point1]);

      //calculate deltacoeffDegs[(j+1)*(dz+1)+(i-q)]
      T1 = clock();
      for(i=k;i<=k;i++){

          //set degree totals
          accDeg = maxDeg[n-1];

          for (j=n-2; j>=1; j--){

              curDeg = accDeg;
              accDeg = accDeg + maxDeg[j];

              //make sure i need to do current calculations
              if(i <= accDeg){

                  //first 2 polynomials multiplied
                  if (j==n-2) {

                     MIN = max32s(0,i-maxDeg[j]);
                     MAX = min32s(i,maxDeg[j]);

                     point1 = (n-1)*(dx+1)*(dz+1);
                     point2 = (n-2)*(dx+1)*(dz+1);

                     //refers to the degree of (y-alpha) for each factor
                     for (q=MIN;q<=MAX;q++){
                        //multiply the 2 polynomials together (classical)

                        point3 = q*(dx+1);
                        point4 = (i-q)*(dx+1);
                        poly2 = &F[point1+point3];
                        poly1 = &F[point2+point4];
                        coeffDegs[(n-2)*(dz+1)+i] = max32s(coeffDegs[(n-2)*(dz+1)+i],facDegs[(n-1)*(dz+1)+(i-q)]+facDegs[(n-2)*(dz+1)+q]);

                        //multiply quickly
                        fastPolyMult(poly2,poly1,&CoeffCalc[(j-1)*(dx+1)*(dz+1) + i*(dx+1)],facDegs[(n-1)*(dz+1)+q],facDegs[(n-2)*(dz+1)+(i-q)],p);

                     }

                 }
                 //continued polynomial multiplication up to degree d
                 else{

                    MIN = max32s(0,i-curDeg);
                    MAX = min32s(i,maxDeg[j]);

                     point1 = (j)*(dx+1)*(dz+1); //this draws from F
                     point2 = (j-1)*(dx+1)*(dz+1); //this draws from CoeffCalc

                     //refers to the degree of (y-alpha) for each factor
                     for (q=MIN;q<=MAX;q++){
                        //multiply the 2 polynomials together (classical)

                        point3 = q*(dx+1);
                        point4 = (i-q)*(dx+1);
                        poly2 = &F[point1+point3];
                        poly1 = &CoeffCalc[point1 + point4];
                        coeffDegs[(j)*(dz+1)+i] = max32s(coeffDegs[(j)*(dz+1)+i],coeffDegs[(j+1)*(dz+1)+(i-q)]+facDegs[(j)*(dz+1)+q]);

                        //multiply quickly
                        fastPolyMult(poly2,poly1,&CoeffCalc[(j-1)*(dx+1)*(dz+1) + i*(dx+1)],facDegs[(j)*(dz+1)+q],coeffDegs[(j+1)*(dz+1)+(i-q)],p);
                     }

                 }
             }
          }
      }

      //zero Delta array
      cleanArray64s(Delta,dx+1);

      curDeg = accDeg;
      accDeg = accDeg + maxDeg[0];

      MIN = max32s(0,k-accDeg);
      MAX = min32s(k,curDeg);

      //Calculate final coefficients (Delta)
      for(m=MIN;m<=MAX;m++){
          poly1 = &F[m*(dx+1)];
          poly2 = &CoeffCalc[(k-m)*(dx+1)];

          fastPolyMult(poly1,poly2,Delta,facDegs[m],coeffDegs[(dz+1) + (k-m)],p);
      }
      T2 = clock();
      s1 = s1 + T2 - T1;

      //subtract delta from e
      T1 = clock();
      temp = polsub64s(ck, Delta, ck, dx, dx, p);
      T2 = clock();
      s5 = s5 + T2 - T1;
      //printf("ck:=");polprint64s(ck,dx);printf("\n");

      cleanArray64s(TEMP1,maxSizeA);

      //check to make sure ck isn't zero
      flag = 0;
      for (i=0;i<=dx;i++){
        if (ck[i] != 0){
            flag = 1;
            break;
        }
      }

      if(flag == 1) {
          //Solve Diophantine Equation
          T1 = clock();
          Dionvar(F0, n, du, ck, dx, M, DioSVal,sDeg, W, TEMP1, p, &numDioMuls);
          T2 = clock();
          s4 = s4 + T2 - T1;

          //printf("Del:=");polprint64s(TEMP1,maxSizeA);printf("\n");

          //Update F
          point2 = k*(dx+1);
          for(i=0;i<n;i++){
             point1 = i*(dx+1)*(dz+1);
             facDegs[i*(dz+1)+k] = -1;
             for(j=0;j<du;j++){
                if(TEMP1[i*du+j] != 0){
                   maxDeg[i] = k;
                   facDegs[i*(dz+1)+k] = j;
                   F[point1 + point2 + j] = TEMP1[i*du+j];
                }
             }
          }

          //Update CoeffCalc
          //Queue up first polynomial

            cleanArray64s(TEMP1,dx);
            poly1 = &F[(n-1)*(dx+1)*(dz+1)];
            poly2 = &F[(n-2)*(dx+1)*(dz+1)];
            fastPolyMult(poly1,&poly2[k*(dx+1)],TEMP1,facDegs[(n-1)*(dz+1)],facDegs[(n-1)*(dz+1)+k],p);
            fastPolyMult(&poly1[k*(dx+1)],poly2,TEMP1,facDegs[(n-1)*(dz+1)+k],facDegs[(n-1)*(dz+1)],p);
            accDeg = max32s(facDegs[(n-1)*(dz+1)]+facDegs[(n-1)*(dz+1)+k],facDegs[(n-1)*(dz+1)+k]+facDegs[(n-1)*(dz+1)]);
            coeffDegs[(n-2)*(dz+1)+k] = max32s(accDeg,coeffDegs[(n-2)*(dz+1)+k]);

            poly1 = &CoeffCalc[(n-3)*(dx+1)*(dz+1) + k*(dx+1)];
            for(i=0;i<=accDeg;i++){
                poly1[i] = add64s(poly1[i],TEMP1[i],p);
            }

            //specific co-efficients
            for(i=n-3;i>=1;i--){
                poly1 = &F[(i)*(dx+1)*(dz+1)];
                poly2 = &F[(i)*(dx+1)*(dz+1) + k*(dx+1)];
                poly3 = &CoeffCalc[(i)*(dx+1)*(dz+1)];
                poly4 = &CoeffCalc[(i-1)*(dx+1)*(dz+1) + k*(dx+1)];

                cleanArray64s(TEMP2,dx+1);
                fastPolyMult(poly1,TEMP1,TEMP2,facDegs[(i)*(dz+1)],accDeg,p);
                fastPolyMult(poly2,poly3,TEMP2,facDegs[(i)*(dz+1)+k],coeffDegs[(i+1)*(dz+1)],p);
                accDeg = max32s(accDeg + facDegs[(i)*(dz+1)],facDegs[(i)*(dz+1)+k]+coeffDegs[(i+1)*(dz+1)]);
                coeffDegs[(i)*(dz+1)+k] =  max32s(accDeg,coeffDegs[(i)*(dz+1)+k]);

                poly1 = &CoeffCalc[(i-1)*(dx+1)*(dz+1) + k*(dx+1)];
                for(j=0;j<=accDeg;j++){
                    TEMP1[j] = TEMP2[j];
                    poly1[j] = add64s(poly1[j],TEMP2[j],p);
                }
            }

      }
   }
   T4 = clock();

   //print statements
    /*
   printf("Polynomial Multiplication time=%8.2fms\n",(s1)/1000.0/1.0);
   printf("Diophantine time=%8.2fms\n",(s4)/1000.0/1.0);
   printf("Sub time=%8.2fms\n",(s5)/1000.0/1.0);
   printf("Total Loop time=%8.2fms\n",(T4-T3)/1000.0/1.0);
    */
   return 0;

}