[Ecm-commits] r1571 - in trunk/gpu: . gpu_ecm_cc13

cvs commits ecm-commits at lists.gforge.inria.fr
Wed Apr 13 10:46:46 CEST 2011


Author: bouvierc
Date: 2011-04-13 10:46:45 +0200 (Wed, 13 Apr 2011)
New Revision: 1571

Added:
   trunk/gpu/gpu_ecm_cc13/
   trunk/gpu/gpu_ecm_cc13/Makefile
   trunk/gpu/gpu_ecm_cc13/cudaarith.cu
   trunk/gpu/gpu_ecm_cc13/cudaarith.h
   trunk/gpu/gpu_ecm_cc13/cudautils.cu
   trunk/gpu/gpu_ecm_cc13/main.cu
   trunk/gpu/gpu_ecm_cc13/obj/
   trunk/gpu/gpu_ecm_cc13/test.sh
   trunk/gpu/gpu_ecm_cc13/utils.h
Log:
Implementation of ECM for NVIDIA GPUs of compute capability 1.3.
./test.sh provides an example.

Added: trunk/gpu/gpu_ecm_cc13/Makefile
===================================================================
--- trunk/gpu/gpu_ecm_cc13/Makefile	                        (rev 0)
+++ trunk/gpu/gpu_ecm_cc13/Makefile	2011-04-13 08:46:45 UTC (rev 1571)
@@ -0,0 +1,30 @@
+BIN=gpu_ecm
+
+GMP=/usr
+
+LDFLAGS=$(GMP)/lib/libgmp.a -lm
+CFLAGS=-O2 -I$(GMP)/include
+NVCCFLAGS=-m64 --compiler-options -fno-strict-aliasing --opencc-options -O2 --ptxas-options=-v
+
+COMPUTECAPABILITY=-arch=sm_13
+CFLAGS+=-DCC13
+
+
+$(BIN): obj/main.cu.o obj/cudautils.cu.o obj/cudaarith.cu.o
+	gcc -fPIC -m64 -o $@ $^ $(LDFLAGS) -L/usr/local/cuda/lib64 -lcudart
+
+obj/main.cu.o : main.cu cudautils.cu utils.h 
+	nvcc $(CFLAGS) $(COMPUTECAPABILITY) $(NVCCFLAGS) -o $@ -c $<
+
+obj/cudautils.cu.o : cudautils.cu cudaarith.cu utils.h 
+	nvcc $(CFLAGS) $(COMPUTECAPABILITY) $(NVCCFLAGS) -o $@ -c $<
+
+obj/cudaarith.cu.o : cudaarith.cu utils.h
+	nvcc $(CFLAGS) $(COMPUTECAPABILITY) $(NVCCFLAGS) -o $@ -c $<
+
+clean: 
+	rm obj/*.cu.o
+	
+mrproper: clean
+	rm $(BIN)
+

Added: trunk/gpu/gpu_ecm_cc13/cudaarith.cu
===================================================================
--- trunk/gpu/gpu_ecm_cc13/cudaarith.cu	                        (rev 0)
+++ trunk/gpu/gpu_ecm_cc13/cudaarith.cu	2011-04-13 08:46:45 UTC (rev 1571)
@@ -0,0 +1,1464 @@
+#include "cudaarith.h"
+
+//specific functions for compute capability 1.3
+//#ifdef CC13
+//Compare A and B: returns 1 if A > B, -1 if A < B, 0 if A = B.
+//Assumes A and B are normalized (no pending carry or borrow).
+__device__ int Cuda_Cmp(const biguint_t A, const biguint_t B)
+{
+	unsigned int i = SIZE_NUMBER;
+	while (i-- > 0)
+	{
+		if (A[i] > B[i])
+			return 1;
+		if (A[i] < B[i])
+			return -1;
+	}
+	return 0;
+}
+//#endif
+/*
+//specific functions for compute capability 2.0
+#ifdef CC20
+//  (A > B)?, returns 1(true), -1(false) or 0(a=b) 
+//Assume A and B are normalize (no carry or borrow)
+__device__ int Cuda_Is_Eq(const biguint_t A, const biguint_t B)
+{
+	return A[threadIdx.x]==B[threadIdx.x];
+}
+__device__ int Cuda_Is_Gt(const biguint_t A, const biguint_t B)
+{
+	return A[threadIdx.x]>B[threadIdx.x];
+}
+__device__ int Cuda_Cmp(const biguint_t A, const biguint_t B)
+{
+	unsigned int ballot=__ballot(Cuda_Is_Eq(A,B));
+	if (ballot==0)
+		return 0;
+	else
+		if (__ballot(Cuda_Is_Gt(A,B))>__ballot(Cuda_Is_Gt(B,A)))
+			return 1;
+		else
+			return -1;
+}
+#endif
+*/
+
+//Compare A (with carry-out cy from its top limb) against N:
+//returns -1 if A < N, 0 if A = N, 1 if A > N
+__device__ int Cuda_ge_N(const biguint_t A, const int cy)
+{
+	unsigned int i = SIZE_NUMBER;
+	if (cy > 0)
+		return 1;
+	while (i-- > 0)
+	{
+		if (A[i] > Ncst[i])
+			return 1;
+		if (A[i] < Ncst[i])
+			return -1;
+	}
+	return 0;
+}
+
+//r[i], cy[i] <- r[i] + b for each limb i (one limb per thread);
+//the comparison r < b after the addition detects 32-bit wrap-around
+__device__ void Cuda_Add1 (biguint_t r, dbigint_t cy ,const unsigned int b)
+{
+	r[threadIdx.x]+=b;
+	cy[threadIdx.x]+=(r[threadIdx.x]<b);
+}
+
+//r[i], cy[i] <- r[i] - b for each limb i (one limb per thread)
+__device__ void Cuda_Sub1 (biguint_t r, dbigint_t cy, const unsigned int b)
+{
+	cy[threadIdx.x]-=(r[threadIdx.x] < b);
+	r[threadIdx.x] -= b;
+}
+
+//Returns 0 if this thread's limb needs no further carry propagation
+//(the carry out of the top limb is handled separately by the callers)
+__device__ int Cuda_Is_Normalize(dbigint_t cy)
+{
+	if (threadIdx.x==SIZE_NUMBER-1 || cy[threadIdx.x]==0)
+		return 0;
+	else
+		return 1;
+}
+
+__device__ void Cuda_Normalize(biguint_t A,dbigint_t cy)
+{
+	int oldcy;
+	if (threadIdx.x==0)
+		oldcy = 0;
+	else
+	{
+		oldcy = cy[threadIdx.x-1];
+		cy[threadIdx.x-1]=0;
+	}
+
+	if (oldcy>=0)
+		Cuda_Add1(A,cy,oldcy);
+	else // oldcy < 0
+		Cuda_Sub1(A,cy,-oldcy);
+	
+}
+
+__device__ void Cuda_Fully_Normalize(biguint_t A,dbigint_t cy)
+{
+	do
+	{
+	Cuda_Normalize(A,cy);
+	}while(__any(Cuda_Is_Normalize(cy))!=0);
+}
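+//Worked example of the carry-save scheme above (illustrative, taking
+//SIZE_NUMBER=2): adding b=1 to A={0xffffffff,0} with Cuda_Add1 gives
+//A={0,0} and cy={1,0}; one Cuda_Normalize pass moves cy[0] into limb 1,
+//giving A={0,1} and cy={0,0}, so __any() reports no pending carry and
+//Cuda_Fully_Normalize stops.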
+
+
+__device__ void Cuda_Add (biguint_t r, dbigint_t cy ,const biguint_t b)
+{
+	r[threadIdx.x]+=b[threadIdx.x];
+	cy[threadIdx.x]+=(r[threadIdx.x]<b[threadIdx.x]);
+}
+
+//Assume r and b are different; r and a can be the same.
+__device__ void Cuda_Add 
+(biguint_t r, dbigint_t cy ,const biguint_t a, const biguint_t b)
+{
+	r[threadIdx.x]=a[threadIdx.x];
+	r[threadIdx.x]+=b[threadIdx.x];
+	cy[threadIdx.x]+=(r[threadIdx.x]<b[threadIdx.x]);
+}
+
+__device__ void Cuda_Sub (biguint_t r, dbigint_t cy, const biguint_t b)
+{
+	cy[threadIdx.x]-=(r[threadIdx.x] < b[threadIdx.x]);
+	r[threadIdx.x] -= b[threadIdx.x];
+}
+
+//Assume r and b are different; r and a can be the same.
+__device__ void Cuda_Sub 
+(biguint_t r, dbigint_t cy, const biguint_t a, const biguint_t b)
+{
+	r[threadIdx.x]=a[threadIdx.x];
+	cy[threadIdx.x]-=(r[threadIdx.x] < b[threadIdx.x]);
+	r[threadIdx.x] -= b[threadIdx.x];
+}
+
+//Compute Rmod <- A + B [mod]
+__device__ void Cuda_Add_mod
+(biguint_t Rmod, dbigint_t cy, const biguint_t A, const biguint_t B, const biguint_t mod)
+{
+	Cuda_Add(Rmod,cy,A,B);
+	Cuda_Fully_Normalize(Rmod,cy);
+	//if (Cuda_Cmp (Rmod, mod) >= 0 || cy[SIZE_NUMBER-1]>0) // (Rmod >= mod)?
+	if (Cuda_ge_N (Rmod, cy[SIZE_NUMBER-1]) >= 0) // (Rmod >= mod)?
+	{
+		Cuda_Sub (Rmod, cy, mod); // Rmod <- Rmod - mod
+		Cuda_Fully_Normalize(Rmod,cy);
+	}
+}
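+//Example (illustrative, small single-limb values): for mod=7, A=5, B=4
+//the addition gives Rmod=9; Cuda_ge_N reports Rmod >= mod, so mod is
+//subtracted once, leaving Rmod = 2 = (5+4) mod 7.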
+
+//Compute Rmod <- Rmod + A [mod]
+__device__ void Cuda_Add_mod
+(biguint_t Rmod, dbigint_t cy, const biguint_t A, const biguint_t mod)
+{
+	Cuda_Add(Rmod,cy,A);
+	Cuda_Fully_Normalize(Rmod,cy);
+	//if (Cuda_Cmp (Rmod, mod) >= 0 || cy[SIZE_NUMBER-1]>0) // (Rmod >= mod)?
+	if (Cuda_ge_N (Rmod, cy[SIZE_NUMBER-1]) >= 0) // (Rmod >= mod)?
+	{
+		Cuda_Sub (Rmod, cy, mod);
+		Cuda_Fully_Normalize(Rmod,cy);
+	}
+}
+
+//Compute Rmod <- A - B [mod]
+__device__ void Cuda_Sub_mod
+(biguint_t Rmod, dbigint_t cy, const biguint_t A, const biguint_t B, const biguint_t mod)
+{
+	Cuda_Sub(Rmod,cy,A,B);
+	Cuda_Fully_Normalize(Rmod,cy);
+
+	if (cy[SIZE_NUMBER-1] < 0) // borrow out: the result is negative
+	{
+		Cuda_Add (Rmod, cy, mod); // add mod back to land in [0,mod)
+		Cuda_Fully_Normalize(Rmod,cy);
+	}
+}
+
+//Compute Rmod <- Rmod - A [mod]
+__device__ void Cuda_Sub_mod
+(biguint_t Rmod, dbigint_t cy, const biguint_t A, const biguint_t mod)
+{
+	Cuda_Sub(Rmod,cy,A);
+	Cuda_Fully_Normalize(Rmod,cy);
+
+	if (cy[SIZE_NUMBER-1] < 0) // borrow out
+	{
+		Cuda_Add (Rmod, cy, mod);
+		Cuda_Fully_Normalize(Rmod,cy);
+	}
+}
+
+//  Return h, l such that h*2^32 + l = A*B 
+__device__ void Cuda_Mul_uint (unsigned int *h, unsigned int *l, const unsigned int A,const unsigned int B)
+{
+	//if (A==B)
+	//	Cuda_Square_uint(h,l,A);
+	//else
+	//{	
+		unsigned int m1,m2,ah,al,bh,bl;
+		al=A&0x0000ffff;
+		bl=B&0x0000ffff;
+		ah=A>>16;
+		bh=B>>16;
+	
+		*h=ah * bh;
+		m1=ah * bl;
+		m2=al * bh;
+		*l=al * bl;
+
+		//al, ah, bl, bh used as temporary variables
+		al=(m1&0x0000ffff)<<16;
+		bl=(m2&0x0000ffff)<<16;
+		ah=m1>>16;
+		bh=m2>>16;
+
+
+		*h+=(ah+bh);
+	
+		*l+=bl;
+		*h+=(*l<bl);
+	
+		*l+=al;
+		*h+=(*l<al);
+	//}
+}
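+/* Illustrative host-side cross-check of the 16-bit split above (kept
+   in a comment; assumes a 64-bit unsigned long long on the host):
+static int check_mul_uint (unsigned int A, unsigned int B,
+                           unsigned int h, unsigned int l)
+{
+	unsigned long long p = (unsigned long long) A * B;
+	return ((((unsigned long long) h) << 32) | l) == p;
+}
+*/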
+
+//  Return h, l such that h*2^32 + l = A*A 
+__device__ void Cuda_Square_uint (unsigned int *h, unsigned int *l, const unsigned int A)
+{
+	unsigned int m1,ah,al;
+	al=A&0x0000ffff;
+	ah=A>>16;
+	
+	*h=ah * ah;
+	m1=ah * al;
+	*l=al * al;
+
+	//al,ah used as temporary variables
+	al=(m1&0x0000ffff)<<16;
+	ah=m1>>16;
+
+
+	*h+=(ah+ah);
+	
+	*l+=al;
+	*h+=(*l<al);
+	*l+=al;
+	*h+=(*l<al);
+}
+
+#ifdef TEST
+
+//  Return cy,h, l such that cy*2^64+h*2^32 + l = A*(B1+B2) 
+__device__ void Cuda_AddMul_uint (int *cy, unsigned int *h, unsigned int *l, const unsigned int A,const unsigned int B1, const unsigned int B2)
+{
+	unsigned int sum=B1+B2;
+	unsigned int c = (sum < B2);
+	Cuda_Mul_uint(h,l,A,sum);
+
+	c*=A;
+	*h+=c;
+	*cy=(*h<c);
+}
+
+//  Return cy, h, l such that cy*2^64 + h*2^32 + l = A*(B1-B2)
+__device__ void Cuda_SubMul_uint (int *cy, unsigned int *h, unsigned int *l, const unsigned int A,const unsigned int B1, const unsigned int B2)
+{
+	unsigned int diff=B1-B2;
+	int c = (B1 < B2);
+	Cuda_Mul_uint(h,l,A,diff);
+
+	c*=A;
+	//if B1 < B2, A*2^32 must be subtracted, with a negative borrow into cy
+	*cy=-(*h<(unsigned int)c);
+	*h-=c;
+}
+
+//  Return cy, h, l such that cy*2^64 + h*2^32 + l = (A1-A2)*(B1-B2)
+__device__ void Cuda_SubMul2_uint (int *cy, unsigned int *h, unsigned int *l, const unsigned int A1, const unsigned int A2, const unsigned int B1, const unsigned int B2)
+{
+	unsigned int diffa=A1-A2;
+	unsigned int diffb=B1-B2;
+	int ca = (A1 < A2);
+	int cb = (B1 < B2);
+	Cuda_Mul_uint(h,l,diffa,diffb);
+
+	*cy=cb*ca;
+
+	ca*=diffb;
+	cb*=diffa;
+	*cy-=(*h<ca);
+	*h-=ca;
+	*cy-=(*h<cb);
+	*h-=cb;
+}
+
+//Compute R (with cy) <- R + A1^2*2^(32*SIZE_NUMBER) + A2^2
+//where A = A1*2^(16*SIZE_NUMBER) + A2
+__device__ void Cuda_Square_halves
+(dbiguint_t R, dbigint_t cy, const biguint_t A)
+{
+	int i;
+	unsigned int h,l;
+	unsigned int dec;
+	if (threadIdx.x<SIZE_NUMBER/2)
+		dec=0;
+	else
+		dec=SIZE_NUMBER/2;
+	
+	for (i=0;i<SIZE_NUMBER/2;i++)
+	{
+		Cuda_Mul_uint(&h,&l,A[threadIdx.x],A[dec+i]);
+		//Cuda_Add1(R+dec+i,cy+dec+i,l);
+		//Cuda_Add1(R+dec+i+1,cy+dec+i+1,h);
+		R[i+threadIdx.x+dec] +=l;
+		cy[i+threadIdx.x+dec]+=(R[i+threadIdx.x+dec] < l);
+		
+		R[i+1+threadIdx.x+dec] +=h;
+		cy[i+1+threadIdx.x+dec]+=(R[i+1+threadIdx.x+dec]<h);
+	}
+}
+
+//Compute R (with cy) <- R + A1*B1*2^(32*SIZE_NUMBER)
+//                         + (A1*B1+A2*B2)*2^(16*SIZE_NUMBER) + A2*B2
+//where A = A1*2^(16*SIZE_NUMBER) + A2 and B = B1*2^(16*SIZE_NUMBER) + B2.
+//The missing Karatsuba middle term -(A1-A2)*(B1-B2)*2^(16*SIZE_NUMBER)
+//would be added separately (see the disabled Cuda_SubMul below).
+__device__ void Cuda_Mul_halves
+(dbiguint_t R, dbigint_t cy, const biguint_t A, const biguint_t B)
+{
+	int i;
+	unsigned int h,l;
+	unsigned int dec;
+	if (threadIdx.x<SIZE_NUMBER/2)
+		dec=0;
+	else
+		dec=SIZE_NUMBER/2;
+	
+	for (i=0;i<SIZE_NUMBER/2;i++)
+	{
+		Cuda_Mul_uint(&h,&l,A[threadIdx.x],B[dec+i]);
+		if (threadIdx.x<SIZE_NUMBER/2)
+		{
+		R[i+threadIdx.x] +=l;
+		cy[i+threadIdx.x]+=(R[i+threadIdx.x] < l);
+		
+		R[i+1+threadIdx.x] +=h;
+		cy[i+1+threadIdx.x]+=(R[i+1+threadIdx.x]<h);
+		
+		R[i+threadIdx.x+SIZE_NUMBER/2] +=l;
+		cy[i+threadIdx.x+SIZE_NUMBER/2]+=(R[i+threadIdx.x+SIZE_NUMBER/2] < l);
+		
+		R[i+1+threadIdx.x+SIZE_NUMBER/2] +=h;
+		cy[i+1+threadIdx.x+SIZE_NUMBER/2]+=(R[i+1+threadIdx.x+SIZE_NUMBER/2]<h);
+		
+		}
+		else
+		{
+		//Not in the same order, to avoid writing to the same address at the same time.
+		//No need to add SIZE_NUMBER/2 because threadIdx.x >= SIZE_NUMBER/2.
+		R[i+threadIdx.x] +=l;
+		cy[i+threadIdx.x]+=(R[i+threadIdx.x] < l);
+		
+		R[i+1+threadIdx.x] +=h;
+		cy[i+1+threadIdx.x]+=(R[i+1+threadIdx.x]<h);
+
+		R[i+threadIdx.x+dec] +=l;
+		cy[i+threadIdx.x+dec]+=(R[i+threadIdx.x+dec] < l);
+		
+		R[i+1+threadIdx.x+dec] +=h;
+		cy[i+1+threadIdx.x+dec]+=(R[i+1+threadIdx.x+dec]<h);
+		}
+	}
+}
+
+/*
+//Compute R(with cy) <- R-(A1-A2)*(B1-B2)*2^512 
+//where A = A1*2^512+A2  and B = B1*2^512+A2
+__device__ void Cuda_SubMul
+(dbiguint_t R, dbigint_t cy, const biguint_t A, const biguint_t B)
+{
+	int i;
+	unsigned int h2,h,l;
+	unsigned int dec;
+	
+	for (i=0;i<SIZE_NUMBER/2;i++)
+	{
+		Cuda_SubMul_uint(&h2,&h,&l,A[threadIdx.x],B[i],B[i+SIZE_NUMBER/2]);
+		
+		R[i+threadIdx.x+SIZE_NUMBER/2] +=h;
+		cy[i+1+threadIdx.x]+=(R[i+1+threadIdx.x]<h);
+		}
+	}
+	
+}
+*/
+
+
+//Compute R(with cy) <- R+ 2*A*B where A and B are SIZE_NUMBER/2-long
+__device__ void Cuda_DblMul_half
+(dbiguint_t R, dbigint_t cy, const biguint_t A,const biguint_t B)
+{
+	int i;
+	unsigned int h,l;
+	unsigned int hh,ll;
+	int cy1,cy2;
+	
+	unsigned int idx=threadIdx.x%(SIZE_NUMBER/2);
+	unsigned int idi=threadIdx.x/(SIZE_NUMBER/2);
+	
+	for (i=0;i<SIZE_NUMBER/2;i++)
+	{
+		Cuda_Mul_uint(&h,&l,A[idx],B[idi]);
+		ll=2*l;
+		cy1=(ll<l);
+		hh=2*h;
+		cy2=(hh<h);
+		if (threadIdx.x<SIZE_NUMBER/2)
+		{
+			R[idi+idx] +=ll;
+			cy[idi+idx]+=cy1+(R[idi+idx] < ll);
+		
+			R[idi+1+idx] +=hh;
+			cy[idi+1+idx]+=cy2+(R[idi+1+idx] < hh);
+		}
+		if (threadIdx.x>=SIZE_NUMBER/2)
+		{
+			R[idi+idx] +=ll;
+			cy[idi+idx]+=cy1+(R[idi+idx] < ll);
+		
+			R[idi+1+idx] +=hh;
+			cy[idi+1+idx]+=cy2+(R[idi+1+idx] < hh);
+		}
+		idi+=2;
+	}
+}
+
+
+__device__ void Cuda_Mul 
+(dbiguint_t R, dbigint_t cy, const biguint_t A, const biguint_t B)
+{
+	Cuda_Mul_halves(R,cy,A,B);
+	//Cuda_SubMul(R+SIZE_NUMBER/2,cy+SIZE_NUMBER/2,A,B);
+
+	//Normalize : but R and cy are 2 * SIZE_NUMBER long
+	Cuda_Fully_Normalize(R,cy);
+	Cuda_Fully_Normalize(R+SIZE_NUMBER-1,cy+SIZE_NUMBER-1);
+	if (threadIdx.x==0)
+		{
+		R[2*SIZE_NUMBER-1]+=cy[2*SIZE_NUMBER-2];
+		cy[2*SIZE_NUMBER-2]=0;//leave cy clean so that it can be reused
+		}
+}
+
+
+
+__device__ void Cuda_Square (dbiguint_t R, dbigint_t cy, const biguint_t A)
+{
+	Cuda_Square_halves(R,cy,A);
+	Cuda_DblMul_half(R+SIZE_NUMBER/2,cy+SIZE_NUMBER/2,A,A+SIZE_NUMBER/2);
+	//Normalize : but R and cy are 2 * SIZE_NUMBER long
+	Cuda_Fully_Normalize(R,cy);
+	Cuda_Fully_Normalize(R+SIZE_NUMBER-1,cy+SIZE_NUMBER-1);
+	if (threadIdx.x==0)
+		{
+		R[2*SIZE_NUMBER-1]+=cy[2*SIZE_NUMBER-2];
+		cy[2*SIZE_NUMBER-2]=0;//leave cy clean so that it can be reused
+		}
+}
+
+
+#endif
+
+#ifndef TEST
+__device__ void Cuda_Mul
+(dbiguint_t R, dbigint_t cy, const biguint_t A,const biguint_t B)
+{
+	int i;
+	unsigned int h,l;
+	
+	for (i=0;i<SIZE_NUMBER;i++)
+	{
+		//h*2^32 + l = A[threadIdx.x]*B[i]
+		Cuda_Mul_uint(&h,&l,A[threadIdx.x],B[i]);
+		
+		Cuda_Add1(R+i,cy+i,l);
+		Cuda_Add1(R+i+1,cy+i+1,h);
+		//R[i+threadIdx.x] +=l;
+		//cy[i+threadIdx.x]+=(R[i+threadIdx.x] < l);
+		
+		//R[i+1+threadIdx.x] +=h;
+		//cy[i+1+threadIdx.x]+=(R[i+1+threadIdx.x]<h);
+	}
+
+	//Normalize : but R and cy are 2 * SIZE_NUMBER long
+	Cuda_Fully_Normalize(R,cy);
+	Cuda_Fully_Normalize(R+SIZE_NUMBER-1,cy+SIZE_NUMBER-1);
+	if (threadIdx.x==0)
+		{
+		R[2*SIZE_NUMBER-1]+=cy[2*SIZE_NUMBER-2];
+		cy[2*SIZE_NUMBER-2]=0;//leave cy clean so that it can be reused
+		}
+}
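+//Layout of the schoolbook product above: thread t owns A[t]; pass i of
+//the loop adds A[t]*B[i] into column t+i (low word) and column t+i+1
+//(high word), so after SIZE_NUMBER passes R and cy hold the complete
+//2*SIZE_NUMBER-limb product in carry-save form.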
+
+__device__ void Cuda_Square (dbiguint_t R, dbigint_t cy, const biguint_t A)
+{
+	int i;
+	unsigned int h,l;
+		
+	Cuda_Square_uint(&h,&l,A[threadIdx.x]);
+
+	//Cuda_Add1(R+threadIdx.x,cy+threadIdx.x,l);
+	//Cuda_Add1(R+threadIdx.x+1,cy+threadIdx.x+1,h);
+	R[2*threadIdx.x] += l;
+	cy[2*threadIdx.x]+=(R[2*threadIdx.x] < l);
+	
+	R[2*threadIdx.x+1] += h;
+	cy[2*threadIdx.x+1]+=(R[2*threadIdx.x+1]<h);
+	
+	for (i=0;i<threadIdx.x;i++)
+	{
+		Cuda_Mul_uint(&h,&l,A[threadIdx.x],A[i]);
+		
+		//each cross product A[threadIdx.x]*A[i] is accumulated twice
+		//(the factor 2 of the square)
+		Cuda_Add1(R+i,cy+i,l);
+		Cuda_Add1(R+i,cy+i,l);
+		Cuda_Add1(R+i+1,cy+i+1,h);
+		Cuda_Add1(R+i+1,cy+i+1,h);
+		//R[i+threadIdx.x] += l;
+		//cy[i+threadIdx.x]+=(R[i+threadIdx.x] < l);
+		//R[i+threadIdx.x] += l;
+		//cy[i+threadIdx.x]+=(R[i+threadIdx.x] < l);
+		
+		//R[i+1+threadIdx.x] +=h;
+		//cy[i+1+threadIdx.x]+=(R[i+1+threadIdx.x]<h);
+		//R[i+1+threadIdx.x] +=h;
+		//cy[i+1+threadIdx.x]+=(R[i+1+threadIdx.x]<h);
+	}
+
+	//Normalize : but R and cy are 2 * SIZE_NUMBER long
+	Cuda_Fully_Normalize(R,cy);
+	Cuda_Fully_Normalize(R+SIZE_NUMBER-1,cy+SIZE_NUMBER-1);
+	if (threadIdx.x==0)
+		{
+		R[2*SIZE_NUMBER-1]+=cy[2*SIZE_NUMBER-2];
+		cy[2*SIZE_NUMBER-2]=0;//leave cy clean so that it can be reused
+		}
+}
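+//Identity behind the squaring above:
+//  A^2 = sum_i A[i]^2*2^(64*i) + 2*sum_{i<t} A[i]*A[t]*2^(32*(i+t)),
+//hence the diagonal terms go once into columns 2t and 2t+1 while each
+//cross product is accumulated twice.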
+#endif 
+
+
+__device__ void Cuda_RedMontgomery (biguint_t mul, dbigint_t cy, const biguint_t mod, dbiguint_t r, dbiguint_t temp)
+{
+	//temp = (r mod 2^(32*SIZE_NUMBER)) * mod^-1 (mod^-1 is precomputed)
+	Cuda_Mul(temp,cy,r,invmodcst);//for r, only the part mod R matters: OK
+	
+	//mul = temp (mod 2^(32*SIZE_NUMBER)) 
+	mul[threadIdx.x]=temp[threadIdx.x];
+	temp[threadIdx.x]=0;
+	temp[threadIdx.x+SIZE_NUMBER]=0;
+	
+	//temp=mul*m
+	Cuda_Mul(temp,cy,mul,mod);
+	
+	//r=r+temp // r and temp2 are 2*SIZE_NUMBER long
+	Cuda_Add(r,cy,temp);
+	Cuda_Add(r+SIZE_NUMBER,cy+SIZE_NUMBER,temp+SIZE_NUMBER);
+	//Normalize : but R and cy are 2 * SIZE_NUMBER long
+	Cuda_Fully_Normalize(r,cy);
+	Cuda_Fully_Normalize(r+SIZE_NUMBER-1,cy+SIZE_NUMBER-1);
+	if (threadIdx.x==0)
+		{
+		r[2*SIZE_NUMBER-1]+=cy[2*SIZE_NUMBER-2];
+		cy[2*SIZE_NUMBER-2]=0;//leave cy clean so that it can be reused
+		}
+
+	//return r/ 2^(32*SIZE_NUMBER)
+	mul[threadIdx.x]=r[threadIdx.x+SIZE_NUMBER];
+
+  //if (Cuda_Cmp (mul, mod) >= 0) // (mul >= mod)? 
+  if (Cuda_ge_N (mul,0) >= 0) // (mul >= mod)? 
+	{
+  	Cuda_Sub (mul, cy, mod); 
+		Cuda_Fully_Normalize(mul,cy);	
+	}
+}
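+//The steps above are the classical Montgomery reduction (REDC): with
+//R = 2^(32*SIZE_NUMBER), and assuming invmodcst holds -mod^-1 mod R
+//(precomputed on the host), m = (r*invmodcst) mod R makes r + m*mod
+//divisible by R, and (r + m*mod)/R is congruent to r*R^-1 mod mod,
+//at most one conditional subtraction away from [0,mod).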
+
+
+//Assumes A and B are in Montgomery representation.
+//Computes mul = A * B * 2^-(32*SIZE_NUMBER) mod mod.
+//r and temp have size 2*SIZE_NUMBER.
+__device__ void Cuda_Mul_mod (biguint_t mul, dbigint_t cy, const biguint_t A,const biguint_t B, const biguint_t mod, dbiguint_t r, dbiguint_t temp)
+{
+	temp[threadIdx.x]=0;
+	temp[threadIdx.x+SIZE_NUMBER]=0;
+	r[threadIdx.x]=0;
+	r[threadIdx.x+SIZE_NUMBER]=0;
+	
+	//__syncthreads();
+	//r=a*b
+	Cuda_Mul(r,cy,A,B);
+	Cuda_RedMontgomery (mul, cy, mod, r, temp);
+}
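+//Montgomery-form reminder: if A = a*R mod N and B = b*R mod N with
+//R = 2^(32*SIZE_NUMBER), the result above is a*b*R mod N, so products
+//stay in Montgomery form and chains of Cuda_Mul_mod/Cuda_Square_mod
+//need no conversions in between.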
+
+
+//Assumes A is in Montgomery representation.
+//Computes mul = A * A * 2^-(32*SIZE_NUMBER) mod mod.
+//r and temp have size 2*SIZE_NUMBER.
+__device__ void Cuda_Square_mod (biguint_t mul, dbigint_t cy, const biguint_t A, const biguint_t mod, dbiguint_t r, dbiguint_t temp)
+{
+	temp[threadIdx.x]=0;
+	temp[threadIdx.x+SIZE_NUMBER]=0;
+	r[threadIdx.x]=0;
+	r[threadIdx.x+SIZE_NUMBER]=0;
+	
+	//__syncthreads();
+	//r = A^2
+	Cuda_Square(r,cy,A);
+	Cuda_RedMontgomery (mul, cy, mod, r, temp);
+}
+
+__device__ void Cuda_Ell_Dbl(biguint_t x2p, biguint_t z2p, const biguint_t xp, const biguint_t zp, const biguint_t N, biguint_t temp_u, biguint_t temp_v, dbiguint_t temp_r, dbiguint_t temp_r2, dbigint_t cy)
+{
+	//u<-xp+zp mod N
+	Cuda_Add_mod(temp_u,cy,xp,zp,N);
+	//u <- u^2
+	Cuda_Square_mod(temp_u,cy,temp_u,N,temp_r,temp_r2);
+	//v<-xp-zp mod N
+	Cuda_Sub_mod(temp_v,cy,xp,zp,N);
+	//v <- v^2
+	Cuda_Square_mod(temp_v,cy,temp_v,N,temp_r,temp_r2);
+	//x2p=u-v mod N    x2p is used as a temporary variable here
+	Cuda_Sub_mod(x2p,cy,temp_u,temp_v,N);
+	//z2p<-x2p*d mod N z2p is used as a temporary variable here
+	Cuda_Mul_mod(z2p,cy,x2p,dcst[blockIdx.x],N,temp_r,temp_r2);
+	//z2p<- z2p+v mod N
+	Cuda_Add_mod(z2p,cy,temp_v,N);
+	//z2p <- x2p*z2p
+	Cuda_Mul_mod(z2p,cy,x2p,z2p,N,temp_r,temp_r2);
+	//x2p <- u*v
+	Cuda_Mul_mod(x2p,cy,temp_u,temp_v,N,temp_r,temp_r2);
+}
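+//The sequence above is the standard Montgomery-curve doubling:
+//  u = (xp+zp)^2, v = (xp-zp)^2, t = u-v,
+//  x2p = u*v, z2p = t*(v + d*t),
+//assuming dcst[blockIdx.x] holds the per-curve constant d = (A+2)/4
+//of the curve B*y^2 = x^3 + A*x^2 + x (set up on the host).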
+
+__device__ void Cuda_Ell_Add(biguint_t xplus, biguint_t zplus, const biguint_t xp, const biguint_t zp, const biguint_t xq, const biguint_t zq, const biguint_t xminus, const biguint_t zminus, const biguint_t N, biguint_t temp_u, biguint_t temp_v, biguint_t temp_w, dbiguint_t temp_r, dbiguint_t temp_r2, dbigint_t cy)
+{
+	unsigned int tmp;
+	//u<-xp+zp mod N
+	Cuda_Add_mod(temp_u,cy,xp,zp,N);


