[Ecm-commits] r1573 - trunk/gpu/gpu_ecm_cc13

cvs commits ecm-commits at lists.gforge.inria.fr
Wed Apr 20 17:40:57 CEST 2011


Author: bouvierc
Date: 2011-04-20 17:40:57 +0200 (Wed, 20 Apr 2011)
New Revision: 1573

Modified:
   trunk/gpu/gpu_ecm_cc13/Makefile
   trunk/gpu/gpu_ecm_cc13/cudaarith.cu
   trunk/gpu/gpu_ecm_cc13/cudaarith.h
   trunk/gpu/gpu_ecm_cc13/cudautils.cu
   trunk/gpu/gpu_ecm_cc13/main.cu
   trunk/gpu/gpu_ecm_cc13/test.sh
   trunk/gpu/gpu_ecm_cc13/utils.h
Log:
Some improvements (especially on multiplication). 30% gain in time. 


Modified: trunk/gpu/gpu_ecm_cc13/Makefile
===================================================================
--- trunk/gpu/gpu_ecm_cc13/Makefile	2011-04-14 13:33:22 UTC (rev 1572)
+++ trunk/gpu/gpu_ecm_cc13/Makefile	2011-04-20 15:40:57 UTC (rev 1573)
@@ -13,15 +13,18 @@
 $(BIN): obj/main.cu.o obj/cudautils.cu.o obj/cudaarith.cu.o
 	gcc -fPIC -m64 -o $@ $^ $(LDFLAGS) -L/usr/local/cuda/lib64 -lcudart
 
-obj/main.cu.o : main.cu cudautils.cu utils.h 
+obj/main.cu.o : main.cu cudautils.cu utils.h objdir 
 	nvcc $(CFLAGS) $(COMPUTECAPABILITY) $(NVCCFLAGS) -o $@ -c $<
 
-obj/cudautils.cu.o : cudautils.cu cudaarith.cu utils.h 
+obj/cudautils.cu.o : cudautils.cu cudaarith.cu utils.h objdir
 	nvcc $(CFLAGS) $(COMPUTECAPABILITY) $(NVCCFLAGS) -o $@ -c $<
 
-obj/cudaarith.cu.o : cudaarith.cu utils.h
+obj/cudaarith.cu.o : cudaarith.cu utils.h objdir
 	nvcc $(CFLAGS) $(COMPUTECAPABILITY) $(NVCCFLAGS) -o $@ -c $<
 
+objdir:
+	mkdir -p obj
+
 clean: 
 	rm obj/*.cu.o
 	

Modified: trunk/gpu/gpu_ecm_cc13/cudaarith.cu
===================================================================
--- trunk/gpu/gpu_ecm_cc13/cudaarith.cu	2011-04-14 13:33:22 UTC (rev 1572)
+++ trunk/gpu/gpu_ecm_cc13/cudaarith.cu	2011-04-20 15:40:57 UTC (rev 1573)
@@ -45,37 +45,23 @@
 #endif
 */
 
-//return -1 if A<N, 0 if A=N, 1 if A>N
-__device__ int Cuda_ge_N(const biguint_t A, const int cy)
+//return -1 if A<cst, 0 if A=cst, 1 if A>cst
+__device__ int Cuda_ge_cst(const biguint_t A, const biguint_t cst, const int cy)
 {
 	unsigned int i = SIZE_NUMBER-1;
 	if (cy>0)
 		return 1;
 	do
 	{
-		if (A[i] > Ncst[i])
+		if (A[i] > cst[i])
 			return 1;
-		if (A[i] < Ncst[i])
+		if (A[i] < cst[i])
 			return -1;
 		i--;
 	}while(i!=0);
 	return 0;
 }
 
-//r[i], cy[i] <-r[i] + b for all i
-__device__ void Cuda_Add1 (biguint_t r, dbigint_t cy ,const unsigned int b)
-{
-	r[threadIdx.x]+=b;
-	cy[threadIdx.x]+=(r[threadIdx.x]<b);
-}
-
-//r[i], cy[i] <-r[i] - b for all i
-__device__ void Cuda_Sub1 (biguint_t r, dbigint_t cy, const unsigned int b)
-{
-	cy[threadIdx.x]-=(r[threadIdx.x] < b);
-	r[threadIdx.x] -= b;
-}
-
 //Normalise a result; 
 __device__ int Cuda_Is_Normalize(dbigint_t cy)
 {
@@ -87,6 +73,9 @@
 
 __device__ void Cuda_Normalize(biguint_t A,dbigint_t cy)
 {
+	//if (threadIdx.x==0)
+	//	cptnormalize[blockIdx.x]++;
+
 	int oldcy;
 	if (threadIdx.x==0)
 		oldcy = 0;
@@ -97,10 +86,17 @@
 	}
 
 	if (oldcy>=0)
-		Cuda_Add1(A,cy,oldcy);
+	{
+		//Cuda_Add1(A,cy,oldcy);
+		A[threadIdx.x]+=oldcy;
+		cy[threadIdx.x]+=(A[threadIdx.x]<oldcy);
+	}
 	else // oldcy < 0
-		Cuda_Sub1(A,cy,-oldcy);
-	
+	{
+		//Cuda_Sub1(A,cy,-oldcy);
+		cy[threadIdx.x]-=(A[threadIdx.x]< -oldcy);
+		A[threadIdx.x]-= -oldcy;
+	}
 }
 
 __device__ void Cuda_Fully_Normalize(biguint_t A,dbigint_t cy)
@@ -148,8 +144,8 @@
 {
   Cuda_Add(Rmod,cy,A,B);
 	Cuda_Fully_Normalize(Rmod,cy);	
-  //if (Cuda_Cmp (Rmod, mod) >= 0 || cy[SIZE_NUMBER-1]>0) // (a >= mod)? 
-  if (Cuda_ge_N (Rmod,cy[SIZE_NUMBER-1]) >= 0) // (a >= mod)? 
+  
+  if (Cuda_ge_cst (Rmod,mod,cy[SIZE_NUMBER-1]) >= 0) // (a >= mod)? 
 	{
    	Cuda_Sub (Rmod, cy, mod); // R <- R - mod 
 		Cuda_Fully_Normalize(Rmod,cy);	
@@ -162,14 +158,15 @@
 {
  	Cuda_Add(Rmod,cy,A);
 	Cuda_Fully_Normalize(Rmod,cy);	
- 	//if (Cuda_Cmp (Rmod, mod) >= 0 || cy[SIZE_NUMBER-1]>0) // (a >= mod)? 
-  if (Cuda_ge_N (Rmod,cy[SIZE_NUMBER-1]) >= 0) // (a >= mod)? 
+
+  if (Cuda_ge_cst (Rmod,mod,cy[SIZE_NUMBER-1]) >= 0) // (a >= mod)? 
 	{
    	Cuda_Sub (Rmod, cy, mod);  
 		Cuda_Fully_Normalize(Rmod,cy);	
 	}
 }
 
+
 //Compute Rmod <- A - B [mod] 
 __device__ void Cuda_Sub_mod 
 (biguint_t Rmod, dbigint_t cy, const biguint_t A,const biguint_t B,const biguint_t mod)
@@ -201,357 +198,48 @@
 //  Return h, l such that h*2^32 + l = A*B 
 __device__ void Cuda_Mul_uint (unsigned int *h, unsigned int *l, const unsigned int A,const unsigned int B)
 {
-	//if (A==B)
-	//	Cuda_Square_uint(h,l,A);
-	//else
-	//{	
-		unsigned int m1,m2,ah,al,bh,bl;
-		al=A&0x0000ffff;
-		bl=B&0x0000ffff;
-		ah=A>>16;
-		bh=B>>16;
-	
-		*h=ah * bh;
-		m1=ah * bl;
-		m2=al * bh;
-		*l=al * bl;
-
-		//al,an,bl,bh used as temporary variables
-		al=(m1&0x0000ffff)<<16;
-		bl=(m2&0x0000ffff)<<16;
-		ah=m1>>16;
-		bh=m2>>16;
-
-
-		*h+=(ah+bh);
-	
-		*l+=bl;
-		*h+=(*l<bl);
-	
-		*l+=al;
-		*h+=(*l<al);
-	//}
+		*h=__umulhi(A,B);
+		*l=A*B;
 }
 
-//  Return h, l such that h*2^32 + l = A*A 
-__device__ void Cuda_Square_uint (unsigned int *h, unsigned int *l, const unsigned int A)
+__device__ void Cuda_Mul
+(dbiguint_t R, dbigint_t cy, const biguint_t A,const biguint_t B)
 {
-	unsigned int m1,ah,al;
-	al=A&0x0000ffff;
-	ah=A>>16;
-	
-	*h=ah * ah;
-	m1=ah * al;
-	*l=al * al;
-
-	//al,ah used as temporary variables
-	al=(m1&0x0000ffff)<<16;
-	ah=m1>>16;
-
-
-	*h+=(ah+ah);
-	
-	*l+=al;
-	*h+=(*l<al);
-	*l+=al;
-	*h+=(*l<al);
-}
-
-#ifdef TEST
-
-//  Return cy,h, l such that cy*2^64+h*2^32 + l = A*(B1+B2) 
-__device__ void Cuda_AddMul_uint (int *cy, unsigned int *h, unsigned int *l, const unsigned int A,const unsigned int B1, const unsigned int B2)
-{
-	unsigned int sum=B1+B2;
-	unsigned int c = (sum < B2);
-	Cuda_Mul_uint(h,l,A,sum);
-
-	c*=A;
-	*h+=c;
-	*cy=(*h<c);
-}
-
-//  Return cy,h, l such that cy*2^64+h*2^32 + l = A*(B1-B2) 
-__device__ void Cuda_SubMul_uint (int *cy, unsigned int *h, unsigned int *l, const unsigned int A,const unsigned int B1, const unsigned int B2)
-{
-	unsigned int diff=B1-B2;
-	int c = (B1 < B2);
-	Cuda_Mul_uint(h,l,A,diff);
-
-	c*=A;
-	*cy=(*h<c);
-	*h-=c;
-}
-
-//  Return cy,h, l such that cy*2^64+h*2^32 + l = A*(B1-B2) 
-__device__ void Cuda_SubMul2_uint (int *cy, unsigned int *h, unsigned int *l, const unsigned int A1, const unsigned int A2, const unsigned int B1, const unsigned int B2)
-{
-	unsigned int diffa=A1-A2;
-	unsigned int diffb=B1-B2;
-	int ca = (A1 < A2);
-	int cb = (B1 < B2);
-	Cuda_Mul_uint(h,l,diffa,diffb);
-
-	*cy=cb*ca;
-
-	ca*=diffb;
-	cb*=diffa;
-	*cy-=(*h<ca);
-	*h-=ca;
-	*cy-=(*h<cb);
-	*h-=cb;
-}
-
-//Compute R(with cy) <- R+ A1^2*2^1024+A2^2 where A = A1*2^512+A2
-__device__ void Cuda_Square_halves
-(dbiguint_t R, dbigint_t cy, const biguint_t A)
-{
 	int i;
 	unsigned int h,l;
-	unsigned int dec;
-	if (threadIdx.x<SIZE_NUMBER/2)
-		dec=0;
-	else
-		dec=SIZE_NUMBER/2;
 	
-	for (i=0;i<SIZE_NUMBER/2;i++)
+	for (i=0;i<SIZE_NUMBER;i++)
 	{
-		Cuda_Mul_uint(&h,&l,A[threadIdx.x],A[dec+i]);
-		//Cuda_Add1(R+dec+i,cy+dec+i,l);
-		//Cuda_Add1(R+dec+i+1,cy+dec+i+1,h);
-		R[i+threadIdx.x+dec] +=l;
-		cy[i+threadIdx.x+dec]+=(R[i+threadIdx.x+dec] < l);
+		//h*2^32+l =A[i]*B[threadIDx.x]
+		Cuda_Mul_uint(&h,&l,A[threadIdx.x],B[i]);
 		
-		R[i+1+threadIdx.x+dec] +=h;
-		cy[i+1+threadIdx.x+dec]+=(R[i+1+threadIdx.x+dec]<h);
-	}
-}
-
-//Compute R(with cy) <- R+ A1*B1*2^1024+(A1*B1+A2*B2)*2^512+A2*B2 
-//where A = A1*2^512+A2  and B = B1*2^512+A2
-__device__ void Cuda_Mul_halves
-(dbiguint_t R, dbigint_t cy, const biguint_t A, const biguint_t B)
-{
-	int i;
-	unsigned int h,l;
-	unsigned int dec;
-	if (threadIdx.x<SIZE_NUMBER/2)
-		dec=0;
-	else
-		dec=SIZE_NUMBER/2;
-	
-	for (i=0;i<SIZE_NUMBER/2;i++)
-	{
-		Cuda_Mul_uint(&h,&l,A[threadIdx.x],B[dec+i]);
-		if (threadIdx.x<SIZE_NUMBER/2)
-		{
 		R[i+threadIdx.x] +=l;
 		cy[i+threadIdx.x]+=(R[i+threadIdx.x] < l);
 		
 		R[i+1+threadIdx.x] +=h;
 		cy[i+1+threadIdx.x]+=(R[i+1+threadIdx.x]<h);
-		
-		R[i+threadIdx.x+SIZE_NUMBER/2] +=l;
-		cy[i+threadIdx.x+SIZE_NUMBER/2]+=(R[i+threadIdx.x+SIZE_NUMBER/2] < l);
-		
-		R[i+1+threadIdx.x+SIZE_NUMBER/2] +=h;
-		cy[i+1+threadIdx.x+SIZE_NUMBER/2]+=(R[i+1+threadIdx.x+SIZE_NUMBER/2]<h);
-		
-		}
-		else
-		{
-		//Not in the same order to avoid writing at the same adress at the same time
-		//No need to add SIZE_NUMBER/2 because threadIdx.x > SIZE_NUMBER/1
-		R[i+threadIdx.x] +=l;
-		cy[i+threadIdx.x]+=(R[i+threadIdx.x] < l);
-		
-		R[i+1+threadIdx.x] +=h;
-		cy[i+1+threadIdx.x]+=(R[i+1+threadIdx.x]<h);
-
-		R[i+threadIdx.x+dec] +=l;
-		cy[i+threadIdx.x+dec]+=(R[i+threadIdx.x+dec] < l);
-		
-		R[i+1+threadIdx.x+dec] +=h;
-		cy[i+1+threadIdx.x+dec]+=(R[i+1+threadIdx.x+dec]<h);
-		}
 	}
-}
 
-/*
-//Compute R(with cy) <- R-(A1-A2)*(B1-B2)*2^512 
-//where A = A1*2^512+A2  and B = B1*2^512+A2
-__device__ void Cuda_SubMul
-(dbiguint_t R, dbigint_t cy, const biguint_t A, const biguint_t B)
-{
-	int i;
-	unsigned int h2,h,l;
-	unsigned int dec;
-	
-	for (i=0;i<SIZE_NUMBER/2;i++)
-	{
-		Cuda_SubMul_uint(&h2,&h,&l,A[threadIdx.x],B[i],B[i+SIZE_NUMBER/2]);
-		
-		R[i+threadIdx.x+SIZE_NUMBER/2] +=h;
-		cy[i+1+threadIdx.x]+=(R[i+1+threadIdx.x]<h);
-		}
-	}
-	
-}
-*/
-
-
-//Compute R(with cy) <- R+ 2*A*B where A and B are SIZE_NUMBER/2-long
-__device__ void Cuda_DblMul_half
-(dbiguint_t R, dbigint_t cy, const biguint_t A,const biguint_t B)
-{
-	int i;
-	unsigned int h,l;
-	unsigned int hh,ll;
-	int cy1,cy2;
-	
-	unsigned int idx=threadIdx.x%(SIZE_NUMBER/2);
-	unsigned int idi=threadIdx.x/(SIZE_NUMBER/2);
-	
-	for (i=0;i<SIZE_NUMBER/2;i++)
-	{
-		Cuda_Mul_uint(&h,&l,A[idx],B[idi]);
-		ll=2*l;
-		cy1=(ll<l);
-		hh=2*h;
-		cy2=(hh<h);
-		if (threadIdx.x<SIZE_NUMBER/2)
-		{
-			R[idi+idx] +=ll;
-			cy[idi+idx]+=cy1+(R[idi+idx] < ll);
-		
-			R[idi+1+idx] +=hh;
-			cy[idi+1+idx]+=cy2+(R[idi+1+idx] < hh);
-		}
-		if (threadIdx.x>=SIZE_NUMBER/2)
-		{
-			R[idi+idx] +=ll;
-			cy[idi+idx]+=cy1+(R[idi+idx] < ll);
-		
-			R[idi+1+idx] +=hh;
-			cy[idi+1+idx]+=cy2+(R[idi+1+idx] < hh);
-		}
-		idi+=2;
-	}
-}
-
-
-__device__ void Cuda_Mul 
-(dbiguint_t R, dbigint_t cy, const biguint_t A, const biguint_t B)
-{
-	Cuda_Mul_halves(R,cy,A,B);
-	//Cuda_SubMul(R+SIZE_NUMBER/2,cy+SIZE_NUMBER/2,A,B);
-
 	//Normalize : but R and cy are 2 * SIZE_NUMBER long
 	Cuda_Fully_Normalize(R,cy);
 	Cuda_Fully_Normalize(R+SIZE_NUMBER-1,cy+SIZE_NUMBER-1);
-	if (threadIdx.x==0)
-		{
-		R[2*SIZE_NUMBER-1]+=cy[2*SIZE_NUMBER-2];
-		cy[2*SIZE_NUMBER-2]=0;//only to let cy clean in order to re-use it
-		}
-}
 
+	//if (threadIdx.x==0)
+	//	cptnormalizemul[blockIdx.x]++;
 
-
-__device__ void Cuda_Square (dbiguint_t R, dbigint_t cy, const biguint_t A)
-{
-	Cuda_Square_halves(R,cy,A);
-	Cuda_DblMul_half(R+SIZE_NUMBER/2,cy+SIZE_NUMBER/2,A,A+SIZE_NUMBER/2);
-	//Normalize : but R and cy are 2 * SIZE_NUMBER long
-	Cuda_Fully_Normalize(R,cy);
-	Cuda_Fully_Normalize(R+SIZE_NUMBER-1,cy+SIZE_NUMBER-1);
 	if (threadIdx.x==0)
-		{
-		R[2*SIZE_NUMBER-1]+=cy[2*SIZE_NUMBER-2];
-		cy[2*SIZE_NUMBER-2]=0;//only to let cy clean in order to re-use it
-		}
-}
-
-
-#endif
-
-#ifndef TEST
-__device__ void Cuda_Mul
-(dbiguint_t R, dbigint_t cy, const biguint_t A,const biguint_t B)
-{
-	int i;
-	unsigned int h,l;
-	
-	for (i=0;i<SIZE_NUMBER;i++)
 	{
-		//h*2^32+l =A[i]*B[threadIDx.x]
-		Cuda_Mul_uint(&h,&l,A[threadIdx.x],B[i]);
-		
-		Cuda_Add1(R+i,cy+i,l);
-		Cuda_Add1(R+i+1,cy+i+1,h);
-		//R[i+threadIdx.x] +=l;
-		//cy[i+threadIdx.x]+=(R[i+threadIdx.x] < l);
-		
-		//R[i+1+threadIdx.x] +=h;
-		//cy[i+1+threadIdx.x]+=(R[i+1+threadIdx.x]<h);
-	}
-
-	//Normalize : but R and cy are 2 * SIZE_NUMBER long
-	Cuda_Fully_Normalize(R,cy);
-	Cuda_Fully_Normalize(R+SIZE_NUMBER-1,cy+SIZE_NUMBER-1);
-	if (threadIdx.x==0)
-		{
 		R[2*SIZE_NUMBER-1]+=cy[2*SIZE_NUMBER-2];
 		cy[2*SIZE_NUMBER-2]=0;//only to let cy clean in order to re-use it
-		}
+	}
 }
 
 __device__ void Cuda_Square (dbiguint_t R, dbigint_t cy, const biguint_t A)
 {
-	int i;
-	unsigned int h,l;
-		
-	Cuda_Square_uint(&h,&l,A[threadIdx.x]);
-
-	//Cuda_Add1(R+threadIdx.x,cy+threadIdx.x,l);
-	//Cuda_Add1(R+threadIdx.x+1,cy+threadIdx.x+1,h);
-	R[2*threadIdx.x] += l;
-	cy[2*threadIdx.x]+=(R[2*threadIdx.x] < l);
-	
-	R[2*threadIdx.x+1] += h;
-	cy[2*threadIdx.x+1]+=(R[2*threadIdx.x+1]<h);
-	
-	for (i=0;i<threadIdx.x;i++)
-	{
-		Cuda_Mul_uint(&h,&l,A[threadIdx.x],A[i]);
-		
-		Cuda_Add1(R+i,cy+i,l);
-		Cuda_Add1(R+i,cy+i,l);
-		Cuda_Add1(R+i+1,cy+i+1,h);
-		Cuda_Add1(R+i+1,cy+i+1,h);
-		//R[i+threadIdx.x] += l;
-		//cy[i+threadIdx.x]+=(R[i+threadIdx.x] < l);
-		//R[i+threadIdx.x] += l;
-		//cy[i+threadIdx.x]+=(R[i+threadIdx.x] < l);
-		
-		//R[i+1+threadIdx.x] +=h;
-		//cy[i+1+threadIdx.x]+=(R[i+1+threadIdx.x]<h);
-		//R[i+1+threadIdx.x] +=h;
-		//cy[i+1+threadIdx.x]+=(R[i+1+threadIdx.x]<h);
-	}
-
-	//Normalize : but R and cy are 2 * SIZE_NUMBER long
-	Cuda_Fully_Normalize(R,cy);
-	Cuda_Fully_Normalize(R+SIZE_NUMBER-1,cy+SIZE_NUMBER-1);
-	if (threadIdx.x==0)
-		{
-		R[2*SIZE_NUMBER-1]+=cy[2*SIZE_NUMBER-2];
-		cy[2*SIZE_NUMBER-2]=0;//only to let cy clean in order to re-use it
-		}
+	Cuda_Mul(R,cy,A,A);
 }
-#endif 
 
-
+//Assume r<N^2
 __device__ void Cuda_RedMontgomery (biguint_t mul, dbigint_t cy, const biguint_t mod, dbiguint_t r, dbiguint_t temp)
 {
 	//temp=((r mod 2^(32*SIZE_NUMBER))*mod^-1) (mod^-1 already compute)
@@ -571,6 +259,7 @@
 	//Normalize : but R and cy are 2 * SIZE_NUMBER long
 	Cuda_Fully_Normalize(r,cy);
 	Cuda_Fully_Normalize(r+SIZE_NUMBER-1,cy+SIZE_NUMBER-1);
+
 	if (threadIdx.x==0)
 		{
 		r[2*SIZE_NUMBER-1]+=cy[2*SIZE_NUMBER-2];
@@ -580,15 +269,13 @@
 	//return r/ 2^(32*SIZE_NUMBER)
 	mul[threadIdx.x]=r[threadIdx.x+SIZE_NUMBER];
 
-  //if (Cuda_Cmp (mul, mod) >= 0) // (mul >= mod)? 
-  if (Cuda_ge_N (mul,0) >= 0) // (mul >= mod)? 
+	if (Cuda_Cmp (mul,mod) >= 0) // (mul >= mod)? 
 	{
   	Cuda_Sub (mul, cy, mod); 
 		Cuda_Fully_Normalize(mul,cy);	
 	}
 }
 
-
 //Assume A ans B are the montgomery representation
 //Compute mul = A * B * 2^-(32*SIZE_NUMBER) mod[mod]
 // r and temp have size 2*SIZE_NUMBER
@@ -603,40 +290,6 @@
 	//r=a*b
 	Cuda_Mul(r,cy,A,B);
 	Cuda_RedMontgomery (mul, cy, mod, r, temp);
-/*	
-	//temp=((r mod 2^(32*SIZE_NUMBER))*mod^-1) (mod^-1 already compute)
-	Cuda_Mul(temp,cy,r,invmodcst);//pour r que la partie mod R compte : Ok
-	
-	//mul = temp (mod 2^(32*SIZE_NUMBER)) 
-	mul[threadIdx.x]=temp[threadIdx.x];
-	temp[threadIdx.x]=0;
-	temp[threadIdx.x+SIZE_NUMBER]=0;
-	
-	//temp=mul*m
-	Cuda_Mul(temp,cy,mul,mod);
-	
-	//r=r+temp // r and temp2 are 2*SIZE_NUMBER long
-	Cuda_Add(r,cy,temp);
-	Cuda_Add(r+SIZE_NUMBER,cy+SIZE_NUMBER,temp+SIZE_NUMBER);
-	//Normalize : but R and cy are 2 * SIZE_NUMBER long
-	Cuda_Fully_Normalize(r,cy);
-	Cuda_Fully_Normalize(r+SIZE_NUMBER-1,cy+SIZE_NUMBER-1);
-	if (threadIdx.x==0)
-		{
-		r[2*SIZE_NUMBER-1]+=cy[2*SIZE_NUMBER-2];
-		cy[2*SIZE_NUMBER-2]=0;//only to let cy clean in order to re-use it
-		}
-
-	//return r/ 2^(32*SIZE_NUMBER)
-	mul[threadIdx.x]=r[threadIdx.x+SIZE_NUMBER];
-
-  //if (Cuda_Cmp (mul, mod) >= 0) // (mul >= mod)? 
-  if (Cuda_ge_N (mul,0) >= 0) // (mul >= mod)? 
-	{
-  	Cuda_Sub (mul, cy, mod); 
-		Cuda_Fully_Normalize(mul,cy);	
-	}
-*/	
 }
 
 
@@ -654,45 +307,9 @@
 	//r=a*b
 	Cuda_Square(r,cy,A);
 	Cuda_RedMontgomery (mul, cy, mod, r, temp);
-/*	
-	//temp=((r mod 2^(32*SIZE_NUMBER))*mod^-1) (mod^-1 already compute)
-	Cuda_Mul(temp,cy,r,invmodcst);//pour r que la partie mod R compte : Ok
-	
-	//mul = temp (mod 2^(32*SIZE_NUMBER)) 
-	mul[threadIdx.x]=temp[threadIdx.x];
-	temp[threadIdx.x]=0;
-	temp[threadIdx.x+SIZE_NUMBER]=0;
-	
-	//temp=mul*m
-	Cuda_Mul(temp,cy,mul,mod);
-	
-	//r=r+temp // r and temp2 are 2*SIZE_NUMBER long
-	Cuda_Add(r,cy,temp);
-	Cuda_Add(r+SIZE_NUMBER,cy+SIZE_NUMBER,temp+SIZE_NUMBER);
-	//Normalize : but R and cy are 2 * SIZE_NUMBER long
-	Cuda_Fully_Normalize(r,cy);
-	Cuda_Fully_Normalize(r+SIZE_NUMBER-1,cy+SIZE_NUMBER-1);
-	if (threadIdx.x==0)
-		{
-		r[2*SIZE_NUMBER-1]+=cy[2*SIZE_NUMBER-2];
-		cy[2*SIZE_NUMBER-2]=0;//only to let cy clean in order to re-use it
-		}
-
-	//__syncthreads();
-
-	//return r/ 2^(32*SIZE_NUMBER)
-	mul[threadIdx.x]=r[threadIdx.x+SIZE_NUMBER];
-
-  //if (Cuda_Cmp (mul, mod) >= 0) // (mul >= mod)? 
-  if (Cuda_ge_N (mul,0) >= 0) // (mul >= mod)? 
-	{
-  	Cuda_Sub (mul, cy, mod); 
-		Cuda_Fully_Normalize(mul,cy);	
-	}
-*/
 }
 
-__device__ void Cuda_Ell_Dbl(biguint_t x2p, biguint_t z2p, const biguint_t xp, const biguint_t zp, const biguint_t N, biguint_t temp_u, biguint_t temp_v, dbiguint_t temp_r, dbiguint_t temp_r2, dbigint_t cy)
+__device__ void Cuda_Ell_Dbl(biguint_t x2p, biguint_t z2p, const biguint_t xp, const biguint_t zp, const biguint_t N, const biguint_t d, biguint_t temp_u, biguint_t temp_v, dbiguint_t temp_r, dbiguint_t temp_r2, dbigint_t cy)
 {
 	//u<-xp+zp mod N
 	Cuda_Add_mod(temp_u,cy,xp,zp,N);
@@ -705,7 +322,7 @@
 	//x2p=u-v mod N    x2p is used as a temporary variable here
 	Cuda_Sub_mod(x2p,cy,temp_u,temp_v,N);
 	//z2p<-x2p*d mod N z2p is used as a temporary variable here
-	Cuda_Mul_mod(z2p,cy,x2p,dcst[blockIdx.x],N,temp_r,temp_r2);
+	Cuda_Mul_mod(z2p,cy,x2p,d,N,temp_r,temp_r2);
 	//z2p<- z2p+v mod N
 	Cuda_Add_mod(z2p,cy,temp_v,N);
 	//z2p <- x2p*z2p
@@ -717,6 +334,7 @@
 __device__ void Cuda_Ell_Add(biguint_t xplus, biguint_t zplus, const biguint_t xp, const biguint_t zp, const biguint_t xq, const biguint_t zq, const biguint_t xminus, const biguint_t zminus, const biguint_t N, biguint_t temp_u, biguint_t temp_v, biguint_t temp_w, dbiguint_t temp_r, dbiguint_t temp_r2, dbigint_t cy)
 {
 	unsigned int tmp;
+	
 	//u<-xp+zp mod N
 	Cuda_Add_mod(temp_u,cy,xp,zp,N);
 	//v<-xq-zq mod N
@@ -729,18 +347,19 @@
 	Cuda_Add_mod(temp_w,cy,zq,xq,N);
 	//u<-u*w mod N
 	Cuda_Mul_mod(temp_u,cy,temp_u,temp_w,N,temp_r,temp_r2);
+
 	//w<-v+u mod N
 	Cuda_Add_mod(temp_w,cy,temp_v,temp_u,N);
 	//v<-v-u mod N
 	Cuda_Sub_mod(temp_v,cy,temp_u,N);
-	//u<-w^2 mod N
-	Cuda_Square_mod(temp_u,cy,temp_w,N,temp_r,temp_r2);
+	//w<-w^2 mod N
+	Cuda_Square_mod(temp_w,cy,temp_w,N,temp_r,temp_r2);
 	//v<-v^2 mod N
 	Cuda_Square_mod(temp_v,cy,temp_v,N,temp_r,temp_r2);
 	
 	if (xplus==xminus) //same variable : in-place variant
 	{
-		Cuda_Mul_mod(zplus,cy,zminus,temp_u,N,temp_r,temp_r2);
+		Cuda_Mul_mod(zplus,cy,zminus,temp_w,N,temp_r,temp_r2);
 		Cuda_Mul_mod(xplus,cy,xminus,temp_v,N,temp_r,temp_r2);
 		//swap
 		tmp=xplus[threadIdx.x];
@@ -750,18 +369,17 @@
 	else
 	{
 		//xplus <- zminus*u mod N
-		Cuda_Mul_mod(xplus,cy,zminus,temp_u,N,temp_r,temp_r2);
+		Cuda_Mul_mod(xplus,cy,zminus,temp_w,N,temp_r,temp_r2);
 		//zplus <- xminus*v mod N
 		Cuda_Mul_mod(zplus,cy,xminus,temp_v,N,temp_r,temp_r2);
 	}
 }
 
-#ifndef TEST
 
 #ifdef CC13
 //Compute [3^power3*2^power2]A on the elliptic curve
 //prepare for prac, ie set B and C equal to A and set A to [2]A
-__global__ void Cuda_Ell_Mul_2_3(biguint_t *xAarg, biguint_t *zAarg, biguint_t *xBarg, biguint_t *zBarg, biguint_t *xCarg, biguint_t *zCarg, const unsigned int power2, const unsigned int power3)
+__global__ void Cuda_Ell_Mul_2_3(biguint_t *xAarg, biguint_t *zAarg, biguint_t *xBarg, biguint_t *zBarg, biguint_t *xCarg, biguint_t *zCarg, const biguint_t *darg, const unsigned int power2, const unsigned int power3)
 {
 	__shared__ unsigned int temp_r[2*SIZE_NUMBER];
 	__shared__ unsigned int temp_r2[2*SIZE_NUMBER];
@@ -776,6 +394,8 @@
 	__shared__ unsigned int xB[SIZE_NUMBER];
 	__shared__ unsigned int zB[SIZE_NUMBER];
 	
+	__shared__ unsigned int d[SIZE_NUMBER];
+	
 	unsigned int i;
 
 	//init
@@ -784,6 +404,8 @@
 	xB[threadIdx.x]=xAarg[blockIdx.x][threadIdx.x];
 	zB[threadIdx.x]=zAarg[blockIdx.x][threadIdx.x];
 
+	d[threadIdx.x]=darg[blockIdx.x][threadIdx.x];
+
 	temp_r[threadIdx.x]=0;	
 	temp_r[SIZE_NUMBER + threadIdx.x]=0;	
 	temp_r2[threadIdx.x]=0;	
@@ -800,16 +422,16 @@
 
 	for(i=0;i<power2;i++)
 	{
-		Cuda_Ell_Dbl(xB, zB, xB, zB, Ncst, temp_u, temp_v, temp_r, temp_r2, cy);
+		Cuda_Ell_Dbl(xB, zB, xB, zB, Ncst, d, temp_u, temp_v, temp_r, temp_r2, cy);
 	}
 	
 	for(i=0;i<power3;i++)
 	{
-		Cuda_Ell_Dbl(xA, zA, xB, zB, Ncst, temp_u, temp_v, temp_r, temp_r2, cy);
+		Cuda_Ell_Dbl(xA, zA, xB, zB, Ncst, d, temp_u, temp_v, temp_r, temp_r2, cy);
 	Cuda_Ell_Add(xB, zB, xA, zA, xB, zB, xB, zB, Ncst, temp_u, temp_v, temp_w, temp_r, temp_r2, cy);
 	}
 
-	Cuda_Ell_Dbl(xA, zA, xB, zB, Ncst, temp_u, temp_v, temp_r, temp_r2, cy);
+	Cuda_Ell_Dbl(xA, zA, xB, zB, Ncst, d, temp_u, temp_v, temp_r, temp_r2, cy);
 
 	//End of the computation; Copy the results for the cpu
 	xAarg[blockIdx.x][threadIdx.x]=xA[threadIdx.x];
@@ -820,7 +442,7 @@
 	zCarg[blockIdx.x][threadIdx.x]=zB[threadIdx.x];
 }
 



More information about the Ecm-commits mailing list