/*
This file is part of mfaktc (mfakto).
Copyright (C) 2009 - 2014  Oliver Weihe (o.weihe@t-online.de)
                           Bertram Franz (bertramf@gmx.net)

mfaktc (mfakto) is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

mfaktc (mfakto) is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with mfaktc (mfakto).  If not, see <http://www.gnu.org/licenses/>.

Version 0.16
*/


/*
This source is an OpenCL port of the CUDA code by George Woltman.
This code is a GPU-based sieve for mfakto.

Thanks go also to Ben Buhrow for his erato.cu program and to Rocke Verser for his gpusieve program.
See (http://www.mersenneforum.org/showthread.php?t=11900) for Ben's initial work.
*/

// function prototypes
// Both functions are only declared here; their bodies are not part of this section of the file.

// NOTE(review): presumably returns the inverse of n modulo orig_d -- confirm against the definition.
unsigned int modularinverse (uint n, uint orig_d);

// Takes two __local scratch buffers (bitcount, smem) and a read-only __global bit array.
// NOTE(review): the meaning of the uint result is not evident from the prototype -- see the definition.
uint extract_bits(const uint bits_to_process, const uint tid, const uint lid, __local ushort *bitcount, __local ushort *smem, const __global uint * restrict bit_array);

// end prototypes

// TRACE_SIEVE_KERNEL: higher is more trace, 0-5 currently used
// The code below tests the level with "> 0", "> 1", "> 2" and "> 4"; at 0 all trace printfs are compiled out.
#define TRACE_SIEVE_KERNEL 0

// If above tracing is on, only the thread with the ID below will trace
// (compared against get_global_id(0))
#define TRACE_SIEVE_TID 0

// diagnostics
// Defining GWDEBUG compiles in the range checks of mod_p, sloppy_mod_p, bump_mod_p and validate_bclr.
// #define GWDEBUG

// Primes up to 16M can be handled by this many "rows" of 256 primes
#define MAX_PRIMES_PER_THREAD	4224

// Size of shared memory array in bytes
#define block_size_in_bytes 8192
// Number of bits generated by each block
// (8192 * 8 = 65536 bits)
__constant uint block_size = block_size_in_bytes * 8;
// Threads per block
// SegSieve is declared with reqd_work_group_size(256, 1, 1), so that literal has to stay in step with this value.
// With these settings each thread owns block_size / threadsPerBlock = 256 bits of the local sieve.
#define threadsPerBlock 256

// Selects how many of the smallest primes are not sieved at all (primesNotSieved) and how many of the
// following primes are sieved by the inline code in SegSieve (primesHandledWithSpecialCode).
// Exactly one primesHandledWithSpecialCode value is active per configuration; the alternatives are kept
// as commented-out lines.  SegSieve enables its "64 < p < 128" section when
// primesNotSieved + primesHandledWithSpecialCode > 18 and its "128 < p < 256" section when that sum is > 31.
// With the active values the sum is 54 in both configurations.
#ifdef MORE_CLASSES
// Primes 2, 3, 5, 7, 11 are not sieved
#define primesNotSieved 5

// Count of primes handled with inline code (not using primes array)
							// Primes 13 through 61 are handled specially
//#define primesHandledWithSpecialCode 13
							// Primes 13 through 127 are handled specially
//#define primesHandledWithSpecialCode 26
							// Primes 13 through 251 are handled specially
#define primesHandledWithSpecialCode 49
							// Primes 13 through 509 are handled specially
//#define primesHandledWithSpecialCode 92
#else
// Primes 2, 3, 5, 7 are not sieved
#define primesNotSieved 4
// Count of primes handled with inline code (not using primes array)
							// Primes 11 through 61 are handled specially
//#define primesHandledWithSpecialCode 14
							// Primes 11 through 127 are handled specially
//#define primesHandledWithSpecialCode 27
							// Primes 11 through 251 are handled specially
#define primesHandledWithSpecialCode 50
							// Primes 11 through 509 are handled specially
//#define primesHandledWithSpecialCode 93
#endif

// Various useful constants
//
// The three crossover values below are integer divisions: the number of primes left after removing the
// unsieved and the inline-handled ones, divided by the 256 threads of a block.  With the active settings
// (primesNotSieved + primesHandledWithSpecialCode = 54 in either configuration) they evaluate to
//   sieving64KCrossover  = (6542  - 54) / 256     = 25
//   sieving128KCrossover = (12251 - 54) / 256     = 47
//   sieving1MCrossover   = (82025 - 54) / 256 - 3 = 317

// There are 6542 16-bit primes
#define primesBelow64K 6542
// There are 12251 17-bit primes
#define primesBelow128K 12251
// There are 82025 20-bit primes
#define primesBelow1M 82025
__constant uint sieving64KCrossover = (primesBelow64K - primesNotSieved - primesHandledWithSpecialCode) / threadsPerBlock;
							// Number of thread loops processing primes below 64K
__constant uint sieving128KCrossover = (primesBelow128K - primesNotSieved - primesHandledWithSpecialCode) / threadsPerBlock;
							// Number of thread loops processing primes below 128K
// NOTE(review): the reason for the extra "- 3" is not visible in this section; the original author flags it as a known wart.
__constant uint sieving1MCrossover = (primesBelow1M - primesNotSieved - primesHandledWithSpecialCode) / threadsPerBlock - 3;  // bug - awkward hard coded -3 here
							// Number of thread loops processing primes below 1M

// array for bit-shifting
// Lookup table: entry n holds 1<<n for n = 0..31 and 0 for n = 32..63, so an index in the range 32..63
// contributes no bit to a 32-bit mask.  In the part of SegSieve visible in this section it is referenced
// only from a commented-out alternative for the primes 37..61.

__constant uint two_pow_n_32[] = {1<<0,  1<<1,  1<<2,  1<<3,  1<<4,  1<<5,  1<<6,  1<<7,  1<<8,  1<<9,  1<<10, 1<<11, 1<<12, 1<<13, 1<<14, 1<<15,
                                  1<<16, 1<<17, 1<<18, 1<<19, 1<<20, 1<<21, 1<<22, 1<<23, 1<<24, 1<<25, 1<<26, 1<<27, 1<<28, 1<<29, 1<<30, 1<<31,
                                      0,     0,     0,     0,     0,     0,     0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
                                      0,     0,     0,     0,     0,     0,     0,     0,     0,     0,     0,     0,     0,     0,     0,     0};

// Bit masks for small prime sieving
//
// BITSLLp has bit 0 set plus every further multiple of p that still fits into a 32-bit word
// (p and 2p for 11 and 13, only p for 17..31).  SegSieve shifts a mask left by the prime's first
// bit-to-clear and ORs the results into one 32-bit word of the sieve.
//
// The masks are built from unsigned literals: they describe bit patterns of a uint word, and as signed
// ints BITSLL31 would be a negative value (sign-extending when widened, arithmetic when shifted right).

#define BITSLL11 (1u | (1u<<11) | (1u<<22))
#define BITSLL13 (1u | (1u<<13) | (1u<<26))
#define BITSLL17 (1u | (1u<<17))
#define BITSLL19 (1u | (1u<<19))
#define BITSLL23 (1u | (1u<<23))
#define BITSLL29 (1u | (1u<<29))
#define BITSLL31 (1u | (1u<<31))

// Various padding required to keep warps accessing primes data on 128-byte boundaries
// NOTE(review): whether the value counts bytes or entries is not evident from this section -- confirm where pinfo_dev is laid out.

#define PINFO_PAD1		1024			// Allows room for lots of initial bit_to_clr values

// Inline to calculate x mod p using 2^32 / p.  Returns positive remainder even when x is negative.
// Assumes x is in the range -2^32 < x < p.  For this routine to work on only positive x values, we
// would change the gen_pinv macro to not add one.

// NOTE: This routine has a number of failure cases (samples below) which don't affect us, but should be investigated someday!
//	x mod p out of range!! x = 113175, p = 113177, pinv = 37950, r = -2
//	x mod p out of range!! x = 126009, p = 126011, pinv = 34085, r = -2
//	x mod p out of range!! x = 121506, p = 121507, pinv = 35348, r = -1
//	x mod p out of range!! x = 124427, p = 124429, pinv = 34518, r = -2
//	x mod p out of range!! x = 95152, p = 95153, pinv = 45138, r = -1
//	x mod p out of range!! x = 94120, p = 94121, pinv = 45633, r = -1
//	x mod p out of range!! x = 74686, p = 74687, pinv = 57507, r = -1
//	x mod p out of range!! x = 102795, p = 102797, pinv = 41782, r = -2
//	x mod p out of range!! x = 126741, p = 126743, pinv = 33888, r = -2
//	x mod p out of range!! x = 111532, p = 111533, pinv = 38509, r = -1
//	x mod p out of range!! x = 130810, p = 130811, pinv = 32834, r = -1
//	x mod p out of range!! x = 116705, p = 116707, pinv = 36802, r = -2

// floor ((2^32 - 1) / p) + 1, which is ceil (2^32 / p) whenever p does not divide 2^32
#define gen_pinv(p)	(0xFFFFFFFF / (p) + 1)

// Reduce x modulo p with a multiply-high by a precomputed inverse instead of a division.
//   x    - value to reduce (see the range assumption in the comment block above)
//   p    - modulus
//   pinv - inverse of p as produced by gen_pinv (p)
// Returns the remainder after one conditional correction by p.
__inline int mod_p (int x, const int p, const int pinv)
{
	int	q, r;
#if defined (GWDEBUG) || (TRACE_SIEVE_KERNEL > 4)
	// x is overwritten below; remember the argument so the diagnostics report what was actually passed in
	const int x_in = x;
#endif

	q = mul_hi (x, pinv);	// quotient estimate = high 32 bits of x * inverse_of_p
	x = x - p * q;		// x mod p (but may be too large by one p)
	// PERF: mul24 (p, q) brings no performance benefit here (on VLIW5), probably we'd need to vectorize to make a difference
	r = x - p;		// x mod p (the alternative return value)
	r = (r >= 0) ? r : x;	// use x - p only when that is still non-negative

#ifdef GWDEBUG
	if (pinv != gen_pinv (p))
		printf((__constant char *)"mod_p: p doesn't match pinv!! p = %d, pinv = %d\n", p, pinv);
	if (r < 0 || r >= p)
		printf((__constant char *)"mod_p: x mod p out of range!! x = %d, p = %d, pinv = %d, r = %d\n", x_in, p, pinv, r);
#endif
#if (TRACE_SIEVE_KERNEL > 4)
	if (get_global_id(0) == TRACE_SIEVE_TID) printf((__constant char *)"mod_p(%d, %d, %d) = %d\n", x_in, p, pinv, r);
#endif

	return r;
}

// Convenience form of mod_p for a modulus written as a constant at the call site:
// the inverse is derived from p right here with gen_pinv, so callers pass only x and p.

__inline int mod_const_p (int x, int p)
{
	const int inverse = gen_pinv (p);

	return mod_p (x, p, inverse);
}

// Inline to calculate x mod p using an inverse of floor ((2^32 / p) - 0.5).
// We're allowed to return a sloppy modulo result ranging from -p/2 to p-1.
// Assumes x is in the range -2^32 < x < p.  This routine needs changing if we must
// deal with large positive x values.

#define gen_sloppy_pinv(p)	((uint) floor (4294967296.0f / (p) - 0.5))

// Approximate reduction of x modulo p: one multiply-high by pinv and one multiply-subtract,
// with no correction step afterwards.  The caller has to tolerate the sloppy result range
// described in the comment block above.
//   x    - value to reduce
//   p    - modulus
//   pinv - inverse of p as produced by gen_sloppy_pinv (p)
__inline int sloppy_mod_p (int x, int p, int pinv)
{
	// quotient estimate = high 32 bits of x * inverse_of_p
	const int quot = mul_hi (x, pinv);
	// remainder (may be too small or too large by one-half p)
	// PERF: mul24 (quot, p) brings no performance benefit here (on VLIW5), probably we'd need to vectorize to make a difference
	const int rem = x - quot * p;

#ifdef GWDEBUG
	if ((uint) pinv != gen_sloppy_pinv (p))
		printf((__constant char *)"sloppy_mod_p: p doesn't match pinv!! p = %d, pinv = %d (should be %d)\n", p, pinv, gen_sloppy_pinv (p));
	if (rem < -p / 2 || rem >= p)
		printf((__constant char *)"sloppy_mod_p: x sloppy mod p out of range!! x = %d, p = %d, pinv = %d, r = %d\n", x, p, pinv, rem);
#endif

#if (TRACE_SIEVE_KERNEL > 4)
	if (get_global_id(0) == TRACE_SIEVE_TID) printf((__constant char *)"sloppy_mod_p(%d, %d, %d) = %d\n", x, p, pinv, rem);
#endif

	return rem;
}

// Inline to add a negative constant mod p.  That is given i between 0 and p-1, return ((i + inc) % p)

__inline int bump_mod_p (int i, int inc, int p)
{
	int	x, j;
	i = i + inc % p; j = i + p;
  x = (i>=0) ? i : j; //	asm("slct.s32.s32 %0, %1, %2, %1;" : "=r" (x) : "r" (i), "r" (j));

#ifdef GWDEBUG
	if (x < 0 || x >= p)
		printf ("x bump mod p out of range!! x = %d, i = %d, p = %d\n", x, i, p);
#endif
	return x;
}

// Set bit number bclr in the local-memory sieve with a plain (non-atomic) read-modify-write of one byte.
//   locsieve - start of the sieve in local memory
//   bclr     - bit index; bclr >> 3 selects the byte, bclr & 7 the bit within it

__inline void bitOr (__local uchar *locsieve, uint bclr)
{
// Typed views of the sieve.  They stay defined past the end of this function and are used by code
// further down the file, so they must remain exactly as they are.
#define locsieve8	((__local uchar *) locsieve)
#define locsieve8v	((__local volatile uchar *) locsieve)
#define locsieve32	((__local uint *) locsieve)
#define locsieve32v	((__local volatile uint *) locsieve)
	const uint byte_idx = bclr >> 3;
	const uint bit_idx = bclr & 7;

	locsieve[byte_idx] |= 1 << bit_idx;
}

// Like bitOr, but reads the byte first and stores only when the requested bit is not set yet.
//   locsieve - start of the sieve in local memory
//   bclr     - bit index; bclr >> 3 selects the byte, bclr & 7 the bit within it
__inline void bitOrSometimesIffy (__local uchar *locsieve, uint bclr)
{
	const uint	byte_idx = bclr >> 3;
	const uchar	bit = 1 << (bclr & 7);
	const uchar	current = locsieve[byte_idx];

	// nothing to write when the bit is already there
	if (current & bit) return;

	locsieve[byte_idx] = current | bit;
}

// Make sure initial bit-to-clear makes sense
//
// validate_bclr (bclr, p) reports a bit-to-clear value that is not below its prime p.  It expands to
// nothing unless GWDEBUG is defined.  The arguments are parenthesized so that expression arguments
// (e.g. a conditional or a sum) are compared as a whole.  The expansion is deliberately kept as a single
// "if" statement ending in ';' so that existing call sites compile unchanged whether or not they add
// their own semicolon; do not use it as the unbraced body of an if/else.

#ifdef GWDEBUG
#define validate_bclr(bclr,p)	if ((bclr) >= (p)) printf ("bclr too big! bclr = %d, p = %d\n", (bclr), (p));
#else
#define validate_bclr(bclr,p)
#endif

// Sieve a small slice of the big bit array using fast shared memory.  Note THIS IS A SLOPPY sieve!!
// Because we do not use atomic operations, some candidates survive the sieve that shouldn't.  This is
// OK as it will just cost us some extra testing of candidates, which is cheaper than the cost of using
// atomic operations.

/*
	Expect as input a set of primes to sieve with, their inverses, and the first bit to clear.

	Each block on the gpu sieves a different segment of the big bit array.  Each thread within each block
	simultaneously sieves a small set of primes, marking composites within shared memory.  There is no memory
	contention between threads because the marking process is write only.  Because each thread
	block starts at a different part of the big bit array, a small amount of computation must
	be done for each prime prior to sieving to figure out the first bit to clear.
*/

__kernel void __attribute__((reqd_work_group_size(256, 1, 1))) SegSieve (__global uchar *big_bit_array_dev, __global uchar *pinfo_dev, uint maxp)
{
	__local uchar locsieve[block_size_in_bytes];
	uint block_start = get_group_id(0) * block_size;
	uint i, j, p, pinv, bclr;

#define big_bit_array32	((__global uint *) big_bit_array_dev)
#define locsieve32	((__local uint *) locsieve)
#define locsieve64	((__local ulong *) locsieve)
#define pinfo16		((__global ushort *) pinfo_dev)
#define pinfo32		((__global uint *) pinfo_dev)

// Sieve using all 8 bits of each shared memory byte.
// This is more complicated code than using the whole byte as a flag
// but has 1/8th as many global memory accesses to the primes arrays.

// Sieve the smallest primes using inline code to avoid using atomics.
// Memory layout is simply a 16-bit bit-to-clear value for each small prime.

#define bit_to_clr	pinfo16

	//
	// In these sections each thread handles a 256-bit portion of the shared memory area.
	// This allows us to operate without atomic operations and without syncing.
	//

	uint thread_start = block_start + get_local_id(0) * block_size / threadsPerBlock;

#if (TRACE_SIEVE_KERNEL > 2)
  if (get_global_id(0) == TRACE_SIEVE_TID) printf((__constant char *)"SegSieve: grpid=%d, locid=%d, thread_start=%u, maxp=%u\n", get_group_id(0), get_local_id(0), thread_start, maxp);
#endif

	//
	// In this section each thread handles one 32 bit word at a time sieving primes below 64.
	// Each prime will hit a 32-bit word zero or one time.
	//

	{
	  uint mask, mask2, mask3, mask4, i11=0xfffffff, i13, i17, i19, i23, i29, i31, i37, i41, i43, i47, i53, i59, i61;

	  if (primesNotSieved == 4) {	// Primes 2, 3, 5, 7 are not sieved
		  i11 = mod_const_p (bit_to_clr[4] - thread_start, 11);	// compute bit to clear for prime 11
		  i13 = mod_const_p (bit_to_clr[5] - thread_start, 13);	// compute bit to clear for prime 13
		  i17 = mod_const_p (bit_to_clr[6] - thread_start, 17);	// compute bit to clear for prime 17
	  }
	  if (primesNotSieved == 5) {	// Primes 2, 3, 5, 7, 11 are not sieved
		  i13 = mod_const_p (bit_to_clr[5] - thread_start, 13);	// compute bit to clear for prime 13
		  i17 = mod_const_p (bit_to_clr[6] - thread_start, 17);	// compute bit to clear for prime 17
	  }
	  if (primesNotSieved == 6) {	// Primes 2, 3, 5, 7, 11, 13 are not sieved
		  i17 = mod_const_p (bit_to_clr[6] - thread_start, 17);	// compute bit to clear for prime 17
	  }
	  i19 = mod_const_p (bit_to_clr[7] - thread_start, 19);	// compute bit to clear for prime 19
	  i23 = mod_const_p (bit_to_clr[8] - thread_start, 23);	// compute bit to clear for prime 23
	  i29 = mod_const_p (bit_to_clr[9] - thread_start, 29);	// compute bit to clear for prime 29
	  i31 = mod_const_p (bit_to_clr[10] - thread_start, 31);	// compute bit to clear for prime 31

	  if (primesNotSieved == 4) {	// Primes 2, 3, 5, 7 are not sieved
		  mask = (BITSLL11 << i11) | (BITSLL13 << i13) | (BITSLL17 << i17);
	  }
	  if (primesNotSieved == 5) {	// Primes 2, 3, 5, 7, 11 are not sieved
		  mask = (BITSLL13 << i13) | (BITSLL17 << i17);
	  }
	  if (primesNotSieved == 6) {	// Primes 2, 3, 5, 7, 11, 13 are not sieved
		  mask = BITSLL17 << i17;
	  }
	  mask |= (BITSLL19 << i19) | (BITSLL23 << i23);
	  mask |= (BITSLL29 << i29) | (BITSLL31 << i31);

#if (TRACE_SIEVE_KERNEL > 1)
    if (get_global_id(0) == TRACE_SIEVE_TID)
    printf((__constant char *)"SegSieve: mask=%#x,(%d, %d, %d, %d, %d, %d, %d), thread_start=%d\n", mask, i11, i13, i17, i19, i23, i29, i31, thread_start);
#endif

	  if (primesNotSieved == 4) {	// Primes 2, 3, 5, 7 are not sieved
		  i11 = bump_mod_p (i11, -32, 11);
		  i13 = bump_mod_p (i13, -32, 13);
		  i17 = bump_mod_p (i17, -32, 17);
	  }
	  if (primesNotSieved == 5) {	// Primes 2, 3, 5, 7, 11 are not sieved
		  i13 = bump_mod_p (i13, -32, 13);
		  i17 = bump_mod_p (i17, -32, 17);
	  }
	  if (primesNotSieved == 6) {	// Primes 2, 3, 5, 7, 11, 13 are not sieved
		  i17 = bump_mod_p (i17, -32, 17);
	  }
	  i19 = bump_mod_p (i19, -32, 19);
	  i23 = bump_mod_p (i23, -32, 23);
	  i29 = bump_mod_p (i29, -32, 29);
	  i31 = bump_mod_p (i31, -32, 31);

	  if (primesNotSieved == 4) {	// Primes 2, 3, 5, 7 are not sieved
		  mask2 = (BITSLL11 << i11) | (BITSLL13 << i13) | (BITSLL17 << i17);
	  }
	  if (primesNotSieved == 5) {	// Primes 2, 3, 5, 7, 11 are not sieved
		  mask2 = (BITSLL13 << i13) | (BITSLL17 << i17);
	  }
	  if (primesNotSieved == 6) {	// Primes 2, 3, 5, 7, 11, 13 are not sieved
		  mask2 = BITSLL17 << i17;
	  }
	  mask2 |= (BITSLL19 << i19) | (BITSLL23 << i23);
	  mask2 |= (BITSLL29 << i29) | (BITSLL31 << i31);

#if (TRACE_SIEVE_KERNEL > 1)
    if (get_global_id(0) == TRACE_SIEVE_TID)
    printf((__constant char *)"SegSieve: mask2=%#x,(%d, %d, %d, %d, %d, %d, %d), thread_start=%d\n", mask2, i11, i13, i17, i19, i23, i29, i31, thread_start);
#endif

	  if (primesNotSieved == 4) {	// Primes 2, 3, 5, 7 are not sieved
		  i11 = bump_mod_p (i11, -32, 11);
		  i13 = bump_mod_p (i13, -32, 13);
		  i17 = bump_mod_p (i17, -32, 17);
	  }
	  if (primesNotSieved == 5) {	// Primes 2, 3, 5, 7, 11 are not sieved
		  i13 = bump_mod_p (i13, -32, 13);
		  i17 = bump_mod_p (i17, -32, 17);
	  }
	  if (primesNotSieved == 6) {	// Primes 2, 3, 5, 7, 11, 13 are not sieved
		  i17 = bump_mod_p (i17, -32, 17);
	  }
	  i19 = bump_mod_p (i19, -32, 19);
	  i23 = bump_mod_p (i23, -32, 23);
	  i29 = bump_mod_p (i29, -32, 29);
	  i31 = bump_mod_p (i31, -32, 31);

	  if (primesNotSieved == 4) {	// Primes 2, 3, 5, 7 are not sieved
		  mask3 = (BITSLL11 << i11) | (BITSLL13 << i13) | (BITSLL17 << i17);
	  }
	  if (primesNotSieved == 5) {	// Primes 2, 3, 5, 7, 11 are not sieved
		  mask3 = (BITSLL13 << i13) | (BITSLL17 << i17);
	  }
	  if (primesNotSieved == 6) {	// Primes 2, 3, 5, 7, 11, 13 are not sieved
		  mask3 = BITSLL17 << i17;
	  }
	  mask3 |= (BITSLL19 << i19) | (BITSLL23 << i23);
	  mask3 |= (BITSLL29 << i29) | (BITSLL31 << i31);

#if (TRACE_SIEVE_KERNEL > 1)
    if (get_global_id(0) == TRACE_SIEVE_TID)
    printf((__constant char *)"SegSieve: mask3=%#x,(%d, %d, %d, %d, %d, %d, %d), thread_start=%d\n", mask3, i11, i13, i17, i19, i23, i29, i31, thread_start);
#endif

    if (primesNotSieved == 4) {	// Primes 2, 3, 5, 7 are not sieved
		  i11 = bump_mod_p (i11, -32, 11);
		  i13 = bump_mod_p (i13, -32, 13);
		  i17 = bump_mod_p (i17, -32, 17);
	  }
	  if (primesNotSieved == 5) {	// Primes 2, 3, 5, 7, 11 are not sieved
		  i13 = bump_mod_p (i13, -32, 13);
		  i17 = bump_mod_p (i17, -32, 17);
	  }
	  if (primesNotSieved == 6) {	// Primes 2, 3, 5, 7, 11, 13 are not sieved
		  i17 = bump_mod_p (i17, -32, 17);
	  }
	  i19 = bump_mod_p (i19, -32, 19);
	  i23 = bump_mod_p (i23, -32, 23);
	  i29 = bump_mod_p (i29, -32, 29);
	  i31 = bump_mod_p (i31, -32, 31);

	  if (primesNotSieved == 4) {	// Primes 2, 3, 5, 7 are not sieved
		  mask4 = (BITSLL11 << i11) | (BITSLL13 << i13) | (BITSLL17 << i17);
	  }
	  if (primesNotSieved == 5) {	// Primes 2, 3, 5, 7, 11 are not sieved
		  mask4 = (BITSLL13 << i13) | (BITSLL17 << i17);
	  }
	  if (primesNotSieved == 6) {	// Primes 2, 3, 5, 7, 11, 13 are not sieved
		  mask4 = BITSLL17 << i17;
	  }
	  mask4 |= (BITSLL19 << i19) | (BITSLL23 << i23);
	  mask4 |= (BITSLL29 << i29) | (BITSLL31 << i31);

#if (TRACE_SIEVE_KERNEL > 1)
    if (get_global_id(0) == TRACE_SIEVE_TID)
    printf((__constant char *)"SegSieve: mask4=%#x,(%d, %d, %d, %d, %d, %d, %d), thread_start=%d\n", mask4, i11, i13, i17, i19, i23, i29, i31, thread_start);
#endif

	  if (primesNotSieved == 4) {	// Primes 2, 3, 5, 7 are not sieved
		  i11 = bump_mod_p (i11, -32, 11);
		  i13 = bump_mod_p (i13, -32, 13);
		  i17 = bump_mod_p (i17, -32, 17);
	  }
	  if (primesNotSieved == 5) {	// Primes 2, 3, 5, 7, 11 are not sieved
		  i13 = bump_mod_p (i13, -32, 13);
		  i17 = bump_mod_p (i17, -32, 17);
	  }
	  if (primesNotSieved == 6) {	// Primes 2, 3, 5, 7, 11, 13 are not sieved
		  i17 = bump_mod_p (i17, -32, 17);
	  }
	  i19 = bump_mod_p (i19, -32, 19);
	  i23 = bump_mod_p (i23, -32, 23);
	  i29 = bump_mod_p (i29, -32, 29);
	  i31 = bump_mod_p (i31, -32, 31);

	  locsieve32[get_local_id(0) * block_size / threadsPerBlock / 32 + 0] = mask;
	  locsieve32[get_local_id(0) * block_size / threadsPerBlock / 32 + 1] = mask2;
	  locsieve32[get_local_id(0) * block_size / threadsPerBlock / 32 + 2] = mask3;
	  locsieve32[get_local_id(0) * block_size / threadsPerBlock / 32 + 3] = mask4;

	  if (primesNotSieved == 4) {	// Primes 2, 3, 5, 7 are not sieved
		  mask = (BITSLL11 << i11) | (BITSLL13 << i13) | (BITSLL17 << i17);
	  }
	  if (primesNotSieved == 5) {	// Primes 2, 3, 5, 7, 11 are not sieved
		  mask = (BITSLL13 << i13) | (BITSLL17 << i17);
	  }
	  if (primesNotSieved == 6) {	// Primes 2, 3, 5, 7, 11, 13 are not sieved
		  mask = BITSLL17 << i17;
	  }
	  mask |= (BITSLL19 << i19) | (BITSLL23 << i23);
	  mask |= (BITSLL29 << i29) | (BITSLL31 << i31);

#if (TRACE_SIEVE_KERNEL > 1)
    if (get_global_id(0) == TRACE_SIEVE_TID)
    printf((__constant char *)"SegSieve: mask=%#x,(%d, %d, %d, %d, %d, %d, %d), thread_start=%d\n", mask, i11, i13, i17, i19, i23, i29, i31, thread_start);
#endif

	  if (primesNotSieved == 4) {	// Primes 2, 3, 5, 7 are not sieved
		  i11 = bump_mod_p (i11, -32, 11);
		  i13 = bump_mod_p (i13, -32, 13);
		  i17 = bump_mod_p (i17, -32, 17);
	  }
	  if (primesNotSieved == 5) {	// Primes 2, 3, 5, 7, 11 are not sieved
		  i13 = bump_mod_p (i13, -32, 13);
		  i17 = bump_mod_p (i17, -32, 17);
	  }
	  if (primesNotSieved == 6) {	// Primes 2, 3, 5, 7, 11, 13 are not sieved
		  i17 = bump_mod_p (i17, -32, 17);
	  }
	  i19 = bump_mod_p (i19, -32, 19);
	  i23 = bump_mod_p (i23, -32, 23);
	  i29 = bump_mod_p (i29, -32, 29);
	  i31 = bump_mod_p (i31, -32, 31);

	  if (primesNotSieved == 4) {	// Primes 2, 3, 5, 7 are not sieved
		  mask2 = (BITSLL11 << i11) | (BITSLL13 << i13) | (BITSLL17 << i17);
	  }
	  if (primesNotSieved == 5) {	// Primes 2, 3, 5, 7, 11 are not sieved
		  mask2 = (BITSLL13 << i13) | (BITSLL17 << i17);
	  }
	  if (primesNotSieved == 6) {	// Primes 2, 3, 5, 7, 11, 13 are not sieved
		  mask2 = BITSLL17 << i17;
	  }
	  mask2 |= (BITSLL19 << i19) | (BITSLL23 << i23);
	  mask2 |= (BITSLL29 << i29) | (BITSLL31 << i31);

#if (TRACE_SIEVE_KERNEL > 1)
    if (get_global_id(0) == TRACE_SIEVE_TID)
    printf((__constant char *)"SegSieve: mask2=%#x,(%d, %d, %d, %d, %d, %d, %d), thread_start=%d\n", mask2, i11, i13, i17, i19, i23, i29, i31, thread_start);
#endif

	  if (primesNotSieved == 4) {	// Primes 2, 3, 5, 7 are not sieved
		  i11 = bump_mod_p (i11, -32, 11);
		  i13 = bump_mod_p (i13, -32, 13);
		  i17 = bump_mod_p (i17, -32, 17);
	  }
	  if (primesNotSieved == 5) {	// Primes 2, 3, 5, 7, 11 are not sieved
		  i13 = bump_mod_p (i13, -32, 13);
		  i17 = bump_mod_p (i17, -32, 17);
	  }
	  if (primesNotSieved == 6) {	// Primes 2, 3, 5, 7, 11, 13 are not sieved
		  i17 = bump_mod_p (i17, -32, 17);
	  }
	  i19 = bump_mod_p (i19, -32, 19);
	  i23 = bump_mod_p (i23, -32, 23);
	  i29 = bump_mod_p (i29, -32, 29);
	  i31 = bump_mod_p (i31, -32, 31);

	  if (primesNotSieved == 4) {	// Primes 2, 3, 5, 7 are not sieved
		  mask3 = (BITSLL11 << i11) | (BITSLL13 << i13) | (BITSLL17 << i17);
	  }
	  if (primesNotSieved == 5) {	// Primes 2, 3, 5, 7, 11 are not sieved
		  mask3 = (BITSLL13 << i13) | (BITSLL17 << i17);
	  }
	  if (primesNotSieved == 6) {	// Primes 2, 3, 5, 7, 11, 13 are not sieved
		  mask3 = BITSLL17 << i17;
	  }
	  mask3 |= (BITSLL19 << i19) | (BITSLL23 << i23);
	  mask3 |= (BITSLL29 << i29) | (BITSLL31 << i31);

#if (TRACE_SIEVE_KERNEL > 1)
    if (get_global_id(0) == TRACE_SIEVE_TID)
    printf((__constant char *)"SegSieve: mask3=%#x,(%d, %d, %d, %d, %d, %d, %d), thread_start=%d\n", mask3, i11, i13, i17, i19, i23, i29, i31, thread_start);
#endif

	  if (primesNotSieved == 4) {	// Primes 2, 3, 5, 7 are not sieved
		  i11 = bump_mod_p (i11, -32, 11);
		  i13 = bump_mod_p (i13, -32, 13);
		  i17 = bump_mod_p (i17, -32, 17);
	  }
	  if (primesNotSieved == 5) {	// Primes 2, 3, 5, 7, 11 are not sieved
		  i13 = bump_mod_p (i13, -32, 13);
		  i17 = bump_mod_p (i17, -32, 17);
	  }
	  if (primesNotSieved == 6) {	// Primes 2, 3, 5, 7, 11, 13 are not sieved
		  i17 = bump_mod_p (i17, -32, 17);
	  }
	  i19 = bump_mod_p (i19, -32, 19);
	  i23 = bump_mod_p (i23, -32, 23);
	  i29 = bump_mod_p (i29, -32, 29);
	  i31 = bump_mod_p (i31, -32, 31);

	  if (primesNotSieved == 4) {	// Primes 2, 3, 5, 7 are not sieved
		  mask4 = (BITSLL11 << i11) | (BITSLL13 << i13) | (BITSLL17 << i17);
	  }
	  if (primesNotSieved == 5) {	// Primes 2, 3, 5, 7, 11 are not sieved
		  mask4 = (BITSLL13 << i13) | (BITSLL17 << i17);
	  }
	  if (primesNotSieved == 6) {	// Primes 2, 3, 5, 7, 11, 13 are not sieved
		  mask4 = BITSLL17 << i17;
	  }
	  mask4 |= (BITSLL19 << i19) | (BITSLL23 << i23);
	  mask4 |= (BITSLL29 << i29) | (BITSLL31 << i31);

#if (TRACE_SIEVE_KERNEL > 1)
    if (get_global_id(0) == TRACE_SIEVE_TID)
    printf((__constant char *)"SegSieve: mask4=%#x,(%d, %d, %d, %d, %d, %d, %d), thread_start=%d\n", mask4, i11, i13, i17, i19, i23, i29, i31, thread_start);
#endif

	  locsieve32[get_local_id(0) * block_size / threadsPerBlock / 32 + 4] = mask;
	  locsieve32[get_local_id(0) * block_size / threadsPerBlock / 32 + 5] = mask2;
	  locsieve32[get_local_id(0) * block_size / threadsPerBlock / 32 + 6] = mask3;
	  locsieve32[get_local_id(0) * block_size / threadsPerBlock / 32 + 7] = mask4;

#if (TRACE_SIEVE_KERNEL > 0)
    if (get_global_id(0) == TRACE_SIEVE_TID)
    printf((__constant char *)"SegSieve: locsieve=[%#x, %#x, %#x, %#x, %#x, %#x, %#x, %#x, ...] at i29\n",
       locsieve32[0], locsieve32[1], locsieve32[2], locsieve32[3], locsieve32[4], locsieve32[5], locsieve32[6], locsieve32[7]);
#endif

	  // The following handles primes, 32 < p < 64.  Each prime hits 0 or 1 32-bit words.

	  i37 = mod_const_p (bit_to_clr[11] - thread_start, 37);	// compute bit to clear for prime 37
	  i41 = mod_const_p (bit_to_clr[12] - thread_start, 41);	// compute bit to clear for prime 41
	  i43 = mod_const_p (bit_to_clr[13] - thread_start, 43);	// compute bit to clear for prime 43
	  i47 = mod_const_p (bit_to_clr[14] - thread_start, 47);	// compute bit to clear for prime 47
	  i53 = mod_const_p (bit_to_clr[15] - thread_start, 53);	// compute bit to clear for prime 53
	  i59 = mod_const_p (bit_to_clr[16] - thread_start, 59);	// compute bit to clear for prime 59
	  i61 = mod_const_p (bit_to_clr[17] - thread_start, 61);	// compute bit to clear for prime 61

	  for (j = 0; ; )
    {
      // three block doing the same for performance tests.
      /*
      // this one ist close to the fastest on VLIW5
      mask  = two_pow_n_32[i37];
      mask |= two_pow_n_32[i41];
      mask |= two_pow_n_32[i43];
      mask |= two_pow_n_32[i47];
      mask |= two_pow_n_32[i53];
      mask |= two_pow_n_32[i59];
      mask |= two_pow_n_32[i61];
      */

      /*
      // slow on VLIW5
      mask = (i37 > 31) << i37;
      mask |= ((i41 > 31) << i41) | ((i43 > 31) << i43);
      mask |= ((i47 > 31) << i47) | ((i53 > 31) << i53);
      mask |= ((i59 > 31) << i59) | ((i61 > 31) << i61);
      */

      // this one ist fastest on VLIW5
      mask = i37 > 31 ? 0 : (1 << i37);
      mask |= (i41 > 31 ? 0 : (1 << i41)) | (i43 > 31 ? 0 : (1 << i43));
      mask |= (i47 > 31 ? 0 : (1 << i47)) | (i53 > 31 ? 0 : (1 << i53));
      mask |= (i59 > 31 ? 0 : (1 << i59)) | (i61 > 31 ? 0 : (1 << i61));

		  locsieve32[get_local_id(0) * block_size / threadsPerBlock / 32 + j] |= mask;

		  j++;
		  if (j >= block_size / threadsPerBlock / 32) break;

		  i37 = bump_mod_p (i37, -32, 37);
		  i41 = bump_mod_p (i41, -32, 41);
		  i43 = bump_mod_p (i43, -32, 43);
		  i47 = bump_mod_p (i47, -32, 47);
		  i53 = bump_mod_p (i53, -32, 53);
		  i59 = bump_mod_p (i59, -32, 59);
		  i61 = bump_mod_p (i61, -32, 61);
	  }
	}

#if (TRACE_SIEVE_KERNEL > 0)
    if (get_global_id(0) == TRACE_SIEVE_TID)
    printf((__constant char *)"SegSieve: locsieve=[%#x, %#x, %#x, %#x, %#x, %#x, %#x, %#x, ...] at i61\n",
       locsieve32[0], locsieve32[1], locsieve32[2], locsieve32[3], locsieve32[4], locsieve32[5], locsieve32[6], locsieve32[7]);
#endif

	// The following handles primes 64 < p < 128.
	// Each thread handles one 64-bit word of the 256-bit section of shared memory.
	// Each prime will hit a 64-bit word zero or one time.

	if (primesNotSieved + primesHandledWithSpecialCode > 18)
  {
	  uint i67, i71, i73, i79, i83, i89, i97, i101, i103, i107, i109, i113, i127;
	  ulong	mask;

	  i67 = mod_const_p (bit_to_clr[18] - thread_start, 67);	// compute bit to clear for prime 67
	  i71 = mod_const_p (bit_to_clr[19] - thread_start, 71);	// compute bit to clear for prime 71
	  i73 = mod_const_p (bit_to_clr[20] - thread_start, 73);	// compute bit to clear for prime 73
	  i79 = mod_const_p (bit_to_clr[21] - thread_start, 79);	// compute bit to clear for prime 79
	  i83 = mod_const_p (bit_to_clr[22] - thread_start, 83);	// compute bit to clear for prime 83
	  i89 = mod_const_p (bit_to_clr[23] - thread_start, 89);	// compute bit to clear for prime 89
	  i97 = mod_const_p (bit_to_clr[24] - thread_start, 97);	// compute bit to clear for prime 97

	  for (j = 0; ; )
    {
		  mask = i67 > 63 ? 0 : ((ulong) 1 << i67);
			mask |= (i71 > 63 ? 0 : ((ulong) 1 << i71));
      mask |= (i73 > 63 ? 0 : ((ulong) 1 << i73));
	    mask |= (i79 > 63 ? 0 : ((ulong) 1 << i79));
      mask |= (i83 > 63 ? 0 : ((ulong) 1 << i83));
		  mask |= (i89 > 63 ? 0 : ((ulong) 1 << i89));
      mask |= (i97 > 63 ? 0 : ((ulong) 1 << i97));

#if (TRACE_SIEVE_KERNEL > 1)
    if (get_global_id(0) == TRACE_SIEVE_TID)
    printf((__constant char *)"SegSieve: i67=%d, i71=%d, i73=%d, i79=%d, i83=%d, i89=%d, i97=%d, mask=%#llx\n",
       i67, i71, i73, i79, i83, i89, i97, mask);
#endif

		  locsieve64[get_local_id(0) * block_size / threadsPerBlock / 64 + j] |= mask;

		  j++;
		  if (j >= block_size / threadsPerBlock / 64) break;

		  i67 = bump_mod_p (i67, -64, 67);
		  i71 = bump_mod_p (i71, -64, 71);
		  i73 = bump_mod_p (i73, -64, 73);
		  i79 = bump_mod_p (i79, -64, 79);
		  i83 = bump_mod_p (i83, -64, 83);
		  i89 = bump_mod_p (i89, -64, 89);
		  i97 = bump_mod_p (i97, -64, 97);
	  }

#if (TRACE_SIEVE_KERNEL > 0)
    if (get_global_id(0) == TRACE_SIEVE_TID)
    printf((__constant char *)"SegSieve: locsieve=[%#x, %#x, %#x, %#x, %#x, %#x, %#x, %#x, ...] at i97\n",
       locsieve32[0], locsieve32[1], locsieve32[2], locsieve32[3], locsieve32[4], locsieve32[5], locsieve32[6], locsieve32[7]);
#endif

	  i101 = mod_const_p (bit_to_clr[25] - thread_start, 101);	// compute bit to clear for prime 101
	  i103 = mod_const_p (bit_to_clr[26] - thread_start, 103);	// compute bit to clear for prime 103
	  i107 = mod_const_p (bit_to_clr[27] - thread_start, 107);	// compute bit to clear for prime 107
	  i109 = mod_const_p (bit_to_clr[28] - thread_start, 109);	// compute bit to clear for prime 109
	  i113 = mod_const_p (bit_to_clr[29] - thread_start, 113);	// compute bit to clear for prime 113
	  i127 = mod_const_p (bit_to_clr[30] - thread_start, 127);	// compute bit to clear for prime 127

	  for (j = 0; ; )
    {
		  mask = i101 > 63 ? 0 : ((ulong) 1 << i101);
		  mask |= (i103 > 63 ? 0 : ((ulong) 1 << i103)) | (i107 > 63 ? 0 : ((ulong) 1 << i107));
		  mask |= (i109 > 63 ? 0 : ((ulong) 1 << i109)) | (i113 > 63 ? 0 : ((ulong) 1 << i113));
		  mask |= i127 > 63 ? 0 : ((ulong) 1 << i127);

#if (TRACE_SIEVE_KERNEL > 1)
    if (get_global_id(0) == TRACE_SIEVE_TID)
    printf((__constant char *)"SegSieve: i101=%d, i103=%d, i107=%d, i109=%d, i113=%d, i127=%d, mask=%#llx\n",
       i101, i103, i107, i109, i113, i127, mask);
#endif

		  locsieve64[get_local_id(0) * block_size / threadsPerBlock / 64 + j] |= mask;

		  j++;
		  if (j >= block_size / threadsPerBlock / 64) break;

		  i101 = bump_mod_p (i101, -64, 101);
		  i103 = bump_mod_p (i103, -64, 103);
		  i107 = bump_mod_p (i107, -64, 107);
		  i109 = bump_mod_p (i109, -64, 109);
		  i113 = bump_mod_p (i113, -64, 113);
		  i127 = bump_mod_p (i127, -64, 127);
	  }
	}

#if (TRACE_SIEVE_KERNEL > 0)
    if (get_global_id(0) == TRACE_SIEVE_TID)
    printf((__constant char *)"SegSieve: locsieve=[%#x, %#x, %#x, %#x, %#x, %#x, %#x, %#x, ...] at i127\n",
       locsieve32[0], locsieve32[1], locsieve32[2], locsieve32[3], locsieve32[4], locsieve32[5], locsieve32[6], locsieve32[7]);
#endif

	// The following handles primes 128 < p < 256.
	// Each thread handles one 128-bit word of the 256-bit section of shared memory.
	// Each prime will hit a 128-bit word zero or one time.

	if (primesNotSieved + primesHandledWithSpecialCode > 31)
  {
    uint 	i131, i137, i139, i149, i151, i157, i163, i167, i173, i179, i181, i191;
    uint 	i193, i197, i199, i211, i223, i227, i229, i233, i239, i241, i251;
    ulong	mask1, mask2;

    i131 = mod_const_p (bit_to_clr[31] - thread_start, 131);	// compute bit to clear for prime 131
    i137 = mod_const_p (bit_to_clr[32] - thread_start, 137);	// compute bit to clear for prime 137
    i139 = mod_const_p (bit_to_clr[33] - thread_start, 139);	// compute bit to clear for prime 139
    i149 = mod_const_p (bit_to_clr[34] - thread_start, 149);	// compute bit to clear for prime 149
    i151 = mod_const_p (bit_to_clr[35] - thread_start, 151);	// compute bit to clear for prime 151
    i157 = mod_const_p (bit_to_clr[36] - thread_start, 157);	// compute bit to clear for prime 157

    for (j = 0; ; )
    {
      mask1  = (i131 > 63 ? 0 : ((ulong) 1 << i131)) | (i137 > 63 ? 0 : ((ulong) 1 << i137));
      mask1 |= (i139 > 63 ? 0 : ((ulong) 1 << i139)) | (i149 > 63 ? 0 : ((ulong) 1 << i149));
      mask1 |= (i151 > 63 ? 0 : ((ulong) 1 << i151)) | (i157 > 63 ? 0 : ((ulong) 1 << i157));
      // "negative" uint will be much larger than 63, therefore, if i was < 63, the result will be 0
      mask2  = ((i131 - 64) > 63 ? 0 : ((ulong) 1 << (i131 - 64))) | ((i137 - 64) > 63 ? 0 : ((ulong) 1 << (i137 - 64)));
      mask2 |= ((i139 - 64) > 63 ? 0 : ((ulong) 1 << (i139 - 64))) | ((i149 - 64) > 63 ? 0 : ((ulong) 1 << (i149 - 64)));
      mask2 |= ((i151 - 64) > 63 ? 0 : ((ulong) 1 << (i151 - 64))) | ((i157 - 64) > 63 ? 0 : ((ulong) 1 << (i157 - 64)));

      locsieve64[get_local_id(0) * block_size / threadsPerBlock / 64 + j * 2] |= mask1;
      locsieve64[get_local_id(0) * block_size / threadsPerBlock / 64 + j * 2 + 1] |= mask2;

      j++;
      if (j >= block_size / threadsPerBlock / 128) break;

      i131 = bump_mod_p (i131, -128, 131);
      i137 = bump_mod_p (i137, -128, 137);
      i139 = bump_mod_p (i139, -128, 139);
      i149 = bump_mod_p (i149, -128, 149);
      i151 = bump_mod_p (i151, -128, 151);
      i157 = bump_mod_p (i157, -128, 157);
    }

#if (TRACE_SIEVE_KERNEL > 0)
    if (get_global_id(0) == TRACE_SIEVE_TID)
    printf((__constant char *)"SegSieve: locsieve=[%#x, %#x, %#x, %#x, %#x, %#x, %#x, %#x, ...] at i157\n",
       locsieve32[0], locsieve32[1], locsieve32[2], locsieve32[3], locsieve32[4], locsieve32[5], locsieve32[6], locsieve32[7]);
#endif

    i163 = mod_const_p (bit_to_clr[37] - thread_start, 163);	// compute bit to clear for prime 163
    i167 = mod_const_p (bit_to_clr[38] - thread_start, 167);	// compute bit to clear for prime 167
    i173 = mod_const_p (bit_to_clr[39] - thread_start, 173);	// compute bit to clear for prime 173
    i179 = mod_const_p (bit_to_clr[40] - thread_start, 179);	// compute bit to clear for prime 179
    i181 = mod_const_p (bit_to_clr[41] - thread_start, 181);	// compute bit to clear for prime 181
    i191 = mod_const_p (bit_to_clr[42] - thread_start, 191);	// compute bit to clear for prime 191

    for (j = 0; ; )
    {
      mask1 = (i163 > 63 ? 0 : ((ulong) 1 << i163)) | (i167 > 63 ? 0 : ((ulong) 1 << i167));
      mask1 |= (i173 > 63 ? 0 : ((ulong) 1 << i173)) | (i179 > 63 ? 0 : ((ulong) 1 << i179));
      mask1 |= (i181 > 63 ? 0 : ((ulong) 1 << i181)) | (i191 > 63 ? 0 : ((ulong) 1 << i191));
      mask2 = (i163 - 64 > 63 ? 0 : ((ulong) 1 << (i163 - 64))) | (i167 - 64 > 63 ? 0 : ((ulong) 1 << (i167 - 64)));
      mask2 |= (i173 - 64 > 63 ? 0 : ((ulong) 1 << (i173 - 64))) | (i179 - 64 > 63 ? 0 : ((ulong) 1 << (i179 - 64)));
      mask2 |= (i181 - 64 > 63 ? 0 : ((ulong) 1 << (i181 - 64))) | (i191 - 64 > 63 ? 0 : ((ulong) 1 << (i191 - 64)));

      locsieve64[get_local_id(0) * block_size / threadsPerBlock / 64 + j * 2] |= mask1;
      locsieve64[get_local_id(0) * block_size / threadsPerBlock / 64 + j * 2 + 1] |= mask2;

      j++;
      if (j >= block_size / threadsPerBlock / 128) break;

      i163 = bump_mod_p (i163, -128, 163);
      i167 = bump_mod_p (i167, -128, 167);
      i173 = bump_mod_p (i173, -128, 173);
      i179 = bump_mod_p (i179, -128, 179);
      i181 = bump_mod_p (i181, -128, 181);
      i191 = bump_mod_p (i191, -128, 191);
    }

#if (TRACE_SIEVE_KERNEL > 0)
    if (get_global_id(0) == TRACE_SIEVE_TID)
    printf((__constant char *)"SegSieve: locsieve=[%#x, %#x, %#x, %#x, %#x, %#x, %#x, %#x, ...] at i191\n",
       locsieve32[0], locsieve32[1], locsieve32[2], locsieve32[3], locsieve32[4], locsieve32[5], locsieve32[6], locsieve32[7]);
#endif

    i193 = mod_const_p (bit_to_clr[43] - thread_start, 193);	// compute bit to clear for prime 193
    i197 = mod_const_p (bit_to_clr[44] - thread_start, 197);	// compute bit to clear for prime 197
    i199 = mod_const_p (bit_to_clr[45] - thread_start, 199);	// compute bit to clear for prime 199
    i211 = mod_const_p (bit_to_clr[46] - thread_start, 211);	// compute bit to clear for prime 211
    i223 = mod_const_p (bit_to_clr[47] - thread_start, 223);	// compute bit to clear for prime 223
    i227 = mod_const_p (bit_to_clr[48] - thread_start, 227);	// compute bit to clear for prime 227

    for (j = 0; ; )
    {
      mask1 = (i193 > 63 ? 0 : ((ulong) 1 << i193)) | (i197 > 63 ? 0 : ((ulong) 1 << i197));
      mask1 |= (i199 > 63 ? 0 : ((ulong) 1 << i199)) | (i211 > 63 ? 0 : ((ulong) 1 << i211));
      mask1 |= (i223 > 63 ? 0 : ((ulong) 1 << i223)) | (i227 > 63 ? 0 : ((ulong) 1 << i227));
      mask2 = (i193 - 64 > 63 ? 0 : ((ulong) 1 << (i193 - 64))) | (i197 - 64 > 63 ? 0 : ((ulong) 1 << (i197 - 64)));
      mask2 |= (i199 - 64 > 63 ? 0 : ((ulong) 1 << (i199 - 64))) | (i211 - 64 > 63 ? 0 : ((ulong) 1 << (i211 - 64)));
      mask2 |= (i223 - 64 > 63 ? 0 : ((ulong) 1 << (i223 - 64))) | (i227 - 64 > 63 ? 0 : ((ulong) 1 << (i227 - 64)));

      locsieve64[get_local_id(0) * block_size / threadsPerBlock / 64 + j * 2] |= mask1;
      locsieve64[get_local_id(0) * block_size / threadsPerBlock / 64 + j * 2 + 1] |= mask2;

#if (TRACE_SIEVE_KERNEL > 1)
    if (get_global_id(0) == TRACE_SIEVE_TID)
    printf((__constant char *)"SegSieve: i193=%d, i197=%d, i199=%d, i211=%d, i223=%d, i227=%d, mask1=%#llx, mask2=%#llx\n",
       i193, i197, i199, i211, i223, i227, mask1, mask2);
#endif

      j++;
      if (j >= block_size / threadsPerBlock / 128) break;

      i193 = bump_mod_p (i193, -128, 193);
      i197 = bump_mod_p (i197, -128, 197);
      i199 = bump_mod_p (i199, -128, 199);
      i211 = bump_mod_p (i211, -128, 211);
      i223 = bump_mod_p (i223, -128, 223);
      i227 = bump_mod_p (i227, -128, 227);
    }

#if (TRACE_SIEVE_KERNEL > 0)
    if (get_global_id(0) == TRACE_SIEVE_TID)
    printf((__constant char *)"SegSieve: locsieve=[%#x, %#x, %#x, %#x, %#x, %#x, %#x, %#x, ...] at i227\n",
       locsieve32[0], locsieve32[1], locsieve32[2], locsieve32[3], locsieve32[4], locsieve32[5], locsieve32[6], locsieve32[7]);
#endif

    i229 = mod_const_p (bit_to_clr[49] - thread_start, 229);	// compute bit to clear for prime 229
    i233 = mod_const_p (bit_to_clr[50] - thread_start, 233);	// compute bit to clear for prime 233
    i239 = mod_const_p (bit_to_clr[51] - thread_start, 239);	// compute bit to clear for prime 239
    i241 = mod_const_p (bit_to_clr[52] - thread_start, 241);	// compute bit to clear for prime 241
    i251 = mod_const_p (bit_to_clr[53] - thread_start, 251);	// compute bit to clear for prime 251

    for (j = 0; ; )
    {
      mask1  = i229 > 63 ? 0 : ((ulong) 1 << i229);
      mask1 |= (i233 > 63 ? 0 : ((ulong) 1 << i233)) | (i239 > 63 ? 0 : ((ulong) 1 << i239));
      mask1 |= (i241 > 63 ? 0 : ((ulong) 1 << i241)) | (i251 > 63 ? 0 : ((ulong) 1 << i251));
      mask2  = i229 - 64 > 63 ? 0 : ((ulong) 1 << (i229 - 64));
      mask2 |= (i233 - 64 > 63 ? 0 : ((ulong) 1 << (i233 - 64))) | (i239 - 64 > 63 ? 0 : ((ulong) 1 << (i239 - 64)));
      mask2 |= (i241 - 64 > 63 ? 0 : ((ulong) 1 << (i241 - 64)));
      mask2 |= (i251 - 64 > 63 ? 0 : ((ulong) 1 << (i251 - 64)));

      locsieve64[get_local_id(0) * block_size / threadsPerBlock / 64 + j * 2] |= mask1;
      locsieve64[get_local_id(0) * block_size / threadsPerBlock / 64 + j * 2 + 1] |= mask2;

#if (TRACE_SIEVE_KERNEL > 1)
    if (get_global_id(0) == TRACE_SIEVE_TID)
    printf((__constant char *)"SegSieve: i229=%d, i233=%d, i239=%d, i241=%d, i251=%d, mask1=%#llx, mask2=%#llx\n",
       i229, i233, i239, i241, i251, mask1, mask2);
#endif

      j++;
      if (j >= block_size / threadsPerBlock / 128) break;

      i229 = bump_mod_p (i229, -128, 229);
      i233 = bump_mod_p (i233, -128, 233);
      i239 = bump_mod_p (i239, -128, 239);
      i241 = bump_mod_p (i241, -128, 241);
      i251 = bump_mod_p (i251, -128, 251);
    }
	}

#if (TRACE_SIEVE_KERNEL > 0)
    if (get_global_id(0) == TRACE_SIEVE_TID)
    printf((__constant char *)"SegSieve: locsieve=[%#x, %#x, %#x, %#x, %#x, %#x, %#x, %#x, ...] at i251\n",
       locsieve32[0], locsieve32[1], locsieve32[2], locsieve32[3], locsieve32[4], locsieve32[5], locsieve32[6], locsieve32[7]);
#endif

	// The following handles primes 256 < p < 512.
	// Each thread handles one 256-bit word of the 256-bit section of shared memory.
	// Each prime will hit a 256-bit word zero or one time.

#define SIEVE_256_BIT(n,p)	i = mod_const_p (bit_to_clr[n] - thread_start, p); \
				if (i < 256) locsieve[j * threadsPerBlock * 32 + get_local_id(0) * 32 + (i >> 3)] |= 1 << (i & 7);

	if (primesNotSieved + primesHandledWithSpecialCode > 54)
	for (j = 0; j < block_size / (threadsPerBlock * 256); j++) {
		SIEVE_256_BIT (54, 257);
		SIEVE_256_BIT (55, 263);
		SIEVE_256_BIT (56, 269);
		SIEVE_256_BIT (57, 271);
		SIEVE_256_BIT (58, 277);
		SIEVE_256_BIT (59, 281);
		SIEVE_256_BIT (60, 283);
		SIEVE_256_BIT (61, 293);
		SIEVE_256_BIT (62, 307);
		SIEVE_256_BIT (63, 311);
		SIEVE_256_BIT (64, 313);
		SIEVE_256_BIT (65, 317);
		SIEVE_256_BIT (66, 331);
		SIEVE_256_BIT (67, 337);
		SIEVE_256_BIT (68, 347);
		SIEVE_256_BIT (69, 349);
		SIEVE_256_BIT (70, 353);
		SIEVE_256_BIT (71, 359);
		SIEVE_256_BIT (72, 367);
		SIEVE_256_BIT (73, 373);
		SIEVE_256_BIT (74, 379);
		SIEVE_256_BIT (75, 383);
		SIEVE_256_BIT (76, 389);
		SIEVE_256_BIT (77, 397);
		SIEVE_256_BIT (78, 401);
		SIEVE_256_BIT (79, 409);
		SIEVE_256_BIT (80, 419);
		SIEVE_256_BIT (81, 421);
		SIEVE_256_BIT (82, 431);
		SIEVE_256_BIT (83, 433);
		SIEVE_256_BIT (84, 439);
		SIEVE_256_BIT (85, 443);
		SIEVE_256_BIT (86, 449);
		SIEVE_256_BIT (87, 457);
		SIEVE_256_BIT (88, 461);
		SIEVE_256_BIT (89, 463);
		SIEVE_256_BIT (90, 467);
		SIEVE_256_BIT (91, 479);
		SIEVE_256_BIT (92, 487);
		SIEVE_256_BIT (93, 491);
		SIEVE_256_BIT (94, 499);
		SIEVE_256_BIT (95, 503);
		SIEVE_256_BIT (96, 509);
	}

#undef bit_to_clr

	// sync before sieving more primes
  // Is this really needed? And does a local mem_fence suffice?
	barrier(CLK_LOCAL_MEM_FENCE);

	// Bump the bit_to_clr_dev pointer to a 256-byte boundary so that warps access
	// memory without crossing memory block boundaries.

	pinfo_dev += PINFO_PAD1;

#if (TRACE_SIEVE_KERNEL > 0)
    if (get_global_id(0) == TRACE_SIEVE_TID)
    printf((__constant char *)"SegSieve: locsieve=[%#x, %#x, %#x, %#x, %#x, %#x, %#x, %#x, ...] at i509\n",
       locsieve32[0], locsieve32[1], locsieve32[2], locsieve32[3], locsieve32[4], locsieve32[5], locsieve32[6], locsieve32[7]);
#endif

	// Sieve the first row or two of primes (we could do more but it wasn't helpful) using 8 threads to process each prime.
	// We do this to reduce masking calculations as well as to hopefully reduce
	// shared memory conflicts (we are at least guaranteed the 8 threads processing
	// a single prime will not conflict).  However, we have 8 times as many bclr calculations.
	// Our memory layout here is 16-bits for p, 16-bits for bit-to-clr,
	// 32-bits for pinv (a total of 8 bytes per prime).

	i = 0;
	for ( ; i < 1 && i < maxp; i++, pinfo_dev += threadsPerBlock * 8) {
		for (j = 0; j < 8; j++) {
			uchar	mask;

			bclr = pinfo32[j * threadsPerBlock / 8 + get_local_id(0) / 8];	// Read p and the bit_to_clear in one instruction
			p = bclr >> 16;
			bclr &= 0xFFFF;
			pinv = pinfo32[threadsPerBlock + j * threadsPerBlock / 8 + get_local_id(0) / 8];
			validate_bclr (bclr, p);

			bclr = mod_p (bclr - block_start, p, pinv) + (get_local_id(0) & 7) * p;
			mask = 1 << (bclr & 7);
			bclr = bclr >> 3;

			// Clear bits
			do {
				uchar val = locsieve8[bclr];
				if (! (val & mask)) locsieve8[bclr] = val | mask;
				bclr += p;
			} while (bclr < block_size_in_bytes);
		}
	}

#if (TRACE_SIEVE_KERNEL > 0)
    barrier(CLK_LOCAL_MEM_FENCE);
    if (get_global_id(0) == TRACE_SIEVE_TID)
    printf((__constant char *)"SegSieve: locsieve=[%#x, %#x, %#x, %#x, %#x, %#x, %#x, %#x, ...] at #1\n",
       locsieve32[0], locsieve32[1], locsieve32[2], locsieve32[3], locsieve32[4], locsieve32[5], locsieve32[6], locsieve32[7]);
#endif

	// Sieve the primes below 64K (there are 6542 primes below 64K)
	// Our memory layout here is 16-bits for p, 16-bits for bit-to-clr,
	// 32-bits for pinv (a total of 8 bytes per prime).

	for ( ; i < sieving64KCrossover && i < maxp; i += 3, pinfo_dev += threadsPerBlock * 24) {
		uint	p3, pinv3, bclr3, p2, pinv2, bclr2;

		bclr3 = pinfo32[get_local_id(0)];		// Read p and the bit_to_clear in one instruction
		bclr2 = pinfo32[threadsPerBlock*2 + get_local_id(0)];
		bclr = pinfo32[threadsPerBlock*4 + get_local_id(0)];

		p3 = bclr3 >> 16;
		p2 = bclr2 >> 16;
		p = bclr >> 16;

		bclr3 &= 0xFFFF;
		bclr2 &= 0xFFFF;
		bclr &= 0xFFFF;

		validate_bclr (bclr3, p3);
		validate_bclr (bclr2, p2);
		validate_bclr (bclr, p);

		pinv3 = pinfo32[threadsPerBlock + get_local_id(0)];
		pinv2 = pinfo32[threadsPerBlock*3 + get_local_id(0)];
		pinv = pinfo32[threadsPerBlock*5 + get_local_id(0)];

		bclr3 = mod_p (bclr3 - block_start, p3, pinv3);
		bclr2 = mod_p (bclr2 - block_start, p2, pinv2);
		bclr = mod_p (bclr - block_start, p, pinv);

		// Clear bits (assumes 64K bitmap)
		do {
			bitOrSometimesIffy (locsieve, bclr3);
			bclr3 += p3;
		} while (bclr3 < block_size);
		do {
			bitOrSometimesIffy (locsieve, bclr2);
			bclr2 += p2;
		} while (bclr2 < block_size);
		do {
			bitOrSometimesIffy (locsieve, bclr);
			bclr += p;
		} while (bclr < block_size);
	}

#if (TRACE_SIEVE_KERNEL > 0)
    barrier(CLK_LOCAL_MEM_FENCE);
    if (get_global_id(0) == TRACE_SIEVE_TID)
    printf((__constant char *)"SegSieve: locsieve=[%#x, %#x, %#x, %#x, %#x, %#x, %#x, %#x, ...] at #2\n",
       locsieve32[0], locsieve32[1], locsieve32[2], locsieve32[3], locsieve32[4], locsieve32[5], locsieve32[6], locsieve32[7]);
#endif

	// We need one transitional loop to crossover the 64K boundary.  This will get us to the point
	// where all remaining primes to sieve are above 64K.
	// We need one more transitional loop to switch to a memory layout that lets us cram all needed info in 32-bits.
	// Our memory layout here is 32-bits for bit-to-clr, 32-bits for p, 32-bits for pinv.

	if (i < maxp) {
		uint	bclr2, pinv2, p2;

		bclr2 = pinfo32[get_local_id(0)];
		pinv2 = pinfo32[threadsPerBlock + get_local_id(0)];
		p2 = pinfo32[threadsPerBlock * 2 + get_local_id(0)];
		validate_bclr (bclr2, p2);

		bclr2 = mod_p (bclr2 - block_start, p2, pinv2);

		// Clear (rarely) 0, 1 or (rarely) 2 bits (bug: assumes block_size = 64K)
		if (bclr2 < block_size) {
			bitOr (locsieve, bclr2);
			bclr2 += p2;
			if (bclr2 < block_size) bitOr (locsieve, bclr2);
		}

		bclr = pinfo32[threadsPerBlock * 3 + get_local_id(0)];
		pinv = pinfo32[threadsPerBlock * 4 + get_local_id(0)];
		p = pinfo32[threadsPerBlock * 5 + get_local_id(0)];
		validate_bclr (bclr, p);

		bclr = mod_p (bclr - block_start, p, pinv);

		// Clear 0 or 1 bits (bug: assumes block_size = 64K)
		if (bclr < block_size) bitOr (locsieve, bclr);
		i += 2, pinfo_dev += threadsPerBlock * 24;
	}

#if (TRACE_SIEVE_KERNEL > 0)
    barrier(CLK_LOCAL_MEM_FENCE);
    if (get_global_id(0) == TRACE_SIEVE_TID)
    printf((__constant char *)"SegSieve: locsieve=[%#x, %#x, %#x, %#x, %#x, %#x, %#x, %#x, ...] at #3\n",
       locsieve32[0], locsieve32[1], locsieve32[2], locsieve32[3], locsieve32[4], locsieve32[5], locsieve32[6], locsieve32[7]);
#endif

	// Sieve primes up to and including the row containing the first 18-bit prime (more than 128K).
	// Our memory layout here is 18-bits for bit-to-clr, 7-bits for (p difference) / 2, 7-bits for pinv difference.

	for ( ; i < sieving128KCrossover + 1 && i < maxp; i += 3, pinfo_dev += threadsPerBlock * 12) {
		uint tmp3 = pinfo32[get_local_id(0)];
		uint tmp2 = pinfo32[threadsPerBlock + get_local_id(0)];
		uint tmp = pinfo32[threadsPerBlock*2 + get_local_id(0)];
		uint bclr3, p3, pinv3, bclr2, p2, pinv2;

		bclr3 = tmp3 & 0x0003FFFF;
		bclr2 = tmp2 & 0x0003FFFF;
		bclr = tmp & 0x0003FFFF;

		pinv3 = pinv - (tmp3 >> 25);
		pinv2 = pinv - (tmp2 >> 25);
		pinv -= tmp >> 25;

		p3 = p + ((tmp3 & 0x01FC0000) >> 17);
		p2 = p + ((tmp2 & 0x01FC0000) >> 17);
		p += (tmp & 0x01FC0000) >> 17;

		validate_bclr (bclr3, p3);
		validate_bclr (bclr2, p2);
		validate_bclr (bclr, p);

		bclr3 = mod_p (bclr3 - block_start, p3, pinv3);
		bclr2 = mod_p (bclr2 - block_start, p2, pinv2);
		bclr = mod_p (bclr - block_start, p, pinv);

		// Optionally clear bit (bug: assumes block_size <= 64K)
		if (bclr3 < block_size) bitOr (locsieve, bclr3);
		if (bclr2 < block_size) bitOr (locsieve, bclr2);
		if (bclr < block_size) bitOr (locsieve, bclr);
	}

#if (TRACE_SIEVE_KERNEL > 0)
    barrier(CLK_LOCAL_MEM_FENCE);
    if (get_global_id(0) == TRACE_SIEVE_TID)
    printf((__constant char *)"SegSieve: locsieve=[%#x, %#x, %#x, %#x, %#x, %#x, %#x, %#x, ...] at #4\n",
       locsieve32[0], locsieve32[1], locsieve32[2], locsieve32[3], locsieve32[4], locsieve32[5], locsieve32[6], locsieve32[7]);
#endif

	// We need one transitional loop which handles the first complete row containing primes above 128K.
	// Our memory layout here is 32-bits for bit-to-clr, 32-bits for p, 32-bits for pinv.

	if (i < maxp) {
		bclr = pinfo32[get_local_id(0)];
		pinv = pinfo32[threadsPerBlock + get_local_id(0)];
		p = pinfo32[threadsPerBlock * 2 + get_local_id(0)];
		validate_bclr (bclr, p);

		bclr = sloppy_mod_p (bclr - block_start, p, pinv);

		// Optionally clear bit
		if (bclr < block_size) bitOr (locsieve, bclr);
		i++, pinfo_dev += threadsPerBlock * 12;
	}

#if (TRACE_SIEVE_KERNEL > 0)
    barrier(CLK_LOCAL_MEM_FENCE);
    if (get_global_id(0) == TRACE_SIEVE_TID)
    printf((__constant char *)"SegSieve: locsieve=[%#x, %#x, %#x, %#x, %#x, %#x, %#x, %#x, ...] at #5\n",
       locsieve32[0], locsieve32[1], locsieve32[2], locsieve32[3], locsieve32[4], locsieve32[5], locsieve32[6], locsieve32[7]);
#endif

	// Sieve the primes above 128K up to 1M.
	// Our memory layout here is 20-bits for bit-to-clr, 7-bits for (p difference) / 2, 5-bits for pinv difference.
	// Primes above 128K can use SLOPPY_MOD.

	for ( ; i < sieving1MCrossover && i < maxp; i += 4, pinfo_dev += threadsPerBlock * 16) {
		uint tmp4 = pinfo32[get_local_id(0)];
		uint tmp3 = pinfo32[threadsPerBlock + get_local_id(0)];
		uint tmp2 = pinfo32[threadsPerBlock*2 + get_local_id(0)];
		uint tmp = pinfo32[threadsPerBlock*3 + get_local_id(0)];
		uint bclr4, p4, pinv4, bclr3, p3, pinv3, bclr2, p2, pinv2;

		bclr4 = tmp4 & 0x000FFFFF;
		bclr3 = tmp3 & 0x000FFFFF;
		bclr2 = tmp2 & 0x000FFFFF;
		bclr = tmp & 0x000FFFFF;

		pinv4 = pinv - (tmp4 >> 27);
		pinv3 = pinv - (tmp3 >> 27);
		pinv2 = pinv - (tmp2 >> 27);
		pinv -= tmp >> 27;

		p4 = p + ((tmp4 & 0x07F00000) >> 19);
		p3 = p + ((tmp3 & 0x07F00000) >> 19);
		p2 = p + ((tmp2 & 0x07F00000) >> 19);
		p += (tmp & 0x07F00000) >> 19;

		validate_bclr (bclr4, p4);
		validate_bclr (bclr3, p3);
		validate_bclr (bclr2, p2);
		validate_bclr (bclr, p);

		bclr4 = sloppy_mod_p (bclr4 - block_start, p4, pinv4);
		bclr3 = sloppy_mod_p (bclr3 - block_start, p3, pinv3);
		bclr2 = sloppy_mod_p (bclr2 - block_start, p2, pinv2);
		bclr = sloppy_mod_p (bclr - block_start, p, pinv);

		// Optionally clear bit
		if (bclr4 < block_size) bitOr (locsieve, bclr4);
		if (bclr3 < block_size) bitOr (locsieve, bclr3);
		if (bclr2 < block_size) bitOr (locsieve, bclr2);
		if (bclr < block_size) bitOr (locsieve, bclr);
	}

#if (TRACE_SIEVE_KERNEL > 0)
    barrier(CLK_LOCAL_MEM_FENCE);
    if (get_global_id(0) == TRACE_SIEVE_TID)
    printf((__constant char *)"SegSieve: locsieve=[%#x, %#x, %#x, %#x, %#x, %#x, %#x, %#x, ...] at #6\n",
       locsieve32[0], locsieve32[1], locsieve32[2], locsieve32[3], locsieve32[4], locsieve32[5], locsieve32[6], locsieve32[7]);
#endif

	// We need one transitional loop to switch to a memory layout that again lets us cram all needed info in 32-bits.
	// Our memory layout here is 32-bits for bit-to-clr, 32-bits for p, 32-bits for pinv.

	if (i < maxp) {
		bclr = pinfo32[get_local_id(0)];
		pinv = pinfo32[threadsPerBlock + get_local_id(0)];
		p = pinfo32[threadsPerBlock * 2 + get_local_id(0)];
		validate_bclr (bclr, p);

		bclr = sloppy_mod_p (bclr - block_start, p, pinv);

		// Optionally clear bit
		if (bclr < block_size)
			bitOr (locsieve, bclr);
		i++, pinfo_dev += threadsPerBlock * 12;
	}

#if (TRACE_SIEVE_KERNEL > 0)
    barrier(CLK_LOCAL_MEM_FENCE);
    if (get_global_id(0) == TRACE_SIEVE_TID)
    printf((__constant char *)"SegSieve: locsieve=[%#x, %#x, %#x, %#x, %#x, %#x, %#x, %#x, ...] at #7\n",
       locsieve32[0], locsieve32[1], locsieve32[2], locsieve32[3], locsieve32[4], locsieve32[5], locsieve32[6], locsieve32[7]);
#endif

	// Sieve the primes above 1M up to 16M.
	// Our memory layout here is 24-bits for bit-to-clr, 7-bits for (p difference) / 2, 1-bit for pinv difference.

	for ( ; i < maxp; i += 4, pinfo_dev += threadsPerBlock * 16) {
		uint tmp4 = pinfo32[get_local_id(0)];
		uint tmp3 = pinfo32[threadsPerBlock + get_local_id(0)];
		uint tmp2 = pinfo32[threadsPerBlock*2 + get_local_id(0)];
		uint tmp = pinfo32[threadsPerBlock*3 + get_local_id(0)];
		uint bclr4, p4, pinv4, bclr3, p3, pinv3, bclr2, p2, pinv2;

		bclr4 = tmp4 & 0x00FFFFFF;
		bclr3 = tmp3 & 0x00FFFFFF;
		bclr2 = tmp2 & 0x00FFFFFF;
		bclr = tmp & 0x00FFFFFF;

		pinv4 = pinv - (tmp4 >> 31);
		pinv3 = pinv - (tmp3 >> 31);
		pinv2 = pinv - (tmp2 >> 31);
		pinv -= tmp >> 31;

		p4 = p + ((tmp4 & 0x7F000000) >> 23);
		p3 = p + ((tmp3 & 0x7F000000) >> 23);
		p2 = p + ((tmp2 & 0x7F000000) >> 23);
		p += (tmp & 0x7F000000) >> 23;

		validate_bclr (bclr4, p4);
		validate_bclr (bclr3, p3);
		validate_bclr (bclr2, p2);
		validate_bclr (bclr, p);

		bclr4 = sloppy_mod_p (bclr4 - block_start, p4, pinv4);
		bclr3 = sloppy_mod_p (bclr3 - block_start, p3, pinv3);
		bclr2 = sloppy_mod_p (bclr2 - block_start, p2, pinv2);
		bclr = sloppy_mod_p (bclr - block_start, p, pinv);

		// Optionally clear bit
		if (bclr4 < block_size) bitOr (locsieve, bclr4);
		if (bclr3 < block_size) bitOr (locsieve, bclr3);
		if (bclr2 < block_size) bitOr (locsieve, bclr2);
		if (bclr < block_size) bitOr (locsieve, bclr);
	}

	// sync before copying
	barrier(CLK_LOCAL_MEM_FENCE);

#if (TRACE_SIEVE_KERNEL > 0)
    if (get_global_id(0) == TRACE_SIEVE_TID)
    printf((__constant char *)"SegSieve: locsieve=[%#x, %#x, %#x, %#x, %#x, %#x, %#x, %#x, ...] at final\n",
       locsieve32[0], locsieve32[1], locsieve32[2], locsieve32[3], locsieve32[4], locsieve32[5], locsieve32[6], locsieve32[7]);
#endif

// Copy our shared bit array results to the global big bit array

	// Point to the block of the big bit array we are copying to
	big_bit_array_dev += get_group_id(0) * block_size_in_bytes;

	// Have each thread copy a part of the array.
	for (j = 0; j < block_size / (threadsPerBlock * 32); j++)
		big_bit_array32[j * threadsPerBlock + get_local_id(0)] = ~locsieve32[j * threadsPerBlock + get_local_id(0)];
}

//
// Sieve initialization kernels
//

// Internal routine: returns 1/n mod orig_d, computed with the extended Euclid GCD.
// Only the coefficient that multiplies the original n is tracked.  That coefficient can end up
// negative, in which case orig_d is added once so the result is non-negative.
// The result is a true inverse only when n and orig_d are coprime; for n == 0 (orig_d != 0) it is 0.

unsigned int modularinverse (uint n, uint orig_d)
{
	uint	rem = orig_d;		// Value we divide by in the next step
	int	coef = 0;		// Coefficient of the original n that belongs to rem
	int	prev_coef = 1;		// Coefficient of the original n that belongs to n

	for ( ; rem != 0; )
	{
		int	quot = n / rem;				// Floor(n/rem)
		uint	next_rem = n - quot * rem;		// n mod rem
		int	next_coef = prev_coef - quot * coef;

		// Shift the (n, rem) pair and its coefficients one step along the Euclid sequence
		n = rem;
		rem = next_rem;
		prev_coef = coef;
		coef = next_coef;
	}

	if (prev_coef < 0) return prev_coef + orig_d;
	return prev_coef;
}

// Calculate the modular inverses used in computing initial bit-to-clear values.
//
// Each work-item handles one sieve prime: it reads the prime from calc_info, inverts
// (2 * NUM_CLASSES * exponent) mod prime and stores the inverse in the slot right behind the prime.
//
// exponent:  the exponent from which the factor distance (facdist) is derived
// calc_info: the (prime, inverse) pairs start at element MAX_PRIMES_PER_THREAD*4; pair number 'index'
//            occupies element index*2 (prime, read here) and element index*2 + 1 (inverse, written here)
//
// Work-group 0 covers the primes handled with special code (work-items outside that range exit at once);
// every further work-group covers the next threadsPerBlock primes.

__kernel void __attribute__((reqd_work_group_size(256, 1, 1))) CalcModularInverses (uint exponent, __global int *calc_info)
{
	uint	index;		// Index for prime and modinv data in calc_info
	uint	prime;		// The prime to work on
	uint	facdist_mod_p;	// facdist reduced mod prime; this is the value that gets inverted
	ulong	facdist;	// Distance between two successive factors in a class

// Handle the primes that are processed with special code.  That is, they are not part of an official "row" in pinfo_dev.

#if (TRACE_SIEVE_KERNEL > 3)
    // get_group_id/get_local_id return size_t; cast them so the arguments match the 32-bit conversions
    if (get_global_id(0) == TRACE_SIEVE_TID) printf((__constant char *)"CalcModularInverses: grpid=%u, locid=%u, exp=%u\n", (uint) get_group_id(0), (uint) get_local_id(0), exponent);
#endif
	if (get_group_id(0) == 0) {
		if (get_local_id(0) < primesNotSieved || get_local_id(0) >= primesNotSieved + primesHandledWithSpecialCode) return;
		index = get_local_id(0);
	}

// Handle primes that are in "rows" of pinfo_dev.

	else {
		// Linear prime index: skip the unsieved and specially handled primes, then threadsPerBlock primes per earlier work-group
		index = primesNotSieved + primesHandledWithSpecialCode + (get_group_id(0) - 1) * threadsPerBlock + get_local_id(0);
	}

// Calculate and save the modular inverse for one of the sieve primes
// The modular inverse is one over the distance between the corresponding factors for two successive k values in a class.
#ifdef INTEL
  // Intel-OpenCL has a bug that will not use 64-bit quantities here. Force it.
  #define mul_16_32(a,b) ((((ulong) ((uint)(a) * ((uint)(b) >> 16))) << 16) + (ulong) ((uint)(a) * ((uint)(b) & 0xFFFF)))
#else
  #define mul_16_32(a,b) ((ulong)(a) * (ulong)(b))
#endif

	prime = calc_info[MAX_PRIMES_PER_THREAD*4 + index * 2];
	facdist = mul_16_32 (2 * NUM_CLASSES, exponent);
	facdist_mod_p = (uint) (facdist % prime);	// reduce once; reused by the trace and debug code below

	calc_info[MAX_PRIMES_PER_THREAD*4 + index * 2 + 1] = modularinverse (facdist_mod_p, prime);
#if (TRACE_SIEVE_KERNEL > 2)
    // Pass the reduced 32-bit value: handing a 64-bit ulong to a 32-bit conversion is a format/argument mismatch
    if (get_global_id(0) == TRACE_SIEVE_TID) printf((__constant char *)"CalcModularInverses: index=%u, prime=%u, facdist=%u, inv=%d\n", index, prime, facdist_mod_p, calc_info[MAX_PRIMES_PER_THREAD*4 + index * 2 + 1]);
#endif
#ifdef GWDEBUG
  // The product needs 64 bits: both factors can be nearly as large as prime
  if (((ulong) facdist_mod_p * calc_info[MAX_PRIMES_PER_THREAD*4 + index * 2 + 1]) % prime != 1)
    printf ((__constant char *)"CalcModularInverses FAIL!: %u, %u\n", index, prime);
#endif
}


// Calculate the initial bit-to-clear values.
//
// One work-item per sieve prime: fetch the prime and its modular inverse from calc_info, work out the
// lowest bit whose factor candidate is divisible by that prime, and store that bit number in pinfo_dev.
// Work-group 0 serves the primes that are processed with special code (one 16-bit slot per work-item).
// Work-group g > 0 serves the pinfo_dev "row" described by entry g-1 of the calc_info tables.

__kernel void __attribute__((reqd_work_group_size(256, 1, 1))) CalcBitToClear (uint exponent, ulong k_base, __global int *calc_info, __global uchar *pinfo_dev)
{
	uint	pidx;		// Which (prime, modinv) pair in calc_info belongs to this work-item
	uint	keep_mask;	// Bits of the 32-bit pinfo_dev word to preserve when storing (only set and used for rows)
	uint	prime;		// Prime whose bit-to-clear is calculated
	uint	modinv;		// Modular inverse stored for this prime in calc_info
	uint	bit_to_clear;	// Calculated bit to clear
	ulong	k_mod_p;	// k_base mod prime
	ulong	factor_mod_p;	// factor mod prime
	ulong	shortfall;	// prime - factor_mod_p

#if (TRACE_SIEVE_KERNEL > 3)
    if (get_global_id(0) == TRACE_SIEVE_TID) printf((__constant char *)"CalcBitToClear: grpid=%d, locid=%d, exp=%u, k_base=%llu\n", get_group_id(0), get_local_id(0), exponent, k_base);
#endif

	if (get_group_id(0) != 0) {
		// A "row" of pinfo_dev: entry (group id - 1) of each calc_info table describes it
		__global int *row_info = calc_info + (get_group_id(0) - 1);

		// Start of the "row", then this work-item's word (bit-to-clear values are always 4 bytes apart)
		pinfo_dev += row_info[0];
		pinfo_dev += get_local_id(0) * 4;

		// Index of the row's first prime plus this work-item's multiple of the row's prime distance
		pidx = row_info[MAX_PRIMES_PER_THREAD] + get_local_id(0) * row_info[MAX_PRIMES_PER_THREAD*2];

		// Mask to apply to the word where the bit-to-clear value is set
		keep_mask = row_info[MAX_PRIMES_PER_THREAD*3];
	}
	else {
		// Primes processed with special code: only work-items inside that range have anything to do
		if (! (get_local_id(0) >= primesNotSieved && get_local_id(0) < primesNotSieved + primesHandledWithSpecialCode)) return;
		pidx = get_local_id(0);
		pinfo_dev += get_local_id(0) * 2;
	}

	// Read the prime and its modular inverse
	prime = calc_info[MAX_PRIMES_PER_THREAD*4 + pidx * 2];
	modinv = calc_info[MAX_PRIMES_PER_THREAD*4 + pidx * 2 + 1];

	// Compute lowest possible value such that the factor (2 * k * exponent + 1) is divisible by our prime
	k_mod_p = k_base % prime;
	factor_mod_p = (2 * k_mod_p * exponent + 1) % prime;
	shortfall = (ulong) prime - factor_mod_p;
	bit_to_clear = (shortfall * modinv) % prime;

#if (TRACE_SIEVE_KERNEL > 2)
    if (get_global_id(0) == TRACE_SIEVE_TID) printf((__constant char *)"CalcBitToClear: prime=%d, modinv=%d, k_mod_p=%llu, factor_mod_p=%llu, bit_to_clear=%d\n", prime, modinv, k_mod_p, factor_mod_p, bit_to_clear);
#endif

#ifdef GWDEBUG
	// Self-check: the factor belonging to the computed bit must be divisible by prime
	k_base = k_base + (ulong)bit_to_clear * NUM_CLASSES;
	k_mod_p = k_base % prime;
	factor_mod_p = (2UL * k_mod_p * exponent + 1UL) % prime;
	if (factor_mod_p != 0)
		printf ("FAIL!: %d, %d, %d\n", pidx, prime, bit_to_clear);
#endif

	if (get_group_id(0) != 0) {
		// Rows: store the bit-to-clear in a masked 32-bit value
		*pinfo32 = (*pinfo32 & keep_mask) + bit_to_clear;
	}
	else {
		// Primes processed with special code: bit-to-clear is stored in a 16-bit word
		*pinfo16 = bit_to_clear;
	}
}


/* This function is used at the beginning of each GPU-sieve TF-kernel in order to extract the bits from the sieve.
   returns total number of bits set */

uint extract_bits(const uint bits_to_process, const uint tid, const uint lid, __local ushort *bitcount, __local ushort *smem, const __global uint * restrict bit_array)
{
  __private uint     i, words_per_thread, sieve_word, k_bit_base, total_bit_count;

  // Get pointer to section of the bit_array this thread is processing.

  words_per_thread = bits_to_process / 8192; // 256 threads * 32 bits per word
  bit_array += mul24(tid, words_per_thread);

#if (TRACE_SIEVE_KERNEL > 2)
  if (tid==TRACE_SIEVE_TID) printf((__constant char *)"extract_bits: tid=%u, lid=%u, bits=%d, wpt=%u, base addr=%#x\n",
        tid, lid, bits_to_process, words_per_thread, bit_array);
#endif


// Count number of bits set in this thread's word(s) from the bit_array

  bitcount[lid] = 0;
  for (i = 0; i < words_per_thread; i++)
    bitcount[lid] +=  popcount(bit_array[i]);

  barrier(CLK_LOCAL_MEM_FENCE);
#if (TRACE_SIEVE_KERNEL > 3)
  if (tid==TRACE_SIEVE_TID){ printf((__constant char *)"extract_bits: bitcount0: %d, %d, %d, %d, %d, %d, %d, %d, %d, %d,",
        bitcount[0], bitcount[1], bitcount[2], bitcount[3], bitcount[4], bitcount[5], bitcount[6], bitcount[7], bitcount[8], bitcount[9]);
        printf((__constant char *)" ..., %d, %d, %d, %d, %d, %d, %d, %d, %d, %d\n",
        bitcount[246], bitcount[247], bitcount[248], bitcount[249], bitcount[250], bitcount[251], bitcount[252], bitcount[253], bitcount[254], bitcount[255]);
    }
#endif

// Create total count of bits set in block up to and including this threads popcnt.
// Kudos to Rocke Verser for the population counting code.
// CAUTION:  Following requires 256 threads per block

  // First five tallies remain within one warp.  Should be in lock-step.
  // AMD devs always run 16 threads at once => just 4 tallies PERF: optimize this (does removing the barriers improve performance at all?)
  // As OpenCL should be able to run on any device, rather leave the barriers in here ...
  if (lid & 1)        // If we are running on any thread 0bxxxxxxx1, tally neighbor's count.
    bitcount[lid] += bitcount[lid - 1];

  barrier(CLK_LOCAL_MEM_FENCE);
#if (TRACE_SIEVE_KERNEL > 3)
    if (tid==TRACE_SIEVE_TID) printf((__constant char *)"extract_bits: bitcount1: %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, ..., %u, %u, %u, %u, %u, %u, %u, %u, %u, %u\n",
        bitcount[0], bitcount[1], bitcount[2], bitcount[3], bitcount[4], bitcount[5], bitcount[6], bitcount[7], bitcount[8], bitcount[9],
         bitcount[246], bitcount[247], bitcount[248], bitcount[249], bitcount[250], bitcount[251], bitcount[252], bitcount[253], bitcount[254], bitcount[255]);
#endif

  if (lid & 2)        // If we are running on any thread 0bxxxxxx1x, tally neighbor's count.
    bitcount[lid] += bitcount[(lid - 2) | 1];

  barrier(CLK_LOCAL_MEM_FENCE);
#if (TRACE_SIEVE_KERNEL > 3)
  if (tid==TRACE_SIEVE_TID) printf((__constant char *)"extract_bits: bitcount2: %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, ..., %u, %u, %u, %u, %u, %u, %u, %u, %u, %u\n",
      bitcount[0], bitcount[1], bitcount[2], bitcount[3], bitcount[4], bitcount[5], bitcount[6], bitcount[7], bitcount[8], bitcount[9],
      bitcount[246], bitcount[247], bitcount[248], bitcount[249], bitcount[250], bitcount[251], bitcount[252], bitcount[253], bitcount[254], bitcount[255]);
#endif

  if (lid & 4)        // If we are running on any thread 0bxxxxx1xx, tally neighbor's count.
    bitcount[lid] += bitcount[(lid - 4) | 3];

  barrier(CLK_LOCAL_MEM_FENCE);
#if (TRACE_SIEVE_KERNEL > 3)
  if (tid==TRACE_SIEVE_TID) printf((__constant char *)"extract_bits: bitcount4: %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, ..., %u, %u, %u, %u, %u, %u, %u, %u, %u, %u\n",
      bitcount[0], bitcount[1], bitcount[2], bitcount[3], bitcount[4], bitcount[5], bitcount[6], bitcount[7], bitcount[8], bitcount[9],
      bitcount[246], bitcount[247], bitcount[248], bitcount[249], bitcount[250], bitcount[251], bitcount[252], bitcount[253], bitcount[254], bitcount[255]);
#endif

  if (lid & 8)        // If we are running on any thread 0bxxxx1xxx, tally neighbor's count.
    bitcount[lid] += bitcount[(lid - 8) | 7];

  barrier(CLK_LOCAL_MEM_FENCE);
#if (TRACE_SIEVE_KERNEL > 3)
  if (tid==TRACE_SIEVE_TID) printf((__constant char *)"extract_bits: bitcount8: %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, ..., %u, %u, %u, %u, %u, %u, %u, %u, %u, %u\n",
      bitcount[0], bitcount[1], bitcount[2], bitcount[3], bitcount[4], bitcount[5], bitcount[6], bitcount[7], bitcount[8], bitcount[9],
      bitcount[246], bitcount[247], bitcount[248], bitcount[249], bitcount[250], bitcount[251], bitcount[252], bitcount[253], bitcount[254], bitcount[255]);
#endif

 if (lid & 16)       // If we are running on any thread 0bxxx1xxxx, tally neighbor's count.
    bitcount[lid] += bitcount[(lid - 16) | 15];

  // Further tallies are across warps.  Must synchronize
  barrier(CLK_LOCAL_MEM_FENCE);
#if (TRACE_SIEVE_KERNEL > 3)
  if (tid==TRACE_SIEVE_TID) printf((__constant char *)"extract_bits: bitcount16: %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, ..., %u, %u, %u, %u, %u, %u, %u, %u, %u, %u\n",
      bitcount[0], bitcount[1], bitcount[2], bitcount[3], bitcount[4], bitcount[5], bitcount[6], bitcount[7], bitcount[8], bitcount[9],
      bitcount[246], bitcount[247], bitcount[248], bitcount[249], bitcount[250], bitcount[251], bitcount[252], bitcount[253], bitcount[254], bitcount[255]);
#endif

  if (lid  & 32)      // If we are running on any thread 0bxx1xxxxx, tally neighbor's count.
    bitcount[lid] += bitcount[(lid - 32) | 31];

  barrier(CLK_LOCAL_MEM_FENCE);
#if (TRACE_SIEVE_KERNEL > 3)
  if (tid==TRACE_SIEVE_TID) printf((__constant char *)"extract_bits: bitcount32: %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, ..., %u, %u, %u, %u, %u, %u, %u, %u, %u, %u\n",
      bitcount[0], bitcount[1], bitcount[2], bitcount[3], bitcount[4], bitcount[5], bitcount[6], bitcount[7], bitcount[8], bitcount[9],
      bitcount[246], bitcount[247], bitcount[248], bitcount[249], bitcount[250], bitcount[251], bitcount[252], bitcount[253], bitcount[254], bitcount[255]);
#endif

  if (lid & 64)       // If we are running on any thread 0bx1xxxxxx, tally neighbor's count.
    bitcount[lid] += bitcount[(lid - 64) | 63];

  barrier(CLK_LOCAL_MEM_FENCE);
#if (TRACE_SIEVE_KERNEL > 3)
  if (tid==TRACE_SIEVE_TID) printf((__constant char *)"extract_bits: bitcount64: %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, ..., %u, %u, %u, %u, %u, %u, %u, %u, %u, %u\n",
      bitcount[0], bitcount[1], bitcount[2], bitcount[3], bitcount[4], bitcount[5], bitcount[6], bitcount[7], bitcount[8], bitcount[9],
      bitcount[246], bitcount[247], bitcount[248], bitcount[249], bitcount[250], bitcount[251], bitcount[252], bitcount[253], bitcount[254], bitcount[255]);
#endif

  if (lid & 128)       // If we are running on any thread 0b1xxxxxxx, tally neighbor's count.
    bitcount[lid] += bitcount[127];

  // At this point, bitcount[...] contains the total number of bits for the indexed
  // thread plus all lower-numbered threads.  I.e., bitcount[255] is the total count.

  barrier(CLK_LOCAL_MEM_FENCE);
  total_bit_count = bitcount[255];

#if (TRACE_SIEVE_KERNEL > 3)
  if (tid==TRACE_SIEVE_TID) printf((__constant char *)"extract_bits: bitcounts: %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, ..., %u, %u, %u, %u, %u, %u, %u, %u, %u, %u\n",
      bitcount[0], bitcount[1], bitcount[2], bitcount[3], bitcount[4], bitcount[5], bitcount[6], bitcount[7], bitcount[8], bitcount[9],
      bitcount[246], bitcount[247], bitcount[248], bitcount[249], bitcount[250], bitcount[251], bitcount[252], bitcount[253], bitcount[254], bitcount[255]);
#endif
#if (TRACE_SIEVE_KERNEL > 1)
  if (tid==TRACE_SIEVE_TID) printf((__constant char *)"extract_bits: total bitcount=%u = %u bytes\n",
      bitcount[255], bitcount[255]*sizeof(short));
#endif

// POSSIBLE OPTIMIZATION - bitcount and smem could share the same memory space if each thread first read the
// bitcount values it needs into registers and all threads synchronized before doing any writes to smem.

//POSSIBLE SANITY CHECK -- is there any way to test if total_bit_count exceeds the amount of shared memory allocated?

// Loop until this thread's section of the bit array is finished.

  sieve_word = *bit_array;
  k_bit_base = lid * words_per_thread * 32;
  for (i = total_bit_count - bitcount[lid]; ; i++) {
    int bit_to_test;

// Make sure we have a non-zero sieve word

    while (sieve_word == 0 && (--words_per_thread > 0))
    {
      sieve_word = *++bit_array;
      k_bit_base += 32;
    }

// Check if this thread has processed all its set bits

    if (sieve_word == 0) break;

// Find a bit to test in the sieve word

    bit_to_test = 31 - clz(sieve_word);
    sieve_word &= ~(1 << bit_to_test);

// Copy the k value to the shared memory array

    smem[i] = convert_ushort(k_bit_base + bit_to_test);
#if (TRACE_SIEVE_KERNEL > 2)
    if (tid==TRACE_SIEVE_TID) printf((__constant char *)"extract_bits: smem[%d]=%d\n",
        i, k_bit_base + bit_to_test);
#endif
  }

  barrier(CLK_LOCAL_MEM_FENCE);

#if (TRACE_SIEVE_KERNEL > 3)
  if (tid==TRACE_SIEVE_TID) printf((__constant char *)"extract_bits: smem: [0]%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, ..., [246]%d, %d, %d, %d, %d, %d, %d, %d, %d, %d, ... , [%d]%d, %d\n",
        smem[0], smem[1], smem[2], smem[3], smem[4], smem[5], smem[6], smem[7], smem[8], smem[9],
        smem[246], smem[247], smem[248], smem[249], smem[250], smem[251], smem[252], smem[253], smem[254], smem[255],
        total_bit_count-2, smem[total_bit_count-2], smem[total_bit_count-1]);
#endif
  return total_bit_count;
}