/*
This source distribution is placed in the public domain by its author,  George Woltman.
This code is a GPU-based sieve for mfaktc.

Thanks go to Ben Buhrow for his erato.cu program and to Rocke Verser for his gpusieve program.
See (http://www.mersenneforum.org/showthread.php?t=11900) for Ben's initial work.

You are free to use this code as you wish; I take no responsibility for
any such action.  Optionally, please be nice and tell me if you find this
source to be useful, or add an acknowledgement within your work. Again optionally,
if you add to the functionality present here please consider making those
additions public too, so that others may benefit from your work.
*/

#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>

#include "params.h"
#include "my_types.h"
#include "compatibility.h"
#include "my_intrinsics.h"
#define NVCC_EXTERN
#include "gpusieve.h"
#undef NVCC_EXTERN

#undef RAW_GPU_BENCH // FIXME

// Short names for the unsigned integer types used in this file.  The digits in each
// name give the width the rest of the code relies on (e.g. uint16 is used for the
// 16-bit bit-to-clear entries, uint64 for the 64-bit sieve words).
typedef unsigned char uint8;
typedef unsigned short uint16;
typedef unsigned int uint32;
typedef long long unsigned int uint64;

// clang-format off
// Compile-time geometry of the sieve.  The trailing comment on each line states what the value means.
#define MAX_PRIMES_PER_THREAD 4224                          // Primes up to 16M can be handled by this many "rows" of 256 primes

const uint32 block_size_in_bytes = 8192;                    // Size of shared memory array in bytes
const uint32 block_size          = block_size_in_bytes * 8; // Number of bits generated by each block
const uint32 threadsPerBlock     = 256;                     // Threads per block
// The two build flavours differ only in whether 11 is sieved: primesNotSieved + primesHandledWithSpecialCode
// adds up to 54 in both branches for the active setting.  The commented-out lines are alternative settings
// for primesHandledWithSpecialCode; exactly one definition must be active per branch.
#ifdef MORE_CLASSES
const uint32 primesNotSieved     = 5;                       // Primes 2, 3, 5, 7, 11 are not sieved

// const uint32 primesHandledWithSpecialCode = 13;          // Count of primes handled with inline code (not using primes array)
                                                            // Primes 13 through 61 are handled specially
// const uint32 primesHandledWithSpecialCode = 26;          // Count of primes handled with inline code (not using primes array)
                                                            // Primes 13 through 127 are handled specially
const uint32 primesHandledWithSpecialCode    = 49;          // Count of primes handled with inline code (not using primes array)
                                                            // Primes 13 through 251 are handled specially
// const uint32 primesHandledWithSpecialCode = 92;          // Count of primes handled with inline code (not using primes array)
                                                            // Primes 13 through 509 are handled specially
#else
const uint32 primesNotSieved                 = 4;           // Primes 2, 3, 5, 7 are not sieved
// const uint32 primesHandledWithSpecialCode = 14;          // Count of primes handled with inline code (not using primes array)
                                                            // Primes 11 through 61 are handled specially
// const uint32 primesHandledWithSpecialCode = 27;          // Count of primes handled with inline code (not using primes array)
                                                            // Primes 11 through 127 are handled specially
const uint32 primesHandledWithSpecialCode    = 50;          // Count of primes handled with inline code (not using primes array)
                                                            // Primes 11 through 251 are handled specially
// const uint32 primesHandledWithSpecialCode = 93;          // Count of primes handled with inline code (not using primes array)
                                                            // Primes 11 through 509 are handled specially
#endif
// clang-format on

// the maximum number of threads per SM is not the same for all architectures,
// see https://en.wikipedia.org/wiki/CUDA#Technical_specifications for details
//
// MIN_BLOCKS_PER_MP is the "minimum blocks per multiprocessor" argument given to
// __launch_bounds__ on the sieve kernel.  The architecture test has to use the
// predefined macro __CUDA_ARCH__ (leading AND trailing double underscores): any
// other spelling is an undefined identifier, which the preprocessor evaluates
// as 0 in #if expressions and which would therefore always select the first branch.
// In the host compilation pass __CUDA_ARCH__ is itself undefined (evaluates as 0).
#if __CUDA_ARCH__ < FERMI || __CUDA_ARCH__ == TURING
// Compute capability 1.1 only supports 768 threads per multiprocessor, but
// using minBlocksPerMultiprocessor = 3 may cause a "max reg limit too low"
// error. Using minBlocksPerMultiprocessor = 4 seems to work and does not
// result in any NVCC warnings. However, this should not be reachable as
// devices with compute capability 1.x are not supported in 0.24.0 and later.
#define MIN_BLOCKS_PER_MP 4
#else
#define MIN_BLOCKS_PER_MP 6
#endif

// Various useful constants
//
// The primesBelow* values are prime counts; each *Crossover value converts such a count
// into a number of thread-loop iterations: the primes that are not sieved or are handled
// by inline code are subtracted first, then the remainder is divided (truncating integer
// division) by the number of threads in a block.

const uint32 primesBelow64K      = 6542; // There are 6542 16-bit primes
const uint32 primesBelow128K     = 12251; // There are 12251 17-bit primes
const uint32 primesBelow1M       = 82025; // There are 82025 20-bit primes
const uint32 sieving64KCrossover = (primesBelow64K - primesNotSieved - primesHandledWithSpecialCode) / threadsPerBlock;
// Number of thread loops processing primes below 64K
const uint32 sieving128KCrossover = (primesBelow128K - primesNotSieved - primesHandledWithSpecialCode) / threadsPerBlock;
// Number of thread loops processing primes below 128K
const uint32 sieving1MCrossover =
    (primesBelow1M - primesNotSieved - primesHandledWithSpecialCode) / threadsPerBlock - 3; // bug - awkward hard coded -3 here
// Number of thread loops processing primes below 1M
// NOTE(review): the reason for the extra "- 3" above is not visible in this part of the file.

// Global vars.  These could be moved to mystuff, but no other code needs to know about these internal values.

// Declared without a __device__ qualifier, so this is an ordinary host-side variable.
// It starts at 0; the code that assigns the real value is not in this part of the file.
uint32 primes_per_thread = 0; // Number of "rows" in the GPU sieving info array that each thread processes

// Bit masks for small prime sieving
//
// BITSLLp has a bit set at position 0 and at every further multiple of p that is below 32.
// The kernel shifts such a mask left by the prime's first bit-to-clear (a value below p)
// to mark all of that prime's hits inside one 32-bit sieve word.
// NOTE(review): the literals are signed int, so (1 << 31) and a left shift of BITSLL31
// rely on the compiler's handling of shifts into the sign bit; the visible uses store the
// result in a uint32.  Confirm the uses later in the file before changing the literal type.

#define BITSLL11 (1 | (1 << 11) | (1 << 22))
#define BITSLL13 (1 | (1 << 13) | (1 << 26))
#define BITSLL17 (1 | (1 << 17))
#define BITSLL19 (1 | (1 << 19))
#define BITSLL23 (1 | (1 << 23))
#define BITSLL29 (1 | (1 << 29))
#define BITSLL31 (1 | (1 << 31))

// Various padding required to keep warps accessing primes data on 128-byte boundaries
// NOTE(review): the unit of this value (bytes vs. array entries) and the place where it is
// applied are not visible in this part of the file.

#define PINFO_PAD1 1024 // Allows room for lots of initial bit_to_clr values

// Wrap a CUDA runtime call so that a failure is reported with the file and line of the call site.
#define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__)

// Report a failed CUDA runtime call and terminate the process.
//   err  - status code returned by the runtime call
//   file - source file of the call site (supplied by the macro above)
//   line - source line of the call site (supplied by the macro above)
// Returns normally only when err is cudaSuccess; otherwise prints the numeric code and the
// runtime's error string to stderr and calls exit(-1).
inline void __checkCudaErrors(cudaError err, const char *file, const int line)
{
    if (err == cudaSuccess) return; // nothing to report

    const int code         = (int)err;
    const char *const text = cudaGetErrorString(err);
    fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n", file, line, code, text);
    exit(-1);
}

// Inline to calculate x mod p using 2^32 / p.  Returns positive remainder even when x is negative.
// Assumes x is in the range -2^32 < x < p.  For this routine to work on only positive x values, we
// would change the gen_pinv macro to not add one.

// NOTE: This routine has a number of failure cases (samples below) which don't affect us, but should be investigated someday!
// OW 2016: see mod_p_above64k() below.
// x mod p out of range!! x = 113175, p = 113177, pinv = 37950, r = -2
// x mod p out of range!! x = 126009, p = 126011, pinv = 34085, r = -2
// x mod p out of range!! x = 121506, p = 121507, pinv = 35348, r = -1
// x mod p out of range!! x = 124427, p = 124429, pinv = 34518, r = -2
// x mod p out of range!! x = 95152, p = 95153, pinv = 45138, r = -1
// x mod p out of range!! x = 94120, p = 94121, pinv = 45633, r = -1
// x mod p out of range!! x = 74686, p = 74687, pinv = 57507, r = -1
// x mod p out of range!! x = 102795, p = 102797, pinv = 41782, r = -2
// x mod p out of range!! x = 126741, p = 126743, pinv = 33888, r = -2
// x mod p out of range!! x = 111532, p = 111533, pinv = 38509, r = -1
// x mod p out of range!! x = 130810, p = 130811, pinv = 32834, r = -1
// x mod p out of range!! x = 116705, p = 116707, pinv = 36802, r = -2

// Reciprocal passed to mod_p() as pinv: 0xFFFFFFFF is 2^32 - 1, so this is the integer
// quotient (2^32 - 1) / p plus one.
#define gen_pinv(p) (0xFFFFFFFF / (p) + 1)

// Device helper: x mod p computed with a multiply-high by the precomputed reciprocal pinv
// rather than a divide.
//   x    - value to reduce; may be negative
//   p    - modulus
//   pinv - reciprocal of p; the GWDEBUG check below expects it to equal gen_pinv(p)
// Returns the result of the PTX sequence below: with t = x - mulhi(x, pinv) * p, the
// return value is t - p when that is >= 0 and t otherwise.
__device__ __inline static int mod_p(int x, int p, int pinv)
{
    // int q, r, a, b;

    // q = __mulhi (x, pinv);  // quotient = x * inverse_of_p
    // a = x - q * p;   // x mod p (but may be too large by one p)
    // b = a - p;   // x mod p (the alternative return value)
    // asm volatile("slct.s32.s32 %0, %1, %2, %3;" : "=r" (r) : "r" (b) , "r" (a) , "r" (b));

    // CUDA compiler generated crappy PTX code for the statements above.  I replaced them with my own PTX code.
    // Even the code below generates a needless copying of x.

    int r;
    asm volatile("mul.hi.s32 %0, %1, %2;\n\t" // r = __mulhi (x, pinv);
                 "mul.lo.s32 %0, %0, %3;\n\t" // r = r * p;
                 "sub.s32  %1, %1, %0;\n\t" // x = x - r;
                 "sub.s32  %0, %1, %3;\n\t" // r = x - p;
                 "slct.s32.s32 %0, %0, %1, %0;" // r = (r >= 0) ? r : x
                 : "=r"(r), "+r"(x)
                 : "r"(pinv), "r"(p));

    // Diagnostics are compiled only in GWDEBUG builds: the routine has known out-of-range
    // cases that its callers tolerate, and a device-side printf does not belong on the
    // release path of a helper called many times per kernel.  Note that x is a read-write
    // operand of the asm above, so the x printed here is the value after the subtraction.
#ifdef GWDEBUG
    if (pinv != gen_pinv(p)) printf("p doesn't match pinv!! p = %d, pinv = %d\n", p, pinv);
    if (r < 0 || r >= p) printf("x mod p out of range!! x = %d, p = %d, pinv = %d, r = %d\n", x, p, pinv, r);
#endif
    return r;
}

// Device helper: variant of mod_p() with one extra instruction that clamps the estimated
// quotient to be at most 0 before it is multiplied back (the asm comment describes this as
// a correction for 2^16 < p < 2^17).
//   x    - value to reduce; may be negative
//   p    - modulus
//   pinv - reciprocal of p; the GWDEBUG check below expects it to equal gen_pinv(p)
// Returns the result of the PTX sequence below: with q = min(0, mulhi(x, pinv)) and
// t = x - q * p, the return value is t - p when that is >= 0 and t otherwise.
__device__ __inline static int mod_p_above64k(int x, int p, int pinv)
{
    // int q, r, a, b;

    // q = __mulhi (x, pinv);  // quotient = x * inverse_of_p
    // a = x - q * p;   // x mod p (but may be too large by one p)
    // b = a - p;   // x mod p (the alternative return value)
    // asm("slct.s32.s32 %0, %1, %2, %3;" : "=r" (r) : "r" (b) , "r" (a) , "r" (b));

    // CUDA compiler generated crappy PTX code for the statements above.  I replaced them with my own PTX code.
    // Even the code below generates a needless copying of x.

    int r;
    asm("mul.hi.s32 %0, %1, %2;\n\t" // r = __mulhi (x, pinv);
        "slct.s32.s32 %0, 0, %0, %0;\n\t" // r = min(0, r); // correction for 2^16 < p < 2^17 (see above mod_p())
        "mul.lo.s32 %0, %0, %3;\n\t" // r = r * p;
        "sub.s32  %1, %1, %0;\n\t" // x = x - r;
        "sub.s32  %0, %1, %3;\n\t" // r = x - p;
        "slct.s32.s32 %0, %0, %1, %0;" // r = (r >= 0) ? r : x
        : "=r"(r), "+r"(x)
        : "r"(pinv), "r"(p));

    // Diagnostics are compiled only in GWDEBUG builds so that release kernels carry no
    // device-side printf.  The range message names this routine so it can be told apart
    // from the one printed by mod_p().  x is a read-write operand of the asm above, so the
    // x printed here is the value after the subtraction.
#ifdef GWDEBUG
    if (pinv != gen_pinv(p)) printf("p doesn't match pinv!! p = %d, pinv = %d\n", p, pinv);
    if (r < 0 || r >= p) printf("x mod p (above64k) out of range!! x = %d, p = %d, pinv = %d, r = %d\n", x, p, pinv, r);
#endif
    return r;
}

// Convenience wrapper around mod_p() for a modulus known at the call site: the reciprocal
// is produced here with gen_pinv(p) instead of being passed in by the caller.

__device__ __inline static int mod_const_p(int x, int p)
{
    const int reciprocal = gen_pinv(p);
    return mod_p(x, p, reciprocal);
}

// Inline to calculate x mod p using an inverse of floor ((2^32 / p) - 0.5).
// We're allowed to return a sloppy modulo result ranging from -p/2 to p-1.
// Assumes x is in the range -2^32 < x < p.  This routine needs changing if we must
// deal with large positive x values.

// Reciprocal passed to sloppy_mod_p() as pinv.  4294967296.0 is 2^32 written as a double
// literal, so the division, the subtraction of 0.5 and the floor are carried out in double
// precision before the result is cast to uint32.
#define gen_sloppy_pinv(p) ((uint32)floor(4294967296.0 / (p) - 0.5))

// Device helper: approximate x mod p using a multiply-high by the precomputed reciprocal.
//   x    - value to reduce
//   p    - modulus
//   pinv - reciprocal of p; the GWDEBUG check below expects it to equal gen_sloppy_pinv(p)
// Returns x minus the estimated quotient times p.  The result is not normalised; the
// GWDEBUG check below reports any value outside the range -p/2 .. p-1.
__device__ __inline static int sloppy_mod_p(int x, int p, int pinv)
{
    const int quotient  = __mulhi(x, pinv); // estimate of x / p (upper 32 bits of x * pinv)
    const int remainder = x - quotient * p; // may be too small or too large by up to half of p

#ifdef GWDEBUG
    if (gen_sloppy_pinv(p) != (uint32)pinv) printf("p doesn't match pinv!! p = %d, pinv = %d\n", p, pinv);
    if (!(remainder >= -p / 2 && remainder < p))
        printf("x sloppy mod p out of range!! x = %d, p = %d, pinv = %d, r = %d\n", x, p, pinv, remainder);
#endif

    return remainder;
}

// Step an index by a (negative) increment, modulo p.
//   i   - current index, expected to lie in 0 .. p-1
//   inc - increment to add; callers in this file pass negative constants
//   p   - modulus
// Returns (i + inc) mod p.  The sum i + inc % p is kept when it is non-negative; otherwise
// p is added to it once.

__device__ __inline static int bump_mod_p(int i, int inc, int p)
{
    const int stepped = i + inc % p; // % binds tighter than +, so only inc is reduced here
    const int wrapped = stepped + p; // candidate used when stepped went below zero
    int result;

    // result = (stepped >= 0) ? stepped : wrapped
    asm volatile("slct.s32.s32 %0, %1, %2, %1;" : "=r"(result) : "r"(stepped), "r"(wrapped));

#ifdef GWDEBUG
    if (result < 0 || result >= p) printf("x bump mod p out of range!! x = %d, i = %d, p = %d\n", result, stepped, p);
#endif
    return result;
}

// Set a single bit in the sieve array with a plain (non-atomic) read-modify-write.
//   locsieve - base of the sieve byte array
//   bclr     - bit number to set; bit n lives in byte n / 8 at bit position n % 8

__device__ __inline static void bitOr(uint8 *locsieve, uint32 bclr)
{
    // Typed views of whatever variable named "locsieve" is in scope.  These are ordinary
    // preprocessor macros, so they remain defined for the code that follows this function.
#define locsieve8   ((uint8 *)locsieve)
#define locsieve8v  ((volatile uint8 *)locsieve)
#define locsieve32  ((uint32 *)locsieve)
#define locsieve32v ((volatile uint32 *)locsieve)
    const uint32 byteIndex = bclr >> 3;
    const uint32 bitInByte = bclr & 7;
    locsieve8[byteIndex] |= 1 << bitInByte;
}

// Variant of bitOr() that reads the target byte first and stores it back only when the
// requested bit is not already set.
//   locsieve - base of the sieve byte array
//   bclr     - bit number to set; bit n lives in byte n / 8 at bit position n % 8
__device__ __inline static void bitOrSometimesIffy(uint8 *locsieve, uint32 bclr)
{
    uint8 *const target  = locsieve + (bclr >> 3);
    const uint8 bit      = 1 << (bclr & 7);
    const uint32 current = *target;

    if ((current & bit) == 0) {
        *target = current | bit; // skipped entirely when the bit was already set
    }
}

// Make sure initial bit-to-clear makes sense
//
// In GWDEBUG builds validate_bclr(bclr, p) expands to an if statement that prints a message
// when bclr >= p; in all other builds it expands to nothing.
// NOTE(review): the debug expansion is a bare, unbraced "if" that already ends in its own
// semicolon and uses its arguments unparenthesised, so it is only safe as a stand-alone
// statement with simple arguments (not directly in front of an "else").

#ifdef GWDEBUG
#define validate_bclr(bclr, p)                                           \
    if (bclr >= p) printf("bclr too big! bclr = %d, p = %d\n", bclr, p);
#else
#define validate_bclr(bclr, p)
#endif

// Sieve a small slice of the big bit array using fast shared memory.  Note THIS IS A SLOPPY sieve!!
// We do not use atomic operations so that some candidates survive the sieve that shouldn't.  This is
// OK as it will just cost us some extra testing of candidates which is cheaper than the cost of using
// atomic operations.

/*
  Expect as input a set of primes to sieve with, their inverses, and the first bit to clear.

  Each block on the gpu sieves a different segment of the big bit array.  Each thread within each block
  simultaneously sieves a small set of primes, marking composites within shared memory.  There is no memory
  contention between threads because the marking process is write only.  Because each thread
  block starts at a different part of the big bit array, a small amount of computation must
  be done for each prime prior to sieving to figure out the first bit to clear.
*/

__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP)
    SegSieve(uint8 *big_bit_array_dev, uint8 *pinfo_dev, uint32 maxp)
{
    __shared__ uint8 locsieve[block_size_in_bytes];
    uint32 block_start = blockIdx.x * block_size;
    uint32 i, j, p, pinv, bclr;

#define big_bit_array32 ((uint32 *)big_bit_array_dev)
#define locsieve32      ((uint32 *)locsieve)
#define locsieve64      ((uint64 *)locsieve)
#define pinfo16         ((uint16 *)pinfo_dev)
#define pinfo32         ((uint32 *)pinfo_dev)

    // Sieve using all 8 bits of each shared memory byte.
    // This is more complicated code than using the whole byte as a flag
    // but has 1/8th as many global memory accesses to the primes arrays.

    // Sieve the smallest primes using inline code to avoid using atomics.
    // Memory layout is simply a 16-bit bit-to-clear value for each small prime.

#define bit_to_clr pinfo16

    //
    // In these sections each thread handles a 256-bit portion of the shared memory area.
    // This allows us to operate without atomic operations and without syncing.
    //

    uint32 thread_start = block_start + threadIdx.x * block_size / threadsPerBlock;

    //
    // In this section each thread handles one 32 bit word at a time sieving primes below 64.
    // Each prime will hit a 32-bit word zero or one time.
    //

    {
        uint32 mask, mask2, mask3, mask4, i11, i13, i17, i19, i23, i29, i31, i37, i41, i43, i47, i53, i59, i61;

        if (primesNotSieved == 4) { // Primes 2, 3, 5, 7 are not sieved
            i11 = mod_const_p(bit_to_clr[4] - thread_start, 11); // compute bit to clear for prime 11
            i13 = mod_const_p(bit_to_clr[5] - thread_start, 13); // compute bit to clear for prime 13
            i17 = mod_const_p(bit_to_clr[6] - thread_start, 17); // compute bit to clear for prime 17
        }
        if (primesNotSieved == 5) { // Primes 2, 3, 5, 7, 11 are not sieved
            i13 = mod_const_p(bit_to_clr[5] - thread_start, 13); // compute bit to clear for prime 13
            i17 = mod_const_p(bit_to_clr[6] - thread_start, 17); // compute bit to clear for prime 17
        }
        if (primesNotSieved == 6) { // Primes 2, 3, 5, 7, 11, 13 are not sieved
            i17 = mod_const_p(bit_to_clr[6] - thread_start, 17); // compute bit to clear for prime 17
        }
        i19 = mod_const_p(bit_to_clr[7] - thread_start, 19); // compute bit to clear for prime 19
        i23 = mod_const_p(bit_to_clr[8] - thread_start, 23); // compute bit to clear for prime 23
        i29 = mod_const_p(bit_to_clr[9] - thread_start, 29); // compute bit to clear for prime 29
        i31 = mod_const_p(bit_to_clr[10] - thread_start, 31); // compute bit to clear for prime 31

        if (primesNotSieved == 4) { // Primes 2, 3, 5, 7 are not sieved
            mask = (BITSLL11 << i11) | (BITSLL13 << i13) | (BITSLL17 << i17);
        }
        if (primesNotSieved == 5) { // Primes 2, 3, 5, 7, 11 are not sieved
            mask = (BITSLL13 << i13) | (BITSLL17 << i17);
        }
        if (primesNotSieved == 6) { // Primes 2, 3, 5, 7, 11, 13 are not sieved
            mask = BITSLL17 << i17;
        }
        mask |= (BITSLL19 << i19) | (BITSLL23 << i23);
        mask |= (BITSLL29 << i29) | (BITSLL31 << i31);

        if (primesNotSieved == 4) { // Primes 2, 3, 5, 7 are not sieved
            i11 = bump_mod_p(i11, -32, 11);
            i13 = bump_mod_p(i13, -32, 13);
            i17 = bump_mod_p(i17, -32, 17);
        }
        if (primesNotSieved == 5) { // Primes 2, 3, 5, 7, 11 are not sieved
            i13 = bump_mod_p(i13, -32, 13);
            i17 = bump_mod_p(i17, -32, 17);
        }
        if (primesNotSieved == 6) { // Primes 2, 3, 5, 7, 11, 13 are not sieved
            i17 = bump_mod_p(i17, -32, 17);
        }
        i19 = bump_mod_p(i19, -32, 19);
        i23 = bump_mod_p(i23, -32, 23);
        i29 = bump_mod_p(i29, -32, 29);
        i31 = bump_mod_p(i31, -32, 31);

        if (primesNotSieved == 4) { // Primes 2, 3, 5, 7 are not sieved
            mask2 = (BITSLL11 << i11) | (BITSLL13 << i13) | (BITSLL17 << i17);
        }
        if (primesNotSieved == 5) { // Primes 2, 3, 5, 7, 11 are not sieved
            mask2 = (BITSLL13 << i13) | (BITSLL17 << i17);
        }
        if (primesNotSieved == 6) { // Primes 2, 3, 5, 7, 11, 13 are not sieved
            mask2 = BITSLL17 << i17;
        }
        mask2 |= (BITSLL19 << i19) | (BITSLL23 << i23);
        mask2 |= (BITSLL29 << i29) | (BITSLL31 << i31);

        if (primesNotSieved == 4) { // Primes 2, 3, 5, 7 are not sieved
            i11 = bump_mod_p(i11, -32, 11);
            i13 = bump_mod_p(i13, -32, 13);
            i17 = bump_mod_p(i17, -32, 17);
        }
        if (primesNotSieved == 5) { // Primes 2, 3, 5, 7, 11 are not sieved
            i13 = bump_mod_p(i13, -32, 13);
            i17 = bump_mod_p(i17, -32, 17);
        }
        if (primesNotSieved == 6) { // Primes 2, 3, 5, 7, 11, 13 are not sieved
            i17 = bump_mod_p(i17, -32, 17);
        }
        i19 = bump_mod_p(i19, -32, 19);
        i23 = bump_mod_p(i23, -32, 23);
        i29 = bump_mod_p(i29, -32, 29);
        i31 = bump_mod_p(i31, -32, 31);

        if (primesNotSieved == 4) { // Primes 2, 3, 5, 7 are not sieved
            mask3 = (BITSLL11 << i11) | (BITSLL13 << i13) | (BITSLL17 << i17);
        }
        if (primesNotSieved == 5) { // Primes 2, 3, 5, 7, 11 are not sieved
            mask3 = (BITSLL13 << i13) | (BITSLL17 << i17);
        }
        if (primesNotSieved == 6) { // Primes 2, 3, 5, 7, 11, 13 are not sieved
            mask3 = BITSLL17 << i17;
        }
        mask3 |= (BITSLL19 << i19) | (BITSLL23 << i23);
        mask3 |= (BITSLL29 << i29) | (BITSLL31 << i31);

        if (primesNotSieved == 4) { // Primes 2, 3, 5, 7 are not sieved
            i11 = bump_mod_p(i11, -32, 11);
            i13 = bump_mod_p(i13, -32, 13);
            i17 = bump_mod_p(i17, -32, 17);
        }
        if (primesNotSieved == 5) { // Primes 2, 3, 5, 7, 11 are not sieved
            i13 = bump_mod_p(i13, -32, 13);
            i17 = bump_mod_p(i17, -32, 17);
        }
        if (primesNotSieved == 6) { // Primes 2, 3, 5, 7, 11, 13 are not sieved
            i17 = bump_mod_p(i17, -32, 17);
        }
        i19 = bump_mod_p(i19, -32, 19);
        i23 = bump_mod_p(i23, -32, 23);
        i29 = bump_mod_p(i29, -32, 29);
        i31 = bump_mod_p(i31, -32, 31);

        if (primesNotSieved == 4) { // Primes 2, 3, 5, 7 are not sieved
            mask4 = (BITSLL11 << i11) | (BITSLL13 << i13) | (BITSLL17 << i17);
        }
        if (primesNotSieved == 5) { // Primes 2, 3, 5, 7, 11 are not sieved
            mask4 = (BITSLL13 << i13) | (BITSLL17 << i17);
        }
        if (primesNotSieved == 6) { // Primes 2, 3, 5, 7, 11, 13 are not sieved
            mask4 = BITSLL17 << i17;
        }
        mask4 |= (BITSLL19 << i19) | (BITSLL23 << i23);
        mask4 |= (BITSLL29 << i29) | (BITSLL31 << i31);

        if (primesNotSieved == 4) { // Primes 2, 3, 5, 7 are not sieved
            i11 = bump_mod_p(i11, -32, 11);
            i13 = bump_mod_p(i13, -32, 13);
            i17 = bump_mod_p(i17, -32, 17);
        }
        if (primesNotSieved == 5) { // Primes 2, 3, 5, 7, 11 are not sieved
            i13 = bump_mod_p(i13, -32, 13);
            i17 = bump_mod_p(i17, -32, 17);
        }
        if (primesNotSieved == 6) { // Primes 2, 3, 5, 7, 11, 13 are not sieved
            i17 = bump_mod_p(i17, -32, 17);
        }
        i19 = bump_mod_p(i19, -32, 19);
        i23 = bump_mod_p(i23, -32, 23);
        i29 = bump_mod_p(i29, -32, 29);
        i31 = bump_mod_p(i31, -32, 31);

        locsieve32[threadIdx.x * block_size / threadsPerBlock / 32 + 0] = mask;
        locsieve32[threadIdx.x * block_size / threadsPerBlock / 32 + 1] = mask2;
        locsieve32[threadIdx.x * block_size / threadsPerBlock / 32 + 2] = mask3;
        locsieve32[threadIdx.x * block_size / threadsPerBlock / 32 + 3] = mask4;

        if (primesNotSieved == 4) { // Primes 2, 3, 5, 7 are not sieved
            mask = (BITSLL11 << i11) | (BITSLL13 << i13) | (BITSLL17 << i17);
        }
        if (primesNotSieved == 5) { // Primes 2, 3, 5, 7, 11 are not sieved
            mask = (BITSLL13 << i13) | (BITSLL17 << i17);
        }
        if (primesNotSieved == 6) { // Primes 2, 3, 5, 7, 11, 13 are not sieved
            mask = BITSLL17 << i17;
        }
        mask |= (BITSLL19 << i19) | (BITSLL23 << i23);
        mask |= (BITSLL29 << i29) | (BITSLL31 << i31);

        if (primesNotSieved == 4) { // Primes 2, 3, 5, 7 are not sieved
            i11 = bump_mod_p(i11, -32, 11);
            i13 = bump_mod_p(i13, -32, 13);
            i17 = bump_mod_p(i17, -32, 17);
        }
        if (primesNotSieved == 5) { // Primes 2, 3, 5, 7, 11 are not sieved
            i13 = bump_mod_p(i13, -32, 13);
            i17 = bump_mod_p(i17, -32, 17);
        }
        if (primesNotSieved == 6) { // Primes 2, 3, 5, 7, 11, 13 are not sieved
            i17 = bump_mod_p(i17, -32, 17);
        }
        i19 = bump_mod_p(i19, -32, 19);
        i23 = bump_mod_p(i23, -32, 23);
        i29 = bump_mod_p(i29, -32, 29);
        i31 = bump_mod_p(i31, -32, 31);

        if (primesNotSieved == 4) { // Primes 2, 3, 5, 7 are not sieved
            mask2 = (BITSLL11 << i11) | (BITSLL13 << i13) | (BITSLL17 << i17);
        }
        if (primesNotSieved == 5) { // Primes 2, 3, 5, 7, 11 are not sieved
            mask2 = (BITSLL13 << i13) | (BITSLL17 << i17);
        }
        if (primesNotSieved == 6) { // Primes 2, 3, 5, 7, 11, 13 are not sieved
            mask2 = BITSLL17 << i17;
        }
        mask2 |= (BITSLL19 << i19) | (BITSLL23 << i23);
        mask2 |= (BITSLL29 << i29) | (BITSLL31 << i31);

        if (primesNotSieved == 4) { // Primes 2, 3, 5, 7 are not sieved
            i11 = bump_mod_p(i11, -32, 11);
            i13 = bump_mod_p(i13, -32, 13);
            i17 = bump_mod_p(i17, -32, 17);
        }
        if (primesNotSieved == 5) { // Primes 2, 3, 5, 7, 11 are not sieved
            i13 = bump_mod_p(i13, -32, 13);
            i17 = bump_mod_p(i17, -32, 17);
        }
        if (primesNotSieved == 6) { // Primes 2, 3, 5, 7, 11, 13 are not sieved
            i17 = bump_mod_p(i17, -32, 17);
        }
        i19 = bump_mod_p(i19, -32, 19);
        i23 = bump_mod_p(i23, -32, 23);
        i29 = bump_mod_p(i29, -32, 29);
        i31 = bump_mod_p(i31, -32, 31);

        if (primesNotSieved == 4) { // Primes 2, 3, 5, 7 are not sieved
            mask3 = (BITSLL11 << i11) | (BITSLL13 << i13) | (BITSLL17 << i17);
        }
        if (primesNotSieved == 5) { // Primes 2, 3, 5, 7, 11 are not sieved
            mask3 = (BITSLL13 << i13) | (BITSLL17 << i17);
        }
        if (primesNotSieved == 6) { // Primes 2, 3, 5, 7, 11, 13 are not sieved
            mask3 = BITSLL17 << i17;
        }
        mask3 |= (BITSLL19 << i19) | (BITSLL23 << i23);
        mask3 |= (BITSLL29 << i29) | (BITSLL31 << i31);

        if (primesNotSieved == 4) { // Primes 2, 3, 5, 7 are not sieved
            i11 = bump_mod_p(i11, -32, 11);
            i13 = bump_mod_p(i13, -32, 13);
            i17 = bump_mod_p(i17, -32, 17);
        }
        if (primesNotSieved == 5) { // Primes 2, 3, 5, 7, 11 are not sieved
            i13 = bump_mod_p(i13, -32, 13);
            i17 = bump_mod_p(i17, -32, 17);
        }
        if (primesNotSieved == 6) { // Primes 2, 3, 5, 7, 11, 13 are not sieved
            i17 = bump_mod_p(i17, -32, 17);
        }
        i19 = bump_mod_p(i19, -32, 19);
        i23 = bump_mod_p(i23, -32, 23);
        i29 = bump_mod_p(i29, -32, 29);
        i31 = bump_mod_p(i31, -32, 31);

        if (primesNotSieved == 4) { // Primes 2, 3, 5, 7 are not sieved
            mask4 = (BITSLL11 << i11) | (BITSLL13 << i13) | (BITSLL17 << i17);
        }
        if (primesNotSieved == 5) { // Primes 2, 3, 5, 7, 11 are not sieved
            mask4 = (BITSLL13 << i13) | (BITSLL17 << i17);
        }
        if (primesNotSieved == 6) { // Primes 2, 3, 5, 7, 11, 13 are not sieved
            mask4 = BITSLL17 << i17;
        }
        mask4 |= (BITSLL19 << i19) | (BITSLL23 << i23);
        mask4 |= (BITSLL29 << i29) | (BITSLL31 << i31);

        locsieve32[threadIdx.x * block_size / threadsPerBlock / 32 + 4] = mask;
        locsieve32[threadIdx.x * block_size / threadsPerBlock / 32 + 5] = mask2;
        locsieve32[threadIdx.x * block_size / threadsPerBlock / 32 + 6] = mask3;
        locsieve32[threadIdx.x * block_size / threadsPerBlock / 32 + 7] = mask4;

        // The following handles primes, 32 < p < 64.  Each prime hits 0 or 1 32-bit words.

        i37 = mod_const_p(bit_to_clr[11] - thread_start, 37); // compute bit to clear for prime 37
        i41 = mod_const_p(bit_to_clr[12] - thread_start, 41); // compute bit to clear for prime 41
        i43 = mod_const_p(bit_to_clr[13] - thread_start, 43); // compute bit to clear for prime 43
        i47 = mod_const_p(bit_to_clr[14] - thread_start, 47); // compute bit to clear for prime 47
        i53 = mod_const_p(bit_to_clr[15] - thread_start, 53); // compute bit to clear for prime 53
        i59 = mod_const_p(bit_to_clr[16] - thread_start, 59); // compute bit to clear for prime 59
        i61 = mod_const_p(bit_to_clr[17] - thread_start, 61); // compute bit to clear for prime 61

        for (j = 0;;) {
            mask = 1 << i37;
            mask |= (1 << i41) | (1 << i43);
            mask |= (1 << i47) | (1 << i53);
            mask |= (1 << i59) | (1 << i61);

            locsieve32[threadIdx.x * block_size / threadsPerBlock / 32 + j] |= mask;

            j++;
            if (j == block_size / threadsPerBlock / 32) break;

            i37 = bump_mod_p(i37, -32, 37);
            i41 = bump_mod_p(i41, -32, 41);
            i43 = bump_mod_p(i43, -32, 43);
            i47 = bump_mod_p(i47, -32, 47);
            i53 = bump_mod_p(i53, -32, 53);
            i59 = bump_mod_p(i59, -32, 59);
            i61 = bump_mod_p(i61, -32, 61);
        }
    }

    // The following handles primes 64 < p < 128.
    // Each thread handles one 64-bit word of the 256-bit section of shared memory.
    // Each prime will hit a 64-bit word zero or one time.

    if (primesNotSieved + primesHandledWithSpecialCode > 18) {
        uint32 i67, i71, i73, i79, i83, i89, i97, i101, i103, i107, i109, i113, i127;
        uint64 mask;

        i67 = mod_const_p(bit_to_clr[18] - thread_start, 67); // compute bit to clear for prime 67
        i71 = mod_const_p(bit_to_clr[19] - thread_start, 71); // compute bit to clear for prime 71
        i73 = mod_const_p(bit_to_clr[20] - thread_start, 73); // compute bit to clear for prime 73
        i79 = mod_const_p(bit_to_clr[21] - thread_start, 79); // compute bit to clear for prime 79
        i83 = mod_const_p(bit_to_clr[22] - thread_start, 83); // compute bit to clear for prime 83
        i89 = mod_const_p(bit_to_clr[23] - thread_start, 89); // compute bit to clear for prime 89
        i97 = mod_const_p(bit_to_clr[24] - thread_start, 97); // compute bit to clear for prime 97

        for (j = 0;;) {
            mask = (uint64)1 << i67;
            mask |= ((uint64)1 << i71) | ((uint64)1 << i73);
            mask |= ((uint64)1 << i79) | ((uint64)1 << i83);
            mask |= ((uint64)1 << i89) | ((uint64)1 << i97);

            locsieve64[threadIdx.x * block_size / threadsPerBlock / 64 + j] |= mask;

            j++;
            if (j == block_size / threadsPerBlock / 64) break;

            i67 = bump_mod_p(i67, -64, 67);
            i71 = bump_mod_p(i71, -64, 71);
            i73 = bump_mod_p(i73, -64, 73);
            i79 = bump_mod_p(i79, -64, 79);
            i83 = bump_mod_p(i83, -64, 83);
            i89 = bump_mod_p(i89, -64, 89);
            i97 = bump_mod_p(i97, -64, 97);
        }

        i101 = mod_const_p(bit_to_clr[25] - thread_start, 101); // compute bit to clear for prime 101
        i103 = mod_const_p(bit_to_clr[26] - thread_start, 103); // compute bit to clear for prime 103
        i107 = mod_const_p(bit_to_clr[27] - thread_start, 107); // compute bit to clear for prime 107
        i109 = mod_const_p(bit_to_clr[28] - thread_start, 109); // compute bit to clear for prime 109
        i113 = mod_const_p(bit_to_clr[29] - thread_start, 113); // compute bit to clear for prime 113
        i127 = mod_const_p(bit_to_clr[30] - thread_start, 127); // compute bit to clear for prime 127

        for (j = 0;;) {
            mask = (uint64)1 << i101;
            mask |= ((uint64)1 << i103) | ((uint64)1 << i107);
            mask |= ((uint64)1 << i109) | ((uint64)1 << i113);
            mask |= (uint64)1 << i127;

            locsieve64[threadIdx.x * block_size / threadsPerBlock / 64 + j] |= mask;

            j++;
            if (j == block_size / threadsPerBlock / 64) break;

            i101 = bump_mod_p(i101, -64, 101);
            i103 = bump_mod_p(i103, -64, 103);
            i107 = bump_mod_p(i107, -64, 107);
            i109 = bump_mod_p(i109, -64, 109);
            i113 = bump_mod_p(i113, -64, 113);
            i127 = bump_mod_p(i127, -64, 127);
        }
    }

    // The following handles primes 128 < p < 256.
    // Each thread handles one 128-bit word of the 256-bit section of shared memory.
    // Each prime will hit a 128-bit word zero or one time.

    if (primesNotSieved + primesHandledWithSpecialCode > 31) {
        uint32 i131, i137, i139, i149, i151, i157, i163, i167, i173, i179, i181, i191;
        uint32 i193, i197, i199, i211, i223, i227, i229, i233, i239, i241, i251;
        uint64 mask1, mask2;

        i131 = mod_const_p(bit_to_clr[31] - thread_start, 131); // compute bit to clear for prime 131
        i137 = mod_const_p(bit_to_clr[32] - thread_start, 137); // compute bit to clear for prime 137
        i139 = mod_const_p(bit_to_clr[33] - thread_start, 139); // compute bit to clear for prime 139
        i149 = mod_const_p(bit_to_clr[34] - thread_start, 149); // compute bit to clear for prime 149
        i151 = mod_const_p(bit_to_clr[35] - thread_start, 151); // compute bit to clear for prime 151
        i157 = mod_const_p(bit_to_clr[36] - thread_start, 157); // compute bit to clear for prime 157

        for (j = 0;;) {
            mask1 = ((uint64)1 << i131) | ((uint64)1 << i137);
            mask1 |= ((uint64)1 << i139) | ((uint64)1 << i149);
            mask1 |= ((uint64)1 << i151) | ((uint64)1 << i157);
            mask2 = ((uint64)1 << (i131 - 64)) | ((uint64)1 << (i137 - 64));
            mask2 |= ((uint64)1 << (i139 - 64)) | ((uint64)1 << (i149 - 64));
            mask2 |= ((uint64)1 << (i151 - 64)) | ((uint64)1 << (i157 - 64));

            locsieve64[threadIdx.x * block_size / threadsPerBlock / 64 + j * 2] |= mask1;
            locsieve64[threadIdx.x * block_size / threadsPerBlock / 64 + j * 2 + 1] |= mask2;

            j++;
            if (j == block_size / threadsPerBlock / 128) break;

            i131 = bump_mod_p(i131, -128, 131);
            i137 = bump_mod_p(i137, -128, 137);
            i139 = bump_mod_p(i139, -128, 139);
            i149 = bump_mod_p(i149, -128, 149);
            i151 = bump_mod_p(i151, -128, 151);
            i157 = bump_mod_p(i157, -128, 157);
        }

        i163 = mod_const_p(bit_to_clr[37] - thread_start, 163); // compute bit to clear for prime 163
        i167 = mod_const_p(bit_to_clr[38] - thread_start, 167); // compute bit to clear for prime 167
        i173 = mod_const_p(bit_to_clr[39] - thread_start, 173); // compute bit to clear for prime 173
        i179 = mod_const_p(bit_to_clr[40] - thread_start, 179); // compute bit to clear for prime 179
        i181 = mod_const_p(bit_to_clr[41] - thread_start, 181); // compute bit to clear for prime 181
        i191 = mod_const_p(bit_to_clr[42] - thread_start, 191); // compute bit to clear for prime 191

        for (j = 0;;) {
            mask1 = ((uint64)1 << i163) | ((uint64)1 << i167);
            mask1 |= ((uint64)1 << i173) | ((uint64)1 << i179);
            mask1 |= ((uint64)1 << i181) | ((uint64)1 << i191);
            mask2 = ((uint64)1 << (i163 - 64)) | ((uint64)1 << (i167 - 64));
            mask2 |= ((uint64)1 << (i173 - 64)) | ((uint64)1 << (i179 - 64));
            mask2 |= ((uint64)1 << (i181 - 64)) | ((uint64)1 << (i191 - 64));

            locsieve64[threadIdx.x * block_size / threadsPerBlock / 64 + j * 2] |= mask1;
            locsieve64[threadIdx.x * block_size / threadsPerBlock / 64 + j * 2 + 1] |= mask2;

            j++;
            if (j == block_size / threadsPerBlock / 128) break;

            i163 = bump_mod_p(i163, -128, 163);
            i167 = bump_mod_p(i167, -128, 167);
            i173 = bump_mod_p(i173, -128, 173);
            i179 = bump_mod_p(i179, -128, 179);
            i181 = bump_mod_p(i181, -128, 181);
            i191 = bump_mod_p(i191, -128, 191);
        }

        i193 = mod_const_p(bit_to_clr[43] - thread_start, 193); // compute bit to clear for prime 193
        i197 = mod_const_p(bit_to_clr[44] - thread_start, 197); // compute bit to clear for prime 197
        i199 = mod_const_p(bit_to_clr[45] - thread_start, 199); // compute bit to clear for prime 199
        i211 = mod_const_p(bit_to_clr[46] - thread_start, 211); // compute bit to clear for prime 211
        i223 = mod_const_p(bit_to_clr[47] - thread_start, 223); // compute bit to clear for prime 223
        i227 = mod_const_p(bit_to_clr[48] - thread_start, 227); // compute bit to clear for prime 227

        for (j = 0;;) {
            mask1 = ((uint64)1 << i193) | ((uint64)1 << i197);
            mask1 |= ((uint64)1 << i199) | ((uint64)1 << i211);
            mask1 |= ((uint64)1 << i223) | ((uint64)1 << i227);
            mask2 = ((uint64)1 << (i193 - 64)) | ((uint64)1 << (i197 - 64));
            mask2 |= ((uint64)1 << (i199 - 64)) | ((uint64)1 << (i211 - 64));
            mask2 |= ((uint64)1 << (i223 - 64)) | ((uint64)1 << (i227 - 64));

            locsieve64[threadIdx.x * block_size / threadsPerBlock / 64 + j * 2] |= mask1;
            locsieve64[threadIdx.x * block_size / threadsPerBlock / 64 + j * 2 + 1] |= mask2;

            j++;
            if (j == block_size / threadsPerBlock / 128) break;

            i193 = bump_mod_p(i193, -128, 193);
            i197 = bump_mod_p(i197, -128, 197);
            i199 = bump_mod_p(i199, -128, 199);
            i211 = bump_mod_p(i211, -128, 211);
            i223 = bump_mod_p(i223, -128, 223);
            i227 = bump_mod_p(i227, -128, 227);
        }

        i229 = mod_const_p(bit_to_clr[49] - thread_start, 229); // compute bit to clear for prime 229
        i233 = mod_const_p(bit_to_clr[50] - thread_start, 233); // compute bit to clear for prime 233
        i239 = mod_const_p(bit_to_clr[51] - thread_start, 239); // compute bit to clear for prime 239
        i241 = mod_const_p(bit_to_clr[52] - thread_start, 241); // compute bit to clear for prime 241
        i251 = mod_const_p(bit_to_clr[53] - thread_start, 251); // compute bit to clear for prime 251

        for (j = 0;;) {
            mask1 = (uint64)1 << i229;
            mask1 |= ((uint64)1 << i233) | ((uint64)1 << i239);
            mask1 |= ((uint64)1 << i241) | ((uint64)1 << i251);
            mask2 = (uint64)1 << (i229 - 64);
            mask2 |= ((uint64)1 << (i233 - 64)) | ((uint64)1 << (i239 - 64));
            mask2 |= ((uint64)1 << (i241 - 64)) | ((uint64)1 << (i251 - 64));

            locsieve64[threadIdx.x * block_size / threadsPerBlock / 64 + j * 2] |= mask1;
            locsieve64[threadIdx.x * block_size / threadsPerBlock / 64 + j * 2 + 1] |= mask2;

            j++;
            if (j == block_size / threadsPerBlock / 128) break;

            i229 = bump_mod_p(i229, -128, 229);
            i233 = bump_mod_p(i233, -128, 233);
            i239 = bump_mod_p(i239, -128, 239);
            i241 = bump_mod_p(i241, -128, 241);
            i251 = bump_mod_p(i251, -128, 251);
        }
    }

    // The following handles primes 256 < p < 512.
    // Each thread handles one 256-bit word of the 256-bit section of shared memory.
    // Each prime will hit a 256-bit word zero or one time.

#define SIEVE_256_BIT(n, p)                                                                        \
    i = mod_const_p(bit_to_clr[n] - thread_start, p);                                              \
    if (i < 256) locsieve[j * threadsPerBlock * 32 + threadIdx.x * 32 + (i >> 3)] |= 1 << (i & 7);

    if (primesNotSieved + primesHandledWithSpecialCode > 54)
        for (j = 0; j < block_size / (threadsPerBlock * 256); j++) {
            SIEVE_256_BIT(54, 257);
            SIEVE_256_BIT(55, 263);
            SIEVE_256_BIT(56, 269);
            SIEVE_256_BIT(57, 271);
            SIEVE_256_BIT(58, 277);
            SIEVE_256_BIT(59, 281);
            SIEVE_256_BIT(60, 283);
            SIEVE_256_BIT(61, 293);
            SIEVE_256_BIT(62, 307);
            SIEVE_256_BIT(63, 311);
            SIEVE_256_BIT(64, 313);
            SIEVE_256_BIT(65, 317);
            SIEVE_256_BIT(66, 331);
            SIEVE_256_BIT(67, 337);
            SIEVE_256_BIT(68, 347);
            SIEVE_256_BIT(69, 349);
            SIEVE_256_BIT(70, 353);
            SIEVE_256_BIT(71, 359);
            SIEVE_256_BIT(72, 367);
            SIEVE_256_BIT(73, 373);
            SIEVE_256_BIT(74, 379);
            SIEVE_256_BIT(75, 383);
            SIEVE_256_BIT(76, 389);
            SIEVE_256_BIT(77, 397);
            SIEVE_256_BIT(78, 401);
            SIEVE_256_BIT(79, 409);
            SIEVE_256_BIT(80, 419);
            SIEVE_256_BIT(81, 421);
            SIEVE_256_BIT(82, 431);
            SIEVE_256_BIT(83, 433);
            SIEVE_256_BIT(84, 439);
            SIEVE_256_BIT(85, 443);
            SIEVE_256_BIT(86, 449);
            SIEVE_256_BIT(87, 457);
            SIEVE_256_BIT(88, 461);
            SIEVE_256_BIT(89, 463);
            SIEVE_256_BIT(90, 467);
            SIEVE_256_BIT(91, 479);
            SIEVE_256_BIT(92, 487);
            SIEVE_256_BIT(93, 491);
            SIEVE_256_BIT(94, 499);
            SIEVE_256_BIT(95, 503);
            SIEVE_256_BIT(96, 509);
        }

#undef bit_to_clr

    // sync before sieving more primes
    __syncthreads();

    // Bump the bit_to_clr_dev pointer to a 256-byte boundary so that warps access
    // memory without crossing memory block boundaries.

    pinfo_dev += PINFO_PAD1;

    // Sieve the first row or two of primes (we could do more but it wasn't helpful) using 8 threads to process each prime.
    // We do this to reduce masking calculations as well as to hopefully reduce
    // shared memory conflicts (we are at least guaranteed the 8 threads processing
    // a single prime will not conflict).  However, we have 8 times as many bclr calculations.
    // Our memory layout here is 16-bits for p, 16-bits for bit-to-clr,
    // 32-bits for pinv (a total of 8 bytes per prime).

    i = 0;
    for (; i < 1 && i < maxp; i++, pinfo_dev += threadsPerBlock * 8) {
        for (j = 0; j < 8; j++) {
            uint8 mask;

            bclr = pinfo32[j * threadsPerBlock / 8 + threadIdx.x / 8]; // Read p and the bit_to_clear in one instruction
            p    = bclr >> 16;
            bclr &= 0xFFFF;
            pinv = pinfo32[threadsPerBlock + j * threadsPerBlock / 8 + threadIdx.x / 8];
            validate_bclr(bclr, p);

            bclr = mod_p(bclr - block_start, p, pinv) + (threadIdx.x & 7) * p;
            mask = 1 << (bclr & 7);
            bclr = bclr >> 3;

            // Clear bits
            do {
                uint8 val = locsieve8[bclr];
                if (!(val & mask)) locsieve8[bclr] = val | mask;
                bclr += p;
            } while (bclr < block_size_in_bytes);
        }
    }

    // Sieve the primes below 64K (there are 6542 primes below 64K)
    // Our memory layout here is 16-bits for p, 16-bits for bit-to-clr,
    // 32-bits for pinv (a total of 8 bytes per prime).

    for (; i < sieving64KCrossover && i < maxp; i += 3, pinfo_dev += threadsPerBlock * 24) {
        uint32 p3, pinv3, bclr3, p2, pinv2, bclr2;

        bclr3 = pinfo32[threadIdx.x]; // Read p and the bit_to_clear in one instruction
        bclr2 = pinfo32[threadsPerBlock * 2 + threadIdx.x];
        bclr  = pinfo32[threadsPerBlock * 4 + threadIdx.x];

        p3 = bclr3 >> 16;
        p2 = bclr2 >> 16;
        p  = bclr >> 16;

        bclr3 &= 0xFFFF;
        bclr2 &= 0xFFFF;
        bclr &= 0xFFFF;

        validate_bclr(bclr3, p3);
        validate_bclr(bclr2, p2);
        validate_bclr(bclr, p);

        pinv3 = pinfo32[threadsPerBlock + threadIdx.x];
        pinv2 = pinfo32[threadsPerBlock * 3 + threadIdx.x];
        pinv  = pinfo32[threadsPerBlock * 5 + threadIdx.x];

        bclr3 = mod_p(bclr3 - block_start, p3, pinv3);
        bclr2 = mod_p(bclr2 - block_start, p2, pinv2);
        bclr  = mod_p(bclr - block_start, p, pinv);

        // Clear bits (assumes 64K bitmap)
        do {
            bitOrSometimesIffy(locsieve, bclr3);
            bclr3 += p3;
        } while (bclr3 < block_size);
        do {
            bitOrSometimesIffy(locsieve, bclr2);
            bclr2 += p2;
        } while (bclr2 < block_size);
        do {
            bitOrSometimesIffy(locsieve, bclr);
            bclr += p;
        } while (bclr < block_size);
    }

    // We need one transitional loop to crossover the 64K boundary.  This will get us to the point
    // where all remaining primes to sieve are above 64K.
    // We need one more transitional loop to switch to a memory layout that lets us cram all needed info in 32-bits.
    // Our memory layout here is 32-bits for bit-to-clr, 32-bits for p, 32-bits for pinv.

    if (i < maxp) {
        uint32 bclr2, pinv2, p2;

        bclr2 = pinfo32[threadIdx.x];
        pinv2 = pinfo32[threadsPerBlock + threadIdx.x];
        p2    = pinfo32[threadsPerBlock * 2 + threadIdx.x];
        validate_bclr(bclr2, p2);

        bclr2 = mod_p_above64k(bclr2 - block_start, p2, pinv2);

        // Clear (rarely) 0, 1 or (rarely) 2 bits (bug: assumes block_size = 64K)
        if (bclr2 < block_size) {
            bitOr(locsieve, bclr2);
            bclr2 += p2;
            if (bclr2 < block_size) bitOr(locsieve, bclr2);
        }

        bclr = pinfo32[threadsPerBlock * 3 + threadIdx.x];
        pinv = pinfo32[threadsPerBlock * 4 + threadIdx.x];
        p    = pinfo32[threadsPerBlock * 5 + threadIdx.x];
        validate_bclr(bclr, p);

        bclr = mod_p_above64k(bclr - block_start, p, pinv);

        // Clear (rarely) 0, 1 or (rarely) 2 bits (bug: assumes block_size = 64K)
        if (bclr < block_size) bitOr(locsieve, bclr);
        i += 2, pinfo_dev += threadsPerBlock * 24;
    }

    // Sieve primes up to and including the row containing the first 18-bit prime (more than 128K).
    // Our memory layout here is 18-bits for bit-to-clr, 7-bits for (p difference) / 2, 7-bits for pinv difference.

    for (; i < sieving128KCrossover + 1 && i < maxp; i += 3, pinfo_dev += threadsPerBlock * 12) {
        uint32 tmp3 = pinfo32[threadIdx.x];
        uint32 tmp2 = pinfo32[threadsPerBlock + threadIdx.x];
        uint32 tmp  = pinfo32[threadsPerBlock * 2 + threadIdx.x];
        uint32 bclr3, p3, pinv3, bclr2, p2, pinv2;

        bclr3 = tmp3 & 0x0003FFFF;
        bclr2 = tmp2 & 0x0003FFFF;
        bclr  = tmp & 0x0003FFFF;

        pinv3 = pinv - (tmp3 >> 25);
        pinv2 = pinv - (tmp2 >> 25);
        pinv -= tmp >> 25;

        p3 = p + ((tmp3 & 0x01FC0000) >> 17);
        p2 = p + ((tmp2 & 0x01FC0000) >> 17);
        p += (tmp & 0x01FC0000) >> 17;

        validate_bclr(bclr3, p3);
        validate_bclr(bclr2, p2);
        validate_bclr(bclr, p);

        bclr3 = mod_p_above64k(bclr3 - block_start, p3, pinv3);
        bclr2 = mod_p_above64k(bclr2 - block_start, p2, pinv2);
        bclr  = mod_p_above64k(bclr - block_start, p, pinv);

        // Optionally clear bit (bug: assumes block_size <= 64K)
        if (bclr3 < block_size) bitOr(locsieve, bclr3);
        if (bclr2 < block_size) bitOr(locsieve, bclr2);
        if (bclr < block_size) bitOr(locsieve, bclr);
    }

    // We need one transitional loop which handles the first complete row containing primes above 128K.
    // Our memory layout here is 32-bits for bit-to-clr, 32-bits for p, 32-bits for pinv.

    if (i < maxp) {
        bclr = pinfo32[threadIdx.x];
        pinv = pinfo32[threadsPerBlock + threadIdx.x];
        p    = pinfo32[threadsPerBlock * 2 + threadIdx.x];
        validate_bclr(bclr, p);

        bclr = sloppy_mod_p(bclr - block_start, p, pinv);

        // Optionally clear bit
        if (bclr < block_size) bitOr(locsieve, bclr);
        i++, pinfo_dev += threadsPerBlock * 12;
    }

    // Sieve the primes above 128K up to 1M.
    // Our memory layout here is 20-bits for bit-to-clr, 7-bits for (p difference) / 2, 5-bits for pinv difference.
    // Primes above 128K can use SLOPPY_MOD.

    for (; i < sieving1MCrossover && i < maxp; i += 4, pinfo_dev += threadsPerBlock * 16) {
        uint32 tmp4 = pinfo32[threadIdx.x];
        uint32 tmp3 = pinfo32[threadsPerBlock + threadIdx.x];
        uint32 tmp2 = pinfo32[threadsPerBlock * 2 + threadIdx.x];
        uint32 tmp  = pinfo32[threadsPerBlock * 3 + threadIdx.x];
        uint32 bclr4, p4, pinv4, bclr3, p3, pinv3, bclr2, p2, pinv2;

        bclr4 = tmp4 & 0x000FFFFF;
        bclr3 = tmp3 & 0x000FFFFF;
        bclr2 = tmp2 & 0x000FFFFF;
        bclr  = tmp & 0x000FFFFF;

        pinv4 = pinv - (tmp4 >> 27);
        pinv3 = pinv - (tmp3 >> 27);
        pinv2 = pinv - (tmp2 >> 27);
        pinv -= tmp >> 27;

        p4 = p + ((tmp4 & 0x07F00000) >> 19);
        p3 = p + ((tmp3 & 0x07F00000) >> 19);
        p2 = p + ((tmp2 & 0x07F00000) >> 19);
        p += (tmp & 0x07F00000) >> 19;

        validate_bclr(bclr4, p4);
        validate_bclr(bclr3, p3);
        validate_bclr(bclr2, p2);
        validate_bclr(bclr, p);

        bclr4 = sloppy_mod_p(bclr4 - block_start, p4, pinv4);
        bclr3 = sloppy_mod_p(bclr3 - block_start, p3, pinv3);
        bclr2 = sloppy_mod_p(bclr2 - block_start, p2, pinv2);
        bclr  = sloppy_mod_p(bclr - block_start, p, pinv);

        // Optionally clear bit
        if (bclr4 < block_size) bitOr(locsieve, bclr4);
        if (bclr3 < block_size) bitOr(locsieve, bclr3);
        if (bclr2 < block_size) bitOr(locsieve, bclr2);
        if (bclr < block_size) bitOr(locsieve, bclr);
    }

    // We need one transitional loop to switch to a memory layout that again lets us cram all needed info in 32-bits.
    // Our memory layout here is 32-bits for bit-to-clr, 32-bits for p, 32-bits for pinv.

    if (i < maxp) {
        bclr = pinfo32[threadIdx.x];
        pinv = pinfo32[threadsPerBlock + threadIdx.x];
        p    = pinfo32[threadsPerBlock * 2 + threadIdx.x];
        validate_bclr(bclr, p);

        bclr = sloppy_mod_p(bclr - block_start, p, pinv);

        // Optionally clear bit
        if (bclr < block_size) bitOr(locsieve, bclr);
        i++, pinfo_dev += threadsPerBlock * 12;
    }

    // Sieve the primes above 1M up to 16M.
    // Our memory layout here is 24-bits for bit-to-clr, 7-bits for (p difference) / 2, 1-bit for pinv difference.

    for (; i < maxp; i += 4, pinfo_dev += threadsPerBlock * 16) {
        uint32 tmp4 = pinfo32[threadIdx.x];
        uint32 tmp3 = pinfo32[threadsPerBlock + threadIdx.x];
        uint32 tmp2 = pinfo32[threadsPerBlock * 2 + threadIdx.x];
        uint32 tmp  = pinfo32[threadsPerBlock * 3 + threadIdx.x];
        uint32 bclr4, p4, pinv4, bclr3, p3, pinv3, bclr2, p2, pinv2;

        bclr4 = tmp4 & 0x00FFFFFF;
        bclr3 = tmp3 & 0x00FFFFFF;
        bclr2 = tmp2 & 0x00FFFFFF;
        bclr  = tmp & 0x00FFFFFF;

        pinv4 = pinv - (tmp4 >> 31);
        pinv3 = pinv - (tmp3 >> 31);
        pinv2 = pinv - (tmp2 >> 31);
        pinv -= tmp >> 31;

        p4 = p + ((tmp4 & 0x7F000000) >> 23);
        p3 = p + ((tmp3 & 0x7F000000) >> 23);
        p2 = p + ((tmp2 & 0x7F000000) >> 23);
        p += (tmp & 0x7F000000) >> 23;

        validate_bclr(bclr4, p4);
        validate_bclr(bclr3, p3);
        validate_bclr(bclr2, p2);
        validate_bclr(bclr, p);

        bclr4 = sloppy_mod_p(bclr4 - block_start, p4, pinv4);
        bclr3 = sloppy_mod_p(bclr3 - block_start, p3, pinv3);
        bclr2 = sloppy_mod_p(bclr2 - block_start, p2, pinv2);
        bclr  = sloppy_mod_p(bclr - block_start, p, pinv);

        // Optionally clear bit
        if (bclr4 < block_size) bitOr(locsieve, bclr4);
        if (bclr3 < block_size) bitOr(locsieve, bclr3);
        if (bclr2 < block_size) bitOr(locsieve, bclr2);
        if (bclr < block_size) bitOr(locsieve, bclr);
    }

    // sync before copying
    __syncthreads();

    // Copy our shared bit array results to the global big bit array

    // Point to the block of the big bit array we are copying to
    big_bit_array_dev += blockIdx.x * block_size_in_bytes;

    // Have each thread copy a part of the array.
    for (j = 0; j < block_size / (threadsPerBlock * 32); j++)
        big_bit_array32[j * threadsPerBlock + threadIdx.x] = ~locsieve32[j * threadsPerBlock + threadIdx.x];
}

//
// Sieve initialization kernels
//

// Internal device helper: compute 1/n mod orig_d with the extended Euclidean algorithm.
// Only the Bezout coefficient that multiplies n is tracked.  When the remainder reaches
// zero that coefficient is the inverse (this assumes n and orig_d are coprime).
// A negative coefficient is shifted up by orig_d before it is returned.

__device__ unsigned int modularinverse(uint32 n, uint32 orig_d)
{
    uint32 divisor = orig_d;
    int coef_prev  = 1; // coefficient belonging to the current value of n
    int coef_cur   = 0; // coefficient belonging to the current divisor

    while (divisor != 0) {
        int quotient     = n / divisor; // Floor(n / divisor)
        uint32 remainder = n - quotient * divisor; // n mod divisor
        int coef_next    = coef_prev - quotient * coef_cur;

        // Slide the (n, divisor) pair and its coefficients one Euclid step forward
        n         = divisor;
        divisor   = remainder;
        coef_prev = coef_cur;
        coef_cur  = coef_next;
    }

    return (coef_prev < 0) ? (coef_prev + orig_d) : (unsigned int)coef_prev;
}

// Calculate the modular inverses used in computing initial bit-to-clear values.
//
// Each thread handles one sieve prime.  Block 0 covers the primes that are processed with
// special code (indexes primesNotSieved up to, but not including, primesNotSieved +
// primesHandledWithSpecialCode); all other threads of block 0 exit immediately.  Block b >= 1
// covers the b-th group of threadsPerBlock primes that follow the specially handled ones.
// The prime is read from calc_info[MAX_PRIMES_PER_THREAD * 4 + index * 2] and its modular
// inverse is written to the slot right after it.
// NOTE(review): presumably launched with threadsPerBlock threads per block - confirm at the launch site.

__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP) CalcModularInverses(uint32 exponent, int *calc_info)
{
    // Index of the first prime that lives in an official "row" of pinfo_dev
    const uint32 first_row_index = primesNotSieved + primesHandledWithSpecialCode;
    uint32 index; // Index for prime and modinv data in calc_info

    if (blockIdx.x != 0) {
        // Primes that are in "rows" of pinfo_dev: one block per row, one thread per prime.
        index = first_row_index + (blockIdx.x - 1) * threadsPerBlock + threadIdx.x;
    } else {
        // Primes that are processed with special code.  They are not part of an official "row" in pinfo_dev.
        if (threadIdx.x < primesNotSieved || threadIdx.x >= first_row_index) return;
        index = threadIdx.x;
    }

    // Position of this prime's (prime, modinv) pair inside calc_info
    const uint32 slot = MAX_PRIMES_PER_THREAD * 4 + index * 2;

    // The modular inverse is one over the distance between the corresponding factors
    // for two successive k values in a class.
    const uint32 prime   = calc_info[slot];
    const uint64 facdist = (uint64)(2 * NUM_CLASSES) * (uint64)exponent;

    calc_info[slot + 1] = modularinverse((uint32)(facdist % prime), prime);
}

// Calculate the initial bit-to-clear values.
//
// Each thread handles one sieve prime: it reads the prime and its modular inverse from
// calc_info, works out the first bit whose factor candidate is divisible by that prime,
// and stores the result through pinfo_dev.  Block 0 handles the primes that are processed
// with special code (16-bit store); every other block handles one "row" of pinfo_dev
// (masked 32-bit store).  Per-row data (byte offset, first prime index, index stride and
// preserve-mask) comes from four consecutive MAX_PRIMES_PER_THREAD sized tables at the
// start of calc_info.

__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP)
    CalcBitToClear(uint32 exponent, int96 k_base, int *calc_info, uint8 *pinfo_dev)
{
    const bool special_prime = (blockIdx.x == 0); // True for primes outside the official "rows"
    uint32 index; // Index for prime and modinv data in calc_info
    uint32 mask = 0; // Bits that must be preserved in pinfo_dev when setting bit-to-clear (rows only)

    if (special_prime) {
        // Only the threads that map onto a specially handled prime have work to do.
        if (threadIdx.x < primesNotSieved || threadIdx.x >= primesNotSieved + primesHandledWithSpecialCode) return;

        // These primes keep their bit-to-clear in consecutive 16-bit words.
        index = threadIdx.x;
        pinfo_dev += threadIdx.x * 2;
    } else {
        const uint32 row = blockIdx.x - 1;

        // Form the pointer to the start of the "row"
        pinfo_dev += calc_info[row];

        // The distance between bit-to-clear values in the pinfo_dev "row" is always 4 bytes
        pinfo_dev += threadIdx.x * 4;

        // Index of the row's first prime, advanced by this thread's share of the row's prime stride
        index = calc_info[MAX_PRIMES_PER_THREAD + row] + threadIdx.x * calc_info[MAX_PRIMES_PER_THREAD * 2 + row];

        // Mask to apply to the word where we set the bit-to-clear value
        mask = calc_info[MAX_PRIMES_PER_THREAD * 3 + row];
    }

    // Read the prime and its modular inverse (distance between successive factors mod prime)
    const uint32 prime  = calc_info[MAX_PRIMES_PER_THREAD * 4 + index * 2];
    const uint32 modinv = calc_info[MAX_PRIMES_PER_THREAD * 4 + index * 2 + 1];

    // Compute lowest possible value such that the factor (2 * k * exponent + 1) is divisible by our prime
    const uint64 k_mod_p      = (((uint64)k_base.d1 << 32) + k_base.d0) % prime; // k_base mod prime
    const uint64 factor_mod_p = (2 * k_mod_p * exponent + 1) % prime; // factor mod prime
    const uint32 bit_to_clear = ((uint64)prime - factor_mod_p) * modinv % prime;

    // Disabled self-check (kept for debugging): advance k_base by bit_to_clear classes and
    // verify that the resulting factor really is divisible by the prime.
    //k_base.d0 = __add_cc (k_base.d0, __umul32  (bit_to_clear, NUM_CLASSES));
    //k_base.d1 = __addc   (k_base.d1, __umul32hi(bit_to_clear, NUM_CLASSES)); /* k is limited to 2^64 -1 so there is no need for k.d2 */
    //k_mod_p = (((uint64) k_base.d1 << 32) + k_base.d0) % prime;
    //factor_mod_p = (2 * k_mod_p * exponent + 1) % prime;
    //if (factor_mod_p != 0)
    //printf ("FAIL!: %d, %d, %d\n", index, prime, bit_to_clear);

    if (special_prime) {
        // Specially handled primes store bit-to-clear in a 16-bit word.
        *pinfo16 = bit_to_clear;
    } else {
        // Row primes store the bit-to-clear in a masked 32-bit value
        *pinfo32 = (*pinfo32 & mask) + bit_to_clear;
    }
}

//
// Sieve initialization done on the CPU
//

// Simple CPU sieve of Eratosthenes for small limits - not efficient for large limits.
//
// Stores the first `limit` primes (2, 3, 5, ...) in primes[0 .. limit-1].  The caller must
// supply room for at least `limit` entries; nothing beyond primes[limit-1] is written and
// nothing at all is written when limit is 0.  Exits the process if the temporary flag
// array cannot be allocated.

void tiny_soe(uint32 limit, uint32 *primes)
{
    uint8 *flags;
    uint32 prime; // 32 bits wide so that 2 * i + 1 is never truncated
    uint32 it; // Number of primes stored so far
    size_t i, j, sieve_size, sieve_stop;

    // No primes requested: do not touch the (possibly zero-length) output array
    if (limit == 0) return;

    // Allocate flags (assume we can generate N primes by sieving up to 40*N.  We only need flags for odd numbers,
    // so flags[i] stands for the number 2 * i + 1).  Sized in size_t so limit * 40 does not wrap at 32 bits
    // on 64-bit builds.
    sieve_size = (size_t)limit * 40 / 2;
    flags      = (uint8 *)malloc(sieve_size);
    if (flags == NULL) {
        printf("error allocating tiny_soe flags\n");
        exit(1);
    }
    memset(flags, 1, sieve_size);

    // Flag indexes below the sqrt of the sieve limit also cross off their multiples (computed once, not per iteration)
    sieve_stop = (size_t)sqrt((double)limit * 40.0);

    primes[0] = 2;
    it        = 1;

    // Walk the odd numbers in increasing order.  Any flag still set when we reach it marks a prime.
    // Stop as soon as `limit` primes are stored (never write past the caller's array) or the flags run out.
    for (i = 1; it < limit && i < sieve_size; i++) {
        if (flags[i] != 1) continue;

        prime        = (uint32)(2 * i + 1);
        primes[it++] = prime;

        // Stepping the index by `prime` steps the number by 2 * prime, i.e. visits the odd multiples
        if (i < sieve_stop) {
            for (j = i + prime; j < sieve_size; j += prime)
                flags[j] = 0;
        }
    }

    free(flags);
}

// GPU sieve initialization that only needs to be done one time.
//
// Overview of what this routine does, in order:
//   1. Requests the shared-memory-preferring cache configuration and allocates the device bit
//      array (mystuff->gpu_sieve_size / 8 bytes).
//   2. Adjusts mystuff->gpu_sieve_primes so that the primes beyond the unsieved / specially
//      handled ones form whole rows of threadsPerBlock primes, and so that the row count of
//      each section satisfies the mod 3 / mod 4 constraints tested in the search loop.
//   3. Generates the primes with tiny_soe() and records mystuff->gpu_sieve_min_exp.
//   4. Builds two host tables: "pinfo" (per-prime sieving data, partly stored as compressed
//      differences) and "rowinfo" (one descriptor per row of primes plus a copy of the primes).
//   5. Copies both tables into newly allocated device memory (mystuff->d_sieve_info and
//      mystuff->d_calc_bit_to_clear_info) and frees the host buffers.
//
// Besides the mystuff fields named above, this assigns primes_per_thread, which is declared
// outside this function.  Host allocation failures print a message and call exit(1).  Any call
// after the first returns immediately because of the static gpusieve_initialized flag.

extern "C" __host__ void gpusieve_init(mystuff_t *mystuff)
{
    uint32 *primes;
    uint8 *pinfo, *saveptr;
    uint32 *rowinfo, *row;
    uint32 i, j, pinfo_size, rowinfo_size;
    uint32 k, loop_count, loop_end;
    static int gpusieve_initialized = 0;

    // If we've already allocated GPU memory, return
    if (gpusieve_initialized) return;
    gpusieve_initialized = 1;

    // Prefer shared memory over L1 cache
    // (a failure here is only reported as a warning; initialization continues)
    if (cudaDeviceSetCacheConfig(cudaFuncCachePreferShared) != cudaSuccess) {
        printf("WARNING: cudaDeviceSetCacheConfig(cudaFuncCachePreferShared); failed!\n");
    }

    // Allocate the big sieve array (default is 128M bits)
    // gpu_sieve_size is a bit count, hence the division by 8 to get bytes.
    checkCudaErrors(cudaMalloc((void **)&mystuff->d_bitarray, mystuff->gpu_sieve_size / 8));

#ifdef RAW_GPU_BENCH
    // Quick hack to eliminate sieve time from GPU-code benchmarks.  Can also be used
    // to isolate a bug by eliminating the GPU sieving code as a possible cause.
    checkCudaErrors(cudaMemset(mystuff->d_bitarray, 0xFF, mystuff->gpu_sieve_size / 8));
#endif

    // pinfo32 views the bytes at the *current* pinfo cursor as an array of 32-bit words, so
    // every pinfo32[n] below is relative to wherever pinfo points at that moment.
#undef pinfo32
#define pinfo32 ((uint32 *)pinfo)

    // Round up SIEVE_PRIMES so that all threads stay busy in the last sieving loop
    // The first several primes are handled with special code.  After that, they
    // are processed in chunks of threadsPerBlock (256).
    //
    // Note: the expression below truncates the row-managed part of the count to a multiple of
    // threadsPerBlock (integer division); the search loop that follows only ever increases it.

    mystuff->gpu_sieve_primes =
        ((mystuff->gpu_sieve_primes - primesNotSieved - primesHandledWithSpecialCode) / threadsPerBlock) * threadsPerBlock +
        primesNotSieved + primesHandledWithSpecialCode;

    // Loop finding a suitable SIEVE_PRIMES value.  Initial value sieves primes below around 1.05M.
    // Each `continue` runs the for-increment, i.e. retries with one more row of threadsPerBlock primes.

    for (;; mystuff->gpu_sieve_primes += threadsPerBlock) {
        // compute how many "rows" of the primes info array each thread will be responsible for
        primes_per_thread = (mystuff->gpu_sieve_primes - primesNotSieved - primesHandledWithSpecialCode) / threadsPerBlock;

        // Make sure there are 0 mod 3 rows in the under 64K section!
        if (primes_per_thread > 1) {
            loop_count = min(primes_per_thread, sieving64KCrossover) - 1;
            if ((loop_count % 3) != 0) continue;
        }

        // Make sure we don't try the 64K crossover row
        if (primes_per_thread == sieving64KCrossover + 1) continue;

        // Make sure there are 1 mod 3 rows in 64K to 128K section!
        // (the table builder below packs the rows after the first in groups of three)
        if (primes_per_thread > sieving64KCrossover + 1) {
            loop_count = min(primes_per_thread, sieving128KCrossover + 1) - (sieving64KCrossover + 1);
            if ((loop_count % 3) != 1) continue;
        }

        // Make sure there are 1 mod 4 rows in 128K to 1M section!
        // (the table builder below packs the rows after the first in groups of four)
        if (primes_per_thread > sieving128KCrossover + 1) {
            loop_count = min(primes_per_thread, sieving1MCrossover) - (sieving128KCrossover + 1);
            if ((loop_count % 4) != 1) continue;
        }

        // Make sure there are 1 mod 4 rows in 1M to 16M section!
        // NOTE(review): the assignment on the next line appears redundant -- it is repeated inside
        // the `if`, and loop_count is reassigned before every later read in this function.
        loop_count = primes_per_thread - sieving1MCrossover;
        if (primes_per_thread > sieving1MCrossover) {
            loop_count = primes_per_thread - sieving1MCrossover;
            if ((loop_count % 4) != 1) continue;
        }

        // We've found the SIEVE_PRIMES value to use
        break;
    }

    // find seed primes
    primes = (uint32 *)malloc(mystuff->gpu_sieve_primes * sizeof(uint32));
    if (primes == NULL) {
        printf("error in malloc primes\n");
        exit(1);
    }
    tiny_soe(mystuff->gpu_sieve_primes, primes);
    // The minimum exponent is set to one more than the largest prime that will be sieved.
    mystuff->gpu_sieve_min_exp = primes[mystuff->gpu_sieve_primes - 1] + 1;
    if (mystuff->verbosity >= 1) {
        printf("  GPUSievePrimes (adjusted) %d\n", mystuff->gpu_sieve_primes);
        printf("  GPUsieve minimum exponent %u\n", mystuff->gpu_sieve_min_exp);
    }

    // allocate memory for compressed prime info -- assumes prime data can be stored in 12 bytes
    // NOTE(review): the buffer is not zero-initialized, and the first PINFO_PAD1 bytes skipped
    // below are never written in this function before being copied to the device.
    pinfo = (uint8 *)malloc(mystuff->gpu_sieve_primes * 12);
    if (pinfo == NULL) {
        printf("error in malloc pinfo\n");
        exit(1);
    }

    // allocate memory for info that describes each row of 256 primes AND has the primes and modular inverses
    // Layout: four consecutive arrays of MAX_PRIMES_PER_THREAD uint32 values, indexed by row
    // (byte offset into pinfo, index of the row's first prime, prime stride, mask of bits to
    // preserve), followed by 8 bytes per prime.
    // NOTE(review): one descriptor is written per row and nothing here checks that
    // primes_per_thread <= MAX_PRIMES_PER_THREAD; presumably gpu_sieve_primes is range-limited
    // by the caller -- confirm.
    rowinfo_size = MAX_PRIMES_PER_THREAD * 4 * sizeof(uint32) + mystuff->gpu_sieve_primes * 8;
    rowinfo      = (uint32 *)malloc(rowinfo_size);
    if (rowinfo == NULL) {
        printf("error in malloc rowinfo\n");
        exit(1);
    }

    // In first section (very small primes) we only store a 16-bit value of the bit to clear which is computed later
    // saveptr remembers the start of the buffer; pinfo is used as a moving write cursor from here on.
    // i is the index (into primes[]) of the first prime that is managed through rows.
    saveptr = pinfo;
    i       = primesNotSieved + primesHandledWithSpecialCode;
    pinfo += PINFO_PAD1;

    // In this section (primes below 64K) we store p in 16 bits, bit-to-clr in 16 bits, and pinv in 32 bits.
    // Each row occupies threadsPerBlock * 8 bytes: threadsPerBlock words of (p << 16) followed by
    // threadsPerBlock words of gen_pinv(p).
    row      = rowinfo;
    loop_end = min(primes_per_thread, sieving64KCrossover);
    for (; i < primesNotSieved + primesHandledWithSpecialCode + loop_end * threadsPerBlock;
         i += threadsPerBlock, pinfo += threadsPerBlock * 8) {
        row[0] = (pinfo - saveptr);                  // Offset to first pinfo byte in the row
        row[MAX_PRIMES_PER_THREAD]     = i;          // First pinfo entry is for the i-th prime number
        row[MAX_PRIMES_PER_THREAD * 2] = 1;          // Pinfo entries represent successive prime numbers
        row[MAX_PRIMES_PER_THREAD * 3] = 0xFFFF0000; // Mask of bits to preserve when setting bit-to-clear
        row++;
        for (j = 0; j < threadsPerBlock; j++) {
            pinfo32[j]                   = (primes[i + j] << 16) + 0;
            pinfo32[j + threadsPerBlock] = gen_pinv(primes[i + j]);
        }
    }

    // In this section (primes both below and above 64K) we store bit-to-clr in 32 bits, pinv in 32 bits, and p in 32 bits.
    // This loop runs at most once (loop_end grows by at most one row compared with the loop above).
    // The row occupies threadsPerBlock * 12 bytes: three arrays of threadsPerBlock words.
    loop_end = min(primes_per_thread, sieving64KCrossover + 1);
    for (; i < primesNotSieved + primesHandledWithSpecialCode + loop_end * threadsPerBlock;
         i += threadsPerBlock, pinfo += threadsPerBlock * 12) {
        row[0] = (pinfo - saveptr);         // Offset to first pinfo byte in the row
        row[MAX_PRIMES_PER_THREAD]     = i; // First pinfo entry is for the i-th prime number
        row[MAX_PRIMES_PER_THREAD * 2] = 1; // Pinfo entries represent successive prime numbers
        row[MAX_PRIMES_PER_THREAD * 3] = 0; // Mask of bits to preserve when setting bit-to-clear
        row++;
        for (j = 0; j < threadsPerBlock; j++) {
            pinfo32[j]                       = 0;
            pinfo32[j + threadsPerBlock]     = gen_pinv(primes[i + j]);
            pinfo32[j + threadsPerBlock * 2] = primes[i + j];
        }
    }

    // In this section (transitioning to dense primes storage) we store bit-to-clr 32 bits, pinv in 32 bits, and p in 32 bits.
    // From here on entry j of a row refers to prime i + j * loop_count (+ k), i.e. each j owns a
    // run of loop_count consecutive primes.  This row stores the first prime of each run in full;
    // i is not advanced until the compressed rows that follow have been written.
    if (primes_per_thread > sieving64KCrossover + 1) {
        // clang-format off
        loop_count = min(primes_per_thread, sieving128KCrossover + 1) - (sieving64KCrossover + 1);
        row[0] = (pinfo - saveptr);                  // Offset to first pinfo byte in the row
        row[MAX_PRIMES_PER_THREAD]     = i;          // First pinfo entry is for the i-th prime number
        row[MAX_PRIMES_PER_THREAD * 2] = loop_count; // Pinfo entries skip loop_count prime numbers
        row[MAX_PRIMES_PER_THREAD * 3] = 0;          // Mask of bits to preserve when setting bit-to-clear
        row++;
        // clang-format on
        for (j = 0; j < threadsPerBlock; j++) {
            pinfo32[j]                       = 0;
            pinfo32[j + threadsPerBlock]     = gen_pinv(primes[i + j * loop_count]);
            pinfo32[j + threadsPerBlock * 2] = primes[i + j * loop_count];
        }
        pinfo += threadsPerBlock * 12;
    }

    // In this section (primes from 64K through 128K) we store bit-to-clr 18 bits, (p diff) / 2 in 7 bits, and pinv diff in 7-bits.
    // loop_count still holds the value computed in the transition block above (this condition
    // implies that block ran).  Each compressed row is threadsPerBlock * 4 bytes.
    if (primes_per_thread > sieving64KCrossover + 2) {
        // First pass: write the descriptors for rows 1 .. loop_count-1 of this section.
        for (k = 1; k < loop_count; k++) {
            // clang-format off
            row[0]  = (pinfo - saveptr) + (k - 1) * threadsPerBlock * 4; // Offset to first pinfo byte in the row
            row[MAX_PRIMES_PER_THREAD]     = i + k;                      // First pinfo entry is for the i+k-th prime number
            row[MAX_PRIMES_PER_THREAD * 2] = loop_count;                 // Pinfo entries skip loop_count prime numbers
            row[MAX_PRIMES_PER_THREAD * 3] = 0xFFFC0000;                 // Mask of bits to preserve when setting bit-to-clear
            row++;
            // clang-format on
        }
        // Second pass: fill the rows three at a time.  All three entries of a group are stored as
        // differences from the prime immediately preceding the group (index - 1, - 2, - 3).
        // Word layout: pinvdiff in bits 25..31, pdiff in bits 18..24, low 18 bits left as zero.
        for (k = 1; k < loop_count; k += 3) {
            for (j = 0; j < threadsPerBlock; j++) {
                int index       = i + j * loop_count + k;
                uint32 pdiff    = (primes[index] - primes[index - 1]) / 2;
                uint32 pinvdiff = gen_pinv(primes[index - 1]) - gen_pinv(primes[index]);
                // A value that does not fit in 7 bits is only reported; the entry is still written.
                if (pdiff > 127 || pinvdiff > 127) printf("Bad compress: %d, %d, %d\n", primes[index], pdiff, pinvdiff);
                pinfo32[(k - 1) * threadsPerBlock + j] = (pinvdiff << 25) + (pdiff << 18) + 0;

                index++;
                pdiff    = (primes[index] - primes[index - 2]) / 2;
                pinvdiff = gen_pinv(primes[index - 2]) - gen_pinv(primes[index]);
                if (pdiff > 127 || pinvdiff > 127) printf("Bad compress: %d, %d, %d\n", primes[index], pdiff, pinvdiff);
                pinfo32[k * threadsPerBlock + j] = (pinvdiff << 25) + (pdiff << 18) + 0;

                index++;
                pdiff    = (primes[index] - primes[index - 3]) / 2;
                pinvdiff = gen_pinv(primes[index - 3]) - gen_pinv(primes[index]);
                if (pdiff > 127 || pinvdiff > 127) printf("Bad compress: %d, %d, %d\n", primes[index], pdiff, pinvdiff);
                pinfo32[(k + 1) * threadsPerBlock + j] = (pinvdiff << 25) + (pdiff << 18) + 0;
            }
        }
        // Skip past the compressed rows and past all primes of this section (full row included).
        pinfo += (loop_count - 1) * threadsPerBlock * 4;
        i += loop_count * threadsPerBlock;
    }

    // In this section (first complete row of primes above 128K) we store bit-to-clr 32 bits, pinv in 32 bits, and p in 32-bits.
    // Same full-entry layout as the earlier transition row, but from here on the inverse comes
    // from gen_sloppy_pinv() instead of gen_pinv().
    if (primes_per_thread > sieving128KCrossover + 1) {
        // clang-format off
        loop_count = min(primes_per_thread, sieving1MCrossover) - (sieving128KCrossover + 1);
        row[0] = (pinfo - saveptr);                  // Offset to first pinfo byte in the row
        row[MAX_PRIMES_PER_THREAD]     = i;          // First pinfo entry is for the i-th prime number
        row[MAX_PRIMES_PER_THREAD * 2] = loop_count; // Pinfo entries skip loop_count prime numbers
        row[MAX_PRIMES_PER_THREAD * 3] = 0;          // Mask of bits to preserve when setting bit-to-clear
        row++;
        // clang-format on
        for (j = 0; j < threadsPerBlock; j++) {
            pinfo32[j] = 0;
            pinfo32[j + threadsPerBlock]     = gen_sloppy_pinv(primes[i + j * loop_count]);
            pinfo32[j + threadsPerBlock * 2] = primes[i + j * loop_count];
        }
        pinfo += threadsPerBlock * 12;
    }

    // In this section (primes from 128K to 1M) we store bit-to-clr 20 bits, (p diff) / 2 in 7 bits, and pinv diff in 5 bits.
    // loop_count is the value computed in the block just above (this condition implies it ran).
    if (primes_per_thread > sieving128KCrossover + 2) {
        // First pass: descriptors for rows 1 .. loop_count-1 of this section.
        for (k = 1; k < loop_count; k++) {
            // clang-format off
            row[0] = (pinfo - saveptr) + (k - 1) * threadsPerBlock * 4; // Offset to first pinfo byte in the row
            row[MAX_PRIMES_PER_THREAD]     = i + k;                     // First pinfo entry is for the i+k-th prime number
            row[MAX_PRIMES_PER_THREAD * 2] = loop_count;                // Pinfo entries skip loop_count prime numbers
            row[MAX_PRIMES_PER_THREAD * 3] = 0xFFF00000;                // Mask of bits to preserve when setting bit-to-clear
            row++;
            // clang-format on
        }
        // Second pass: fill the rows four at a time; each group is stored as differences from the
        // prime immediately preceding the group (index - 1 .. index - 4).
        // Word layout: pinvdiff in bits 27..31, pdiff in bits 20..26, low 20 bits left as zero.
        for (k = 1; k < loop_count; k += 4) {
            for (j = 0; j < threadsPerBlock; j++) {
                int index       = i + j * loop_count + k;
                uint32 pdiff    = (primes[index] - primes[index - 1]) / 2;
                uint32 pinvdiff = gen_sloppy_pinv(primes[index - 1]) - gen_sloppy_pinv(primes[index]);
                // Out-of-range differences are only reported; the entry is still written.
                if (pdiff > 127 || pinvdiff > 31) printf("Bad compress: %d, %d, %d\n", primes[index], pdiff, pinvdiff);
                pinfo32[(k - 1) * threadsPerBlock + j] = (pinvdiff << 27) + (pdiff << 20) + 0;

                index++;
                pdiff    = (primes[index] - primes[index - 2]) / 2;
                pinvdiff = gen_sloppy_pinv(primes[index - 2]) - gen_sloppy_pinv(primes[index]);
                if (pdiff > 127 || pinvdiff > 31) printf("Bad compress: %d, %d, %d\n", primes[index], pdiff, pinvdiff);
                pinfo32[k * threadsPerBlock + j] = (pinvdiff << 27) + (pdiff << 20) + 0;

                index++;
                pdiff    = (primes[index] - primes[index - 3]) / 2;
                pinvdiff = gen_sloppy_pinv(primes[index - 3]) - gen_sloppy_pinv(primes[index]);
                if (pdiff > 127 || pinvdiff > 31) printf("Bad compress: %d, %d, %d\n", primes[index], pdiff, pinvdiff);
                pinfo32[(k + 1) * threadsPerBlock + j] = (pinvdiff << 27) + (pdiff << 20) + 0;

                index++;
                pdiff    = (primes[index] - primes[index - 4]) / 2;
                pinvdiff = gen_sloppy_pinv(primes[index - 4]) - gen_sloppy_pinv(primes[index]);
                if (pdiff > 127 || pinvdiff > 31) printf("Bad compress: %d, %d, %d\n", primes[index], pdiff, pinvdiff);
                pinfo32[(k + 2) * threadsPerBlock + j] = (pinvdiff << 27) + (pdiff << 20) + 0;
            }
        }
        // Skip past the compressed rows and past all primes of this section (full row included).
        pinfo += (loop_count - 1) * threadsPerBlock * 4;
        i += loop_count * threadsPerBlock;
    }

    // In this section (primes both below and above 1M) we store bit-to-clr 32 bits, pinv in 32 bits, and p in 32-bits.
    // Full-entry row that starts the last section; loop_count becomes the number of rows past
    // sieving1MCrossover.
    if (primes_per_thread > sieving1MCrossover) {
        // clang-format off
        loop_count = primes_per_thread - sieving1MCrossover;
        row[0] = (pinfo - saveptr);                  // Offset to first pinfo byte in the row
        row[MAX_PRIMES_PER_THREAD]     = i;          // First pinfo entry is for the i-th prime number
        row[MAX_PRIMES_PER_THREAD * 2] = loop_count; // Pinfo entries skip loop_count prime numbers
        row[MAX_PRIMES_PER_THREAD * 3] = 0;          // Mask of bits to preserve when setting bit-to-clear
        row++;
        // clang-format on
        for (j = 0; j < threadsPerBlock; j++) {
            pinfo32[j]                       = 0;
            pinfo32[j + threadsPerBlock]     = gen_sloppy_pinv(primes[i + j * loop_count]);
            pinfo32[j + threadsPerBlock * 2] = primes[i + j * loop_count];
        }
        pinfo += threadsPerBlock * 12;
    }

    // In this section (primes above 1M to 16M) we store bit-to-clr 24 bits, (p diff) / 2 in 7 bits, and pinv diff in 1 bit.
    // loop_count is the value computed in the block just above (this condition implies it ran).
    if (primes_per_thread > sieving1MCrossover + 1) {
        // First pass: descriptors for rows 1 .. loop_count-1 of this section.
        for (k = 1; k < loop_count; k++) {
            // clang-format off
            row[0] = (pinfo - saveptr) + (k - 1) * threadsPerBlock * 4; // Offset to first pinfo byte in the row
            row[MAX_PRIMES_PER_THREAD]     = i + k;                     // First pinfo entry is for the i+k-th prime number
            row[MAX_PRIMES_PER_THREAD * 2] = loop_count;                // Pinfo entries skip loop_count prime numbers
            row[MAX_PRIMES_PER_THREAD * 3] = 0xFF000000;                // Mask of bits to preserve when setting bit-to-clear
            row++;
            // clang-format on
        }
        // Second pass: fill the rows four at a time; each group is stored as differences from the
        // prime immediately preceding the group (index - 1 .. index - 4).
        // Word layout: pinvdiff in bit 31, pdiff in bits 24..30, low 24 bits left as zero.
        for (k = 1; k < loop_count; k += 4) {
            for (j = 0; j < threadsPerBlock; j++) {
                int index       = i + j * loop_count + k;
                uint32 pdiff    = (primes[index] - primes[index - 1]) / 2;
                uint32 pinvdiff = gen_sloppy_pinv(primes[index - 1]) - gen_sloppy_pinv(primes[index]);
                // Out-of-range differences are only reported; the entry is still written.
                if (pdiff > 127 || pinvdiff > 1) printf("Bad compress: %d, %d, %d\n", primes[index], pdiff, pinvdiff);
                pinfo32[(k - 1) * threadsPerBlock + j] = (pinvdiff << 31) + (pdiff << 24) + 0;

                index++;
                pdiff    = (primes[index] - primes[index - 2]) / 2;
                pinvdiff = gen_sloppy_pinv(primes[index - 2]) - gen_sloppy_pinv(primes[index]);
                if (pdiff > 127 || pinvdiff > 1) printf("Bad compress: %d, %d, %d\n", primes[index], pdiff, pinvdiff);
                pinfo32[k * threadsPerBlock + j] = (pinvdiff << 31) + (pdiff << 24) + 0;

                index++;
                pdiff    = (primes[index] - primes[index - 3]) / 2;
                pinvdiff = gen_sloppy_pinv(primes[index - 3]) - gen_sloppy_pinv(primes[index]);
                if (pdiff > 127 || pinvdiff > 1) printf("Bad compress: %d, %d, %d\n", primes[index], pdiff, pinvdiff);
                pinfo32[(k + 1) * threadsPerBlock + j] = (pinvdiff << 31) + (pdiff << 24) + 0;

                index++;
                pdiff    = (primes[index] - primes[index - 4]) / 2;
                pinvdiff = gen_sloppy_pinv(primes[index - 4]) - gen_sloppy_pinv(primes[index]);
                if (pdiff > 127 || pinvdiff > 1) printf("Bad compress: %d, %d, %d\n", primes[index], pdiff, pinvdiff);
                pinfo32[(k + 2) * threadsPerBlock + j] = (pinvdiff << 31) + (pdiff << 24) + 0;
            }
        }
        // Skip past the compressed rows and past all primes of this section (full row included).
        pinfo += (loop_count - 1) * threadsPerBlock * 4;
        i += loop_count * threadsPerBlock;
    }
    // Number of pinfo bytes actually used; then restore pinfo to the start of the allocation so
    // the copy below and the final free() operate on the original pointer.
    pinfo_size = pinfo - saveptr;
    pinfo      = saveptr;

    // Finally, also copy the primes to rowinfo to be used in later calculating bit-to-clear values
    // Primes go into every other 32-bit word after the four row arrays; the words in between are
    // left unwritten here (presumably filled on the device with modular inverses -- confirm).
    for (i = primesNotSieved; i < (uint32)mystuff->gpu_sieve_primes; i++) {
        rowinfo[MAX_PRIMES_PER_THREAD * 4 + 2 * i] = primes[i];
    }

    // Allocate and copy the device compressed prime sieving info
    checkCudaErrors(cudaMalloc((void **)&mystuff->d_sieve_info, pinfo_size));
    checkCudaErrors(cudaMemcpy(mystuff->d_sieve_info, pinfo, pinfo_size, cudaMemcpyHostToDevice));

    // Allocate and copy the device row info, primes and modular inverses info used to calculate bit-to-clear
    checkCudaErrors(cudaMalloc((void **)&mystuff->d_calc_bit_to_clear_info, rowinfo_size));
    checkCudaErrors(cudaMemcpy(mystuff->d_calc_bit_to_clear_info, rowinfo, rowinfo_size, cudaMemcpyHostToDevice));

    // Free allocated memory
    free(primes);
    free(pinfo);
    free(rowinfo);
}

// GPU sieve initialization that needs to be done once for each Mersenne exponent to be factored.
//
// Launches CalcModularInverses with primes_per_thread + 1 blocks of threadsPerBlock threads on
// mystuff->d_calc_bit_to_clear_info and waits for it to finish.  The call does nothing when
// mystuff->exponent equals the exponent handled by the previous call (tracked in a function-local
// static), or when RAW_GPU_BENCH is defined.
//
// Both the launch status and the result of the synchronization are passed to checkCudaErrors,
// the same wrapper this file uses for its other CUDA runtime calls, so a failed kernel is no
// longer silently ignored.

void gpusieve_init_exponent(mystuff_t *mystuff)
{
    static uint32 last_exponent_initialized = 0;

#ifdef RAW_GPU_BENCH
    // Quick hack (leave bit array set to all ones) to eliminate sieve time from GPU-code benchmarks.
    // Can also be used to isolate a bug by eliminating the GPU sieving code as a possible cause.
    return;
#endif

    // If we've already initialized this exponent, return
    if (mystuff->exponent == last_exponent_initialized) return;
    last_exponent_initialized = mystuff->exponent;

    // Calculate the modular inverses that will be used by each class to calculate initial bit-to-clear for each prime
    CalcModularInverses<<<primes_per_thread + 1, threadsPerBlock>>>(mystuff->exponent, (int *)mystuff->d_calc_bit_to_clear_info);

    // A kernel launch returns no status itself: query the launch error explicitly, then check
    // the synchronization result to catch failures that occur while the kernel executes.
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());
}

// GPU sieve initialization that needs to be done once for each class to be factored.
//
// Splits k_min into the two low 32-bit words of an int96 (the high word is set to zero) and
// launches CalcBitToClear with primes_per_thread + 1 blocks of threadsPerBlock threads, passing
// the exponent, that base k, mystuff->d_calc_bit_to_clear_info and mystuff->d_sieve_info.
// Blocks until the kernel has finished.  Does nothing when RAW_GPU_BENCH is defined.
//
// Both the launch status and the result of the synchronization are passed to checkCudaErrors,
// the same wrapper this file uses for its other CUDA runtime calls, so a failed kernel is no
// longer silently ignored.

void gpusieve_init_class(mystuff_t *mystuff, unsigned long long k_min)
{
    int96 k_base;

#ifdef RAW_GPU_BENCH
    // Quick hack (leave bit array set to all ones) to eliminate sieve time from GPU-code benchmarks.
    // Can also be used to isolate a bug by eliminating the GPU sieving code as a possible cause.
    return;
#endif

    // Low and high halves of the 64-bit starting k; the third word stays zero
    k_base.d0 = (int)(k_min & 0xFFFFFFFF);
    k_base.d1 = (int)(k_min >> 32);
    k_base.d2 = 0;

    // Calculate the initial bit-to-clear for each prime
    CalcBitToClear<<<primes_per_thread + 1, threadsPerBlock>>>(mystuff->exponent, k_base, (int *)mystuff->d_calc_bit_to_clear_info,
                                                                 (uint8 *)mystuff->d_sieve_info);

    // A kernel launch returns no status itself: query the launch error explicitly, then check
    // the synchronization result to catch failures that occur while the kernel executes.
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());
}

// GPU sieve the next chunk
//
// Sieves min(mystuff->gpu_sieve_size, num_k_remaining) k values into mystuff->d_bitarray by
// launching SegSieve with one block of threadsPerBlock threads per block_size bits (rounded up),
// then waits for the kernel to finish.  Does nothing when RAW_GPU_BENCH is defined or when there
// is nothing left to sieve.
//
// Both the launch status and the result of the synchronization are passed to checkCudaErrors,
// the same wrapper this file uses for its other CUDA runtime calls, so a failed kernel is no
// longer silently ignored.

void gpusieve(mystuff_t *mystuff, unsigned long long num_k_remaining)
{
    int sieve_size;

#ifdef RAW_GPU_BENCH
    // Quick hack (leave bit array set to all ones) to eliminate sieve time from GPU-code benchmarks.
    // Can also be used to isolate a bug by eliminating the GPU sieving code as a possible cause.
    return;
#endif

    // Sieve at most gpu_sieve_size k values (the size of the device bit array) per call.
    if ((unsigned long long)mystuff->gpu_sieve_size < num_k_remaining)
        sieve_size = mystuff->gpu_sieve_size;
    else
        sieve_size = (int)num_k_remaining;

    // Nothing to sieve: a grid with zero blocks is an invalid launch configuration, so skip the launch.
    if (sieve_size <= 0) return;

    // Do some sieving on the GPU!
    SegSieve<<<(sieve_size + block_size - 1) / block_size, threadsPerBlock>>>((uint8 *)mystuff->d_bitarray,
                                                                              (uint8 *)mystuff->d_sieve_info, primes_per_thread);

    // A kernel launch returns no status itself: query the launch error explicitly, then check
    // the synchronization result to catch failures that occur while the kernel executes.
    checkCudaErrors(cudaGetLastError());
    checkCudaErrors(cudaDeviceSynchronize());
}
