# import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # note: os.environ values must be strings
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np
from sympy import prime, primerange, isprime
import datetime
import platform
import json
import time

# Paste your assignments below
Assignments = """
Factor=4651D83B08CCD25B0A63160B4D3BA595,143850793,77,78
Factor=9998552797,1,78
"""

# The meanings and minimum / maximum values of the following variables are consistent with those explained in mfaktc.ini
V5UserID = ""
ComputerID = ""
GPUSievePrimes = 82486
GPUSieveSize = 64
GPUSieveProcessSize = 32
# End of variables in mfaktc.ini

NUM_CLASSES = 4620
MAX_PRIMES_PER_THREAD = 4224
THREADS_PER_BLOCK = 256
block_size_in_bytes = 8192
block_size = block_size_in_bytes * 8
threadsPerBlock = 256
primesNotSieved = 5
primesHandledWithSpecialCode = 49
primesBelow64K = 6542
primesBelow128K = 12251
primesBelow1M = 82025
sieving64KCrossover = (primesBelow64K - primesNotSieved - primesHandledWithSpecialCode) // threadsPerBlock
sieving128KCrossover = (primesBelow128K - primesNotSieved - primesHandledWithSpecialCode) // threadsPerBlock
sieving1MCrossover = (primesBelow1M - primesNotSieved - primesHandledWithSpecialCode) // threadsPerBlock - 3

kernel_code_gpusieve = """
typedef unsigned char uint8;
typedef unsigned short uint16;
typedef unsigned int uint32;
typedef long long unsigned int uint64;
typedef struct
{
  unsigned int d0,d1,d2;
}int96;

#define NUM_CLASSES 4620
#define MAX_PRIMES_PER_THREAD 4224
const uint32 block_size_in_bytes = 8192;
const uint32 block_size = block_size_in_bytes * 8;
const uint32 threadsPerBlock = 256;
const uint32 primesNotSieved = 5;
const uint32 primesHandledWithSpecialCode = 49;
const uint32 primesBelow64K = 6542;
const uint32 primesBelow128K = 12251;
const uint32 primesBelow1M = 82025;
const uint32 sieving64KCrossover = (primesBelow64K - primesNotSieved - primesHandledWithSpecialCode) / threadsPerBlock;
const uint32 sieving128KCrossover = (primesBelow128K - primesNotSieved - primesHandledWithSpecialCode) / threadsPerBlock;
const uint32 sieving1MCrossover = (primesBelow1M - primesNotSieved - primesHandledWithSpecialCode) / threadsPerBlock - 3;
#define BITSLL11 (1 | (1<<11) | (1<<22))
#define BITSLL13 (1 | (1<<13) | (1<<26))
#define BITSLL17 (1 | (1<<17))
#define BITSLL19 (1 | (1<<19))
#define BITSLL23 (1 | (1<<23))
#define BITSLL29 (1 | (1<<29))
#define BITSLL31 (1 | (1<<31))
#define PINFO_PAD1 1024
#define gen_pinv(p) (0xFFFFFFFF / (p) + 1)

__device__ int mod_p(int x, int p, int pinv) {
    int r;
    asm (
        "mul.hi.s32 %0, %1, %2;\\n\\t"
        "mul.lo.s32 %0, %0, %3;\\n\\t"
        "sub.s32   %1, %1, %0;\\n\\t"
        "sub.s32   %0, %1, %3;\\n\\t"
        "slct.s32.s32 %0, %0, %1, %0;"
        : "=r" (r), "+r" (x) : "r" (pinv), "r" (p)
    );
    return r;
}

__device__ __inline static int mod_const_p (int x, int p)
{
    return mod_p (x, p, gen_pinv (p));
}

#define gen_sloppy_pinv(p) ((uint32) floor (4294967296.0 / (p) - 0.5))

__device__ __inline static int sloppy_mod_p (int x, int p, int pinv)
{
    int q, r;
    q = __mulhi (x, pinv);
    r = x - q * p;
    return r;
}

__device__ __inline static int bump_mod_p (int i, int inc, int p)
{
    int x, j;
    i = i + inc % p; j = i + p;
    asm("slct.s32.s32 %0, %1, %2, %1;" : "=r" (x) : "r" (i), "r" (j));
    return x;
}

__device__ __inline static void bitOr (uint8 *locsieve, uint32 bclr)
{
#define locsieve8 ((uint8 *) locsieve)
#define locsieve8v ((volatile uint8 *) locsieve)
#define locsieve32 ((uint32 *) locsieve)
#define locsieve32v ((volatile uint32 *) locsieve)
    locsieve8[bclr >> 3] |= 1 << (bclr & 7);
}

__device__ __inline static void bitOrSometimesIffy (uint8 *locsieve, uint32 bclr)
{
    uint32 bytenum = bclr >> 3;
    uint8 mask = 1 << (bclr & 7);
    uint32 val = locsieve8[bytenum];
    if (! (val & mask)) locsieve8[bytenum] = val | mask;
}

__global__ void __launch_bounds__(256,6) SegSieve (uint8 *big_bit_array_dev, uint8 *pinfo_dev, uint32 maxp)
{
    __shared__ uint8 locsieve[block_size_in_bytes];
    uint32 block_start = blockIdx.x * block_size;
    uint32 i, j, p, pinv, bclr;
#define big_bit_array32 ((uint32 *) big_bit_array_dev)
#define locsieve32 ((uint32 *) locsieve)
#define locsieve64 ((uint64 *) locsieve)
#define pinfo16 ((uint16 *) pinfo_dev)
#define pinfo32 ((uint32 *) pinfo_dev)
#define bit_to_clr pinfo16
    uint32 thread_start = block_start + threadIdx.x * block_size / threadsPerBlock;
    uint32 mask, mask2, mask3, mask4, i11, i13, i17, i19, i23, i29, i31, i37, i41, i43, i47, i53, i59, i61;
    if (primesNotSieved == 4) {
        i11 = mod_const_p (bit_to_clr[4] - thread_start, 11);
        i13 = mod_const_p (bit_to_clr[5] - thread_start, 13);
        i17 = mod_const_p (bit_to_clr[6] - thread_start, 17);
    }
    if (primesNotSieved == 5) {
        i13 = mod_const_p (bit_to_clr[5] - thread_start, 13);
        i17 = mod_const_p (bit_to_clr[6] - thread_start, 17);
    }
    if (primesNotSieved == 6) i17 = mod_const_p (bit_to_clr[6] - thread_start, 17);
    i19 = mod_const_p (bit_to_clr[7] - thread_start, 19);
    i23 = mod_const_p (bit_to_clr[8] - thread_start, 23);
    i29 = mod_const_p (bit_to_clr[9] - thread_start, 29);
    i31 = mod_const_p (bit_to_clr[10] - thread_start, 31);
    if (primesNotSieved == 4) mask = (BITSLL11 << i11) | (BITSLL13 << i13) | (BITSLL17 << i17);
    if (primesNotSieved == 5) mask = (BITSLL13 << i13) | (BITSLL17 << i17);
    if (primesNotSieved == 6) mask = BITSLL17 << i17;
    mask |= (BITSLL19 << i19) | (BITSLL23 << i23);
    mask |= (BITSLL29 << i29) | (BITSLL31 << i31);
    if (primesNotSieved == 4) {
        i11 = bump_mod_p (i11, -32, 11);
        i13 = bump_mod_p (i13, -32, 13);
        i17 = bump_mod_p (i17, -32, 17);
    }
    if (primesNotSieved == 5) {
        i13 = bump_mod_p (i13, -32, 13);
        i17 = bump_mod_p (i17, -32, 17);
    }
    if (primesNotSieved == 6) {
        i17 = bump_mod_p (i17, -32, 17);
    }
    i19 = bump_mod_p (i19, -32, 19);
    i23 = bump_mod_p (i23, -32, 23);
    i29 = bump_mod_p (i29, -32, 29);
    i31 = bump_mod_p (i31, -32, 31);
    if (primesNotSieved == 4) mask2 = (BITSLL11 << i11) | (BITSLL13 << i13) | (BITSLL17 << i17);
    if (primesNotSieved == 5) mask2 = (BITSLL13 << i13) | (BITSLL17 << i17);
    if (primesNotSieved == 6) mask2 = BITSLL17 << i17;
    mask2 |= (BITSLL19 << i19) | (BITSLL23 << i23);
    mask2 |= (BITSLL29 << i29) | (BITSLL31 << i31);
    if (primesNotSieved == 4) {
        i11 = bump_mod_p (i11, -32, 11);
        i13 = bump_mod_p (i13, -32, 13);
        i17 = bump_mod_p (i17, -32, 17);
    }
    if (primesNotSieved == 5) {
        i13 = bump_mod_p (i13, -32, 13);
        i17 = bump_mod_p (i17, -32, 17);
    }
    if (primesNotSieved == 6) i17 = bump_mod_p (i17, -32, 17);
    i19 = bump_mod_p (i19, -32, 19);
    i23 = bump_mod_p (i23, -32, 23);
    i29 = bump_mod_p (i29, -32, 29);
    i31 = bump_mod_p (i31, -32, 31);
    if (primesNotSieved == 4) mask3 = (BITSLL11 << i11) | (BITSLL13 << i13) | (BITSLL17 << i17);
    if (primesNotSieved == 5) mask3 = (BITSLL13 << i13) | (BITSLL17 << i17);
    if (primesNotSieved == 6) mask3 = BITSLL17 << i17;
    mask3 |= (BITSLL19 << i19) | (BITSLL23 << i23);
    mask3 |= (BITSLL29 << i29) | (BITSLL31 << i31);
    if (primesNotSieved == 4) {
        i11 = bump_mod_p (i11, -32, 11);
        i13 = bump_mod_p (i13, -32, 13);
        i17 = bump_mod_p (i17, -32, 17);
    }
    if (primesNotSieved == 5) {
        i13 = bump_mod_p (i13, -32, 13);
        i17 = bump_mod_p (i17, -32, 17);
    }
    if (primesNotSieved == 6) i17 = bump_mod_p (i17, -32, 17);
    i19 = bump_mod_p (i19, -32, 19);
    i23 = bump_mod_p (i23, -32, 23);
    i29 = bump_mod_p (i29, -32, 29);
    i31 = bump_mod_p (i31, -32, 31);
    if (primesNotSieved == 4) mask4 = (BITSLL11 << i11) | (BITSLL13 << i13) | (BITSLL17 << i17);
    if (primesNotSieved == 5) mask4 = (BITSLL13 << i13) | (BITSLL17 << i17);
    if (primesNotSieved == 6) mask4 = BITSLL17 << i17;
    mask4 |= (BITSLL19 << i19) | (BITSLL23 << i23);
    mask4 |= (BITSLL29 << i29) | (BITSLL31 << i31);
    if (primesNotSieved == 4) {
        i11 = bump_mod_p (i11, -32, 11);
        i13 = bump_mod_p (i13, -32, 13);
        i17 = bump_mod_p (i17, -32, 17);
    }
    if (primesNotSieved == 5) {
        i13 = bump_mod_p (i13, -32, 13);
        i17 = bump_mod_p (i17, -32, 17);
    }
    if (primesNotSieved == 6) i17 = bump_mod_p (i17, -32, 17);
    i19 = bump_mod_p (i19, -32, 19);
    i23 = bump_mod_p (i23, -32, 23);
    i29 = bump_mod_p (i29, -32, 29);
    i31 = bump_mod_p (i31, -32, 31);
    locsieve32[threadIdx.x * block_size / threadsPerBlock / 32 + 0] = mask;
    locsieve32[threadIdx.x * block_size / threadsPerBlock / 32 + 1] = mask2;
    locsieve32[threadIdx.x * block_size / threadsPerBlock / 32 + 2] = mask3;
    locsieve32[threadIdx.x * block_size / threadsPerBlock / 32 + 3] = mask4;
    if (primesNotSieved == 4) mask = (BITSLL11 << i11) | (BITSLL13 << i13) | (BITSLL17 << i17);
    if (primesNotSieved == 5) mask = (BITSLL13 << i13) | (BITSLL17 << i17);
    if (primesNotSieved == 6) mask = BITSLL17 << i17;
    mask |= (BITSLL19 << i19) | (BITSLL23 << i23);
    mask |= (BITSLL29 << i29) | (BITSLL31 << i31);
    if (primesNotSieved == 4) {
        i11 = bump_mod_p (i11, -32, 11);
        i13 = bump_mod_p (i13, -32, 13);
        i17 = bump_mod_p (i17, -32, 17);
    }
    if (primesNotSieved == 5) {
        i13 = bump_mod_p (i13, -32, 13);
        i17 = bump_mod_p (i17, -32, 17);
    }
    if (primesNotSieved == 6) i17 = bump_mod_p (i17, -32, 17);
    i19 = bump_mod_p (i19, -32, 19);
    i23 = bump_mod_p (i23, -32, 23);
    i29 = bump_mod_p (i29, -32, 29);
    i31 = bump_mod_p (i31, -32, 31);
    if (primesNotSieved == 4) mask2 = (BITSLL11 << i11) | (BITSLL13 << i13) | (BITSLL17 << i17);
    if (primesNotSieved == 5) mask2 = (BITSLL13 << i13) | (BITSLL17 << i17);
    if (primesNotSieved == 6) mask2 = BITSLL17 << i17;
    mask2 |= (BITSLL19 << i19) | (BITSLL23 << i23);
    mask2 |= (BITSLL29 << i29) | (BITSLL31 << i31);
    if (primesNotSieved == 4) {
        i11 = bump_mod_p (i11, -32, 11);
        i13 = bump_mod_p (i13, -32, 13);
        i17 = bump_mod_p (i17, -32, 17);
    }
    if (primesNotSieved == 5) {
        i13 = bump_mod_p (i13, -32, 13);
        i17 = bump_mod_p (i17, -32, 17);
    }
    if (primesNotSieved == 6) i17 = bump_mod_p (i17, -32, 17);
    i19 = bump_mod_p (i19, -32, 19);
    i23 = bump_mod_p (i23, -32, 23);
    i29 = bump_mod_p (i29, -32, 29);
    i31 = bump_mod_p (i31, -32, 31);
    if (primesNotSieved == 4) mask3 = (BITSLL11 << i11) | (BITSLL13 << i13) | (BITSLL17 << i17);
    if (primesNotSieved == 5) mask3 = (BITSLL13 << i13) | (BITSLL17 << i17);
    if (primesNotSieved == 6) mask3 = BITSLL17 << i17;
    mask3 |= (BITSLL19 << i19) | (BITSLL23 << i23);
    mask3 |= (BITSLL29 << i29) | (BITSLL31 << i31);
    if (primesNotSieved == 4) {
        i11 = bump_mod_p (i11, -32, 11);
        i13 = bump_mod_p (i13, -32, 13);
        i17 = bump_mod_p (i17, -32, 17);
    }
    if (primesNotSieved == 5) {
        i13 = bump_mod_p (i13, -32, 13);
        i17 = bump_mod_p (i17, -32, 17);
    }
    if (primesNotSieved == 6) i17 = bump_mod_p (i17, -32, 17);
    i19 = bump_mod_p (i19, -32, 19);
    i23 = bump_mod_p (i23, -32, 23);
    i29 = bump_mod_p (i29, -32, 29);
    i31 = bump_mod_p (i31, -32, 31);
    if (primesNotSieved == 4) mask4 = (BITSLL11 << i11) | (BITSLL13 << i13) | (BITSLL17 << i17);
    if (primesNotSieved == 5) mask4 = (BITSLL13 << i13) | (BITSLL17 << i17);
    if (primesNotSieved == 6) mask4 = BITSLL17 << i17;
    mask4 |= (BITSLL19 << i19) | (BITSLL23 << i23);
    mask4 |= (BITSLL29 << i29) | (BITSLL31 << i31);
    locsieve32[threadIdx.x * block_size / threadsPerBlock / 32 + 4] = mask;
    locsieve32[threadIdx.x * block_size / threadsPerBlock / 32 + 5] = mask2;
    locsieve32[threadIdx.x * block_size / threadsPerBlock / 32 + 6] = mask3;
    locsieve32[threadIdx.x * block_size / threadsPerBlock / 32 + 7] = mask4;
    i37 = mod_const_p (bit_to_clr[11] - thread_start, 37);
    i41 = mod_const_p (bit_to_clr[12] - thread_start, 41);
    i43 = mod_const_p (bit_to_clr[13] - thread_start, 43);
    i47 = mod_const_p (bit_to_clr[14] - thread_start, 47);
    i53 = mod_const_p (bit_to_clr[15] - thread_start, 53);
    i59 = mod_const_p (bit_to_clr[16] - thread_start, 59);
    i61 = mod_const_p (bit_to_clr[17] - thread_start, 61);
    for (j = 0; ; ) {
        mask = 1 << i37;
        mask |= (1 << i41) | (1 << i43);
        mask |= (1 << i47) | (1 << i53);
        mask |= (1 << i59) | (1 << i61);
        locsieve32[threadIdx.x * block_size / threadsPerBlock / 32 + j] |= mask;
        j++;
        if (j == block_size / threadsPerBlock / 32) break;
        i37 = bump_mod_p (i37, -32, 37);
        i41 = bump_mod_p (i41, -32, 41);
        i43 = bump_mod_p (i43, -32, 43);
        i47 = bump_mod_p (i47, -32, 47);
        i53 = bump_mod_p (i53, -32, 53);
        i59 = bump_mod_p (i59, -32, 59);
        i61 = bump_mod_p (i61, -32, 61);
    }
    if (primesNotSieved + primesHandledWithSpecialCode > 18) {
        uint32 i67, i71, i73, i79, i83, i89, i97, i101, i103, i107, i109, i113, i127;
        uint64 mask;
        i67 = mod_const_p (bit_to_clr[18] - thread_start, 67);
        i71 = mod_const_p (bit_to_clr[19] - thread_start, 71);
        i73 = mod_const_p (bit_to_clr[20] - thread_start, 73);
        i79 = mod_const_p (bit_to_clr[21] - thread_start, 79);
        i83 = mod_const_p (bit_to_clr[22] - thread_start, 83);
        i89 = mod_const_p (bit_to_clr[23] - thread_start, 89);
        i97 = mod_const_p (bit_to_clr[24] - thread_start, 97);
        for (j = 0; ; ) {
            mask = (uint64) 1 << i67;
            mask |= ((uint64) 1 << i71) | ((uint64) 1 << i73);
            mask |= ((uint64) 1 << i79) | ((uint64) 1 << i83);
            mask |= ((uint64) 1 << i89) | ((uint64) 1 << i97);
            locsieve64[threadIdx.x * block_size / threadsPerBlock / 64 + j] |= mask;
            j++;
            if (j == block_size / threadsPerBlock / 64) break;
            i67 = bump_mod_p (i67, -64, 67);
            i71 = bump_mod_p (i71, -64, 71);
            i73 = bump_mod_p (i73, -64, 73);
            i79 = bump_mod_p (i79, -64, 79);
            i83 = bump_mod_p (i83, -64, 83);
            i89 = bump_mod_p (i89, -64, 89);
            i97 = bump_mod_p (i97, -64, 97);
        }
        i101 = mod_const_p (bit_to_clr[25] - thread_start, 101);
        i103 = mod_const_p (bit_to_clr[26] - thread_start, 103);
        i107 = mod_const_p (bit_to_clr[27] - thread_start, 107);
        i109 = mod_const_p (bit_to_clr[28] - thread_start, 109);
        i113 = mod_const_p (bit_to_clr[29] - thread_start, 113);
        i127 = mod_const_p (bit_to_clr[30] - thread_start, 127);
        for (j = 0; ; ) {
            mask = (uint64) 1 << i101;
            mask |= ((uint64) 1 << i103) | ((uint64) 1 << i107);
            mask |= ((uint64) 1 << i109) | ((uint64) 1 << i113);
            mask |= (uint64) 1 << i127;
            locsieve64[threadIdx.x * block_size / threadsPerBlock / 64 + j] |= mask;
            j++;
            if (j == block_size / threadsPerBlock / 64) break;
            i101 = bump_mod_p (i101, -64, 101);
            i103 = bump_mod_p (i103, -64, 103);
            i107 = bump_mod_p (i107, -64, 107);
            i109 = bump_mod_p (i109, -64, 109);
            i113 = bump_mod_p (i113, -64, 113);
            i127 = bump_mod_p (i127, -64, 127);
        }
    }
    if (primesNotSieved + primesHandledWithSpecialCode > 31) {
        uint32 i131, i137, i139, i149, i151, i157, i163, i167, i173, i179, i181, i191;
        uint32 i193, i197, i199, i211, i223, i227, i229, i233, i239, i241, i251;
        uint64 mask1, mask2;
        i131 = mod_const_p (bit_to_clr[31] - thread_start, 131);
        i137 = mod_const_p (bit_to_clr[32] - thread_start, 137);
        i139 = mod_const_p (bit_to_clr[33] - thread_start, 139);
        i149 = mod_const_p (bit_to_clr[34] - thread_start, 149);
        i151 = mod_const_p (bit_to_clr[35] - thread_start, 151);
        i157 = mod_const_p (bit_to_clr[36] - thread_start, 157);
        for (j = 0; ; ) {
            mask1 = ((uint64) 1 << i131) | ((uint64) 1 << i137);
            mask1 |= ((uint64) 1 << i139) | ((uint64) 1 << i149);
            mask1 |= ((uint64) 1 << i151) | ((uint64) 1 << i157);
            mask2 = ((uint64) 1 << (i131 - 64)) | ((uint64) 1 << (i137 - 64));
            mask2 |= ((uint64) 1 << (i139 - 64)) | ((uint64) 1 << (i149 - 64));
            mask2 |= ((uint64) 1 << (i151 - 64)) | ((uint64) 1 << (i157 - 64));
            locsieve64[threadIdx.x * block_size / threadsPerBlock / 64 + j * 2] |= mask1;
            locsieve64[threadIdx.x * block_size / threadsPerBlock / 64 + j * 2 + 1] |= mask2;
            j++;
            if (j == block_size / threadsPerBlock / 128) break;
            i131 = bump_mod_p (i131, -128, 131);
            i137 = bump_mod_p (i137, -128, 137);
            i139 = bump_mod_p (i139, -128, 139);
            i149 = bump_mod_p (i149, -128, 149);
            i151 = bump_mod_p (i151, -128, 151);
            i157 = bump_mod_p (i157, -128, 157);
        }
        i163 = mod_const_p (bit_to_clr[37] - thread_start, 163);
        i167 = mod_const_p (bit_to_clr[38] - thread_start, 167);
        i173 = mod_const_p (bit_to_clr[39] - thread_start, 173);
        i179 = mod_const_p (bit_to_clr[40] - thread_start, 179);
        i181 = mod_const_p (bit_to_clr[41] - thread_start, 181);
        i191 = mod_const_p (bit_to_clr[42] - thread_start, 191);
        for (j = 0; ; ) {
            mask1 = ((uint64) 1 << i163) | ((uint64) 1 << i167);
            mask1 |= ((uint64) 1 << i173) | ((uint64) 1 << i179);
            mask1 |= ((uint64) 1 << i181) | ((uint64) 1 << i191);
            mask2 = ((uint64) 1 << (i163 - 64)) | ((uint64) 1 << (i167 - 64));
            mask2 |= ((uint64) 1 << (i173 - 64)) | ((uint64) 1 << (i179 - 64));
            mask2 |= ((uint64) 1 << (i181 - 64)) | ((uint64) 1 << (i191 - 64));
            locsieve64[threadIdx.x * block_size / threadsPerBlock / 64 + j * 2] |= mask1;
            locsieve64[threadIdx.x * block_size / threadsPerBlock / 64 + j * 2 + 1] |= mask2;
            j++;
            if (j == block_size / threadsPerBlock / 128) break;

            i163 = bump_mod_p (i163, -128, 163);
            i167 = bump_mod_p (i167, -128, 167);
            i173 = bump_mod_p (i173, -128, 173);
            i179 = bump_mod_p (i179, -128, 179);
            i181 = bump_mod_p (i181, -128, 181);
            i191 = bump_mod_p (i191, -128, 191);
        }
        i193 = mod_const_p (bit_to_clr[43] - thread_start, 193);
        i197 = mod_const_p (bit_to_clr[44] - thread_start, 197);
        i199 = mod_const_p (bit_to_clr[45] - thread_start, 199);
        i211 = mod_const_p (bit_to_clr[46] - thread_start, 211);
        i223 = mod_const_p (bit_to_clr[47] - thread_start, 223);
        i227 = mod_const_p (bit_to_clr[48] - thread_start, 227);
        for (j = 0; ; ) {
            mask1 = ((uint64) 1 << i193) | ((uint64) 1 << i197);
            mask1 |= ((uint64) 1 << i199) | ((uint64) 1 << i211);
            mask1 |= ((uint64) 1 << i223) | ((uint64) 1 << i227);
            mask2 = ((uint64) 1 << (i193 - 64)) | ((uint64) 1 << (i197 - 64));
            mask2 |= ((uint64) 1 << (i199 - 64)) | ((uint64) 1 << (i211 - 64));
            mask2 |= ((uint64) 1 << (i223 - 64)) | ((uint64) 1 << (i227 - 64));
            locsieve64[threadIdx.x * block_size / threadsPerBlock / 64 + j * 2] |= mask1;
            locsieve64[threadIdx.x * block_size / threadsPerBlock / 64 + j * 2 + 1] |= mask2;
            j++;
            if (j == block_size / threadsPerBlock / 128) break;
            i193 = bump_mod_p (i193, -128, 193);
            i197 = bump_mod_p (i197, -128, 197);
            i199 = bump_mod_p (i199, -128, 199);
            i211 = bump_mod_p (i211, -128, 211);
            i223 = bump_mod_p (i223, -128, 223);
            i227 = bump_mod_p (i227, -128, 227);
        }
        i229 = mod_const_p (bit_to_clr[49] - thread_start, 229);
        i233 = mod_const_p (bit_to_clr[50] - thread_start, 233);
        i239 = mod_const_p (bit_to_clr[51] - thread_start, 239);
        i241 = mod_const_p (bit_to_clr[52] - thread_start, 241);
        i251 = mod_const_p (bit_to_clr[53] - thread_start, 251);
        for (j = 0; ; ) {
            mask1 = (uint64) 1 << i229;
            mask1 |= ((uint64) 1 << i233) | ((uint64) 1 << i239);
            mask1 |= ((uint64) 1 << i241) | ((uint64) 1 << i251);
            mask2 = (uint64) 1 << (i229 - 64);
            mask2 |= ((uint64) 1 << (i233 - 64)) | ((uint64) 1 << (i239 - 64));
            mask2 |= ((uint64) 1 << (i241 - 64)) | ((uint64) 1 << (i251 - 64));
            locsieve64[threadIdx.x * block_size / threadsPerBlock / 64 + j * 2] |= mask1;
            locsieve64[threadIdx.x * block_size / threadsPerBlock / 64 + j * 2 + 1] |= mask2;
            j++;
            if (j == block_size / threadsPerBlock / 128) break;
            i229 = bump_mod_p (i229, -128, 229);
            i233 = bump_mod_p (i233, -128, 233);
            i239 = bump_mod_p (i239, -128, 239);
            i241 = bump_mod_p (i241, -128, 241);
            i251 = bump_mod_p (i251, -128, 251);
        }
    }
#define SIEVE_256_BIT(n,p) i = mod_const_p (bit_to_clr[n] - thread_start, p); if (i < 256) locsieve[j * threadsPerBlock * 32 + threadIdx.x * 32 + (i >> 3)] |= 1 << (i & 7);
    if (primesNotSieved + primesHandledWithSpecialCode > 54)
        for (j = 0; j < block_size / (threadsPerBlock * 256); j++) {
            SIEVE_256_BIT (54, 257);
            SIEVE_256_BIT (55, 263);
            SIEVE_256_BIT (56, 269);
            SIEVE_256_BIT (57, 271);
            SIEVE_256_BIT (58, 277);
            SIEVE_256_BIT (59, 281);
            SIEVE_256_BIT (60, 283);
            SIEVE_256_BIT (61, 293);
            SIEVE_256_BIT (62, 307);
            SIEVE_256_BIT (63, 311);
            SIEVE_256_BIT (64, 313);
            SIEVE_256_BIT (65, 317);
            SIEVE_256_BIT (66, 331);
            SIEVE_256_BIT (67, 337);
            SIEVE_256_BIT (68, 347);
            SIEVE_256_BIT (69, 349);
            SIEVE_256_BIT (70, 353);
            SIEVE_256_BIT (71, 359);
            SIEVE_256_BIT (72, 367);
            SIEVE_256_BIT (73, 373);
            SIEVE_256_BIT (74, 379);
            SIEVE_256_BIT (75, 383);
            SIEVE_256_BIT (76, 389);
            SIEVE_256_BIT (77, 397);
            SIEVE_256_BIT (78, 401);
            SIEVE_256_BIT (79, 409);
            SIEVE_256_BIT (80, 419);
            SIEVE_256_BIT (81, 421);
            SIEVE_256_BIT (82, 431);
            SIEVE_256_BIT (83, 433);
            SIEVE_256_BIT (84, 439);
            SIEVE_256_BIT (85, 443);
            SIEVE_256_BIT (86, 449);
            SIEVE_256_BIT (87, 457);
            SIEVE_256_BIT (88, 461);
            SIEVE_256_BIT (89, 463);
            SIEVE_256_BIT (90, 467);
            SIEVE_256_BIT (91, 479);
            SIEVE_256_BIT (92, 487);
            SIEVE_256_BIT (93, 491);
            SIEVE_256_BIT (94, 499);
            SIEVE_256_BIT (95, 503);
            SIEVE_256_BIT (96, 509);
        }
#undef bit_to_clr
    __syncthreads();
    pinfo_dev += PINFO_PAD1;
    i = 0;
    for ( ; i < 1 && i < maxp; i++, pinfo_dev += threadsPerBlock * 8) {
        for (j = 0; j < 8; j++) {
            uint8 mask;
            bclr = pinfo32[j * threadsPerBlock / 8 + threadIdx.x / 8];
            p = bclr >> 16;
            bclr &= 0xFFFF;
            pinv = pinfo32[threadsPerBlock + j * threadsPerBlock / 8 + threadIdx.x / 8];
            bclr = mod_p (bclr - block_start, p, pinv) + (threadIdx.x & 7) * p;
            mask = 1 << (bclr & 7);
            bclr = bclr >> 3;
            do {
                uint8 val = locsieve8[bclr];
                if (! (val & mask)) locsieve8[bclr] = val | mask;
                bclr += p;
            } while (bclr < block_size_in_bytes);
        }
    }
    for ( ; i < sieving64KCrossover && i < maxp; i += 3, pinfo_dev += threadsPerBlock * 24) {
        uint32 p3, pinv3, bclr3, p2, pinv2, bclr2;
        bclr3 = pinfo32[threadIdx.x];
        bclr2 = pinfo32[threadsPerBlock*2 + threadIdx.x];
        bclr = pinfo32[threadsPerBlock*4 + threadIdx.x];
        p3 = bclr3 >> 16;
        p2 = bclr2 >> 16;
        p = bclr >> 16;
        bclr3 &= 0xFFFF;
        bclr2 &= 0xFFFF;
        bclr &= 0xFFFF;
        pinv3 = pinfo32[threadsPerBlock + threadIdx.x];
        pinv2 = pinfo32[threadsPerBlock*3 + threadIdx.x];
        pinv = pinfo32[threadsPerBlock*5 + threadIdx.x];
        bclr3 = mod_p (bclr3 - block_start, p3, pinv3);
        bclr2 = mod_p (bclr2 - block_start, p2, pinv2);
        bclr = mod_p (bclr - block_start, p, pinv);
        do {
            bitOrSometimesIffy (locsieve, bclr3);
            bclr3 += p3;
        } while (bclr3 < block_size);
        do {
            bitOrSometimesIffy (locsieve, bclr2);
            bclr2 += p2;
        } while (bclr2 < block_size);
        do {
            bitOrSometimesIffy (locsieve, bclr);
            bclr += p;
        } while (bclr < block_size);
    }
    if (i < maxp) {
        uint32 bclr2, pinv2, p2;
        bclr2 = pinfo32[threadIdx.x];
        pinv2 = pinfo32[threadsPerBlock + threadIdx.x];
        p2 = pinfo32[threadsPerBlock * 2 + threadIdx.x];
        bclr2 = mod_p (bclr2 - block_start, p2, pinv2);
        if (bclr2 < block_size) {
            bitOr (locsieve, bclr2);
            bclr2 += p2;
            if (bclr2 < block_size) bitOr (locsieve, bclr2);
        }
        bclr = pinfo32[threadsPerBlock * 3 + threadIdx.x];
        pinv = pinfo32[threadsPerBlock * 4 + threadIdx.x];
        p = pinfo32[threadsPerBlock * 5 + threadIdx.x];
        bclr = mod_p (bclr - block_start, p, pinv);
        if (bclr < block_size) bitOr (locsieve, bclr);
        i += 2, pinfo_dev += threadsPerBlock * 24;
    }
    for ( ; i < sieving128KCrossover + 1 && i < maxp; i += 3, pinfo_dev += threadsPerBlock * 12) {
        uint32 tmp3 = pinfo32[threadIdx.x];
        uint32 tmp2 = pinfo32[threadsPerBlock + threadIdx.x];
        uint32 tmp = pinfo32[threadsPerBlock*2 + threadIdx.x];
        uint32 bclr3, p3, pinv3, bclr2, p2, pinv2;
        bclr3 = tmp3 & 0x0003FFFF;
        bclr2 = tmp2 & 0x0003FFFF;
        bclr = tmp & 0x0003FFFF;
        pinv3 = pinv - (tmp3 >> 25);
        pinv2 = pinv - (tmp2 >> 25);
        pinv -= tmp >> 25;
        p3 = p + ((tmp3 & 0x01FC0000) >> 17);
        p2 = p + ((tmp2 & 0x01FC0000) >> 17);
        p += (tmp & 0x01FC0000) >> 17;
        bclr3 = mod_p (bclr3 - block_start, p3, pinv3);
        bclr2 = mod_p (bclr2 - block_start, p2, pinv2);
        bclr = mod_p (bclr - block_start, p, pinv);
        if (bclr3 < block_size) bitOr (locsieve, bclr3);
        if (bclr2 < block_size) bitOr (locsieve, bclr2);
        if (bclr < block_size) bitOr (locsieve, bclr);
    }
    if (i < maxp) {
        bclr = pinfo32[threadIdx.x];
        pinv = pinfo32[threadsPerBlock + threadIdx.x];
        p = pinfo32[threadsPerBlock * 2 + threadIdx.x];
        bclr = sloppy_mod_p (bclr - block_start, p, pinv);
        if (bclr < block_size) bitOr (locsieve, bclr);
        i++, pinfo_dev += threadsPerBlock * 12;
    }
    for ( ; i < sieving1MCrossover && i < maxp; i += 4, pinfo_dev += threadsPerBlock * 16) {
        uint32 tmp4 = pinfo32[threadIdx.x];
        uint32 tmp3 = pinfo32[threadsPerBlock + threadIdx.x];
        uint32 tmp2 = pinfo32[threadsPerBlock*2 + threadIdx.x];
        uint32 tmp = pinfo32[threadsPerBlock*3 + threadIdx.x];
        uint32 bclr4, p4, pinv4, bclr3, p3, pinv3, bclr2, p2, pinv2;
        bclr4 = tmp4 & 0x000FFFFF;
        bclr3 = tmp3 & 0x000FFFFF;
        bclr2 = tmp2 & 0x000FFFFF;
        bclr = tmp & 0x000FFFFF;
        pinv4 = pinv - (tmp4 >> 27);
        pinv3 = pinv - (tmp3 >> 27);
        pinv2 = pinv - (tmp2 >> 27);
        pinv -= tmp >> 27;
        p4 = p + ((tmp4 & 0x07F00000) >> 19);
        p3 = p + ((tmp3 & 0x07F00000) >> 19);
        p2 = p + ((tmp2 & 0x07F00000) >> 19);
        p += (tmp & 0x07F00000) >> 19;
        bclr4 = sloppy_mod_p (bclr4 - block_start, p4, pinv4);
        bclr3 = sloppy_mod_p (bclr3 - block_start, p3, pinv3);
        bclr2 = sloppy_mod_p (bclr2 - block_start, p2, pinv2);
        bclr = sloppy_mod_p (bclr - block_start, p, pinv);
        if (bclr4 < block_size) bitOr (locsieve, bclr4);
        if (bclr3 < block_size) bitOr (locsieve, bclr3);
        if (bclr2 < block_size) bitOr (locsieve, bclr2);
        if (bclr < block_size) bitOr (locsieve, bclr);
    }
    if (i < maxp) {
        bclr = pinfo32[threadIdx.x];
        pinv = pinfo32[threadsPerBlock + threadIdx.x];
        p = pinfo32[threadsPerBlock * 2 + threadIdx.x];
        bclr = sloppy_mod_p (bclr - block_start, p, pinv);
        if (bclr < block_size)
            bitOr (locsieve, bclr);
        i++, pinfo_dev += threadsPerBlock * 12;
    }
    for ( ; i < maxp; i += 4, pinfo_dev += threadsPerBlock * 16) {
        uint32 tmp4 = pinfo32[threadIdx.x];
        uint32 tmp3 = pinfo32[threadsPerBlock + threadIdx.x];
        uint32 tmp2 = pinfo32[threadsPerBlock*2 + threadIdx.x];
        uint32 tmp = pinfo32[threadsPerBlock*3 + threadIdx.x];
        uint32 bclr4, p4, pinv4, bclr3, p3, pinv3, bclr2, p2, pinv2;
        bclr4 = tmp4 & 0x00FFFFFF;
        bclr3 = tmp3 & 0x00FFFFFF;
        bclr2 = tmp2 & 0x00FFFFFF;
        bclr = tmp & 0x00FFFFFF;
        pinv4 = pinv - (tmp4 >> 31);
        pinv3 = pinv - (tmp3 >> 31);
        pinv2 = pinv - (tmp2 >> 31);
        pinv -= tmp >> 31;
        p4 = p + ((tmp4 & 0x7F000000) >> 23);
        p3 = p + ((tmp3 & 0x7F000000) >> 23);
        p2 = p + ((tmp2 & 0x7F000000) >> 23);
        p += (tmp & 0x7F000000) >> 23;
        bclr4 = sloppy_mod_p (bclr4 - block_start, p4, pinv4);
        bclr3 = sloppy_mod_p (bclr3 - block_start, p3, pinv3);
        bclr2 = sloppy_mod_p (bclr2 - block_start, p2, pinv2);
        bclr = sloppy_mod_p (bclr - block_start, p, pinv);
        if (bclr4 < block_size) bitOr (locsieve, bclr4);
        if (bclr3 < block_size) bitOr (locsieve, bclr3);
        if (bclr2 < block_size) bitOr (locsieve, bclr2);
        if (bclr < block_size) bitOr (locsieve, bclr);
    }
    __syncthreads();
    big_bit_array_dev += blockIdx.x * block_size_in_bytes;
    for (j = 0; j < block_size / (threadsPerBlock * 32); j++)
        big_bit_array32[j * threadsPerBlock + threadIdx.x] = ~locsieve32[j * threadsPerBlock + threadIdx.x];
}

__device__ unsigned int modularinverse (uint32 n, uint32 orig_d)
{
    uint32 d = orig_d;
    int x, lastx, q, t;
    x = 0;
    lastx = 1;
    while (d != 0)
    {
        q = n / d;
        t = d; d = n - q * d; n = t;
        t = x; x = lastx - q * x; lastx = t;
    }
    if (lastx < 0) return (lastx + orig_d);
    return (lastx);
}

__global__ void __launch_bounds__(256,6) CalcModularInverses (uint64 exponent, int *calc_info)
{
    // For each sieving prime p, stored at calc_info[MAX_PRIMES_PER_THREAD*4 + 2*i],
    // compute the inverse of facdist = 2 * NUM_CLASSES * exponent (mod p) and
    // store it in the adjacent slot (... + 2*i + 1).  facdist is the step
    // between consecutive factor candidates of one class.
    uint32 index, prime;
    uint64 facdist;
    if (blockIdx.x == 0) {
        // Block 0 covers only the specially-coded primes; the first
        // primesNotSieved primes are never sieved and are skipped here.
        if (threadIdx.x < primesNotSieved || threadIdx.x >= primesNotSieved + primesHandledWithSpecialCode) return;
        index = threadIdx.x;
    }
    else index = primesNotSieved + primesHandledWithSpecialCode + (blockIdx.x - 1) * threadsPerBlock + threadIdx.x;
    prime = calc_info[MAX_PRIMES_PER_THREAD*4 + index * 2];
    facdist = (uint64) (2 * NUM_CLASSES) * exponent;
    calc_info[MAX_PRIMES_PER_THREAD*4 + index * 2 + 1] = modularinverse ((uint32) (facdist % prime), prime);
}

__global__ void __launch_bounds__(256,6) CalcBitToClear (uint64 exponent, int96 *k_base, int *calc_info, uint8 *pinfo_dev)
{
    // For every sieving prime, compute the first sieve bit whose factor
    // candidate f = 2*k*exponent + 1 is divisible by that prime, and store it
    // into this prime's slot of the sieving-info block (pinfo_dev).
    uint32 index, mask, prime, modinv, bit_to_clear;
    if (blockIdx.x == 0) {
        // Block 0: the specially-coded primes; their bit-to-clear values are
        // 16-bit slots packed 2 bytes apart at the start of pinfo_dev.
        if (threadIdx.x < primesNotSieved || threadIdx.x >= primesNotSieved + primesHandledWithSpecialCode) return;
        pinfo_dev += threadIdx.x * 2;
        index = threadIdx.x;
    }
    else {
        // Other blocks: locate this row's pinfo area, prime index, index
        // stride and merge mask from the host-prepared calc_info table.
        pinfo_dev += calc_info[(blockIdx.x - 1)];
        pinfo_dev += threadIdx.x * 4;
        index = calc_info[MAX_PRIMES_PER_THREAD + (blockIdx.x - 1)];
        index += threadIdx.x * calc_info[MAX_PRIMES_PER_THREAD*2 + (blockIdx.x - 1)];
        mask = calc_info[MAX_PRIMES_PER_THREAD*3 + (blockIdx.x - 1)];
    }
    prime = calc_info[MAX_PRIMES_PER_THREAD*4 + index * 2];
    modinv = calc_info[MAX_PRIMES_PER_THREAD*4 + index * 2 + 1];
    uint64 k_mod_p, factor_mod_p;
    k_mod_p = (((uint64) k_base->d1 << 32) + k_base->d0) % prime;
    factor_mod_p = (2 * k_mod_p * exponent + 1) % prime;
    bit_to_clear = ((uint64) prime - factor_mod_p) * modinv % prime;
    // BUGFIX: the original stored through undeclared identifiers pinfo16 /
    // pinfo32 (nvcc compile error).  Write through pinfo_dev, which was
    // advanced to the proper slot above: a 16-bit store for the special-code
    // primes, and a masked 32-bit merge for the regular rows.
    if (blockIdx.x == 0) *(uint16 *) pinfo_dev = bit_to_clear;
    else *(uint32 *) pinfo_dev = (*(uint32 *) pinfo_dev & mask) + bit_to_clear;
}
"""
# Compile the GPU-sieve kernel source and resolve its three entry points.
mod_gpusieve = SourceModule(kernel_code_gpusieve)
CalcModularInverses = mod_gpusieve.get_function("CalcModularInverses")
CalcBitToClear = mod_gpusieve.get_function("CalcBitToClear")
SegSieve = mod_gpusieve.get_function("SegSieve")

# CUDA source for the trial-factoring kernel and its 96/192-bit helpers.
kernel_code_tf = """
// 64-bit unsigned integer used throughout the TF kernels
typedef long long unsigned int uint64;
// 96-bit unsigned integer as three 32-bit limbs, d0 = least significant
typedef struct
{
  unsigned int d0,d1,d2;
}int96;
// 192-bit unsigned integer as six 32-bit limbs, d0 = least significant
typedef struct
{
  unsigned int d0,d1,d2,d3,d4,d5;
}int192;

__device__ static unsigned int __umul32(unsigned int a, unsigned int b)
{
    // Low 32 bits of the 32x32-bit product a*b.
    unsigned int r;
    asm("mul.lo.u32 %0, %1, %2;" : "=r" (r) : "r" (a) , "r" (b));
    return r;
}

__device__ static unsigned int __umul32hi(unsigned int a, unsigned int b)
{
    // High 32 bits of the 32x32-bit product a*b (CUDA intrinsic).
    return __umulhi(a,b);
}

__device__ static unsigned int __add_cc(unsigned int a, unsigned int b)
{
    // a + b, writing the carry-out to the PTX condition-code register.
    unsigned int r;
    asm("add.cc.u32 %0, %1, %2;" : "=r" (r) : "r" (a) , "r" (b));
    return r;
}

__device__ static unsigned int __addc_cc(unsigned int a, unsigned int b)
{
    // a + b + carry-in, writing the carry-out to the condition-code register.
    unsigned int r;
    asm("addc.cc.u32 %0, %1, %2;" : "=r" (r) : "r" (a) , "r" (b));
    return r;
}

__device__ static unsigned int __addc(unsigned int a, unsigned int b)
{
    // a + b + carry-in; the carry-out is discarded.
    unsigned int r;
    asm("addc.u32 %0, %1, %2;" : "=r" (r) : "r" (a) , "r" (b));
    return r;
}

__device__ static unsigned int __sub_cc(unsigned int a, unsigned int b)
{
    // a - b, writing the borrow to the condition-code register.
    unsigned int r;
    asm("sub.cc.u32 %0, %1, %2;" : "=r" (r) : "r" (a) , "r" (b));
    return r;
}

__device__ static unsigned int __subc_cc(unsigned int a, unsigned int b)
{
    // a - b - borrow-in, writing the borrow-out to the condition-code register.
    unsigned int r;
    asm("subc.cc.u32 %0, %1, %2;" : "=r" (r) : "r" (a) , "r" (b));
    return r;
}

__device__ static unsigned int __subc(unsigned int a, unsigned int b)
{
    // a - b - borrow-in; the borrow-out is discarded.
    unsigned int r;
    asm("subc.u32 %0, %1, %2;" : "=r" (r) : "r" (a) , "r" (b));
    return r;
}

__device__ static int cmp_ge_96(int96 a, int96 b)
{
    // Nonzero when the 96-bit value a >= b (d2 is the most significant limb):
    // compare limbs from the top down, deciding at the first difference.
    if (a.d2 != b.d2) return (a.d2 > b.d2);
    if (a.d1 != b.d1) return (a.d1 > b.d1);
    return (a.d0 >= b.d0);
}

__device__ static void shl_96(int96 *a)
{
    // Shift a left by one bit (a *= 2); a carry out of d2 is discarded.
    a->d0 = __add_cc (a->d0, a->d0);
    a->d1 = __addc_cc(a->d1, a->d1);
    a->d2 = __addc   (a->d2, a->d2);
}

__device__ static void shl_192(int192 *a)
{
    // Shift a left by one bit (a *= 2).
    // BUGFIX: the original stopped at d4 with addc.cc, leaving the carry flag
    // dangling and never shifting the top limb; propagate into d5 as well.
    // (The only caller's subsequent mod_192_96 never reads d5, so this fixes
    // the helper's contract without changing current program results.)
    a->d0 = __add_cc (a->d0, a->d0);
    a->d1 = __addc_cc(a->d1, a->d1);
    a->d2 = __addc_cc(a->d2, a->d2);
    a->d3 = __addc_cc(a->d3, a->d3);
    a->d4 = __addc_cc(a->d4, a->d4);
    a->d5 = __addc   (a->d5, a->d5);
}

__device__ static void sub_96(int96 *res, int96 a, int96 b)
{
    // res = a - b (96-bit, borrow chained through the limbs; final borrow dropped).
    res->d0 = __sub_cc (a.d0, b.d0);
    res->d1 = __subc_cc(a.d1, b.d1);
    res->d2 = __subc   (a.d2, b.d2);
}

__device__ static void square_96_192(int192 *res, int96 a)
{
    // res = a * a: square a 96-bit value into a full 192-bit result using
    // inline PTX.  The first variant uses the fused mad/madc carry-chain
    // instructions (needs sm_20+ and CUDA 4.1+); the fallback builds the same
    // sums from separate mul/add instructions.
#if (__CUDA_ARCH__ >= 200) && (CUDART_VERSION >= 4010)
    asm("{\\n\\t"
        ".reg .u32 a2;\\n\\t"
        "mul.lo.u32      %0, %6, %6;\\n\\t"
        "mul.lo.u32      %1, %6, %7;\\n\\t"
        "mul.hi.u32      %2, %6, %7;\\n\\t"
        "add.cc.u32      %1, %1, %1;\\n\\t"
        "addc.cc.u32     %2, %2, %2;\\n\\t"
        "madc.hi.cc.u32  %3, %7, %7, 0;\\n\\t"
        "madc.lo.u32     %4, %8, %8, 0;\\n\\t"
        "add.u32         a2, %8, %8;\\n\\t"
        "mad.hi.cc.u32   %1, %6, %6, %1;\\n\\t"
        "madc.lo.cc.u32  %2, %7, %7, %2;\\n\\t"
        "madc.lo.cc.u32  %3, %7, a2, %3;\\n\\t"
        "addc.u32        %4, %4,  0;\\n\\t"
        "mad.lo.cc.u32   %2, %6, a2, %2;\\n\\t"
        "madc.hi.cc.u32  %3, %6, a2, %3;\\n\\t"
        "madc.hi.cc.u32  %4, %7, a2, %4;\\n\\t"
        "madc.hi.u32     %5, %8, %8, 0;\\n\\t"
        "}"
        : "=r" (res->d0), "=r" (res->d1), "=r" (res->d2), "=r" (res->d3), "=r" (res->d4), "=r" (res->d5)
        : "r" (a.d0), "r" (a.d1), "r" (a.d2));
#else
    asm("{\\n\\t"
        ".reg .u32 a2, t1;\\n\\t"
        "mul.lo.u32      %0, %6, %6;\\n\\t"
        "mul.lo.u32      %1, %6, %7;\\n\\t"
        "mul.hi.u32      %2, %6, %7;\\n\\t"
        "add.cc.u32      %1, %1, %1;\\n\\t"
        "addc.cc.u32     %2, %2, %2;\\n\\t"
        "mul.hi.u32      t1, %7, %7;\\n\\t"
        "addc.cc.u32     %3, t1,  0;\\n\\t"
        "mul.lo.u32      t1, %8, %8;\\n\\t"
        "addc.u32        %4, t1,  0;\\n\\t"
        "add.u32         a2, %8, %8;\\n\\t"
        "mul.hi.u32      t1, %6, %6;\\n\\t"
        "add.cc.u32      %1, %1, t1;\\n\\t"
        "mul.lo.u32      t1, %7, %7;\\n\\t"
        "addc.cc.u32     %2, %2, t1;\\n\\t"
        "mul.lo.u32      t1, %7, a2;\\n\\t"
        "addc.cc.u32     %3, %3, t1;\\n\\t"
        "addc.u32        %4, %4,  0;\\n\\t"
        "mul.lo.u32      t1, %6, a2;\\n\\t"
        "add.cc.u32      %2, %2, t1;\\n\\t"
        "mul.hi.u32      t1, %6, a2;\\n\\t"
        "addc.cc.u32     %3, %3, t1;\\n\\t"
        "mul.hi.u32      t1, %7, a2;\\n\\t"
        "addc.cc.u32     %4, %4, t1;\\n\\t"
        "mul.hi.u32      t1, %8, %8;\\n\\t"
        "addc.u32        %5, t1,  0;\\n\\t"
        "}"
        : "=r" (res->d0), "=r" (res->d1), "=r" (res->d2), "=r" (res->d3), "=r" (res->d4), "=r" (res->d5)
        : "r" (a.d0), "r" (a.d1), "r" (a.d2));
#endif
}

__device__ static void mod_192_96(int96 *res, int192 q, int96 n, float nf)
{
    // res = q mod n (192-bit value reduced by a 96-bit modulus) without
    // hardware division: repeatedly estimate a quotient chunk qi from the top
    // limbs using the float reciprocal nf (biased slightly low by the caller)
    // and subtract qi * n, shifted into position, narrowing q step by step.
    // NOTE(review): q.d5 is never examined, and the final remainder may still
    // exceed n -- the caller does the last compare/subtract.  Confirm the
    // input ranges against the calling kernel before reuse elsewhere.
    float qf;
    unsigned int qi;
    int192 nn;
    // Step 1: estimate bits ~[105..137) of the quotient (scale by 2^9 = 512).
    qf= __uint2float_rn(q.d4);
    qf= qf * 4294967296.0f + __uint2float_rn(q.d3);
    qf= qf * 4294967296.0f + __uint2float_rn(q.d2);
    qf*= 512.0f;
    qi=__float2uint_rz(qf*nf);
    nn.d1 =                                 __umul32(n.d0, qi);
    nn.d2 = __add_cc (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
    nn.d3 = __addc_cc(__umul32hi(n.d1, qi), __umul32(n.d2, qi));
    nn.d4 = __addc   (__umul32hi(n.d2, qi),                  0);
    // Shift nn left by 23 bits so it lines up under q before subtracting.
    nn.d4 = (nn.d4 << 23) + (nn.d3 >> 9);
    nn.d3 = (nn.d3 << 23) + (nn.d2 >> 9);
    nn.d2 = (nn.d2 << 23) + (nn.d1 >> 9);
    nn.d1 =  nn.d1 << 23;
    q.d1 = __sub_cc (q.d1, nn.d1);
    q.d2 = __subc_cc(q.d2, nn.d2);
    q.d3 = __subc_cc(q.d3, nn.d3);
    q.d4 = __subc   (q.d4, nn.d4);
    // Step 2: next quotient chunk (scale by 2^29, qi realigned by << 3).
    qf= __uint2float_rn(q.d4);
    qf= qf * 4294967296.0f + __uint2float_rn(q.d3);
    qf= qf * 4294967296.0f + __uint2float_rn(q.d2);
    qf*= 536870912.0f;
    qi=__float2uint_rz(qf*nf);
    qi <<= 3;
    nn.d1 =                                 __umul32(n.d0, qi);
    nn.d2 = __add_cc (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
    nn.d3 = __addc_cc(__umul32hi(n.d1, qi), __umul32(n.d2, qi));
    nn.d4 = __addc   (__umul32hi(n.d2, qi),                  0);
    q.d1 = __sub_cc (q.d1, nn.d1);
    q.d2 = __subc_cc(q.d2, nn.d2);
    q.d3 = __subc_cc(q.d3, nn.d3);
    q.d4 = __subc   (q.d4, nn.d4);
    // Step 3: quotient chunk scaled by 2^17, nn shifted left by 15 bits.
    qf= __uint2float_rn(q.d4);
    qf= qf * 4294967296.0f + __uint2float_rn(q.d3);
    qf= qf * 4294967296.0f + __uint2float_rn(q.d2);
    qf= qf * 4294967296.0f + __uint2float_rn(q.d1);
    qf*= 131072.0f;
    qi=__float2uint_rz(qf*nf);
    nn.d0 =                                 __umul32(n.d0, qi);
    nn.d1 = __add_cc (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
    nn.d2 = __addc_cc(__umul32hi(n.d1, qi), __umul32(n.d2, qi));
    nn.d3 = __addc   (__umul32hi(n.d2, qi),                  0);
    nn.d3 = (nn.d3 << 15) + (nn.d2 >> 17);
    nn.d2 = (nn.d2 << 15) + (nn.d1 >> 17);
    nn.d1 = (nn.d1 << 15) + (nn.d0 >> 17);
    nn.d0 =  nn.d0 << 15;
    q.d0 = __sub_cc (q.d0, nn.d0);
    q.d1 = __subc_cc(q.d1, nn.d1);
    q.d2 = __subc_cc(q.d2, nn.d2);
    q.d3 = __subc   (q.d3, nn.d3);
    // Step 4: final (unshifted) chunk; the low three limbs are the result.
    qf= __uint2float_rn(q.d3);
    qf= qf * 4294967296.0f + __uint2float_rn(q.d2);
    qf= qf * 4294967296.0f + __uint2float_rn(q.d1);
    qf= qf * 4294967296.0f + __uint2float_rn(q.d0);
    qi=__float2uint_rz(qf*nf);
    nn.d0 =                                 __umul32(n.d0, qi);
    nn.d1 = __add_cc (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
    nn.d2 = __addc   (__umul32hi(n.d1, qi), __umul32(n.d2, qi));
    q.d0 = __sub_cc (q.d0, nn.d0);
    q.d1 = __subc_cc(q.d1, nn.d1);
    q.d2 = __subc   (q.d2, nn.d2);
    res->d0=q.d0;
    res->d1=q.d1;
    res->d2=q.d2;
}

__device__ static void check_factor96(int96 f, int96 a, unsigned int *RES)
{
    // Report f as a factor when the modular residue a equals 1, skipping the
    // trivial candidate f == 1.  RES[0] counts hits; the first 10 factors are
    // stored as limb triples (d2, d1, d0) starting at RES[1].
    if (a.d2 != 0 || a.d1 != 0 || a.d0 != 1) return;   // residue != 1
    if (f.d2 == 0 && f.d1 == 0 && f.d0 == 1) return;   // trivial factor
    int slot = atomicInc(&RES[0], 10000);
    if (slot < 10)
    {
        RES[slot * 3 + 1] = f.d2;
        RES[slot * 3 + 2] = f.d1;
        RES[slot * 3 + 3] = f.d0;
    }
}

__device__ static void test_FC96_mfaktc_95(int96 f, int192 b, uint64 exp, unsigned int *RES, int shiftcount)
{
    // Fermat-test the candidate f: compute a = 2^exp mod f by left-to-right
    // binary exponentiation (square, conditionally double, reduce), starting
    // from the host-precomputed b covering the first bits of the exponent,
    // then record f via check_factor96 when the residue equals 1.
    int96 a;
    float ff;
    // ff ~= 1/f as a float; 0x3f7ffffb is just below 1.0f, so the
    // reciprocal estimate used by mod_192_96 is biased slightly low.
    ff= __uint2float_rn(f.d2);
    ff= ff * 4294967296.0f + __uint2float_rn(f.d1);
    ff= ff * 4294967296.0f + __uint2float_rn(f.d0);
    ff=__int_as_float(0x3f7ffffb) / ff;
    mod_192_96(&a,b,f,ff);
    // Align the remaining exponent bits at the top of the 64-bit word;
    // shiftcount bits were already consumed when the host built b.
    exp <<= 64 - shiftcount;
    while(exp)
    {
        square_96_192(&b,a);
        if(exp&0x8000000000000000ULL) shl_192(&b);   // current bit set: double
        mod_192_96(&a,b,f,ff);
        exp<<=1;
    }
    // mod_192_96 may leave a in [0, 2f); fold it into [0, f) before testing.
    if(cmp_ge_96(a,f))
    {
        sub_96(&a,a,f);
    }
    check_factor96(f, a, RES);
}

__device__ static void create_k_deltas(unsigned int *bit_array, unsigned int bits_to_process, int *total_bit_count, unsigned short *k_deltas)
{
    // Compact this block's chunk of the sieve bit array into a list of set-bit
    // positions ("k deltas") in shared memory.  Each thread counts the set
    // bits in its words, a shared-memory prefix sum assigns every thread a
    // disjoint slot range in k_deltas, then the bit positions are emitted.
    int i, words_per_thread, sieve_word, k_bit_base;
    __shared__ volatile unsigned short bitcount[256];
    words_per_thread = bits_to_process / 8192;   // 8192 = 256 threads * 32 bits/word
    bit_array += blockIdx.x * bits_to_process / 32 + threadIdx.x * words_per_thread;
    bitcount[threadIdx.x] = 0;
    for (i = 0; i < words_per_thread; i++) bitcount[threadIdx.x] += __popc(bit_array[i]);
    // Inclusive prefix sum over the 256 per-thread counts.  The first five
    // steps carry no barrier: they rely on warp-synchronous execution within
    // each 32-thread warp (bitcount is volatile).  NOTE(review): implicit
    // warp synchrony is not guaranteed on Volta+ -- confirm the target arch.
    if (threadIdx.x & 1) bitcount[threadIdx.x] += bitcount[threadIdx.x - 1];
    if (threadIdx.x & 2) bitcount[threadIdx.x] += bitcount[(threadIdx.x - 2) | 1];
    if (threadIdx.x & 4) bitcount[threadIdx.x] += bitcount[(threadIdx.x - 4) | 3];
    if (threadIdx.x & 8) bitcount[threadIdx.x] += bitcount[(threadIdx.x - 8) | 7];
    if (threadIdx.x & 16) bitcount[threadIdx.x] += bitcount[(threadIdx.x - 16) | 15];
    __syncthreads();
    if (threadIdx.x & 32) bitcount[threadIdx.x] += bitcount[(threadIdx.x - 32) | 31];
    __syncthreads();
    if (threadIdx.x & 64) bitcount[threadIdx.x] += bitcount[(threadIdx.x - 64) | 63];
    __syncthreads();
    if (threadIdx.x & 128) bitcount[threadIdx.x] += bitcount[127];
    __syncthreads();
    *total_bit_count = bitcount[255];
    // Second pass: walk this thread's words again, emitting one k_deltas
    // entry per set bit.  Thread t's slots start at total - prefix[t], i.e.
    // slot ranges are assigned in reverse thread order but stay disjoint.
    sieve_word = *bit_array;
    k_bit_base = threadIdx.x * words_per_thread * 32;
    for (i = *total_bit_count - bitcount[threadIdx.x]; ; i++) {
        int bit_to_test;
        while (sieve_word == 0) {
            if (--words_per_thread == 0) break;
            sieve_word = *++bit_array;
            k_bit_base += 32;
        }
        if (sieve_word == 0) break;
        bit_to_test = 31 - __clz(sieve_word);      // highest set bit
        sieve_word &= ~(1 << bit_to_test);
        k_deltas[i] = k_bit_base + bit_to_test;
    }
}

// Must stay in sync with the Python-side NUM_CLASSES / THREADS_PER_BLOCK.
#define NUM_CLASSES 4620
#define THREADS_PER_BLOCK 256

__device__ static void create_fbase96(int96 *f_base, int96 *k_base, unsigned int exp_hi, unsigned int exp_low, unsigned int bits_to_process)
{
    // Base factor candidate for this block:
    // f_base = 2 * (k_base + blockIdx.x * bits_to_process * NUM_CLASSES) * exp + 1,
    // computed in 96-bit limb arithmetic (exp split into exp_hi:exp_low).
    unsigned int k_base_d0, k_base_d1;
    k_base_d0 = __add_cc (k_base->d0, __umul32  (blockIdx.x * bits_to_process, NUM_CLASSES));
    k_base_d1 = __addc   (k_base->d1, __umul32hi(blockIdx.x * bits_to_process, NUM_CLASSES));
    f_base->d0 =                                          __umul32(k_base_d0, exp_low);
    f_base->d1 = __add_cc(__umul32hi(k_base_d0, exp_low), __umul32(k_base_d1, exp_low));
    f_base->d2 = __addc  (__umul32hi(k_base_d1, exp_low), __umul32hi(k_base_d0, exp_hi));
    f_base->d1 = __add_cc(                    f_base->d1, __umul32(k_base_d0, exp_hi));
    f_base->d2 = __addc  (                    f_base->d2, __umul32(k_base_d1, exp_hi));
    shl_96(f_base);               // f = 2 * k * exp
    f_base->d0 = f_base->d0 + 1;  // f = 2 * k * exp + 1
}

__global__ void __launch_bounds__(THREADS_PER_BLOCK, 2) mfaktc_95_gs(uint64 exp, int96 *k_base_dev, unsigned int *bit_array, unsigned int bits_to_process, int shiftcount, int192 *b_preinit_dev, unsigned int *RES)
{
    // Trial-factoring kernel (GPU-sieve variant).  Each block compacts its
    // slice of the sieve bit array into k offsets (k_deltas, in dynamic
    // shared memory), builds each surviving candidate f = 2*k*exp + 1 and
    // Fermat-tests it; found factors are accumulated in RES.
    int192 b_preinit;
    memcpy(&b_preinit, b_preinit_dev, 24);   // 24 bytes = sizeof(int192)
    unsigned int exp_hi, exp_low;
    exp_low = (unsigned int) (exp & 0xFFFFFFFF);
    exp_hi = (unsigned int) (exp >> 32);
    int96 f, f_base;
    int i, total_bit_count, k_delta;
    extern __shared__ unsigned short k_deltas[];
    create_k_deltas(bit_array, bits_to_process, &total_bit_count, k_deltas);
    create_fbase96(&f_base, k_base_dev, exp_hi, exp_low, bits_to_process);
    for (i = threadIdx.x; i < total_bit_count; i += THREADS_PER_BLOCK) {
        // f = f_base + 2 * NUM_CLASSES * k_deltas[i] * exp
        k_delta = 2 * NUM_CLASSES * k_deltas[i];
        f.d0 = __add_cc (f_base.d0, __umul32(k_delta, exp_low));
        f.d1 = __addc_cc(f_base.d1, __umul32hi(k_delta, exp_low));
        f.d2 = __addc   (f_base.d2, 0);
        f.d1 = __add_cc (     f.d1, __umul32(k_delta, exp_hi));
        f.d2 = __addc   (     f.d2, __umul32hi(k_delta, exp_hi));
        test_FC96_mfaktc_95(f, b_preinit, exp, RES, shiftcount);
    }
}
"""
# Compile the trial-factoring kernel source and resolve its entry point.
mod_tf = SourceModule(kernel_code_tf)
mfaktc_95_gs = mod_tf.get_function("mfaktc_95_gs")

def format_seconds(seconds):
    """Render a duration in seconds as 'MMmSSs' (zero-padded), prefixed with
    '<H>h' once the duration reaches a full hour."""
    hours, rest = divmod(seconds, 3600)
    minutes, secs = divmod(rest, 60)
    # ':2d' left-pads with spaces; swapping them for zeros gives 05m09s etc.
    text = f"{int(minutes):2d}m{int(secs):2d}s".replace(" ", "0")
    return f"{int(hours)}h{text}" if hours >= 1 else text

def class_needed(exp, k_min, c):
    """Return True unless every candidate f = 2*(k_min+c)*exp + 1 in class c
    is eliminated: f must be +/-1 (mod 8) and not divisible by 3, 5, 7 or 11."""
    k = k_min + c
    sieved_out = (
        (2 * (exp % 8) * (k % 8)) % 8 in (2, 4)
        or (2 * (exp % 3) * (k % 3)) % 3 == 2
        or (2 * (exp % 5) * (k % 5)) % 5 == 4
        or (2 * (exp % 7) * (k % 7)) % 7 == 6
        or (2 * (exp % 11) * (k % 11)) % 11 == 10
    )
    return not sieved_out

def tf_class(exponent, bit_min, k_min, k_max):
    """Trial-factor one residue class of M(exponent) over k in [k_min, k_max].

    Uses the module-level device buffers (d_RES, d_bitarray, d_sieve_info, ...)
    and kernels set up at import time.  Returns the number of factors found
    (the count the GPU left in h_RES[0]); factor limbs stay in h_RES for the
    caller to decode.
    """
    cuda.memset_d32(d_RES, 0, 1)   # clear the factor counter on the device
    # shiftcount = number of leading exponent bits folded into the starting
    # value b = 2^ln2b; grown until ln2b is at least 20 and at least bit_min.
    shiftcount = 0
    while (1<<shiftcount) < exponent:
        shiftcount += 1
    shiftcount -= 1
    ln2b = 1
    while ln2b<20 or ln2b<bit_min:
        shiftcount -= 1
        ln2b <<= 1
        if exponent&(1<<shiftcount):
            ln2b += 1
    # Store b = 2^ln2b as a 192-bit little-endian limb array.
    b_preinit = np.zeros(6, dtype=np.uint32)
    if ln2b < 32:
        b_preinit[0] = 1 << ln2b
    elif ln2b < 64:
        b_preinit[1] = 1 << (ln2b - 32)
    elif ln2b < 96:
        b_preinit[2] = 1 << (ln2b - 64)
    elif ln2b < 128:
        b_preinit[3] = 1 << (ln2b - 96)
    elif ln2b < 160:
        b_preinit[4] = 1 << (ln2b - 128)
    else:
        b_preinit[5] = 1 << (ln2b - 160)
    cuda.memcpy_htod(b_preinit_dev, b_preinit)
    # k_base = k_min as a 96-bit limb array (d2 always 0 here).
    k_base = np.zeros(3, dtype=np.uint32)
    k_base[0] = k_min & 0xFFFFFFFF
    k_base[1] = k_min >> 32
    k_base[2] = 0
    cuda.memcpy_htod(k_base_dev, k_base)
    CalcBitToClear(np.uint64(exponent), k_base_dev, d_calc_bit_to_clear_info, d_sieve_info, block=(threadsPerBlock, 1, 1), grid=(primes_per_thread+1, 1))
    cuda.Context.synchronize()
    # Sieve-then-test in chunks of gpu_sieve_size k values until k_max.
    while True:
        k_remaining = (k_max - k_min + NUM_CLASSES) // NUM_CLASSES;
        if k_remaining < gpu_sieve_size:
            # Last chunk: round the block count up to a full processing unit.
            numblocks = (k_remaining + gpu_sieve_processing_size - 1) // gpu_sieve_processing_size
            k_remaining = numblocks * gpu_sieve_processing_size
        else:
            numblocks = gpu_sieve_size // gpu_sieve_processing_size
        sieve_size = min(gpu_sieve_size, k_remaining)
        SegSieve(d_bitarray, d_sieve_info, np.uint32(primes_per_thread), block=(threadsPerBlock, 1, 1), grid=((sieve_size + block_size - 1) // block_size, 1))
        cuda.Context.synchronize()
        mfaktc_95_gs(np.uint64(exponent), k_base_dev, d_bitarray, np.uint32(gpu_sieve_processing_size), np.int32(shiftcount), b_preinit_dev, d_RES, block=(THREADS_PER_BLOCK, 1, 1), grid=(numblocks, 1), shared=shared_mem_required)
        cuda.Context.synchronize()
        k_min += gpu_sieve_size * NUM_CLASSES
        if k_min > k_max:
            break
        # Re-seed the sieve's bit-to-clear values for the next chunk.
        k_base[0] = k_min & 0xFFFFFFFF
        k_base[1] = k_min >> 32
        k_base[2] = 0
        cuda.memcpy_htod(k_base_dev, k_base)
        CalcBitToClear(np.uint64(exponent), k_base_dev, d_calc_bit_to_clear_info, d_sieve_info, block=(threadsPerBlock, 1, 1), grid=(primes_per_thread+1, 1))
        cuda.Context.synchronize()
    cuda.memcpy_dtoh(h_RES, d_RES)
    return h_RES[0]
        
def tf(aid, exponent, bit_min, bit_max):
    """Trial-factor M(exponent) from 2^bit_min to 2^bit_max over all needed
    residue classes, appending results to results.txt / results.json.txt.

    aid is the assignment id (or None).  Returns 0 on completion, -1 if a
    class raised an exception.
    """
    factorsfound = 0
    # k range from factor range: f = 2*k*exp + 1, so k ~ 2^(bits-1) / exp.
    k_min = (1 << (bit_min-1)) // exponent
    k_max = (1 << (bit_max-1)) // exponent
    k_min -= k_min % NUM_CLASSES   # align to a class boundary
    print(f"TF M{exponent} from 2^{bit_min} to 2^{bit_max}")
    print(f"k_min = {k_min}")
    print(f"k_max = {k_max}")
    print("class  Pct        ETA")
    start_time = time.time()
    allfactors = []
    for cur_class in range(NUM_CLASSES):
        if class_needed(exponent, k_min, cur_class):
            try:
                numfactors = tf_class(exponent, bit_min, k_min+cur_class, k_max)
            except Exception as e:
                print(f"Caught error: {e}")
                return -1
            # Progress line, overwritten in place via carriage return.
            print(f"{cur_class:4d}  {100*(cur_class+1)/NUM_CLASSES:.1f}% {format_seconds((time.time()-start_time)*(NUM_CLASSES/(cur_class+1)-1)):>11}", end="\r")
            if numfactors > 0:
                print()
                # The GPU stores at most 10 factors (3 limbs each) in h_RES.
                for i in range(min(10, numfactors)):
                    mfactor = (int(h_RES[3*i+1]) << 64) + (int(h_RES[3*i+2]) << 32) + int(h_RES[3*i+3])
                    print(f"M{exponent} has a factor: {mfactor}")
                    allfactors.append(str(mfactor))
                    with open('results.txt', 'a') as file:
                        file.write(f"UID: {V5UserID}/{ComputerID}, M{exponent} has a factor: {mfactor} [TF:{bit_min}:{bit_max}:mfaktp 0.04 95bit_mul64_gs]\n")
                if numfactors > 10:
                    print(f"M{exponent}: {numfactors-10} additional factors not shown")
                    with open('results.txt', 'a') as file:
                        file.write(f"UID: {V5UserID}/{ComputerID}, M{exponent}: {numfactors-10} additional factors not shown\n")
                print("class  Pct        ETA")
                factorsfound += numfactors
    if aid is not None:
        aid_log = '"aid":"' + aid + '", '
    else:
        aid_log = ''
    if factorsfound > 0:
        print(f"\nfound {factorsfound} factor{'s' if factorsfound > 1 else ''} for M{exponent} from 2^{bit_min:2d} to 2^{bit_max:2d}")
        with open('results.txt', 'a') as file:
            file.write(f"UID: {V5UserID}/{ComputerID}, found {factorsfound} factor{'s' if factorsfound > 1 else ''} for M{exponent} from 2^{bit_min:2d} to 2^{bit_max:2d} [mfaktp 0.04 95bit_mul64_gs]\n")
        with open('results.json.txt', 'a') as file:
            file.write(f'{{"exponent":{exponent}, "worktype":"TF", "status":"F", "bitlo":{bit_min}, "bithi":{bit_max}, "rangecomplete":true, "factors":{json.dumps(allfactors)}, "program":{{"name":"mfaktp", "version":"0.0.4", "subversion":"95bit_mul64_gs"}}, "timestamp":"{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}", {aid_log}"user":"{V5UserID}", "computer":"{ComputerID}", "os":{{"os": "{platform.system()}", "architecture": "{platform.machine()}"}}}}\n')
    else:
        print(f"\nno factor for M{exponent} from 2^{bit_min:2d} to 2^{bit_max:2d}")
        with open('results.txt', 'a') as file:
            file.write(f"UID: {V5UserID}/{ComputerID}, no factor for M{exponent} from 2^{bit_min:2d} to 2^{bit_max:2d} [mfaktp 0.04 95bit_mul64_gs]\n")
        with open('results.json.txt', 'a') as file:
            file.write(f'{{"exponent":{exponent}, "worktype":"TF", "status":"NF", "bitlo":{bit_min}, "bithi":{bit_max}, "rangecomplete":true, "program":{{"name":"mfaktp", "version":"0.0.4", "subversion":"95bit_mul64_gs"}}, "timestamp":"{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}", {aid_log}"user":"{V5UserID}", "computer":"{ComputerID}", "os":{{"os": "{platform.system()}", "architecture": "{platform.machine()}"}}}}\n')
    print(f"TF runtime: {format_seconds(time.time()-start_time)}\n")
    return 0


# ---- one-time host-side configuration ----
print("mfaktp v0.04\n")
gpu_sieve_size = GPUSieveSize * 1024 * 1024          # sieve chunk size in bits
gpu_sieve_primes = GPUSievePrimes
gpu_sieve_processing_size = GPUSieveProcessSize * 1024  # sieve bits per TF block
# Dynamic shared memory for the k_deltas array of mfaktc_95_gs: the fewer
# primes we sieve with, the more bits survive and need a slot.  The table
# values are presumably empirical (mirroring mfaktc's tuning) -- TODO confirm.
if gpu_sieve_primes < 54:
    shared_mem_required = 100
elif gpu_sieve_primes < 310:
    shared_mem_required = 50
elif gpu_sieve_primes < 1846:
    shared_mem_required = 38
elif gpu_sieve_primes < 21814:
    shared_mem_required = 30
elif gpu_sieve_primes < 67894:
    shared_mem_required = 24
else:
    shared_mem_required = 22
shared_mem_required = gpu_sieve_processing_size * shared_mem_required // 25;
total_time = time.time()
try:
    # Device buffers: RES holds a counter + up to 10 factors of 3 limbs each.
    h_RES = cuda.pagelocked_empty(32, dtype=np.uint32)
    d_RES = cuda.mem_alloc(128)
    k_base_dev = cuda.mem_alloc(12)      # one int96
    b_preinit_dev = cuda.mem_alloc(24)   # one int192
    cuda.Context.set_cache_config(cuda.func_cache.PREFER_SHARED)
    d_bitarray = cuda.mem_alloc(gpu_sieve_size // 8)
    # Round the prime count so the threads beyond the special-code primes
    # divide evenly into blocks of threadsPerBlock.
    gpu_sieve_primes = ((gpu_sieve_primes - primesNotSieved - primesHandledWithSpecialCode) // threadsPerBlock) * threadsPerBlock + primesNotSieved + primesHandledWithSpecialCode
    # Bump gpu_sieve_primes until primes_per_thread satisfies the alignment
    # constraints the sieve kernel's unrolled loops expect in each prime band.
    while True:
        primes_per_thread = (gpu_sieve_primes - primesNotSieved - primesHandledWithSpecialCode) // threadsPerBlock
        if primes_per_thread > 1:
            loop_count = min(primes_per_thread, sieving64KCrossover) - 1
            if (loop_count % 3) != 0:
                gpu_sieve_primes += threadsPerBlock
                continue
        if primes_per_thread == sieving64KCrossover + 1:
            gpu_sieve_primes += threadsPerBlock
            continue
        if primes_per_thread > sieving64KCrossover + 1:
            loop_count = min(primes_per_thread, sieving128KCrossover + 1) - (sieving64KCrossover + 1)
            if (loop_count % 3) != 1:
                gpu_sieve_primes += threadsPerBlock
                continue
        if primes_per_thread > sieving128KCrossover + 1:
            loop_count = min(primes_per_thread, sieving1MCrossover) - (sieving128KCrossover + 1)
            if (loop_count % 4) != 1:
                gpu_sieve_primes += threadsPerBlock
                continue
        loop_count = primes_per_thread - sieving1MCrossover  # NOTE: redundant; recomputed just below
        if primes_per_thread > sieving1MCrossover:
            loop_count = primes_per_thread - sieving1MCrossover
            if (loop_count % 4) != 1:
                gpu_sieve_primes += threadsPerBlock
                continue
        break
    # First gpu_sieve_primes primes: prime(n) is the nth prime (sympy), so
    # primerange(prime(n) + 1) yields exactly n primes.
    gpu_sieve_min_exp = prime(gpu_sieve_primes) + 1
    primes = list(primerange(gpu_sieve_min_exp))
    # pinfo: per-prime sieving data; rowinfo: CalcBitToClear row table
    # (4 arrays of MAX_PRIMES_PER_THREAD ints) followed by (prime, modinv) pairs.
    pinfo = np.zeros(gpu_sieve_primes * 3, dtype=np.uint32)
    rowinfo_size = MAX_PRIMES_PER_THREAD * 16 + gpu_sieve_primes * 8
    rowinfo = np.zeros(rowinfo_size // 4, dtype=np.uint32)
    # Build the per-prime sieve info (pinfo) and the CalcBitToClear row table
    # (rowinfo).  Each row records: byte offset into pinfo, first prime index,
    # index stride, and the mask used to merge bit-to-clear into packed words.
    i = primesNotSieved + primesHandledWithSpecialCode
    p = 256                # pinfo write cursor (in 32-bit words)
    row = 0
    # Band 1: primes below 64K -- prime packed in the top 16 bits of a word.
    loop_end = min(primes_per_thread, sieving64KCrossover)
    while i < primesNotSieved + primesHandledWithSpecialCode + loop_end * threadsPerBlock:
        rowinfo[row] = 4*p
        rowinfo[row+MAX_PRIMES_PER_THREAD] = i
        rowinfo[row+MAX_PRIMES_PER_THREAD*2] = 1
        rowinfo[row+MAX_PRIMES_PER_THREAD*3] = 0xFFFF0000
        row += 1
        for j in range(threadsPerBlock):
            pinfo[p+j] = primes[i+j] << 16
            pinfo[p+j+threadsPerBlock] = 0xFFFFFFFF // primes[i+j] + 1  # ceil(2^32/p)
        i += threadsPerBlock
        p += threadsPerBlock * 2
    # Band 2 header row: first primes above 64K, stored as full words.
    loop_end = min(primes_per_thread, sieving64KCrossover + 1)
    while i < primesNotSieved + primesHandledWithSpecialCode + loop_end * threadsPerBlock:
        rowinfo[row] = 4*p
        rowinfo[row+MAX_PRIMES_PER_THREAD] = i
        rowinfo[row+MAX_PRIMES_PER_THREAD*2] = 1
        rowinfo[row+MAX_PRIMES_PER_THREAD*3] = 0
        row += 1
        for j in range(threadsPerBlock):
            pinfo[p+j] = 0
            pinfo[p+j+threadsPerBlock] = 0xFFFFFFFF // primes[i+j] + 1
            pinfo[p+j+threadsPerBlock*2] = primes[i+j];
        i += threadsPerBlock
        p += threadsPerBlock * 3
    # Band 2 (64K..128K): base row with full prime/pinv, then delta rows.
    if primes_per_thread > sieving64KCrossover + 1:
        loop_count = min(primes_per_thread, sieving128KCrossover + 1) - (sieving64KCrossover + 1)
        rowinfo[row] = 4*p
        rowinfo[row+MAX_PRIMES_PER_THREAD] = i
        rowinfo[row+MAX_PRIMES_PER_THREAD*2] = loop_count
        rowinfo[row+MAX_PRIMES_PER_THREAD*3] = 0
        row += 1
        for j in range(threadsPerBlock):
            pinfo[p+j] = 0
            pinfo[p+j+threadsPerBlock] = 0xFFFFFFFF // primes[i+j*loop_count] + 1
            pinfo[p+j+threadsPerBlock*2] = primes[i+j*loop_count];
        p += threadsPerBlock * 3
    if primes_per_thread > sieving64KCrossover + 2:
        # Delta rows: pack (pinv delta << 25) + (prime delta/2 << 18), in
        # groups of 3 deltas back to the group's base prime.
        for k in range(1, loop_count):
            rowinfo[row] = 4*p + (k - 1) * threadsPerBlock * 4
            rowinfo[row+MAX_PRIMES_PER_THREAD] = i + k
            rowinfo[row+MAX_PRIMES_PER_THREAD*2] = loop_count
            rowinfo[row+MAX_PRIMES_PER_THREAD*3] = 0xFFFC0000
            row += 1
        for k in range(1, loop_count, 3):
            for j in range(threadsPerBlock):
                index = i + j * loop_count + k
                pdiff = (primes[index] - primes[index-1]) // 2
                pinvdiff = 0xFFFFFFFF // primes[index-1] - 0xFFFFFFFF // primes[index]
                pinfo[p + (k - 1) * threadsPerBlock + j] = (pinvdiff << 25) + (pdiff << 18)
                index += 1
                pdiff = (primes[index] - primes[index-2]) // 2
                pinvdiff = 0xFFFFFFFF // primes[index-2] - 0xFFFFFFFF // primes[index]
                pinfo[p + k * threadsPerBlock + j] = (pinvdiff << 25) + (pdiff << 18)
                index += 1
                pdiff = (primes[index] - primes[index-3]) // 2
                pinvdiff = 0xFFFFFFFF // primes[index-3] - 0xFFFFFFFF // primes[index]
                pinfo[p + (k + 1) * threadsPerBlock + j] = (pinvdiff << 25) + (pdiff << 18)
        p += (loop_count - 1) * threadsPerBlock
        i += loop_count * threadsPerBlock
    # Band 3 (128K..1M): same base-plus-deltas layout, but pinv here is a
    # rounded-down reciprocal and deltas come in groups of 4.
    if primes_per_thread > sieving128KCrossover + 1:
        loop_count = min(primes_per_thread, sieving1MCrossover) - (sieving128KCrossover + 1)
        rowinfo[row] = 4*p
        rowinfo[row+MAX_PRIMES_PER_THREAD] = i
        rowinfo[row+MAX_PRIMES_PER_THREAD*2] = loop_count
        rowinfo[row+MAX_PRIMES_PER_THREAD*3] = 0
        row += 1
        for j in range(threadsPerBlock):
            pinfo[p+j] = 0
            pinfo[p+j+threadsPerBlock] = int(4294967296 / primes[i+j*loop_count] - 0.5)
            pinfo[p+j+threadsPerBlock*2] = primes[i+j*loop_count];
        p += threadsPerBlock * 3
    if primes_per_thread > sieving128KCrossover + 2:
        # Delta rows: (pinv delta << 27) + (prime delta/2 << 20).
        for k in range(1, loop_count):
            rowinfo[row] = 4*p + (k - 1) * threadsPerBlock * 4
            rowinfo[row+MAX_PRIMES_PER_THREAD] = i + k
            rowinfo[row+MAX_PRIMES_PER_THREAD*2] = loop_count
            rowinfo[row+MAX_PRIMES_PER_THREAD*3] = 0xFFF00000
            row += 1
        for k in range(1, loop_count, 4):
            for j in range(threadsPerBlock):
                index = i + j * loop_count + k
                pdiff = (primes[index] - primes[index-1]) // 2
                pinvdiff = int(4294967296/primes[index-1]-0.5) - int(4294967296 / primes[index] - 0.5)
                pinfo[p + (k - 1) * threadsPerBlock + j] = (pinvdiff << 27) + (pdiff << 20)
                index += 1
                pdiff = (primes[index] - primes[index-2]) // 2
                pinvdiff = int(4294967296/primes[index-2]-0.5) - int(4294967296 / primes[index] - 0.5)
                pinfo[p + k * threadsPerBlock + j] = (pinvdiff << 27) + (pdiff << 20)
                index += 1
                pdiff = (primes[index] - primes[index-3]) // 2
                pinvdiff = int(4294967296/primes[index-3]-0.5) - int(4294967296 / primes[index] - 0.5)
                pinfo[p + (k + 1) * threadsPerBlock + j] = (pinvdiff << 27) + (pdiff << 20)
                index += 1
                pdiff = (primes[index] - primes[index-4]) // 2
                pinvdiff = int(4294967296/primes[index-4]-0.5) - int(4294967296 / primes[index] - 0.5)
                pinfo[p + (k + 2) * threadsPerBlock + j] = (pinvdiff << 27) + (pdiff << 20)
        p += (loop_count - 1) * threadsPerBlock
        i += loop_count * threadsPerBlock
    # Band 4 (above 1M): same layout with wider packed fields.
    if primes_per_thread > sieving1MCrossover:
        loop_count = primes_per_thread - sieving1MCrossover
        rowinfo[row] = 4*p
        rowinfo[row+MAX_PRIMES_PER_THREAD] = i
        rowinfo[row+MAX_PRIMES_PER_THREAD*2] = loop_count
        rowinfo[row+MAX_PRIMES_PER_THREAD*3] = 0
        row += 1
        for j in range(threadsPerBlock):
            pinfo[p+j] = 0
            pinfo[p+j+threadsPerBlock] = int(4294967296 / primes[i+j*loop_count] - 0.5)
            pinfo[p+j+threadsPerBlock*2] = primes[i+j*loop_count];
        p += threadsPerBlock * 3
    if primes_per_thread > sieving1MCrossover + 1:
        # Delta rows: (pinv delta << 31) + (prime delta/2 << 24).
        for k in range(1, loop_count):
            rowinfo[row] = 4*p + (k - 1) * threadsPerBlock * 4
            rowinfo[row+MAX_PRIMES_PER_THREAD] = i + k
            rowinfo[row+MAX_PRIMES_PER_THREAD*2] = loop_count
            rowinfo[row+MAX_PRIMES_PER_THREAD*3] = 0xFF000000
            row += 1
        for k in range(1, loop_count, 4):
            for j in range(threadsPerBlock):
                index = i + j * loop_count + k
                pdiff = (primes[index] - primes[index-1]) // 2
                pinvdiff = int(4294967296/primes[index-1]-0.5) - int(4294967296 / primes[index] - 0.5)
                pinfo[p + (k - 1) * threadsPerBlock + j] = (pinvdiff << 31) + (pdiff << 24)
                index += 1
                pdiff = (primes[index] - primes[index-2]) // 2
                pinvdiff = int(4294967296/primes[index-2]-0.5) - int(4294967296 / primes[index] - 0.5)
                pinfo[p + k * threadsPerBlock + j] = (pinvdiff << 31) + (pdiff << 24)
                index += 1
                pdiff = (primes[index] - primes[index-3]) // 2
                pinvdiff = int(4294967296/primes[index-3]-0.5) - int(4294967296 / primes[index] - 0.5)
                pinfo[p + (k + 1) * threadsPerBlock + j] = (pinvdiff << 31) + (pdiff << 24)
                index += 1
                pdiff = (primes[index] - primes[index-4]) // 2
                pinvdiff = int(4294967296/primes[index-4]-0.5) - int(4294967296 / primes[index] - 0.5)
                pinfo[p + (k + 2) * threadsPerBlock + j] = (pinvdiff << 31) + (pdiff << 24)
        p += (loop_count - 1) * threadsPerBlock
        i += loop_count * threadsPerBlock
    # Append the raw primes (every sieved prime except the first few handled
    # specially) after the 4 rowinfo columns, at even offsets; the odd slots
    # are presumably filled on-device by CalcModularInverses — TODO confirm.
    # Note: this `for i` deliberately reuses/overwrites the earlier index i.
    for i in range(primesNotSieved, gpu_sieve_primes):
        rowinfo[MAX_PRIMES_PER_THREAD*4 + 2*i] = primes[i]
    # p counts 32-bit entries, so the device buffer needs 4*p bytes.
    pinfo_size = 4*p
    d_sieve_info = cuda.mem_alloc(pinfo_size)
    # Only the first p entries were populated; copy just that prefix.
    cuda.memcpy_htod(d_sieve_info, pinfo[:p])
    # rowinfo_size is computed above this chunk (not visible here).
    d_calc_bit_to_clear_info = cuda.mem_alloc(rowinfo_size)
    cuda.memcpy_htod(d_calc_bit_to_clear_info, rowinfo)
    # Parse and run each trial-factoring assignment pasted into `Assignments`.
    # Supported mfaktc worktodo formats:
    #   Factor=<exponent>,<bit_min>,<bit_max>
    #   Factor=<AID>,<exponent>,<bit_min>,<bit_max>
    # Malformed or out-of-range lines are reported and skipped; the loop
    # stops early if tf() returns non-zero (e.g. a factor was found / abort).
    TF_assignments = Assignments.splitlines()
    for Assignment in TF_assignments:
        # Tolerate stray whitespace / trailing CR from Windows-style pastes.
        Assignment = Assignment.strip()
        if Assignment.startswith("Factor="):
            TF_assignment = Assignment[7:].split(',')
            number_of_parts = len(TF_assignment)
            try:
                if number_of_parts == 3:
                    # No assignment ID (AID) supplied.
                    aid = None
                    exponent = int(TF_assignment[0])
                    bit_min = int(TF_assignment[1])
                    bit_max = int(TF_assignment[2])
                elif number_of_parts == 4:
                    aid = TF_assignment[0]
                    exponent = int(TF_assignment[1])
                    bit_min = int(TF_assignment[2])
                    bit_max = int(TF_assignment[3])
                else:
                    print(f'WARNING: ignoring line "{Assignment}"!')
                    continue
            except ValueError:
                # int() failed on a non-numeric field.  (Was a bare `except:`,
                # which also swallowed KeyboardInterrupt/SystemExit.)
                print(f'WARNING: ignoring line "{Assignment}"!')
                continue
            # Sanity limits: prime exponent within the supported range and a
            # strictly increasing bit range no deeper than 2^95.
            if 100000 < exponent < 551314208619 and isprime(exponent) and 1 <= bit_min < bit_max <= 95:
                # Precompute per-prime bit-to-clear modular data for this
                # exponent on the GPU before trial factoring.
                CalcModularInverses(np.uint64(exponent), d_calc_bit_to_clear_info, block=(threadsPerBlock, 1, 1), grid=(primes_per_thread+1, 1))
                cuda.Context.synchronize()
                tmp = tf(aid, exponent, bit_min, bit_max)
                if tmp != 0:
                    break
            else:
                print(f'WARNING: ignoring line "{Assignment}"!')
except Exception as e:
    print(f"Caught error: {e}")
print(f"total time spent: {time.time()-total_time} s")
