*** _/tf_f0_31.h	2012-09-23 00:42:46.000000000 -0700
--- tf_f0_31.h	2012-09-25 00:42:56.259187546 -0700
***************
*** 206,211 ****
--- 206,212 ----
      // Could write optimized div_192_96 with so many tmp192 elements known to be zero
      div_192_96(&u,tmp192,f,ff);				// u = floor(2^(95 + bits_in_f) / f), giving 96 bits of precision
  
+ #if 0
  							// b_preinit = 2^128
  							// a = b_preinit / 2 ^ (bits_in_f - 1)
  							// tmp192 = a * u = (b_preinit / 2 ^ (bits_in_f - 1)) * (2 ^ (95 + bits_in_f) / f)     (ignore the floor functions for now)
***************
*** 218,225 ****
      a.d0 = __sub_cc (0, tmp96.d0);			// Compute the remainder
      a.d1 = __subc_cc(0, tmp96.d1);			// we do not need the upper digits of b_preinit and tmp96 because the result is 0 after subtraction!
      a.d2 = __subc   (0, tmp96.d2);
  
!     for (shifter = 0; shifter < exp - 2 - 7; shifter++)
      {
  							// On input a is at most 91.807 bits (see end of this loop)
  
--- 219,229 ----
      a.d0 = __sub_cc (0, tmp96.d0);			// Compute the remainder
      a.d1 = __subc_cc(0, tmp96.d1);			// we do not need the upper digits of b_preinit and tmp96 because the result is 0 after subtraction!
      a.d2 = __subc   (0, tmp96.d2);
+ #endif
+ 
+     BASE_preinit;
  
!     for (shifter = 0; shifter < exp - 1 - BASE_n1; shifter++)
      {
  							// On input a is at most 91.807 bits (see end of this loop)
  
***************
*** 446,451 ****
--- 450,456 ----
      // Could write optimized div_224_96 with so many tmp224 elements known to be zero
      div_224_96(&u,tmp224,f,ff);				// u = floor(2^208 / f).  This requires f >= 81 bits.
  
+ #if 0
  							// b_preinit = 2^128
  							// a = b_preinit / 2^80 = 2^48
  							// tmp256 = a * u = (b_preinit / 2^80) * (2^208 / f)     (ignore the floor functions for now)
***************
*** 458,465 ****
      a.d1 = __subc_cc(0, tmp128.d1);			// we do not need the upper digits of b_preinit and tmp192 because the result is 0 after subtraction!
      a.d2 = __subc_cc(0, tmp128.d2);
      a.d3 = __subc   (0, tmp128.d3);
  
!     for (shifter = 0; shifter < exp - 2 - 7; shifter++)
      {
  							// On input a is at most 99.17 bits (see end of this loop)
  
--- 463,473 ----
      a.d1 = __subc_cc(0, tmp128.d1);			// we do not need the upper digits of b_preinit and tmp192 because the result is 0 after subtraction!
      a.d2 = __subc_cc(0, tmp128.d2);
      a.d3 = __subc   (0, tmp128.d3);
+ #endif
+     
+     a.d3 = 0; BASE_preinit;
  
!     for (shifter = 0; shifter < exp - 1 - BASE_n1; shifter++)
      {
  							// On input a is at most 99.17 bits (see end of this loop)
  
*** _/tf_f128_159.h	2012-09-09 09:46:09.000000000 -0700
--- tf_f128_159.h	2012-09-25 00:23:51.199250328 -0700
***************
*** 206,211 ****
--- 206,212 ----
      // Could write optimized div_352_192 with so many tmp352 elements known to be zero
      div_352_192(&u,tmp352,f,ff);			// u = floor(2^352 / f).  This requires f >= 161 bits.
  
+ #if 0
  							// b_preinit = 2^256
  							// a = b_preinit / 2^160 = 2^96
  							// tmp352 = a * u = (b_preinit / 2^160) * (2^352 / f)     (ignore the floor functions for now)
***************
*** 221,228 ****
      a.d3 = __subc_cc(0, tmp192.d3);
      a.d4 = __subc_cc(0, tmp192.d4);
      a.d5 = __subc   (0, tmp192.d5);
  
!     for (shifter = 0; shifter < exp - 2 - 8; shifter++)
      {
  							// On input a is at most 175.700 bits (see end of this loop)
  
--- 222,231 ----
      a.d3 = __subc_cc(0, tmp192.d3);
      a.d4 = __subc_cc(0, tmp192.d4);
      a.d5 = __subc   (0, tmp192.d5);
+ #endif
+     a.d5 = 0; BASE_preinit2;
  
!     for (shifter = 0; shifter < exp - 1 - BASE_n2; shifter++)
      {
  							// On input a is at most 175.700 bits (see end of this loop)
  
***************
*** 461,466 ****
--- 464,470 ----
      // Could write optimized div_384_192 with so many tmp384 elements known to be zero
      div_384_192(&u,tmp384,f,ff);			// u = floor(2^(191 + bits_in_f) / f), giving 192 bits of precision, requires f >= 161 bits
  
+ #if 0
  							// b_preinit = 2^256
  							// a = b_preinit / 2 ^ (bits_in_f - 1)
  							// tmp384 = a * u = (b_preinit / 2 ^ (bits_in_f - 1)) * (2 ^ (191 + bits_in_f) / f)     (ignore the floor functions for now)
***************
*** 477,484 ****
      a.d3 = __subc_cc(0, tmp192.d3);
      a.d4 = __subc_cc(0, tmp192.d4);
      a.d5 = __subc   (0, tmp192.d5);
  
!     for (shifter = 0; shifter < exp - 2 - 8; shifter++)
      {
  							// On input a is at most 186.700 bits (see end of this loop)
  
--- 481,490 ----
      a.d3 = __subc_cc(0, tmp192.d3);
      a.d4 = __subc_cc(0, tmp192.d4);
      a.d5 = __subc   (0, tmp192.d5);
+ #endif
+     a.d5 = 0; BASE_preinit2;
  
!     for (shifter = 0; shifter < exp - 1 - BASE_n2; shifter++)
      {
  							// On input a is at most 186.700 bits (see end of this loop)
  
***************
*** 718,723 ****
--- 724,730 ----
      // Could write optimized div_384_192 with so many tmp384 elements known to be zero
      div_384_192(&u,tmp384,f,ff);			// u = floor(2^(191 + bits_in_f) / f), giving 192 bits of precision, requires f >= 161 bits
  
+ #if 0
  							// b_preinit = 2^256
  							// a = b_preinit / 2 ^ (bits_in_f - 1)
  							// tmp384 = a * u = (b_preinit / 2 ^ (bits_in_f - 1)) * (2 ^ (191 + bits_in_f) / f)     (ignore the floor functions for now)
***************
*** 734,741 ****
      a.d3 = __subc_cc(0, tmp192.d3);
      a.d4 = __subc_cc(0, tmp192.d4);
      a.d5 = __subc   (0, tmp192.d5);
  
!     for (shifter = 0; shifter < exp - 2 - 8; shifter++)
      {
  							// On input a is at most 185.858 bits (see end of this loop)
  
--- 741,750 ----
      a.d3 = __subc_cc(0, tmp192.d3);
      a.d4 = __subc_cc(0, tmp192.d4);
      a.d5 = __subc   (0, tmp192.d5);
+ #endif
+     a.d5 = 0; BASE_preinit2;
  
!     for (shifter = 0; shifter < exp - 1 - BASE_n2; shifter++)
      {
  							// On input a is at most 185.858 bits (see end of this loop)
  
***************
*** 984,989 ****
--- 993,999 ----
      // Could write optimized div_384_192 with so many tmp384 elements known to be zero
      div_384_192(&u,tmp384,f,ff);			// u = floor(2^(191 + bits_in_f) / f), giving 192 bits of precision, requires f >= 161 bits
  
+ #if 0
  							// b_preinit = 2^256
  							// a = b_preinit / 2 ^ (bits_in_f - 1)
  							// tmp384 = a * u = (b_preinit / 2 ^ (bits_in_f - 1)) * (2 ^ (191 + bits_in_f) / f)     (ignore the floor functions for now)
***************
*** 1000,1007 ****
      a.d3 = __subc_cc(0, tmp192.d3);
      a.d4 = __subc_cc(0, tmp192.d4);
      a.d5 = __subc   (0, tmp192.d5);
  
!     for (shifter = 0; shifter < exp - 2 - 8; shifter++)
      {
  							// On input a is at most 188 bits (see end of this loop)
  
--- 1010,1019 ----
      a.d3 = __subc_cc(0, tmp192.d3);
      a.d4 = __subc_cc(0, tmp192.d4);
      a.d5 = __subc   (0, tmp192.d5);
+ #endif
+     a.d5 = 0; BASE_preinit2;
  
!     for (shifter = 0; shifter < exp - 1 - BASE_n2; shifter++)
      {
  							// On input a is at most 188 bits (see end of this loop)
  
*** _/tf_f32_63.h	2012-09-23 00:43:01.000000000 -0700
--- tf_f32_63.h	2012-09-25 00:43:32.182185576 -0700
***************
*** 313,318 ****
--- 313,319 ----
      // Could write optimized div_192_96 with so many tmp192 elements known to be zero
      div_192_96(&u,tmp192,f,ff);				// u = floor(2^(95 + bits_in_f) / f), giving 96 bits of precision
  
+ #if 0
  							// b_preinit = 2^128
  							// a = b_preinit / 2 ^ (bits_in_f - 1)
  							// tmp192 = a * u = (b_preinit / 2 ^ (bits_in_f - 1)) * (2 ^ (95 + bits_in_f) / f)     (ignore the floor functions for now)
***************
*** 325,332 ****
      a.d0 = __sub_cc (0, tmp96.d0);			// Compute the remainder
      a.d1 = __subc_cc(0, tmp96.d1);			// we do not need the upper digits of b_preinit and tmp96 because the result is 0 after subtraction!
      a.d2 = __subc   (0, tmp96.d2);
  
!     for (shifter = 0; shifter < exp - 2 - 7; shifter++)
      {
  							// On input a is at most 91.807 bits (see end of this loop)
  
--- 326,335 ----
      a.d0 = __sub_cc (0, tmp96.d0);			// Compute the remainder
      a.d1 = __subc_cc(0, tmp96.d1);			// we do not need the upper digits of b_preinit and tmp96 because the result is 0 after subtraction!
      a.d2 = __subc   (0, tmp96.d2);
+ #endif
+     BASE_preinit;
  
!     for (shifter = 0; shifter < exp - 1 - BASE_n1; shifter++)
      {
  							// On input a is at most 91.807 bits (see end of this loop)
  
***************
*** 553,558 ****
--- 556,562 ----
      // Could write optimized div_224_96 with so many tmp224 elements known to be zero
      div_224_96(&u,tmp224,f,ff);				// u = floor(2^208 / f).  This requires f >= 81 bits.
  
+ #if 0
  							// b_preinit = 2^128
  							// a = b_preinit / 2^80 = 2^48
  							// tmp256 = a * u = (b_preinit / 2^80) * (2^208 / f)     (ignore the floor functions for now)
***************
*** 565,572 ****
      a.d1 = __subc_cc(0, tmp128.d1);			// we do not need the upper digits of b_preinit and tmp192 because the result is 0 after subtraction!
      a.d2 = __subc_cc(0, tmp128.d2);
      a.d3 = __subc   (0, tmp128.d3);
  
!     for (shifter = 0; shifter < exp - 2 - 7; shifter++)
      {
  							// On input a is at most 99.17 bits (see end of this loop)
  
--- 569,578 ----
      a.d1 = __subc_cc(0, tmp128.d1);			// we do not need the upper digits of b_preinit and tmp192 because the result is 0 after subtraction!
      a.d2 = __subc_cc(0, tmp128.d2);
      a.d3 = __subc   (0, tmp128.d3);
+ #endif
+     a.d3 = 0; BASE_preinit;
  
!     for (shifter = 0; shifter < exp - 1 - BASE_n1; shifter++)
      {
  							// On input a is at most 99.17 bits (see end of this loop)
  
***************
*** 804,809 ****
--- 810,816 ----
      // Could write optimized div_224_128 with so many tmp224 elements known to be zero
      div_224_128(&u,tmp224,f,ff);			// u = floor(2^224 / f).  This requires f >= 97 bits.
  
+ #if 0
  							// b_preinit = 2^128
  							// a = b_preinit / 2^96 = 2^32
  							// tmp256 = a * u = (b_preinit / 2^96) * (2^224 / f)     (ignore the floor functions for now)
***************
*** 815,822 ****
      a.d1 = __subc_cc(0, tmp128.d1);			// we do not need the upper digits of b_preinit and tmp128 because the result is 0 after subtraction!
      a.d2 = __subc_cc(0, tmp128.d2);
      a.d3 = __subc   (0, tmp128.d3);
  
!     for (shifter = 0; shifter < exp - 2 - 7; shifter++)
      {
  							// On input a is at most 111.17 bits (see end of this loop)
  
--- 822,831 ----
      a.d1 = __subc_cc(0, tmp128.d1);			// we do not need the upper digits of b_preinit and tmp128 because the result is 0 after subtraction!
      a.d2 = __subc_cc(0, tmp128.d2);
      a.d3 = __subc   (0, tmp128.d3);
+ #endif
+     a.d3 = 0; BASE_preinit; 
  
!     for (shifter = 0; shifter < exp - 1 - BASE_n1; shifter++)
      {
  							// On input a is at most 111.17 bits (see end of this loop)
  
***************
*** 1048,1053 ****
--- 1057,1063 ----
      // Could write optimized div_256_128 with so many tmp256 elements known to be zero
      div_256_128(&u,tmp256,f,ff);			// u = floor(2^(127 + bits_in_f) / f), giving 128 bits of precision
  
+ #if 0
  							// b_preinit = 2^128
  							// a = b_preinit / 2 ^ (bits_in_f - 1)
  							// tmp256 = a * u = (b_preinit / 2 ^ (bits_in_f - 1)) * (2 ^ (127 + bits_in_f) / f)     (ignore the floor functions for now)
***************
*** 1060,1067 ****
      a.d1 = __subc_cc(0, tmp128.d1);			// we do not need the upper digits of b_preinit and tmp128 because the result is 0 after subtraction!
      a.d2 = __subc_cc(0, tmp128.d2);
      a.d3 = __subc   (0, tmp128.d3);
  
!     for (shifter = 0; shifter < exp - 2 - 7; shifter++)
      {
  							// On input a is at most 123.17 bits (see end of this loop)
  
--- 1070,1079 ----
      a.d1 = __subc_cc(0, tmp128.d1);			// we do not need the upper digits of b_preinit and tmp128 because the result is 0 after subtraction!
      a.d2 = __subc_cc(0, tmp128.d2);
      a.d3 = __subc   (0, tmp128.d3);
+ #endif
+     a.d3 = 0; BASE_preinit; 
  
!     for (shifter = 0; shifter < exp - 1 - BASE_n1; shifter++)
      {
  							// On input a is at most 123.17 bits (see end of this loop)
  
***************
*** 1293,1298 ****
--- 1305,1311 ----
      // Could write optimized div_288_128 with so many tmp288 elements known to be zero
      div_288_128(&u,tmp288,f,ff);			// u = floor(2^272 / f).  This requires f >= 113 bits.
  
+ #if 0
  							// b_preinit = 2^256
  							// a = b_preinit / 2^112 = 2^144
  							// tmp320 = a * u = (b_preinit / 2^112) * (2^272 / f)     (ignore the floor functions for now)
***************
*** 1309,1316 ****
      a.d2 = __subc_cc(0, tmp160.d2);
      a.d3 = __subc_cc(0, tmp160.d3);
      a.d4 = __subc   (0, tmp160.d4);
  
!     for (shifter = 0; shifter < exp - 2 - 8; shifter++)
      {
  							// On input a is at most 131.459 bits (see end of this loop)
  
--- 1322,1331 ----
      a.d2 = __subc_cc(0, tmp160.d2);
      a.d3 = __subc_cc(0, tmp160.d3);
      a.d4 = __subc   (0, tmp160.d4);
+ #endif
+     a.d4 = a.d3 = 0; BASE_preinit;
  
!     for (shifter = 0; shifter < exp - 1 - BASE_n1; shifter++)
      {
  							// On input a is at most 131.459 bits (see end of this loop)
  
*** _/tf_f64_95.h	2012-09-09 09:45:19.000000000 -0700
--- tf_f64_95.h	2012-09-25 00:24:10.345249278 -0700
***************
*** 340,345 ****
--- 340,346 ----
      // Could write optimized div_224_128 with so many tmp224 elements known to be zero
      div_224_128(&u,tmp224,f,ff);			// u = floor(2^224 / f).  This requires f >= 97 bits.
  
+ #if 0
  							// b_preinit = 2^128
  							// a = b_preinit / 2^96 = 2^32
  							// tmp256 = a * u = (b_preinit / 2^96) * (2^224 / f)     (ignore the floor functions for now)
***************
*** 351,358 ****
      a.d1 = __subc_cc(0, tmp128.d1);			// we do not need the upper digits of b_preinit and tmp128 because the result is 0 after subtraction!
      a.d2 = __subc_cc(0, tmp128.d2);
      a.d3 = __subc   (0, tmp128.d3);
  
!     for (shifter = 0; shifter < exp - 2 - 7; shifter++)
      {
  							// On input a is at most 111.17 bits (see end of this loop)
  
--- 352,361 ----
      a.d1 = __subc_cc(0, tmp128.d1);			// we do not need the upper digits of b_preinit and tmp128 because the result is 0 after subtraction!
      a.d2 = __subc_cc(0, tmp128.d2);
      a.d3 = __subc   (0, tmp128.d3);
+ #endif
+     a.d3 = 0; BASE_preinit;
  
!     for (shifter = 0; shifter < exp - 1 - BASE_n1; shifter++)
      {
  							// On input a is at most 111.17 bits (see end of this loop)
  
***************
*** 585,590 ****
--- 588,594 ----
      // Could write optimized div_256_128 with so many tmp256 elements known to be zero
      div_256_128(&u,tmp256,f,ff);			// u = floor(2^(127 + bits_in_f) / f), giving 128 bits of precision
  
+ #if 0
  							// b_preinit = 2^128
  							// a = b_preinit / 2 ^ (bits_in_f - 1)
  							// tmp256 = a * u = (b_preinit / 2 ^ (bits_in_f - 1)) * (2 ^ (127 + bits_in_f) / f)     (ignore the floor functions for now)
***************
*** 597,604 ****
      a.d1 = __subc_cc(0, tmp128.d1);			// we do not need the upper digits of b_preinit and tmp128 because the result is 0 after subtraction!
      a.d2 = __subc_cc(0, tmp128.d2);
      a.d3 = __subc   (0, tmp128.d3);
  
!     for (shifter = 0; shifter < exp - 2 - 7; shifter++)
      {
  							// On input a is at most 123.17 bits (see end of this loop)
  
--- 601,610 ----
      a.d1 = __subc_cc(0, tmp128.d1);			// we do not need the upper digits of b_preinit and tmp128 because the result is 0 after subtraction!
      a.d2 = __subc_cc(0, tmp128.d2);
      a.d3 = __subc   (0, tmp128.d3);
+ #endif
+     a.d3 = 0; BASE_preinit;
  
!     for (shifter = 0; shifter < exp - 1 - BASE_n1; shifter++)
      {
  							// On input a is at most 123.17 bits (see end of this loop)
  
***************
*** 831,836 ****
--- 837,843 ----
      // Could write optimized div_288_128 with so many tmp288 elements known to be zero
      div_288_128(&u,tmp288,f,ff);			// u = floor(2^272 / f).  This requires f >= 113 bits.
  
+ #if 0
  							// b_preinit = 2^256
  							// a = b_preinit / 2^112 = 2^144
  							// tmp320 = a * u = (b_preinit / 2^112) * (2^272 / f)     (ignore the floor functions for now)
***************
*** 847,854 ****
      a.d2 = __subc_cc(0, tmp160.d2);
      a.d3 = __subc_cc(0, tmp160.d3);
      a.d4 = __subc   (0, tmp160.d4);
  
!     for (shifter = 0; shifter < exp - 2 - 8; shifter++)
      {
  							// On input a is at most 131.459 bits (see end of this loop)
  
--- 854,863 ----
      a.d2 = __subc_cc(0, tmp160.d2);
      a.d3 = __subc_cc(0, tmp160.d3);
      a.d4 = __subc   (0, tmp160.d4);
+ #endif
+     a.d4 = a.d3 = 0; BASE_preinit;
  
!     for (shifter = 0; shifter < exp - 1 - BASE_n1; shifter++)
      {
  							// On input a is at most 131.459 bits (see end of this loop)
  
***************
*** 1089,1094 ****
--- 1098,1104 ----
      // Could write optimized div_288_160 with so many tmp288 elements known to be zero
      div_288_160(&u,tmp288,f,ff);			// u = floor(2^288 / f).  This requires f >= 129 bits.
  
+ #if 0
  							// b_preinit = 2^256
  							// a = b_preinit / 2^128 = 2^144
  							// tmp320 = a * u = (b_preinit / 2^128) * (2^288 / f)     (ignore the floor functions for now)
***************
*** 1104,1111 ****
      a.d2 = __subc_cc(0, tmp160.d2);
      a.d3 = __subc_cc(0, tmp160.d3);
      a.d4 = __subc   (0, tmp160.d4);
  
!     for (shifter = 0; shifter < exp - 2 - 8; shifter++)
      {
  							// On input a is at most 143.459 bits (see end of this loop)
  
--- 1114,1123 ----
      a.d2 = __subc_cc(0, tmp160.d2);
      a.d3 = __subc_cc(0, tmp160.d3);
      a.d4 = __subc   (0, tmp160.d4);
+ #endif
+     a.d4 = a.d3 = 0; BASE_preinit;
  
!     for (shifter = 0; shifter < exp - 1 - BASE_n1; shifter++)
      {
  							// On input a is at most 143.459 bits (see end of this loop)
  
***************
*** 1341,1346 ****
--- 1353,1359 ----
      // Could write optimized div_320_160 with so many tmp320 elements known to be zero
      div_320_160(&u,tmp320,f,ff);			// u = floor(2^(159 + bits_in_f) / f), giving 160 bits of precision
  
+ #if 0
  							// b_preinit = 2^256
  							// a = b_preinit / 2 ^ (bits_in_f - 1)
  							// tmp320 = a * u = (b_preinit / 2 ^ (bits_in_f - 1)) * (2 ^ (159 + bits_in_f) / f)     (ignore the floor functions for now)
***************
*** 1357,1364 ****
      a.d2 = __subc_cc(0, tmp160.d2);
      a.d3 = __subc_cc(0, tmp160.d3);
      a.d4 = __subc   (0, tmp160.d4);
  
!     for (shifter = 0; shifter < exp - 2 - 8; shifter++)
      {
  							// On input a is at most 155.459 bits (see end of this loop)
  
--- 1370,1379 ----
      a.d2 = __subc_cc(0, tmp160.d2);
      a.d3 = __subc_cc(0, tmp160.d3);
      a.d4 = __subc   (0, tmp160.d4);
+ #endif
+     BASE_preinit2;
  
!     for (shifter = 0; shifter < exp - 1 - BASE_n2; shifter++)
      {
  							// On input a is at most 155.459 bits (see end of this loop)
  
***************
*** 1594,1599 ****
--- 1609,1615 ----
      // Could write optimized div_352_160 with so many tmp352 elements known to be zero
      div_352_160(&u,tmp352,f,ff);			// u = floor(2^336 / f).  This requires f >= 145 bits.
  
+ #if 0
  							// b_preinit = 2^256
  							// a = b_preinit / 2^144 = 2^112
  							// tmp352 = a * u = (b_preinit / 2^144) * (2^336 / f)     (ignore the floor functions for now)
***************
*** 1610,1617 ****
      a.d3 = __subc_cc(0, tmp192.d3);
      a.d4 = __subc_cc(0, tmp192.d4);
      a.d5 = __subc   (0, tmp192.d5);
  
!     for (shifter = 0; shifter < exp - 2 - 8; shifter++)
      {
  							// On input a is at most 163.700 bits (see end of this loop)
  
--- 1626,1635 ----
      a.d3 = __subc_cc(0, tmp192.d3);
      a.d4 = __subc_cc(0, tmp192.d4);
      a.d5 = __subc   (0, tmp192.d5);
+ #endif
+     a.d5 = 0; BASE_preinit2;
  
!     for (shifter = 0; shifter < exp - 1 - BASE_n2; shifter++)
      {
  							// On input a is at most 163.700 bits (see end of this loop)
  
*** _/tf_f96_127.h	2012-09-09 09:45:47.000000000 -0700
--- tf_f96_127.h	2012-09-25 00:24:20.441248725 -0700
***************
*** 307,312 ****
--- 307,313 ----
      // Could write optimized div_288_160 with so many tmp288 elements known to be zero
      div_288_160(&u,tmp288,f,ff);			// u = floor(2^288 / f).  This requires f >= 129 bits.
  
+ #if 0
  							// b_preinit = 2^256
  							// a = b_preinit / 2^128 = 2^144
  							// tmp320 = a * u = (b_preinit / 2^128) * (2^288 / f)     (ignore the floor functions for now)
***************
*** 322,329 ****
      a.d2 = __subc_cc(0, tmp160.d2);
      a.d3 = __subc_cc(0, tmp160.d3);
      a.d4 = __subc   (0, tmp160.d4);
  
!     for (shifter = 0; shifter < exp - 2 - 8; shifter++)
      {
  							// On input a is at most 143.459 bits (see end of this loop)
  
--- 323,332 ----
      a.d2 = __subc_cc(0, tmp160.d2);
      a.d3 = __subc_cc(0, tmp160.d3);
      a.d4 = __subc   (0, tmp160.d4);
+ #endif
+     a.d4 = a.d3 = 0; BASE_preinit;
  
!     for (shifter = 0; shifter < exp - 1 - BASE_n1; shifter++)
      {
  							// On input a is at most 143.459 bits (see end of this loop)
  
***************
*** 559,564 ****
--- 562,568 ----
      // Could write optimized div_320_160 with so many tmp320 elements known to be zero
      div_320_160(&u,tmp320,f,ff);			// u = floor(2^(159 + bits_in_f) / f), giving 160 bits of precision
  
+ #if 0
  							// b_preinit = 2^256
  							// a = b_preinit / 2 ^ (bits_in_f - 1)
  							// tmp320 = a * u = (b_preinit / 2 ^ (bits_in_f - 1)) * (2 ^ (159 + bits_in_f) / f)     (ignore the floor functions for now)
***************
*** 575,582 ****
      a.d2 = __subc_cc(0, tmp160.d2);
      a.d3 = __subc_cc(0, tmp160.d3);
      a.d4 = __subc   (0, tmp160.d4);
  
!     for (shifter = 0; shifter < exp - 2 - 8; shifter++)
      {
  							// On input a is at most 155.459 bits (see end of this loop)
  
--- 579,588 ----
      a.d2 = __subc_cc(0, tmp160.d2);
      a.d3 = __subc_cc(0, tmp160.d3);
      a.d4 = __subc   (0, tmp160.d4);
+ #endif
+     BASE_preinit2;
  
!     for (shifter = 0; shifter < exp - 1 - BASE_n2; shifter++)
      {
  							// On input a is at most 155.459 bits (see end of this loop)
  
***************
*** 812,817 ****
--- 818,824 ----
      // Could write optimized div_352_160 with so many tmp352 elements known to be zero
      div_352_160(&u,tmp352,f,ff);			// u = floor(2^336 / f).  This requires f >= 145 bits.
  
+ #if 0
  							// b_preinit = 2^256
  							// a = b_preinit / 2^144 = 2^112
  							// tmp352 = a * u = (b_preinit / 2^144) * (2^336 / f)     (ignore the floor functions for now)
***************
*** 828,835 ****
      a.d3 = __subc_cc(0, tmp192.d3);
      a.d4 = __subc_cc(0, tmp192.d4);
      a.d5 = __subc   (0, tmp192.d5);
  
!     for (shifter = 0; shifter < exp - 2 - 8; shifter++)
      {
  							// On input a is at most 163.700 bits (see end of this loop)
  
--- 835,844 ----
      a.d3 = __subc_cc(0, tmp192.d3);
      a.d4 = __subc_cc(0, tmp192.d4);
      a.d5 = __subc   (0, tmp192.d5);
+ #endif
+     a.d5 = 0; BASE_preinit2;
  
!     for (shifter = 0; shifter < exp - 1 - BASE_n2; shifter++)
      {
  							// On input a is at most 163.700 bits (see end of this loop)
  
***************
*** 1075,1080 ****
--- 1084,1090 ----
      // Could write optimized div_352_192 with so many tmp352 elements known to be zero
      div_352_192(&u,tmp352,f,ff);			// u = floor(2^352 / f).  This requires f >= 161 bits.
  
+ #if 0
  							// b_preinit = 2^256
  							// a = b_preinit / 2^160 = 2^96
  							// tmp352 = a * u = (b_preinit / 2^160) * (2^352 / f)     (ignore the floor functions for now)
***************
*** 1090,1097 ****
      a.d3 = __subc_cc(0, tmp192.d3);
      a.d4 = __subc_cc(0, tmp192.d4);
      a.d5 = __subc   (0, tmp192.d5);
  
!     for (shifter = 0; shifter < exp - 2 - 8; shifter++)
      {
  							// On input a is at most 175.700 bits (see end of this loop)
  
--- 1100,1109 ----
      a.d3 = __subc_cc(0, tmp192.d3);
      a.d4 = __subc_cc(0, tmp192.d4);
      a.d5 = __subc   (0, tmp192.d5);
+ #endif
+     a.d5 = 0; BASE_preinit2;
  
!     for (shifter = 0; shifter < exp - 1 - BASE_n2; shifter++)
      {
  							// On input a is at most 175.700 bits (see end of this loop)
  
***************
*** 1330,1335 ****
--- 1342,1348 ----
      // Could write optimized div_384_192 with so many tmp384 elements known to be zero
      div_384_192(&u,tmp384,f,ff);			// u = floor(2^(191 + bits_in_f) / f), giving 192 bits of precision, requires f >= 161 bits
  
+ #if 0
  							// b_preinit = 2^256
  							// a = b_preinit / 2 ^ (bits_in_f - 1)
  							// tmp384 = a * u = (b_preinit / 2 ^ (bits_in_f - 1)) * (2 ^ (191 + bits_in_f) / f)     (ignore the floor functions for now)
***************
*** 1346,1353 ****
      a.d3 = __subc_cc(0, tmp192.d3);
      a.d4 = __subc_cc(0, tmp192.d4);
      a.d5 = __subc   (0, tmp192.d5);
  
!     for (shifter = 0; shifter < exp - 2 - 8; shifter++)
      {
  							// On input a is at most 186.700 bits (see end of this loop)
  
--- 1359,1368 ----
      a.d3 = __subc_cc(0, tmp192.d3);
      a.d4 = __subc_cc(0, tmp192.d4);
      a.d5 = __subc   (0, tmp192.d5);
+ #endif
+     a.d5 = 0; BASE_preinit2;
  
!     for (shifter = 0; shifter < exp - 1 - BASE_n2; shifter++)
      {
  							// On input a is at most 186.700 bits (see end of this loop)
  
*** _/tf_validate.h	2012-09-23 13:26:13.000000000 -0700
--- tf_validate.h	2012-09-25 03:53:55.917559230 -0700
***************
*** 16,21 ****
--- 16,23 ----
  along with mfaktc.  If not, see <http://www.gnu.org/licenses/>.
  */
  
+ #include "tf_gfn.h"
+ 
  // Welcome to the "dumb num" routines
  
  // remove leading zeroes from a
***************
*** 249,255 ****
      remainder.d0=mystuff->h_RES[99];
      print_dez192(remainder,remainder_string);
  
!     printf("Verifying (2^(2^%d)) %% %s = %s\n", (int) mystuff->exponent, factor_string, remainder_string);
    }
  
    // make sure the factor has no really small factors - this would indicate calculating the factor or GPU sieving or class_needed is broken
--- 251,257 ----
      remainder.d0=mystuff->h_RES[99];
      print_dez192(remainder,remainder_string);
  
!     printf("Verifying (%d^(2^%d)) %% %s = %s\n", BASE, (int) mystuff->exponent, factor_string, remainder_string);
    }
  
    // make sure the factor has no really small factors - this would indicate calculating the factor or GPU sieving or class_needed is broken
***************
*** 273,285 ****
  
    // validate the exponentiation
  
!   tmp[0] = 1; tmp[1] = 65536;
!   for (i = 4; i < (mystuff->fermat_factoring ? mystuff->exponent - 2 : mystuff->exponent); i++) {
  	  dn_square (tmp, tmp);
  	  dn_mod (tmp, fac, tmp);
    }
    if (! dn_equal (tmp, rem))
!       printf ("ERROR: Exponentiation failure\n"), exit(1);
  }
  
  
--- 275,287 ----
  
    // validate the exponentiation
  
!   tmp[0] = 1; tmp[1] = BASE_v0;
!   for (i = BASE_n0; i < (mystuff->fermat_factoring ? mystuff->exponent - 1 : mystuff->exponent); i++) {
  	  dn_square (tmp, tmp);
  	  dn_mod (tmp, fac, tmp);
    }
    if (! dn_equal (tmp, rem))
!       printf ("ERROR: Exponentiation failure\n"); //, exit(1);
  }
  
  
***************
*** 302,311 ****
  
    // Look for the exponentiation that returns fac-1 rather than 1
  
!   for (exp = mystuff->exponent-2; exp > 5; exp--)
    {
!     tmp[0] = 1; tmp[1] = 65536;
!     for (i = 4; i < exp; i++) {
  	  dn_square (tmp, tmp);
  	  dn_mod (tmp, fac, tmp);
      }
--- 304,313 ----
  
    // Look for the exponentiation that returns fac-1 rather than 1
  
!   for (exp = mystuff->exponent-1; exp > 5; exp--)
    {
!     tmp[0] = 1; tmp[1] = BASE_v0;
!     for (i = BASE_n0; i < exp; i++) {
  	  dn_square (tmp, tmp);
  	  dn_mod (tmp, fac, tmp);
      }
*** _/mfaktc.c	2012-09-23 08:15:26.000000000 -0700
--- mfaktc.c	2012-09-25 03:50:40.361569952 -0700
***************
*** 628,634 ****
      i++;
    }
  
!   printf("mmff v%s (%dbit built)\n\n", MFAKTC_VERSION, (int)(sizeof(void*)*8));
  
  /* print current configuration */
    
--- 628,634 ----
      i++;
    }
  
!   printf("mmff-gfn v%s (%dbit built)\n\n", MFAKTC_VERSION, (int)(sizeof(void*)*8));
  
  /* print current configuration */
    
*** _/output.c	2012-09-23 12:57:20.000000000 -0700
--- output.c	2012-09-25 03:53:55.907559229 -0700
***************
*** 17,22 ****
--- 17,23 ----
  along with mfaktc.  If not, see <http://www.gnu.org/licenses/>.
  */
  
+ #include "tf_gfn.h"
  
  #include <stdio.h>
  #include <stdlib.h>
***************
*** 35,41 ****
  
  void print_help(char *string)
  {
!   printf("mmff v%s Copyright (C) 2009, 2010, 2011, 2012  George Woltman, Oliver Weihe\n", MFAKTC_VERSION);
    printf("This program comes with ABSOLUTELY NO WARRANTY; for details see COPYING.\n");
    printf("This is free software, and you are welcome to redistribute it\n");
    printf("under certain conditions; see COPYING for details.\n\n\n");
--- 36,42 ----
  
  void print_help(char *string)
  {
!   printf("mmff-gfn v%s Copyright (C) 2009, 2010, 2011, 2012  George Woltman, Oliver Weihe, Serge Batalov (GFN)\n", MFAKTC_VERSION);
    printf("This program comes with ABSOLUTELY NO WARRANTY; for details see COPYING.\n");
    printf("This is free software, and you are welcome to redistribute it\n");
    printf("under certain conditions; see COPYING for details.\n\n\n");
***************
*** 349,364 ****
      if((mystuff->mode == MODE_NORMAL) && (mystuff->stats.class_counter < 960))
  #endif
      {
!       sprintf(string, "found %d factor%s for %s in %s (partially tested) [mmff %s %s]", factorsfound, (factorsfound > 1) ? "s" : "", mystuff->exponent_string, krange, MFAKTC_VERSION, mystuff->stats.kernelname);
      }
      else
      {
!       sprintf(string, "found %d factor%s for %s in %s [mmff %s %s]", factorsfound, (factorsfound > 1) ? "s" : "", mystuff->exponent_string, krange, MFAKTC_VERSION, mystuff->stats.kernelname);
      }
    }
    else
    {
!     sprintf(string, "no factor for %s in %s [mmff %s %s]", mystuff->exponent_string, krange, MFAKTC_VERSION, mystuff->stats.kernelname);
    }
  
    if(mystuff->mode != MODE_SELFTEST_SHORT)
--- 350,365 ----
      if((mystuff->mode == MODE_NORMAL) && (mystuff->stats.class_counter < 960))
  #endif
      {
!       sprintf(string, "found %d factor%s for %s in %s (partially tested) [mmff-gfn %s %s]", factorsfound, (factorsfound > 1) ? "s" : "", mystuff->exponent_string, krange, MFAKTC_VERSION, mystuff->stats.kernelname);
      }
      else
      {
!       sprintf(string, "found %d factor%s for %s in %s [mmff-gfn %s %s]", factorsfound, (factorsfound > 1) ? "s" : "", mystuff->exponent_string, krange, MFAKTC_VERSION, mystuff->stats.kernelname);
      }
    }
    else
    {
!     sprintf(string, "no factor for %s in %s [mmff-gfn %s %s]", mystuff->exponent_string, krange, MFAKTC_VERSION, mystuff->stats.kernelname);
    }
  
    if(mystuff->mode != MODE_SELFTEST_SHORT)
***************
*** 386,392 ****
  
    if (mystuff->fermat_factoring)	// Figure out which Fermat number this factor divides
    {
!     sprintf(exponent_string, "F%d", which_fermat_number(mystuff, factor_number));
    }
    else
      sprintf(exponent_string, "%s", mystuff->exponent_string);
--- 387,393 ----
  
    if (mystuff->fermat_factoring)	// Figure out which Fermat number this factor divides
    {
!     sprintf(exponent_string, "GF(%d,%d)", which_fermat_number(mystuff, factor_number), BASE);
    }
    else
      sprintf(exponent_string, "%s", mystuff->exponent_string);
***************
*** 407,415 ****
      if(mystuff->mode == MODE_NORMAL)
      {
  #ifndef MORE_CLASSES
!       fprintf(resultfile, "%s%s has a factor: %s [TF:%d:%d%s:mmff %s %s]\n", UID, exponent_string, factor, mystuff->bit_min, mystuff->bit_max_stage, ((mystuff->stopafterfactor == 2) && (mystuff->stats.class_counter <  96)) ? "*" : "" , MFAKTC_VERSION, mystuff->stats.kernelname);
  #else      
!       fprintf(resultfile, "%s%s has a factor: %s [TF:%d:%d%s:mmff %s %s]\n", UID, exponent_string, factor, mystuff->bit_min, mystuff->bit_max_stage, ((mystuff->stopafterfactor == 2) && (mystuff->stats.class_counter < 960)) ? "*" : "" , MFAKTC_VERSION, mystuff->stats.kernelname);
  #endif
      }
    }
--- 408,416 ----
      if(mystuff->mode == MODE_NORMAL)
      {
  #ifndef MORE_CLASSES
!       fprintf(resultfile, "%s%s has a factor: %s [TF:%d:%d%s:mmff-gfn %s %s]\n", UID, exponent_string, factor, mystuff->bit_min, mystuff->bit_max_stage, ((mystuff->stopafterfactor == 2) && (mystuff->stats.class_counter <  96)) ? "*" : "" , MFAKTC_VERSION, mystuff->stats.kernelname);
  #else      
!       fprintf(resultfile, "%s%s has a factor: %s [TF:%d:%d%s:mmff-gfn %s %s]\n", UID, exponent_string, factor, mystuff->bit_min, mystuff->bit_max_stage, ((mystuff->stopafterfactor == 2) && (mystuff->stats.class_counter < 960)) ? "*" : "" , MFAKTC_VERSION, mystuff->stats.kernelname);
  #endif
      }
    }
*** _/parse.c	2012-09-23 13:48:55.000000000 -0700
--- parse.c	2012-09-24 19:21:49.692243906 -0700
***************
*** 93,99 ****
    int ret = 1;
  
    if (fermat) {
!        if(exp <= 27) {ret = 0; if(verbosity >= 1)printf("WARNING: Exponents <= 27 are not supported in Fermat factoring!\n");}
    else if(exp >= 160) {ret = 0; if(verbosity >= 1)printf("WARNING: Exponents >= 160 are not supported in Fermat factoring!\n");}
    else if (exp <= 31) { if (bit_min <  64 || bit_max > 96  || /*bit_min - exp < 13 ||*/ bit_max - exp > 64) {ret = 0; if(verbosity >= 1)printf("WARNING: bit range isn't supported!\n");} }
    else if (exp <= 63) { if (bit_min <  64 || bit_max > 128 || /*bit_min - exp < 13 ||*/ bit_max - exp > 64) {ret = 0; if(verbosity >= 1)printf("WARNING: bit range isn't supported!\n");} }
--- 93,99 ----
    int ret = 1;
  
    if (fermat) {
!        if(exp <= 24) {ret = 0; if(verbosity >= 1)printf("WARNING: Exponents <= 24 are not supported in Fermat factoring!\n");}
    else if(exp >= 160) {ret = 0; if(verbosity >= 1)printf("WARNING: Exponents >= 160 are not supported in Fermat factoring!\n");}
    else if (exp <= 31) { if (bit_min <  64 || bit_max > 96  || /*bit_min - exp < 13 ||*/ bit_max - exp > 64) {ret = 0; if(verbosity >= 1)printf("WARNING: bit range isn't supported!\n");} }
    else if (exp <= 63) { if (bit_min <  64 || bit_max > 128 || /*bit_min - exp < 13 ||*/ bit_max - exp > 64) {ret = 0; if(verbosity >= 1)printf("WARNING: bit range isn't supported!\n");} }
*** _/tf_barrett96_gs.cu	2012-09-23 00:52:17.000000000 -0700
--- tf_barrett96_gs.cu	2012-09-25 00:23:02.420253003 -0700
***************
*** 73,78 ****
--- 73,79 ----
  #undef KERNEL_MIN_BLOCKS
  #define KERNEL_MIN_BLOCKS 3
  
+ #include "tf_gfn.h"
  #include "tf_192.h"
  #include "tf_160.h"
  #include "tf_128.h"