diff --git a/README.md b/README.md
index 934ccdd..d5b181c 100755
--- a/README.md
+++ b/README.md
@@ -3,6 +3,18 @@ ccminer-cryptonight
 A modification of Christian Buchner's &amp; Christian H.'s
 ccminer project by tsiv for Cryptonight mining.
 
+July 5th 2014
+-------------
+
+Massive improvement to interactivity on Windows, should also further help with TDR issues.
+Introducing the --bfactor and --bsleep command line parameters allows for control over
+execution of the biggest resource hog of the algorithm. Use bfactor to determine how
+many parts the kernel is split into and bsleep to insert a short delay between the kernel
+launches. The defaults are no splitting / no sleep for Linux and split into 64 (bfactor 6)
+parts / sleep 100 microseconds between launches for Windows. These defaults seem to work
+wonders on my 750 Ti on Windows 7, once again you may want to tweak according to your 
+environment.
+
 June 30th 2014
 --------------
 
diff --git a/README.txt b/README.txt
index b254fb0..0197c42 100755
--- a/README.txt
+++ b/README.txt
@@ -61,6 +61,15 @@ most of their command line interface and options.
                        value between devices, you can just enter a single
 		       value and it will be used for all devices.
 		       (default: 8x40)
+     --bfactor=X       Enables running the Cryptonight kernel in smaller pieces.\n\
+                       The kernel will be run in 2^X parts according to bfactor,\n\
+                       with a small pause between parts, specified by --bsleep.\n\
+                       This is a per-device setting like the launch config.\n\
+                       (default: 0 (no splitting) on Linux, 6 (64 parts) on Windows)\n\
+     --bsleep=X        Insert a delay of X microseconds between kernel launches.\n\
+                       Use in combination with --bfactor to mitigate the lag\n\
+                       when running on your primary GPU.\n\
+                       This is a per-device setting like the launch config.\n\
  -f, --diff            Divide difficulty by this factor (std is 1) 
  -o, --url=URL         URL of mining server (default: " DEF_RPC_URL ")
  -O, --userpass=U:P    username:password pair for mining server
diff --git a/cpu-miner.c b/cpu-miner.c
index 687609e..0c85acc 100755
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -179,7 +179,16 @@ uint16_t opt_vote = 9999;
 static int num_processors;
 int device_map[8] = {0,1,2,3,4,5,6,7}; // CB
 char *device_name[8]; // CB
+int device_bfactor[8];
+int device_bsleep[8];
 int device_config[8][2];
+#ifdef WIN32
+static int default_bfactor = 6;
+static int default_bsleep = 100;
+#else
+static int default_bfactor = 0;
+static int default_bsleep = 0;
+#endif
 static char *rpc_url;
 static char *rpc_userpass;
 static char *rpc_user, *rpc_pass;
@@ -239,6 +248,15 @@ Options:\n\
                         the remaining devices. If you don't need to vary the\n\
                         value between devices, you can just enter a single value\n\
                         and it will be used for all devices. (default: 8x40)\n\
+      --bfactor=X       Enables running the Cryptonight kernel in smaller pieces.\n\
+                        The kernel will be run in 2^X parts according to bfactor,\n\
+                        with a small pause between parts, specified by --bsleep.\n\
+                        This is a per-device setting like the launch config.\n\
+                        (default: 0 (no splitting) on Linux, 6 (64 parts) on Windows)\n\
+      --bsleep=X        Insert a delay of X microseconds between kernel launches.\n\
+                        Use in combination with --bfactor to mitigate the lag\n\
+                        when running on your primary GPU.\n\
+                        This is a per-device setting like the launch config.\n\
   -m, --trust-pool      trust the max block reward vote (maxvote) sent by the pool\n\
   -o, --url=URL         URL of mining server\n\
   -O, --userpass=U:P    username:password pair for mining server\n\
@@ -314,6 +332,8 @@ static struct option const options[] = {
 	{ "devices", 1, NULL, 'd' },
 	{ "diff", 1, NULL, 'f' },
 	{ "launch", 1, NULL, 'l' },
+	{ "bfactor", 1, NULL, 1008 },
+	{ "bsleep", 1, NULL, 1009 },
 	{ 0, 0, 0, 0 }
 };
 
@@ -1732,6 +1752,42 @@ static void parse_arg (int key, char *arg)
             }
         }
         break;
+	case 1008:
+        {
+            p = strtok(arg, ",");
+            if( p == NULL ) show_usage_and_exit(1);
+            int last;
+            i = 0;
+            while( p != NULL && i < 8 ) {
+                device_bfactor[i++] = last = atoi(p);
+                if( last < 0 || last > 10 ) {
+                    applog(LOG_ERR, "Valid range for --bfactor is 0-10");
+                    exit(1);
+                }
+                p = strtok(NULL, ",");
+            }
+            while (i < 8) {
+                device_bfactor[i++] = last;
+            }
+        }
+        break;
+	case 1009:
+            p = strtok(arg, ",");
+            if( p == NULL ) show_usage_and_exit(1);
+            int last;
+            i = 0;
+            while( p != NULL && i < 8 ) {
+                device_bsleep[i++] = last = atoi(p);
+                if( last < 0 || last > 1000000 ) {
+                    applog(LOG_ERR, "Valid range for --bsleep is 0-1000000");
+                    exit(1);
+                }
+                p = strtok(NULL, ",");
+            }
+            while (i < 8) {
+                device_bsleep[i++] = last;
+            }
+        break;
 
 	case 'V':
 		show_version_and_exit();
@@ -1862,6 +1918,8 @@ int main(int argc, char *argv[])
 	for(i = 0; i < 8; i++) {
         device_config[i][0] = opt_cn_blocks;
         device_config[i][1] = opt_cn_threads;
+        device_bfactor[i] = default_bfactor;
+        device_bsleep[i] = default_bsleep;
     }
 
     /* parse command line */
diff --git a/cryptonight.h b/cryptonight.h
index bcb6683..3977d74 100755
--- a/cryptonight.h
+++ b/cryptonight.h
@@ -122,6 +122,7 @@ struct cryptonight_gpu_ctx {
     uint32_t b[4];
     uint32_t key1[40];
     uint32_t key2[40];
+    uint32_t text[32];
 };
 
 void hash_permutation(union hash_state *state);
diff --git a/cryptonight/cuda_cryptonight_core.cu b/cryptonight/cuda_cryptonight_core.cu
index 1a8a587..1e9cf10 100755
--- a/cryptonight/cuda_cryptonight_core.cu
+++ b/cryptonight/cuda_cryptonight_core.cu
@@ -1,6 +1,7 @@
 #include <stdio.h>
 #include <stdint.h>
 #include <string.h>
+#include <sys/time.h>
 #include "cuda.h"
 #include "cuda_runtime.h"
 #include "cryptonight.h"
@@ -9,31 +10,15 @@
 #include <unistd.h>
 #endif
 
-#include "cuda_cryptonight_aes.cu"
+extern int device_bfactor[8];
+extern int device_bsleep[8];
 
-#define hi_dword(x) (x >> 32)
-#define lo_dword(x) (x & 0xFFFFFFFF)
+#include "cuda_cryptonight_aes.cu"
 
 __device__ __forceinline__ uint64_t cuda_mul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi)
 {
-  uint64_t a = hi_dword(multiplier);
-  uint64_t b = lo_dword(multiplier);
-  uint64_t c = hi_dword(multiplicand);
-  uint64_t d = lo_dword(multiplicand);
-
-  uint64_t ac = a * c;
-  uint64_t ad = a * d;
-  uint64_t bc = b * c;
-  uint64_t bd = b * d;
-
-  uint64_t adbc = ad + bc;
-  uint64_t adbc_carry = adbc < ad ? 1 : 0;
-
-  uint64_t product_lo = bd + (adbc << 32);
-  uint64_t product_lo_carry = product_lo < bd ? 1 : 0;
-  *product_hi = ac + (adbc >> 32) + (adbc_carry << 32) + product_lo_carry;
-
-  return product_lo;
+    *product_hi = __umul64hi(multiplier, multiplicand);
+    return(multiplier * multiplicand);
 }
 
 __global__ void cryptonight_core_gpu_phase1(int threads, uint8_t *d_long_state, struct cryptonight_gpu_ctx *d_ctx)
@@ -50,6 +35,7 @@ __global__ void cryptonight_core_gpu_phase1(int threads, uint8_t *d_long_state,
     if (thread < threads)
     {
         int i, j;
+        int start = 0, end = MEMORY;
         uint8_t *long_state = &d_long_state[MEMORY * thread];
         uint32_t *ls32;
         struct cryptonight_gpu_ctx *ctx = &d_ctx[thread];
@@ -61,7 +47,7 @@ __global__ void cryptonight_core_gpu_phase1(int threads, uint8_t *d_long_state,
         for( i = 0; i < 4; i++ )
             text[i] = state[i];
 
-        for (i = 0; i < MEMORY; i += INIT_SIZE_BYTE) {
+        for (i = start; i < end; i += INIT_SIZE_BYTE) {
 
             ls32 = (uint32_t *)&long_state[i];
 
@@ -73,7 +59,7 @@ __global__ void cryptonight_core_gpu_phase1(int threads, uint8_t *d_long_state,
     }
 }
 
-__global__ void cryptonight_core_gpu_phase2(int threads, uint8_t *d_long_state, struct cryptonight_gpu_ctx *d_ctx)
+__global__ void cryptonight_core_gpu_phase2(int threads, int partcount, int partidx, uint8_t *d_long_state, struct cryptonight_gpu_ctx *d_ctx)
 {
 	__shared__ uint32_t sharedMemory[1024];
 
@@ -86,6 +72,7 @@ __global__ void cryptonight_core_gpu_phase2(int threads, uint8_t *d_long_state,
     if (thread < threads)
     {
         int i, j;
+        int start = 0, end = ITER / 4;
         uint8_t *long_state = &d_long_state[MEMORY * thread];
         struct cryptonight_gpu_ctx *ctx = &d_ctx[thread];
         uint32_t a[4], b[4], c[4];
@@ -93,16 +80,29 @@ __global__ void cryptonight_core_gpu_phase2(int threads, uint8_t *d_long_state,
         MEMCPY8(a, ctx->a, 2);
         MEMCPY8(b, ctx->b, 2);
 
-        for (i = 0; i < ITER / 4; ++i) {
+        if( partcount > 1 ) {
+            
+            int batchsize = (ITER / 4) / partcount;
+            start = partidx * batchsize;
+            end = start + batchsize;
+        }
+
+        for (i = start; i < end; ++i) {
 
-            j = E2I(a) * AES_BLOCK_SIZE;
+            j = ((uint32_t *)a)[0] & 0x1FFFF0;
             cn_aes_single_round(sharedMemory, &long_state[j], c, a);
             XOR_BLOCKS_DST(c, b, &long_state[j]);
-            MUL_SUM_XOR_DST(c, a, &long_state[E2I(c) * AES_BLOCK_SIZE]);
-            j = E2I(a) * AES_BLOCK_SIZE;
+            MUL_SUM_XOR_DST(c, a, &long_state[((uint32_t *)c)[0] & 0x1FFFF0]);
+            j = ((uint32_t *)a)[0] & 0x1FFFF0;
             cn_aes_single_round(sharedMemory, &long_state[j], b, a);
             XOR_BLOCKS_DST(b, c, &long_state[j]);
-            MUL_SUM_XOR_DST(b, a, &long_state[E2I(b) * AES_BLOCK_SIZE]);
+            MUL_SUM_XOR_DST(b, a, &long_state[((uint32_t *)b)[0] & 0x1FFFF0]);
+        }
+        
+        if( partcount > 1 ) {
+
+            MEMCPY8(ctx->a, a, 2);
+            MEMCPY8(ctx->b, b, 2);
         }
     }
 }
@@ -121,6 +121,7 @@ __global__ void cryptonight_core_gpu_phase3(int threads, uint8_t *d_long_state,
     if (thread < threads)
     {
         int i, j;
+        int start = 0, end = MEMORY;
         uint8_t *long_state = &d_long_state[MEMORY * thread];
         uint32_t *ls32;
         struct cryptonight_gpu_ctx *ctx = &d_ctx[thread];
@@ -132,7 +133,7 @@ __global__ void cryptonight_core_gpu_phase3(int threads, uint8_t *d_long_state,
         for( i = 0; i < 4; i++ )
             text[i] = state[i];
 
-        for (i = 0; i < MEMORY; i += INIT_SIZE_BYTE) {
+        for (i = start; i < end; i += INIT_SIZE_BYTE) {
 
             ls32 = (uint32_t *)&long_state[i];
 
@@ -159,12 +160,17 @@ __host__ void cryptonight_core_cpu_hash(int thr_id, int blocks, int threads, uin
     dim3 block8(threads << 3);
 
     size_t shared_size = 1024;
+    int i, partcount = 1 << device_bfactor[thr_id];
 
     cryptonight_core_gpu_phase1<<<grid, block8, shared_size>>>(blocks*threads, d_long_state, d_ctx);
     cudaDeviceSynchronize();
+    if( partcount > 1 ) usleep(device_bsleep[thr_id]);
 
-    cryptonight_core_gpu_phase2<<<grid, block, shared_size>>>(blocks*threads, d_long_state, d_ctx);
-    cudaDeviceSynchronize();
+    for( i = 0; i < partcount; i++ ) {
+        cryptonight_core_gpu_phase2<<<grid, block, shared_size>>>(blocks*threads, partcount, i, d_long_state, d_ctx);
+        cudaDeviceSynchronize();
+        if( partcount > 1 ) usleep(device_bsleep[thr_id]);
+    }
 
     cryptonight_core_gpu_phase3<<<grid, block8, shared_size>>>(blocks*threads, d_long_state, d_ctx);
     cudaDeviceSynchronize();