I think what is slowing this down is the SHIFTOUT.

I am running a PIC16F88 at 16 MHz, and I need to know the fastest way to get through this loop:

Code:
' Main sweep, repeated forever: DATA ramps from 4095 down to 0, then from 0
' back up to 4095. SUB1 and then SUB2 are called once for every DATA value.
' NOTE(review): DATA is also the name of a PicBasic Pro statement - confirm
' the compiler accepts it as a variable name.
LOOP:
	' Falling ramp: 4095, 4094, ... 0
	FOR DATA = 4095 TO 0 STEP -1
		GOSUB SUB1 : GOSUB SUB2
	NEXT DATA

	' Rising ramp: 0, 1, ... 4095 (STEP defaults to 1)
	FOR DATA = 0 TO 4095
		GOSUB SUB1 : GOSUB SUB2
	NEXT DATA

	GOTO LOOP

' SUB1: shift the current DATA value out on dpin/clk sixteen times, then
' pulse PORTB bit 5 high and back low.
'   In:  DATA  - value to transmit (the caller sweeps it over 0..4095)
'   Uses: C1   - loop counter
' PORTB is written as a whole byte, so every other bit of the port register
' is written as 0 by both writes.
SUB1:
	FOR C1 = 0 TO 15
		' Mode 1 = MSB first. The \12 bit count is required: without it
		' SHIFTOUT sends only 8 bits, so the upper four bits of any DATA
		' value above 255 were never transmitted.
		' NOTE(review): 12 is taken from the 0..4095 range of DATA - confirm
		' it matches the word size the receiving device expects.
		SHIFTOUT dpin,clk,1,[DATA\12]
	NEXT
	PORTB = %00100000	' bit 5 high ...
	PORTB = %00000000	' ... and low again
RETURN

' SUB2: emit one pulse on PORTB bit 2, then 4096 pulses on PORTA bit 3.
'   Uses: C3 - loop counter (counts 0..4095, so it needs more than 8 bits)
' Both ports are written as whole bytes; every other bit of the port
' register is written as 0 on each write.
SUB2:
	' Single pulse on PORTB bit 2
	PORTB = $04 : PORTB = $00

	' 4096 high/low pulses on PORTA bit 3
	FOR C3 = 0 TO 4095
		PORTA = $08 : PORTA = $00
	NEXT C3
	RETURN