Code:
;****************************************************************
;* DIV        : 32 x 32 divide                                  *
;* Input      : R0 / R1                                         *
;* Output     : R0 = quotient                                   * 
;*            : R2 = remainder                                  *
;* Notes      : R2 = R0 MOD R1                                  *
;****************************************************************

    ifdef DIVS_USED
  LIST
;----------------------------------------------------------------
; DIVS : signed entry point for the 32-bit divide.
;   Converts R0 and R1 to their absolute values, records in
;   R3 + 3 bit 7 whether exactly one of them was negative, then
;   branches to divdo for the magnitude divide.
;----------------------------------------------------------------
DIVS	clrf	R3 + 3		; Sign flag byte starts at zero
	btfss	R0 + 3, 7	; Is the dividend negative?
	bra	divchkr1	; No - leave R0 as it is
	bsf	R3 + 3, 7	; Yes - flag was just cleared, so set == toggle
	movlw	0		; W = 0 for the borrow chain below
	negf	R0		; R0 = -R0: negate low byte ...
	subfwb	R0 + 1, F	; ... then 0 - byte - borrow upward
	subfwb	R0 + 2, F
	subfwb	R0 + 3, F
divchkr1
	btfss	R1 + 3, 7	; Is the divisor negative?
	bra	divdo		; No - go straight to the divide
	btg	R3 + 3, 7	; Yes - toggle: signs differ only if one was negative
	movlw	0		; W = 0 for the borrow chain below
	negf	R1		; R1 = -R1, same pattern as for R0
	subfwb	R1 + 1, F
	subfwb	R1 + 2, F
	subfwb	R1 + 3, F
	bra	divdo		; Jump over the unsigned entry (DIV)
  NOLIST
DIV_USED = 1
    endif

    ifdef DIV_USED
  LIST
;----------------------------------------------------------------
; DIV : unsigned 32 / 32 shift-and-subtract (restoring) divide.
;   In   : R0 = dividend, R1 = divisor
;   Out  : R0 = quotient, R2 = remainder, W = low byte of R0
;   Uses : R3 as the pass counter; R3 + 3 bit 7 as the "negate
;          result" flag when DIVS_USED is defined.
;   Exit : goto DUNN (defined outside this section).
;   Define SKI_DIV_SPEEDUP to skip leading zero bytes of the
;   dividend before the bit loop starts.
;----------------------------------------------------------------
DIV
      ifdef DIVS_USED
	clrf	R3 + 3		; Unsigned entry: never negate the result
      endif
divdo	clrf	R2		; Remainder accumulator starts at zero
	clrf	R2 + 1
	clrf	R2 + 2
	clrf	R2 + 3

	movlw	32             ; One pass per dividend bit
	movwf	R3

        ifdef SKI_DIV_SPEEDUP
	; While the top byte of the dividend is zero, the next 8 passes
	; would only shift zeros into R2 and shift zero quotient bits
	; into R0, i.e. leave R2 = 0 and R0 = R0 << 8.  Do that with
	; byte moves instead and drop 8 passes.
	;
	; The divisor R1 must NOT be shifted: the loop compares R2
	; against R1 directly, so scaling R1 would divide by R1 * 256.
	;
	; With R1 = 0 every pass yields a 1 bit, so the shortcut would
	; change the (meaningless) result; use the full loop there so
	; both builds give the same answer.
	movf	R1, W		; R1 == 0 ?
	iorwf	R1 + 1, W
	iorwf	R1 + 2, W
	iorwf	R1 + 3, W
	bz	divloop		; Divide by zero: no shortcut
SkiOpt
	movf    R0 + 3, W      ; Top dividend byte still zero?
	bnz     divloop        ; No - start the bit loop
	movlw   8
	cpfsgt  R3             ; More than 8 passes left?
	bra     divloop        ; No - always run at least 8 passes

	subwf   R3, F          ; 8 passes fewer

	movff   R0 + 2, R0 + 3 ; R0 = R0 << 8
	movff   R0 + 1, R0 + 2
	movff   R0 + 0, R0 + 1
	clrf    R0

	bra     SkiOpt         ; Try the next byte

        endif

divloop	rlcf	R0 + 3, W	; C = next dividend bit (R0 itself unchanged here)
	rlcf	R2, F		; R2 = (R2 << 1) | C
	rlcf	R2 + 1, F
	rlcf	R2 + 2, F
	rlcf	R2 + 3, F
	movf	R1, W		; Trial subtract: R2 = R2 - R1
	subwf	R2, F
	movf	R1 + 1, W
	subwfb	R2 + 1, F
	movf	R1 + 2, W
	subwfb	R2 + 2, F
	movf	R1 + 3, W
	subwfb	R2 + 3, F

	bc	divok		; No borrow: R2 >= R1, quotient bit = 1 (C set)
	movf	R1, W		; Borrow: restore R2 = R2 + R1
	addwf	R2, F
	movf	R1 + 1, W
	addwfc	R2 + 1, F
	movf	R1 + 2, W
	addwfc	R2 + 2, F
	movf	R1 + 3, W
	addwfc	R2 + 3, F

	bcf	STATUS, C	; Quotient bit = 0

divok	rlcf	R0, F		; Shift quotient bit in, next dividend bit up
	rlcf	R0 + 1, F
	rlcf	R0 + 2, F
	rlcf	R0 + 3, F

	decfsz	R3, F		; All passes done?
	bra	divloop

      ifdef DIVS_USED
	; NOTE(review): the remainder is negated under the same condition
	; as the quotient (operand signs differ), not according to the
	; sign of the dividend - confirm this is the intended convention.
	btfss	R3 + 3, 7	; Did the operand signs differ?
	bra	divdone		; No - result stays positive
	clrf	WREG		; W = 0 for the borrow chains below
	negf	R0		; Quotient = -quotient
	subfwb	R0 + 1, F
	subfwb	R0 + 2, F
	subfwb	R0 + 3, F
	negf	R2		; Remainder = -remainder
	subfwb	R2 + 1, F
	subfwb	R2 + 2, F
	subfwb	R2 + 3, F
divdone
      endif

	movf	R0, W		; Return low byte of the quotient in W
	goto	DUNN
  NOLIST
DUNN_USED = 1
    endif
This optimization (byte level optimize) seems to work very well on my end using MPLAB sim and the stopwatch.
I still need to finish a second demo board with an LCD, so that one board can run the non-optimized version and the other the optimized version, just to see which one counts faster and by how much.
I still want to take the optimization down to the bit level and have the library recognize the DEFINE as one of three settings: none (not defined), byte, or bit.
How do you get MPASM to recognize different parameters using DEFINE in PBP? I can't seem to get it to work right...