I'm trying to use the asm code found in Microchip's technical brief TB40 to accomplish a square root of a LONG.
I'm using PBPL. Sometimes it makes it through the ASM routine, sometimes not. The answer is always wrong.

Are there any PBP/ASM gurus out there that would be willing to help?



Code:
; ASM part modified version of Microchip TB40
 
       DEFINE OSC 40
                                                                                ' Got to start somewhere!
        DEFINE NO_CLRWDT 1
        DEFINE _18F8720 1
        DEFINE HSER_RCSTA 90H
        DEFINE HSER_TXSTA 24H                                                   
       
        DEFINE HSER_CLROERR 1
        DEFINE CCP1_REG PORTC
        DEFINE CCP1_BIT 2
        DEFINE LOADER_USED 1                                                    ' Bootloader
        Define USE_LFSR 1
        DEFINE ADC_BITS 10
        DEFINE ADC_SAMPLEUS 6
        
        SPBRG = 255
 
    INPUTVAL VAR LONG
 
    ARGA0 VAR     BYTE bankA SYSTEM ; various argument registers
    ARGA1 VAR     Byte bankA system
    ARGA2 VAR     Byte bankA system
    ARGA3 VAR     Byte bankA system
   
    ARG1H VAR     Byte SYSTEM
    ARG1L VAR     Byte SYSTEM
    ARG2H VAR     Byte SYSTEM
    ARG2L VAR     Byte SYSTEM

    SARG1 VAR     Byte SYSTEM
    SARG2 VAR     Byte SYSTEM

    RES1 VAR     Byte bankA system
    RES0 VAR     Byte bankA system

    SQRES0 VAR     Byte SYSTEM
    SQRES1 VAR     Byte SYSTEM
    SQRES2 VAR     Byte SYSTEM

    SQRES3 VAR     Byte SYSTEM

    BITLOC0 VAR   Byte  SYSTEM
    BITLOC1 VAR   Byte  SYSTEM
    TEMP0 VAR     Byte  SYSTEM
    TEMP1 VAR     Byte  SYSTEM
    ; *******************************************************************
    ; *******************************************************************
    ; The function of this square root routine is to determine the root
    ; to the nearest integer. At the same time the root is found at the
    ; best possible speed; therefore, the root is found a little differently
    ; for the two basic sizes of numbers, 16-bit and 32-bit. The following
    ; differentiates the two and jumps to the appropriate function.
    ; Sqrt(ARGA3:ARGA2:ARGA1:ARGA0) = RES1:RES0

     TRISC = %10111111
     pause 10
 
hello: 
 
    hserout ["hello world",13,10]

    pause 100
   
   
   For InputVal = 0 to $FFFFFFFF
      
   
   ARGA0 = INPUTVAL.BYTE0
   ARGA1 = INPUTVAL.BYTE1
   ARGA2 = INPUTVal.BYTE2
   ARGA3 = INPUTVAL.BYTE3


ASM
 
  
Sqrt tstfsz ARGA3,1 ; determine if the number is 16-bit
        bra Sqrt32 ; or 32-bit and call the best function
        tstfsz ARGA2, 1
        bra Sqrt32
        clrf RES1, 1
        bra Sqrt16


Sqrt16 clrf TEMP0, 1 ; clear the temp solution
        movlw 0x80 ; setup the first bit
        movwf BITLOC0, 1
        movwf RES0, 1
Square8 movf RES0, W, 1 ; square the guess
        mulwf RES0, 1
        movf PRODL, W, 1 ; ARGA - PROD test
        subwf ARGA0, W, 1
        movf PRODH, W, 1
        subwfb ARGA1, W, 1
        btfsc STATUS, C, 1
        bra NextBit ; if positive then next bit
    ; if negative then rotate right
        movff TEMP0, RES0 ; move last good value back into RES0
        rrncf BITLOC0, F, 1 ; then rotote the bit and put it
        movf BITLOC0, W, 1 ; back into RES0
        iorwf RES0, F, 1
        btfsc BITLOC0, 7, 1; if last value was tested then get
        bra Done ; out
        bra Square8 ; elso go back for another test
NextBit movff RES0, TEMP0 ; copy the last good approximation
        rrncf BITLOC0, F, 1 ; rotate the bit location register
        movf BITLOC0, W, 1
        iorwf RES0, F, 1
        btfsc BITLOC0, 7, 1 ; if last value was tested then get
        bra Done ; out
        bra Square8
Done movff TEMP0,RES0 ; put the final result in RES0
        bra TotallyDone
    
   
   
Sqrt32 clrf TEMP0, 1 ; clear the temp solution
        clrf TEMP1, 1
        clrf BITLOC0, 1 ; setup the first bit
        clrf RES0, 1
        movlw 0x80
        movwf BITLOC1, 1 ; BitLoc = 0x8000
        movwf RES1, 1 ; RES = 0x8000
Squar16 movff RES0, ARG1L ; square the guess
        movff RES1, ARG1H
        call Sq16
        movf SQRES0, W, 1 ; ARGA - PROD test
        subwf ARGA0, W, 1
        movf SQRES1, W, 1
        subwfb ARGA1, W, 1
        movf SQRES2, W, 1
        subwfb ARGA2, W, 1
        movf SQRES3, W, 1
        subwfb ARGA3, W, 1
        btfsc STATUS, C, 1
        bra NxtBt16 ; if positive then next bit
    ; if negative then rotate right
        addlw 0x00 ; clear carry
        movff TEMP0, RES0 ; move last good value back into RES0
        movff TEMP1, RES1
        rrcf BITLOC1, F, 1 ; then rotote the bit and put it
        rrcf BITLOC0, F, 1
        movf BITLOC1, W, 1 ; back into RES1:RES0
        iorwf RES1, F, 1
        movf BITLOC0, W, 1
        iorwf RES0, F, 1
        btfsc STATUS, C, 1 ; if last value was tested then get
        bra Done32 ; out
        bra Squar16 ; elso go back for another test
NxtBt16 addlw 0x00 ; clear carry
        movff RES0, TEMP0 ; copy the last good approximation
        movff RES1, TEMP1
        rrcf BITLOC1, F, 1 ; rotate the bit location register
        rrcf BITLOC0, F, 1
        movf BITLOC1, W, 1 ; and put back into RES1:RES0
        iorwf RES1, F, 1
        movf BITLOC0, W, 1
        iorwf RES0, F, 1


        btfsc STATUS, C, 1 ; if last value was tested then get
        bra Done32 ; out
        bra Squar16
Done32 movff TEMP0,RES0 ; put the final result in RES1:RES0
        movff TEMP1,RES1
        bra TotallyDone

  
Sq16 movf ARG1L, W, 1
        mulwf ARG1L ; ARG1L * ARG2L ->
    ; PRODH:PRODL
        movff PRODH, SQRES1 ;
        movff PRODL, SQRES0 ;
        movf ARG1H, W, 1
        mulwf ARG1H ; ARG1H * ARG2H ->
    ; PRODH:PRODL
        movff PRODH, SQRES3 ;
        movff PRODL, SQRES2 ;
        movf ARG1L, W, 1
        mulwf ARG1H ; ARG1L * ARG2H ->
    ; PRODH:PRODL
        movf PRODL, W, 1 ;
        addwf SQRES1, F, 1 ; Add cross
        movf PRODH, W, 1 ; products
        addwfc SQRES2, F, 1 ;
        clrf WREG, 1 ;
        addwfc SQRES3, F, 1 ;
        movf ARG1H, W, 1 ;
        mulwf ARG1L ; ARG1H * ARG2L ->
    ; PRODH:PRODL
        movf PRODL, W, 1 ;
        addwf SQRES1, F, 1 ; Add cross
        movf PRODH, W, 1 ; products
        addwfc SQRES2, F, 1 ;
        clrf WREG, W ;
        addwfc SQRES3, F, 1 ;
        return

TotallyDone    
    
ENDASM

  HSEROUT ["outputval = ",HEX2 RES1,HEX2 RES0,13,10]

  next inputval

end