ASM code optimieren

Kann man das da unten noch weiter optimieren (Pentium-Optimierung/MMX???/Vectorisieren?)?

PUBLIC divid
.386
.MODEL FLAT
.CODE
divid  proc near
;
a       equ     0                       ; Dividend (Arbeitskopie)
b       equ     1030                    ; Divisor  (Arbeitskopie)
q       equ     1546                    ; Quotient qdach (Arbeitsspeicher)
q1      equ     2566                    ; hoechstwertige Divisorstelle (normalisiert)
v1      equ     2570                    ; 2. Divisorstelle (normalisiert)
v2      equ     2574                    ; Exponent fuer Normalisierung
d       equ     2578                    ; uj+1 (normalisiert)
uj1     equ     2582                    ; uj+2 (normalisiert)
uj2     equ     2586
;
_dvd    equ     [ebp+8]                 ; Dividend (Parameter)
_dvs    equ     [ebp+12]                ; Divisor  (Parameter)
_qot    equ     [ebp+16]                ; Quotient (Parameter)
_rst    equ     [ebp+20]                ; Rest     (Parameter)
;
WORKSP  equ     2600                    ; Arbeitsspeicher auf dem Stack
;
divs:   push    ebp                     ; Sichern fuer aufrufende Prozedur
        mov     ebp,esp
        sub     esp,WORKSP              ; Lokaler Speicher
        mov     eax,esp
        push    ebx
        push    edi
        push    esi
        mov     ebx,eax                 ; Startadresse sichern
;
        mov     esi,_dvd                ; Offset von Var a
        mov     edi,_dvs                ; Offset von Var b
        mov     ax,[esi]                ; l(a)
        mov     dx,[edi]                ; l(b)
;
;>>>>>> Lade Operanden
;
@@:     push    edi                     ; Rette Seg:Offs b
        lea     edi,a[ebx]              ; Zieloffset ist a[bx]
        xor     ecx,ecx
        mov     cx,ax                   ; l(a) in cx
        shr     cx,1
        jnc     @F
        inc     cx
@@:     cld
rep     movsd                           ; Lade a als ULONG
        movsw                           ;  plus ein USHORT
        mov     cx,ax
        shl     cx,1
        lea     edi,a[ebx+ecx+2]
        mov     word ptr [edi],0
;
        pop     esi                     ; Hole Seg:Offs von b
        lea     edi,b[ebx]              ; Zieloffset ist b[bx]
        xor     ecx,ecx
        mov     cx,dx                   ; l(b) in cx
        shr     cx,1
        jnc     @F
        inc     cx
@@:     cld
rep     movsd                           ; Lade b als ULONG
        movsw                           ;  plus ein USHORT
        mov     cx,dx
        shl     cx,1
        lea     edi,b[ebx+ecx+2]
        mov     word ptr [edi],0
;
;>>>>>> Adressierung des lokalen Arbeitsspeichers vorbereiten
;
        push    ebp                     ; bp retten
        mov     ebp,ebx                 ; Basisind. Adr. in ss
;
;>>>>>> Reduziere Darstellung der Operanden um fhrende Nullen
;
        xor     eax,eax
        xor     ecx,ecx
        mov     ax,a[ebp]               ; #USHORTs in ax
        test    ax,ax ;cmp     ax,0
        je      next1
        shl     eax,1                   ; Index auf niederstwertiges Byte
                                        ; der hoechstwertigen Stelle
        mov     esi,eax
@@:     cmp     word ptr a[ebp][esi],0  ; USHORT = 0 ?
        jne     @F                      ; Nein, dann fertig
        sub     esi,2                   ; Sonst: eine Stelle zurueck
        test    esi,esi ;cmp     esi,0                   ; Index = 0?
        je      mazer1                  ; Dann Zahl = 0
        jmp     @B                      ; Naechster Vergleich
@@:                                     ; Laenge gefunden
mazer1: shr     esi,1                   ; #Stellen
        mov     eax,esi                 ; Basisadresse des Operanden holen
        mov     word ptr a[ebp],ax      ; #Stellen speichern

;Operand2
next1:  mov     ax,b[ebp]               ; #USHORTs in ax
        test    ax,ax ;cmp     ax,0
        je      end1
        shl     eax,1                   ; #Bytes, Index auf niederstwertiges Byte
                                        ; der hoechstwertigen Stelle
        mov     esi,eax
@@:     cmp     word ptr b[ebp][esi],0  ; Stelle = 0 ?
        jne     @F                      ; Nein, dann fertig
        sub     esi,2                   ; Sonst: eine Stelle zurueck
        test    esi,esi ;cmp     esi,0                   ; Index schon = 0?
        je      mazer2                  ; Dann Zahl = 0
        jmp     @B                      ; Naechster Vergleich
@@:                                     ; Laenge gefunden
mazer2: shr     esi,1                   ; #Stellen
        mov     eax,esi                 ; Basisadresse des Operanden holen
        mov     b[ebp],ax               ; #Stellen speichern

end1:
;
;>>>>>> Test a < b ?
;
@@:     xor     ecx,ecx ;mov     ecx,0
        mov     cx,a[ebp]
        cmp     cx,b[ebp]               ; l(a) - l(b)
        jnc     div03
        jmp     dra
div03:  jne     div05                   ; Bei Gleichheit Stellen vergleichen
        mov     eax,ecx                 ; cl = l(a) = l(b)
        shl     eax,1
        lea     esi,a[ebp]
        lea     edi,b[ebp]
        add     esi,eax                 ; si zeigt auf hoechste Stelle von a
        add     edi,eax                 ; di zeigt auf hoechste Stelle von b
        std
repe    cmpsw
        cld
        jnc     div05                   ; Falls kein carry ist q positiv
        jmp     dra                     ; Sonst q := 0 und r := a
div05:  mov     ax,b[ebp]
        shr     ax,1
        jnc     div05a
        inc     ax
div05a: cmp     ax,1                    ; Test ob l(b) = 1
        jne     dstart
        jmp     dshort                  ; Dann kuerzere Variante
;
;>>>>>> Beginn der Division
;
dstart: xor     eax,eax ;mov     eax,0
        mov     ax,b[ebp]
        shr     ax,1
        jnc     dm0
        inc     ax
dm0:    mov     esi,eax
        shl     esi,2
        sub     esi,2                   ; Zeiger auf loByte von b[l(b)] (hoechstes ULONG)
        cmp     esi,10                  ; Hat Divisor 3 oder mehr ULONG-Stellen?
        jb      dm1a                    ; Zwei Stellen hat er mindestens!

        mov     ebx,b[ebp][esi]         ; b[l(b)] in bx
        mov     eax,b[ebp][esi-4]
        mov     edx,b[ebp][esi-8]
;
        xor     ecx,ecx ;mov     ecx,0                   ; Zaehler vorbereiten
;
@@:     cmp     ebx,080000000h          ; v1 >= 2^31 ?
        jae     dm2                     ; Falls nicht...
        inc     cx
        clc
        rcl     edx,1
        rcl     eax,1
        rcl     ebx,1                   ; v1 = v1 * 2
        jmp     short @B                ; bis v1 >= 2^31

dm1a:   mov     ebx,b[ebp][esi]         ; b[l(b)] in bx
        mov     eax,b[ebp][esi-4]
;
        xor     ecx,ecx ;mov     ecx,0                   ; Zaehler vorbereiten
;
@@:     cmp     ebx,080000000h          ; v1 >= 2^31 ?
        jae     dm2                     ; Falls nicht...
        inc     cx
        clc
        rcl     eax,1
        rcl     ebx,1                   ; v1 = v1 * 2
        jmp     short @B                ; bis v1 >= 2^31
;
dm2:    mov     d[ebp],cx               ; Exponenten speichern
        mov     v1[ebp],ebx             ; v1
        mov     v2[ebp],eax             ; v2

dm3:    inc     word ptr a[ebp]         ; l(a) = l(a) + 2
        inc     word ptr a[ebp]
        xor     eax, eax ;mov     eax,0
        mov     ax,a[ebp]
        shr     ax,1
        jnc     dm3a
        inc     ax
dm3a:   mov     esi,eax
        shl     esi,2
        sub     esi,2
        mov     dword ptr a[ebp][esi],0 ; a[l(a)] = 0
;
d2:     xor     eax,eax ;mov     eax,0
        mov     ax,b[ebp]
        shr     ax,1
        jnc     d2a
        inc     ax
d2a:    mov     esi,eax
        shl     esi,2
        sub     esi,2                   ; si zeigt auf loByte von b[l(b)]
;
        xor     ecx,ecx ;mov     ecx,0
        mov     cx,a[ebp]
        shr     cx,1
        jnc     d2b
        inc     cx
d2b:    mov     edi,ecx
        shl     edi,2
        sub     edi,2                   ; di zeigt auf loByte von a[l(a)]
        push    edi
        sub     edi,esi
        inc     edi                     ; di zeigt auf hiByte von a[l(a)-l(b)]
        mov     ecx,edi
        shr     ecx,2                   ; Zaehler in cx
        pop     esi                     ; si zeigt auf loByte von a[l(a)] (j+l(b))
        sub     edi,3                   ; di zeigt auf loByte von a[l(a)-l(b)] (j)
        push    edi                     ; moegliche Laenge von q
;
;-----> Divisionsschleife
;
d3:     cmp     esi,14
        jb      d3a
        push    edi
        mov     edx,a[ebp][esi]         ; uj   ungeshifet in dx (hiWord)
        mov     eax,a[ebp][esi-4]       ; uj+1 ungeshiftet
        mov     ebx,a[ebp][esi-8]       ; uj+2 ungeshiftet
        mov     edi,a[ebp][esi-12]      ; uj+3 ungeshiftet
        push    ecx
        mov     cx,d[ebp]
@@:     test    cx,cx ;cmp     cx,0
        je      @F
        rcl     edi,1
        rcl     ebx,1
        rcl     eax,1
        rcl     edx,1
        dec     cx
        jmp     @B
@@:     pop     ecx
        pop     edi
        jmp     qdach
;
d3a:    mov     edx,a[ebp][esi]         ; uj   ungeshiftet
        mov     eax,a[ebp][esi-4]       ; uj+1 ungeschiftet
        mov     ebx,a[ebp][esi-8]       ; uj+2 ungeshiftet
        push    ecx
        mov     cx,d[ebp]
@@:     test    cx,cx ;cmp     cx,0
        je      @F
        rcl     ebx,1
        rcl     eax,1
        rcl     edx,1
        dec     cx
        jmp     @B
@@:     pop     ecx
;
;-----> qdach berechnen und testen
;
qdach:  mov uj1[ebp],eax       ;  geshiftetes uj+1 speichern
        mov uj2[ebp],ebx       ;  geshiftetes uj+2 speichern
        mov ebx,v1[ebp]        ;  v1 in bx
        cmp ebx,edx
        je bm1                 ; Falls v1 = uj q = b - 1
;
        div ebx                ; eax <- qdach
;                              ; edx <- rdach: = uj*b + uj+1 -qdach*v1 = (uj*b + uj+1) mod v1
        mov q1[ebp],eax        ; qdach zwischenspeichern
        test eax,eax ;cmp eax,0
        jne @F
        jmp dml                ; Falls q=0 naechste Stelle
;
bm1:    mov eax,0ffffffffh     ; eax <- qdach
        mov q1[ebp],eax        ; qdach zwischenspeichern
        mov edx,uj1[ebp]       ; edx <- uj+1
        add edx,v1[ebp]        ; edx <- uj+1 + v1 =: rdach
        jc d4                  ; rdach >= b => v2 * qdach < rdach * b
;
@@:     mov ebx,edx            ; ebx <- rdach
        mul dword ptr v2[ebp]  ; eax <- low(qdach * v2), edx <- high(qdach * v2)
        cmp edx,ebx            ; rdach >= high(qdach * v2) ?
        jb  d4                 ; CF == 1?  Dann fertig
        ja  @F                 ; rdach < high(qdach * v2) => reduziere qdach
        cmp eax,uj2[ebp]       ; uj+2 >= low(qdach * v2) ?
        jbe d4                 ; CF == 1 OR ZF == 1? Dann fertig
;
@@:     dec dword ptr q1[ebp]  ; Korrektur qdach--
        add ebx,v1[ebp]        ; ebx <- rdach + v1
        jc d4                  ; rdach >= b => v2 * qdach < rdach * b
        sub eax,v2[ebp]        ; (qdach * v2) - v2
        sbb edx,0
;
        cmp edx,ebx            ; rdach >= high(qdach * v2) ?
        jb  d4                 ; CF == 1?  Dann fertig
        ja  @B                 ; rdach < high(qdach * v2) => reduziere qdach
        cmp eax,uj2[ebp]       ; uj+2 >= low(qdach * v2) ?
        ja  @B                 ; CF==0 + ZF==0? => Reduktion von qdach wiederholen
;
d4:     push    ecx            ; Hauptzaehler retten
        push    esi            ; j + l(b) retten
        push    edi            ; j retten
        mov     cx,b[ebp]      ; Zaehler mit l(b) laden
        shr     cx,1
        jnc     d4a
        inc     cx
d4a:    mov     ebx,q1[ebp]    ; q in bx
        mov     esi,2
        xor     edx,edx ;mov     edx,0          ; Dummy šbertrag
;
;-----> Multiplikation und Subtraktion
;
dms:    push    edx
        mov     eax,ebx                 ; q in ax
        mul     dword ptr b[ebp][esi]   ; hi:dx lo:ax
        sub     a[ebp][edi],eax
        adc     edx,0                   ; Carry zum naechsten Subtrahenden
        pop     eax
        sub     a[ebp][edi],eax
        adc     edx,0                   ; Carry zum naechsten Subtrahenden
        inc     edi
        inc     edi
        inc     edi
        inc     edi
        inc     esi
        inc     esi
        inc     esi
        inc     esi
        loop    dms
        sub     a[ebp][edi],edx
        jnc     dnc                     ; Korrektur falls šbertrag
;
;-----> Korrektur
;
        pop     edi
        push    edi
        xor     ecx,ecx
        mov     cx,b[ebp]               ; Zaehler mit l(b) laden
        shr     cx,1
        jnc     d4b
        inc     cx
d4b:    mov     esi,2
        clc
d5:     mov     eax,b[ebp][esi]         ; b[i]
        adc     a[ebp][edi],eax         ; a[j+i] + b[i]
        inc     edi
        inc     edi
        inc     edi
        inc     edi
        inc     esi
        inc     esi
        inc     esi
        inc     esi
        loop    d5                      ; Aeussere Schleife
        jnc     d51
        inc     dword ptr a[ebp][edi]
d51:    dec     dword ptr q1[ebp]       ; q = q - 1
;
;
dnc:    pop     edi                     ; Zeiger und Zaehler holen
        pop     esi
        pop     ecx
dml:    mov     eax,q1[ebp]             ; q holen
        mov     q[ebp][edi],eax         ; q[j] = q
        sub     edi,4
        sub     esi,4
        dec     ecx                     ; Innere Schleife
        jz      d6
        jmp     d3
;
d6:     pop     edi                     ; l(a)-l(b)-1 in ULONGs holen
        add     edi,2                   ; di zeigt auf loByte des letzten USHORT
        xor     eax,eax ;mov     eax,0
@@:     cmp     ax,q[ebp][edi]          ; q[l(a)-l(b)]=0 ?
        jne     d7                      ; Falls ja ...
        ;dec     edi
        sub     edi,2 ;dec     edi                     ; ... l(q) = l(a) - l(b) - 1
        test    edi,edi ;cmp     edi,0
        jne     short @B
d7:     mov     edx,edi
        shr     edx,1                   ; l(q) (#USHORTs) in dx
        mov     q[ebp],dx               ; l(q) speichern
;
;-----> Laenge des Restes finden
;
        xor     eax,eax ;mov     eax,0
        mov     ax,b[ebp]
        mov     ecx,eax
        shl     ax,1
        mov     edi,eax
        xor     ebx,ebx ;mov     ebx,0
        inc     edi
        inc     edi
d8:     dec     edi
        dec     edi                     ; di zeigt auf loByte von a[l(b)]
        cmp     a[ebp][edi],bx
        loope   d8
        jz      d9                      ; Falls a[bp][di]!= 0 muss ...
        inc     cx                      ; ... l korrigiert werden
d9:     mov     a[ebp],cx               ; l(r) speichern
;
;-----> Ergebnisse speichern
;
dstore: mov     esi,ebp
        mov     ebx,ebp
        pop     ebp
        push    esi
        add     esi,q
        mov     edi,_qot                ; Ziel-Offset des Quotienten
        xor     ecx,ecx ;mov     ecx,0
        mov     cx,[esi]
        shr     cx,1
        jnc     @F
        inc     cx
@@:     cld
        test    cx,cx ;cmp     cx,0
        je      qzero
rep     movsd
qzero:  movsw
;
;-----> Rest speichern
;
        pop     esi
drs:
        mov     edi,_rst
        add     esi,a
        xor     ecx,ecx ;mov     ecx,0
        mov     cx,[esi]
        shr     cx,1
        jnc     @F
        inc     cx
@@:     cld
        cmp     cx,0
        je      rzero
rep     movsd
rzero:  movsw
        ;mov     eax,0                   ; Return-Value = 0: alles OK !!! __FAST__ => kein Rückgabewert
;
;*******************************************************************************

divret: 
        pop     esi
        pop     edi
        pop     ebx
        mov     esp,ebp
        pop     ebp
        ret
;
;
;-----> Behandlung des Falles q = 0 und r = a
;
dra:    mov     esi,ebp
        mov     ebx,ebp
        pop     ebp
        mov     edi,_qot
        mov     word ptr [edi],0        ; q = 0
        jmp     short drs               ; Restparameter belegen
;
;
;>>>>>> Kurze Division
;
;
dshort: xor     ecx,ecx ;mov     ecx,0
        mov     cx,a[ebp]
        shr     cx,1
        jnc     dsh0
        inc     cx
dsh0:   mov     edi,ecx
        shl     edi,2
        sub     edi,2
        xor     edx,edx ;mov     edx,0
        mov     ebx,b[ebp+2]
dsh1:   mov     eax,a[ebp][edi]
        div     ebx
        mov     q[ebp][edi],eax
        sub     edi,4
        loop    dsh1
dsh2:   xor     ecx,ecx ;mov     ecx,0
        mov     cx,a[ebp]
        mov     esi,ecx
        shl     esi,1
@@:     mov     bx,q[ebp][esi]
        test    bx,bx ;cmp     bx,0
        jne     dsh3
        dec     esi
        dec     esi
        dec     ecx
        test    cx,cx ;cmp     cx,0
        jne     short @B
dsh3:   mov     q[ebp],cx
        mov     a[ebp+2],edx
        mov     word ptr a[ebp],2
        cmp     word ptr a[ebp+4],0
        jne     dst
        mov     word ptr a[ebp],1
        cmp     word ptr a[ebp+2],0
        jne     dst
        mov     word ptr a[ebp],0
dst:    jmp     dstore
;
divid  endp
END

Boah, du bist ja voll der Freak...aber wozu ist dieser Code da?

TriPhoenix

Lass mich mal raten....Division für Große Integer nach dem Buch "Kryptographie in C und C++" von Welschenbach? Da kommt mir der Code nämlich sehr bekannt vor

Aber da kann ich dir auch net optimieren helfen, geht aber bestimmt

Sag mir vielleicht erst mal kurz in ein paar Sätzen, was der Code tuen soll (evtl. woher du ihn hast), damit ich den ganzen Code etwas motivirter lesen kann (Es stört mich wenn ich ihn erst ein paarmal lesen muß um überhaupt zu verstehen was er soll).

Tipp zu Optimierung allgem.: Schreibe den Code in C jage das durch einen Compiler, den du das ganze optimieren lässt, und guckst dir dessen asm Ausgabe an. Denn mittlerweile können die Compiler schon sehr gut optimieren, unzwar auch für die verschiedenen Erweiterungen (MMX, 3DNOW!, SSE I+II, ...).

mfg
-bg-

Der Code dividiert zwei Zahlen und gibt das Ergebnis sowie den Rest aus.
Die Zahlen sind vom Typ "CLINT":

typedef unsigned short clint;
typedef clint CLINT[267];

Das C-Interface ist also:
void div_l(CLINT d1_l, CLINT d2_l, CLINT quot_l, CLINT rest_l);

Die ASN-Version geht nach der cdecl Aufrufkonvention.

C-Version:

div_l (CLINT d1_l, CLINT d2_l, CLINT quot_l, CLINT rest_l)
{
  register clint *rptr_l, *bptr_l;
  CLINT b_l;
  clint r_l[2 + (CLINTMAXDIGIT << 1)];  /* Erlaube doppelt langen Rest + 1 Stelle */
  clint *qptr_l, *msdptrb_l, *msdptrr_l, *lsdptrr_l;
  USHORT bv, rv, qdach, ri, ri_1, ri_2, bn, bn_1;
  ULONG right, left, rdach, borrow, carry, sbitsminusd;
  unsigned int d = 0;
  int i;

  cpy_l (r_l, d1_l);
  cpy_l (b_l, d2_l);

  if (EQZ_L (b_l))
    {
      PURGEVARS_L ((1, sizeof (r_l), r_l));
      ISPURGED_L  ((1, sizeof (r_l), r_l));

      return E_CLINT_DBZ;           /* Division durch Null */
    }

  if (EQZ_L (r_l))
    {
      SETZERO_L (quot_l);
      SETZERO_L (rest_l);

      PURGEVARS_L ((1, sizeof (b_l), b_l));
      ISPURGED_L  ((1, sizeof (b_l), b_l));

      return E_CLINT_OK;
    }

  i = cmp_l (r_l, b_l);

  if (i == -1)
    {
      cpy_l (rest_l, r_l);
      SETZERO_L (quot_l);

      PURGEVARS_L ((2, sizeof (b_l), b_l,
                       sizeof (r_l), r_l));
      ISPURGED_L  ((2, sizeof (b_l), b_l,
                       sizeof (r_l), r_l));
      return E_CLINT_OK;
    }
  else if (i == 0)
    {
      SETONE_L (quot_l);
      SETZERO_L (rest_l);

      PURGEVARS_L ((2, sizeof (b_l), b_l,
                       sizeof (r_l), r_l));
      ISPURGED_L  ((2, sizeof (b_l), b_l,
                       sizeof (r_l), r_l));
      return E_CLINT_OK;
    }

  if (DIGITS_L (b_l) == 1)
    {
      goto shortdiv;
    }

  /* Schritt 1 */
  msdptrb_l = MSDPTR_L (b_l);

  bn = *msdptrb_l;
  while (bn < BASEDIV2)
    {
      d++;
      bn <<= 1;
    }

  sbitsminusd = (int)BITPERDGT - d;

  if (d > 0)
    {
      bn += *(msdptrb_l - 1) >> sbitsminusd;

      if (DIGITS_L (b_l) > 2)
        {
          bn_1 = (USHORT)((*(msdptrb_l - 1) << d) + (*(msdptrb_l - 2) >> sbitsminusd));
        }
      else
        {
          bn_1 = (USHORT)(*(msdptrb_l - 1) << d);
        }
    }
  else
    {
      bn_1 = (USHORT)(*(msdptrb_l - 1));
    }

  /* Schritte 2 und 3 */
  msdptrr_l = MSDPTR_L (r_l) + 1;
  lsdptrr_l = MSDPTR_L (r_l) - DIGITS_L (b_l) + 1;
  *msdptrr_l = 0;

  qptr_l = quot_l + DIGITS_L (r_l) - DIGITS_L (b_l) + 1;

  /* Schritt 4 */
  while (lsdptrr_l >= LSDPTR_L (r_l))
    {
      ri = (USHORT)((*msdptrr_l << d) + (*(msdptrr_l - 1) >> sbitsminusd));

      ri_1 = (USHORT)((*(msdptrr_l - 1) << d) + (*(msdptrr_l - 2) >> sbitsminusd));

      if (msdptrr_l - 3 > r_l)
        {
          ri_2 = (USHORT)((*(msdptrr_l - 2) << d) + (*(msdptrr_l - 3) >> sbitsminusd));
        }
      else
        {
          ri_2 = (USHORT)(*(msdptrr_l - 2) << d);
        }

      if (ri != bn)               /* fast immer */
        {
          qdach = (USHORT)((rdach = ((ULONG)ri << BITPERDGT) + (ULONG)ri_1) / bn);
          right = ((rdach = (rdach - (ULONG)bn * qdach)) << BITPERDGT) + ri_2;

          /* test qdach */

          if ((left = (ULONG)bn_1 * qdach) > right)
            {
              qdach--;
              if ((rdach + bn) < BASE)
                  /* sonst bn_1 * qdach < rdach * b_l */
                {
                  if ((left - bn_1) > (right + ((ULONG)bn << BITPERDGT)))
                    {
                      qdach--;
                    }
                }
            }
        }
      else                        /* ri == bn, seltenerer Fall */
        {
          qdach = BASEMINONE;
          right = ((ULONG)(rdach = (ULONG)bn + (ULONG)ri_1) << BITPERDGT) + ri_2;
          if (rdach < BASE)       /* sonst ist bn_1 * qdach < rdach * b_l */
            {
              /* test qdach */

              if ((left = (ULONG)bn_1 * qdach) > right)
                {
                  qdach--;
                  if ((rdach + bn) < BASE)
                      /* sonst ist bn_1 * qdach < rdach * b_l */
                    {
                      if ((left - bn_1) > (right + ((ULONG)bn << BITPERDGT)))
                        {
                          qdach--;
                        }
                    }
                }
            }
        }

      /* Schritt 5 */
      borrow = BASE;
      carry = 0;
      for (bptr_l = LSDPTR_L (b_l), rptr_l = lsdptrr_l; bptr_l <= msdptrb_l; bptr_l++, rptr_l++)
        {
          if (borrow >= BASE)
            {
              *rptr_l = (USHORT)(borrow = ((ULONG)(*rptr_l) + BASE -
                         (ULONG)(USHORT)(carry = (ULONG)(*bptr_l) *
                          qdach + (ULONG)(USHORT)(carry >> BITPERDGT))));
            }
          else
            {
              *rptr_l = (USHORT)(borrow = ((ULONG)(*rptr_l) + BASEMINONEL -
                                (ULONG)(USHORT)(carry = (ULONG)(*bptr_l) *
                               qdach + (ULONG)(USHORT)(carry >> BITPERDGT))));
            }
        }

      if (borrow >= BASE)
        {
          *rptr_l = (USHORT)(borrow = ((ULONG)(*rptr_l) + BASE -
                             (ULONG)(USHORT)(carry >> BITPERDGT)));
        }
      else
        {
          *rptr_l = (USHORT)(borrow = ((ULONG)(*rptr_l) + BASEMINONEL -
                                    (ULONG)(USHORT)(carry >> BITPERDGT)));
        }

      /* Schritt 6 */
      *qptr_l = qdach;

      if (borrow < BASE)
        {
          carry = 0;
          for (bptr_l = LSDPTR_L (b_l), rptr_l = lsdptrr_l; bptr_l <= msdptrb_l; bptr_l++, rptr_l++)
            {
              *rptr_l = (USHORT)(carry = ((ULONG)(*rptr_l) + (ULONG)(*bptr_l) +
                                          (ULONG)(USHORT)(carry >> BITPERDGT)));
            }
          *rptr_l += (USHORT)(carry >> BITPERDGT);
          (*qptr_l)--;
        }

      /* Schritt 7 */
      msdptrr_l--;
      lsdptrr_l--;
      qptr_l--;
    }

  /* Schritt 8 */
  SETDIGITS_L (quot_l, DIGITS_L (r_l) - DIGITS_L (b_l) + 1);
  RMLDZRS_L (quot_l);

  SETDIGITS_L (r_l, DIGITS_L (b_l));
  cpy_l (rest_l, r_l);

  return;

  /* Kurze Division */
  shortdiv:

  rv = 0;
  bv = *LSDPTR_L (b_l);
  for (rptr_l = MSDPTR_L (r_l), qptr_l = quot_l + DIGITS_L (r_l); rptr_l >= LSDPTR_L (r_l); rptr_l--, qptr_l--)
    {
      *qptr_l = (USHORT)((rdach = ((((ULONG)rv) << BITPERDGT) +
                                           (ULONG)*rptr_l)) / bv);
      rv = (USHORT)(rdach - (ULONG)bv * (ULONG)*qptr_l);
    }

  SETDIGITS_L (quot_l, DIGITS_L (r_l));

  RMLDZRS_L (quot_l);
  u2clint_l (rest_l, rv);
}