Announcement

Collapse
No announcement yet.

Two benchmarked memory copy algorithms

Collapse
X
 
  • Filter
  • Time
  • Show
Clear All
new posts

    Two benchmarked memory copy algorithms

    There are two different memory copy algorithms here. The first is a simple-to-use byte copy (rep movsb) with no alignment requirements that is more than fast enough in most instances. The second is an SSE2 copy that is faster in most instances but requires 16-byte aligned memory for both the source and the destination. The rep movsb code is the general-purpose copy; the SSE2 version is for when you are after the fastest copy you can get for streaming and other demanding tasks.
    Code:
    ' ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
    
        #include "\basic\include\win32api.inc"
    
        MACRO memsize = 1024*1024*768
    
    ' ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
    
     FUNCTION PBmain as LONG
      ' Benchmark driver: times 20 passes of xmmcopya and 20 passes of bcopy
      ' over two memsize-byte buffers and prints the elapsed GetTickCount
      ' milliseconds for each.
      '
      ' xmmcopya uses movdqa/movntdq, which require 16-byte aligned addresses.
      ' GlobalAlloc only documents 8-byte alignment, so each buffer is
      ' over-allocated by 16 bytes and the working pointer is rounded up to the
      ' next 16-byte boundary. The raw handles are kept for GlobalFree.
    
        LOCAL psrc as DWORD             ' 16-byte aligned source address
        LOCAL pdst as DWORD             ' 16-byte aligned destination address
        LOCAL tcnt as DWORD             ' tick count / elapsed milliseconds
        LOCAL lcnt as DWORD             ' pass counter for the timed loops
        LOCAL rsrc as DWORD             ' raw GlobalAlloc result for the source
        LOCAL rdst as DWORD             ' raw GlobalAlloc result for the destination
    
        rsrc = GlobalAlloc(%GMEM_FIXED or %GMEM_ZEROINIT,memsize+16)
        rdst = GlobalAlloc(%GMEM_FIXED or %GMEM_ZEROINIT,memsize+16)
    
      ' GlobalAlloc returns 0 on failure; never run the copies on a null buffer
        IF rsrc = 0 OR rdst = 0 THEN
            IF rsrc THEN GlobalFree rsrc
            IF rdst THEN GlobalFree rdst
            StdOut "Memory allocation failed, benchmark not run"
            waitkey$
            EXIT FUNCTION
        END IF
    
      ' round each raw address up to the next multiple of 16
        psrc = rsrc + 15
        psrc = psrc - (psrc MOD 16)
        pdst = rdst + 15
        pdst = pdst - (pdst MOD 16)
    
        StdOut $CRLF+"Running timing for XMM copy and REP MOVSB copy"+$CRLF
    
        SetPriorityClass GetCurrentProcess,%HIGH_PRIORITY_CLASS
    
        xmmcopya psrc,pdst,memsize      ' dummy run to defeat cache effects
        bcopy psrc,pdst,memsize         ' dummy run to defeat cache effects
    
      ' -----------------------------
      ' timed run: SSE2 aligned copy
      ' -----------------------------
    
        ! cpuid
        SleepEx 100,0
    
        tcnt = GetTickCount
    
        lcnt = 20
    
      lbl1:
        xmmcopya psrc,pdst,memsize
        ! sub lcnt, 1
        ! jnz lbl1
    
        tcnt = GetTickCount - tcnt
    
        StdOut "XMM Copy "+format$(tcnt)+" ms"
    
      ' -----------------------------
      ' timed run: rep movsb copy
      ' -----------------------------
    
        ! cpuid
        SleepEx 100,0
    
        tcnt = GetTickCount
    
        lcnt = 20
    
      lbl2:
        bcopy psrc,pdst,memsize
        ! sub lcnt, 1
        ! jnz lbl2
    
    
        tcnt = GetTickCount - tcnt
    
        StdOut "rep movsb copy "+format$(tcnt)+" ms"
    
      ' -----------------------------
    
        SetPriorityClass GetCurrentProcess,%NORMAL_PRIORITY_CLASS
    
      ' free the raw allocations, not the aligned working pointers
        GlobalFree rsrc
        GlobalFree rdst
    
        waitkey$
    
     End FUNCTION
    
    ' ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
    
    FUNCTION xmmcopya(ByVal src as DWORD, ByVal dst as DWORD, ByVal cnt as DWORD) as DWORD
      ' ------------------------
      ' SSE2 aligned memory copy
      ' ------------------------
      ' Copies cnt bytes from address src to address dst.
      '
      '   src, dst : source / destination addresses. When cnt > 16 both are
      '              accessed with movdqa / movntdq, so both must be 16-byte
      '              aligned.
      '   cnt      : number of bytes to copy. Counts of 16 or less are handled
      '              by a plain rep movsb.
      '
      ' For cnt > 16 the data is moved in 16-byte blocks with non-temporal
      ' stores, followed by a byte loop for the remaining (cnt AND 15) bytes.
      ' An sfence is issued after the streaming stores because movntdq is
      ' weakly ordered. No value is assigned to the function result.
      '
      ' NOTE(review): esi, edi and ebx are modified without an explicit
      ' save/restore here - confirm the compiler preserves them per procedure.
        #REGISTER NONE
    
        PREFIX "!"
    
        mov esi, src
        mov edi, dst
        mov ecx, cnt
        cmp ecx, 16
        jbe bypass                      ' small copy: rep movsb only
        shr ecx, 4                      ' number of whole 16-byte blocks
        xor ebx, ebx                    ' running byte offset
    
      lpst:
        movdqa xmm0, [esi+ebx]          ' aligned read
        movntdq [edi+ebx], xmm0         ' aligned non-temporal write
        add ebx, 16
        sub ecx, 1
        jnz lpst
    
        sfence                          ' order the streaming stores before returning
    
        mov eax, cnt
        and eax, 15                     ' leftover byte count, sets ZF when zero
        jz bye
    
      lbl:
        mov dl, [esi+ebx]               ' copy the tail one byte at a time
        mov [edi+ebx], dl
        add ebx, 1
        sub eax, 1
        jnz lbl
        jmp bye
    
      bypass:
        rep movsb
    
      bye:
        END PREFIX
    
    END FUNCTION
    
    ' ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
    
    FUNCTION bcopy(ByVal src as DWORD, ByVal dst as DWORD, ByVal cnt as DWORD) as DWORD
      ' --------------------------------------------------
      ' General purpose byte copy, no alignment required.
      '
      '   src : address to read from
      '   dst : address to write to
      '   cnt : number of bytes to copy
      '
      ' Loads the count and both addresses into ecx/edi/esi
      ' and lets a single "rep movsb" move the bytes.
      ' No value is assigned to the function result.
      ' --------------------------------------------------
        #REGISTER NONE
    
        ! mov ecx, cnt                  ; byte count for the rep prefix
        ! mov edi, dst                  ; destination address
        ! mov esi, src                  ; source address
        ! rep movsb                     ; copy ecx bytes from [esi] to [edi]
    
    END FUNCTION
    
    ' ¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤¤
    hutch at movsd dot com
    The MASM Forum - SLL Modules and PB Libraries

    http://www.masm32.com/board/index.php?board=69.0

    #2
    Thank you Sir Steve, the results of 3 runs on my laptop are

    Code:
    XMM copy            -->    2234  ms   , 2250  ms , 2125 ms
    
    Rep movsb copy  -->    2578 ms ,    2547  ms , 2547 ms

    XMM copy is much faster


    Comment

    Working...
    X
    😀
    🥰
    🤢
    😎
    😡
    👍
    👎