Improving memcpy performance with SIMD instruction set

  amd-processor, c++, simd, x86

I was introduced to the SIMD instruction set just recently and, as one of my pet projects, thought about using it to implement memcpy to see whether it performs better than the standard memcpy. What I observe is that the standard memcpy always performs better than my SIMD-based custom memcpy. I expected SIMD to have some advantage here. I am posting my code and compilation instructions below:

Compilation command:

g++ --std=c++11 memcpy_test.cpp  -mavx2 -O3


#include <iostream>
#include <cstdint>
#include <immintrin.h>
#include <chrono>
#include <cstring>
#include <stdlib.h>

using namespace std;

/**
 * Copies `size` bytes from `src` to `dst`.
 *
 * When both pointers are 32-byte aligned, the bulk of the data is moved in
 * 256-bit chunks using AVX2 non-temporal load/store intrinsics, and any
 * trailing bytes (size % 32) are copied with std::memcpy. Otherwise a
 * diagnostic is printed and the whole copy is delegated to std::memcpy.
 *
 * Like memcpy, the buffers must not overlap (copying a buffer onto itself is
 * a no-op).
 *
 * @param dst  destination buffer, at least `size` bytes
 * @param src  source buffer, at least `size` bytes
 * @param size number of bytes to copy
 */
#if defined(__GNUC__)
// Lets the AVX2 intrinsics compile even when the file is built without
// -mavx2; the CPU must still support AVX2 at run time.
__attribute__((target("avx2")))
#endif
void mymemcpy(char* dst, char* src, size_t size)
{
    if (dst == src) {
        return;
    }

    const std::uintptr_t align_mask = sizeof(__m256i) - 1;  // 0x1F
    const std::uintptr_t combined =
        reinterpret_cast<std::uintptr_t>(src) | reinterpret_cast<std::uintptr_t>(dst);
    if ((combined & align_mask) != 0) {
        std::cout << "NOT ALIGNED" << static_cast<void*>(src) << static_cast<void*>(dst) << std::endl;
        std::memcpy(dst, src, size);
        return;
    }

    const __m256i* in = reinterpret_cast<const __m256i*>(src);
    __m256i* out = reinterpret_cast<__m256i*>(dst);
    const size_t vectors = size / sizeof(__m256i);

    for (size_t i = 0; i < vectors; ++i) {
        // NOTE(review): per Intel's documentation the non-temporal hint on
        // the load only matters for write-combining memory; on ordinary
        // memory this presumably behaves like a plain aligned load.
        const __m256i chunk = _mm256_stream_load_si256(in + i);
        _mm256_stream_si256(out + i, chunk);
    }

    // Non-temporal stores are weakly ordered: fence so the copied data is
    // globally visible before the function returns.
    _mm_sfence();

    // Copy only the tail that did not fill a whole 256-bit vector.
    const size_t copied = vectors * sizeof(__m256i);
    if (copied != size) {
        std::memcpy(dst + copied, src + copied, size - copied);
    }
}

// Size in bytes of each benchmark buffer (1 MiB). Parenthesized so the macro
// expands safely inside larger expressions such as `x / DATA_MB`.
#define DATA_MB (1 * 1024 * 1024)

int main()
    using namespace std::chrono;
    char *source1 = reinterpret_cast<char *>(aligned_alloc(0x1F, DATA_MB*sizeof(char))); // 2 gb data 
    memset(source1, 0xF, DATA_MB*sizeof(char));
    char *destination1 = reinterpret_cast<char *>(aligned_alloc(0x1F, DATA_MB*sizeof(char))); // 2 gb data
    memset(destination1, 0x00, DATA_MB*sizeof(char));
    cout << "Standard memcpy" << endl;
    auto start1 = high_resolution_clock::now();
    memcpy(destination1, source1, (DATA_MB*sizeof(char)));
    auto stop1 = high_resolution_clock::now();
    auto duration_std = duration_cast<nanoseconds>(stop1 - start1);
    cout << duration_std.count() << endl;

    /* New buffers to avoid cache improvements (if it helps)*/

    char *source = reinterpret_cast<char *>(aligned_alloc(0x1F, DATA_MB*sizeof(char))); // 2 gb data 
    memset(source, 0xF, DATA_MB*sizeof(char));
    char *destination = reinterpret_cast<char *>(aligned_alloc(0x1F, DATA_MB*sizeof(char))); // 2 gb data
    memset(destination, 0x0, DATA_MB*sizeof(char));
    cout << "Custom memcpy" << endl;
    auto start = high_resolution_clock::now();
    mymemcpy(destination, source, (DATA_MB*sizeof(char)));
    auto stop = high_resolution_clock::now();
    auto duration = duration_cast<nanoseconds>(stop - start);
    cout << duration.count() << endl;

    cout << (duration_std.count() < duration.count()?"standard ":"custom ") << "performed better by " << abs(duration_std.count() - duration.count()) << "ns" << endl;

Test machine:

model name      : AMD EPYC 7282 16-Core Processor
flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate sme ssbd mba sev ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif umip rdpid overflow_recov succor smca

What am I doing wrong here? What could possibly cause the standard memcpy to perform better than a SIMD-based custom memcpy?
I am very new to SIMD instructions and the features they provide, so please feel free to enlighten me even with the obvious.

Thanks a lot

Source: Windows Questions C++