8989#define  _sse2neon_likely (x ) __builtin_expect(!!(x), 1)
9090#define  _sse2neon_unlikely (x ) __builtin_expect(!!(x), 0)
9191#elif  defined(_MSC_VER )
92- #if  _MSVC_TRADITIONAL 
93- #error  Using the traditional MSVC preprocessor is not supported! Use /Zc:preprocessor instead.
94- #endif 
9592#ifndef  FORCE_INLINE 
9693#define  FORCE_INLINE  static inline
9794#endif 
184181    } while (0)
185182#endif 
186183
184+ #ifdef  _M_ARM 
185+ #define  vst1q_lane_s64 (a , b , c )
186+ #endif 
187+ 
187188/* Memory barriers 
188189 * __atomic_thread_fence does not include a compiler barrier; instead, 
189190 * the barrier is part of __atomic_load/__atomic_store's "volatile-like" 
@@ -202,8 +203,12 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
202203#elif  defined(__GNUC__ ) ||  defined(__clang__ )
203204    __atomic_thread_fence (__ATOMIC_SEQ_CST );
204205#else  /* MSVC */ 
206+ #ifdef  _M_ARM 
207+     __dmb (_ARM_BARRIER_ISH );
208+ #else 
205209    __dmb (_ARM64_BARRIER_ISH );
206210#endif 
211+ #endif 
207212}
208213
209214/* Architecture-specific build options */ 
@@ -268,7 +273,7 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
268273 * we have to perform syscall instead. 
269274 */ 
270275#if  (!defined(__aarch64__ ) &&  !defined(_M_ARM64 ))
271- #include  <sys/ time.h> 
276+ #include  <time.h> 
272277#endif 
273278
274279/* "__has_builtin" can be used to query support for built-in functions 
@@ -574,10 +579,10 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
574579/* Backwards compatibility for compilers with lack of specific type support */ 
575580
576581// Older gcc does not define vld1q_u8_x4 type 
577- #if  defined(__GNUC__ ) &&  !defined(__clang__ ) &&                          \
582+ #if  defined(_M_ARM )  ||  (defined( __GNUC__ ) &&  !defined(__clang__ ) &&     \
578583    ((__GNUC__  <= 12  &&  defined(__arm__ )) ||                            \
579584     (__GNUC__  ==  10  &&  __GNUC_MINOR__  <  3  &&  defined(__aarch64__ )) ||  \
580-      (__GNUC__  <= 9  &&  defined(__aarch64__ )))
585+      (__GNUC__  <= 9  &&  defined(__aarch64__ )))) 
581586FORCE_INLINE  uint8x16x4_t  _sse2neon_vld1q_u8_x4 (const  uint8_t  * p )
582587{
583588    uint8x16x4_t  ret ;
@@ -610,6 +615,9 @@ FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
610615}
611616#endif 
612617
618+ #if  defined(_M_ARM )
619+ #pragma  message("TODO: Windows ARM32: Port many SSE2NEON functions")
620+ #else 
613621#if  !defined(__aarch64__ ) &&  !defined(_M_ARM64 )
614622/* emulate vaddvq u8 variant */ 
615623FORCE_INLINE  uint8_t  _sse2neon_vaddvq_u8 (uint8x16_t  a )
@@ -645,6 +653,7 @@ FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
645653    return  vaddvq_u16 (a );
646654}
647655#endif 
656+ #endif 
648657
649658/* Function Naming Conventions 
650659 * The naming convention of SSE intrinsics is straightforward. A generic SSE 
@@ -1765,6 +1774,7 @@ FORCE_INLINE void _mm_free(void *addr)
17651774}
17661775#endif 
17671776
1777+ #ifndef  _M_ARM 
17681778FORCE_INLINE  uint64_t  _sse2neon_get_fpcr ()
17691779{
17701780    uint64_t  value ;
@@ -1808,6 +1818,7 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode()
18081818
18091819    return  r .field .bit24  ? _MM_FLUSH_ZERO_ON  : _MM_FLUSH_ZERO_OFF ;
18101820}
1821+ #endif 
18111822
18121823// Macro: Get the rounding mode bits from the MXCSR control and status register. 
18131824// The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, 
@@ -1826,6 +1837,8 @@ FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE()
18261837
18271838#if  defined(__aarch64__ ) ||  defined(_M_ARM64 )
18281839    r .value  =  _sse2neon_get_fpcr ();
1840+ #elif  defined(_M_ARM )
1841+     r .value  =  _MoveFromCoprocessor (10 ,7 , 1 ,0 ,0 );
18291842#else 
18301843    __asm__ __volatile__("vmrs %0, FPSCR"  : "=r" (r .value )); /* read */ 
18311844#endif 
@@ -2247,7 +2260,7 @@ FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
22472260FORCE_INLINE  void  _mm_prefetch (char  const  * p , int  i )
22482261{
22492262    (void ) i ;
2250- #if   defined( _MSC_VER ) 
2263+ #ifdef    _M_ARM64 
22512264    switch  (i ) {
22522265    case  _MM_HINT_NTA :
22532266        __prefetch2 (p , 1 );
@@ -2262,6 +2275,8 @@ FORCE_INLINE void _mm_prefetch(char const *p, int i)
22622275        __prefetch2 (p , 4 );
22632276        break ;
22642277    }
2278+ #elif  defined(_M_ARM )
2279+     // TODO 
22652280#else 
22662281    switch  (i ) {
22672282    case  _MM_HINT_NTA :
@@ -2348,6 +2363,7 @@ FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
23482363        vset_lane_u16 ((int ) vget_lane_u64 (t , 0 ), vdup_n_u16 (0 ), 0 ));
23492364}
23502365
2366+ #ifndef  _M_ARM 
23512367// Macro: Set the flush zero bits of the MXCSR control and status register to 
23522368// the value in unsigned 32-bit integer a. The flush zero may contain any of the 
23532369// following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF 
@@ -2379,6 +2395,7 @@ FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
23792395    __asm__ __volatile__("vmsr FPSCR, %0"  ::"r" (r ));        /* write */ 
23802396#endif 
23812397}
2398+ #endif 
23822399
23832400// Set packed single-precision (32-bit) floating-point elements in dst with the 
23842401// supplied values. 
@@ -2404,6 +2421,7 @@ FORCE_INLINE __m128 _mm_set_ps1(float _w)
24042421// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE 
24052422FORCE_INLINE  void  _MM_SET_ROUNDING_MODE (int  rounding )
24062423{
2424+ #ifndef  _M_ARM 
24072425    union  {
24082426        fpcr_bitfield  field ;
24092427#if  defined(__aarch64__ ) ||  defined(_M_ARM64 )
@@ -2442,6 +2460,7 @@ FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
24422460#else 
24432461    __asm__ __volatile__("vmsr FPSCR, %0"  ::"r" (r ));        /* write */ 
24442462#endif 
2463+ #endif 
24452464}
24462465
24472466// Copy single-precision (32-bit) floating-point element a to the lower element 
@@ -3206,6 +3225,7 @@ FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
32063225    return  _mm_move_sd (a , _mm_cmpeq_pd (a , b ));
32073226}
32083227
3228+ #ifndef  _M_ARM 
32093229// Compare packed double-precision (64-bit) floating-point elements in a and b 
32103230// for greater-than-or-equal, and store the results in dst. 
32113231// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd 
@@ -3247,6 +3267,7 @@ FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
32473267    return  vreinterpretq_m128d_u64 (vld1q_u64 (d ));
32483268#endif 
32493269}
3270+ #endif 
32503271
32513272// Compare packed signed 16-bit integers in a and b for greater-than, and store 
32523273// the results in dst. 
@@ -3275,6 +3296,7 @@ FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
32753296        vcgtq_s8 (vreinterpretq_s8_m128i (a ), vreinterpretq_s8_m128i (b )));
32763297}
32773298
3299+ #ifndef  _M_ARM 
32783300// Compare packed double-precision (64-bit) floating-point elements in a and b 
32793301// for greater-than, and store the results in dst. 
32803302// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd 
@@ -3358,6 +3380,7 @@ FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
33583380    return  vreinterpretq_m128d_u64 (vld1q_u64 (d ));
33593381#endif 
33603382}
3383+ #endif 
33613384
33623385// Compare packed signed 16-bit integers in a and b for less-than, and store the 
33633386// results in dst. Note: This intrinsic emits the pcmpgtw instruction with the 
@@ -3389,6 +3412,7 @@ FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
33893412        vcltq_s8 (vreinterpretq_s8_m128i (a ), vreinterpretq_s8_m128i (b )));
33903413}
33913414
3415+ #ifndef  _M_ARM 
33923416// Compare packed double-precision (64-bit) floating-point elements in a and b 
33933417// for less-than, and store the results in dst. 
33943418// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd 
@@ -3429,6 +3453,7 @@ FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
34293453    return  vreinterpretq_m128d_u64 (vld1q_u64 (d ));
34303454#endif 
34313455}
3456+ #endif 
34323457
34333458// Compare packed double-precision (64-bit) floating-point elements in a and b 
34343459// for not-equal, and store the results in dst. 
@@ -3456,6 +3481,7 @@ FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
34563481    return  _mm_move_sd (a , _mm_cmpneq_pd (a , b ));
34573482}
34583483
3484+ #ifndef  _M_ARM 
34593485// Compare packed double-precision (64-bit) floating-point elements in a and b 
34603486// for not-greater-than-or-equal, and store the results in dst. 
34613487// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd 
@@ -3756,6 +3782,7 @@ FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
37563782    return  (* (double  * ) & a0  <  * (double  * ) & b0 );
37573783#endif 
37583784}
3785+ #endif 
37593786
37603787// Compare the lower double-precision (64-bit) floating-point element in a and b 
37613788// for equality, and return the boolean result (0 or 1). 
@@ -4401,6 +4428,7 @@ FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
44014428        vmaxq_u8 (vreinterpretq_u8_m128i (a ), vreinterpretq_u8_m128i (b )));
44024429}
44034430
4431+ #ifndef  _M_ARM 
44044432// Compare packed double-precision (64-bit) floating-point elements in a and b, 
44054433// and store packed maximum values in dst. 
44064434// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd 
@@ -4487,6 +4515,7 @@ FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
44874515    return  vreinterpretq_m128d_u64 (vld1q_u64 (d ));
44884516#endif 
44894517}
4518+ #endif 
44904519
44914520// Compare the lower double-precision (64-bit) floating-point elements in a and 
44924521// b, store the minimum value in the lower element of dst, and copy the upper 
@@ -4793,7 +4822,11 @@ FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
47934822FORCE_INLINE  void  _mm_pause ()
47944823{
47954824#if  defined(_MSC_VER )
4825+ #ifdef  _M_ARM 
4826+     __isb (_ARM_BARRIER_SY );
4827+ #else 
47964828    __isb (_ARM64_BARRIER_SY );
4829+ #endif 
47974830#else 
47984831    __asm__ __volatile__("isb\n" );
47994832#endif 
@@ -7622,6 +7655,7 @@ FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
76227655}
76237656
76247657/* SSE4.2 */ 
7658+ #ifndef  _M_ARM 
76257659
76267660const  static  uint16_t  ALIGN_STRUCT (16 ) _sse2neon_cmpestr_mask16b [8 ] =  {
76277661    0x01 , 0x02 , 0x04 , 0x08 , 0x10 , 0x20 , 0x40 , 0x80 ,
@@ -8463,9 +8497,11 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
84638497    return  crc ;
84648498}
84658499
8500+ #endif 
8501+ 
84668502/* AES */ 
84678503
8468- #if  !defined(__ARM_FEATURE_CRYPTO ) &&  !defined(_M_ARM64 )
8504+ #if  !defined(__ARM_FEATURE_CRYPTO ) &&  !defined(_M_ARM64 )  &&  !defined( _M_ARM ) 
84698505/* clang-format off */ 
84708506#define  SSE2NEON_AES_SBOX (w )                                           \
84718507    {                                                                  \
@@ -8913,6 +8949,7 @@ FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
89138949#undef  SSE2NEON_MULTIPLY
89148950#endif 
89158951
8952+ #elif  defined(_M_ARM )
89168953#else  /* __ARM_FEATURE_CRYPTO */ 
89178954// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and 
89188955// AESMC and then manually applying the real key as an xor operation. This 
@@ -9034,6 +9071,7 @@ FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
90349071    }
90359072}
90369073
9074+ #ifndef  _M_ARM 
90379075FORCE_INLINE  unsigned  int  _sse2neon_mm_get_denormals_zero_mode ()
90389076{
90399077    union  {
@@ -9053,6 +9091,7 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode()
90539091
90549092    return  r .field .bit24  ? _MM_DENORMALS_ZERO_ON  : _MM_DENORMALS_ZERO_OFF ;
90559093}
9094+ #endif 
90569095
90579096// Count the number of bits set to 1 in unsigned 32-bit integer a, and 
90589097// return that count in dst. 
@@ -9113,6 +9152,7 @@ FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
91139152#endif 
91149153}
91159154
9155+ #ifndef  _M_ARM 
91169156FORCE_INLINE  void  _sse2neon_mm_set_denormals_zero_mode (unsigned int   flag )
91179157{
91189158    // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting, 
@@ -9140,6 +9180,7 @@ FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
91409180    __asm__ __volatile__("vmsr FPSCR, %0"  ::"r" (r ));        /* write */ 
91419181#endif 
91429182}
9183+ #endif 
91439184
91449185// Return the current 64-bit value of the processor's time-stamp counter. 
91459186// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc 
@@ -9161,6 +9202,9 @@ FORCE_INLINE uint64_t _rdtsc(void)
91619202#endif 
91629203
91639204    return  val ;
9205+ #elif  defined(_M_ARM )
9206+ 	uint32_t  val  =  _MoveFromCoprocessor (15 ,0 , 9 ,13 ,0 );
9207+ 	return  ((uint64_t )val ) << 6 ;
91649208#else 
91659209    uint32_t  pmccntr , pmuseren , pmcntenset ;
91669210    // Read the user mode Performance Monitoring Unit (PMU) 
0 commit comments