LAMMP 4.1.0
Lamina High-Precision Arithmetic Library
载入中...
搜索中...
未找到
longlong.h
浏览该文件的文档.
1/*
2 * [LAMMP]
3 * Copyright (C) [2025-2026] [HJimmyK(Jericho Knox)]
4 *
5 * This program is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU Lesser General Public License as published by
7 * the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU Lesser General Public License for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public License
16 * along with this program. If not, see <https://www.gnu.org/licenses/>.
17 */
18
19#ifndef __LAMMP_LONGLONG_H__
20#define __LAMMP_LONGLONG_H__
21
22#ifdef _MSC_VER
23#include <intrin.h>
24#include <immintrin.h>
25#elif defined(USE_ASM) && (defined(__x86_64__)) && (defined(__GNUC__) || defined(__clang__))
26#include <x86intrin.h>
27#endif
28
29#include <stdint.h>
30
31static inline void _umul64to128_(uint64_t a, uint64_t b, uint64_t *low, uint64_t *high) {
32#if (defined(__GNUC__) || defined(__clang__))
33#if defined(USE_ASM) && (defined(__x86_64__))
34 __asm__("mul %[b]"
35 : "=a"(*low),
36 "=d"(*high)
37 : "a"(a), [b] "r"(b)
38 :
39 );
40#else
41 __uint128_t prod = (__uint128_t)a * b;
42 *low = (uint64_t)prod;
43 *high = (uint64_t)(prod >> 64);
44#endif
45#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_ARM64))
46 *low = _umul128(a, b, high);
47#else
48 uint64_t ah = a >> 32, bh = b >> 32;
49 a = (uint32_t)a, b = (uint32_t)b;
50 uint64_t r0 = a * b, r1 = a * bh, r2 = ah * b, r3 = ah * bh;
51 r3 += (r1 >> 32) + (r2 >> 32);
52 r1 = (uint32_t)r1, r2 = (uint32_t)r2;
53 r1 += r2;
54 r1 += (r0 >> 32);
55 *high = r3 + (r1 >> 32);
56 *low = (r1 << 32) | (uint32_t)r0;
57#endif
58}
59
60static inline uint64_t _umul64to64hi_(uint64_t a, uint64_t b) {
61#if (defined(__GNUC__) || defined(__clang__)) && defined(__SIZEOF_INT128__)
62 __uint128_t t = (__uint128_t)a * (__uint128_t)b;
63 return (uint64_t)(t >> 64);
64#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_ARM64))
65 return __umulh(a, b);
66#else
67 uint64_t ah = a >> 32, bh = b >> 32;
68 a = (uint32_t)a, b = (uint32_t)b;
69 uint64_t r0 = a * b, r1 = a * bh, r2 = ah * b, r3 = ah * bh;
70 r3 += (r1 >> 32) + (r2 >> 32);
71 r1 = (uint32_t)r1, r2 = (uint32_t)r2;
72 r1 += r2;
73 r1 += (r0 >> 32);
74 return r3 + (r1 >> 32);
75#endif
76}
77
78static inline void _umulx64to128_(uint64_t a, uint64_t b, uint64_t* low, uint64_t* high) {
79#if defined(USE_ASM) && (defined(__x86_64__))
80 *low = _mulx_u64(a, b, high);
81#else
82 _umul64to128_(a, b, low, high);
83#endif
84}
85
86static inline void _umul128to256_(uint64_t a_high, uint64_t a_low, uint64_t b_high, uint64_t b_low, uint64_t rr[4]) {
87 uint64_t p1_low, p1_high; // p1 = a_low × b_high
88 uint64_t p2_low, p2_high; // p2 = a_high × b_low
89 _umulx64to128_(a_low, b_low, rr, rr + 1);
90 _umulx64to128_(a_low, b_high, &p1_low, &p1_high);
91 _umulx64to128_(a_high, b_low, &p2_low, &p2_high);
92 _umulx64to128_(a_high, b_high, rr + 2, rr + 3);
93 /*
94 | res0 | res1 | res2 | res3 |
95 | p0l | p0h | | |
96 | p1l | p1h | |
97 | p2l | p2h | |
98 | | p3l | p3h |
99 */
100 rr[1] += p1_low;
101 uint64_t carry = (rr[1] < p1_low) ? 1 : 0;
102 rr[1] += p2_low;
103 carry += (rr[1] < p2_low) ? 1 : 0;
104
105 rr[2] += carry;
106 carry = (rr[2] < carry) ? 1 : 0;
107 rr[2] += p1_high;
108 carry += (rr[2] < p1_high) ? 1 : 0;
109 rr[2] += p2_high;
110 carry += (rr[2] < p2_high) ? 1 : 0;
111
112 rr[3] += carry;
113}
114
115static inline void _umul128to128_(uint64_t a_high, uint64_t a_low, uint64_t b_high, uint64_t b_low, uint64_t rr[2]) {
116 _umulx64to128_(a_low, b_low, rr, rr + 1);
117 rr[1] += a_low * b_high;
118 rr[1] += a_high * b_low;
119}
120
121static inline int32_t _clz_u64_(uint64_t val) {
122#if defined(__GNUC__) || defined(__clang__)
123 // Fast way to count leading zeros
124 return __builtin_clzll(val);
125#else
126 unsigned long cnt;
127 _BitScanReverse64(&cnt, val);
128 return 63 - cnt;
129#endif
130}
131
132static inline uint64_t _udiv128by64to64_(uint64_t numhi, uint64_t numlo, uint64_t den, uint64_t* r) {
133#if defined(__GNUC__) || defined(__clang__)
134#ifdef USE_ASM
135 uint64_t result;
136 __asm__("div %[v]" : "=a"(result), "=d"(*r) : [v] "r"(den), "a"(numlo), "d"(numhi));
137 return result;
138#else
139 __uint128_t num = (__uint128_t)numhi << 64 | numlo;
140 uint64_t result = num / den;
141 *r = num % den;
142 return result;
143#endif
144#else
145 const uint64_t b = ((uint64_t)1 << 32);
146
147 uint32_t q1;
148 uint32_t q0;
149
150 uint64_t q;
151
152 int shift;
153
154 uint64_t den10 = den;
155 uint64_t num10 = numlo;
156
157 uint32_t den1;
158 uint32_t den0;
159 uint32_t num1;
160 uint32_t num0;
161
162 uint64_t rem;
163
164 uint64_t qhat;
165 uint64_t rhat;
166
167 uint64_t c1;
168 uint64_t c2;
169
170 if (numhi >= den) {
171 if (r)
172 *r = ~0ull;
173 return ~0ull;
174 }
175
176 shift = _clz_u64_(den);
177 den <<= shift;
178 numhi <<= shift;
179 numhi |= (numlo >> (-shift & 63)) & (uint64_t)(-(int64_t)shift >> 63);
180 numlo <<= shift;
181
182 num1 = (uint32_t)(numlo >> 32);
183 num0 = (uint32_t)(numlo & 0xFFFFFFFFu);
184 den1 = (uint32_t)(den >> 32);
185 den0 = (uint32_t)(den & 0xFFFFFFFFu);
186
187 qhat = numhi / den1;
188 rhat = numhi % den1;
189 c1 = qhat * den0;
190 c2 = rhat * b + num1;
191 if (c1 > c2)
192 qhat -= (c1 - c2 > den) ? 2 : 1;
193 q1 = (uint32_t)qhat;
194
195 rem = numhi * b + num1 - q1 * den;
196
197 qhat = rem / den1;
198 rhat = rem % den1;
199 c1 = qhat * den0;
200 c2 = rhat * b + num0;
201 if (c1 > c2)
202 qhat -= (c1 - c2 > den) ? 2 : 1;
203 q0 = (uint32_t)qhat;
204
205 q = ((uint64_t)q1 << 32) | q0;
206
207 if (r)
208 *r = num10 - q * den10;
209 return q;
210#endif
211}
212
213#ifdef _MSC_VER
214// cnt = ctz(x)
215// r = x >> cnt
216// assume x is non-zero
217#define ctz_shl(r, x, cnt) \
218 do { \
219 unsigned long long _x_ = (x); \
220 unsigned long _bits_ = 0; \
221 _BitScanForward64(&_bits_, _x_); \
222 cnt = _bits_; \
223 (r) = _x_ >> (cnt); \
224 } while (0)
225#else
226// cnt = ctz(x)
227// r = x >> cnt
228// assume x is non-zero
229#define ctz_shl(r, x, cnt) \
230 do { \
231 unsigned long long _x_ = (x); \
232 (cnt) = __builtin_ctzll(_x_); \
233 (r) = _x_ >> (cnt); \
234 } while (0)
235#endif
236
237/**
238 * 请注意,此处的蒙哥马利域的R为2^64,p不可超过2^63-1
239 */
240
241typedef uint64_t u128[2];
242typedef uint64_t u192[3];
243
244#define _u128lshl(x, y, n) \
245 do { \
246 (*((x) + 1)) = ((*(y)) >> (64 - (n))) | ((*((y) + 1)) << (n)); \
247 (*(x)) = (*(y)) << (n); \
248 } while (0)
249
250#define _u128lshr(x, y, n) \
251 do { \
252 (*(x)) = ((*(y)) >> (n)) | ((*((y) + 1)) << (64 - (n))); \
253 (*((x) + 1)) = (*((y) + 1)) >> (n); \
254 } while (0)
255
256#define _u128high(x) (*((x) + 1))
257
258#define _u128low(x) (*(x))
259
260#define _u128add(r, x, y) \
261 do { \
262 (*(r)) = *(x) + *(y); \
263 (*((r) + 1)) = (*((x) + 1)) + (*((y) + 1)) + ((*(r)) < (*(y)) ? 1 : 0); \
264 } while (0)
265
266#define _u128add64(r, x, _i64) \
267 do { \
268 (*(r)) = *(x) + (_i64); \
269 (*((r) + 1)) = (*((x) + 1)) + (((*(r)) < (_i64)) ? 1 : 0); \
270 } while (0)
271
272#define _u128sub64(r, x, _i64) \
273 do { \
274 uint64_t _c_ = (x)[0] < (_i64); \
275 (r)[0] = (x)[0] - (_i64); \
276 (r)[1] = (x)[1] - _c_; \
277 } while (0)
278
279// x < y ?
280#define _u128cmp(x, y) ((x)[1] < (y)[1] || ((x)[1] == (y)[1] && (x)[0] < (y)[0]))
281
282#define _u128sub(r, x, y) \
283 do { \
284 uint64_t _c_ = (x)[0] < (y)[0]; \
285 (r)[0] = (x)[0] - (y)[0]; \
286 (r)[1] = (x)[1] - (y)[1] - _c_; \
287 } while (0)
288
289#define _u128mul(r, x, y) _umul64to128_((x), (y), (r), (((r) + 1)))
290
291#define _mont64_add(r, x, y, mod2) \
292 do { \
293 (r) = (x) + (y); \
294 (r) = ((r) < (mod2)) ? (r) : ((r) - (mod2)); \
295 } while (0)
296
297#define _mont64_sub(r, x, y, mod2) \
298 do { \
299 (r) = (((x) - (y)) > (x)) ? (((x) - (y)) + (mod2)) : ((x) - (y)); \
300 } while (0)
301
302#define _raw64_add(r, x, y) \
303 do { \
304 (r) = (x) + (y); \
305 } while (0)
306
307#define _raw64_sub(r, x, y, mod2) \
308 do { \
309 (r) = (x) - (y) + (mod2); \
310 } while (0)
311
312#define _mont64_norm2(r, x, mod2) \
313 do { \
314 (r) = (x) >= (mod2) ? ((x) - (mod2)) : (x); \
315 } while (0)
316
317/*
318 * macro _mont_mul(r, x, y, mod, modInvNeg) whithout "p_mont = t if t < p else t - p"
319 * macro _mont_mulinto(x, y, mod, modInvNeg) with "p_mont = t if t < p else t - p"
320 *
321 * assert((x < mod) && (y < mod))
322 *
323 */
324#define _mont64_mul(r, x, y, mod, modInvNeg) \
325 do { \
326 u128 _tmp1_ = {0, 0}; \
327 _u128mul(_tmp1_, (x), (y)); \
328 u128 _tmp2_ = {0, 0}; \
329 (*_tmp2_) = (*(_tmp1_)) * (modInvNeg); \
330 _u128mul(_tmp2_, (*(_tmp2_)), (mod)); \
331 _u128add(_tmp2_, _tmp2_, _tmp1_); \
332 (r) = (*(_tmp2_ + 1)); \
333 } while (0)
334
335/*
336def montgomery_mul(a_mont, b_mont, p):
337 R = 1 << 64
338 p_inv_mod_R = pow(p, -1, R)
339 ab = a_mont * b_mont
340 m = (ab * p_inv_mod_R) % R
341 t = (ab + m * p) // R
342 p_mont = t if t < p else t - p
343 return p_mont
344*/
345
346#define _mont64_mulinto(x, y, mod, modInvNeg) \
347 do { \
348 u128 _tmp1_ = {0, 0}; \
349 _u128mul(_tmp1_, (x), (y)); \
350 u128 _tmp2_ = {0, 0}; \
351 *(_tmp2_) = (*(_tmp1_)) * (modInvNeg); \
352 _u128mul(_tmp2_, *(_tmp2_), (mod)); \
353 _u128add(_tmp2_, _tmp2_, _tmp1_); \
354 (x) = (*(_tmp2_ + 1) < (mod)) ? (*(_tmp2_ + 1)) : ((*(_tmp2_ + 1)) - (mod)); \
355 } while (0)
356
357#define _mont64_tomont(x, r2, mod, modInvNeg) \
358 do { \
359 u128 _tmp1_ = {0, 0}; \
360 _u128mul(_tmp1_, (x), (r2)); \
361 u128 _tmp2_ = {0, 0}; \
362 *(_tmp2_) = (*(_tmp1_)) * (modInvNeg); \
363 _u128mul(_tmp2_, (*(_tmp2_)), (mod)); \
364 _u128add(_tmp2_, _tmp2_, _tmp1_); \
365 (x) = ((*(_tmp2_ + 1)) < (mod)) ? (*(_tmp2_ + 1)) : ((*(_tmp2_ + 1)) - (mod)); \
366 } while (0)
367
368#define _mont64_toint(x, mod, modInvneg) \
369 do { \
370 u128 _tmp_ = {0, 0}; \
371 *_tmp_ = (x) * (modInvneg); \
372 _u128mul(_tmp_, (*_tmp_), (mod)); \
373 _u128add64(_tmp_, _tmp_, x); \
374 (x) = (*(_tmp_ + 1) < (mod)) ? (*(_tmp_ + 1)) : ((*(_tmp_ + 1)) - (mod)); \
375 } while (0)
376
377#define _u128x64to192(i192, i128, i64) \
378 do { \
379 _umul64to128_((*(i128)), i64, (i192), ((i192) + 1)); \
380 uint64_t _tmp_; \
381 _umul64to128_((*((i128) + 1)), i64, (&(_tmp_)), ((i192) + 2)); \
382 (*((i192) + 1)) += _tmp_; \
383 (*((i192) + 2)) += ((*((i192) + 1)) < _tmp_) ? 1 : 0; \
384 } while (0)
385
386#define _u192add(i192, j192) \
387 do { \
388 (*(i192)) += (*(j192)); \
389 uint64_t _c_ = ((*(i192)) < (*j192)) ? 1 : 0; \
390 (*((i192) + 1)) += _c_; \
391 _c_ = ((*((i192) + 1)) < _c_) ? 1 : 0; \
392 (*((i192) + 1)) += (*((j192) + 1)); \
393 _c_ += ((*((i192) + 1)) < (*((j192) + 1))) ? 1 : 0; \
394 (*((i192) + 2)) += _c_ + (*((j192) + 2)); \
395 } while (0)
396
397#define _u192sub(i192, j192) \
398 do { \
399 uint64_t _b_ = ((i192)[0] < (j192)[0]) ? 1 : 0; \
400 (i192)[0] -= (j192)[0]; \
401 uint64_t _b1_ = ((i192)[1] < (j192)[1]) ? 1 : 0; \
402 (i192)[1] -= (j192)[1]; \
403 _b1_ += ((i192)[1] < _b_) ? 1 : 0; \
404 (i192)[1] -= _b_; \
405 (i192)[2] = (i192)[2] - ((j192)[2] + _b1_); \
406 } while (0)
407
408#define _add_ssaaaa(sh, sl, ah, al, bh, bl) \
409 do { \
410 uint64_t _x_; \
411 _x_ = (al) + (bl); \
412 (sh) = (ah) + (bh) + (_x_ < (al)); \
413 (sl) = _x_; \
414 } while (0)
415
416#define _sub_ddmmss(sh, sl, ah, al, bh, bl) \
417 do { \
418 uint64_t _x_; \
419 _x_ = (al) - (bl); \
420 (sh) = (ah) - (bh) - ((al) < (bl)); \
421 (sl) = _x_; \
422 } while (0)
423
424#define _udiv_qrnnd_preinv(q, r, nh, nl, d, di) \
425 do { \
426 uint64_t _qh_, _ql_, _r_, _mask_; \
427 _umul64to128_((nh), (di), &_ql_, &_qh_); \
428 _add_ssaaaa(_qh_, _ql_, _qh_, _ql_, (nh) + 1, (nl)); \
429 _r_ = (nl) - _qh_ * (d); \
430 _mask_ = -(mp_limb_t)(_r_ > _ql_); \
431 _qh_ += _mask_; \
432 _r_ += _mask_ & (d); \
433 if (_r_ >= (d)) { \
434 _r_ -= (d); \
435 _qh_++; \
436 } \
437 (r) = _r_; \
438 (q) = _qh_; \
439 } while (0)
440
441#define _udiv_qr_3by2(q, r1, r0, n2, n1, n0, d1, d0, dinv) \
442 do { \
443 mp_limb_t _q0_, _t1_, _t0_, _mask_; \
444 _umul64to128_((n2), (dinv), &_q0_, &(q)); \
445 _add_ssaaaa((q), _q0_, (q), _q0_, (n2), (n1)); \
446 /* Compute the two most significant limbs of n - q'd */ \
447 (r1) = (n1) - (d1) * (q); \
448 _sub_ddmmss((r1), (r0), (r1), (n0), (d1), (d0)); \
449 _umul64to128_((d0), (q), &_t0_, &_t1_); \
450 _sub_ddmmss((r1), (r0), (r1), (r0), _t1_, _t0_); \
451 (q)++; \
452 /* Conditionally adjust q and the remainders */ \
453 _mask_ = -(uint64_t)((r1) >= _q0_); \
454 (q) += _mask_; \
455 _add_ssaaaa((r1), (r0), (r1), (r0), _mask_ & (d1), _mask_ & (d0)); \
456 if ((r1) >= (d1)) { \
457 if ((r1) > (d1) || (r0) >= (d0)) { \
458 (q)++; \
459 _sub_ddmmss((r1), (r0), (r1), (r0), (d1), (d0)); \
460 } \
461 } \
462 } while (0)
463
464// q = n0 / d0, assuming d0 is a 32-bit number, d0 > 1
465// dinv = (B-1)//d0 + 1
466#define _udiv32by32_q_preinv(q, n0, dinv) \
467 do { \
468 uint64_t _hi_, _lo_; \
469 _umul64to128_((n0), (dinv), &_lo_, &_hi_); \
470 (q) = _hi_; \
471 } while (0)
472
473/******************************
474from https://libdivide.com/
475******************************/
476
477#define _U64_SHIFT_MASK 0x3F
478#define _ADD_MARKER 0x40
479
480typedef struct _udiv64_t {
481 uint64_t magic;
482 uint8_t more;
484
485// assert d != 0
486static inline _udiv64_t _udiv64_gen_internal_(uint64_t d, int branchfree) {
487 _udiv64_t result;
488 uint32_t floor_log_2_d = 63 - _clz_u64_(d);
489
490 // Power of 2
491 if ((d & (d - 1)) == 0) {
492 // We need to subtract 1 from the shift value in case of an unsigned
493 // branchfree divider because there is a hardcoded right shift by 1
494 // in its division algorithm. Because of this we also need to add back
495 // 1 in its recovery algorithm.
496 result.magic = 0;
497 result.more = (uint8_t)(floor_log_2_d - (branchfree != 0));
498 } else {
499 uint64_t proposed_m, rem;
500 uint8_t more;
501 // (1 << (64 + floor_log_2_d)) / d
502 proposed_m = _udiv128by64to64_((uint64_t)1 << floor_log_2_d, 0, d, &rem);
503
504 const uint64_t e = d - rem;
505
506 // This power works if e < 2**floor_log_2_d.
507 if (!branchfree && e < ((uint64_t)1 << floor_log_2_d)) {
508 // This power works
509 more = (uint8_t)floor_log_2_d;
510 } else {
511 // We have to use the general 65-bit algorithm. We need to compute
512 // (2**power) / d. However, we already have (2**(power-1))/d and
513 // its remainder. By doubling both, and then correcting the
514 // remainder, we can compute the larger division.
515 // don't care about overflow here - in fact, we expect it
516 proposed_m += proposed_m;
517 const uint64_t twice_rem = rem + rem;
518 if (twice_rem >= d || twice_rem < rem)
519 proposed_m += 1;
520 more = (uint8_t)(floor_log_2_d | _ADD_MARKER);
521 }
522 result.magic = 1 + proposed_m;
523 result.more = more;
524 // result.more's shift should in general be ceil_log_2_d. But if we
525 // used the smaller power, we subtract one from the shift because we're
526 // using the smaller power. If we're using the larger power, we
527 // subtract one from the shift because it's taken care of by the add
528 // indicator. So floor_log_2_d happens to be correct in both cases,
529 // which is why we do it outside of the if statement.
530 }
531 return result;
532}
533
534// d > 1
535static inline _udiv64_t _udiv64_gen(uint64_t d) {
537 _udiv64_t ret = {tmp.magic, (uint8_t)(tmp.more & _U64_SHIFT_MASK)};
538 return ret;
539}
540
541static inline uint64_t _udiv64by64_q_preinv(uint64_t numer, const _udiv64_t* denom) {
542 uint64_t q = _umul64to64hi_(numer, denom->magic);
543 uint64_t t = ((numer - q) >> 1) + q;
544 return t >> denom->more;
545}
546
547#endif // __LAMMP_LONGLONG_H__
uint8_t more
Definition longlong.h:482
static uint64_t _umul64to64hi_(uint64_t a, uint64_t b)
Definition longlong.h:60
uint64_t u192[3]
Definition longlong.h:242
#define _U64_SHIFT_MASK
from https://libdivide.com/
Definition longlong.h:477
static _udiv64_t _udiv64_gen_internal_(uint64_t d, int branchfree)
Definition longlong.h:486
static void _umul64to128_(uint64_t a, uint64_t b, uint64_t *low, uint64_t *high)
Definition longlong.h:31
static void _umul128to256_(uint64_t a_high, uint64_t a_low, uint64_t b_high, uint64_t b_low, uint64_t rr[4])
Definition longlong.h:86
#define _ADD_MARKER
Definition longlong.h:478
static _udiv64_t _udiv64_gen(uint64_t d)
Definition longlong.h:535
static int32_t _clz_u64_(uint64_t val)
Definition longlong.h:121
static void _umul128to128_(uint64_t a_high, uint64_t a_low, uint64_t b_high, uint64_t b_low, uint64_t rr[2])
Definition longlong.h:115
static uint64_t _udiv64by64_q_preinv(uint64_t numer, const _udiv64_t *denom)
Definition longlong.h:541
static uint64_t _udiv128by64to64_(uint64_t numhi, uint64_t numlo, uint64_t den, uint64_t *r)
Definition longlong.h:132
uint64_t u128[2]
请注意,此处的蒙哥马利域的R为2^64,p不可超过2^63-1
Definition longlong.h:241
uint64_t magic
Definition longlong.h:481
static void _umulx64to128_(uint64_t a, uint64_t b, uint64_t *low, uint64_t *high)
Definition longlong.h:78
#define r2
#define r1
#define r3
#define r0
#define c1