  1. /* crypto/bn/bn_asm.c */
  2. /* Copyright (C) 1995-1998 Eric Young ([email protected])
  3. * All rights reserved.
  4. *
  5. * This package is an SSL implementation written
  6. * by Eric Young ([email protected]).
  7. * The implementation was written so as to conform with Netscapes SSL.
  8. *
  9. * This library is free for commercial and non-commercial use as long as
  10. * the following conditions are aheared to. The following conditions
  11. * apply to all code found in this distribution, be it the RC4, RSA,
  12. * lhash, DES, etc., code; not just the SSL code. The SSL documentation
  13. * included with this distribution is covered by the same copyright terms
  14. * except that the holder is Tim Hudson ([email protected]).
  15. *
  16. * Copyright remains Eric Young's, and as such any Copyright notices in
  17. * the code are not to be removed.
  18. * If this package is used in a product, Eric Young should be given attribution
  19. * as the author of the parts of the library used.
  20. * This can be in the form of a textual message at program startup or
  21. * in documentation (online or textual) provided with the package.
  22. *
  23. * Redistribution and use in source and binary forms, with or without
  24. * modification, are permitted provided that the following conditions
  25. * are met:
  26. * 1. Redistributions of source code must retain the copyright
  27. * notice, this list of conditions and the following disclaimer.
  28. * 2. Redistributions in binary form must reproduce the above copyright
  29. * notice, this list of conditions and the following disclaimer in the
  30. * documentation and/or other materials provided with the distribution.
  31. * 3. All advertising materials mentioning features or use of this software
  32. * must display the following acknowledgement:
  33. * "This product includes cryptographic software written by
  34. * Eric Young ([email protected])"
  35. * The word 'cryptographic' can be left out if the rouines from the library
  36. * being used are not cryptographic related :-).
  37. * 4. If you include any Windows specific code (or a derivative thereof) from
  38. * the apps directory (application code) you must include an acknowledgement:
  39. * "This product includes software written by Tim Hudson ([email protected])"
  40. *
  41. * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
  42. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  43. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  44. * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  45. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  46. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  47. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  48. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  49. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  50. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  51. * SUCH DAMAGE.
  52. *
  53. * The licence and distribution terms for any publically available version or
  54. * derivative of this code cannot be changed. i.e. this code cannot simply be
  55. * copied and put under another distribution licence
  56. * [including the GNU Public Licence.]
  57. */
#ifndef BN_DEBUG
# undef NDEBUG                  /* avoid conflicting definitions */
# define NDEBUG
#endif

#include <stdio.h>
#include <assert.h>

#include "cryptlib.h"
#include "bn_lcl.h"

#if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
/*
 * bn_mul_add_words - multiply-and-accumulate: rp[i] += ap[i] * w for
 * num words, propagating a carry between words.  Returns the final
 * carry word.  mul_add() is the double-word primitive from bn_lcl.h;
 * this branch is compiled when BN_LLONG or BN_UMULT_HIGH is available.
 */
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
                          BN_ULONG w)
{
    BN_ULONG c1 = 0;            /* running carry */

    assert(num >= 0);
    if (num <= 0)
        return (c1);

# ifndef OPENSSL_SMALL_FOOTPRINT
    /* 4-way unrolled main loop while at least four words remain */
    while (num & ~3) {
        mul_add(rp[0], ap[0], w, c1);
        mul_add(rp[1], ap[1], w, c1);
        mul_add(rp[2], ap[2], w, c1);
        mul_add(rp[3], ap[3], w, c1);
        ap += 4;
        rp += 4;
        num -= 4;
    }
# endif
    /* tail: remaining 0..3 words */
    while (num) {
        mul_add(rp[0], ap[0], w, c1);
        ap++;
        rp++;
        num--;
    }

    return (c1);
}
/*
 * bn_mul_words - rp[i] = low word of (ap[i] * w + carry) for num words;
 * returns the final carry word.  mul() is the double-word primitive
 * from bn_lcl.h (BN_LLONG / BN_UMULT_HIGH build).
 */
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
    BN_ULONG c1 = 0;            /* running carry */

    assert(num >= 0);
    if (num <= 0)
        return (c1);

# ifndef OPENSSL_SMALL_FOOTPRINT
    /* 4-way unrolled main loop */
    while (num & ~3) {
        mul(rp[0], ap[0], w, c1);
        mul(rp[1], ap[1], w, c1);
        mul(rp[2], ap[2], w, c1);
        mul(rp[3], ap[3], w, c1);
        ap += 4;
        rp += 4;
        num -= 4;
    }
# endif
    /* tail: remaining 0..3 words */
    while (num) {
        mul(rp[0], ap[0], w, c1);
        ap++;
        rp++;
        num--;
    }
    return (c1);
}
/*
 * bn_sqr_words - square each word: (r[2i+1],r[2i]) = a[i]^2 for n input
 * words, writing 2*n result words.  No cross-word carries are involved,
 * so each word squares independently via the sqr() primitive.
 */
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
{
    assert(n >= 0);
    if (n <= 0)
        return;

# ifndef OPENSSL_SMALL_FOOTPRINT
    /* 4-way unrolled main loop: r advances twice as fast as a */
    while (n & ~3) {
        sqr(r[0], r[1], a[0]);
        sqr(r[2], r[3], a[1]);
        sqr(r[4], r[5], a[2]);
        sqr(r[6], r[7], a[3]);
        a += 4;
        r += 8;
        n -= 4;
    }
# endif
    /* tail: remaining 0..3 words */
    while (n) {
        sqr(r[0], r[1], a[0]);
        a++;
        r += 2;
        n--;
    }
}
  141. #else /* !(defined(BN_LLONG) ||
  142. * defined(BN_UMULT_HIGH)) */
/*
 * bn_mul_add_words - rp[i] += ap[i] * w, half-word build (no wide
 * multiply available): w is split once into low/high halves with
 * LBITS/HBITS and the 5-argument mul_add() macro does the half-word
 * cross products.  Returns the final carry word.
 */
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
                          BN_ULONG w)
{
    BN_ULONG c = 0;             /* running carry */
    BN_ULONG bl, bh;            /* low/high halves of w */

    assert(num >= 0);
    if (num <= 0)
        return ((BN_ULONG)0);

    bl = LBITS(w);
    bh = HBITS(w);

# ifndef OPENSSL_SMALL_FOOTPRINT
    /* 4-way unrolled main loop */
    while (num & ~3) {
        mul_add(rp[0], ap[0], bl, bh, c);
        mul_add(rp[1], ap[1], bl, bh, c);
        mul_add(rp[2], ap[2], bl, bh, c);
        mul_add(rp[3], ap[3], bl, bh, c);
        ap += 4;
        rp += 4;
        num -= 4;
    }
# endif
    /* tail: remaining 0..3 words */
    while (num) {
        mul_add(rp[0], ap[0], bl, bh, c);
        ap++;
        rp++;
        num--;
    }
    return (c);
}
/*
 * bn_mul_words - rp[i] = low word of (ap[i] * w + carry), half-word
 * build.  w is pre-split with LBITS/HBITS; the 5-argument mul() macro
 * performs the half-word multiplication.  Returns the final carry word.
 */
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
    BN_ULONG carry = 0;         /* running carry */
    BN_ULONG bl, bh;            /* low/high halves of w */

    assert(num >= 0);
    if (num <= 0)
        return ((BN_ULONG)0);

    bl = LBITS(w);
    bh = HBITS(w);

# ifndef OPENSSL_SMALL_FOOTPRINT
    /* 4-way unrolled main loop */
    while (num & ~3) {
        mul(rp[0], ap[0], bl, bh, carry);
        mul(rp[1], ap[1], bl, bh, carry);
        mul(rp[2], ap[2], bl, bh, carry);
        mul(rp[3], ap[3], bl, bh, carry);
        ap += 4;
        rp += 4;
        num -= 4;
    }
# endif
    /* tail: remaining 0..3 words */
    while (num) {
        mul(rp[0], ap[0], bl, bh, carry);
        ap++;
        rp++;
        num--;
    }
    return (carry);
}
/*
 * bn_sqr_words - half-word build: (r[2i+1],r[2i]) = a[i]^2 for n words
 * using the sqr64() macro (half-word squaring, no wide multiply).
 */
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
{
    assert(n >= 0);
    if (n <= 0)
        return;

# ifndef OPENSSL_SMALL_FOOTPRINT
    /* 4-way unrolled main loop: r advances twice as fast as a */
    while (n & ~3) {
        sqr64(r[0], r[1], a[0]);
        sqr64(r[2], r[3], a[1]);
        sqr64(r[4], r[5], a[2]);
        sqr64(r[6], r[7], a[3]);
        a += 4;
        r += 8;
        n -= 4;
    }
# endif
    /* tail: remaining 0..3 words */
    while (n) {
        sqr64(r[0], r[1], a[0]);
        a++;
        r += 2;
        n--;
    }
}
  223. #endif /* !(defined(BN_LLONG) ||
  224. * defined(BN_UMULT_HIGH)) */
  225. #if defined(BN_LLONG) && defined(BN_DIV2W)
/*
 * bn_div_words - return the one-word quotient of the double word (h,l)
 * divided by d, using the native double-word integer type (BN_LLONG +
 * BN_DIV2W build).  Caller must ensure h < d so the quotient fits.
 */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{
    return ((BN_ULONG)(((((BN_ULLONG) h) << BN_BITS2) | l) / (BN_ULLONG) d));
}
  230. #else
/*
 * Divide the double-word value (h,l) by d and return the one-word
 * quotient.  Software long division in half-word (BN_BITS4) digits --
 * essentially Knuth's Algorithm D producing a two-half-word quotient.
 * The caller must guarantee h < d so the quotient fits in one word.
 */
/* I need to test this some more :-( */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{
    BN_ULONG dh, dl, q, ret = 0, th, tl, t;
    int i, count = 2;           /* two half-word quotient digits */

    if (d == 0)
        return (BN_MASK2);      /* division by zero: saturate */

    i = BN_num_bits_word(d);
    assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));

    i = BN_BITS2 - i;
    if (h >= d)
        h -= d;

    /* normalise: shift so the divisor's top bit is set */
    if (i) {
        d <<= i;
        h = (h << i) | (l >> (BN_BITS2 - i));
        l <<= i;
    }
    dh = (d & BN_MASK2h) >> BN_BITS4;   /* high half-word of divisor */
    dl = (d & BN_MASK2l);               /* low half-word of divisor */
    for (;;) {
        /* estimate the next half-word quotient digit q */
        if ((h >> BN_BITS4) == dh)
            q = BN_MASK2l;
        else
            q = h / dh;

        th = q * dh;
        tl = dl * q;
        /* correct the (possibly over-large) estimate downwards */
        for (;;) {
            t = h - th;
            if ((t & BN_MASK2h) ||
                ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4))))
                break;
            q--;
            th -= dh;
            tl -= dl;
        }
        /* subtract q*d from the remainder (h,l) */
        t = (tl >> BN_BITS4);
        tl = (tl << BN_BITS4) & BN_MASK2h;
        th += t;

        if (l < tl)
            th++;
        l -= tl;
        if (h < th) {           /* estimate was still one too big */
            h += d;
            q--;
        }
        h -= th;

        if (--count == 0)
            break;

        ret = q << BN_BITS4;    /* record high quotient digit */
        /* shift the next half-word of the dividend into (h,l) */
        h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
        l = (l & BN_MASK2l) << BN_BITS4;
    }
    ret |= q;                   /* merge in low quotient digit */
    return (ret);
}
  287. #endif /* !defined(BN_LLONG) && defined(BN_DIV2W) */
  288. #ifdef BN_LLONG
/*
 * bn_add_words - r[i] = a[i] + b[i] for n words with carry propagation;
 * returns the final carry (0 or 1).  BN_LLONG build: the running sum is
 * kept in a double-word accumulator and shifted down each step.
 */
BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      int n)
{
    BN_ULLONG ll = 0;           /* double-word accumulator; top bits = carry */

    assert(n >= 0);
    if (n <= 0)
        return ((BN_ULONG)0);

# ifndef OPENSSL_SMALL_FOOTPRINT
    /* 4-way unrolled main loop */
    while (n & ~3) {
        ll += (BN_ULLONG) a[0] + b[0];
        r[0] = (BN_ULONG)ll & BN_MASK2;
        ll >>= BN_BITS2;        /* keep only the carry */
        ll += (BN_ULLONG) a[1] + b[1];
        r[1] = (BN_ULONG)ll & BN_MASK2;
        ll >>= BN_BITS2;
        ll += (BN_ULLONG) a[2] + b[2];
        r[2] = (BN_ULONG)ll & BN_MASK2;
        ll >>= BN_BITS2;
        ll += (BN_ULLONG) a[3] + b[3];
        r[3] = (BN_ULONG)ll & BN_MASK2;
        ll >>= BN_BITS2;
        a += 4;
        b += 4;
        r += 4;
        n -= 4;
    }
# endif
    /* tail: remaining 0..3 words */
    while (n) {
        ll += (BN_ULLONG) a[0] + b[0];
        r[0] = (BN_ULONG)ll & BN_MASK2;
        ll >>= BN_BITS2;
        a++;
        b++;
        r++;
        n--;
    }
    return ((BN_ULONG)ll);
}
  327. #else /* !BN_LLONG */
/*
 * bn_add_words - r[i] = a[i] + b[i] with carry, word-only build.
 * Per word: add the incoming carry first (t = a+c, carry iff t < c),
 * then add b (further carry iff the sum wrapped below t).
 * Returns the final carry (0 or 1).
 */
BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      int n)
{
    BN_ULONG c, l, t;

    assert(n >= 0);
    if (n <= 0)
        return ((BN_ULONG)0);

    c = 0;                      /* running carry */
# ifndef OPENSSL_SMALL_FOOTPRINT
    /* 4-way unrolled main loop */
    while (n & ~3) {
        t = a[0];
        t = (t + c) & BN_MASK2;
        c = (t < c);            /* carry out of (a + c) */
        l = (t + b[0]) & BN_MASK2;
        c += (l < t);           /* carry out of (t + b) */
        r[0] = l;
        t = a[1];
        t = (t + c) & BN_MASK2;
        c = (t < c);
        l = (t + b[1]) & BN_MASK2;
        c += (l < t);
        r[1] = l;
        t = a[2];
        t = (t + c) & BN_MASK2;
        c = (t < c);
        l = (t + b[2]) & BN_MASK2;
        c += (l < t);
        r[2] = l;
        t = a[3];
        t = (t + c) & BN_MASK2;
        c = (t < c);
        l = (t + b[3]) & BN_MASK2;
        c += (l < t);
        r[3] = l;
        a += 4;
        b += 4;
        r += 4;
        n -= 4;
    }
# endif
    /* tail: remaining 0..3 words */
    while (n) {
        t = a[0];
        t = (t + c) & BN_MASK2;
        c = (t < c);
        l = (t + b[0]) & BN_MASK2;
        c += (l < t);
        r[0] = l;
        a++;
        b++;
        r++;
        n--;
    }
    return ((BN_ULONG)c);
}
  382. #endif /* !BN_LLONG */
/*
 * bn_sub_words - r[i] = a[i] - b[i] - borrow for n words; returns the
 * final borrow (0 or 1).  When t1 == t2 the subtraction of equal words
 * leaves the borrow unchanged, hence the `if (t1 != t2)` guard rather
 * than an unconditional update.
 */
BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      int n)
{
    BN_ULONG t1, t2;
    int c = 0;                  /* running borrow */

    assert(n >= 0);
    if (n <= 0)
        return ((BN_ULONG)0);

#ifndef OPENSSL_SMALL_FOOTPRINT
    /* 4-way unrolled main loop */
    while (n & ~3) {
        t1 = a[0];
        t2 = b[0];
        r[0] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2)
            c = (t1 < t2);      /* equal words: borrow propagates as-is */
        t1 = a[1];
        t2 = b[1];
        r[1] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2)
            c = (t1 < t2);
        t1 = a[2];
        t2 = b[2];
        r[2] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2)
            c = (t1 < t2);
        t1 = a[3];
        t2 = b[3];
        r[3] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2)
            c = (t1 < t2);
        a += 4;
        b += 4;
        r += 4;
        n -= 4;
    }
#endif
    /* tail: remaining 0..3 words */
    while (n) {
        t1 = a[0];
        t2 = b[0];
        r[0] = (t1 - t2 - c) & BN_MASK2;
        if (t1 != t2)
            c = (t1 < t2);
        a++;
        b++;
        r++;
        n--;
    }
    return (c);
}
#if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT)
# undef bn_mul_comba8
# undef bn_mul_comba4
# undef bn_sqr_comba8
# undef bn_sqr_comba4

/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/*
 * sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
 * c=(c2,c1,c0)
 */
/*
 * Keep in mind that carrying into high part of multiplication result
 * can not overflow, because it cannot be all-ones.
 */
# ifdef BN_LLONG
/*
 * BN_ULLONG build: form the full double-word product in t, split it
 * with Lw()/Hw(), then fold into the three-word accumulator (c2,c1,c0)
 * with explicit carry checks.
 */
#  define mul_add_c(a,b,c0,c1,c2) \
        t=(BN_ULLONG)a*b; \
        t1=(BN_ULONG)Lw(t); \
        t2=(BN_ULONG)Hw(t); \
        c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
        c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;

/* c += 2*a*b; doubling the product may itself carry into c2 */
#  define mul_add_c2(a,b,c0,c1,c2) \
        t=(BN_ULLONG)a*b; \
        tt=(t+t)&BN_MASK; \
        if (tt < t) c2++; \
        t1=(BN_ULONG)Lw(tt); \
        t2=(BN_ULONG)Hw(tt); \
        c0=(c0+t1)&BN_MASK2; \
        if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \
        c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;

#  define sqr_add_c(a,i,c0,c1,c2) \
        t=(BN_ULLONG)a[i]*a[i]; \
        t1=(BN_ULONG)Lw(t); \
        t2=(BN_ULONG)Hw(t); \
        c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
        c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;

#  define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
# elif defined(BN_UMULT_LOHI)
/* Hardware delivers both product halves in one BN_UMULT_LOHI() op. */
#  define mul_add_c(a,b,c0,c1,c2) { \
        BN_ULONG ta=(a),tb=(b); \
        BN_UMULT_LOHI(t1,t2,ta,tb); \
        c0 += t1; t2 += (c0<t1)?1:0; \
        c1 += t2; c2 += (c1<t2)?1:0; \
        }

/* c += 2*a*b, done as two separate additions of a*b to avoid overflow */
#  define mul_add_c2(a,b,c0,c1,c2) { \
        BN_ULONG ta=(a),tb=(b),t0; \
        BN_UMULT_LOHI(t0,t1,ta,tb); \
        c0 += t0; t2 = t1+((c0<t0)?1:0);\
        c1 += t2; c2 += (c1<t2)?1:0; \
        c0 += t0; t1 += (c0<t0)?1:0; \
        c1 += t1; c2 += (c1<t1)?1:0; \
        }

#  define sqr_add_c(a,i,c0,c1,c2) { \
        BN_ULONG ta=(a)[i]; \
        BN_UMULT_LOHI(t1,t2,ta,ta); \
        c0 += t1; t2 += (c0<t1)?1:0; \
        c1 += t2; c2 += (c1<t2)?1:0; \
        }

#  define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
# elif defined(BN_UMULT_HIGH)
/* Low half from plain '*', high half from BN_UMULT_HIGH(). */
#  define mul_add_c(a,b,c0,c1,c2) { \
        BN_ULONG ta=(a),tb=(b); \
        t1 = ta * tb; \
        t2 = BN_UMULT_HIGH(ta,tb); \
        c0 += t1; t2 += (c0<t1)?1:0; \
        c1 += t2; c2 += (c1<t2)?1:0; \
        }

/* c += 2*a*b, again as two additions of a*b */
#  define mul_add_c2(a,b,c0,c1,c2) { \
        BN_ULONG ta=(a),tb=(b),t0; \
        t1 = BN_UMULT_HIGH(ta,tb); \
        t0 = ta * tb; \
        c0 += t0; t2 = t1+((c0<t0)?1:0);\
        c1 += t2; c2 += (c1<t2)?1:0; \
        c0 += t0; t1 += (c0<t0)?1:0; \
        c1 += t1; c2 += (c1<t1)?1:0; \
        }

#  define sqr_add_c(a,i,c0,c1,c2) { \
        BN_ULONG ta=(a)[i]; \
        t1 = ta * ta; \
        t2 = BN_UMULT_HIGH(ta,ta); \
        c0 += t1; t2 += (c0<t1)?1:0; \
        c1 += t2; c2 += (c1<t2)?1:0; \
        }

#  define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
# else /* !BN_LLONG */
/*
 * Word-only build: operands are split into half words (LBITS/HBITS)
 * and multiplied with mul64(), which uses temporaries bl,bh.
 */
#  define mul_add_c(a,b,c0,c1,c2) \
        t1=LBITS(a); t2=HBITS(a); \
        bl=LBITS(b); bh=HBITS(b); \
        mul64(t1,t2,bl,bh); \
        c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
        c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;

/* c += 2*a*b; (t2,t1) is doubled in place, carries tracked via BN_TBIT */
#  define mul_add_c2(a,b,c0,c1,c2) \
        t1=LBITS(a); t2=HBITS(a); \
        bl=LBITS(b); bh=HBITS(b); \
        mul64(t1,t2,bl,bh); \
        if (t2 & BN_TBIT) c2++; \
        t2=(t2+t2)&BN_MASK2; \
        if (t1 & BN_TBIT) t2++; \
        t1=(t1+t1)&BN_MASK2; \
        c0=(c0+t1)&BN_MASK2; \
        if ((c0 < t1) && (((++t2)&BN_MASK2) == 0)) c2++; \
        c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;

#  define sqr_add_c(a,i,c0,c1,c2) \
        sqr64(t1,t2,(a)[i]); \
        c0=(c0+t1)&BN_MASK2; if ((c0) < t1) t2++; \
        c1=(c1+t2)&BN_MASK2; if ((c1) < t2) c2++;

#  define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
# endif /* !BN_LLONG */
/*
 * bn_mul_comba8 - 8x8-word Comba multiplication: r[0..15] = a[0..7] *
 * b[0..7].  All partial products of one output column (constant i+j)
 * are accumulated into the rotating three-word accumulator (c1,c2,c3)
 * before the column's word is stored.
 */
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
# ifdef BN_LLONG
    BN_ULLONG t;                /* scratch used by mul_add_c() */
# else
    BN_ULONG bl, bh;            /* scratch used by mul64() */
# endif
    BN_ULONG t1, t2;
    BN_ULONG c1, c2, c3;        /* rotating three-word accumulator */

    c1 = 0;
    c2 = 0;
    c3 = 0;
    /* column 0 */
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    /* column 1 */
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    /* column 2 */
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    /* column 3 */
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    /* column 4 */
    mul_add_c(a[4], b[0], c2, c3, c1);
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    mul_add_c(a[0], b[4], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    /* column 5 */
    mul_add_c(a[0], b[5], c3, c1, c2);
    mul_add_c(a[1], b[4], c3, c1, c2);
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    mul_add_c(a[4], b[1], c3, c1, c2);
    mul_add_c(a[5], b[0], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    /* column 6 */
    mul_add_c(a[6], b[0], c1, c2, c3);
    mul_add_c(a[5], b[1], c1, c2, c3);
    mul_add_c(a[4], b[2], c1, c2, c3);
    mul_add_c(a[3], b[3], c1, c2, c3);
    mul_add_c(a[2], b[4], c1, c2, c3);
    mul_add_c(a[1], b[5], c1, c2, c3);
    mul_add_c(a[0], b[6], c1, c2, c3);
    r[6] = c1;
    c1 = 0;
    /* column 7 */
    mul_add_c(a[0], b[7], c2, c3, c1);
    mul_add_c(a[1], b[6], c2, c3, c1);
    mul_add_c(a[2], b[5], c2, c3, c1);
    mul_add_c(a[3], b[4], c2, c3, c1);
    mul_add_c(a[4], b[3], c2, c3, c1);
    mul_add_c(a[5], b[2], c2, c3, c1);
    mul_add_c(a[6], b[1], c2, c3, c1);
    mul_add_c(a[7], b[0], c2, c3, c1);
    r[7] = c2;
    c2 = 0;
    /* column 8 */
    mul_add_c(a[7], b[1], c3, c1, c2);
    mul_add_c(a[6], b[2], c3, c1, c2);
    mul_add_c(a[5], b[3], c3, c1, c2);
    mul_add_c(a[4], b[4], c3, c1, c2);
    mul_add_c(a[3], b[5], c3, c1, c2);
    mul_add_c(a[2], b[6], c3, c1, c2);
    mul_add_c(a[1], b[7], c3, c1, c2);
    r[8] = c3;
    c3 = 0;
    /* column 9 */
    mul_add_c(a[2], b[7], c1, c2, c3);
    mul_add_c(a[3], b[6], c1, c2, c3);
    mul_add_c(a[4], b[5], c1, c2, c3);
    mul_add_c(a[5], b[4], c1, c2, c3);
    mul_add_c(a[6], b[3], c1, c2, c3);
    mul_add_c(a[7], b[2], c1, c2, c3);
    r[9] = c1;
    c1 = 0;
    /* column 10 */
    mul_add_c(a[7], b[3], c2, c3, c1);
    mul_add_c(a[6], b[4], c2, c3, c1);
    mul_add_c(a[5], b[5], c2, c3, c1);
    mul_add_c(a[4], b[6], c2, c3, c1);
    mul_add_c(a[3], b[7], c2, c3, c1);
    r[10] = c2;
    c2 = 0;
    /* column 11 */
    mul_add_c(a[4], b[7], c3, c1, c2);
    mul_add_c(a[5], b[6], c3, c1, c2);
    mul_add_c(a[6], b[5], c3, c1, c2);
    mul_add_c(a[7], b[4], c3, c1, c2);
    r[11] = c3;
    c3 = 0;
    /* column 12 */
    mul_add_c(a[7], b[5], c1, c2, c3);
    mul_add_c(a[6], b[6], c1, c2, c3);
    mul_add_c(a[5], b[7], c1, c2, c3);
    r[12] = c1;
    c1 = 0;
    /* column 13 */
    mul_add_c(a[6], b[7], c2, c3, c1);
    mul_add_c(a[7], b[6], c2, c3, c1);
    r[13] = c2;
    c2 = 0;
    /* column 14 */
    mul_add_c(a[7], b[7], c3, c1, c2);
    r[14] = c3;
    r[15] = c1;                 /* top carry word */
}
/*
 * bn_mul_comba4 - 4x4-word Comba multiplication: r[0..7] = a[0..3] *
 * b[0..3].  Same column-wise accumulation scheme as bn_mul_comba8.
 */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
# ifdef BN_LLONG
    BN_ULLONG t;                /* scratch used by mul_add_c() */
# else
    BN_ULONG bl, bh;            /* scratch used by mul64() */
# endif
    BN_ULONG t1, t2;
    BN_ULONG c1, c2, c3;        /* rotating three-word accumulator */

    c1 = 0;
    c2 = 0;
    c3 = 0;
    /* column 0 */
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    /* column 1 */
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    /* column 2 */
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    /* column 3 */
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    /* column 4 */
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    /* column 5 */
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    /* column 6 */
    mul_add_c(a[3], b[3], c1, c2, c3);
    r[6] = c1;
    r[7] = c2;                  /* top carry word */
}
/*
 * bn_sqr_comba8 - Comba squaring: r[0..15] = a[0..7]^2.  Off-diagonal
 * products a[i]*a[j] (i != j) appear twice in a square, so they use
 * sqr_add_c2() (adds 2*a[i]*a[j]); diagonal terms use sqr_add_c().
 */
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
# ifdef BN_LLONG
    BN_ULLONG t, tt;            /* scratch used by sqr_add_c()/sqr_add_c2() */
# else
    BN_ULONG bl, bh;            /* scratch used by mul64() */
# endif
    BN_ULONG t1, t2;
    BN_ULONG c1, c2, c3;        /* rotating three-word accumulator */

    c1 = 0;
    c2 = 0;
    c3 = 0;
    /* column 0 */
    sqr_add_c(a, 0, c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    /* column 1 */
    sqr_add_c2(a, 1, 0, c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    /* column 2 */
    sqr_add_c(a, 1, c3, c1, c2);
    sqr_add_c2(a, 2, 0, c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    /* column 3 */
    sqr_add_c2(a, 3, 0, c1, c2, c3);
    sqr_add_c2(a, 2, 1, c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    /* column 4 */
    sqr_add_c(a, 2, c2, c3, c1);
    sqr_add_c2(a, 3, 1, c2, c3, c1);
    sqr_add_c2(a, 4, 0, c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    /* column 5 */
    sqr_add_c2(a, 5, 0, c3, c1, c2);
    sqr_add_c2(a, 4, 1, c3, c1, c2);
    sqr_add_c2(a, 3, 2, c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    /* column 6 */
    sqr_add_c(a, 3, c1, c2, c3);
    sqr_add_c2(a, 4, 2, c1, c2, c3);
    sqr_add_c2(a, 5, 1, c1, c2, c3);
    sqr_add_c2(a, 6, 0, c1, c2, c3);
    r[6] = c1;
    c1 = 0;
    /* column 7 */
    sqr_add_c2(a, 7, 0, c2, c3, c1);
    sqr_add_c2(a, 6, 1, c2, c3, c1);
    sqr_add_c2(a, 5, 2, c2, c3, c1);
    sqr_add_c2(a, 4, 3, c2, c3, c1);
    r[7] = c2;
    c2 = 0;
    /* column 8 */
    sqr_add_c(a, 4, c3, c1, c2);
    sqr_add_c2(a, 5, 3, c3, c1, c2);
    sqr_add_c2(a, 6, 2, c3, c1, c2);
    sqr_add_c2(a, 7, 1, c3, c1, c2);
    r[8] = c3;
    c3 = 0;
    /* column 9 */
    sqr_add_c2(a, 7, 2, c1, c2, c3);
    sqr_add_c2(a, 6, 3, c1, c2, c3);
    sqr_add_c2(a, 5, 4, c1, c2, c3);
    r[9] = c1;
    c1 = 0;
    /* column 10 */
    sqr_add_c(a, 5, c2, c3, c1);
    sqr_add_c2(a, 6, 4, c2, c3, c1);
    sqr_add_c2(a, 7, 3, c2, c3, c1);
    r[10] = c2;
    c2 = 0;
    /* column 11 */
    sqr_add_c2(a, 7, 4, c3, c1, c2);
    sqr_add_c2(a, 6, 5, c3, c1, c2);
    r[11] = c3;
    c3 = 0;
    /* column 12 */
    sqr_add_c(a, 6, c1, c2, c3);
    sqr_add_c2(a, 7, 5, c1, c2, c3);
    r[12] = c1;
    c1 = 0;
    /* column 13 */
    sqr_add_c2(a, 7, 6, c2, c3, c1);
    r[13] = c2;
    c2 = 0;
    /* column 14 */
    sqr_add_c(a, 7, c3, c1, c2);
    r[14] = c3;
    r[15] = c1;                 /* top carry word */
}
/*
 * bn_sqr_comba4 - Comba squaring: r[0..7] = a[0..3]^2.  Diagonal terms
 * use sqr_add_c(); doubled cross terms use sqr_add_c2().
 */
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
# ifdef BN_LLONG
    BN_ULLONG t, tt;            /* scratch used by sqr_add_c()/sqr_add_c2() */
# else
    BN_ULONG bl, bh;            /* scratch used by mul64() */
# endif
    BN_ULONG t1, t2;
    BN_ULONG c1, c2, c3;        /* rotating three-word accumulator */

    c1 = 0;
    c2 = 0;
    c3 = 0;
    /* column 0 */
    sqr_add_c(a, 0, c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    /* column 1 */
    sqr_add_c2(a, 1, 0, c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    /* column 2 */
    sqr_add_c(a, 1, c3, c1, c2);
    sqr_add_c2(a, 2, 0, c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    /* column 3 */
    sqr_add_c2(a, 3, 0, c1, c2, c3);
    sqr_add_c2(a, 2, 1, c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    /* column 4 */
    sqr_add_c(a, 2, c2, c3, c1);
    sqr_add_c2(a, 3, 1, c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    /* column 5 */
    sqr_add_c2(a, 3, 2, c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    /* column 6 */
    sqr_add_c(a, 3, c1, c2, c3);
    r[6] = c1;
    r[7] = c2;                  /* top carry word */
}
# ifdef OPENSSL_NO_ASM
#  ifdef OPENSSL_BN_ASM_MONT
#   include <alloca.h>
/*
 * This is essentially reference implementation, which may or may not
 * result in performance improvement. E.g. on IA-32 this routine was
 * observed to give 40% faster rsa1024 private key operations and 10%
 * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
 * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a
 * reference implementation, one to be used as starting point for
 * platform-specific assembler. Mentioned numbers apply to compiler
 * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
 * can vary not only from platform to platform, but even for compiler
 * versions. Assembler vs. assembler improvement coefficients can
 * [and are known to] differ and are to be documented elsewhere.
 */
/*
 * Word-serial Montgomery multiplication: rp = ap * bp * R^-1 mod np,
 * where R = (2^BN_BITS2)^num and *n0p = -np^-1 mod 2^BN_BITS2.
 * Scratch tp lives on the stack (alloca); vp is a volatile alias used
 * to wipe it before returning so the zeroing is not optimized away.
 */
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0p, int num)
{
    BN_ULONG c0, c1, ml, *tp, n0;
#   ifdef mul64
    BN_ULONG mh;                /* high half of ml for the mul64 primitives */
#   endif
    volatile BN_ULONG *vp;
    int i = 0, j;

#   if 0                        /* template for platform-specific implementation */
    if (ap == bp)
        return bn_sqr_mont(rp, ap, np, n0p, num);
#   endif
    vp = tp = alloca((num + 2) * sizeof(BN_ULONG));

    n0 = *n0p;

    /* first outer iteration: plain mul (tp is uninitialised) ... */
    c0 = 0;
    ml = bp[0];
#   ifdef mul64
    mh = HBITS(ml);
    ml = LBITS(ml);
    for (j = 0; j < num; ++j)
        mul(tp[j], ap[j], ml, mh, c0);
#   else
    for (j = 0; j < num; ++j)
        mul(tp[j], ap[j], ml, c0);
#   endif

    tp[num] = c0;
    tp[num + 1] = 0;
    goto enter;                 /* ... then join the loop at the reduction step */

    for (i = 0; i < num; i++) {
        c0 = 0;
        ml = bp[i];
#   ifdef mul64
        mh = HBITS(ml);
        ml = LBITS(ml);
        for (j = 0; j < num; ++j)
            mul_add(tp[j], ap[j], ml, mh, c0);
#   else
        for (j = 0; j < num; ++j)
            mul_add(tp[j], ap[j], ml, c0);
#   endif
        c1 = (tp[num] + c0) & BN_MASK2;
        tp[num] = c1;
        tp[num + 1] = (c1 < c0 ? 1 : 0);
 enter:
        c1 = tp[0];
        ml = (c1 * n0) & BN_MASK2;  /* Montgomery factor for this round */
        c0 = 0;
#   ifdef mul64
        mh = HBITS(ml);
        ml = LBITS(ml);
        mul_add(c1, np[0], ml, mh, c0);
#   else
        mul_add(c1, ml, np[0], c0);
#   endif
        /* add ml*np and shift tp right by one word in the same pass */
        for (j = 1; j < num; j++) {
            c1 = tp[j];
#   ifdef mul64
            mul_add(c1, np[j], ml, mh, c0);
#   else
            mul_add(c1, ml, np[j], c0);
#   endif
            tp[j - 1] = c1 & BN_MASK2;
        }
        c1 = (tp[num] + c0) & BN_MASK2;
        tp[num - 1] = c1;
        tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
    }

    /* conditional final subtraction of the modulus */
    if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
        c0 = bn_sub_words(rp, tp, np, num);
        if (tp[num] != 0 || c0 == 0) {
            for (i = 0; i < num + 2; i++)
                vp[i] = 0;      /* wipe scratch before returning */
            return 1;
        }
    }
    for (i = 0; i < num; i++)
        rp[i] = tp[i], vp[i] = 0;   /* copy out and wipe in one pass */
    vp[num] = 0;
    vp[num + 1] = 0;
    return 1;
}
  911. # else
  912. /*
  913. * Return value of 0 indicates that multiplication/convolution was not
  914. * performed to signal the caller to fall down to alternative/original
  915. * code-path.
  916. */
  917. int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
  918. const BN_ULONG *np, const BN_ULONG *n0, int num)
  919. {
  920. return 0;
  921. }
  922. # endif /* OPENSSL_BN_ASM_MONT */
  923. # endif
  924. #else /* !BN_MUL_COMBA */
  925. /* hmm... is it faster just to do a multiply? */
  926. # undef bn_sqr_comba4
/*
 * Fallback bn_sqr_comba4: square 4 words via the generic routine,
 * which needs a 2*4-word temporary work area.
 */
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG t[8];
    bn_sqr_normal(r, a, 4, t);
}
  932. # undef bn_sqr_comba8
/*
 * Fallback bn_sqr_comba8: square 8 words via the generic routine,
 * which needs a 2*8-word temporary work area.
 */
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG t[16];
    bn_sqr_normal(r, a, 8, t);
}
  938. void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
  939. {
  940. r[4] = bn_mul_words(&(r[0]), a, 4, b[0]);
  941. r[5] = bn_mul_add_words(&(r[1]), a, 4, b[1]);
  942. r[6] = bn_mul_add_words(&(r[2]), a, 4, b[2]);
  943. r[7] = bn_mul_add_words(&(r[3]), a, 4, b[3]);
  944. }
  945. void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
  946. {
  947. r[8] = bn_mul_words(&(r[0]), a, 8, b[0]);
  948. r[9] = bn_mul_add_words(&(r[1]), a, 8, b[1]);
  949. r[10] = bn_mul_add_words(&(r[2]), a, 8, b[2]);
  950. r[11] = bn_mul_add_words(&(r[3]), a, 8, b[3]);
  951. r[12] = bn_mul_add_words(&(r[4]), a, 8, b[4]);
  952. r[13] = bn_mul_add_words(&(r[5]), a, 8, b[5]);
  953. r[14] = bn_mul_add_words(&(r[6]), a, 8, b[6]);
  954. r[15] = bn_mul_add_words(&(r[7]), a, 8, b[7]);
  955. }
  956. # ifdef OPENSSL_NO_ASM
  957. # ifdef OPENSSL_BN_ASM_MONT
  958. # include <alloca.h>
/*
 * Montgomery multiplication built on the word routines:
 * rp = ap * bp * R^-1 mod np with R = (2^BN_BITS2)^num and
 * *n0p = -np^-1 mod 2^BN_BITS2.  Scratch tp is alloca()d; vp is a
 * volatile alias used to wipe it so the zeroing is not optimized away.
 */
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0p, int num)
{
    BN_ULONG c0, c1, *tp, n0 = *n0p;
    volatile BN_ULONG *vp;
    int i = 0, j;

    vp = tp = alloca((num + 2) * sizeof(BN_ULONG));

    for (i = 0; i <= num; i++)
        tp[i] = 0;

    for (i = 0; i < num; i++) {
        /* tp += ap * bp[i] */
        c0 = bn_mul_add_words(tp, ap, num, bp[i]);
        c1 = (tp[num] + c0) & BN_MASK2;
        tp[num] = c1;
        tp[num + 1] = (c1 < c0 ? 1 : 0);

        /* reduce: tp += np * (tp[0] * n0), making tp[0] zero ... */
        c0 = bn_mul_add_words(tp, np, num, tp[0] * n0);
        c1 = (tp[num] + c0) & BN_MASK2;
        tp[num] = c1;
        tp[num + 1] += (c1 < c0 ? 1 : 0);
        /* ... then shift tp down one word */
        for (j = 0; j <= num; j++)
            tp[j] = tp[j + 1];
    }

    /* conditional final subtraction of the modulus */
    if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
        c0 = bn_sub_words(rp, tp, np, num);
        if (tp[num] != 0 || c0 == 0) {
            for (i = 0; i < num + 2; i++)
                vp[i] = 0;      /* wipe scratch before returning */
            return 1;
        }
    }
    for (i = 0; i < num; i++)
        rp[i] = tp[i], vp[i] = 0;   /* copy out and wipe in one pass */
    vp[num] = 0;
    vp[num + 1] = 0;
    return 1;
}
  994. # else
  995. int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
  996. const BN_ULONG *np, const BN_ULONG *n0, int num)
  997. {
  998. return 0;
  999. }
  1000. # endif /* OPENSSL_BN_ASM_MONT */
  1001. # endif
  1002. #endif /* !BN_MUL_COMBA */