ecp_nistz256.c 50 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521
  1. /******************************************************************************
  2. * *
  3. * Copyright 2014 Intel Corporation *
  4. * *
  5. * Licensed under the Apache License, Version 2.0 (the "License"); *
  6. * you may not use this file except in compliance with the License. *
  7. * You may obtain a copy of the License at *
  8. * *
  9. * http://www.apache.org/licenses/LICENSE-2.0 *
  10. * *
  11. * Unless required by applicable law or agreed to in writing, software *
  12. * distributed under the License is distributed on an "AS IS" BASIS, *
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
  14. * See the License for the specific language governing permissions and *
  15. * limitations under the License. *
  16. * *
  17. ******************************************************************************
  18. * *
  19. * Developers and authors: *
  20. * Shay Gueron (1, 2), and Vlad Krasnov (1) *
  21. * (1) Intel Corporation, Israel Development Center *
  22. * (2) University of Haifa *
  23. * Reference: *
  24. * S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with *
  25. * 256 Bit Primes" *
  26. * *
  27. ******************************************************************************/
  28. #include <string.h>
  29. #include <openssl/bn.h>
  30. #include <openssl/err.h>
  31. #include <openssl/ec.h>
  32. #include "cryptlib.h"
  33. #include "ec_lcl.h"
  34. #if BN_BITS2 != 64
  35. # define TOBN(hi,lo) lo,hi
  36. #else
  37. # define TOBN(hi,lo) ((BN_ULONG)hi<<32|lo)
  38. #endif
  39. #if defined(__GNUC__)
  40. # define ALIGN32 __attribute((aligned(32)))
  41. #elif defined(_MSC_VER)
  42. # define ALIGN32 __declspec(align(32))
  43. #else
  44. # define ALIGN32
  45. #endif
  46. #define ALIGNPTR(p,N) ((unsigned char *)p+N-(size_t)p%N)
  47. #define P256_LIMBS (256/BN_BITS2)
typedef unsigned short u16;

/*
 * A P-256 point in Jacobian projective coordinates.  Coordinates are held
 * in the Montgomery domain (see ecp_nistz256_to_mont/from_mont above).
 * The point at infinity is encoded as X = Y = 0, which is not on the curve.
 */
typedef struct {
    BN_ULONG X[P256_LIMBS];
    BN_ULONG Y[P256_LIMBS];
    BN_ULONG Z[P256_LIMBS];
} P256_POINT;

/* A P-256 point in affine coordinates (Montgomery domain). */
typedef struct {
    BN_ULONG X[P256_LIMBS];
    BN_ULONG Y[P256_LIMBS];
} P256_POINT_AFFINE;

/* One row of the precomputed generator table: 64 affine points. */
typedef P256_POINT_AFFINE PRECOMP256_ROW[64];

/* structure for precomputed multiples of the generator */
typedef struct ec_pre_comp_st {
    const EC_GROUP *group;      /* Parent EC_GROUP object */
    size_t w;                   /* Window size */
    /*
     * Constant time access to the X and Y coordinates of the pre-computed,
     * generator multiplies, in the Montgomery domain. Pre-calculated
     * multiplies are stored in affine form.
     */
    PRECOMP256_ROW *precomp;
    void *precomp_storage;      /* raw allocation backing |precomp| (owned) */
    int references;
} EC_PRE_COMP;
  72. /* Functions implemented in assembly */
  73. /* Modular mul by 2: res = 2*a mod P */
  74. void ecp_nistz256_mul_by_2(BN_ULONG res[P256_LIMBS],
  75. const BN_ULONG a[P256_LIMBS]);
  76. /* Modular div by 2: res = a/2 mod P */
  77. void ecp_nistz256_div_by_2(BN_ULONG res[P256_LIMBS],
  78. const BN_ULONG a[P256_LIMBS]);
  79. /* Modular mul by 3: res = 3*a mod P */
  80. void ecp_nistz256_mul_by_3(BN_ULONG res[P256_LIMBS],
  81. const BN_ULONG a[P256_LIMBS]);
  82. /* Modular add: res = a+b mod P */
  83. void ecp_nistz256_add(BN_ULONG res[P256_LIMBS],
  84. const BN_ULONG a[P256_LIMBS],
  85. const BN_ULONG b[P256_LIMBS]);
  86. /* Modular sub: res = a-b mod P */
  87. void ecp_nistz256_sub(BN_ULONG res[P256_LIMBS],
  88. const BN_ULONG a[P256_LIMBS],
  89. const BN_ULONG b[P256_LIMBS]);
  90. /* Modular neg: res = -a mod P */
  91. void ecp_nistz256_neg(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS]);
  92. /* Montgomery mul: res = a*b*2^-256 mod P */
  93. void ecp_nistz256_mul_mont(BN_ULONG res[P256_LIMBS],
  94. const BN_ULONG a[P256_LIMBS],
  95. const BN_ULONG b[P256_LIMBS]);
  96. /* Montgomery sqr: res = a*a*2^-256 mod P */
  97. void ecp_nistz256_sqr_mont(BN_ULONG res[P256_LIMBS],
  98. const BN_ULONG a[P256_LIMBS]);
  99. /* Convert a number from Montgomery domain, by multiplying with 1 */
  100. void ecp_nistz256_from_mont(BN_ULONG res[P256_LIMBS],
  101. const BN_ULONG in[P256_LIMBS]);
  102. /* Convert a number to Montgomery domain, by multiplying with 2^512 mod P*/
  103. void ecp_nistz256_to_mont(BN_ULONG res[P256_LIMBS],
  104. const BN_ULONG in[P256_LIMBS]);
  105. /* Functions that perform constant time access to the precomputed tables */
  106. void ecp_nistz256_select_w5(P256_POINT * val,
  107. const P256_POINT * in_t, int index);
  108. void ecp_nistz256_select_w7(P256_POINT_AFFINE * val,
  109. const P256_POINT_AFFINE * in_t, int index);
/*
 * One converted into the Montgomery domain, i.e. 2^256 mod P
 * (R = 2^256 here, since ecp_nistz256_to_mont multiplies by 2^512 mod P).
 */
static const BN_ULONG ONE[P256_LIMBS] = {
    TOBN(0x00000000, 0x00000001), TOBN(0xffffffff, 0x00000000),
    TOBN(0xffffffff, 0xffffffff), TOBN(0x00000000, 0xfffffffe)
};
  115. static void *ecp_nistz256_pre_comp_dup(void *);
  116. static void ecp_nistz256_pre_comp_free(void *);
  117. static void ecp_nistz256_pre_comp_clear_free(void *);
  118. static EC_PRE_COMP *ecp_nistz256_pre_comp_new(const EC_GROUP *group);
  119. /* Precomputed tables for the default generator */
  120. #include "ecp_nistz256_table.c"
  121. /* Recode window to a signed digit, see ecp_nistputil.c for details */
  122. static unsigned int _booth_recode_w5(unsigned int in)
  123. {
  124. unsigned int s, d;
  125. s = ~((in >> 5) - 1);
  126. d = (1 << 6) - in - 1;
  127. d = (d & s) | (in & ~s);
  128. d = (d >> 1) + (d & 1);
  129. return (d << 1) + (s & 1);
  130. }
  131. static unsigned int _booth_recode_w7(unsigned int in)
  132. {
  133. unsigned int s, d;
  134. s = ~((in >> 7) - 1);
  135. d = (1 << 8) - in - 1;
  136. d = (d & s) | (in & ~s);
  137. d = (d >> 1) + (d & 1);
  138. return (d << 1) + (s & 1);
  139. }
  140. static void copy_conditional(BN_ULONG dst[P256_LIMBS],
  141. const BN_ULONG src[P256_LIMBS], BN_ULONG move)
  142. {
  143. BN_ULONG mask1 = -move;
  144. BN_ULONG mask2 = ~mask1;
  145. dst[0] = (src[0] & mask1) ^ (dst[0] & mask2);
  146. dst[1] = (src[1] & mask1) ^ (dst[1] & mask2);
  147. dst[2] = (src[2] & mask1) ^ (dst[2] & mask2);
  148. dst[3] = (src[3] & mask1) ^ (dst[3] & mask2);
  149. if (P256_LIMBS == 8) {
  150. dst[4] = (src[4] & mask1) ^ (dst[4] & mask2);
  151. dst[5] = (src[5] & mask1) ^ (dst[5] & mask2);
  152. dst[6] = (src[6] & mask1) ^ (dst[6] & mask2);
  153. dst[7] = (src[7] & mask1) ^ (dst[7] & mask2);
  154. }
  155. }
  156. static BN_ULONG is_zero(BN_ULONG in)
  157. {
  158. in |= (0 - in);
  159. in = ~in;
  160. in &= BN_MASK2;
  161. in >>= BN_BITS2 - 1;
  162. return in;
  163. }
  164. static BN_ULONG is_equal(const BN_ULONG a[P256_LIMBS],
  165. const BN_ULONG b[P256_LIMBS])
  166. {
  167. BN_ULONG res;
  168. res = a[0] ^ b[0];
  169. res |= a[1] ^ b[1];
  170. res |= a[2] ^ b[2];
  171. res |= a[3] ^ b[3];
  172. if (P256_LIMBS == 8) {
  173. res |= a[4] ^ b[4];
  174. res |= a[5] ^ b[5];
  175. res |= a[6] ^ b[6];
  176. res |= a[7] ^ b[7];
  177. }
  178. return is_zero(res);
  179. }
  180. static BN_ULONG is_one(const BN_ULONG a[P256_LIMBS])
  181. {
  182. BN_ULONG res;
  183. res = a[0] ^ ONE[0];
  184. res |= a[1] ^ ONE[1];
  185. res |= a[2] ^ ONE[2];
  186. res |= a[3] ^ ONE[3];
  187. if (P256_LIMBS == 8) {
  188. res |= a[4] ^ ONE[4];
  189. res |= a[5] ^ ONE[5];
  190. res |= a[6] ^ ONE[6];
  191. }
  192. return is_zero(res);
  193. }
/*
 * Set BIGNUM |a| from the little-endian P256_LIMBS-word array |words|.
 * Returns 1 on success, 0 on allocation failure (an ECerr is raised).
 */
static int ecp_nistz256_set_words(BIGNUM *a, BN_ULONG words[P256_LIMBS])
{
    if (bn_wexpand(a, P256_LIMBS) == NULL) {
        ECerr(EC_F_ECP_NISTZ256_SET_WORDS, ERR_R_MALLOC_FAILURE);
        return 0;
    }
    memcpy(a->d, words, sizeof(BN_ULONG) * P256_LIMBS);
    a->top = P256_LIMBS;
    /* Strip high zero words so |a| is in canonical BIGNUM form. */
    bn_correct_top(a);
    return 1;
}
  205. #ifndef ECP_NISTZ256_REFERENCE_IMPLEMENTATION
  206. void ecp_nistz256_point_double(P256_POINT *r, const P256_POINT *a);
  207. void ecp_nistz256_point_add(P256_POINT *r,
  208. const P256_POINT *a, const P256_POINT *b);
  209. void ecp_nistz256_point_add_affine(P256_POINT *r,
  210. const P256_POINT *a,
  211. const P256_POINT_AFFINE *b);
  212. #else
/* Point double: r = 2*a */
static void ecp_nistz256_point_double(P256_POINT *r, const P256_POINT *a)
{
    BN_ULONG S[P256_LIMBS];
    BN_ULONG M[P256_LIMBS];
    BN_ULONG Zsqr[P256_LIMBS];
    BN_ULONG tmp0[P256_LIMBS];

    const BN_ULONG *in_x = a->X;
    const BN_ULONG *in_y = a->Y;
    const BN_ULONG *in_z = a->Z;

    BN_ULONG *res_x = r->X;
    BN_ULONG *res_y = r->Y;
    BN_ULONG *res_z = r->Z;

    /*
     * Standard Jacobian doubling for a = -3 curves; all arithmetic is in
     * the Montgomery domain.  The statement order matters: each output
     * coordinate is only written after the last read of the input it may
     * alias, which appears to make r == a (in-place doubling) safe.
     */
    ecp_nistz256_mul_by_2(S, in_y);             /* S = 2*Y */
    ecp_nistz256_sqr_mont(Zsqr, in_z);          /* Zsqr = Z^2 */
    ecp_nistz256_sqr_mont(S, S);                /* S = 4*Y^2 */

    ecp_nistz256_mul_mont(res_z, in_z, in_y);   /* Z3 = Y*Z */
    ecp_nistz256_mul_by_2(res_z, res_z);        /* Z3 = 2*Y*Z */

    ecp_nistz256_add(M, in_x, Zsqr);            /* M = X + Z^2 */
    ecp_nistz256_sub(Zsqr, in_x, Zsqr);         /* Zsqr = X - Z^2 */

    ecp_nistz256_sqr_mont(res_y, S);            /* res_y = 16*Y^4 */
    ecp_nistz256_div_by_2(res_y, res_y);        /* res_y = 8*Y^4 */

    ecp_nistz256_mul_mont(M, M, Zsqr);          /* M = (X+Z^2)*(X-Z^2) */
    ecp_nistz256_mul_by_3(M, M);                /* M = 3*(X^2 - Z^4) */

    ecp_nistz256_mul_mont(S, S, in_x);          /* S = 4*X*Y^2 */
    ecp_nistz256_mul_by_2(tmp0, S);             /* tmp0 = 8*X*Y^2 */

    ecp_nistz256_sqr_mont(res_x, M);            /* X3 = M^2 */
    ecp_nistz256_sub(res_x, res_x, tmp0);       /* X3 = M^2 - 2*S */

    ecp_nistz256_sub(S, S, res_x);              /* S = S - X3 */
    ecp_nistz256_mul_mont(S, S, M);             /* S = M*(S - X3) */
    ecp_nistz256_sub(res_y, S, res_y);          /* Y3 = M*(S-X3) - 8*Y^4 */
}
/*
 * Point addition: r = a+b (general Jacobian addition in the Montgomery
 * domain).  Constant-time except for the explicitly commented branch that
 * handles a == b / a == -b, which cannot occur during sign/ecdh.
 */
static void ecp_nistz256_point_add(P256_POINT *r,
                                   const P256_POINT *a, const P256_POINT *b)
{
    BN_ULONG U2[P256_LIMBS], S2[P256_LIMBS];
    BN_ULONG U1[P256_LIMBS], S1[P256_LIMBS];
    BN_ULONG Z1sqr[P256_LIMBS];
    BN_ULONG Z2sqr[P256_LIMBS];
    BN_ULONG H[P256_LIMBS], R[P256_LIMBS];
    BN_ULONG Hsqr[P256_LIMBS];
    BN_ULONG Rsqr[P256_LIMBS];
    BN_ULONG Hcub[P256_LIMBS];

    BN_ULONG res_x[P256_LIMBS];
    BN_ULONG res_y[P256_LIMBS];
    BN_ULONG res_z[P256_LIMBS];
    BN_ULONG in1infty, in2infty;

    const BN_ULONG *in1_x = a->X;
    const BN_ULONG *in1_y = a->Y;
    const BN_ULONG *in1_z = a->Z;

    const BN_ULONG *in2_x = b->X;
    const BN_ULONG *in2_y = b->Y;
    const BN_ULONG *in2_z = b->Z;

    /* We encode infinity as (0,0), which is not on the curve,
     * so it is OK. */
    in1infty = (in1_x[0] | in1_x[1] | in1_x[2] | in1_x[3] |
                in1_y[0] | in1_y[1] | in1_y[2] | in1_y[3]);
    if (P256_LIMBS == 8)
        in1infty |= (in1_x[4] | in1_x[5] | in1_x[6] | in1_x[7] |
                     in1_y[4] | in1_y[5] | in1_y[6] | in1_y[7]);

    in2infty = (in2_x[0] | in2_x[1] | in2_x[2] | in2_x[3] |
                in2_y[0] | in2_y[1] | in2_y[2] | in2_y[3]);
    if (P256_LIMBS == 8)
        in2infty |= (in2_x[4] | in2_x[5] | in2_x[6] | in2_x[7] |
                     in2_y[4] | in2_y[5] | in2_y[6] | in2_y[7]);

    /* After is_zero: 1 if the corresponding input is infinity, else 0. */
    in1infty = is_zero(in1infty);
    in2infty = is_zero(in2infty);

    ecp_nistz256_sqr_mont(Z2sqr, in2_z);        /* Z2^2 */
    ecp_nistz256_sqr_mont(Z1sqr, in1_z);        /* Z1^2 */

    ecp_nistz256_mul_mont(S1, Z2sqr, in2_z);    /* S1 = Z2^3 */
    ecp_nistz256_mul_mont(S2, Z1sqr, in1_z);    /* S2 = Z1^3 */

    ecp_nistz256_mul_mont(S1, S1, in1_y);       /* S1 = Y1*Z2^3 */
    ecp_nistz256_mul_mont(S2, S2, in2_y);       /* S2 = Y2*Z1^3 */
    ecp_nistz256_sub(R, S2, S1);                /* R = S2 - S1 */

    ecp_nistz256_mul_mont(U1, in1_x, Z2sqr);    /* U1 = X1*Z2^2 */
    ecp_nistz256_mul_mont(U2, in2_x, Z1sqr);    /* U2 = X2*Z1^2 */
    ecp_nistz256_sub(H, U2, U1);                /* H = U2 - U1 */

    /*
     * This should not happen during sign/ecdh, so no constant time violation
     */
    if (is_equal(U1, U2) && !in1infty && !in2infty) {
        if (is_equal(S1, S2)) {
            /* a == b: the addition formula degenerates; double instead. */
            ecp_nistz256_point_double(r, a);
            return;
        } else {
            /* a == -b: the sum is the point at infinity, encoded as zeros. */
            memset(r, 0, sizeof(*r));
            return;
        }
    }

    ecp_nistz256_sqr_mont(Rsqr, R);             /* R^2 */
    ecp_nistz256_mul_mont(res_z, H, in1_z);     /* Z3 = H*Z1*Z2 */
    ecp_nistz256_sqr_mont(Hsqr, H);             /* H^2 */
    ecp_nistz256_mul_mont(res_z, res_z, in2_z); /* Z3 = H*Z1*Z2 */
    ecp_nistz256_mul_mont(Hcub, Hsqr, H);       /* H^3 */

    ecp_nistz256_mul_mont(U2, U1, Hsqr);        /* U1*H^2 */
    ecp_nistz256_mul_by_2(Hsqr, U2);            /* 2*U1*H^2 */

    ecp_nistz256_sub(res_x, Rsqr, Hsqr);        /* X3 = R^2 - 2*U1*H^2 ... */
    ecp_nistz256_sub(res_x, res_x, Hcub);       /* ... - H^3 */

    ecp_nistz256_sub(res_y, U2, res_x);         /* Y3 = U1*H^2 - X3 ... */

    ecp_nistz256_mul_mont(S2, S1, Hcub);        /* S1*H^3 */
    ecp_nistz256_mul_mont(res_y, R, res_y);     /* ... * R ... */
    ecp_nistz256_sub(res_y, res_y, S2);         /* ... - S1*H^3 */

    /* Constant-time selection of the infinity special cases. */
    copy_conditional(res_x, in2_x, in1infty);   /* a == inf: result is b */
    copy_conditional(res_y, in2_y, in1infty);
    copy_conditional(res_z, in2_z, in1infty);

    copy_conditional(res_x, in1_x, in2infty);   /* b == inf: result is a */
    copy_conditional(res_y, in1_y, in2infty);
    copy_conditional(res_z, in1_z, in2infty);

    memcpy(r->X, res_x, sizeof(res_x));
    memcpy(r->Y, res_y, sizeof(res_y));
    memcpy(r->Z, res_z, sizeof(res_z));
}
/*
 * Point addition when b is known to be affine: r = a+b.
 * Mixed Jacobian+affine addition (Z2 implicitly == 1), saving several
 * multiplications versus the general formula.  Constant time.
 * NOTE(review): unlike ecp_nistz256_point_add there is no a == b special
 * case here; callers presumably guarantee a != +-b — confirm at call sites.
 */
static void ecp_nistz256_point_add_affine(P256_POINT *r,
                                          const P256_POINT *a,
                                          const P256_POINT_AFFINE *b)
{
    BN_ULONG U2[P256_LIMBS], S2[P256_LIMBS];
    BN_ULONG Z1sqr[P256_LIMBS];
    BN_ULONG H[P256_LIMBS], R[P256_LIMBS];
    BN_ULONG Hsqr[P256_LIMBS];
    BN_ULONG Rsqr[P256_LIMBS];
    BN_ULONG Hcub[P256_LIMBS];

    BN_ULONG res_x[P256_LIMBS];
    BN_ULONG res_y[P256_LIMBS];
    BN_ULONG res_z[P256_LIMBS];
    BN_ULONG in1infty, in2infty;

    const BN_ULONG *in1_x = a->X;
    const BN_ULONG *in1_y = a->Y;
    const BN_ULONG *in1_z = a->Z;

    const BN_ULONG *in2_x = b->X;
    const BN_ULONG *in2_y = b->Y;

    /*
     * In affine representation we encode infty as (0,0), which is not on the
     * curve, so it is OK
     */
    in1infty = (in1_x[0] | in1_x[1] | in1_x[2] | in1_x[3] |
                in1_y[0] | in1_y[1] | in1_y[2] | in1_y[3]);
    if (P256_LIMBS == 8)
        in1infty |= (in1_x[4] | in1_x[5] | in1_x[6] | in1_x[7] |
                     in1_y[4] | in1_y[5] | in1_y[6] | in1_y[7]);

    in2infty = (in2_x[0] | in2_x[1] | in2_x[2] | in2_x[3] |
                in2_y[0] | in2_y[1] | in2_y[2] | in2_y[3]);
    if (P256_LIMBS == 8)
        in2infty |= (in2_x[4] | in2_x[5] | in2_x[6] | in2_x[7] |
                     in2_y[4] | in2_y[5] | in2_y[6] | in2_y[7]);

    /* After is_zero: 1 if the corresponding input is infinity, else 0. */
    in1infty = is_zero(in1infty);
    in2infty = is_zero(in2infty);

    ecp_nistz256_sqr_mont(Z1sqr, in1_z);        /* Z1^2 */

    ecp_nistz256_mul_mont(U2, in2_x, Z1sqr);    /* U2 = X2*Z1^2 */
    ecp_nistz256_sub(H, U2, in1_x);             /* H = U2 - U1 */

    ecp_nistz256_mul_mont(S2, Z1sqr, in1_z);    /* S2 = Z1^3 */

    ecp_nistz256_mul_mont(res_z, H, in1_z);     /* Z3 = H*Z1*Z2 */

    ecp_nistz256_mul_mont(S2, S2, in2_y);       /* S2 = Y2*Z1^3 */
    ecp_nistz256_sub(R, S2, in1_y);             /* R = S2 - S1 */

    ecp_nistz256_sqr_mont(Hsqr, H);             /* H^2 */
    ecp_nistz256_sqr_mont(Rsqr, R);             /* R^2 */
    ecp_nistz256_mul_mont(Hcub, Hsqr, H);       /* H^3 */

    ecp_nistz256_mul_mont(U2, in1_x, Hsqr);     /* U1*H^2 */
    ecp_nistz256_mul_by_2(Hsqr, U2);            /* 2*U1*H^2 */

    ecp_nistz256_sub(res_x, Rsqr, Hsqr);        /* X3 = R^2 - 2*U1*H^2 ... */
    ecp_nistz256_sub(res_x, res_x, Hcub);       /* ... - H^3 */
    ecp_nistz256_sub(H, U2, res_x);             /* H = U1*H^2 - X3 */

    ecp_nistz256_mul_mont(S2, in1_y, Hcub);     /* S1*H^3 */
    ecp_nistz256_mul_mont(H, H, R);             /* R*(U1*H^2 - X3) */
    ecp_nistz256_sub(res_y, H, S2);             /* Y3 */

    /* Constant-time selection of the infinity special cases. */
    copy_conditional(res_x, in2_x, in1infty);   /* a == inf: result is b */
    copy_conditional(res_x, in1_x, in2infty);   /* b == inf: result is a */

    copy_conditional(res_y, in2_y, in1infty);
    copy_conditional(res_y, in1_y, in2infty);

    copy_conditional(res_z, ONE, in1infty);     /* affine b has Z == 1 */
    copy_conditional(res_z, in1_z, in2infty);

    memcpy(r->X, res_x, sizeof(res_x));
    memcpy(r->Y, res_y, sizeof(res_y));
    memcpy(r->Z, res_z, sizeof(res_z));
}
  390. #endif
/*
 * r = in^-1 mod p, via Fermat's little theorem: in^(p-2) mod p.
 * Input and output are in the Montgomery domain.  The fixed addition chain
 * below builds partial exponents p2 = in^(2^2-1), p4 = in^(2^4-1),
 * p8 = in^(2^8-1), p16 = in^(2^16-1), p32 = in^(2^32-1) and then assembles
 * the exponent p-2 = ffffffff 00000001 00000000 00000000 00000000 ffffffff
 * ffffffff fffffffd.  Constant time (fixed sequence of operations).
 */
static void ecp_nistz256_mod_inverse(BN_ULONG r[P256_LIMBS],
                                     const BN_ULONG in[P256_LIMBS])
{
    /*
     * The poly is ffffffff 00000001 00000000 00000000 00000000 ffffffff
     * ffffffff ffffffff We use FLT and used poly-2 as exponent
     */
    BN_ULONG p2[P256_LIMBS];
    BN_ULONG p4[P256_LIMBS];
    BN_ULONG p8[P256_LIMBS];
    BN_ULONG p16[P256_LIMBS];
    BN_ULONG p32[P256_LIMBS];
    BN_ULONG res[P256_LIMBS];
    int i;

    ecp_nistz256_sqr_mont(res, in);
    ecp_nistz256_mul_mont(p2, res, in);         /* 3*p */

    ecp_nistz256_sqr_mont(res, p2);
    ecp_nistz256_sqr_mont(res, res);
    ecp_nistz256_mul_mont(p4, res, p2);         /* f*p */

    ecp_nistz256_sqr_mont(res, p4);
    ecp_nistz256_sqr_mont(res, res);
    ecp_nistz256_sqr_mont(res, res);
    ecp_nistz256_sqr_mont(res, res);
    ecp_nistz256_mul_mont(p8, res, p4);         /* ff*p */

    ecp_nistz256_sqr_mont(res, p8);
    for (i = 0; i < 7; i++)
        ecp_nistz256_sqr_mont(res, res);
    ecp_nistz256_mul_mont(p16, res, p8);        /* ffff*p */

    ecp_nistz256_sqr_mont(res, p16);
    for (i = 0; i < 15; i++)
        ecp_nistz256_sqr_mont(res, res);
    ecp_nistz256_mul_mont(p32, res, p16);       /* ffffffff*p */

    /* Assemble the full exponent from the partial products above. */
    ecp_nistz256_sqr_mont(res, p32);
    for (i = 0; i < 31; i++)
        ecp_nistz256_sqr_mont(res, res);
    ecp_nistz256_mul_mont(res, res, in);

    for (i = 0; i < 32 * 4; i++)
        ecp_nistz256_sqr_mont(res, res);
    ecp_nistz256_mul_mont(res, res, p32);

    for (i = 0; i < 32; i++)
        ecp_nistz256_sqr_mont(res, res);
    ecp_nistz256_mul_mont(res, res, p32);

    for (i = 0; i < 16; i++)
        ecp_nistz256_sqr_mont(res, res);
    ecp_nistz256_mul_mont(res, res, p16);

    for (i = 0; i < 8; i++)
        ecp_nistz256_sqr_mont(res, res);
    ecp_nistz256_mul_mont(res, res, p8);

    ecp_nistz256_sqr_mont(res, res);
    ecp_nistz256_sqr_mont(res, res);
    ecp_nistz256_sqr_mont(res, res);
    ecp_nistz256_sqr_mont(res, res);
    ecp_nistz256_mul_mont(res, res, p4);

    ecp_nistz256_sqr_mont(res, res);
    ecp_nistz256_sqr_mont(res, res);
    ecp_nistz256_mul_mont(res, res, p2);

    ecp_nistz256_sqr_mont(res, res);
    ecp_nistz256_sqr_mont(res, res);
    ecp_nistz256_mul_mont(res, res, in);

    memcpy(r, res, sizeof(res));
}
  453. /*
  454. * ecp_nistz256_bignum_to_field_elem copies the contents of |in| to |out| and
  455. * returns one if it fits. Otherwise it returns zero.
  456. */
  457. static int ecp_nistz256_bignum_to_field_elem(BN_ULONG out[P256_LIMBS],
  458. const BIGNUM *in)
  459. {
  460. if (in->top > P256_LIMBS)
  461. return 0;
  462. memset(out, 0, sizeof(BN_ULONG) * P256_LIMBS);
  463. memcpy(out, in->d, sizeof(BN_ULONG) * in->top);
  464. return 1;
  465. }
  466. /* r = sum(scalar[i]*point[i]) */
  467. static int ecp_nistz256_windowed_mul(const EC_GROUP *group,
  468. P256_POINT *r,
  469. const BIGNUM **scalar,
  470. const EC_POINT **point,
  471. int num, BN_CTX *ctx)
  472. {
  473. int i, j, ret = 0;
  474. unsigned int index;
  475. unsigned char (*p_str)[33] = NULL;
  476. const unsigned int window_size = 5;
  477. const unsigned int mask = (1 << (window_size + 1)) - 1;
  478. unsigned int wvalue;
  479. BN_ULONG tmp[P256_LIMBS];
  480. ALIGN32 P256_POINT h;
  481. const BIGNUM **scalars = NULL;
  482. P256_POINT (*table)[16] = NULL;
  483. void *table_storage = NULL;
  484. if ((table_storage =
  485. OPENSSL_malloc(num * 16 * sizeof(P256_POINT) + 64)) == NULL
  486. || (p_str =
  487. OPENSSL_malloc(num * 33 * sizeof(unsigned char))) == NULL
  488. || (scalars = OPENSSL_malloc(num * sizeof(BIGNUM *))) == NULL) {
  489. ECerr(EC_F_ECP_NISTZ256_WINDOWED_MUL, ERR_R_MALLOC_FAILURE);
  490. goto err;
  491. } else {
  492. table = (void *)ALIGNPTR(table_storage, 64);
  493. }
  494. for (i = 0; i < num; i++) {
  495. P256_POINT *row = table[i];
  496. /* This is an unusual input, we don't guarantee constant-timeness. */
  497. if ((BN_num_bits(scalar[i]) > 256) || BN_is_negative(scalar[i])) {
  498. BIGNUM *mod;
  499. if ((mod = BN_CTX_get(ctx)) == NULL)
  500. goto err;
  501. if (!BN_nnmod(mod, scalar[i], &group->order, ctx)) {
  502. ECerr(EC_F_ECP_NISTZ256_WINDOWED_MUL, ERR_R_BN_LIB);
  503. goto err;
  504. }
  505. scalars[i] = mod;
  506. } else
  507. scalars[i] = scalar[i];
  508. for (j = 0; j < scalars[i]->top * BN_BYTES; j += BN_BYTES) {
  509. BN_ULONG d = scalars[i]->d[j / BN_BYTES];
  510. p_str[i][j + 0] = d & 0xff;
  511. p_str[i][j + 1] = (d >> 8) & 0xff;
  512. p_str[i][j + 2] = (d >> 16) & 0xff;
  513. p_str[i][j + 3] = (d >>= 24) & 0xff;
  514. if (BN_BYTES == 8) {
  515. d >>= 8;
  516. p_str[i][j + 4] = d & 0xff;
  517. p_str[i][j + 5] = (d >> 8) & 0xff;
  518. p_str[i][j + 6] = (d >> 16) & 0xff;
  519. p_str[i][j + 7] = (d >> 24) & 0xff;
  520. }
  521. }
  522. for (; j < 33; j++)
  523. p_str[i][j] = 0;
  524. /* table[0] is implicitly (0,0,0) (the point at infinity),
  525. * therefore it is not stored. All other values are actually
  526. * stored with an offset of -1 in table.
  527. */
  528. if (!ecp_nistz256_bignum_to_field_elem(row[1 - 1].X, &point[i]->X)
  529. || !ecp_nistz256_bignum_to_field_elem(row[1 - 1].Y, &point[i]->Y)
  530. || !ecp_nistz256_bignum_to_field_elem(row[1 - 1].Z, &point[i]->Z)) {
  531. ECerr(EC_F_ECP_NISTZ256_WINDOWED_MUL, EC_R_COORDINATES_OUT_OF_RANGE);
  532. goto err;
  533. }
  534. ecp_nistz256_point_double(&row[ 2 - 1], &row[ 1 - 1]);
  535. ecp_nistz256_point_add (&row[ 3 - 1], &row[ 2 - 1], &row[1 - 1]);
  536. ecp_nistz256_point_double(&row[ 4 - 1], &row[ 2 - 1]);
  537. ecp_nistz256_point_double(&row[ 6 - 1], &row[ 3 - 1]);
  538. ecp_nistz256_point_double(&row[ 8 - 1], &row[ 4 - 1]);
  539. ecp_nistz256_point_double(&row[12 - 1], &row[ 6 - 1]);
  540. ecp_nistz256_point_add (&row[ 5 - 1], &row[ 4 - 1], &row[1 - 1]);
  541. ecp_nistz256_point_add (&row[ 7 - 1], &row[ 6 - 1], &row[1 - 1]);
  542. ecp_nistz256_point_add (&row[ 9 - 1], &row[ 8 - 1], &row[1 - 1]);
  543. ecp_nistz256_point_add (&row[13 - 1], &row[12 - 1], &row[1 - 1]);
  544. ecp_nistz256_point_double(&row[14 - 1], &row[ 7 - 1]);
  545. ecp_nistz256_point_double(&row[10 - 1], &row[ 5 - 1]);
  546. ecp_nistz256_point_add (&row[15 - 1], &row[14 - 1], &row[1 - 1]);
  547. ecp_nistz256_point_add (&row[11 - 1], &row[10 - 1], &row[1 - 1]);
  548. ecp_nistz256_point_add (&row[16 - 1], &row[15 - 1], &row[1 - 1]);
  549. }
  550. index = 255;
  551. wvalue = p_str[0][(index - 1) / 8];
  552. wvalue = (wvalue >> ((index - 1) % 8)) & mask;
  553. ecp_nistz256_select_w5(r, table[0], _booth_recode_w5(wvalue) >> 1);
  554. while (index >= 5) {
  555. for (i = (index == 255 ? 1 : 0); i < num; i++) {
  556. unsigned int off = (index - 1) / 8;
  557. wvalue = p_str[i][off] | p_str[i][off + 1] << 8;
  558. wvalue = (wvalue >> ((index - 1) % 8)) & mask;
  559. wvalue = _booth_recode_w5(wvalue);
  560. ecp_nistz256_select_w5(&h, table[i], wvalue >> 1);
  561. ecp_nistz256_neg(tmp, h.Y);
  562. copy_conditional(h.Y, tmp, (wvalue & 1));
  563. ecp_nistz256_point_add(r, r, &h);
  564. }
  565. index -= window_size;
  566. ecp_nistz256_point_double(r, r);
  567. ecp_nistz256_point_double(r, r);
  568. ecp_nistz256_point_double(r, r);
  569. ecp_nistz256_point_double(r, r);
  570. ecp_nistz256_point_double(r, r);
  571. }
  572. /* Final window */
  573. for (i = 0; i < num; i++) {
  574. wvalue = p_str[i][0];
  575. wvalue = (wvalue << 1) & mask;
  576. wvalue = _booth_recode_w5(wvalue);
  577. ecp_nistz256_select_w5(&h, table[i], wvalue >> 1);
  578. ecp_nistz256_neg(tmp, h.Y);
  579. copy_conditional(h.Y, tmp, wvalue & 1);
  580. ecp_nistz256_point_add(r, r, &h);
  581. }
  582. ret = 1;
  583. err:
  584. if (table_storage)
  585. OPENSSL_free(table_storage);
  586. if (p_str)
  587. OPENSSL_free(p_str);
  588. if (scalars)
  589. OPENSSL_free(scalars);
  590. return ret;
  591. }
  592. /* Coordinates of G, for which we have precomputed tables */
  593. const static BN_ULONG def_xG[P256_LIMBS] = {
  594. TOBN(0x79e730d4, 0x18a9143c), TOBN(0x75ba95fc, 0x5fedb601),
  595. TOBN(0x79fb732b, 0x77622510), TOBN(0x18905f76, 0xa53755c6)
  596. };
  597. const static BN_ULONG def_yG[P256_LIMBS] = {
  598. TOBN(0xddf25357, 0xce95560a), TOBN(0x8b4ab8e4, 0xba19e45c),
  599. TOBN(0xd2e88688, 0xdd21f325), TOBN(0x8571ff18, 0x25885d85)
  600. };
  601. /*
  602. * ecp_nistz256_is_affine_G returns one if |generator| is the standard, P-256
  603. * generator.
  604. */
  605. static int ecp_nistz256_is_affine_G(const EC_POINT *generator)
  606. {
  607. return (generator->X.top == P256_LIMBS) &&
  608. (generator->Y.top == P256_LIMBS) &&
  609. (generator->Z.top == (P256_LIMBS - P256_LIMBS / 8)) &&
  610. is_equal(generator->X.d, def_xG) &&
  611. is_equal(generator->Y.d, def_yG) && is_one(generator->Z.d);
  612. }
/*
 * Precompute the 37 x 64 table of generator multiples used by the w=7
 * fixed-base path and attach it to |group| as extra data.  If |group|'s
 * generator is the standard P-256 G, the statically compiled tables are
 * used instead and nothing is computed.  Returns 1 on success, 0 on error.
 */
static int ecp_nistz256_mult_precompute(EC_GROUP *group, BN_CTX *ctx)
{
    /*
     * We precompute a table for a Booth encoded exponent (wNAF) based
     * computation. Each table holds 64 values for safe access, with an
     * implicit value of infinity at index zero. We use window of size 7, and
     * therefore require ceil(256/7) = 37 tables.
     */
    BIGNUM *order;
    EC_POINT *P = NULL, *T = NULL;
    const EC_POINT *generator;
    EC_PRE_COMP *pre_comp;
    BN_CTX *new_ctx = NULL;
    int i, j, k, ret = 0;
    size_t w;

    PRECOMP256_ROW *preComputedTable = NULL;
    unsigned char *precomp_storage = NULL;

    /* if there is an old EC_PRE_COMP object, throw it away */
    EC_EX_DATA_free_data(&group->extra_data, ecp_nistz256_pre_comp_dup,
                         ecp_nistz256_pre_comp_free,
                         ecp_nistz256_pre_comp_clear_free);

    generator = EC_GROUP_get0_generator(group);
    if (generator == NULL) {
        ECerr(EC_F_ECP_NISTZ256_MULT_PRECOMPUTE, EC_R_UNDEFINED_GENERATOR);
        return 0;
    }

    if (ecp_nistz256_is_affine_G(generator)) {
        /*
         * No need to calculate tables for the standard generator because we
         * have them statically.
         */
        return 1;
    }

    if ((pre_comp = ecp_nistz256_pre_comp_new(group)) == NULL)
        return 0;

    if (ctx == NULL) {
        ctx = new_ctx = BN_CTX_new();
        if (ctx == NULL)
            goto err;
    }

    BN_CTX_start(ctx);
    order = BN_CTX_get(ctx);

    if (order == NULL)
        goto err;

    if (!EC_GROUP_get_order(group, order, ctx))
        goto err;

    if (BN_is_zero(order)) {
        ECerr(EC_F_ECP_NISTZ256_MULT_PRECOMPUTE, EC_R_UNKNOWN_ORDER);
        goto err;
    }

    w = 7;

    /* 37 windows of 64 affine points, plus slack for 64-byte alignment. */
    if ((precomp_storage =
         OPENSSL_malloc(37 * 64 * sizeof(P256_POINT_AFFINE) + 64)) == NULL) {
        ECerr(EC_F_ECP_NISTZ256_MULT_PRECOMPUTE, ERR_R_MALLOC_FAILURE);
        goto err;
    } else {
        preComputedTable = (void *)ALIGNPTR(precomp_storage, 64);
    }

    P = EC_POINT_new(group);
    T = EC_POINT_new(group);
    if (P == NULL || T == NULL)
        goto err;

    /*
     * The zero entry is implicitly infinity, and we skip it, storing other
     * values with -1 offset.
     */
    if (!EC_POINT_copy(T, generator))
        goto err;

    /* Entry [j][k] holds (k+1) * 2^(7*j) * G, computed column by column. */
    for (k = 0; k < 64; k++) {
        if (!EC_POINT_copy(P, T))
            goto err;
        for (j = 0; j < 37; j++) {
            /*
             * It would be faster to use EC_POINTs_make_affine and
             * make multiple points affine at the same time.
             */
            if (!EC_POINT_make_affine(group, P, ctx))
                goto err;
            if (!ecp_nistz256_bignum_to_field_elem(preComputedTable[j][k].X,
                                                   &P->X) ||
                !ecp_nistz256_bignum_to_field_elem(preComputedTable[j][k].Y,
                                                   &P->Y)) {
                ECerr(EC_F_ECP_NISTZ256_MULT_PRECOMPUTE,
                      EC_R_COORDINATES_OUT_OF_RANGE);
                goto err;
            }
            /* Advance P by a factor of 2^7 for the next window. */
            for (i = 0; i < 7; i++) {
                if (!EC_POINT_dbl(group, P, P, ctx))
                    goto err;
            }
        }
        if (!EC_POINT_add(group, T, T, generator, ctx))
            goto err;
    }

    pre_comp->group = group;
    pre_comp->w = w;
    pre_comp->precomp = preComputedTable;
    pre_comp->precomp_storage = precomp_storage;

    /* Ownership of the storage has passed to pre_comp. */
    precomp_storage = NULL;

    if (!EC_EX_DATA_set_data(&group->extra_data, pre_comp,
                             ecp_nistz256_pre_comp_dup,
                             ecp_nistz256_pre_comp_free,
                             ecp_nistz256_pre_comp_clear_free)) {
        goto err;
    }

    /* Ownership of pre_comp has passed to the group's extra data. */
    pre_comp = NULL;

    ret = 1;

 err:
    if (ctx != NULL)
        BN_CTX_end(ctx);
    BN_CTX_free(new_ctx);

    if (pre_comp)
        ecp_nistz256_pre_comp_free(pre_comp);
    if (precomp_storage)
        OPENSSL_free(precomp_storage);
    if (P)
        EC_POINT_free(P);
    if (T)
        EC_POINT_free(T);
    return ret;
}
  734. /*
  735. * Note that by default ECP_NISTZ256_AVX2 is undefined. While it's great
  736. * code processing 4 points in parallel, corresponding serial operation
  737. * is several times slower, because it uses 29x29=58-bit multiplication
 * as opposed to 64x64=128-bit in the integer-only scalar case. As a result
  739. * it doesn't provide *significant* performance improvement. Note that
  740. * just defining ECP_NISTZ256_AVX2 is not sufficient to make it work,
  741. * you'd need to compile even asm/ecp_nistz256-avx.pl module.
  742. */
  743. #if defined(ECP_NISTZ256_AVX2)
  744. # if !(defined(__x86_64) || defined(__x86_64__)) || \
  745. defined(_M_AMD64) || defined(_MX64)) || \
  746. !(defined(__GNUC__) || defined(_MSC_VER)) /* this is for ALIGN32 */
  747. # undef ECP_NISTZ256_AVX2
  748. # else
  749. /* Constant time access, loading four values, from four consecutive tables */
  750. void ecp_nistz256_avx2_select_w7(P256_POINT_AFFINE * val,
  751. const P256_POINT_AFFINE * in_t, int index);
  752. void ecp_nistz256_avx2_multi_select_w7(void *result, const void *in, int index0,
  753. int index1, int index2, int index3);
  754. void ecp_nistz256_avx2_transpose_convert(void *RESULTx4, const void *in);
  755. void ecp_nistz256_avx2_convert_transpose_back(void *result, const void *Ax4);
  756. void ecp_nistz256_avx2_point_add_affine_x4(void *RESULTx4, const void *Ax4,
  757. const void *Bx4);
  758. void ecp_nistz256_avx2_point_add_affines_x4(void *RESULTx4, const void *Ax4,
  759. const void *Bx4);
  760. void ecp_nistz256_avx2_to_mont(void *RESULTx4, const void *Ax4);
  761. void ecp_nistz256_avx2_from_mont(void *RESULTx4, const void *Ax4);
  762. void ecp_nistz256_avx2_set1(void *RESULTx4);
  763. int ecp_nistz_avx2_eligible(void);
  764. static void booth_recode_w7(unsigned char *sign,
  765. unsigned char *digit, unsigned char in)
  766. {
  767. unsigned char s, d;
  768. s = ~((in >> 7) - 1);
  769. d = (1 << 8) - in - 1;
  770. d = (d & s) | (in & ~s);
  771. d = (d >> 1) + (d & 1);
  772. *sign = s & 1;
  773. *digit = d;
  774. }
/*
 * ecp_nistz256_avx2_mul_g performs multiplication by G, using only the
 * precomputed table. It does 4 affine point additions in parallel,
 * significantly speeding up point multiplication for a fixed value.
 *
 * r:      receives the result as a Jacobian P256_POINT.
 * p_str:  33-byte little-endian encoding of the scalar (see
 *         ecp_nistz256_points_mul for how it is built).
 * preComputedTable: 37 rows of 64 affine points; successive rows are
 *         shifted by 7 bits of the scalar.
 *
 * The 37 windows are processed as 9 groups of 4 (handled by the x4 AVX2
 * primitives) plus one final window handled serially, then the four
 * parallel accumulators are summed.
 */
static void ecp_nistz256_avx2_mul_g(P256_POINT *r,
                                    unsigned char p_str[33],
                                    const P256_POINT_AFFINE(*preComputedTable)[64])
{
    const unsigned int window_size = 7;
    const unsigned int mask = (1 << (window_size + 1)) - 1;
    unsigned int wvalue;
    /* Using 4 windows at a time */
    unsigned char sign0, digit0;
    unsigned char sign1, digit1;
    unsigned char sign2, digit2;
    unsigned char sign3, digit3;
    unsigned int index = 0;     /* bit position of the next window */
    BN_ULONG tmp[P256_LIMBS];   /* scratch for conditional Y negation */
    int i;
    /* 4 lanes x 9 limbs x {X,Y,Z} accumulator and {X,Y} addend, in the
     * redundant 29-bit representation used by the AVX2 code. */
    ALIGN32 BN_ULONG aX4[4 * 9 * 3] = { 0 };
    ALIGN32 BN_ULONG bX4[4 * 9 * 2] = { 0 };
    /* NOTE(review): sized P256_LIMBS, which must equal the 4 lanes the
     * x4 primitives expect -- true for 64-bit builds; confirm for others. */
    ALIGN32 P256_POINT_AFFINE point_arr[P256_LIMBS];
    ALIGN32 P256_POINT res_point_arr[P256_LIMBS];

    /*
     * Initial four windows.  Each window value is read as a 16-bit load
     * from p_str; NOTE(review): the u16 cast assumes unaligned little-
     * endian loads are acceptable, which holds on the x86_64-only path.
     */
    wvalue = *((u16 *) & p_str[0]);
    wvalue = (wvalue << 1) & mask;      /* first window has an implicit 0 bit */
    index += window_size;
    booth_recode_w7(&sign0, &digit0, wvalue);
    wvalue = *((u16 *) & p_str[(index - 1) / 8]);
    wvalue = (wvalue >> ((index - 1) % 8)) & mask;
    index += window_size;
    booth_recode_w7(&sign1, &digit1, wvalue);
    wvalue = *((u16 *) & p_str[(index - 1) / 8]);
    wvalue = (wvalue >> ((index - 1) % 8)) & mask;
    index += window_size;
    booth_recode_w7(&sign2, &digit2, wvalue);
    wvalue = *((u16 *) & p_str[(index - 1) / 8]);
    wvalue = (wvalue >> ((index - 1) % 8)) & mask;
    index += window_size;
    booth_recode_w7(&sign3, &digit3, wvalue);

    /* Constant-time lookup of four table entries, then conditional
     * negation of Y according to each window's Booth sign. */
    ecp_nistz256_avx2_multi_select_w7(point_arr, preComputedTable[0],
                                      digit0, digit1, digit2, digit3);
    ecp_nistz256_neg(tmp, point_arr[0].Y);
    copy_conditional(point_arr[0].Y, tmp, sign0);
    ecp_nistz256_neg(tmp, point_arr[1].Y);
    copy_conditional(point_arr[1].Y, tmp, sign1);
    ecp_nistz256_neg(tmp, point_arr[2].Y);
    copy_conditional(point_arr[2].Y, tmp, sign2);
    ecp_nistz256_neg(tmp, point_arr[3].Y);
    copy_conditional(point_arr[3].Y, tmp, sign3);

    /* Convert to the AVX2 layout/Montgomery form and set all Z lanes to 1. */
    ecp_nistz256_avx2_transpose_convert(aX4, point_arr);
    ecp_nistz256_avx2_to_mont(aX4, aX4);
    ecp_nistz256_avx2_to_mont(&aX4[4 * 9], &aX4[4 * 9]);
    ecp_nistz256_avx2_set1(&aX4[4 * 9 * 2]);

    /* Second group of four windows, accumulated into bX4. */
    wvalue = *((u16 *) & p_str[(index - 1) / 8]);
    wvalue = (wvalue >> ((index - 1) % 8)) & mask;
    index += window_size;
    booth_recode_w7(&sign0, &digit0, wvalue);
    wvalue = *((u16 *) & p_str[(index - 1) / 8]);
    wvalue = (wvalue >> ((index - 1) % 8)) & mask;
    index += window_size;
    booth_recode_w7(&sign1, &digit1, wvalue);
    wvalue = *((u16 *) & p_str[(index - 1) / 8]);
    wvalue = (wvalue >> ((index - 1) % 8)) & mask;
    index += window_size;
    booth_recode_w7(&sign2, &digit2, wvalue);
    wvalue = *((u16 *) & p_str[(index - 1) / 8]);
    wvalue = (wvalue >> ((index - 1) % 8)) & mask;
    index += window_size;
    booth_recode_w7(&sign3, &digit3, wvalue);
    ecp_nistz256_avx2_multi_select_w7(point_arr, preComputedTable[4 * 1],
                                      digit0, digit1, digit2, digit3);
    ecp_nistz256_neg(tmp, point_arr[0].Y);
    copy_conditional(point_arr[0].Y, tmp, sign0);
    ecp_nistz256_neg(tmp, point_arr[1].Y);
    copy_conditional(point_arr[1].Y, tmp, sign1);
    ecp_nistz256_neg(tmp, point_arr[2].Y);
    copy_conditional(point_arr[2].Y, tmp, sign2);
    ecp_nistz256_neg(tmp, point_arr[3].Y);
    copy_conditional(point_arr[3].Y, tmp, sign3);
    ecp_nistz256_avx2_transpose_convert(bX4, point_arr);
    ecp_nistz256_avx2_to_mont(bX4, bX4);
    ecp_nistz256_avx2_to_mont(&bX4[4 * 9], &bX4[4 * 9]);
    /* Optimized when both inputs are affine */
    ecp_nistz256_avx2_point_add_affines_x4(aX4, aX4, bX4);

    /* Groups 3..9: affine addend added into the Jacobian accumulator. */
    for (i = 2; i < 9; i++) {
        wvalue = *((u16 *) & p_str[(index - 1) / 8]);
        wvalue = (wvalue >> ((index - 1) % 8)) & mask;
        index += window_size;
        booth_recode_w7(&sign0, &digit0, wvalue);
        wvalue = *((u16 *) & p_str[(index - 1) / 8]);
        wvalue = (wvalue >> ((index - 1) % 8)) & mask;
        index += window_size;
        booth_recode_w7(&sign1, &digit1, wvalue);
        wvalue = *((u16 *) & p_str[(index - 1) / 8]);
        wvalue = (wvalue >> ((index - 1) % 8)) & mask;
        index += window_size;
        booth_recode_w7(&sign2, &digit2, wvalue);
        wvalue = *((u16 *) & p_str[(index - 1) / 8]);
        wvalue = (wvalue >> ((index - 1) % 8)) & mask;
        index += window_size;
        booth_recode_w7(&sign3, &digit3, wvalue);
        ecp_nistz256_avx2_multi_select_w7(point_arr,
                                          preComputedTable[4 * i],
                                          digit0, digit1, digit2, digit3);
        ecp_nistz256_neg(tmp, point_arr[0].Y);
        copy_conditional(point_arr[0].Y, tmp, sign0);
        ecp_nistz256_neg(tmp, point_arr[1].Y);
        copy_conditional(point_arr[1].Y, tmp, sign1);
        ecp_nistz256_neg(tmp, point_arr[2].Y);
        copy_conditional(point_arr[2].Y, tmp, sign2);
        ecp_nistz256_neg(tmp, point_arr[3].Y);
        copy_conditional(point_arr[3].Y, tmp, sign3);
        ecp_nistz256_avx2_transpose_convert(bX4, point_arr);
        ecp_nistz256_avx2_to_mont(bX4, bX4);
        ecp_nistz256_avx2_to_mont(&bX4[4 * 9], &bX4[4 * 9]);
        ecp_nistz256_avx2_point_add_affine_x4(aX4, aX4, bX4);
    }

    /* Leave the Montgomery domain and de-interleave the four lanes. */
    ecp_nistz256_avx2_from_mont(&aX4[4 * 9 * 0], &aX4[4 * 9 * 0]);
    ecp_nistz256_avx2_from_mont(&aX4[4 * 9 * 1], &aX4[4 * 9 * 1]);
    ecp_nistz256_avx2_from_mont(&aX4[4 * 9 * 2], &aX4[4 * 9 * 2]);
    ecp_nistz256_avx2_convert_transpose_back(res_point_arr, aX4);

    /* Last window is performed serially */
    wvalue = *((u16 *) & p_str[(index - 1) / 8]);
    wvalue = (wvalue >> ((index - 1) % 8)) & mask;
    booth_recode_w7(&sign0, &digit0, wvalue);
    ecp_nistz256_avx2_select_w7((P256_POINT_AFFINE *) r,
                                preComputedTable[36], digit0);
    ecp_nistz256_neg(tmp, r->Y);
    copy_conditional(r->Y, tmp, sign0);
    memcpy(r->Z, ONE, sizeof(ONE));     /* promote affine r to Jacobian */

    /* Sum the four windows */
    ecp_nistz256_point_add(r, r, &res_point_arr[0]);
    ecp_nistz256_point_add(r, r, &res_point_arr[1]);
    ecp_nistz256_point_add(r, r, &res_point_arr[2]);
    ecp_nistz256_point_add(r, r, &res_point_arr[3]);
}
  913. # endif
  914. #endif
  915. static int ecp_nistz256_set_from_affine(EC_POINT *out, const EC_GROUP *group,
  916. const P256_POINT_AFFINE *in,
  917. BN_CTX *ctx)
  918. {
  919. BIGNUM x, y;
  920. BN_ULONG d_x[P256_LIMBS], d_y[P256_LIMBS];
  921. int ret = 0;
  922. memcpy(d_x, in->X, sizeof(d_x));
  923. x.d = d_x;
  924. x.dmax = x.top = P256_LIMBS;
  925. x.neg = 0;
  926. x.flags = BN_FLG_STATIC_DATA;
  927. memcpy(d_y, in->Y, sizeof(d_y));
  928. y.d = d_y;
  929. y.dmax = y.top = P256_LIMBS;
  930. y.neg = 0;
  931. y.flags = BN_FLG_STATIC_DATA;
  932. ret = EC_POINT_set_affine_coordinates_GFp(group, out, &x, &y, ctx);
  933. return ret;
  934. }
/* r = scalar*G + sum(scalars[i]*points[i]) */
/*
 * Top-level point-multiplication entry for the nistz256 EC_METHOD.
 *
 * The generator part (scalar*G) is served from a precomputed table when one
 * is available -- either the table attached to the group by
 * ecp_nistz256_mult_precompute, or the hard-coded table for the default
 * generator -- using a constant-time fixed-window walk.  Any remaining
 * points (and the generator itself, when no table matches) go through
 * ecp_nistz256_windowed_mul.
 *
 * Returns 1 on success, 0 on error (raising ECerr).
 */
static int ecp_nistz256_points_mul(const EC_GROUP *group,
                                   EC_POINT *r,
                                   const BIGNUM *scalar,
                                   size_t num,
                                   const EC_POINT *points[],
                                   const BIGNUM *scalars[], BN_CTX *ctx)
{
    int i = 0, ret = 0, no_precomp_for_generator = 0, p_is_infinity = 0;
    size_t j;
    unsigned char p_str[33] = { 0 };    /* scalar, little-endian + pad byte */
    const PRECOMP256_ROW *preComputedTable = NULL;
    const EC_PRE_COMP *pre_comp = NULL;
    const EC_POINT *generator = NULL;
    unsigned int index = 0;             /* bit position of the next window */
    BN_CTX *new_ctx = NULL;
    const BIGNUM **new_scalars = NULL;
    const EC_POINT **new_points = NULL;
    const unsigned int window_size = 7;
    const unsigned int mask = (1 << (window_size + 1)) - 1;
    unsigned int wvalue;
    /* p accumulates the result, t is scratch; the union lets the same
     * storage be viewed as affine (table entry) or Jacobian. */
    ALIGN32 union {
        P256_POINT p;
        P256_POINT_AFFINE a;
    } t, p;
    BIGNUM *tmp_scalar;

    if (group->meth != r->meth) {
        ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, EC_R_INCOMPATIBLE_OBJECTS);
        return 0;
    }

    /* Nothing to multiply: the result is the point at infinity. */
    if ((scalar == NULL) && (num == 0))
        return EC_POINT_set_to_infinity(group, r);

    for (j = 0; j < num; j++) {
        if (group->meth != points[j]->meth) {
            ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, EC_R_INCOMPATIBLE_OBJECTS);
            return 0;
        }
    }

    if (ctx == NULL) {
        ctx = new_ctx = BN_CTX_new();
        if (ctx == NULL)
            goto err;
    }
    BN_CTX_start(ctx);

    if (scalar) {
        generator = EC_GROUP_get0_generator(group);
        if (generator == NULL) {
            ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, EC_R_UNDEFINED_GENERATOR);
            goto err;
        }

        /* look if we can use precomputed multiples of generator */
        pre_comp =
            EC_EX_DATA_get_data(group->extra_data, ecp_nistz256_pre_comp_dup,
                                ecp_nistz256_pre_comp_free,
                                ecp_nistz256_pre_comp_clear_free);

        if (pre_comp) {
            /*
             * If there is a precomputed table for the generator, check that
             * it was generated with the same generator.
             */
            EC_POINT *pre_comp_generator = EC_POINT_new(group);
            if (pre_comp_generator == NULL)
                goto err;

            /* Row 0, entry 0 of the table is 1*G. */
            if (!ecp_nistz256_set_from_affine
                (pre_comp_generator, group, pre_comp->precomp[0], ctx)) {
                EC_POINT_free(pre_comp_generator);
                goto err;
            }

            if (0 == EC_POINT_cmp(group, generator, pre_comp_generator, ctx))
                preComputedTable = (const PRECOMP256_ROW *)pre_comp->precomp;

            EC_POINT_free(pre_comp_generator);
        }

        if (preComputedTable == NULL && ecp_nistz256_is_affine_G(generator)) {
            /*
             * If there is no precomputed data, but the generator
             * is the default, a hardcoded table of precomputed
             * data is used. This is because applications, such as
             * Apache, do not use EC_KEY_precompute_mult.
             */
            preComputedTable = (const PRECOMP256_ROW *)ecp_nistz256_precomputed;
        }

        if (preComputedTable) {
            /* Reduce out-of-range scalars so p_str fits in 33 bytes. */
            if ((BN_num_bits(scalar) > 256)
                || BN_is_negative(scalar)) {
                if ((tmp_scalar = BN_CTX_get(ctx)) == NULL)
                    goto err;

                if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) {
                    ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, ERR_R_BN_LIB);
                    goto err;
                }
                scalar = tmp_scalar;
            }

            /* Serialize the scalar words little-endian into p_str;
             * bytes beyond scalar->top stay zero. */
            for (i = 0; i < scalar->top * BN_BYTES; i += BN_BYTES) {
                BN_ULONG d = scalar->d[i / BN_BYTES];

                p_str[i + 0] = d & 0xff;
                p_str[i + 1] = (d >> 8) & 0xff;
                p_str[i + 2] = (d >> 16) & 0xff;
                p_str[i + 3] = (d >>= 24) & 0xff;
                if (BN_BYTES == 8) {
                    d >>= 8;
                    p_str[i + 4] = d & 0xff;
                    p_str[i + 5] = (d >> 8) & 0xff;
                    p_str[i + 6] = (d >> 16) & 0xff;
                    p_str[i + 7] = (d >> 24) & 0xff;
                }
            }

            for (; i < 33; i++)
                p_str[i] = 0;

#if defined(ECP_NISTZ256_AVX2)
            if (ecp_nistz_avx2_eligible()) {
                ecp_nistz256_avx2_mul_g(&p.p, p_str, preComputedTable);
            } else
#endif
            {
                /* First window */
                wvalue = (p_str[0] << 1) & mask;
                index += window_size;

                wvalue = _booth_recode_w7(wvalue);

                /* Constant-time table lookup by digit (wvalue >> 1). */
                ecp_nistz256_select_w7(&p.a, preComputedTable[0], wvalue >> 1);

                /* p.p.Z doubles as scratch for -Y until Z is set to 1. */
                ecp_nistz256_neg(p.p.Z, p.p.Y);
                copy_conditional(p.p.Y, p.p.Z, wvalue & 1);

                memcpy(p.p.Z, ONE, sizeof(ONE));

                /* Remaining 36 windows, 7 bits apart. */
                for (i = 1; i < 37; i++) {
                    unsigned int off = (index - 1) / 8;
                    wvalue = p_str[off] | p_str[off + 1] << 8;
                    wvalue = (wvalue >> ((index - 1) % 8)) & mask;
                    index += window_size;

                    wvalue = _booth_recode_w7(wvalue);

                    ecp_nistz256_select_w7(&t.a,
                                           preComputedTable[i], wvalue >> 1);

                    ecp_nistz256_neg(t.p.Z, t.a.Y);
                    copy_conditional(t.a.Y, t.p.Z, wvalue & 1);

                    ecp_nistz256_point_add_affine(&p.p, &p.p, &t.a);
                }
            }
        } else {
            p_is_infinity = 1;
            no_precomp_for_generator = 1;
        }
    } else
        p_is_infinity = 1;

    if (no_precomp_for_generator) {
        /*
         * Without a precomputed table for the generator, it has to be
         * handled like a normal point.
         */
        new_scalars = OPENSSL_malloc((num + 1) * sizeof(BIGNUM *));
        if (!new_scalars) {
            ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, ERR_R_MALLOC_FAILURE);
            goto err;
        }

        new_points = OPENSSL_malloc((num + 1) * sizeof(EC_POINT *));
        if (!new_points) {
            ECerr(EC_F_ECP_NISTZ256_POINTS_MUL, ERR_R_MALLOC_FAILURE);
            goto err;
        }

        memcpy(new_scalars, scalars, num * sizeof(BIGNUM *));
        new_scalars[num] = scalar;
        memcpy(new_points, points, num * sizeof(EC_POINT *));
        new_points[num] = generator;

        scalars = new_scalars;
        points = new_points;
        num++;
    }

    if (num) {
        P256_POINT *out = &t.p;
        /* Write directly into p when the generator part contributed
         * nothing; otherwise compute into t and add. */
        if (p_is_infinity)
            out = &p.p;

        if (!ecp_nistz256_windowed_mul(group, out, scalars, points, num, ctx))
            goto err;

        if (!p_is_infinity)
            ecp_nistz256_point_add(&p.p, &p.p, out);
    }

    /* Not constant-time, but we're only operating on the public output. */
    if (!ecp_nistz256_set_words(&r->X, p.p.X) ||
        !ecp_nistz256_set_words(&r->Y, p.p.Y) ||
        !ecp_nistz256_set_words(&r->Z, p.p.Z)) {
        goto err;
    }
    r->Z_is_one = is_one(p.p.Z) & 1;

    ret = 1;

 err:
    if (ctx)
        BN_CTX_end(ctx);
    BN_CTX_free(new_ctx);
    if (new_points)
        OPENSSL_free(new_points);
    if (new_scalars)
        OPENSSL_free(new_scalars);
    return ret;
}
  1126. static int ecp_nistz256_get_affine(const EC_GROUP *group,
  1127. const EC_POINT *point,
  1128. BIGNUM *x, BIGNUM *y, BN_CTX *ctx)
  1129. {
  1130. BN_ULONG z_inv2[P256_LIMBS];
  1131. BN_ULONG z_inv3[P256_LIMBS];
  1132. BN_ULONG x_aff[P256_LIMBS];
  1133. BN_ULONG y_aff[P256_LIMBS];
  1134. BN_ULONG point_x[P256_LIMBS], point_y[P256_LIMBS], point_z[P256_LIMBS];
  1135. BN_ULONG x_ret[P256_LIMBS], y_ret[P256_LIMBS];
  1136. if (EC_POINT_is_at_infinity(group, point)) {
  1137. ECerr(EC_F_ECP_NISTZ256_GET_AFFINE, EC_R_POINT_AT_INFINITY);
  1138. return 0;
  1139. }
  1140. if (!ecp_nistz256_bignum_to_field_elem(point_x, &point->X) ||
  1141. !ecp_nistz256_bignum_to_field_elem(point_y, &point->Y) ||
  1142. !ecp_nistz256_bignum_to_field_elem(point_z, &point->Z)) {
  1143. ECerr(EC_F_ECP_NISTZ256_GET_AFFINE, EC_R_COORDINATES_OUT_OF_RANGE);
  1144. return 0;
  1145. }
  1146. ecp_nistz256_mod_inverse(z_inv3, point_z);
  1147. ecp_nistz256_sqr_mont(z_inv2, z_inv3);
  1148. ecp_nistz256_mul_mont(x_aff, z_inv2, point_x);
  1149. if (x != NULL) {
  1150. ecp_nistz256_from_mont(x_ret, x_aff);
  1151. if (!ecp_nistz256_set_words(x, x_ret))
  1152. return 0;
  1153. }
  1154. if (y != NULL) {
  1155. ecp_nistz256_mul_mont(z_inv3, z_inv3, z_inv2);
  1156. ecp_nistz256_mul_mont(y_aff, z_inv3, point_y);
  1157. ecp_nistz256_from_mont(y_ret, y_aff);
  1158. if (!ecp_nistz256_set_words(y, y_ret))
  1159. return 0;
  1160. }
  1161. return 1;
  1162. }
  1163. static EC_PRE_COMP *ecp_nistz256_pre_comp_new(const EC_GROUP *group)
  1164. {
  1165. EC_PRE_COMP *ret = NULL;
  1166. if (!group)
  1167. return NULL;
  1168. ret = (EC_PRE_COMP *)OPENSSL_malloc(sizeof(EC_PRE_COMP));
  1169. if (!ret) {
  1170. ECerr(EC_F_ECP_NISTZ256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
  1171. return ret;
  1172. }
  1173. ret->group = group;
  1174. ret->w = 6; /* default */
  1175. ret->precomp = NULL;
  1176. ret->precomp_storage = NULL;
  1177. ret->references = 1;
  1178. return ret;
  1179. }
  1180. static void *ecp_nistz256_pre_comp_dup(void *src_)
  1181. {
  1182. EC_PRE_COMP *src = src_;
  1183. /* no need to actually copy, these objects never change! */
  1184. CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
  1185. return src_;
  1186. }
  1187. static void ecp_nistz256_pre_comp_free(void *pre_)
  1188. {
  1189. int i;
  1190. EC_PRE_COMP *pre = pre_;
  1191. if (!pre)
  1192. return;
  1193. i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
  1194. if (i > 0)
  1195. return;
  1196. if (pre->precomp_storage)
  1197. OPENSSL_free(pre->precomp_storage);
  1198. OPENSSL_free(pre);
  1199. }
  1200. static void ecp_nistz256_pre_comp_clear_free(void *pre_)
  1201. {
  1202. int i;
  1203. EC_PRE_COMP *pre = pre_;
  1204. if (!pre)
  1205. return;
  1206. i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
  1207. if (i > 0)
  1208. return;
  1209. if (pre->precomp_storage) {
  1210. OPENSSL_cleanse(pre->precomp,
  1211. 32 * sizeof(unsigned char) * (1 << pre->w) * 2 * 37);
  1212. OPENSSL_free(pre->precomp_storage);
  1213. }
  1214. OPENSSL_cleanse(pre, sizeof *pre);
  1215. OPENSSL_free(pre);
  1216. }
  1217. static int ecp_nistz256_window_have_precompute_mult(const EC_GROUP *group)
  1218. {
  1219. /* There is a hard-coded table for the default generator. */
  1220. const EC_POINT *generator = EC_GROUP_get0_generator(group);
  1221. if (generator != NULL && ecp_nistz256_is_affine_G(generator)) {
  1222. /* There is a hard-coded table for the default generator. */
  1223. return 1;
  1224. }
  1225. return EC_EX_DATA_get_data(group->extra_data, ecp_nistz256_pre_comp_dup,
  1226. ecp_nistz256_pre_comp_free,
  1227. ecp_nistz256_pre_comp_clear_free) != NULL;
  1228. }
/*
 * Return the EC_METHOD for prime-field curves backed by the nistz256
 * implementation.  Point multiplication, precomputation and affine
 * conversion use the specialized constant-time routines in this file;
 * everything else is delegated to the generic Montgomery/simple code.
 * NOTE: the initializer is positional and must track the EC_METHOD
 * struct declaration exactly.
 */
const EC_METHOD *EC_GFp_nistz256_method(void)
{
    static const EC_METHOD ret = {
        EC_FLAGS_DEFAULT_OCT,
        NID_X9_62_prime_field,
        ec_GFp_mont_group_init,
        ec_GFp_mont_group_finish,
        ec_GFp_mont_group_clear_finish,
        ec_GFp_mont_group_copy,
        ec_GFp_mont_group_set_curve,
        ec_GFp_simple_group_get_curve,
        ec_GFp_simple_group_get_degree,
        ec_GFp_simple_group_check_discriminant,
        ec_GFp_simple_point_init,
        ec_GFp_simple_point_finish,
        ec_GFp_simple_point_clear_finish,
        ec_GFp_simple_point_copy,
        ec_GFp_simple_point_set_to_infinity,
        ec_GFp_simple_set_Jprojective_coordinates_GFp,
        ec_GFp_simple_get_Jprojective_coordinates_GFp,
        ec_GFp_simple_point_set_affine_coordinates,
        ecp_nistz256_get_affine,
        0, 0, 0,                /* compressed/oct/point2oct: default */
        ec_GFp_simple_add,
        ec_GFp_simple_dbl,
        ec_GFp_simple_invert,
        ec_GFp_simple_is_at_infinity,
        ec_GFp_simple_is_on_curve,
        ec_GFp_simple_cmp,
        ec_GFp_simple_make_affine,
        ec_GFp_simple_points_make_affine,
        ecp_nistz256_points_mul, /* mul */
        ecp_nistz256_mult_precompute, /* precompute_mult */
        ecp_nistz256_window_have_precompute_mult, /* have_precompute_mult */
        ec_GFp_mont_field_mul,
        ec_GFp_mont_field_sqr,
        0, /* field_div */
        ec_GFp_mont_field_encode,
        ec_GFp_mont_field_decode,
        ec_GFp_mont_field_set_to_one
    };

    return &ret;
}