402-avr32-string-ops.patch 23 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139
  1. Subject: [PATCH] AVR32-optimized string operations
  2. Add hand-optimized AVR32-specific string operations. Some of them
  3. need a bit more testing, though.
  4. ---
  5. libc/string/avr32/Makefile | 40 +++++++++++
  6. libc/string/avr32/bcopy.S | 15 ++++
  7. libc/string/avr32/bzero.S | 12 +++
  8. libc/string/avr32/memchr.S | 62 +++++++++++++++++
  9. libc/string/avr32/memcmp.S | 50 +++++++++++++
  10. libc/string/avr32/memcpy.S | 110 ++++++++++++++++++++++++++++++
  11. libc/string/avr32/memmove.S | 114 +++++++++++++++++++++++++++++++
  12. libc/string/avr32/memset.S | 60 ++++++++++++++++
  13. libc/string/avr32/strcat.S | 95 ++++++++++++++++++++++++++
  14. libc/string/avr32/strcmp.S | 80 ++++++++++++++++++++++
  15. libc/string/avr32/strcpy.S | 63 +++++++++++++++++
  16. libc/string/avr32/stringtest.c | 144 ++++++++++++++++++++++++++++++++++++++++
  17. libc/string/avr32/strlen.S | 52 ++++++++++++++
  18. libc/string/avr32/strncpy.S | 77 +++++++++++++++++++++
  19. libc/string/avr32/test_memcpy.c | 66 ++++++++++++++++++
  20. 15 files changed, 1040 insertions(+)
  21. Index: uClibc-0.9.28-avr32/libc/string/avr32/bcopy.S
  22. ===================================================================
  23. --- /dev/null 1970-01-01 00:00:00.000000000 +0000
  24. +++ uClibc-0.9.28-avr32/libc/string/avr32/bcopy.S 2006-10-19 15:05:52.000000000 +0200
  25. @@ -0,0 +1,15 @@
  26. +/*
  27. + * Copyright (C) 2004 Atmel Norway
  28. + */
  29. +
  30. + .text
  31. + .global bcopy
  32. + .type bcopy, @function
  33. + .align 1
  34. +bcopy:
  35. + /* Exchange r12 and r11 in place with three XORs (no scratch register) */
  36. + eor r11, r12
  37. + eor r12, r11
  38. + eor r11, r12
  39. + rjmp __memmove /* tail-jump; r10 is passed through untouched */
  40. + .size bcopy, . - bcopy
  41. Index: uClibc-0.9.28-avr32/libc/string/avr32/bzero.S
  42. ===================================================================
  43. --- /dev/null 1970-01-01 00:00:00.000000000 +0000
  44. +++ uClibc-0.9.28-avr32/libc/string/avr32/bzero.S 2006-10-19 15:05:52.000000000 +0200
  45. @@ -0,0 +1,12 @@
  46. +/*
  47. + * Copyright (C) 2004 Atmel Norway
  48. + */
  49. +
  50. + .text
  51. + .global bzero
  52. + .type bzero, @function
  53. + .align 1
  54. +bzero: /* forwards to __memset with a zero fill value */
  55. + mov r10, r11 /* second argument moves to r10, which memset.S names n */
  56. + mov r11, 0 /* r11 is memset.S's c: fill with zero */
  57. + rjmp __memset /* tail-jump. NOTE(review): no .size directive follows, unlike bcopy.S */
  58. Index: uClibc-0.9.28-avr32/libc/string/avr32/Makefile
  59. ===================================================================
  60. --- /dev/null 1970-01-01 00:00:00.000000000 +0000
  61. +++ uClibc-0.9.28-avr32/libc/string/avr32/Makefile 2006-10-19 15:05:52.000000000 +0200
  62. @@ -0,0 +1,40 @@
  63. +# Makefile for uClibc
  64. +#
  65. +# Copyright (C) 2000-2003 Erik Andersen <[email protected]>
  66. +#
  67. +# This program is free software; you can redistribute it and/or modify it under
  68. +# the terms of the GNU Library General Public License as published by the Free
  69. +# Software Foundation; either version 2 of the License, or (at your option) any
  70. +# later version.
  71. +#
  72. +# This program is distributed in the hope that it will be useful, but WITHOUT
  73. +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  74. +# FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more
  75. +# details.
  76. +#
  77. +# You should have received a copy of the GNU Library General Public License
  78. +# along with this program; if not, write to the Free Software Foundation, Inc.,
  79. +# 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  80. +
  81. +TOPDIR=../../../
  82. +include $(TOPDIR)Rules.mak
  83. +
  84. +SSRC := bcopy.S bzero.S memcmp.S memcpy.S memmove.S
  85. +SSRC += memset.S strcmp.S strlen.S
  86. +# memchr.S, strcat.S, strcpy.S and strncpy.S are broken, so they are not built
  87. +SOBJS := $(patsubst %.S,%.o, $(SSRC))
  88. +OBJS := $(SOBJS)
  89. +
  90. +OBJ_LIST:= ../../obj.string.$(TARGET_ARCH)
  91. +
  92. +all: $(OBJ_LIST)
  93. +
  94. +$(OBJ_LIST): $(OBJS)
  95. + echo $(addprefix string/$(TARGET_ARCH)/, $(OBJS)) > $@
  96. +
  97. +$(SOBJS): %.o: %.S
  98. + $(CC) $(ASFLAGS) -c $< -o $@
  99. + $(STRIPTOOL) -x -R .note -R .comment $@
  100. +
  101. +clean:
  102. + $(RM) *.[oa] *~ core
  103. Index: uClibc-0.9.28-avr32/libc/string/avr32/memchr.S
  104. ===================================================================
  105. --- /dev/null 1970-01-01 00:00:00.000000000 +0000
  106. +++ uClibc-0.9.28-avr32/libc/string/avr32/memchr.S 2006-10-19 15:05:52.000000000 +0200
  107. @@ -0,0 +1,62 @@
  108. +/*
  109. + * Copyright (C) 2004 Atmel Norway
  110. + */
  111. +
  112. +#define str r12
  113. +#define chr r11
  114. +#define len r10
  115. +
  116. + .text
  117. + .global memchr
  118. + .type memchr, @function
  119. +memchr: /* r12 = str, r11 = chr, r10 = len (per the #defines above) */
  120. + or chr, chr, chr << 8 /* copy chr into the second byte lane */
  121. + or chr, chr, chr << 16 /* ...and into the upper two; assumes chr is 0..255 on entry -- TODO confirm */
  122. +
  123. + mov r9, str
  124. + andl r9, 3, COH /* r9 = str & 3 */
  125. + brne .Lunaligned_str
  126. +
  127. +1: sub len, 4
  128. + brlt 2f /* fewer than four bytes left: finish bytewise */
  129. + ld.w r8, str++
  130. + psub.b r9, r8, r11 /* per-byte difference against the replicated chr */
  131. + tnbz r9 /* looks for a zero byte lane, i.e. a byte equal to chr */
  132. + brne 1b /* no lane matched: next word */
  133. +
  134. + sub str, 4 /* back to the first byte of the matching word */
  135. + bfextu r9, r8, 24, 8 /* bits 31..24 are tested first */
  136. + cp.b r9, r11
  137. + reteq str
  138. + sub str, -1
  139. + bfextu r9, r8, 16, 8
  140. + cp.b r9, r11
  141. + reteq str
  142. + sub str, -1
  143. + bfextu r9, r8, 8, 8
  144. + cp.b r9, r11
  145. + reteq str
  146. + sub str, -1
  147. + retal str /* the first three lanes did not match, so return the fourth */
  148. +
  149. +2: sub len, -4 /* undo the last subtraction: len = bytes remaining */
  150. + reteq 0
  151. +
  152. +3: ld.ub r8, str++
  153. + cp.w r8, 0 /* NOTE(review): compares with 0, not chr; the other paths use cp.b r8, r11 */
  154. + reteq str /* NOTE(review): str was already post-incremented, so this is one past the byte */
  155. + sub len, 1
  156. + brne 3b
  157. +
  158. + retal 0
  159. +
  160. +.Lunaligned_str:
  161. +1: sub len, 1
  162. + retlt 0
  163. + ld.ub r8, str++
  164. + cp.b r8, r11
  165. + reteq str /* NOTE(review): one past the match here too (post-increment above) */
  166. + sub r9, 1
  167. + brge 1b /* NOTE(review): iterates (str & 3) + 1 times, not 4 - (str & 3) */
  168. +
  169. + rjmp .Laligned_search /* NOTE(review): .Laligned_search is not defined anywhere in this file */
  170. Index: uClibc-0.9.28-avr32/libc/string/avr32/memcmp.S
  171. ===================================================================
  172. --- /dev/null 1970-01-01 00:00:00.000000000 +0000
  173. +++ uClibc-0.9.28-avr32/libc/string/avr32/memcmp.S 2006-10-20 10:42:09.000000000 +0200
  174. @@ -0,0 +1,50 @@
  175. +/*
  176. + * Copyright (C) 2004 Atmel Norway.
  177. + */
  178. +
  179. +#define s1 r12
  180. +#define s2 r11
  181. +#define len r10
  182. +
  183. + .text
  184. + .global memcmp
  185. + .type memcmp, @function
  186. + .align 1
  187. +memcmp: /* r12 = s1, r11 = s2, r10 = len; result in r12: <0, 0 or >0 */
  188. + sub len, 4
  189. + brlt .Lless_than_4
  190. +
  191. +1: ld.w r8, s1++ /* compare one word (four bytes) per iteration */
  192. + ld.w r9, s2++
  193. + cp.w r8, r9
  194. + brne .Lfound_word
  195. + sub len, 4
  196. + brge 1b
  197. +
  198. +.Lless_than_4:
  199. + sub len, -4 /* undo the last subtraction: len = bytes remaining */
  200. + reteq 0
  201. +
  202. +1: ld.ub r8, s1++
  203. + ld.ub r9, s2++
  204. + sub r8, r9 /* zero-extended bytes, so the sign of the difference is correct */
  205. + retne r8
  206. + sub len, 1
  207. + brgt 1b
  208. +
  209. + retal 0
  210. +
  211. +.Lfound_word:
  212. + /*
  213. + * The words differ. Loads are big-endian, so an unsigned word
  214. + * compare orders them exactly as memcmp() orders the bytes; the
  215. + * previous zero-extended psub.b difference was never negative.
  216. + */
  217. + cp.w r8, r9
  218. + retlo -1
  219. + retal 1
  220. +
  221. + .size memcmp, . - memcmp
  222. +
  223. + .weak bcmp
  224. + bcmp = memcmp
  225. Index: uClibc-0.9.28-avr32/libc/string/avr32/memcpy.S
  226. ===================================================================
  227. --- /dev/null 1970-01-01 00:00:00.000000000 +0000
  228. +++ uClibc-0.9.28-avr32/libc/string/avr32/memcpy.S 2006-10-19 15:05:52.000000000 +0200
  229. @@ -0,0 +1,110 @@
  230. +/*
  231. + * Copyright (C) 2004 Atmel Norway
  232. + */
  233. +
  234. +/* Don't use r12 as dst since we must return it unmodified */
  235. +#define dst r9
  236. +#define src r11
  237. +#define len r10
  238. +
  239. + .text
  240. + .global memcpy
  241. + .type memcpy, @function
  242. +
  243. + .global __memcpy
  244. + .hidden __memcpy
  245. + .type __memcpy, @function
  246. +memcpy: /* r12 = destination (returned unchanged), r11 = src, r10 = len */
  247. +__memcpy:
  248. + pref src[0]
  249. + mov dst, r12 /* work on a copy (r9) so r12 survives as the return value */
  250. +
  251. + /* If we have less than 32 bytes, don't do anything fancy */
  252. + cp.w len, 32
  253. + brge .Lmore_than_31
  254. +
  255. + sub len, 1
  256. + retlt r12 /* len was 0 (or negative): nothing to copy */
  257. +1: ld.ub r8, src++
  258. + st.b dst++, r8
  259. + sub len, 1
  260. + brge 1b
  261. + retal r12
  262. +
  263. +.Lmore_than_31:
  264. + pushm r0-r7, lr /* r0-r7 serve as the bulk-copy buffer below */
  265. +
  266. + /* Check alignment */
  267. + mov r8, src
  268. + andl r8, 31, COH
  269. + brne .Lunaligned_src
  270. + mov r8, dst
  271. + andl r8, 3, COH
  272. + brne .Lunaligned_dst
  273. +
  274. +.Laligned_copy:
  275. + sub len, 32
  276. + brlt .Lless_than_32
  277. +
  278. +1: /* Copy 32 bytes at a time */
  279. + ldm src, r0-r7
  280. + sub src, -32
  281. + stm dst, r0-r7
  282. + sub dst, -32
  283. + sub len, 32
  284. + brge 1b
  285. +
  286. +.Lless_than_32:
  287. + /* Copy 16 more bytes if possible */
  288. + sub len, -16 /* len = (bytes left) - 16 */
  289. + brlt .Lless_than_16
  290. + ldm src, r0-r3
  291. + sub src, -16
  292. + sub len, 16 /* keep len = (bytes left) - 16 after these 16 bytes */
  293. + stm dst, r0-r3
  294. + sub dst, -16
  295. +
  296. +.Lless_than_16:
  297. + /* Do the remaining as byte copies */
  298. + neg len /* len = 16 - (bytes left), in 1..16 */
  299. + add pc, pc, len << 2 /* skip the unneeded pairs; presumably each ld.ub/st.b pair is 4 bytes -- TODO confirm */
  300. + .rept 15
  301. + ld.ub r0, src++
  302. + st.b dst++, r0
  303. + .endr
  304. +
  305. + popm r0-r7, pc /* restore and return; r12 was never modified */
  306. +
  307. +.Lunaligned_src:
  308. + /* Make src cacheline-aligned. r8 = (src & 31) */
  309. + rsub r8, r8, 32 /* r8 = 32 - (src & 31): bytes up to the next 32-byte boundary */
  310. + sub len, r8
  311. +1: ld.ub r0, src++
  312. + st.b dst++, r0
  313. + sub r8, 1
  314. + brne 1b
  315. +
  316. + /* If dst is word-aligned, we're ready to go */
  317. + pref src[0]
  318. + mov r8, 3
  319. + tst dst, r8
  320. + breq .Laligned_copy
  321. +
  322. +.Lunaligned_dst:
  323. + /* src is aligned, but dst is not. Expect bad performance */
  324. + sub len, 4
  325. + brlt 2f
  326. +1: ld.w r0, src++
  327. + st.w dst++, r0 /* word store to an unaligned dst, as the comment above warns */
  328. + sub len, 4
  329. + brge 1b
  330. +
  331. +2: neg len /* len was (bytes left) - 4; now 4 - (bytes left), in 1..4 */
  332. + add pc, pc, len << 2 /* same computed skip as above, over at most three pairs */
  333. + .rept 3
  334. + ld.ub r0, src++
  335. + st.b dst++, r0
  336. + .endr
  337. +
  338. + popm r0-r7, pc
  339. + .size memcpy, . - memcpy
  340. Index: uClibc-0.9.28-avr32/libc/string/avr32/memmove.S
  341. ===================================================================
  342. --- /dev/null 1970-01-01 00:00:00.000000000 +0000
  343. +++ uClibc-0.9.28-avr32/libc/string/avr32/memmove.S 2006-10-19 15:05:52.000000000 +0200
  344. @@ -0,0 +1,114 @@
  345. +/*
  346. + * Copyright (C) 2004 Atmel Norway
  347. + */
  348. +
  349. +#define dst r12
  350. +#define src r11
  351. +#define len r10
  352. +
  353. + .text
  354. + .global memmove
  355. + .type memmove, @function
  356. +
  357. + .global __memmove
  358. + .hidden __memmove
  359. + .type __memmove, @function
  360. +memmove: /* r12 = dst, r11 = src, r10 = len */
  361. +__memmove:
  362. + cp.w src, dst
  363. + brge __memcpy /* src >= dst: go forwards via memcpy. NOTE(review): brge is a signed compare */
  364. +
  365. + add dst, len /* otherwise start one past the end of both buffers... */
  366. + add src, len /* ...and copy downwards with pre-decrement */
  367. + pref src[-1]
  368. +
  369. + /*
  370. + * The rest is basically the same as in memcpy.S except that
  371. + * the direction is reversed.
  372. + */
  373. + cp.w len, 32
  374. + brge .Lmore_than_31
  375. +
  376. + sub len, 1
  377. + retlt r12
  378. +1: ld.ub r8, --src
  379. + st.b --dst, r8
  380. + sub len, 1
  381. + brge 1b
  382. + retal r12 /* dst has been stepped back down by len, i.e. to its entry value */
  383. +
  384. +.Lmore_than_31:
  385. + pushm r0-r7, lr
  386. +
  387. + /* Check alignment */
  388. + mov r8, src
  389. + andl r8, 31, COH
  390. + brne .Lunaligned_src
  391. + mov r8, r12
  392. + andl r8, 3, COH
  393. + brne .Lunaligned_dst
  394. +
  395. +.Laligned_copy:
  396. + sub len, 32
  397. + brlt .Lless_than_32
  398. +
  399. +1: /* Copy 32 bytes at a time */
  400. + sub src, 32
  401. + ldm src, r0-r7
  402. + sub dst, 32
  403. + sub len, 32
  404. + stm dst, r0-r7 /* presumably leaves the flags from "sub len, 32" intact for the brge */
  405. + brge 1b
  406. +
  407. +.Lless_than_32:
  408. + /* Copy 16 more bytes if possible */
  409. + sub len, -16 /* len = (bytes left) - 16 */
  410. + brlt .Lless_than_16
  411. + sub src, 16
  412. + ldm src, r0-r3
  413. + sub dst, 16
  414. + sub len, 16
  415. + stm dst, r0-r3
  416. +
  417. +.Lless_than_16:
  418. + /* Do the remaining as byte copies */
  419. + sub len, -16 /* len = bytes left (0..15) */
  420. + breq 2f
  421. +1: ld.ub r0, --src
  422. + st.b --dst, r0
  423. + sub len, 1
  424. + brne 1b
  425. +
  426. +2: popm r0-r7, pc
  427. +
  428. +.Lunaligned_src:
  429. + /* Make src cacheline-aligned. r8 = (src & 31) */
  430. + sub len, r8 /* going downwards, (src & 31) bytes reach the boundary */
  431. +1: ld.ub r0, --src
  432. + st.b --dst, r0
  433. + sub r8, 1
  434. + brne 1b
  435. +
  436. + /* If dst is word-aligned, we're ready to go */
  437. + pref src[-4]
  438. + mov r8, 3
  439. + tst dst, r8
  440. + breq .Laligned_copy
  441. +
  442. +.Lunaligned_dst:
  443. + /* src is aligned, but dst is not. Expect bad performance */
  444. + sub len, 4
  445. + brlt 2f
  446. +1: ld.w r0, --src
  447. + st.w --dst, r0
  448. + sub len, 4
  449. + brge 1b
  450. +
  451. +2: neg len /* len was (bytes left) - 4; now 4 - (bytes left), in 1..4 */
  452. + add pc, pc, len << 2 /* computed skip; presumably each ld.ub/st.b pair is 4 bytes -- TODO confirm */
  453. + .rept 3
  454. + ld.ub r0, --src
  455. + st.b --dst, r0
  456. + .endr
  457. +
  458. + popm r0-r7, pc /* NOTE(review): no .size directive follows, unlike memcpy.S */
  459. Index: uClibc-0.9.28-avr32/libc/string/avr32/memset.S
  460. ===================================================================
  461. --- /dev/null 1970-01-01 00:00:00.000000000 +0000
  462. +++ uClibc-0.9.28-avr32/libc/string/avr32/memset.S 2006-10-20 10:42:15.000000000 +0200
  463. @@ -0,0 +1,60 @@
  464. +/*
  465. + * Copyright (C) 2004 Atmel Norway.
  466. + */
  467. +
  468. +#define s r12
  469. +#define c r11
  470. +#define n r10
  471. +
  472. + .text
  473. + .global memset
  474. + .type memset, @function
  475. +
  476. + .global __memset
  477. + .hidden __memset
  478. + .type __memset, @function
  479. +
  480. + .align 1
  481. +memset: /* r12 = s, r11 = c, r10 = n; the original s is returned */
  482. +__memset:
  483. + cp.w n, 32
  484. + mov r9, s /* remember the original s for the return value */
  485. + brge .Llarge_memset /* branches on the cp.w above */
  486. +
  487. + sub n, 1
  488. + retlt s /* n was 0 (or negative): s is still untouched */
  489. +1: st.b s++, c
  490. + sub n, 1
  491. + brge 1b
  492. +
  493. + retal r9
  494. +
  495. +.Llarge_memset:
  496. + mov r8, r11 /* r8 = fill value */
  497. + mov r11, 3 /* r11 now serves as the alignment mask */
  498. + bfins r8, r8, 8, 8 /* duplicate the low byte into bits 15..8 */
  499. + bfins r8, r8, 16, 16 /* duplicate the low halfword into bits 31..16 */
  500. + tst s, r11
  501. + breq 2f
  502. +
  503. +1: st.b s++, r8 /* single bytes until s is word-aligned */
  504. + sub n, 1
  505. + tst s, r11
  506. + brne 1b
  507. +
  508. +2: mov r11, r9 /* r11 = saved original s (returned at the end) */
  509. + mov r9, r8 /* r8 and r9 both hold the pattern for the st.d below */
  510. + sub n, 8
  511. +
  512. +3: st.d s++, r8 /* eight bytes per iteration */
  513. + sub n, 8
  514. + brge 3b
  515. +
  516. + /* If we are done, n == -8 and we'll skip all st.b insns below */
  517. + neg n
  518. + lsl n, 1 /* scale by two; presumably each st.b below is 2 bytes -- TODO confirm */
  519. + add pc, n
  520. + .rept 7
  521. + st.b s++, r8
  522. + .endr
  523. + retal r11 /* NOTE(review): no .size directive follows */
  524. Index: uClibc-0.9.28-avr32/libc/string/avr32/strcat.S
  525. ===================================================================
  526. --- /dev/null 1970-01-01 00:00:00.000000000 +0000
  527. +++ uClibc-0.9.28-avr32/libc/string/avr32/strcat.S 2006-10-19 15:05:52.000000000 +0200
  528. @@ -0,0 +1,95 @@
  529. +/*
  530. + * Copyright (C) 2004 Atmel Norway
  531. + */
  532. +
  533. +#define s1 r9
  534. +#define s2 r11
  535. +
  536. + .text
  537. + .global strcat
  538. + .type strcat, @function
  539. + .align 1
  540. +strcat: /* r12 = destination (returned unchanged), r11 = s2; r9 = write cursor s1 */
  541. + mov s1, r12
  542. +
  543. + /* Make sure s1 is word-aligned */
  544. + mov r10, s1
  545. + andl r10, 3, COH
  546. + breq 5f /* already aligned: go and find the end (was 2f, which skipped the search) */
  547. +
  548. + add pc, pc, r10 << 3 /* NOTE(review): assumes every 3-insn group below is 8 bytes -- confirm */
  549. + sub r0, r0, 0 /* 4-byte nop */
  550. + ld.ub r8, s1++
  551. + sub r8, r8, 0
  552. + breq 2f /* NOTE(review): s1 is already one past the NUL here; same for 3f/4f below */
  553. + ld.ub r8, s1++
  554. + sub r8, r8, 0
  555. + breq 3f
  556. + ld.ub r8, s1++
  557. + sub r8, r8, 0
  558. + breq 4f
  559. +
  560. + /* Find the end of the first string */
  561. +5: ld.w r8, s1++
  562. + tnbz r8
  563. + brne 5b
  564. +
  565. + sub s1, 4 /* back to the start of the word holding the NUL */
  566. +
  567. + bfextu r10, r8, 24, 8
  568. + cp.w r10, 0
  569. + breq 1f
  570. + sub s1, -1
  571. + bfextu r10, r8, 16, 8
  572. + cp.w r10, 0
  573. + breq 2f
  574. + sub s1, -1
  575. + bfextu r10, r8, 8, 8
  576. + cp.w r10, 0
  577. + breq 3f
  578. + sub s1, -1
  579. + rjmp 4f
  580. +
  581. + /* Now, append s2 */
  582. +1: ld.ub r8, s2++
  583. + st.b s1++, r8
  584. + cp.w r8, 0
  585. + reteq r12
  586. +2: ld.ub r8, s2++
  587. + st.b s1++, r8
  588. + cp.w r8, 0
  589. + reteq r12
  590. +3: ld.ub r8, s2++
  591. + st.b s1++, r8
  592. + cp.w r8, 0
  593. + reteq r12
  594. +4: ld.ub r8, s2++
  595. + st.b s1++, r8
  596. + cp.w r8, 0
  597. + reteq r12
  598. +
  599. + /* Copy one word at a time */
  600. + ld.w r8, s2++ /* NOTE(review): s2 alignment is never checked before this word load */
  601. + tnbz r8
  602. + breq 6f /* NUL in the first word (was 2f, which had no forward target) */
  603. +1: st.w s1++, r8 /* was "st.w r8, s2++": operands swapped and wrong pointer */
  604. + ld.w r8, s2++
  605. + tnbz r8
  606. + brne 1b
  607. +
  608. + /* Copy the remaining bytes */
  609. +6: bfextu r10, r8, 24, 8
  610. + st.b s1++, r10
  611. + cp.w r10, 0
  612. + reteq r12
  613. + bfextu r10, r8, 16, 8
  614. + st.b s1++, r10
  615. + cp.w r10, 0
  616. + reteq r12
  617. + bfextu r10, r8, 8, 8
  618. + st.b s1++, r10
  619. + cp.w r10, 0
  620. + reteq r12
  621. + st.b s1++, r8 /* fourth byte: must be the NUL, the first three were not */
  622. + retal r12
  623. + .size strcat, . - strcat
  624. Index: uClibc-0.9.28-avr32/libc/string/avr32/strcmp.S
  625. ===================================================================
  626. --- /dev/null 1970-01-01 00:00:00.000000000 +0000
  627. +++ uClibc-0.9.28-avr32/libc/string/avr32/strcmp.S 2006-10-19 15:05:52.000000000 +0200
  628. @@ -0,0 +1,80 @@
  629. +/*
  630. + * Copyright (C) 2004 Atmel Norway.
  631. + */
  632. +
  633. +#define s1 r12
  634. +#define s2 r11
  635. +#define len r10
  636. +
  637. + .text
  638. + .global strcmp
  639. + .type strcmp, @function
  640. + .align 1
  641. +strcmp: /* r12 = s1, r11 = s2; result in r12: <0, 0 or >0 */
  642. + mov r8, 3 /* alignment mask */
  643. + tst s1, r8
  644. + brne .Lunaligned_s1
  645. + tst s2, r8
  646. + brne .Lunaligned_s2
  647. +
  648. +1: ld.w r8, s1++ /* both pointers word-aligned: compare a word at a time */
  649. + ld.w r9, s2++
  650. + cp.w r8, r9
  651. + brne 2f
  652. + tnbz r8
  653. + brne 1b /* equal words with no NUL byte: keep going */
  654. + retal 0 /* equal words containing a NUL: the strings are equal */
  655. +
  656. +2: bfextu r12, r8, 24, 8 /* words differ: compare byte pairs, bits 31..24 first */
  657. + bfextu r11, r9, 24, 8
  658. + sub r12, r11
  659. + retne r12
  660. + cp.w r11, 0 /* equal pair; if it is the NUL, the strings ended equal */
  661. + reteq 0
  662. + bfextu r12, r8, 16, 8
  663. + bfextu r11, r9, 16, 8
  664. + sub r12, r11
  665. + retne r12
  666. + cp.w r11, 0
  667. + reteq 0
  668. + bfextu r12, r8, 8, 8
  669. + bfextu r11, r9, 8, 8
  670. + sub r12, r11
  671. + retne r12
  672. + cp.w r11, 0
  673. + reteq 0
  674. + bfextu r12, r8, 0, 8
  675. + bfextu r11, r9, 0, 8
  676. + sub r12, r11
  677. + retal r12
  678. +
  679. +.Lunaligned_s1:
  680. +3: tst s1, r8 /* r8 is still the mask 3 here */
  681. + breq 4f
  682. + ld.ub r10, s1++
  683. + ld.ub r9, s2++
  684. + sub r10, r9
  685. + retne r10
  686. + cp.w r9, 0
  687. + brne 3b
  688. + retal r10 /* r10 is 0 here: both strings ended */
  689. +
  690. +4: tst s2, r8
  691. + breq 1b /* s1 and s2 are now both aligned: use the word loop */
  692. +
  693. +.Lunaligned_s2:
  694. + /*
  695. + * s1 and s2 can't both be aligned, and unaligned word loads
  696. + * can trigger spurious exceptions if we cross a page boundary.
  697. + * Do it the slow way...
  698. + */
  699. +1: ld.ub r8, s1++
  700. + ld.ub r9, s2++
  701. + sub r8, r9
  702. + retne r8
  703. + cp.w r9, 0
  704. + brne 1b
  705. + retal 0
  706. +
  707. + .weak strcoll
  708. + strcoll = strcmp
  709. Index: uClibc-0.9.28-avr32/libc/string/avr32/strcpy.S
  710. ===================================================================
  711. --- /dev/null 1970-01-01 00:00:00.000000000 +0000
  712. +++ uClibc-0.9.28-avr32/libc/string/avr32/strcpy.S 2006-10-19 15:05:52.000000000 +0200
  713. @@ -0,0 +1,63 @@
  714. +/*
  715. + * Copyright (C) 2004 Atmel Norway
  716. + *
  717. + * To reduce the size, this one might simply call strncpy with len = -1.
  718. + */
  719. +
  720. +#define dst r9
  721. +#define src r11
  722. +
  723. + .text
  724. + .global strcpy
  725. + .type strcpy, @function
  726. +strcpy: /* r12 = destination (returned unchanged), r11 = src; r9 = write cursor */
  727. + mov dst, r12
  728. +
  729. + pref src[0]
  730. +
  731. + /*
  732. + * Check alignment. If src is aligned but dst isn't, we can't
  733. + * do much about it...
  734. + */
  735. + mov r8, src
  736. + andl r8, 3, COH
  737. + brne .Lunaligned_src
  738. +
  739. +.Laligned_copy:
  740. +1: ld.w r8, src++
  741. + tnbz r8
  742. + breq 2f /* this word contains the terminator */
  743. + st.w dst++, r8
  744. + rjmp 1b
  745. +
  746. +2: /*
  747. + * Ok, r8 now contains the terminating '\0'. Copy the
  748. + * remaining bytes individually.
  749. + */
  750. + bfextu r10, r8, 24, 8
  751. + st.b dst++, r10
  752. + cp.w r10, 0
  753. + reteq r12
  754. + bfextu r10, r8, 16, 8
  755. + st.b dst++, r10
  756. + cp.w r10, 0
  757. + reteq r12
  758. + bfextu r10, r8, 8, 8
  759. + st.b dst++, r10
  760. + cp.w r10, 0
  761. + reteq r12
  762. + st.b dst++, r8 /* fourth byte: must be the '\0', the first three were not */
  763. + retal r12
  764. +
  765. +.Lunaligned_src:
  766. + /* Copy 4 - (src & 3) bytes one at a time, stopping early at '\0' */
  767. + rsub r8, r8, 4
  768. +1: ld.ub r10, src++
  769. + st.b dst++, r10
  770. + cp.w r10, 0
  771. + reteq r12
  772. + sub r8, 1
  773. + brne 1b
  774. + /* src is now word-aligned; carry on a word at a time */
  775. +
  776. + rjmp .Laligned_copy
  777. Index: uClibc-0.9.28-avr32/libc/string/avr32/stringtest.c
  778. ===================================================================
  779. --- /dev/null 1970-01-01 00:00:00.000000000 +0000
  780. +++ uClibc-0.9.28-avr32/libc/string/avr32/stringtest.c 2006-10-19 15:05:52.000000000 +0200
  781. @@ -0,0 +1,144 @@
  782. +
  783. +#include <stdio.h>
  784. +#include <string.h>
  785. +#include <time.h>
  786. +#include <sys/mman.h>
  787. +
  788. +#define BUF_SIZE (8 * 1024)
  789. +
  790. +static char *buf1;
  791. +static char *buf1_ref;
  792. +static char *buf2;
  793. +
  794. +extern void *optimized_memcpy(void *dest, void *src, size_t len);
  795. +extern void *optimized_memmove(void *dest, void *src, size_t len);
  796. +extern char *optimized_strcpy(char *dest, char *src);
  797. +extern char *optimized_strncpy(char *dest, char *src, size_t len);
  798. +
  799. +void dump_mismatch(char *buf, char *ref, size_t len)
  800. +{
  801. + unsigned int i, j; /* unsigned: matches %x and the size_t bound */
  802. + /* Hex-dump each 16-byte row where buf and ref differ; callers pass BUF_SIZE (a multiple of 16). */
  803. + for (i = 0; i < len; i += 16) {
  804. + if (memcmp(buf + i, ref + i, 16) == 0)
  805. + continue;
  806. + /* Cast to unsigned char so bytes >= 0x80 are not sign-extended to ffffffXX. */
  807. + printf("%4x buf:", i);
  808. + for (j = i; j < (i + 16); j++)
  809. + printf(" %02x", (unsigned char)buf[j]);
  810. + printf("\n ref:");
  811. + for (j = i; j < (i + 16); j++)
  812. + printf(" %02x", (unsigned char)ref[j]);
  813. + printf("\n");
  814. + }
  815. +}
  816. +
  817. +static void test_memcpy(int src_offset, int dst_offset, int len)
  818. +{
  819. + clock_t start, old, new;
  820. + int i;
  821. + /* Run optimized_memcpy and libc memcpy 8192 times each; compare results and CPU time. */
  822. + memset(buf1, 0x55, BUF_SIZE);
  823. + memset(buf1_ref, 0x55, BUF_SIZE);
  824. + memset(buf2, 0xaa, BUF_SIZE);
  825. +
  826. + printf("Testing memcpy with offsets %d => %d and len %d...",
  827. + src_offset, dst_offset, len);
  828. +
  829. + start = clock();
  830. + for (i = 0; i < 8192; i++)
  831. + optimized_memcpy(buf1 + dst_offset, buf2 + src_offset, len);
  832. + new = clock() - start;
  833. + start = clock();
  834. + for (i = 0; i < 8192; i++)
  835. + memcpy(buf1_ref + dst_offset, buf2 + src_offset, len);
  836. + old = clock() - start;
  837. + /* Compare the whole buffers so writes outside the requested range are caught too. */
  838. + if (memcmp(buf1, buf1_ref, BUF_SIZE) == 0)
  839. + printf("OK\n");
  840. + else {
  841. + printf("FAILED\n");
  842. + dump_mismatch(buf1, buf1_ref, BUF_SIZE);
  843. + }
  844. + printf("CPU time used: %ld vs. %ld\n", (long)new, (long)old); /* clock_t need not be int */
  845. +}
  846. +
  847. +static void test_memmove(int src_offset, int dst_offset, int len)
  848. +{
  849. + clock_t start, old, new;
  850. + /* One optimized_memmove run against one libc memmove run. */
  851. + memset(buf1, 0x55, BUF_SIZE);
  852. + memset(buf1_ref, 0x55, BUF_SIZE);
  853. + memset(buf2, 0xaa, BUF_SIZE);
  854. + /* NOTE(review): source and destination are separate buffers, so overlap is never exercised. */
  855. + printf("Testing memmove with offsets %d => %d and len %d...",
  856. + src_offset, dst_offset, len);
  857. +
  858. + start = clock();
  859. + optimized_memmove(buf1 + dst_offset, buf2 + src_offset, len);
  860. + new = clock() - start;
  861. + start = clock();
  862. + memmove(buf1_ref + dst_offset, buf2 + src_offset, len);
  863. + old = clock() - start;
  864. +
  865. + if (memcmp(buf1, buf1_ref, BUF_SIZE) == 0)
  866. + printf("OK\n");
  867. + else {
  868. + printf("FAILED\n");
  869. + dump_mismatch(buf1, buf1_ref, BUF_SIZE);
  870. + }
  871. + printf("CPU time used: %ld vs. %ld\n", (long)new, (long)old); /* clock_t need not be int */
  872. +}
  873. +
  874. +int main(int argc, char *argv[])
  875. +{ /* Map three anonymous BUF_SIZE buffers, then run the memcpy and memmove cases. */
  876. + buf2 = mmap(NULL, BUF_SIZE, PROT_READ | PROT_WRITE,
  877. + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); /* fd must be -1 for portable anonymous maps */
  878. + if (buf2 == MAP_FAILED) {
  879. + perror("Failed to allocate memory for buf2");
  880. + return 1;
  881. + }
  882. + buf1 = mmap(NULL, BUF_SIZE, PROT_READ | PROT_WRITE,
  883. + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  884. + if (buf1 == MAP_FAILED) {
  885. + perror("Failed to allocate memory for buf1");
  886. + return 1;
  887. + }
  888. + buf1_ref = mmap(NULL, BUF_SIZE, PROT_READ | PROT_WRITE,
  889. + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  890. + if (buf1_ref == MAP_FAILED) {
  891. + perror("Failed to allocate memory for buf1_ref");
  892. + return 1;
  893. + }
  894. + printf("\n === MEMCPY ===\n\n");
  895. +
  896. + test_memcpy(0, 0, BUF_SIZE - 32);
  897. + test_memcpy(0, 0, 1);
  898. + test_memcpy(0, 0, 31);
  899. + test_memcpy(0, 0, 32);
  900. + test_memcpy(0, 0, 127);
  901. + test_memcpy(0, 0, 128);
  902. + test_memcpy(4, 4, BUF_SIZE - 32 - 4);
  903. + test_memcpy(1, 1, BUF_SIZE - 32 - 1);
  904. + test_memcpy(1, 1, 126);
  905. + test_memcpy(0, 3, 128);
  906. + test_memcpy(1, 4, 128);
  907. + test_memcpy(0, 0, 0);
  908. +
  909. + printf("\n === MEMMOVE ===\n\n");
  910. +
  911. + test_memmove(0, 0, BUF_SIZE - 32);
  912. + test_memmove(0, 0, 1);
  913. + test_memmove(0, 0, 31);
  914. + test_memmove(0, 0, 32);
  915. + test_memmove(0, 0, BUF_SIZE - 33);
  916. + test_memmove(0, 0, 128);
  917. + test_memmove(4, 4, BUF_SIZE - 32 - 4);
  918. + test_memmove(1, 1, BUF_SIZE - 32 - 1);
  919. + test_memmove(1, 1, BUF_SIZE - 130);
  920. + test_memmove(0, 3, BUF_SIZE - 128);
  921. + test_memmove(1, 4, BUF_SIZE - 128);
  922. + test_memmove(0, 0, 0);
  923. +
  924. + return 0;
  925. +}
  926. Index: uClibc-0.9.28-avr32/libc/string/avr32/strlen.S
  927. ===================================================================
  928. --- /dev/null 1970-01-01 00:00:00.000000000 +0000
  929. +++ uClibc-0.9.28-avr32/libc/string/avr32/strlen.S 2006-10-19 15:05:52.000000000 +0200
  930. @@ -0,0 +1,52 @@
  931. +/*
  932. + * Copyright (C) 2004 Atmel Norway
  933. + */
  934. +
  935. +#define str r12
  936. +
  937. + .text
  938. + .global strlen
  939. + .type strlen, @function
  940. +strlen: /* r12 = str on entry, length on return */
  941. + mov r11, r12 /* remember where the string starts */
  942. +
  943. + mov r9, str
  944. + andl r9, 3, COH /* r9 = str & 3 */
  945. + brne .Lunaligned_str
  946. +
  947. +1: ld.w r8, str++ /* scan a word at a time until one holds a NUL byte */
  948. + tnbz r8
  949. + brne 1b
  950. +
  951. + sub r12, r11 /* bytes consumed, including the whole final word */
  952. + bfextu r9, r8, 24, 8 /* find which byte is the NUL, bits 31..24 first */
  953. + cp.w r9, 0
  954. + subeq r12, 4
  955. + reteq r12
  956. + bfextu r9, r8, 16, 8
  957. + cp.w r9, 0
  958. + subeq r12, 3
  959. + reteq r12
  960. + bfextu r9, r8, 8, 8
  961. + cp.w r9, 0
  962. + subeq r12, 2
  963. + reteq r12
  964. + sub r12, 1 /* the NUL is the fourth byte of the final word */
  965. + retal r12
  966. +
  967. +.Lunaligned_str:
  968. + add pc, pc, r9 << 3 /* computed jump into the byte checks; assumes 8-byte groups -- TODO confirm */
  969. + sub r0, r0, 0 /* 4-byte nop */
  970. + ld.ub r8, str++
  971. + sub r8, r8, 0 /* sets the flags from r8 without changing it */
  972. + breq 1f
  973. + ld.ub r8, str++
  974. + sub r8, r8, 0
  975. + breq 1f
  976. + ld.ub r8, str++
  977. + sub r8, r8, 0
  978. + brne 1b /* no NUL in the leading bytes: continue with the word loop */
  979. +
  980. +1: sub r12, 1 /* str was post-incremented past the NUL */
  981. + sub r12, r11
  982. + retal r12 /* NOTE(review): no .size directive follows */
  983. Index: uClibc-0.9.28-avr32/libc/string/avr32/strncpy.S
  984. ===================================================================
  985. --- /dev/null 1970-01-01 00:00:00.000000000 +0000
  986. +++ uClibc-0.9.28-avr32/libc/string/avr32/strncpy.S 2006-10-19 15:05:52.000000000 +0200
  987. @@ -0,0 +1,77 @@
  988. +/*
  989. + * Copyright (C) 2004 Atmel Norway
  990. + */
  991. +
  992. +#define dst r9
  993. +#define src r11
  994. +
  995. + .text
  996. + .global strncpy
  997. + .type strncpy, @function
  998. +strncpy: /* r12 = destination (returned unchanged), r11 = src, r10 = n */
  999. + mov dst, r12
  1000. +
  1001. + pref src[0]
  1002. + mov dst, r12 /* redundant: dst was already set above */
  1003. +
  1004. + /*
  1005. + * Check alignment. If src is aligned but dst isn't, we can't
  1006. + * do much about it...
  1007. + */
  1008. + mov r8, src
  1009. + andl r8, 3, COH
  1010. + brne .Lunaligned_src
  1011. +
  1012. +.Laligned_copy:
  1013. + sub r10, 4 /* r10 = (bytes left) - 4 */
  1014. + brlt 3f
  1015. +1: ld.w r8, src++
  1016. + tnbz r8
  1017. + breq 2f /* NOTE(review): r10 is still (bytes left) - 4 on this path */
  1018. + st.w dst++, r8
  1019. + sub r10, 4
  1020. + brge 1b /* was brne: overran when n was not a multiple of 4 */
  1021. +
  1022. +3: sub r10, -4 /* r10 = bytes left (0..3) */
  1023. + reteq r12
  1024. +
  1025. + /* This is safe as long as src is word-aligned and r10 > 0 */
  1026. + ld.w r8, src++
  1027. +
  1028. +2: /*
  1029. + * Ok, r8 now contains the terminating '\0'. Copy the
  1030. + * remaining bytes individually.
  1031. + */
  1032. + bfextu r11, r8, 24, 8
  1033. + st.b dst++, r11
  1034. + cp.w r11, 0
  1035. + reteq r12 /* NOTE(review): returns at the NUL without zero-padding up to n */
  1036. + sub r10, 1
  1037. + reteq r12
  1038. + bfextu r11, r8, 16, 8
  1039. + st.b dst++, r11
  1040. + cp.w r11, 0
  1041. + reteq r12
  1042. + sub r10, 1
  1043. + reteq r12
  1044. + bfextu r11, r8, 8, 8
  1045. + st.b dst++, r11
  1046. + cp.w r11, 0
  1047. + reteq r12
  1048. + sub r10, 1
  1049. + reteq r12
  1050. + st.b dst++, r8
  1051. + retal r12
  1052. +
  1053. +.Lunaligned_src:
  1054. + /* Copy bytes until we're aligned */
  1055. + min r8, r8, r10 /* NOTE(review): uses (src & 3), not 4 - (src & 3) as strcpy.S does */
  1056. + sub r10, r8
  1057. + sub r8, 1
  1058. + retlt r12
  1059. +1: ld.ub r10, src++ /* NOTE(review): overwrites r10, the remaining count; no NUL check */
  1060. + st.b dst++, r10
  1061. + sub r8, 1
  1062. + brge 1b
  1063. +
  1064. + rjmp .Laligned_copy
  1065. Index: uClibc-0.9.28-avr32/libc/string/avr32/test_memcpy.c
  1066. ===================================================================
  1067. --- /dev/null 1970-01-01 00:00:00.000000000 +0000
  1068. +++ uClibc-0.9.28-avr32/libc/string/avr32/test_memcpy.c 2006-10-19 15:05:52.000000000 +0200
  1069. @@ -0,0 +1,66 @@
  1070. +
  1071. +#include <stdio.h>
  1072. +#include <string.h>
  1073. +
  1074. +#define BUF_SIZE 32768
  1075. +
  1076. +static char buf1[BUF_SIZE] __attribute__((aligned(32)));
  1077. +static char buf1_ref[BUF_SIZE] __attribute__((aligned(32)));
  1078. +static char buf2[BUF_SIZE] __attribute__((aligned(32)));
  1079. +
  1080. +extern void *new_memcpy(void *dest, void *src, size_t len);
  1081. +
  1082. +void dump_mismatch(char *buf, char *ref, size_t len)
  1083. +{
  1084. + unsigned int i, j; /* unsigned: matches %x and the size_t bound */
  1085. + /* Hex-dump each 16-byte row where buf and ref differ; the caller passes sizeof(buf1) (a multiple of 16). */
  1086. + for (i = 0; i < len; i += 16) {
  1087. + if (memcmp(buf + i, ref + i, 16) == 0)
  1088. + continue;
  1089. + /* "%4x", not "% 4x": the space flag is only defined for signed conversions. */
  1090. + printf("%4x buf:", i);
  1091. + for (j = i; j < (i + 16); j++)
  1092. + printf(" %02x", (unsigned char)buf[j]); /* avoid sign extension of bytes >= 0x80 */
  1093. + printf("\n ref:");
  1094. + for (j = i; j < (i + 16); j++)
  1095. + printf(" %02x", (unsigned char)ref[j]);
  1096. + printf("\n");
  1097. + }
  1098. +}
  1099. +
  1100. +void test(int src_offset, int dst_offset, int len)
  1101. +{
  1102. + char *const actual = buf1 + dst_offset; /* filled by new_memcpy */
  1103. + char *const expect = buf1_ref + dst_offset; /* filled by the libc memcpy */
  1104. + memset(buf2, 0xaa, sizeof(buf2)); /* source pattern */
  1105. + memset(buf1_ref, 0x55, sizeof(buf1_ref)); /* destination background */
  1106. + memset(buf1, 0x55, sizeof(buf1));
  1107. + printf("Testing with offsets %d => %d and len %d...",
  1108. + src_offset, dst_offset, len);
  1109. + new_memcpy(actual, buf2 + src_offset, len);
  1110. + memcpy(expect, buf2 + src_offset, len);
  1111. + /* Whole-buffer compare, so writes outside the requested range show up too. */
  1112. + if (memcmp(buf1, buf1_ref, sizeof(buf1)) != 0) {
  1113. + printf("FAILED\n");
  1114. + dump_mismatch(buf1, buf1_ref, sizeof(buf1));
  1115. + return;
  1116. + }
  1117. + printf("OK\n");
  1118. +}
  1119. +
  1120. +int main(int argc, char *argv[])
  1121. +{
  1122. + /* { src_offset, dst_offset, len } for each run, in execution order */
  1123. + static const int cases[][3] = {
  1124. + { 0, 0, BUF_SIZE }, { 0, 0, 1 }, { 0, 0, 31 }, { 0, 0, 32 },
  1125. + { 0, 0, 127 }, { 0, 0, 128 }, { 4, 4, BUF_SIZE - 4 },
  1126. + { 1, 1, BUF_SIZE - 1 }, { 1, 1, 126 }, { 0, 3, 128 },
  1127. + { 1, 4, 128 },
  1128. + };
  1129. + size_t k;
  1130. +
  1131. + for (k = 0; k < sizeof(cases) / sizeof(cases[0]); k++)
  1132. + test(cases[k][0], cases[k][1], cases[k][2]);
  1133. +
  1134. + return 0;
  1135. +}