transform.go 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630
  1. // Copyright 2013 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // Package transform provides reader and writer wrappers that transform the
  5. // bytes passing through as well as various transformations. Example
  6. // transformations provided by other packages include normalization and
  7. // conversion between character sets.
  8. package transform // import "golang.org/x/text/transform"
  9. import (
  10. "bytes"
  11. "errors"
  12. "io"
  13. "unicode/utf8"
  14. )
  15. var (
  16. // ErrShortDst means that the destination buffer was too short to
  17. // receive all of the transformed bytes.
  18. ErrShortDst = errors.New("transform: short destination buffer")
  19. // ErrShortSrc means that the source buffer has insufficient data to
  20. // complete the transformation.
  21. ErrShortSrc = errors.New("transform: short source buffer")
  22. // errInconsistentByteCount means that Transform returned success (nil
  23. // error) but also returned nSrc inconsistent with the src argument.
  24. errInconsistentByteCount = errors.New("transform: inconsistent byte count returned")
  25. // errShortInternal means that an internal buffer is not large enough
  26. // to make progress and the Transform operation must be aborted.
  27. errShortInternal = errors.New("transform: short internal buffer")
  28. )
  29. // Transformer transforms bytes.
  30. type Transformer interface {
  31. // Transform writes to dst the transformed bytes read from src, and
  32. // returns the number of dst bytes written and src bytes read. The
  33. // atEOF argument tells whether src represents the last bytes of the
  34. // input.
  35. //
  36. // Callers should always process the nDst bytes produced and account
  37. // for the nSrc bytes consumed before considering the error err.
  38. //
  39. // A nil error means that all of the transformed bytes (whether freshly
  40. // transformed from src or left over from previous Transform calls)
  41. // were written to dst. A nil error can be returned regardless of
  42. // whether atEOF is true. If err is nil then nSrc must equal len(src);
  43. // the converse is not necessarily true.
  44. //
  45. // ErrShortDst means that dst was too short to receive all of the
  46. // transformed bytes. ErrShortSrc means that src had insufficient data
  47. // to complete the transformation. If both conditions apply, then
  48. // either error may be returned. Other than the error conditions listed
  49. // here, implementations are free to report other errors that arise.
  50. Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error)
  51. // Reset resets the state and allows a Transformer to be reused.
  52. Reset()
  53. }
  54. // NopResetter can be embedded by implementations of Transformer to add a nop
  55. // Reset method.
  56. type NopResetter struct{}
  57. // Reset implements the Reset method of the Transformer interface.
  58. func (NopResetter) Reset() {}
  59. // Reader wraps another io.Reader by transforming the bytes read.
  60. type Reader struct {
  61. r io.Reader
  62. t Transformer
  63. err error
  64. // dst[dst0:dst1] contains bytes that have been transformed by t but
  65. // not yet copied out via Read.
  66. dst []byte
  67. dst0, dst1 int
  68. // src[src0:src1] contains bytes that have been read from r but not
  69. // yet transformed through t.
  70. src []byte
  71. src0, src1 int
  72. // transformComplete is whether the transformation is complete,
  73. // regardless of whether or not it was successful.
  74. transformComplete bool
  75. }
  76. const defaultBufSize = 4096
  77. // NewReader returns a new Reader that wraps r by transforming the bytes read
  78. // via t. It calls Reset on t.
  79. func NewReader(r io.Reader, t Transformer) *Reader {
  80. t.Reset()
  81. return &Reader{
  82. r: r,
  83. t: t,
  84. dst: make([]byte, defaultBufSize),
  85. src: make([]byte, defaultBufSize),
  86. }
  87. }
  88. // Read implements the io.Reader interface.
  89. func (r *Reader) Read(p []byte) (int, error) {
  90. n, err := 0, error(nil)
  91. for {
  92. // Copy out any transformed bytes and return the final error if we are done.
  93. if r.dst0 != r.dst1 {
  94. n = copy(p, r.dst[r.dst0:r.dst1])
  95. r.dst0 += n
  96. if r.dst0 == r.dst1 && r.transformComplete {
  97. return n, r.err
  98. }
  99. return n, nil
  100. } else if r.transformComplete {
  101. return 0, r.err
  102. }
  103. // Try to transform some source bytes, or to flush the transformer if we
  104. // are out of source bytes. We do this even if r.r.Read returned an error.
  105. // As the io.Reader documentation says, "process the n > 0 bytes returned
  106. // before considering the error".
  107. if r.src0 != r.src1 || r.err != nil {
  108. r.dst0 = 0
  109. r.dst1, n, err = r.t.Transform(r.dst, r.src[r.src0:r.src1], r.err == io.EOF)
  110. r.src0 += n
  111. switch {
  112. case err == nil:
  113. if r.src0 != r.src1 {
  114. r.err = errInconsistentByteCount
  115. }
  116. // The Transform call was successful; we are complete if we
  117. // cannot read more bytes into src.
  118. r.transformComplete = r.err != nil
  119. continue
  120. case err == ErrShortDst && (r.dst1 != 0 || n != 0):
  121. // Make room in dst by copying out, and try again.
  122. continue
  123. case err == ErrShortSrc && r.src1-r.src0 != len(r.src) && r.err == nil:
  124. // Read more bytes into src via the code below, and try again.
  125. default:
  126. r.transformComplete = true
  127. // The reader error (r.err) takes precedence over the
  128. // transformer error (err) unless r.err is nil or io.EOF.
  129. if r.err == nil || r.err == io.EOF {
  130. r.err = err
  131. }
  132. continue
  133. }
  134. }
  135. // Move any untransformed source bytes to the start of the buffer
  136. // and read more bytes.
  137. if r.src0 != 0 {
  138. r.src0, r.src1 = 0, copy(r.src, r.src[r.src0:r.src1])
  139. }
  140. n, r.err = r.r.Read(r.src[r.src1:])
  141. r.src1 += n
  142. }
  143. }
  144. // TODO: implement ReadByte (and ReadRune??).
  145. // Writer wraps another io.Writer by transforming the bytes read.
  146. // The user needs to call Close to flush unwritten bytes that may
  147. // be buffered.
  148. type Writer struct {
  149. w io.Writer
  150. t Transformer
  151. dst []byte
  152. // src[:n] contains bytes that have not yet passed through t.
  153. src []byte
  154. n int
  155. }
  156. // NewWriter returns a new Writer that wraps w by transforming the bytes written
  157. // via t. It calls Reset on t.
  158. func NewWriter(w io.Writer, t Transformer) *Writer {
  159. t.Reset()
  160. return &Writer{
  161. w: w,
  162. t: t,
  163. dst: make([]byte, defaultBufSize),
  164. src: make([]byte, defaultBufSize),
  165. }
  166. }
  167. // Write implements the io.Writer interface. If there are not enough
  168. // bytes available to complete a Transform, the bytes will be buffered
  169. // for the next write. Call Close to convert the remaining bytes.
  170. func (w *Writer) Write(data []byte) (n int, err error) {
  171. src := data
  172. if w.n > 0 {
  173. // Append bytes from data to the last remainder.
  174. // TODO: limit the amount copied on first try.
  175. n = copy(w.src[w.n:], data)
  176. w.n += n
  177. src = w.src[:w.n]
  178. }
  179. for {
  180. nDst, nSrc, err := w.t.Transform(w.dst, src, false)
  181. if _, werr := w.w.Write(w.dst[:nDst]); werr != nil {
  182. return n, werr
  183. }
  184. src = src[nSrc:]
  185. if w.n > 0 && len(src) <= n {
  186. // Enough bytes from w.src have been consumed. We make src point
  187. // to data instead to reduce the copying.
  188. w.n = 0
  189. n -= len(src)
  190. src = data[n:]
  191. if n < len(data) && (err == nil || err == ErrShortSrc) {
  192. continue
  193. }
  194. } else {
  195. n += nSrc
  196. }
  197. switch {
  198. case err == ErrShortDst && (nDst > 0 || nSrc > 0):
  199. case err == ErrShortSrc && len(src) < len(w.src):
  200. m := copy(w.src, src)
  201. // If w.n > 0, bytes from data were already copied to w.src and n
  202. // was already set to the number of bytes consumed.
  203. if w.n == 0 {
  204. n += m
  205. }
  206. w.n = m
  207. return n, nil
  208. case err == nil && w.n > 0:
  209. return n, errInconsistentByteCount
  210. default:
  211. return n, err
  212. }
  213. }
  214. }
  215. // Close implements the io.Closer interface.
  216. func (w *Writer) Close() error {
  217. for src := w.src[:w.n]; len(src) > 0; {
  218. nDst, nSrc, err := w.t.Transform(w.dst, src, true)
  219. if nDst == 0 {
  220. return err
  221. }
  222. if _, werr := w.w.Write(w.dst[:nDst]); werr != nil {
  223. return werr
  224. }
  225. if err != ErrShortDst {
  226. return err
  227. }
  228. src = src[nSrc:]
  229. }
  230. return nil
  231. }
  232. type nop struct{ NopResetter }
  233. func (nop) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  234. n := copy(dst, src)
  235. if n < len(src) {
  236. err = ErrShortDst
  237. }
  238. return n, n, err
  239. }
  240. type discard struct{ NopResetter }
  241. func (discard) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  242. return 0, len(src), nil
  243. }
  244. var (
  245. // Discard is a Transformer for which all Transform calls succeed
  246. // by consuming all bytes and writing nothing.
  247. Discard Transformer = discard{}
  248. // Nop is a Transformer that copies src to dst.
  249. Nop Transformer = nop{}
  250. )
  251. // chain is a sequence of links. A chain with N Transformers has N+1 links and
  252. // N+1 buffers. Of those N+1 buffers, the first and last are the src and dst
  253. // buffers given to chain.Transform and the middle N-1 buffers are intermediate
  254. // buffers owned by the chain. The i'th link transforms bytes from the i'th
  255. // buffer chain.link[i].b at read offset chain.link[i].p to the i+1'th buffer
  256. // chain.link[i+1].b at write offset chain.link[i+1].n, for i in [0, N).
  257. type chain struct {
  258. link []link
  259. err error
  260. // errStart is the index at which the error occurred plus 1. Processing
  261. // errStart at this level at the next call to Transform. As long as
  262. // errStart > 0, chain will not consume any more source bytes.
  263. errStart int
  264. }
  265. func (c *chain) fatalError(errIndex int, err error) {
  266. if i := errIndex + 1; i > c.errStart {
  267. c.errStart = i
  268. c.err = err
  269. }
  270. }
  271. type link struct {
  272. t Transformer
  273. // b[p:n] holds the bytes to be transformed by t.
  274. b []byte
  275. p int
  276. n int
  277. }
  278. func (l *link) src() []byte {
  279. return l.b[l.p:l.n]
  280. }
  281. func (l *link) dst() []byte {
  282. return l.b[l.n:]
  283. }
  284. // Chain returns a Transformer that applies t in sequence.
  285. func Chain(t ...Transformer) Transformer {
  286. if len(t) == 0 {
  287. return nop{}
  288. }
  289. c := &chain{link: make([]link, len(t)+1)}
  290. for i, tt := range t {
  291. c.link[i].t = tt
  292. }
  293. // Allocate intermediate buffers.
  294. b := make([][defaultBufSize]byte, len(t)-1)
  295. for i := range b {
  296. c.link[i+1].b = b[i][:]
  297. }
  298. return c
  299. }
  300. // Reset resets the state of Chain. It calls Reset on all the Transformers.
  301. func (c *chain) Reset() {
  302. for i, l := range c.link {
  303. if l.t != nil {
  304. l.t.Reset()
  305. }
  306. c.link[i].p, c.link[i].n = 0, 0
  307. }
  308. }
  309. // Transform applies the transformers of c in sequence.
  310. func (c *chain) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  311. // Set up src and dst in the chain.
  312. srcL := &c.link[0]
  313. dstL := &c.link[len(c.link)-1]
  314. srcL.b, srcL.p, srcL.n = src, 0, len(src)
  315. dstL.b, dstL.n = dst, 0
  316. var lastFull, needProgress bool // for detecting progress
  317. // i is the index of the next Transformer to apply, for i in [low, high].
  318. // low is the lowest index for which c.link[low] may still produce bytes.
  319. // high is the highest index for which c.link[high] has a Transformer.
  320. // The error returned by Transform determines whether to increase or
  321. // decrease i. We try to completely fill a buffer before converting it.
  322. for low, i, high := c.errStart, c.errStart, len(c.link)-2; low <= i && i <= high; {
  323. in, out := &c.link[i], &c.link[i+1]
  324. nDst, nSrc, err0 := in.t.Transform(out.dst(), in.src(), atEOF && low == i)
  325. out.n += nDst
  326. in.p += nSrc
  327. if i > 0 && in.p == in.n {
  328. in.p, in.n = 0, 0
  329. }
  330. needProgress, lastFull = lastFull, false
  331. switch err0 {
  332. case ErrShortDst:
  333. // Process the destination buffer next. Return if we are already
  334. // at the high index.
  335. if i == high {
  336. return dstL.n, srcL.p, ErrShortDst
  337. }
  338. if out.n != 0 {
  339. i++
  340. // If the Transformer at the next index is not able to process any
  341. // source bytes there is nothing that can be done to make progress
  342. // and the bytes will remain unprocessed. lastFull is used to
  343. // detect this and break out of the loop with a fatal error.
  344. lastFull = true
  345. continue
  346. }
  347. // The destination buffer was too small, but is completely empty.
  348. // Return a fatal error as this transformation can never complete.
  349. c.fatalError(i, errShortInternal)
  350. case ErrShortSrc:
  351. if i == 0 {
  352. // Save ErrShortSrc in err. All other errors take precedence.
  353. err = ErrShortSrc
  354. break
  355. }
  356. // Source bytes were depleted before filling up the destination buffer.
  357. // Verify we made some progress, move the remaining bytes to the errStart
  358. // and try to get more source bytes.
  359. if needProgress && nSrc == 0 || in.n-in.p == len(in.b) {
  360. // There were not enough source bytes to proceed while the source
  361. // buffer cannot hold any more bytes. Return a fatal error as this
  362. // transformation can never complete.
  363. c.fatalError(i, errShortInternal)
  364. break
  365. }
  366. // in.b is an internal buffer and we can make progress.
  367. in.p, in.n = 0, copy(in.b, in.src())
  368. fallthrough
  369. case nil:
  370. // if i == low, we have depleted the bytes at index i or any lower levels.
  371. // In that case we increase low and i. In all other cases we decrease i to
  372. // fetch more bytes before proceeding to the next index.
  373. if i > low {
  374. i--
  375. continue
  376. }
  377. default:
  378. c.fatalError(i, err0)
  379. }
  380. // Exhausted level low or fatal error: increase low and continue
  381. // to process the bytes accepted so far.
  382. i++
  383. low = i
  384. }
  385. // If c.errStart > 0, this means we found a fatal error. We will clear
  386. // all upstream buffers. At this point, no more progress can be made
  387. // downstream, as Transform would have bailed while handling ErrShortDst.
  388. if c.errStart > 0 {
  389. for i := 1; i < c.errStart; i++ {
  390. c.link[i].p, c.link[i].n = 0, 0
  391. }
  392. err, c.errStart, c.err = c.err, 0, nil
  393. }
  394. return dstL.n, srcL.p, err
  395. }
  396. // RemoveFunc returns a Transformer that removes from the input all runes r for
  397. // which f(r) is true. Illegal bytes in the input are replaced by RuneError.
  398. func RemoveFunc(f func(r rune) bool) Transformer {
  399. return removeF(f)
  400. }
  401. type removeF func(r rune) bool
  402. func (removeF) Reset() {}
  403. // Transform implements the Transformer interface.
  404. func (t removeF) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  405. for r, sz := rune(0), 0; len(src) > 0; src = src[sz:] {
  406. if r = rune(src[0]); r < utf8.RuneSelf {
  407. sz = 1
  408. } else {
  409. r, sz = utf8.DecodeRune(src)
  410. if sz == 1 {
  411. // Invalid rune.
  412. if !atEOF && !utf8.FullRune(src) {
  413. err = ErrShortSrc
  414. break
  415. }
  416. // We replace illegal bytes with RuneError. Not doing so might
  417. // otherwise turn a sequence of invalid UTF-8 into valid UTF-8.
  418. // The resulting byte sequence may subsequently contain runes
  419. // for which t(r) is true that were passed unnoticed.
  420. if !t(r) {
  421. if nDst+3 > len(dst) {
  422. err = ErrShortDst
  423. break
  424. }
  425. nDst += copy(dst[nDst:], "\uFFFD")
  426. }
  427. nSrc++
  428. continue
  429. }
  430. }
  431. if !t(r) {
  432. if nDst+sz > len(dst) {
  433. err = ErrShortDst
  434. break
  435. }
  436. nDst += copy(dst[nDst:], src[:sz])
  437. }
  438. nSrc += sz
  439. }
  440. return
  441. }
  442. // grow returns a new []byte that is longer than b, and copies the first n bytes
  443. // of b to the start of the new slice.
  444. func grow(b []byte, n int) []byte {
  445. m := len(b)
  446. if m <= 256 {
  447. m *= 2
  448. } else {
  449. m += m >> 1
  450. }
  451. buf := make([]byte, m)
  452. copy(buf, b[:n])
  453. return buf
  454. }
  455. const initialBufSize = 128
  456. // String returns a string with the result of converting s[:n] using t, where
  457. // n <= len(s). If err == nil, n will be len(s). It calls Reset on t.
  458. func String(t Transformer, s string) (result string, n int, err error) {
  459. if s == "" {
  460. return "", 0, nil
  461. }
  462. t.Reset()
  463. // Allocate only once. Note that both dst and src escape when passed to
  464. // Transform.
  465. buf := [2 * initialBufSize]byte{}
  466. dst := buf[:initialBufSize:initialBufSize]
  467. src := buf[initialBufSize : 2*initialBufSize]
  468. // Avoid allocation if the transformed string is identical to the original.
  469. // After this loop, pDst will point to the furthest point in s for which it
  470. // could be detected that t gives equal results, src[:nSrc] will
  471. // indicated the last processed chunk of s for which the output is not equal
  472. // and dst[:nDst] will be the transform of this chunk.
  473. var nDst, nSrc int
  474. pDst := 0 // Used as index in both src and dst in this loop.
  475. for {
  476. n := copy(src, s[pDst:])
  477. nDst, nSrc, err = t.Transform(dst, src[:n], pDst+n == len(s))
  478. // Note 1: we will not enter the loop with pDst == len(s) and we will
  479. // not end the loop with it either. So if nSrc is 0, this means there is
  480. // some kind of error from which we cannot recover given the current
  481. // buffer sizes. We will give up in this case.
  482. // Note 2: it is not entirely correct to simply do a bytes.Equal as
  483. // a Transformer may buffer internally. It will work in most cases,
  484. // though, and no harm is done if it doesn't work.
  485. // TODO: let transformers implement an optional Spanner interface, akin
  486. // to norm's QuickSpan. This would even allow us to avoid any allocation.
  487. if nSrc == 0 || !bytes.Equal(dst[:nDst], src[:nSrc]) {
  488. break
  489. }
  490. if pDst += nDst; pDst == len(s) {
  491. return s, pDst, nil
  492. }
  493. }
  494. // Move the bytes seen so far to dst.
  495. pSrc := pDst + nSrc
  496. if pDst+nDst <= initialBufSize {
  497. copy(dst[pDst:], dst[:nDst])
  498. } else {
  499. b := make([]byte, len(s)+nDst-nSrc)
  500. copy(b[pDst:], dst[:nDst])
  501. dst = b
  502. }
  503. copy(dst, s[:pDst])
  504. pDst += nDst
  505. if err != nil && err != ErrShortDst && err != ErrShortSrc {
  506. return string(dst[:pDst]), pSrc, err
  507. }
  508. // Complete the string with the remainder.
  509. for {
  510. n := copy(src, s[pSrc:])
  511. nDst, nSrc, err = t.Transform(dst[pDst:], src[:n], pSrc+n == len(s))
  512. pDst += nDst
  513. pSrc += nSrc
  514. switch err {
  515. case nil:
  516. if pSrc == len(s) {
  517. return string(dst[:pDst]), pSrc, nil
  518. }
  519. case ErrShortDst:
  520. // Do not grow as long as we can make progress. This may avoid
  521. // excessive allocations.
  522. if nDst == 0 {
  523. dst = grow(dst, pDst)
  524. }
  525. case ErrShortSrc:
  526. if nSrc == 0 {
  527. src = grow(src, 0)
  528. }
  529. default:
  530. return string(dst[:pDst]), pSrc, err
  531. }
  532. }
  533. }
  534. // Bytes returns a new byte slice with the result of converting b[:n] using t,
  535. // where n <= len(b). If err == nil, n will be len(b). It calls Reset on t.
  536. func Bytes(t Transformer, b []byte) (result []byte, n int, err error) {
  537. return doAppend(t, 0, make([]byte, len(b)), b)
  538. }
  539. // Append appends the result of converting src[:n] using t to dst, where
  540. // n <= len(src), If err == nil, n will be len(src). It calls Reset on t.
  541. func Append(t Transformer, dst, src []byte) (result []byte, n int, err error) {
  542. if len(dst) == cap(dst) {
  543. n := len(src) + len(dst) // It is okay for this to be 0.
  544. b := make([]byte, n)
  545. dst = b[:copy(b, dst)]
  546. }
  547. return doAppend(t, len(dst), dst[:cap(dst)], src)
  548. }
  549. func doAppend(t Transformer, pDst int, dst, src []byte) (result []byte, n int, err error) {
  550. t.Reset()
  551. pSrc := 0
  552. for {
  553. nDst, nSrc, err := t.Transform(dst[pDst:], src[pSrc:], true)
  554. pDst += nDst
  555. pSrc += nSrc
  556. if err != ErrShortDst {
  557. return dst[:pDst], pSrc, err
  558. }
  559. // Grow the destination buffer, but do not grow as long as we can make
  560. // progress. This may avoid excessive allocations.
  561. if nDst == 0 {
  562. dst = grow(dst, pDst)
  563. }
  564. }
  565. }