TextDetector.cs 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118
  1. using System.Collections.Generic;
  2. using System.IO;
  3. using System.Linq;
  4. using System.Reflection;
  5. using System.Text;
  6. using Masuit.Tools.Mime;
  7. namespace Masuit.Tools.Files.FileDetector.Detectors;
  /// <summary>
  /// Detects plain-text content by sampling the beginning of a stream, decoding it with one or
  /// more candidate encodings, and checking that the decoded characters contain no control
  /// characters (other than CR, LF and TAB) and re-encode to exactly the same bytes.
  /// </summary>
  /// <remarks>
  /// All scratch buffers are allocated per call, so <see cref="Detect"/> keeps no mutable
  /// shared state between invocations.
  /// </remarks>
  [FormatCategory(FormatCategory.Document)]
  internal sealed class TextDetector : IDetector
  {
      /// <summary>Length of the longest byte-order mark that is recognised (UTF-32).</summary>
      private const int SignatureLength = 4;

      /// <summary>Maximum number of payload bytes (after any byte-order mark) that are sampled.</summary>
      private const int SampleLength = 1024;

      /// <summary>
      /// Maximum number of trailing bytes that may be dropped from the sample so that a
      /// multi-byte character cut off at the end of the sample does not fail the check.
      /// </summary>
      private const int MaxTrimmedBytes = 16;

      /// <summary>Capacity of the decode / re-encode scratch buffers.</summary>
      private const int ScratchLength = 4096;

      private static readonly Encoding[] Utf8Encodings = { Encoding.UTF8 };
      private static readonly Encoding[] Utf16LeEncodings = { Encoding.Unicode };
      private static readonly Encoding[] Utf16BeEncodings = { Encoding.BigEndianUnicode };
      private static readonly Encoding[] Utf32LeEncodings = { Encoding.UTF32 };
      private static readonly Encoding[] Utf32BeEncodings = { new UTF32Encoding(bigEndian: true, byteOrderMark: false) };

      /// <summary>Encodings tried, in order, when the stream does not start with a byte-order mark.</summary>
      private static readonly Encoding[] OtherwiseEncodings =
      {
          Encoding.ASCII,
          Encoding.UTF8,
          Encoding.UTF32,
          Encoding.Unicode,
          Encoding.BigEndianUnicode,
      };

      /// <summary>File extension reported for detected content.</summary>
      public string Extension => "txt";

      /// <summary>This detector declares no precondition.</summary>
      public string Precondition => null;

      /// <summary>MIME type looked up from <see cref="Extension"/>.</summary>
      public string MimeType => new MimeMapper().GetMimeFromExtension("." + Extension);

      /// <summary>Categories declared on this type through <see cref="FormatCategoryAttribute"/>.</summary>
      public List<FormatCategory> FormatCategories => GetType().GetCustomAttributes<FormatCategoryAttribute>().Select(a => a.Category).ToList();

      /// <summary>
      /// Determines whether the start of <paramref name="stream"/> looks like plain text.
      /// </summary>
      /// <param name="stream">
      /// A readable, seekable stream. It is rewound to its beginning before inspection and is
      /// left positioned after the sampled bytes.
      /// </param>
      /// <returns>
      /// <c>true</c> when the sampled bytes (optionally minus a few trailing bytes) decode to
      /// text without disallowed control characters and round-trip byte-for-byte through one of
      /// the candidate encodings, or when there is no payload at all; otherwise <c>false</c>.
      /// </returns>
      public bool Detect(Stream stream)
      {
          // Inspect the signature and the payload from the same origin.
          stream.Seek(0, SeekOrigin.Begin);

          var signature = new byte[SignatureLength];
          int signatureBytes = ReadFully(stream, signature, SignatureLength);

          Encoding[] candidates;
          int bomLength;
          if (signatureBytes >= 3 && signature[0] == 0xef && signature[1] == 0xbb && signature[2] == 0xbf)
          {
              candidates = Utf8Encodings;
              bomLength = 3;
          }
          else if (signatureBytes >= 4 && signature[0] == 0xff && signature[1] == 0xfe && signature[2] == 0 && signature[3] == 0)
          {
              // Must be tested before the UTF-16 LE mark, which is a prefix of this one.
              // (UTF-16 LE text starting with U+0000 would be rejected as text anyway.)
              candidates = Utf32LeEncodings;
              bomLength = 4;
          }
          else if (signatureBytes >= 4 && signature[0] == 0 && signature[1] == 0 && signature[2] == 0xfe && signature[3] == 0xff)
          {
              candidates = Utf32BeEncodings;
              bomLength = 4;
          }
          else if (signatureBytes >= 2 && signature[0] == 0xfe && signature[1] == 0xff)
          {
              // FE FF is the big-endian UTF-16 byte-order mark.
              candidates = Utf16BeEncodings;
              bomLength = 2;
          }
          else if (signatureBytes >= 2 && signature[0] == 0xff && signature[1] == 0xfe)
          {
              // FF FE is the little-endian UTF-16 byte-order mark.
              candidates = Utf16LeEncodings;
              bomLength = 2;
          }
          else
          {
              candidates = OtherwiseEncodings;
              bomLength = 0;
          }

          stream.Position = bomLength;

          var sample = new byte[SampleLength];
          int sampled = ReadFully(stream, sample, SampleLength);
          if (sampled == 0)
          {
              // Nothing to contradict "text": an empty payload is accepted.
              return true;
          }

          // Never trim down to zero bytes, otherwise any short payload would pass vacuously.
          int shortest = sampled > MaxTrimmedBytes ? sampled - MaxTrimmedBytes : 1;

          var chars = new char[ScratchLength];
          var reencoded = new byte[ScratchLength];
          foreach (var encoding in candidates)
          {
              for (int count = sampled; count >= shortest; --count)
              {
                  if (RoundTripsAsText(encoding, sample, count, chars, reencoded))
                  {
                      return true;
                  }
              }
          }

          return false;
      }

      public override string ToString() => "Text File Detector";

      /// <summary>
      /// Reads from <paramref name="stream"/> until <paramref name="length"/> bytes have been
      /// stored in <paramref name="buffer"/> or the stream reports no more data.
      /// </summary>
      /// <returns>The number of bytes actually read.</returns>
      private static int ReadFully(Stream stream, byte[] buffer, int length)
      {
          int total = 0;
          while (total < length)
          {
              int got = stream.Read(buffer, total, length - total);
              if (got <= 0)
              {
                  break;
              }

              total += got;
          }

          return total;
      }

      /// <summary>
      /// Checks whether the first <paramref name="count"/> bytes of <paramref name="source"/>
      /// decode with <paramref name="encoding"/> to characters free of disallowed control
      /// characters and encode back to exactly the same bytes.
      /// </summary>
      private static bool RoundTripsAsText(Encoding encoding, byte[] source, int count, char[] chars, byte[] reencoded)
      {
          int decoded = encoding.GetChars(source, 0, count, chars, 0);
          for (int i = 0; i < decoded; ++i)
          {
              char ch = chars[i];

              // char.IsControl also covers '\0'; only CR, LF and TAB are tolerated.
              if (char.IsControl(ch) && ch != '\r' && ch != '\n' && ch != '\t')
              {
                  return false;
              }
          }

          // Invalid input decodes to replacement characters, which re-encode to different
          // bytes (and usually a different length), so a mismatch means "not this encoding".
          int encoded = encoding.GetBytes(chars, 0, decoded, reencoded, 0);
          if (encoded != count)
          {
              return false;
          }

          for (int i = 0; i < count; ++i)
          {
              if (source[i] != reencoded[i])
              {
                  return false;
              }
          }

          return true;
      }
  }