TextDetector.cs 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117
  1. using System.Collections.Generic;
  2. using System.IO;
  3. using System.Linq;
  4. using System.Reflection;
  5. using System.Text;
  6. using Masuit.Tools.Mime;
  7. namespace Masuit.Tools.Files.FileDetector.Detectors;
  /// <summary>
  /// Detects plain-text content. A sample from the start of the stream is decoded with one or
  /// more candidate encodings; the content counts as text when the decoded characters contain
  /// no control characters other than CR, LF and TAB and re-encoding them reproduces the
  /// sampled bytes exactly.
  /// </summary>
  [FormatCategory(FormatCategory.Document)]
  internal class TextDetector : IDetector
  {
      /// <summary>Maximum number of bytes, counted after any byte-order mark, that are inspected.</summary>
      private const int SampleSize = 1024;

      /// <summary>Length in bytes of the longest byte-order mark that is recognised (UTF-32).</summary>
      private const int SignatureSize = 4;

      /// <summary>
      /// Number of trailing bytes that may be dropped from a full sample, because the
      /// <see cref="SampleSize"/> cut-off can fall in the middle of a multi-byte character.
      /// </summary>
      private const int MaxTrailingTrim = 16;

      // Candidate encodings per byte-order mark. Only the stateless GetChars/GetBytes overloads
      // are used on these shared instances; no preamble is emitted by GetBytes.
      private static readonly Encoding[] Utf8Encodings = { Encoding.UTF8 };
      private static readonly Encoding[] Utf16LeEncodings = { Encoding.Unicode };
      private static readonly Encoding[] Utf16BeEncodings = { Encoding.BigEndianUnicode };
      private static readonly Encoding[] Utf32LeEncodings = { Encoding.UTF32 };
      private static readonly Encoding[] Utf32BeEncodings = { new UTF32Encoding(true, false) };

      // Tried in this order when the stream carries no byte-order mark.
      private static readonly Encoding[] OtherwiseEncodings =
      {
          Encoding.ASCII,
          Encoding.UTF8,
          Encoding.UTF32,
          Encoding.Unicode,
          Encoding.BigEndianUnicode
      };

      /// <summary>File extension (without the leading dot) reported for detected content.</summary>
      public string Extension => "txt";

      /// <summary>Always <c>null</c> for this detector.</summary>
      public string Precondition => null;

      /// <summary>MIME type that <see cref="MimeMapper"/> returns for <c>"." + Extension</c>.</summary>
      public string MimeType => new MimeMapper().GetMimeFromExtension("." + Extension);

      /// <summary>Categories declared on this type through <see cref="FormatCategoryAttribute"/>.</summary>
      public List<FormatCategory> FormatCategories => GetType().GetCustomAttributes<FormatCategoryAttribute>().Select(a => a.Category).ToList();

      /// <summary>
      /// Decides whether the beginning of <paramref name="stream"/> looks like plain text.
      /// </summary>
      /// <param name="stream">
      /// Stream to inspect. It is rewound with <see cref="Stream.Seek"/> and positioned through
      /// <see cref="Stream.Position"/>, so it has to support seeking. On return the position is
      /// just past the bytes that were sampled.
      /// </param>
      /// <returns>
      /// <c>true</c> when the sample decodes cleanly with one of the candidate encodings, or when
      /// there is nothing to sample (empty stream or byte-order mark only); otherwise <c>false</c>.
      /// </returns>
      public bool Detect(Stream stream)
      {
          // Rewind first so the signature and the sample are taken from the same origin.
          stream.Seek(0, SeekOrigin.Begin);

          // Buffers are local to the call: nothing left over from an earlier stream can be
          // mistaken for data of this one, and concurrent calls cannot interfere.
          var signature = new byte[SignatureSize];
          int signatureLength = ReadFully(stream, signature, SignatureSize);
          Encoding[] encodings = SelectEncodings(signature, signatureLength, out int bomLength);

          stream.Position = bomLength;
          var sample = new byte[SampleSize];
          int sampleLength = ReadFully(stream, sample, SampleSize);
          if (sampleLength == 0)
          {
              // No content to contradict "text".
              return true;
          }

          // Trailing bytes may only be ignored when the sample was cut off by SampleSize.
          // A shorter sample is the complete content and has to decode in full; otherwise
          // trimming down to zero bytes would accept any short binary file.
          int trim = sampleLength == SampleSize ? MaxTrailingTrim : 0;
          int shortestPrefix = sampleLength - trim;

          foreach (Encoding encoding in encodings)
          {
              // Sized from the encoding's own worst case so replacement characters always fit.
              var chars = new char[encoding.GetMaxCharCount(sampleLength)];
              var bytes = new byte[encoding.GetMaxByteCount(chars.Length)];

              for (int count = sampleLength; count >= shortestPrefix; --count)
              {
                  if (RoundTrips(encoding, sample, count, chars, bytes))
                  {
                      return true;
                  }
              }
          }

          return false;
      }

      public override string ToString() => "Text File Detector";

      /// <summary>
      /// Picks the candidate encodings from the byte-order mark at the start of
      /// <paramref name="signature"/> and reports how many bytes that mark occupies.
      /// </summary>
      private static Encoding[] SelectEncodings(byte[] signature, int length, out int bomLength)
      {
          if (length >= 3 && signature[0] == 0xEF && signature[1] == 0xBB && signature[2] == 0xBF)
          {
              bomLength = 3;
              return Utf8Encodings;
          }

          // The UTF-32 LE mark (FF FE 00 00) begins with the UTF-16 LE mark (FF FE), so the
          // four-byte marks are tested first. FF FE followed by a UTF-16 NUL would be rejected
          // as a control character anyway.
          if (length >= 4 && signature[0] == 0xFF && signature[1] == 0xFE && signature[2] == 0x00 && signature[3] == 0x00)
          {
              bomLength = 4;
              return Utf32LeEncodings;
          }

          if (length >= 4 && signature[0] == 0x00 && signature[1] == 0x00 && signature[2] == 0xFE && signature[3] == 0xFF)
          {
              bomLength = 4;
              return Utf32BeEncodings;
          }

          // FE FF is the big-endian mark, FF FE the little-endian one.
          if (length >= 2 && signature[0] == 0xFE && signature[1] == 0xFF)
          {
              bomLength = 2;
              return Utf16BeEncodings;
          }

          if (length >= 2 && signature[0] == 0xFF && signature[1] == 0xFE)
          {
              bomLength = 2;
              return Utf16LeEncodings;
          }

          bomLength = 0;
          return OtherwiseEncodings;
      }

      /// <summary>
      /// Checks that the first <paramref name="count"/> bytes of <paramref name="sample"/> decode
      /// to acceptable characters and encode back to exactly the same bytes.
      /// </summary>
      private static bool RoundTrips(Encoding encoding, byte[] sample, int count, char[] chars, byte[] bytes)
      {
          int charCount = encoding.GetChars(sample, 0, count, chars, 0);
          for (int i = 0; i < charCount; ++i)
          {
              if (IsForbiddenControl(chars[i]))
              {
                  return false;
              }
          }

          // Invalid input decodes to replacement characters, which encode to different bytes or
          // to a different length; comparing the length keeps the loop below inside valid data.
          int byteCount = encoding.GetBytes(chars, 0, charCount, bytes, 0);
          if (byteCount != count)
          {
              return false;
          }

          for (int i = 0; i < count; ++i)
          {
              if (sample[i] != bytes[i])
              {
                  return false;
              }
          }

          return true;
      }

      /// <summary>
      /// True for control characters that do not belong in plain text: every control character
      /// (NUL included) except carriage return, line feed and tab.
      /// </summary>
      private static bool IsForbiddenControl(char ch)
      {
          return char.IsControl(ch) && ch != '\r' && ch != '\n' && ch != '\t';
      }

      /// <summary>
      /// Reads until <paramref name="count"/> bytes are stored in <paramref name="buffer"/> or the
      /// stream reports no more data, because a single <see cref="Stream.Read(byte[], int, int)"/>
      /// call may return fewer bytes than requested. Returns the number of bytes stored.
      /// </summary>
      private static int ReadFully(Stream stream, byte[] buffer, int count)
      {
          int total = 0;
          while (total < count)
          {
              int read = stream.Read(buffer, total, count - total);
              if (read <= 0)
              {
                  break;
              }

              total += read;
          }

          return total;
      }
  }