IndexBuilder.cs 1.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354
  1. using System;
  2. using System.Collections.Generic;
  3. using System.IO;
  4. using System.Linq;
  5. using System.Text;
  6. using System.Threading.Tasks;
  7. namespace MediaBrowser.Tests.ConsistencyTests.TextIndexing
  8. {
  /// <summary>
  /// Builds a <see cref="WordIndex"/> from the words found in a set of text files.
  /// </summary>
  public class IndexBuilder
  {
    /// <summary>
    /// Words shorter than this many characters are not added to the index.
    /// (The identifier is misspelled, but it is public and therefore kept as is.)
    /// </summary>
    public const int MinumumWordLength = 4;

    /// <summary>
    /// Characters that are considered part of a word: ASCII letters and digits.
    /// </summary>
    public static char[] WordChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890".ToCharArray();

    // Inclusive character-code range that is scanned for separator characters.
    private const int FirstSeparatorCode = 32;
    private const int LastSeparatorCode = 127;

    /// <summary>
    /// Reads every file line by line, splits each line into words and records each
    /// word of at least <see cref="MinumumWordLength"/> characters in a new index.
    /// </summary>
    /// <param name="wordFiles">The files to read.</param>
    /// <param name="rootFolderPath">
    /// Text that is removed from each file's full path to form its display name.
    /// When null or empty, the full path is used as the display name.
    /// </param>
    /// <returns>The index populated with all recorded word occurrences.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="wordFiles"/> is null.</exception>
    public static WordIndex BuildIndexFromFiles(IEnumerable<FileInfo> wordFiles, string rootFolderPath)
    {
      if (wordFiles == null)
      {
        throw new ArgumentNullException(nameof(wordFiles));
      }

      var index = new WordIndex();
      var wordSeparators = BuildWordSeparators();

      foreach (var file in wordFiles)
      {
        var displayFileName = GetDisplayFileName(file.FullName, rootFolderPath);
        var lineNumber = 1;

        using (var reader = file.OpenText())
        {
          string line;
          while ((line = reader.ReadLine()) != null)
          {
            var words = line.Split(wordSeparators, StringSplitOptions.RemoveEmptyEntries);

            // 1-based counter over the words that are actually indexed on this line;
            // words below the minimum length do not advance it.
            var wordIndex = 1;
            foreach (var word in words)
            {
              if (word.Length >= MinumumWordLength)
              {
                index.AddWordOccurrence(word, displayFileName, file.FullName, lineNumber, wordIndex++);
              }
            }

            lineNumber++;
          }
        }
      }

      return index;
    }

    // Returns every character in the scanned code range that is not a word character, plus tab.
    private static char[] BuildWordSeparators()
    {
      // Enumerable.Range takes (start, count), so the count is derived from the inclusive bounds.
      var candidateCount = LastSeparatorCode - FirstSeparatorCode + 1;

      return Enumerable.Range(FirstSeparatorCode, candidateCount)
        .Select(code => Convert.ToChar(code))
        .Where(c => !WordChars.Contains(c))
        .Concat(new[] { '\t' })
        .ToArray();
    }

    // Removes rootFolderPath from the full path; returns the full path when there is nothing to remove.
    private static string GetDisplayFileName(string fullName, string rootFolderPath)
    {
      // string.Replace throws for a null or empty search string, so skip it in that case.
      if (string.IsNullOrEmpty(rootFolderPath))
      {
        return fullName;
      }

      return fullName.Replace(rootFolderPath, string.Empty);
    }
  }
  45. }