// IndexBuilder.cs (1.8 KB)
  1. using System;
  2. using System.Collections.Generic;
  3. using System.IO;
  4. using System.Linq;
  namespace MediaBrowser.Tests.ConsistencyTests.TextIndexing
  {
      /// <summary>
      /// Builds a <see cref="WordIndex"/> by splitting the lines of text files into words.
      /// </summary>
      public class IndexBuilder
      {
          /// <summary>
          /// Minimum number of characters a word must have to be added to the index;
          /// shorter words are skipped.
          /// </summary>
          /// <remarks>
          /// The identifier's spelling ("Minumum") is kept unchanged so existing references keep compiling.
          /// </remarks>
          public const int MinumumWordLength = 4;

          /// <summary>
          /// Characters treated as part of a word: the ASCII letters in both cases and the digits 0-9.
          /// </summary>
          public static char[] WordChars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890".ToCharArray();

          /// <summary>
          /// Reads every given file line by line, splits each line into words and records each word of at
          /// least <see cref="MinumumWordLength"/> characters in a new <see cref="WordIndex"/>.
          /// </summary>
          /// <param name="wordFiles">The text files to read.</param>
          /// <param name="rootFolderPath">
          /// Text removed from each file's full path to form the display name passed to the index.
          /// When null or empty, the full path is used as the display name.
          /// </param>
          /// <returns>The index holding all recorded word occurrences.</returns>
          /// <exception cref="ArgumentNullException"><paramref name="wordFiles"/> is null.</exception>
          public static WordIndex BuildIndexFromFiles(IEnumerable<FileInfo> wordFiles, string rootFolderPath)
          {
              if (wordFiles == null)
              {
                  throw new ArgumentNullException("wordFiles");
              }

              var index = new WordIndex();
              var wordSeparators = BuildWordSeparators();

              foreach (var file in wordFiles)
              {
                  // string.Replace throws for a null or empty search string, so only strip
                  // the root when one was actually supplied.
                  var displayFileName = string.IsNullOrEmpty(rootFolderPath)
                      ? file.FullName
                      : file.FullName.Replace(rootFolderPath, string.Empty);

                  // Line numbers passed to the index are 1-based.
                  var lineNumber = 1;

                  using (var reader = file.OpenText())
                  {
                      string line;
                      while ((line = reader.ReadLine()) != null)
                      {
                          var words = line.Split(wordSeparators, StringSplitOptions.RemoveEmptyEntries);

                          // 1-based and advanced only for words that are recorded, so it counts
                          // indexed words on the line rather than every token.
                          var wordIndex = 1;
                          foreach (var word in words)
                          {
                              if (word.Length >= MinumumWordLength)
                              {
                                  index.AddWordOccurrence(word, displayFileName, file.FullName, lineNumber, wordIndex++);
                              }
                          }

                          lineNumber++;
                      }
                  }
              }

              return index;
          }

          /// <summary>
          /// Builds the set of characters that separate words: every character with code 32 through 158
          /// that is not in <see cref="WordChars"/>, plus the tab character.
          /// </summary>
          /// <returns>The separator characters used to split a line into words.</returns>
          private static char[] BuildWordSeparators()
          {
              // Enumerable.Range takes (start, count), so this yields the 127 code points 32..158.
              return Enumerable.Range(32, 127)
                  .Select(code => Convert.ToChar(code))
                  .Where(c => !WordChars.Contains(c))
                  .Concat(new[] { '\t' })
                  .ToArray();
          }
      }
  }