Browse Source

Early work on background agent for search

NGPixel 8 năm trước cách đây
mục cha
commit
528fab6c87
9 tập tin đã thay đổi với 189 bổ sung và 68 xóa
  1. 57 6
      agent.js
  2. 4 3
      config.sample.yml
  3. 12 7
      gulpfile.js
  4. 30 31
      models/entries.js
  5. 4 12
      models/git.js
  6. 38 2
      models/markdown.js
  7. 42 0
      models/search.js
  8. 1 0
      package.json
  9. 1 7
      server.js

+ 57 - 6
agent.js

@@ -11,17 +11,20 @@ global.ROOTPATH = __dirname;
 // ----------------------------------------
 
 global.winston = require('winston');
-winston.info('[AGENT] Requarks Wiki BgAgent is initializing...');
+winston.info('[AGENT] Background Agent is initializing...');
 
 var appconfig = require('./models/config')('./config.yml');
 
-global.git = require('./models/git').init(appconfig, true);
+global.git = require('./models/git').init(appconfig);
 global.entries = require('./models/entries').init(appconfig);
 global.mark = require('./models/markdown');
+global.search = require('./models/search').init(appconfig);
 
 var _ = require('lodash');
 var moment = require('moment');
 var Promise = require('bluebird');
+var fs = Promise.promisifyAll(require("fs-extra"));
+var path = require('path');
 var cron = require('cron').CronJob;
 
 // ----------------------------------------
@@ -44,6 +47,7 @@ var job = new cron({
 		// Prepare async job collector
 
 		let jobs = [];
+		let repoPath = path.resolve(ROOTPATH, appconfig.datadir.repo);
 
 		// ----------------------------------------
 		// Compile Jobs
@@ -51,12 +55,58 @@ var job = new cron({
 
 		//-> Resync with Git remote
 
-		jobs.push(git.resync().then(() => {
+		jobs.push(git.onReady.then(() => {
+			return git.resync().then(() => {
 
-			//-> Purge outdated cache
+				//-> Stream all documents
 
-			return entries.purgeStaleCache();
+				let cacheJobs = [];
 
+				fs.walk(repoPath).on('data', function (item) {
+					if(path.extname(item.path) === '.md') {
+
+						let entryPath = entries.parsePath(entries.getEntryPathFromFullPath(item.path));
+						let cachePath = entries.getCachePath(entryPath);
+						
+						//-> Purge outdated cache
+
+						cacheJobs.push(
+							fs.statAsync(cachePath).then((st) => {
+								return moment(st.mtime).isBefore(item.stats.mtime) ? 'expired' : 'active';
+							}).catch((err) => {
+								return (err.code !== 'EEXIST') ? err : 'new';
+							}).then((fileStatus) => {
+
+								//-> Delete expired cache file
+
+								if(fileStatus === 'expired') {
+									return fs.unlinkAsync(cachePath).return(fileStatus);
+								}
+
+								return fileStatus;
+
+							}).then((fileStatus) => {
+
+								//-> Update search index
+
+								if(fileStatus !== 'active') {
+									return entries.fetchTextVersion(entryPath).then((content) => {
+										console.log(content);
+									});
+								}
+
+								return true;
+
+							})
+
+						);
+
+					}
+				});
+
+				return Promise.all(cacheJobs);
+
+			});
 		}));
 
 		// ----------------------------------------
@@ -73,7 +123,8 @@ var job = new cron({
 
 	},
 	start: true,
-	timeZone: 'UTC'
+	timeZone: 'UTC',
+	runOnInit: true
 });
 
 // ----------------------------------------

+ 4 - 3
config.sample.yml

@@ -1,6 +1,7 @@
 ###################################################
 # REQUARKS WIKI - CONFIGURATION                   #
 ###################################################
+# Full explanation + examples in the documentation (https://requarks-wiki.readme.io/)
 
 # -------------------------------------------------
 # Title of this site
@@ -32,7 +33,6 @@ datadir:
 # -------------------------------------------------
 # Git Connection Info
 # -------------------------------------------------
-# Full explanation + examples in the documentation (https://requarks-wiki.readme.io/)
 
 git:
   url: https://github.com/Organization/Repo
@@ -68,7 +68,8 @@ sessionSecret: 1234567890abcdefghijklmnopqrstuvxyz
 admin: admin@company.com
 
 # -------------------------------------------------
-# Default page for Home
+# Site UI Language
 # -------------------------------------------------
+# Possible values: en, fr
 
-homepage: Home.md
+lang: en

+ 12 - 7
gulpfile.js

@@ -55,13 +55,18 @@ var paths = {
 		'!./node_modules/font-awesome/fonts/*-webfont.svg'
 	],
 	deploypackage: [
-		'./**/*',
-		'!node_modules', '!node_modules/**',
-		'!coverage', '!coverage/**',
-		'!client/js', '!client/js/**',
-		'!dist', '!dist/**',
-		'!tests', '!tests/**',
-		'!gulpfile.js', '!inch.json', '!config.yml', '!wiki.sublime-project'
+		'./assets/**/*',
+		'./client/content/**/*',
+		'./controllers/**/*',
+		'./locales/**/*',
+		'./middlewares/**/*',
+		'./models/**/*',
+		'./views/**/*',
+		'./LICENSE',
+		'./agent.js',
+		'./server.js',
+		'./package.json',
+		'./config.sample.yml'
 	]
 };
 

+ 30 - 31
models/entries.js

@@ -27,8 +27,8 @@ module.exports = {
 
 		let self = this;
 
-		self._repoPath = appconfig.datadir.repo;
-		self._cachePath = path.join(appconfig.datadir.db, 'cache');
+		self._repoPath = path.resolve(ROOTPATH, appconfig.datadir.repo);
+		self._cachePath = path.resolve(ROOTPATH, appconfig.datadir.db, 'cache');
 
 		return self;
 
@@ -177,6 +177,32 @@ module.exports = {
 
 	},
 
+	/**
+	 * Fetches a text version of a Markdown-formatted document
+	 *
+	 * Loads the entry through fetchOriginal() with parseMarkdown, parseTree,
+	 * includeParentInfo and cache set to false (parseMeta and includeMarkdown
+	 * set to true), then runs the returned markdown through
+	 * mark.removeMarkdown().
+	 *
+	 * @param      {String}  entryPath  The entry path
+	 * @return     {Promise<Object>}  Result of chaining on fetchOriginal(); yields
+	 *                                { meta, text } where meta is the page meta-data
+	 *                                and text is the text-only version
+	 */
+	fetchTextVersion(entryPath) {
+
+		let self = this;
+
+		return self.fetchOriginal(entryPath, {
+			parseMarkdown: false,
+			parseMeta: true,
+			parseTree: false,
+			includeMarkdown: true,
+			includeParentInfo: false,
+			cache: false
+		}).then((pageData) => {
+			return {
+				meta: pageData.meta,
+				text: mark.removeMarkdown(pageData.markdown)
+			};
+		});
+
+	},
+
 	/**
 	 * Parse raw url path and make it safe
 	 *
@@ -341,6 +367,8 @@ module.exports = {
 
 	},
 
+
+
 	/**
 	 * Generate a starter page content based on the entry path
 	 *
@@ -356,35 +384,6 @@ module.exports = {
 			return _.replace(contents, new RegExp('{TITLE}', 'g'), formattedTitle);
 		});
 
-	},
-
-	purgeStaleCache() {
-
-		let self = this;
-
-		let cacheJobs = [];
-
-		fs.walk(self._repoPath)
-		.on('data', function (item) {
-			if(path.extname(item.path) === '.md') {
-
-				let entryPath = self.parsePath(self.getEntryPathFromFullPath(item.path));
-				let cachePath = self.getCachePath(entryPath);
-
-				cacheJobs.push(fs.statAsync(cachePath).then((st) => {
-					if(moment(st.mtime).isBefore(item.stats.mtime)) {
-						return fs.unlinkAsync(cachePath);
-					} else {
-						return true;
-					}
-				}).catch((err) => {
-					return (err.code !== 'EEXIST') ? err : true;
-				}));
-			}
-		});
-
-		return Promise.all(cacheJobs);
-
 	}
 
 };

+ 4 - 12
models/git.js

@@ -19,8 +19,7 @@ module.exports = {
 	_repo: {
 		path: '',
 		branch: 'master',
-		exists: false,
-		sync: true
+		exists: false
 	},
 	_signature: {
 		name: 'Wiki',
@@ -30,6 +29,7 @@ module.exports = {
 		clone: {},
 		push: {}
 	},
+	onReady: null,
 
 	/**
 	 * Initialize Git model
@@ -37,12 +37,10 @@ module.exports = {
 	 * @param      {Object}  appconfig  The application config
 	 * @return     {Object}  Git model instance
 	 */
-	init(appconfig, sync) {
+	init(appconfig) {
 
 		let self = this;
 
-		self._repo.sync = sync;
-
 		//-> Build repository path
 		
 		if(_.isEmpty(appconfig.datadir.repo)) {
@@ -53,13 +51,7 @@ module.exports = {
 
 		//-> Initialize repository
 
-		self._initRepo(appconfig).then((repo) => {
-
-			if(self._repo.sync) {
-				self.resync();
-			}
-
-		});
+		self.onReady = self._initRepo(appconfig);
 
 		// Define signature
 

+ 38 - 2
models/markdown.js

@@ -12,7 +12,8 @@ var Promise = require('bluebird'),
 	mdAttrs = require('markdown-it-attrs'),
 	hljs = require('highlight.js'),
 	cheerio = require('cheerio'),
-	_ = require('lodash');
+	_ = require('lodash'),
+	mdRemove = require('remove-markdown');
 
 // Load plugins
 
@@ -157,6 +158,12 @@ const parseContent = (content)  => {
 
 };
 
+/**
+ * Parse meta-data tags from content
+ *
+ * @param      {String}  content  Markdown content
+ * @return     {Object}  Properties found in the content and their values
+ */
 const parseMeta = (content) => {
 
 	let commentMeta = new RegExp('<!-- ?([a-zA-Z]+):(.*)-->','g');
@@ -171,6 +178,12 @@ const parseMeta = (content) => {
 
 module.exports = {
 
+	/**
+	 * Parse content and return all data
+	 *
+	 * @param      {String}  content  Markdown-formatted content
+	 * @return     {Object}  Object containing meta, html and tree data
+	 */
 	parse(content) {
 		return {
 			meta: parseMeta(content),
@@ -181,6 +194,29 @@ module.exports = {
 
 	parseContent,
 	parseMeta,
-	parseTree
+	parseTree,
+
+	/**
+	 * Strips non-text elements from Markdown content
+	 *
+	 * The content is reduced step by step, in this order, before the result is
+	 * handed to the remove-markdown module (mdRemove):
+	 *   1. meta comments of the form <!-- key:value --> are removed
+	 *   2. fenced (triple backtick) code blocks, then inline code spans, are removed
+	 *   3. http / https / ftp URLs are removed
+	 *   4. line breaks are replaced by spaces
+	 *   5. the text is deburred and lower-cased
+	 *   6. every run of characters outside a-z is replaced by a single space
+	 *   7. words of one or two characters are dropped
+	 *   8. repeated whitespace is collapsed to a single space
+	 *
+	 * @param      {String}  content  Markdown-formatted content
+	 * @return     {String}  Text-only version
+	 */
+	removeMarkdown(content) {
+		return mdRemove(_.chain(content)
+			.replace(/<!-- ?([a-zA-Z]+):(.*)-->/g, '')
+			.replace(/```[^`]+```/g, '')
+			.replace(/`[^`]+`/g, '')
+			.replace(new RegExp('(?!mailto:)(?:(?:http|https|ftp)://)(?:\\S+(?::\\S*)?@)?(?:(?:(?:[1-9]\\d?|1\\d\\d|2[01]\\d|22[0-3])(?:\\.(?:1?\\d{1,2}|2[0-4]\\d|25[0-5])){2}(?:\\.(?:[0-9]\\d?|1\\d\\d|2[0-4]\\d|25[0-4]))|(?:(?:[a-z\\u00a1-\\uffff0-9]+-?)*[a-z\\u00a1-\\uffff0-9]+)(?:\\.(?:[a-z\\u00a1-\\uffff0-9]+-?)*[a-z\\u00a1-\\uffff0-9]+)*(?:\\.(?:[a-z\\u00a1-\\uffff]{2,})))|localhost)(?::\\d{2,5})?(?:(/|\\?|#)[^\\s]*)?', 'g'), '')
+			.replace(/\r?\n|\r/g, ' ')
+			.deburr()
+			.toLower()
+			.replace(/(\b([^a-z]+)\b)/g, ' ')
+			.replace(/[^a-z]+/g, ' ')
+			.replace(/(\b(\w{1,2})\b(\W|$))/g, '')
+			.replace(/\s\s+/g, ' ')
+			.value()
+		);
+	}
 
 };

+ 42 - 0
models/search.js

@@ -0,0 +1,42 @@
+"use strict";
+
+var Promise = require('bluebird'),
+	_ = require('lodash'),
+	path = require('path'),
+	searchIndex = Promise.promisifyAll(require('search-index')),
+	stopWord = require('stopword');
+
+/**
+ * Search Model
+ */
+module.exports = {
+
+	_si: null,
+
+	/**
+	 * Initialize Search model
+	 *
+	 * @param      {Object}  appconfig  The application config
+	 * @return     {Object}  Search model instance
+	 */
+	init(appconfig) {
+
+		let dbPath = path.resolve(ROOTPATH, appconfig.datadir.db, 'search-index');
+
+		this._si = searchIndex({
+			deletable: true,
+			fieldedSearch: true,
+			indexPath: dbPath,
+			logLevel: 'error',
+			stopwords: stopWord.getStopwords(appconfig.lang).sort()
+		}, (err, si) => {
+			if(err) {
+				winston.error('Failed to initialize search-index.', err);
+			}
+		});
+
+	}
+
+
+
+};

+ 1 - 0
package.json

@@ -72,6 +72,7 @@
     "passport": "^0.3.2",
     "passport-local": "^1.0.0",
     "pug": "^2.0.0-beta6",
+    "remove-markdown": "^0.1.0",
     "search-index": "^0.8.15",
     "serve-favicon": "^2.3.0",
     "simplemde": "^1.11.2",

+ 1 - 7
server.js

@@ -52,7 +52,6 @@ var ctrl = autoload(path.join(ROOTPATH, '/controllers'));
 // ----------------------------------------
 
 global.app = express();
-global.ROOTPATH = __dirname;
 var _isDebug = (app.get('env') === 'development');
 
 // ----------------------------------------
@@ -127,7 +126,6 @@ app.use(express.static(path.join(ROOTPATH, 'assets')));
 app.locals._ = require('lodash');
 app.locals.moment = require('moment');
 app.locals.appconfig = appconfig;
-//app.locals.appdata = require('./data.json');
 app.use(mw.flash);
 
 // ----------------------------------------
@@ -195,16 +193,12 @@ server.on('listening', () => {
 });
 
 // ----------------------------------------
-// Start Background Agent
+// Start Agents
 // ----------------------------------------
 
 var fork = require('child_process').fork;
 var bgAgent = fork('agent.js');
 
-bgAgent.on('message', (m) => {
-
-});
-
 process.on('exit', (code) => {
   bgAgent.disconnect();
 });