Bläddra i källkod

fix: postgres indexing (#791) + deactivate handler

Nick 6 år sedan
förälder
incheckning
7c77cb0815

+ 15 - 0
CHANGELOG.md

@@ -2,6 +2,21 @@
 All notable changes to this project will be documented in this file.
 This project adheres to [Semantic Versioning](http://semver.org/).
 
+## [2.0.0-beta.XX] - 2019-XX-XX
+### Added
+- Added Search Engine - Algolia
+- Added Search Engine - Elasticsearch
+
+### Fixed
+- Fixed error when saving navigation in admin area
+- Fixed guest search failing because of missing global permissions
+- Fixed PostgreSQL search engine indexing issues
+
+### Changed
+- Improved search suggestions from sanitized content
+- Search engine deactivate handler is now being called on engine switch
+- Markdown editor UI improvements for insert actions (wip)
+
 ## [2.0.0-beta.68] - 2019-03-17
 ### Added
 - Added Search Results overlay

+ 2 - 0
package.json

@@ -64,6 +64,7 @@
     "diff2html": "2.7.0",
     "dotize": "0.3.0",
     "elasticsearch": "15.4.1",
+    "emoji-regex": "8.0.0",
     "express": "4.16.4",
     "express-brute": "1.0.1",
     "file-type": "10.7.1",
@@ -154,6 +155,7 @@
     "simple-git": "1.107.0",
     "solr-node": "1.1.3",
     "sqlite3": "4.0.6",
+    "striptags": "3.1.1",
     "subscriptions-transport-ws": "0.9.15",
     "twemoji": "11.3.0",
     "uslug": "1.0.4",

+ 11 - 0
server/graph/resolvers/search.js

@@ -38,7 +38,11 @@ module.exports = {
   SearchMutation: {
     async updateSearchEngines(obj, args, context) {
       try {
+        let newActiveEngine = ''
         for (let searchEngine of args.engines) {
+          if (searchEngine.isEnabled) {
+            newActiveEngine = searchEngine.key
+          }
           await WIKI.models.searchEngines.query().patch({
             isEnabled: searchEngine.isEnabled,
             config: _.reduce(searchEngine.config, (result, value, key) => {
@@ -47,6 +51,13 @@ module.exports = {
             }, {})
           }).where('key', searchEngine.key)
         }
+        if (newActiveEngine !== WIKI.data.searchEngine.key) {
+          try {
+            await WIKI.data.searchEngine.deactivate()
+          } catch (err) {
+            WIKI.logger.warn('Failed to deactivate previous search engine:', err)
+          }
+        }
         await WIKI.models.searchEngines.initEngine({ activate: true })
         return {
           responseResult: graphHelper.generateSuccess('Search Engines updated successfully')

+ 36 - 0
server/models/pages.js

@@ -5,6 +5,8 @@ const pageHelper = require('../helpers/page')
 const path = require('path')
 const fs = require('fs-extra')
 const yaml = require('js-yaml')
+const striptags = require('striptags')
+const emojiRegex = require('emoji-regex')
 
 /* global WIKI */
 
@@ -14,6 +16,9 @@ const frontmatterRegex = {
   markdown: /^(-{3}(?:\n|\r)([\w\W]+?)(?:\n|\r)-{3})?(?:\n|\r)*([\w\W]*)*/
 }
 
+const punctuationRegex = /[!,:;/\\_+\-=()&#@<>$~%^*[\]{}"'|]+|(\.\s)|(\s\.)/ig
+const htmlEntitiesRegex = /(&#[0-9]{3};)|(&#x[a-zA-Z0-9]{2};)/ig
+
 /**
  * Pages model
  */
@@ -209,14 +214,23 @@ module.exports = class Page extends Model {
       userId: opts.authorId,
       isPrivate: opts.isPrivate
     })
+
+    // -> Render page to HTML
     await WIKI.models.pages.renderPage(page)
+
+    // -> Add to Search Index
+    const pageContents = await WIKI.models.pages.query().findById(page.id).select('render')
+    page.safeContent = WIKI.models.pages.cleanHTML(pageContents.render)
     await WIKI.data.searchEngine.created(page)
+
+    // -> Add to Storage
     if (!opts.skipStorage) {
       await WIKI.models.storage.pageEvent({
         event: 'created',
         page
       })
     }
+
     return page
   }
 
@@ -245,8 +259,16 @@ module.exports = class Page extends Model {
       userId: ogPage.authorId,
       isPrivate: ogPage.isPrivate
     })
+
+    // -> Render page to HTML
     await WIKI.models.pages.renderPage(page)
+
+    // -> Update Search Index
+    const pageContents = await WIKI.models.pages.query().findById(page.id).select('render')
+    page.safeContent = WIKI.models.pages.cleanHTML(pageContents.render)
     await WIKI.data.searchEngine.updated(page)
+
+    // -> Update on Storage
     if (!opts.skipStorage) {
       await WIKI.models.storage.pageEvent({
         event: 'updated',
@@ -275,7 +297,11 @@ module.exports = class Page extends Model {
     })
     await WIKI.models.pages.query().delete().where('id', page.id)
     await WIKI.models.pages.deletePageFromCache(page)
+
+    // -> Delete from Search Index
     await WIKI.data.searchEngine.deleted(page)
+
+    // -> Delete from Storage
     if (!opts.skipStorage) {
       await WIKI.models.storage.pageEvent({
         event: 'deleted',
@@ -390,4 +416,14 @@ module.exports = class Page extends Model {
   static async deletePageFromCache(page) {
     return fs.remove(path.join(process.cwd(), `data/cache/${page.hash}.bin`))
   }
+
+  static cleanHTML(rawHTML = '') { // -> reduce rendered HTML to a lowercase, space-separated word list for search indexing
+    return striptags(rawHTML || '', [], ' ') // replace tags with a space so words in adjacent elements do not fuse
+      .replace(emojiRegex(), '')
+      .replace(htmlEntitiesRegex, '')
+      .replace(/&(?:#\d+|#x[0-9a-f]+|[a-z][a-z0-9]*);/gi, ' ') // drop remaining entities (&amp;, &quot;, &#8217;) before the punctuation pass turns them into bogus words
+      .replace(punctuationRegex, ' ')
+      .replace(/\s+/g, ' ') // collapse newlines, tabs and whitespace runs so split(' ') sees every word boundary
+      .split(' ').filter(w => w.length > 1).join(' ').toLowerCase()
+  }
 }

+ 1 - 0
server/models/searchEngines.js

@@ -99,6 +99,7 @@ module.exports = class SearchEngine extends Model {
     const searchEngine = await WIKI.models.searchEngines.query().findOne('isEnabled', true)
     if (searchEngine) {
       WIKI.data.searchEngine = require(`../modules/search/${searchEngine.key}/engine`)
+      WIKI.data.searchEngine.key = searchEngine.key
       WIKI.data.searchEngine.config = searchEngine.config
       if (activate) {
         try {

+ 9 - 7
server/modules/search/algolia/engine.js

@@ -1,6 +1,8 @@
 const _ = require('lodash')
 const algoliasearch = require('algoliasearch')
-const { pipeline, Transform } = require('stream')
+const stream = require('stream')
+const Promise = require('bluebird')
+const pipeline = Promise.promisify(stream.pipeline)
 
 /* global WIKI */
 
@@ -77,7 +79,7 @@ module.exports = {
       path: page.path,
       title: page.title,
       description: page.description,
-      content: page.content
+      content: page.safeContent
     })
   },
   /**
@@ -90,7 +92,7 @@ module.exports = {
       objectID: page.hash,
       title: page.title,
       description: page.description,
-      content: page.content
+      content: page.safeContent
     })
   },
   /**
@@ -114,7 +116,7 @@ module.exports = {
       path: page.destinationPath,
       title: page.title,
       description: page.description,
-      content: page.content
+      content: page.safeContent
     })
   },
   /**
@@ -176,7 +178,7 @@ module.exports = {
             path: doc.path,
             title: doc.title,
             description: doc.description,
-            content: doc.content
+            content: WIKI.models.pages.cleanHTML(doc.render)
           }))
         )
       } catch (err) {
@@ -187,11 +189,11 @@ module.exports = {
     }
 
     await pipeline(
-      WIKI.models.knex.column({ id: 'hash' }, 'path', { locale: 'localeCode' }, 'title', 'description', 'content').select().from('pages').where({
+      WIKI.models.knex.column({ id: 'hash' }, 'path', { locale: 'localeCode' }, 'title', 'description', 'render').select().from('pages').where({
         isPublished: true,
         isPrivate: false
       }).stream(),
-      new Transform({
+      new stream.Transform({
         objectMode: true,
         transform: async (chunk, enc, cb) => processDocument(cb, chunk),
         flush: async (cb) => processDocument(cb)

+ 9 - 7
server/modules/search/aws/engine.js

@@ -1,6 +1,8 @@
 const _ = require('lodash')
 const AWS = require('aws-sdk')
-const { pipeline, Transform } = require('stream')
+const stream = require('stream')
+const Promise = require('bluebird')
+const pipeline = Promise.promisify(stream.pipeline)
 
 /* global WIKI */
 
@@ -197,7 +199,7 @@ module.exports = {
             path: page.path,
             title: page.title,
             description: page.description,
-            content: page.content
+            content: page.safeContent
           }
         }
       ])
@@ -220,7 +222,7 @@ module.exports = {
             path: page.path,
             title: page.title,
             description: page.description,
-            content: page.content
+            content: page.safeContent
           }
         }
       ])
@@ -268,7 +270,7 @@ module.exports = {
             path: page.destinationPath,
             title: page.title,
             description: page.description,
-            content: page.content
+            content: page.safeContent
           }
         }
       ])
@@ -335,7 +337,7 @@ module.exports = {
               path: doc.path,
               title: doc.title,
               description: doc.description,
-              content: doc.content
+              content: WIKI.models.pages.cleanHTML(doc.render)
             }
           })))
         }).promise()
@@ -347,11 +349,11 @@ module.exports = {
     }
 
     await pipeline(
-      WIKI.models.knex.column({ id: 'hash' }, 'path', { locale: 'localeCode' }, 'title', 'description', 'content').select().from('pages').where({
+      WIKI.models.knex.column({ id: 'hash' }, 'path', { locale: 'localeCode' }, 'title', 'description', 'render').select().from('pages').where({
         isPublished: true,
         isPrivate: false
       }).stream(),
-      new Transform({
+      new stream.Transform({
         objectMode: true,
         transform: async (chunk, enc, cb) => processDocument(cb, chunk),
         flush: async (cb) => processDocument(cb)

+ 20 - 5
server/modules/search/azure/engine.js

@@ -1,7 +1,9 @@
 const _ = require('lodash')
 const { SearchService, QueryType } = require('azure-search-client')
 const request = require('request-promise')
-const { pipeline } = require('stream')
+const stream = require('stream')
+const Promise = require('bluebird')
+const pipeline = Promise.promisify(stream.pipeline)
 
 /* global WIKI */
 
@@ -146,7 +148,7 @@ module.exports = {
         path: page.path,
         title: page.title,
         description: page.description,
-        content: page.content
+        content: page.safeContent
       }
     ])
   },
@@ -163,7 +165,7 @@ module.exports = {
         path: page.path,
         title: page.title,
         description: page.description,
-        content: page.content
+        content: page.safeContent
       }
     ])
   },
@@ -199,7 +201,7 @@ module.exports = {
         path: page.destinationPath,
         title: page.title,
         description: page.description,
-        content: page.content
+        content: page.safeContent
       }
     ])
   },
@@ -209,10 +211,23 @@ module.exports = {
   async rebuild() {
     WIKI.logger.info(`(SEARCH/AZURE) Rebuilding Index...`)
     await pipeline(
-      WIKI.models.knex.column({ id: 'hash' }, 'path', { locale: 'localeCode' }, 'title', 'description', 'content').select().from('pages').where({
+      WIKI.models.knex.column({ id: 'hash' }, 'path', { locale: 'localeCode' }, 'title', 'description', 'render').select().from('pages').where({
         isPublished: true,
         isPrivate: false
       }).stream(),
+      new stream.Transform({
+        objectMode: true,
+        transform: (chunk, enc, cb) => {
+          cb(null, {
+            id: chunk.id,
+            path: chunk.path,
+            locale: chunk.locale,
+            title: chunk.title,
+            description: chunk.description,
+            content: WIKI.models.pages.cleanHTML(chunk.render)
+          })
+        }
+      }),
       this.client.indexes.use(this.config.indexName).createIndexingStream()
     )
     WIKI.logger.info(`(SEARCH/AZURE) Index rebuilt successfully.`)

+ 2 - 2
server/modules/search/db/engine.js

@@ -1,4 +1,4 @@
-const _ = require('lodash')
+/* global WIKI */
 
 module.exports = {
   activate() {
@@ -32,7 +32,7 @@ module.exports = {
         }
         // TODO: Add user permissions filtering
         builder.andWhere(builder => {
-          switch(WIKI.config.db.type) {
+          switch (WIKI.config.db.type) {
             case 'postgres':
               builder.where('title', 'ILIKE', `%${q}%`)
               builder.orWhere('description', 'ILIKE', `%${q}%`)

+ 11 - 8
server/modules/search/elasticsearch/engine.js

@@ -1,6 +1,8 @@
 const _ = require('lodash')
 const elasticsearch = require('elasticsearch')
-const { pipeline, Transform } = require('stream')
+const stream = require('stream')
+const Promise = require('bluebird')
+const pipeline = Promise.promisify(stream.pipeline)
 
 /* global WIKI */
 
@@ -116,7 +118,7 @@ module.exports = {
         input: s,
         weight: 3
       })),
-      page.content.split(' ').map(s => ({
+      page.safeContent.split(' ').map(s => ({
         input: s,
         weight: 1
       }))
@@ -138,7 +140,7 @@ module.exports = {
         path: page.path,
         title: page.title,
         description: page.description,
-        content: page.content
+        content: page.safeContent
       },
       refresh: true
     })
@@ -159,7 +161,7 @@ module.exports = {
         path: page.path,
         title: page.title,
         description: page.description,
-        content: page.content
+        content: page.safeContent
       },
       refresh: true
     })
@@ -199,7 +201,7 @@ module.exports = {
         path: page.destinationPath,
         title: page.title,
         description: page.description,
-        content: page.content
+        content: page.safeContent
       },
       refresh: true
     })
@@ -262,13 +264,14 @@ module.exports = {
                 _id: doc.id
               }
             })
+            doc.safeContent = WIKI.models.pages.cleanHTML(doc.render)
             result.push({
               suggest: this.buildSuggest(doc),
               locale: doc.locale,
               path: doc.path,
               title: doc.title,
               description: doc.description,
-              content: doc.content
+              content: doc.safeContent
             })
             return result
           }, []),
@@ -282,11 +285,11 @@ module.exports = {
     }
 
     await pipeline(
-      WIKI.models.knex.column({ id: 'hash' }, 'path', { locale: 'localeCode' }, 'title', 'description', 'content').select().from('pages').where({
+      WIKI.models.knex.column({ id: 'hash' }, 'path', { locale: 'localeCode' }, 'title', 'description', 'render').select().from('pages').where({
         isPublished: true,
         isPrivate: false
       }).stream(),
-      new Transform({
+      new stream.Transform({
         objectMode: true,
         transform: async (chunk, enc, cb) => processDocument(cb, chunk),
         flush: async (cb) => processDocument(cb)

+ 41 - 14
server/modules/search/postgres/engine.js

@@ -1,5 +1,9 @@
-const _ = require('lodash')
 const tsquery = require('pg-tsquery')()
+const stream = require('stream')
+const Promise = require('bluebird')
+const pipeline = Promise.promisify(stream.pipeline)
+
+/* global WIKI */
 
 module.exports = {
   async activate() {
@@ -8,7 +12,10 @@ module.exports = {
     }
   },
   async deactivate() {
-    // not used
+    WIKI.logger.info(`(SEARCH/POSTGRES) Dropping index tables...`)
+    await WIKI.models.knex.schema.dropTableIfExists('pagesWords') // a missing table must not abort the remaining cleanup
+    await WIKI.models.knex.schema.dropTableIfExists('pagesVector')
+    WIKI.logger.info(`(SEARCH/POSTGRES) Index tables have been dropped.`)
   },
   /**
    * INIT
@@ -27,6 +34,7 @@ module.exports = {
         table.string('title')
         table.string('description')
         table.specificType('tokens', 'TSVECTOR')
+        table.text('content')
       })
     }
     // -> Create Words Index
@@ -71,7 +79,6 @@ module.exports = {
       WIKI.logger.warn('Search Engine Error:')
       WIKI.logger.warn(err)
     }
-
   },
   /**
    * CREATE
@@ -80,10 +87,10 @@ module.exports = {
    */
   async created(page) {
     await WIKI.models.knex.raw(`
-      INSERT INTO "pagesVector" (path, locale, title, description, tokens) VALUES (
-        '?', '?', '?', '?', (setweight(to_tsvector('${this.config.dictLanguage}', '?'), 'A') || setweight(to_tsvector('${this.config.dictLanguage}', '?'), 'B') || setweight(to_tsvector('${this.config.dictLanguage}', '?'), 'C'))
+      INSERT INTO "pagesVector" (path, locale, title, description, "tokens") VALUES (
+        ?, ?, ?, ?, (setweight(to_tsvector('${this.config.dictLanguage}', ?), 'A') || setweight(to_tsvector('${this.config.dictLanguage}', ?), 'B') || setweight(to_tsvector('${this.config.dictLanguage}', ?), 'C'))
       )
-    `, [page.path, page.localeCode, page.title, page.description, page.title, page.description, page.content])
+    `, [page.path, page.localeCode, page.title, page.description, page.title, page.description, page.safeContent])
   },
   /**
    * UPDATE
@@ -99,7 +106,7 @@ module.exports = {
         setweight(to_tsvector('${this.config.dictLanguage}', ?), 'B') ||
         setweight(to_tsvector('${this.config.dictLanguage}', ?), 'C'))
       WHERE path = ? AND locale = ?
-    `, [page.title, page.description, page.title, page.description, page.content, page.path, page.localeCode])
+    `, [page.title, page.description, page.title, page.description, page.safeContent, page.path, page.localeCode])
   },
   /**
    * DELETE
@@ -132,14 +139,34 @@ module.exports = {
   async rebuild() {
     WIKI.logger.info(`(SEARCH/POSTGRES) Rebuilding Index...`)
     await WIKI.models.knex('pagesVector').truncate()
+    await WIKI.models.knex('pagesWords').truncate()
+
+    await pipeline(
+      WIKI.models.knex.column('path', 'localeCode', 'title', 'description', 'render').select().from('pages').where({
+        isPublished: true,
+        isPrivate: false
+      }).stream(),
+      new stream.Transform({
+        objectMode: true,
+        transform: (page, enc, cb) => {
+          const content = WIKI.models.pages.cleanHTML(page.render)
+          WIKI.models.knex.raw(`
+            INSERT INTO "pagesVector" (path, locale, title, description, "tokens", content) VALUES (
+              ?, ?, ?, ?, (setweight(to_tsvector('${this.config.dictLanguage}', ?), 'A') || setweight(to_tsvector('${this.config.dictLanguage}', ?), 'B') || setweight(to_tsvector('${this.config.dictLanguage}', ?), 'C')), ?
+            )
+          `, [page.path, page.localeCode, page.title, page.description, page.title, page.description, content, content])
+            .then(() => cb(), cb) // hand insert failures to the stream so pipeline() rejects instead of hanging
+        }
+      })
+    )
+
     await WIKI.models.knex.raw(`
-      INSERT INTO "pagesVector" (path, locale, title, description, "tokens")
-        SELECT path, "localeCode" AS locale, title, description,
-          (setweight(to_tsvector('${this.config.dictLanguage}', title), 'A') ||
-          setweight(to_tsvector('${this.config.dictLanguage}', description), 'B') ||
-          setweight(to_tsvector('${this.config.dictLanguage}', content), 'C')) AS tokens
-        FROM "pages"
-        WHERE pages."isPublished" AND NOT pages."isPrivate"`)
+      INSERT INTO "pagesWords" (word)
+        SELECT word FROM ts_stat(
+          'SELECT to_tsvector(''simple'', "title") || to_tsvector(''simple'', "description") || to_tsvector(''simple'', "content") FROM "pagesVector"'
+        )
+      `)
+
     WIKI.logger.info(`(SEARCH/POSTGRES) Index rebuilt successfully.`)
   }
 }

+ 10 - 0
yarn.lock

@@ -4454,6 +4454,11 @@ emitter-listener@^1.1.1:
   dependencies:
     shimmer "^1.2.0"
 
+emoji-regex@8.0.0:
+  version "8.0.0"
+  resolved "https://registry.yarnpkg.com/emoji-regex/-/emoji-regex-8.0.0.tgz#e818fd69ce5ccfcb404594f842963bf53164cc37"
+  integrity sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==
+
 emojis-list@^2.0.0:
   version "2.1.0"
   resolved "https://registry.yarnpkg.com/emojis-list/-/emojis-list-2.1.0.tgz#4daa4d9db00f9819880c79fa457ae5b09a1fd389"
@@ -12294,6 +12299,11 @@ strip-json-comments@^2.0.1, strip-json-comments@~2.0.1:
   resolved "https://registry.yarnpkg.com/strip-json-comments/-/strip-json-comments-2.0.1.tgz#3c531942e908c2697c0ec344858c286c7ca0a60a"
   integrity sha1-PFMZQukIwml8DsNEhYwobHygpgo=
 
+striptags@3.1.1:
+  version "3.1.1"
+  resolved "https://registry.yarnpkg.com/striptags/-/striptags-3.1.1.tgz#c8c3e7fdd6fb4bb3a32a3b752e5b5e3e38093ebd"
+  integrity sha1-yMPn/db7S7OjKjt1LltePjgJPr0=
+
 style-loader@0.23.1:
   version "0.23.1"
   resolved "https://registry.yarnpkg.com/style-loader/-/style-loader-0.23.1.tgz#cb9154606f3e771ab6c4ab637026a1049174d925"