Browse Source

multiple minor fixes

Markus-Rost 4 years ago
parent
commit
f9d1e8b627
3 changed files with 62 additions and 34 deletions
  1. 1 1
      dashboard/settings.js
  2. 41 21
      functions/parse_page.js
  3. 20 12
      util/functions.js

+ 1 - 1
dashboard/settings.js

@@ -343,7 +343,7 @@ function update_settings(res, userSettings, guild, type, settings) {
 				if ( response.patreon ) {
 				if ( response.patreon ) {
 					if ( row.lang !== row.mainlang ) text += `\n${lang.get('settings.currentlang')} \`${allLangs.names[row.lang]}\``;
 					if ( row.lang !== row.mainlang ) text += `\n${lang.get('settings.currentlang')} \`${allLangs.names[row.lang]}\``;
 					if ( row.role !== row.mainrole ) text += `\n${lang.get('settings.currentrole')} ` + ( row.role ? `<@&${row.role}>` : '@everyone' );
 					if ( row.role !== row.mainrole ) text += `\n${lang.get('settings.currentrole')} ` + ( row.role ? `<@&${row.role}>` : '@everyone' );
-					if ( row.inline !== row.maininline ) text += `\n${lang.get('settings.currentinline')} ${( row.inline ? '~~' : '' )}\`[[${inlinepage}]]\`${( row.inline ? '~~' : '' )}`;
+					if ( row.inline !== row.maininline ) text += `\n${lang.get('settings.currentinline')} ${( row.inline ? '~~' : '' )}\`[[${( lang.localNames.page || 'page' )}]]\`${( row.inline ? '~~' : '' )}`;
 				}
 				}
 				text += `\n<${new URL(`/guild/${guild}/settings`, process.env.dashboard).href}>`;
 				text += `\n<${new URL(`/guild/${guild}/settings`, process.env.dashboard).href}>`;
 				sendMsg( {
 				sendMsg( {

+ 41 - 21
functions/parse_page.js

@@ -2,6 +2,13 @@ const cheerio = require('cheerio');
 const {toSection} = require('../util/wiki.js');
 const {toSection} = require('../util/wiki.js');
 const {htmlToPlain, htmlToDiscord, limitLength} = require('../util/functions.js');
 const {htmlToPlain, htmlToDiscord, limitLength} = require('../util/functions.js');
 
 
+const parsedContentModels = [
+	'wikitext',
+	'wikibase-item',
+	'wikibase-lexeme',
+	'wikibase-property'
+];
+
 // Max length of 10 characters
 // Max length of 10 characters
 const contentModels = {
 const contentModels = {
 	Scribunto: 'lua',
 	Scribunto: 'lua',
@@ -40,12 +47,21 @@ const removeClasses = [
 	'.nomobile',
 	'.nomobile',
 	'.noprint',
 	'.noprint',
 	'.noexcerpt',
 	'.noexcerpt',
-	'.sortkey'
+	'.sortkey',
+	'wb\\:sectionedit'
 ];
 ];
 
 
-const keepMainPageTag = [
+const removeClassExceptions = [
 	'div.main-page-tag-lcs',
 	'div.main-page-tag-lcs',
-	'div.lcs-container'
+	'div.lcs-container',
+	'div.wikibase-entityview',
+	'div.wikibase-entityview-main',
+	'div.wikibase-entitytermsview',
+	'div.wikibase-entitytermsview-heading',
+	'div.wikibase-entitytermsview-heading-description',
+	'div#wb-lexeme-header',
+	'div#wb-lexeme-header div:not([class]):not([id])',
+	'div.language-lexical-category-widget'
 ];
 ];
 
 
 /**
 /**
@@ -70,7 +86,7 @@ function parse_page(msg, content, embed, wiki, reaction, {title, contentmodel, p
 		if ( reaction ) reaction.removeEmoji();
 		if ( reaction ) reaction.removeEmoji();
 		return;
 		return;
 	}
 	}
-	if ( contentmodel !== 'wikitext' ) return got.get( wiki + 'api.php?action=query&prop=revisions&rvprop=content&rvslots=main&converttitles=true&titles=%1F' + encodeURIComponent( title ) + '&format=json' ).then( response => {
+	if ( !parsedContentModels.includes( contentmodel ) ) return got.get( wiki + 'api.php?action=query&prop=revisions&rvprop=content&rvslots=main&converttitles=true&titles=%1F' + encodeURIComponent( title ) + '&format=json' ).then( response => {
 		var body = response.body;
 		var body = response.body;
 		if ( body && body.warnings ) log_warn(body.warnings);
 		if ( body && body.warnings ) log_warn(body.warnings);
 		var revision = Object.values(( body?.query?.pages || {} ))?.[0]?.revisions?.[0];
 		var revision = Object.values(( body?.query?.pages || {} ))?.[0]?.revisions?.[0];
@@ -122,7 +138,7 @@ function parse_page(msg, content, embed, wiki, reaction, {title, contentmodel, p
 		
 		
 		if ( reaction ) reaction.removeEmoji();
 		if ( reaction ) reaction.removeEmoji();
 	} );
 	} );
-	got.get( wiki + 'api.php?action=parse&prop=text|images' + ( fragment || disambiguation !== undefined ? '' : '&section=0' ) + '&disablelimitreport=true&disableeditsection=true&disabletoc=true&sectionpreview=true&page=' + encodeURIComponent( title ) + '&format=json' ).then( response => {
+	got.get( wiki + 'api.php?action=parse&prop=text|images|displaytitle' + ( contentmodel !== 'wikitext' || fragment || disambiguation !== undefined ? '' : '&section=0' ) + '&disablelimitreport=true&disableeditsection=true&disabletoc=true&sectionpreview=true&page=' + encodeURIComponent( title ) + '&format=json' ).then( response => {
 		if ( response.statusCode !== 200 || !response?.body?.parse?.text ) {
 		if ( response.statusCode !== 200 || !response?.body?.parse?.text ) {
 			console.log( '- ' + response.statusCode + ': Error while parsing the page: ' + response?.body?.error?.info );
 			console.log( '- ' + response.statusCode + ': Error while parsing the page: ' + response?.body?.error?.info );
 			if ( embed.backupDescription && embed.length < 5000 ) {
 			if ( embed.backupDescription && embed.length < 5000 ) {
@@ -133,7 +149,10 @@ function parse_page(msg, content, embed, wiki, reaction, {title, contentmodel, p
 			}
 			}
 			return;
 			return;
 		}
 		}
-		var $ = cheerio.load(response.body.parse.text['*'].replace( /<br\/?>/g, '\n' ));
+		var displaytitle = htmlToDiscord( response.body.parse.displaytitle );
+		if ( displaytitle.length > 250 ) displaytitle = displaytitle.substring(0, 250) + '\u2026';
+		embed.setTitle( displaytitle );
+		var $ = cheerio.load(response.body.parse.text['*'].replace( /\n?<br(?: ?\/)?>\n?/g, '<br>' ));
 		if ( embed.brokenInfobox && $('aside.portable-infobox').length ) {
 		if ( embed.brokenInfobox && $('aside.portable-infobox').length ) {
 			let infobox = $('aside.portable-infobox');
 			let infobox = $('aside.portable-infobox');
 			embed.fields.forEach( field => {
 			embed.fields.forEach( field => {
@@ -175,6 +194,7 @@ function parse_page(msg, content, embed, wiki, reaction, {title, contentmodel, p
 			}
 			}
 			let rows = infobox.find([
 			let rows = infobox.find([
 				'> tbody > tr',
 				'> tbody > tr',
+				'> tbody > tr > th.mainheader',
 				'> table > tbody > tr',
 				'> table > tbody > tr',
 				'div.section > div.title',
 				'div.section > div.title',
 				'div.section > table > tbody > tr',
 				'div.section > table > tbody > tr',
@@ -187,19 +207,7 @@ function parse_page(msg, content, embed, wiki, reaction, {title, contentmodel, p
 			for ( let i = 0; i < rows.length; i++ ) {
 			for ( let i = 0; i < rows.length; i++ ) {
 				if ( embed.fields.length >= 25 || embed.length > 5400 ) break;
 				if ( embed.fields.length >= 25 || embed.length > 5400 ) break;
 				let row = rows.eq(i);
 				let row = rows.eq(i);
-				if ( row.is('tr, div.pi-data, div.infobox-row') ) {
-					let label = row.children(( tdLabel ? 'td, ' : '' ) + 'th, h3.pi-data-label, div.infobox-cell-header').eq(0);
-					label.find(removeClasses.join(', ')).remove();
-					let value = row.children('td, div.pi-data-value, div.infobox-cell-data').eq(( label.is('td') ? 1 : 0 ));
-					value.find(removeClasses.join(', ')).remove();
-					if ( !label.is('td') && label.html()?.trim() && value.html()?.trim() ) tdLabel = false;
-					label = htmlToPlain(label).trim().split('\n')[0];
-					value = htmlToDiscord(value, embed.url, true).trim().replace( /\n{3,}/g, '\n\n' );
-					if ( label.length > 100 ) label = label.substring(0, 100) + '\u2026';
-					if ( value.length > 500 ) value = limitLength(value, 500, 250);
-					if ( label && value ) embed.addField( label, value, true );
-				}
-				else if ( row.is('div.title, h2.pi-header') ) {
+				if ( row.is('th.mainheader, div.title, h2.pi-header') ) {
 					row.find(removeClasses.join(', ')).remove();
 					row.find(removeClasses.join(', ')).remove();
 					let label = htmlToPlain(row).trim();
 					let label = htmlToPlain(row).trim();
 					if ( label.length > 100 ) label = label.substring(0, 100) + '\u2026';
 					if ( label.length > 100 ) label = label.substring(0, 100) + '\u2026';
@@ -214,6 +222,18 @@ function parse_page(msg, content, embed, wiki, reaction, {title, contentmodel, p
 						else embed.addField( '\u200b', '**' + label + '**', false );
 						else embed.addField( '\u200b', '**' + label + '**', false );
 					}
 					}
 				}
 				}
+				else if ( row.is('tr, div.pi-data, div.infobox-row') ) {
+					let label = row.children(( tdLabel ? 'td, ' : '' ) + 'th, h3.pi-data-label, div.infobox-cell-header').eq(0);
+					label.find(removeClasses.join(', ')).remove();
+					let value = row.children('td, div.pi-data-value, div.infobox-cell-data').eq(( label.is('td') ? 1 : 0 ));
+					value.find(removeClasses.join(', ')).remove();
+					if ( !label.is('td') && label.html()?.trim() && value.html()?.trim() ) tdLabel = false;
+					label = htmlToPlain(label).trim().split('\n')[0];
+					value = htmlToDiscord(value, embed.url, true).trim().replace( /\n{3,}/g, '\n\n' );
+					if ( label.length > 100 ) label = label.substring(0, 100) + '\u2026';
+					if ( value.length > 500 ) value = limitLength(value, 500, 250);
+					if ( label && value ) embed.addField( label, value, true );
+				}
 			}
 			}
 			if ( embed.fields.length && embed.fields[embed.fields.length - 1].name === '\u200b' ) {
 			if ( embed.fields.length && embed.fields[embed.fields.length - 1].name === '\u200b' ) {
 				embed.spliceFields( embed.fields.length - 1, 1 );
 				embed.spliceFields( embed.fields.length - 1, 1 );
@@ -266,12 +286,12 @@ function parse_page(msg, content, embed, wiki, reaction, {title, contentmodel, p
 			}
 			}
 		}
 		}
 		if ( !embed.description && embed.length < 5000 ) {
 		if ( !embed.description && embed.length < 5000 ) {
-			if ( disambiguation === undefined || fragment ) {
+			if ( contentmodel !== 'wikitext' || disambiguation === undefined || fragment ) {
 				$('h1, h2, h3, h4, h5, h6').nextAll().remove();
 				$('h1, h2, h3, h4, h5, h6').nextAll().remove();
 				$('h1, h2, h3, h4, h5, h6').remove();
 				$('h1, h2, h3, h4, h5, h6').remove();
 			}
 			}
 			$(infoboxList.join(', ')).remove();
 			$(infoboxList.join(', ')).remove();
-			$('div, ' + removeClasses.join(', '), $('.mw-parser-output')).not(keepMainPageTag.join(', ')).remove();
+			$('div, ' + removeClasses.join(', '), $('.mw-parser-output')).not(removeClassExceptions.join(', ')).remove();
 			var description = htmlToDiscord($.html(), embed.url, true).trim().replace( /\n{3,}/g, '\n\n' );
 			var description = htmlToDiscord($.html(), embed.url, true).trim().replace( /\n{3,}/g, '\n\n' );
 			if ( description ) {
 			if ( description ) {
 				if ( disambiguation !== undefined && !fragment && embed.length < 4250 ) {
 				if ( disambiguation !== undefined && !fragment && embed.length < 4250 ) {

+ 20 - 12
util/functions.js

@@ -156,16 +156,17 @@ function toPlaintext(text = '', fullWikitext = false) {
  */
  */
 function htmlToPlain(html) {
 function htmlToPlain(html) {
 	var text = '';
 	var text = '';
-	var reference = false;
+	var ignoredTag = '';
 	var parser = new htmlparser.Parser( {
 	var parser = new htmlparser.Parser( {
 		onopentag: (tagname, attribs) => {
 		onopentag: (tagname, attribs) => {
-			if ( tagname === 'sup' && attribs.class === 'reference' ) reference = true;
+			if ( tagname === 'sup' && attribs.class === 'reference' ) ignoredTag = 'sup';
+			if ( tagname === 'span' && attribs.class === 'smwttcontent' ) ignoredTag = 'span';
 		},
 		},
 		ontext: (htmltext) => {
 		ontext: (htmltext) => {
-			if ( !reference ) text += escapeFormatting(htmltext);
+			if ( !ignoredTag ) text += escapeFormatting(htmltext);
 		},
 		},
 		onclosetag: (tagname) => {
 		onclosetag: (tagname) => {
-			if ( tagname === 'sup' ) reference = false;
+			if ( tagname === ignoredTag ) ignoredTag = '';
 		}
 		}
 	} );
 	} );
 	parser.write( html );
 	parser.write( html );
@@ -184,11 +185,13 @@ function htmlToDiscord(html, pagelink = '', ...escapeArgs) {
 	var text = '';
 	var text = '';
 	var code = false;
 	var code = false;
 	var href = '';
 	var href = '';
-	var reference = false;
+	var ignoredTag = '';
 	var listlevel = -1;
 	var listlevel = -1;
 	var parser = new htmlparser.Parser( {
 	var parser = new htmlparser.Parser( {
 		onopentag: (tagname, attribs) => {
 		onopentag: (tagname, attribs) => {
-			if ( code ) return;
+			if ( ignoredTag || code ) return;
+			if ( tagname === 'sup' && attribs.class === 'reference' ) ignoredTag = 'sup';
+			if ( tagname === 'span' && attribs.class === 'smwttcontent' ) ignoredTag = 'span';
 			if ( tagname === 'code' ) {
 			if ( tagname === 'code' ) {
 				code = true;
 				code = true;
 				text += '`';
 				text += '`';
@@ -201,7 +204,6 @@ function htmlToDiscord(html, pagelink = '', ...escapeArgs) {
 			if ( tagname === 'i' ) text += '*';
 			if ( tagname === 'i' ) text += '*';
 			if ( tagname === 's' ) text += '~~';
 			if ( tagname === 's' ) text += '~~';
 			if ( tagname === 'u' ) text += '__';
 			if ( tagname === 'u' ) text += '__';
-			if ( tagname === 'sup' && attribs.class === 'reference' ) reference = true;
 			if ( tagname === 'br' ) {
 			if ( tagname === 'br' ) {
 				text += '\n';
 				text += '\n';
 				if ( listlevel > -1 ) text += '\u200b '.repeat(4 * listlevel + 3);
 				if ( listlevel > -1 ) text += '\u200b '.repeat(4 * listlevel + 3);
@@ -236,7 +238,7 @@ function htmlToDiscord(html, pagelink = '', ...escapeArgs) {
 				if ( listlevel > -1 && attribs.class !== 'mw-empty-elt' ) text += '\u200b '.repeat(4 * (listlevel + 1));
 				if ( listlevel > -1 && attribs.class !== 'mw-empty-elt' ) text += '\u200b '.repeat(4 * (listlevel + 1));
 			}
 			}
 			if ( tagname === 'img' ) {
 			if ( tagname === 'img' ) {
-				if ( attribs.alt && attribs.src && !reference ) {
+				if ( attribs.alt && attribs.src ) {
 					let showAlt = true;
 					let showAlt = true;
 					if ( attribs['data-image-name'] === attribs.alt ) showAlt = false;
 					if ( attribs['data-image-name'] === attribs.alt ) showAlt = false;
 					else {
 					else {
@@ -284,17 +286,24 @@ function htmlToDiscord(html, pagelink = '', ...escapeArgs) {
 			if ( !pagelink ) return;
 			if ( !pagelink ) return;
 			if ( tagname === 'a' && attribs.href && attribs.class !== 'new' && /^(?:(?:https?:)?\/\/|\/|#)/.test(attribs.href) ) {
 			if ( tagname === 'a' && attribs.href && attribs.class !== 'new' && /^(?:(?:https?:)?\/\/|\/|#)/.test(attribs.href) ) {
 				href = new URL(attribs.href, pagelink).href;
 				href = new URL(attribs.href, pagelink).href;
-				text += '[';
+				if ( text.endsWith( '](<' + href.replace( /[()]/g, '\\$&' ) + '>)' ) ) {
+					text = text.substring(0, text.length - ( href.replace( /[()]/g, '\\$&' ).length + 5 ));
+				}
+				else text += '[';
 			}
 			}
 		},
 		},
 		ontext: (htmltext) => {
 		ontext: (htmltext) => {
-			if ( !reference ) {
+			if ( !ignoredTag ) {
 				if ( href && !code ) htmltext = htmltext.replace( /[\[\]]/g, '\\$&' );
 				if ( href && !code ) htmltext = htmltext.replace( /[\[\]]/g, '\\$&' );
 				if ( code ) text += htmltext.replace( /`/g, 'ˋ' );
 				if ( code ) text += htmltext.replace( /`/g, 'ˋ' );
 				else text += escapeFormatting(htmltext, ...escapeArgs);
 				else text += escapeFormatting(htmltext, ...escapeArgs);
 			}
 			}
 		},
 		},
 		onclosetag: (tagname) => {
 		onclosetag: (tagname) => {
+			if ( tagname === ignoredTag ) {
+				ignoredTag = '';
+				return;
+			}
 			if ( code ) {
 			if ( code ) {
 				if ( tagname === 'code' ) {
 				if ( tagname === 'code' ) {
 					code = false;
 					code = false;
@@ -310,7 +319,6 @@ function htmlToDiscord(html, pagelink = '', ...escapeArgs) {
 			if ( tagname === 'i' ) text += '*';
 			if ( tagname === 'i' ) text += '*';
 			if ( tagname === 's' ) text += '~~';
 			if ( tagname === 's' ) text += '~~';
 			if ( tagname === 'u' ) text += '__';
 			if ( tagname === 'u' ) text += '__';
-			if ( tagname === 'sup' ) reference = false;
 			if ( tagname === 'ul' ) listlevel--;
 			if ( tagname === 'ul' ) listlevel--;
 			if ( tagname === 'dl' ) listlevel--;
 			if ( tagname === 'dl' ) listlevel--;
 			if ( tagname === 'dt' ) text += '**';
 			if ( tagname === 'dt' ) text += '**';
@@ -361,7 +369,7 @@ function escapeFormatting(text = '', isMarkdown = false, keepLinks = false) {
 function limitLength(text = '', limit = 1000, maxExtra = 20) {
 function limitLength(text = '', limit = 1000, maxExtra = 20) {
 	var suffix = '\u2026';
 	var suffix = '\u2026';
 	var link = null;
 	var link = null;
-	var regex = /(?<!\\)\[((?:[^\[\]]|\\[\[\]])+?[^\\])\]\(<?(?:[^()]|\\[()])+?[^\\]>?\)/g;
+	var regex = /(?<!\\)\[((?:[^\[\]]|\\[\[\]])*?[^\\])\]\(<?(?:[^()]|\\[()])+?[^\\]>?\)/g;
 	while ( ( link = regex.exec(text) ) !== null ) {
 	while ( ( link = regex.exec(text) ) !== null ) {
 		if ( link.index < limit && link.index + link[0].length > limit ) {
 		if ( link.index < limit && link.index + link[0].length > limit ) {
 			if ( link.index + link[0].length < limit + maxExtra ) suffix = link[0];
 			if ( link.index + link[0].length < limit + maxExtra ) suffix = link[0];