1
0
Markus-Rost 4 жил өмнө
parent
commit
ba85caf766

+ 1 - 1
RcGcDb

@@ -1 +1 @@
-Subproject commit d4c44da1263a89f816b97fb3c02e36245bfb2094
+Subproject commit 0ef302ec7f7da31d6f84a42cc335d9e6e1f6e8b7

+ 58 - 6
functions/parse_page.js

@@ -26,7 +26,6 @@ const infoboxList = [
 
 const removeClasses = [
 	'table',
-	'div',
 	'script',
 	'input',
 	'style',
@@ -133,7 +132,7 @@ function parse_page(msg, content, embed, wiki, reaction, {title, contentmodel},
 		}
 		var $ = cheerio.load(response.body.parse.text['*'].replace( /<br\/?>/g, '\n' ));
 		if ( embed.brokenInfobox && $('aside.portable-infobox').length ) {
-			var infobox = $('aside.portable-infobox');
+			let infobox = $('aside.portable-infobox');
 			embed.fields.forEach( field => {
 				if ( embed.length > 5400 ) return;
 				if ( /^`.+`$/.test(field.name) ) {
@@ -156,8 +155,61 @@ function parse_page(msg, content, embed, wiki, reaction, {title, contentmodel},
 				}
 			} );
 		}
+		if ( !fragment && !embed.fields.length && $(infoboxList.join(', ')).length ) {
+			let infobox = $(infoboxList.join(', ')).first();
+			if ( embed.thumbnail?.url === thumbnail ) {
+				let image = infobox.find([
+					'div.images img',
+					'figure.pi-image img',
+					'div.infobox-imagearea img'
+				].join(', ')).toArray().find( img => {
+					let imgURL = img.attribs.src;
+					if ( !imgURL ) return false;
+					return ( /^(?:https?:)?\/\//.test(imgURL) && /\.(?:png|jpg|jpeg|gif)(?:\/|\?|$)/i.test(imgURL) );
+				} )?.attribs.src?.replace( /^(?:https?:)?\/\//, 'https://' );
+				if ( image ) embed.setThumbnail( new URL(image, wiki).href );
+			}
+			let rows = infobox.find([
+				'> tbody > tr',
+				'div.section > div.title',
+				'div.section > table > tbody > tr',
+				'h2.pi-header',
+				'div.pi-data',
+				'table.infobox-rows > tbody > tr'
+			].join(', '));
+			for ( let i = 0; i < rows.length; i++ ) {
+				if ( embed.fields.length >= 25 || embed.length > 5400 ) break;
+				let row = rows.eq(i);
+				if ( row.is('tr, div.pi-data') ) {
+					let label = row.children('th, td, h3.pi-data-label').eq(0);
+					label.find(removeClasses.join(', ')).remove();
+					let value = row.children('td, div.pi-data-value').eq(( label.is('td') ? 1 : 0 ));
+					value.find(removeClasses.join(', ')).remove();
+					label = htmlToPlain(label).trim().split('\n')[0];
+					value = htmlToDiscord(value, wiki.articleURL.href, true).trim().replace( /\n{3,}/g, '\n\n' );
+					if ( label.length > 100 ) label = label.substring(0, 100) + '\u2026';
+					if ( value.length > 500 ) value = limitLength(value, 500, 250);
+					if ( label && value ) embed.addField( label, value );
+				}
+				else if ( row.is('div.title, h2.pi-header') ) {
+					row.find(removeClasses.join(', ')).remove();
+					let label = htmlToPlain(row).trim();
+					if ( label.length > 100 ) label = label.substring(0, 100) + '\u2026';
+					if ( label ) {
+						if ( embed.fields.length && embed.fields[embed.fields.length - 1].name === '\u200b' ) {
+							embed.spliceFields( embed.fields.length - 1, 1, {
+								name: '\u200b',
+								value: '**' + label + '**',
+								inline: false
+							} );
+						}
+						else embed.addField( '\u200b', '**' + label + '**', false );
+					}
+				}
+			}
+		}
 		if ( embed.thumbnail?.url === thumbnail ) {
-			var image = response.body.parse.images.find( pageimage => ( /\.(?:png|jpg|jpeg|gif)$/.test(pageimage.toLowerCase()) && pageimage.toLowerCase().includes( title.toLowerCase().replace( / /g, '_' ) ) ) );
+			let image = response.body.parse.images.find( pageimage => ( /\.(?:png|jpg|jpeg|gif)$/.test(pageimage.toLowerCase()) && pageimage.toLowerCase().includes( title.toLowerCase().replace( / /g, '_' ) ) ) );
 			if ( !image ) {
 				thumbnail = $(infoboxList.join(', ')).find('img').filter( (i, img) => {
 					img = $(img).prop('src')?.toLowerCase();
@@ -184,9 +236,9 @@ function parse_page(msg, content, embed, wiki, reaction, {title, contentmodel},
 				var sectionContent = $('<div>').append(
 					section.nextUntil(['h1','h2','h3','h4','h5','h6'].slice(0, sectionLevel).join(', '))
 				);
-				section.find(removeClasses.join(', ')).remove();
+				section.find('div, ' + removeClasses.join(', ')).remove();
 				sectionContent.find(infoboxList.join(', ')).remove();
-				sectionContent.find(removeClasses.join(', ')).remove();
+				sectionContent.find('div, ' + removeClasses.join(', ')).remove();
 				var name = htmlToPlain(section).trim();
 				if ( name.length > 250 ) name = name.substring(0, 250) + '\u2026';
 				var value = htmlToDiscord(sectionContent, wiki.articleURL.href, true).trim().replace( /\n{3,}/g, '\n\n' );
@@ -206,7 +258,7 @@ function parse_page(msg, content, embed, wiki, reaction, {title, contentmodel},
 			$('h1, h2, h3, h4, h5, h6').nextAll().remove();
 			$('h1, h2, h3, h4, h5, h6').remove();
 			$(infoboxList.join(', ')).remove();
-			$(removeClasses.join(', '), $('.mw-parser-output')).not(keepMainPageTag.join(', ')).remove();
+			$('div, ' + removeClasses.join(', '), $('.mw-parser-output')).not(keepMainPageTag.join(', ')).remove();
 			var description = htmlToDiscord($.html(), wiki.articleURL.href, true).trim().replace( /\n{3,}/g, '\n\n' );
 			if ( description ) {
 				if ( description.length > 1000 ) description = limitLength(description, 1000, 500);

+ 2 - 1
package.json

@@ -38,7 +38,8 @@
     "wikibot",
     "voice-channel",
     "wiki",
-    "discord-bot"
+    "discord-bot",
+    "wikipedia"
   ],
   "bugs": {
     "url": "https://github.com/Markus-Rost/discord-wiki-bot/issues"

+ 0 - 1
util/database.js

@@ -1,4 +1,3 @@
-const {defaultSettings} = require('../util/default.json');
 const sqlite3 = require('sqlite3').verbose();
 const mode = ( process.env.READONLY ? sqlite3.OPEN_READONLY : sqlite3.OPEN_READWRITE );
 const db = new sqlite3.Database( './wikibot.db', mode, dberror => {

+ 29 - 0
util/functions.js

@@ -170,40 +170,52 @@ function htmlToPlain(html) {
 			if ( tagname === 'p' && !text.endsWith( '\n' ) ) text += '\n';
 			if ( tagname === 'ul' ) listlevel++;
 			if ( tagname === 'li' ) {
+				text = text.replace( / +$/, '' );
 				if ( !text.endsWith( '\n' ) ) text += '\n';
 				if ( listlevel > -1 ) text += '\u200b '.repeat(4 * listlevel);
 				text += '• ';
 			}
 			if ( tagname === 'dl' ) listlevel++;
 			if ( tagname === 'dt' ) {
+				text = text.replace( / +$/, '' );
 				if ( !text.endsWith( '\n' ) ) text += '\n';
 				if ( listlevel > -1 ) text += '\u200b '.repeat(4 * listlevel);
 			}
 			if ( tagname === 'dd' ) {
+				text = text.replace( / +$/, '' );
 				if ( !text.endsWith( '\n' ) ) text += '\n';
 				if ( listlevel > -1 ) text += '\u200b '.repeat(4 * (listlevel + 1));
 			}
+			if ( tagname === 'img' ) {
+				if ( attribs.alt && !reference ) text += escapeFormatting(attribs.alt);
+			}
 			if ( tagname === 'h1' ) {
+				text = text.replace( / +$/, '' );
 				if ( !text.endsWith( '\n' ) ) text += '\n';
 				text += '***__';
 			}
 			if ( tagname === 'h2' ) {
+				text = text.replace( / +$/, '' );
 				if ( !text.endsWith( '\n' ) ) text += '\n';
 				text += '**__';
 			}
 			if ( tagname === 'h3' ) {
+				text = text.replace( / +$/, '' );
 				if ( !text.endsWith( '\n' ) ) text += '\n';
 				text += '**';
 			}
 			if ( tagname === 'h4' ) {
+				text = text.replace( / +$/, '' );
 				if ( !text.endsWith( '\n' ) ) text += '\n';
 				text += '__';
 			}
 			if ( tagname === 'h5' ) {
+				text = text.replace( / +$/, '' );
 				if ( !text.endsWith( '\n' ) ) text += '\n';
 				text += '*';
 			}
 			if ( tagname === 'h6' ) {
+				text = text.replace( / +$/, '' );
 				if ( !text.endsWith( '\n' ) ) text += '\n';
 				text += '';
 			}
@@ -264,47 +276,64 @@ function htmlToDiscord(html, serverpath = '', ...escapeArgs) {
 				if ( listlevel > -1 ) text += '\u200b '.repeat(4 * listlevel + 3);
 			}
 			if ( tagname === 'hr' ) {
+				text = text.replace( / +$/, '' );
 				if ( !text.endsWith( '\n' ) ) text += '\n';
 				text += '─'.repeat(10) + '\n';
 			}
 			if ( tagname === 'p' && !text.endsWith( '\n' ) ) text += '\n';
 			if ( tagname === 'ul' ) listlevel++;
 			if ( tagname === 'li' ) {
+				text = text.replace( / +$/, '' );
 				if ( !text.endsWith( '\n' ) ) text += '\n';
 				if ( listlevel > -1 ) text += '\u200b '.repeat(4 * listlevel);
 				text += '• ';
 			}
 			if ( tagname === 'dl' ) listlevel++;
 			if ( tagname === 'dt' ) {
+				text = text.replace( / +$/, '' );
 				if ( !text.endsWith( '\n' ) ) text += '\n';
 				if ( listlevel > -1 ) text += '\u200b '.repeat(4 * listlevel);
 				text += '**';
 			}
 			if ( tagname === 'dd' ) {
+				text = text.replace( / +$/, '' );
 				if ( !text.endsWith( '\n' ) ) text += '\n';
 				if ( listlevel > -1 ) text += '\u200b '.repeat(4 * (listlevel + 1));
 			}
+			if ( tagname === 'img' ) {
+				if ( attribs.alt && !reference ) {
+					if ( href && !code ) attribs.alt = attribs.alt.replace( /[\[\]]/g, '\\$&' );
+					if ( code ) text += attribs.alt.replace( /`/g, 'ˋ' );
+					else text += escapeFormatting(attribs.alt, ...escapeArgs);
+				}
+			}
 			if ( tagname === 'h1' ) {
+				text = text.replace( / +$/, '' );
 				if ( !text.endsWith( '\n' ) ) text += '\n';
 				text += '***__';
 			}
 			if ( tagname === 'h2' ) {
+				text = text.replace( / +$/, '' );
 				if ( !text.endsWith( '\n' ) ) text += '\n';
 				text += '**__';
 			}
 			if ( tagname === 'h3' ) {
+				text = text.replace( / +$/, '' );
 				if ( !text.endsWith( '\n' ) ) text += '\n';
 				text += '**';
 			}
 			if ( tagname === 'h4' ) {
+				text = text.replace( / +$/, '' );
 				if ( !text.endsWith( '\n' ) ) text += '\n';
 				text += '__';
 			}
 			if ( tagname === 'h5' ) {
+				text = text.replace( / +$/, '' );
 				if ( !text.endsWith( '\n' ) ) text += '\n';
 				text += '*';
 			}
 			if ( tagname === 'h6' ) {
+				text = text.replace( / +$/, '' );
 				if ( !text.endsWith( '\n' ) ) text += '\n';
 				text += '';
 			}