-- Module:WikitextParser
-- Module:WikitextParser is a general-purpose wikitext parser
-- Documentation and master version: https://en.wikipedia.org/wiki/Module:WikitextParser
-- Authors: User:Sophivorus, User:Certes, User:Aidan9382, et al.
-- License: CC-BY-SA-4.0
-- Module table: every public function defined below is attached to it.
local WikitextParser = {}
-- Helper function to escape a string for literal use inside a Lua pattern
-- Each magic character ( ^ $ ( ) . [ ] * + - ? % ) gets a '%' prefix.
-- @param str Required. String to escape.
-- @return The escaped string (exactly one value).
local function escapeString( str )
	-- gsub returns ( string, count ); the outer parentheses keep only the string
	-- so the count can never leak into a caller's argument list.
	return ( str:gsub( '[%^%$%(%)%.%[%]%*%+%-%?%%]', '%%%0' ) )
end
-- Extract the lead section from the given wikitext
-- The lead section is everything that precedes the first section title.
-- @param wikitext Required. Wikitext to parse.
-- @return Trimmed wikitext of the lead section. Empty string if there is no lead content.
function WikitextParser.getLead( wikitext )
	-- The leading newline lets a section title on the very first line match '\n=='
	local padded = '\n' .. wikitext
	-- '.' also matches newlines, so everything from the first section title onwards is dropped
	local lead = padded:gsub( '\n==.*', '' )
	lead = mw.text.trim( lead )
	return lead
end
-- Get the sections from the given wikitext
-- This method doesn't get the lead section, use getLead for that
-- The content of a section runs from its title up to the next title of any level,
-- so subsections are separate entries rather than part of their parent.
-- If a title appears more than once, only the first instance is kept.
-- @param wikitext Required. Wikitext to parse.
-- @return Map from section title (trimmed) to section content (trimmed)
function WikitextParser.getSections( wikitext )
	local sections = {}
	-- Leading newline: a title on the first line can match. Trailing '\n==': sentinel that ends the last section.
	wikitext = '\n' .. wikitext .. '\n=='
	-- [^=\n] keeps a title on a single line; the position capture marks where the content starts
	for title, contentStart in wikitext:gmatch( '\n==+ *([^=\n]+)==+()' ) do
		-- The title capture is greedy and may carry trailing spaces ("== Title ==")
		title = mw.text.trim( title )
		if sections[ title ] == nil then
			-- The sentinel guarantees a following '\n==' exists
			local contentEnd = wikitext:find( '\n==', contentStart, true )
			sections[ title ] = mw.text.trim( wikitext:sub( contentStart, contentEnd - 1 ) )
		end
	end
	return sections
end
-- Get one section from the given wikitext (subsections included)
-- When the title occurs more than once, only the first occurrence is returned.
-- @param wikitext Required. Wikitext to parse.
-- @param title Required. Title of the section
-- @return Wikitext of the section, or nothing if it isn't found. May be empty if the section is empty or contains only subsections.
function WikitextParser.getSection( wikitext, title )
	local escapedTitle = escapeString( mw.text.trim( title ) )
	local padded = '\n' .. wikitext .. '\n'
	-- level: the run of '=' opening the title; content: everything after the title line
	local level, content = padded:match( '\n(==+) *' .. escapedTitle .. ' *==.-\n(.*)' )
	if not content then
		return
	end
	-- Matches the next title whose level is the same or higher (i.e. with at most #level '=' signs)
	local nextSection = '\n==' .. string.rep( '=?', #level - 2 ) .. '[^=].*'
	content = content:gsub( nextSection, '' ) -- cut off later sections at this level or higher
	content = mw.text.trim( content )
	return content
end
-- Get the content of a <section> tag from the given wikitext.
-- We can't use getTags because both opening and closing <section> tags are self-closing tags.
-- Optional spaces around '=' and optional quotes around the name are accepted
-- in both the begin and the end tag.
-- @param wikitext Required. Wikitext to parse.
-- @param name Required. Name of the <section> tag
-- @return Content of the <section> tag, or nil if it isn't found. May be empty if the section tag is empty.
function WikitextParser.getSectionTag( wikitext, name )
	name = mw.text.trim( name )
	name = escapeString( name )
	-- Shared tail of both tags: optionally quoted name followed by the self-closing '/>'
	local nameAndClose = '["\']? *' .. name .. ' *["\']? */>'
	local content = wikitext:match(
		'< *section +begin *= *' .. nameAndClose .. '(.-)< *section +end *= *' .. nameAndClose
	)
	if content then
		return mw.text.trim( content )
	end
end
-- Get the lists from the given wikitext.
-- A list is a run of consecutive lines that each start with '*' or '#'.
-- The text is scanned line by line so that lists separated by a single blank line are all found.
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of lists, each one being its lines joined with newlines.
function WikitextParser.getLists( wikitext )
	local lists = {}
	local items -- lines of the list currently being collected, nil when outside a list
	-- The appended newline makes sure the last line is terminated and therefore visited
	for line in ( wikitext .. '\n' ):gmatch( '([^\n]*)\n' ) do
		if line:match( '^[*#]' ) then
			items = items or {}
			items[ #items + 1 ] = line
		elseif items then
			-- First non-list line after a list: the list is complete
			lists[ #lists + 1 ] = table.concat( items, '\n' )
			items = nil
		end
	end
	if items then
		lists[ #lists + 1 ] = table.concat( items, '\n' )
	end
	return lists
end
-- Get the paragraphs from the given wikitext.
-- Lists, lines consisting of a single [[...]] link (files, categories), block-level {...}
-- constructs (tables, block templates) and section titles are removed first;
-- what remains is split on blank lines.
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of paragraphs.
function WikitextParser.getParagraphs( wikitext )
	local paragraphs = {}

	-- Remove non-paragraphs
	wikitext = '\n' .. wikitext .. '\n'
	wikitext = wikitext:gsub( '\n[*#][^\n]*', '' ) -- remove lists
	-- Each match consumes both surrounding newlines, so adjacent blocks would share one and the
	-- second would be skipped: first pad every block with extra newlines, then remove it.
	-- The removal leaves one newline behind so the neighbouring lines are not glued together.
	wikitext = wikitext:gsub( '\n%[%b[]%]\n', '\n%0\n' ) -- add spacing between files and categories
	wikitext = wikitext:gsub( '\n%[%b[]%]\n', '\n' ) -- remove files and categories
	wikitext = wikitext:gsub( '\n%b{} *\n', '\n%0\n' ) -- add spacing between tables and block templates
	wikitext = wikitext:gsub( '\n%b{} *\n', '\n' ) -- remove tables and block templates
	wikitext = wikitext:gsub( '\n==+[^=]+==+ *\n', '\n' ) -- remove section titles
	wikitext = mw.text.trim( wikitext )

	-- One or more blank lines separate paragraphs
	for paragraph in mw.text.gsplit( wikitext, '\n\n+' ) do
		if mw.text.trim( paragraph ) ~= '' then
			table.insert( paragraphs, paragraph )
		end
	end
	return paragraphs
end
-- Get the templates from the given wikitext.
-- Only top-level {{...}} constructs are returned (nested ones stay inside their parent).
-- Constructs starting with '{{#' (parser functions like #if) are skipped.
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of templates.
function WikitextParser.getTemplates( wikitext )
	local templates = {}
	for template in wikitext:gmatch( '{%b{}}' ) do
		-- Test the matched template itself, not the whole wikitext
		if template:sub( 1, 3 ) ~= '{{#' then -- skip parser functions like #if
			table.insert( templates, template )
		end
	end
	return templates
end
-- Get the requested template from the given wikitext.
-- If the template appears more than once, only the first instance will be returned
-- Names are compared after trimming whitespace and upper-casing the first letter
-- with the content language.
-- @param wikitext Required. Wikitext to parse.
-- @param name Name of the template to get
-- @return Wikitext of the template, or nil if it wasn't found
function WikitextParser.getTemplate( wikitext, name )
	local lang = mw.language.getContentLanguage()
	local wanted = lang:ucfirst( mw.text.trim( name ) )
	-- ipairs walks the sequence in order, which the "first instance" promise relies on
	for _, template in ipairs( WikitextParser.getTemplates( wikitext ) ) do
		-- The capture may carry trailing spaces ("{{ Foo |...}}") and is nil when no
		-- name precedes the first '|', '}' or newline
		local templateName = template:match( '^{{ *([^}|\n]+)' )
		if templateName and lang:ucfirst( mw.text.trim( templateName ) ) == wanted then
			return template
		end
	end
end
-- Get the parameters from the given template.
-- Unnamed parameters are keyed by their 1-based position (a number); named parameters by
-- their trimmed name (a string). Values are trimmed.
-- @param template Required. Template wikitext to parse.
-- @return Map from parameter name to parameter value
function WikitextParser.getParameters( template )
	local parameters = {}
	local params = template:match( '{{[^|}]-|(.*)}}' )
	if params then

		-- Temporarily replace pipes and equal signs in subtemplates and links to avoid chaos.
		-- A function replacement is used so the result is inserted verbatim (no '%' processing).
		local function protect( fragment )
			return ( fragment:gsub( '[|=]', { ['|'] = '@@:@@', ['='] = '@@_@@' } ) )
		end
		params = params:gsub( '{%b{}}', protect )
		-- '%[%b[]%]' is a literal '[' followed by a balanced [...] and a literal ']', i.e. [[...]]
		params = params:gsub( '%[%b[]%]', protect )

		local count = 0
		for param in mw.text.gsplit( params, '|' ) do
			local parts = mw.text.split( param, '=' )
			local name = mw.text.trim( parts[1] )
			local value
			if #parts == 1 then
				-- No '=': unnamed parameter, numbered by order of appearance
				value = name
				count = count + 1
				name = count
			else
				-- Everything after the first '=' is the value
				value = mw.text.trim( table.concat( parts, '=', 2 ) )
			end
			-- Restore the protected characters
			value = value:gsub( '@@_@@', '=' )
			value = value:gsub( '@@:@@', '|' )
			parameters[ name ] = value
		end
	end
	return parameters
end
-- Get the tags from the given wikitext.
-- Self-closing tags (<ref name="foo" />, <br>, <hr>, ...) are returned as is.
-- Other tags are returned from their opening to their matching closing tag.
-- <div> and <span> may be nested; an unclosed <div> or <span> extends to the end of the text.
-- Any other opening tag without a closing tag (including HTML comments and stray
-- "<...>" text) is skipped.
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of tags.
function WikitextParser.getTags( wikitext )
	local tags = {}
	for tagStart, tagOpen in wikitext:gmatch( '()(<[^/].->)' ) do
		local tagName = tagOpen:match( '< ?(.-)[ >]' )
		local tag

		-- If we're in a self-closing tag, like <ref name="foo" />, <references/>, <br/>, <br>, <hr>, etc.
		if tagOpen:match( '<.-/>' ) or tagName == 'br' or tagName == 'hr' then
			tag = tagOpen

		else
			-- The name is used inside patterns, so its magic characters must be escaped
			local name = escapeString( tagName )
			local closePattern = '</ ?' .. name .. ' ?>()' -- captures the position right after the closing tag

			-- If we're in a tag that may contain others like it, like <div> or <span>
			if tagName == 'div' or tagName == 'span' then
				local openPattern = '()< ?' .. name .. '[ >]' -- captures the position of a nested opening tag
				local position = tagStart + #tagOpen
				local depth = 1
				local tagEnd
				while depth > 0 do
					local closeEnd = wikitext:match( closePattern, position )
					if not closeEnd then
						tagEnd = nil -- unclosed tag
						break
					end
					local nextOpen = wikitext:match( openPattern, position )
					if nextOpen and nextOpen < closeEnd then
						-- Another opening tag comes before the next closing tag: go one level deeper
						depth = depth + 1
						position = nextOpen + 1
					else
						-- The next closing tag closes the current level
						depth = depth - 1
						position = closeEnd
						tagEnd = closeEnd - 1
					end
				end
				tag = wikitext:sub( tagStart, tagEnd or -1 )

			-- Else we're in tag that shouldn't contain others like it, like <math> or <strong>
			else
				local closeEnd = wikitext:match( closePattern, tagStart )
				if closeEnd then
					tag = wikitext:sub( tagStart, closeEnd - 1 )
				end
			end
		end

		if tag then
			table.insert( tags, tag )
		end
	end
	return tags
end
-- Collect the <gallery> tags found in the given wikitext.
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of gallery tags.
function WikitextParser.getGalleries( wikitext )
	local galleries = {}
	for _, tag in pairs( WikitextParser.getTags( wikitext ) ) do
		-- The tag name is whatever sits between '<' and the first space or '>'
		if tag:match( '< ?(.-)[ >]' ) == 'gallery' then
			galleries[ #galleries + 1 ] = tag
		end
	end
	return galleries
end
-- Collect the <ref> tags found in the given wikitext.
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of ref tags.
function WikitextParser.getReferences( wikitext )
	local references = {}
	for _, tag in pairs( WikitextParser.getTags( wikitext ) ) do
		-- The tag name is whatever sits between '<' and the first space or '>'
		local isRef = tag:match( '< ?(.-)[ >]' ) == 'ref'
		if isRef then
			references[ #references + 1 ] = tag
		end
	end
	return references
end
-- Collect the tables found in the given wikitext.
-- A table is a balanced {...} block that begins with '{|' at the start of a line.
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of tables.
function WikitextParser.getTables( wikitext )
	local tables = {}
	-- The leading newline lets a table on the very first line match
	for candidate in ( '\n' .. wikitext ):gmatch( '\n%b{}' ) do
		if candidate:sub( 1, 3 ) == '\n{|' then
			-- Trimming drops the leading newline that was part of the match
			local trimmed = mw.text.trim( candidate )
			tables[ #tables + 1 ] = trimmed
		end
	end
	return tables
end
-- Get the id from the given table wikitext
-- Only the first line (the one starting with '{|') is inspected.
-- A quoted id may contain spaces; an unquoted id ends at the first whitespace or quote,
-- so attributes following it are not swallowed.
-- @param t Required. Wikitext of the table to parse.
-- @return Id of the table or nil if not found
function WikitextParser.getTableId( t )
	-- Attributes of the table: the rest of the first line, which must end with a newline
	local attributes = string.match( t, '^{|([^\n]*)\n' )
	if not attributes then
		return nil
	end
	-- Lua patterns have no alternation, so each quoting style is tried in turn
	local id = attributes:match( 'id *= *"([^"]+)"' )
		or attributes:match( "id *= *'([^']+)'" )
		or attributes:match( 'id *= *([^%s"\']+)' )
	return id
end
-- Find the first table with the given id in the given wikitext
-- @param wikitext Required. Wikitext to parse.
-- @param id Required. Id of the table
-- @return Wikitext of the table, or nothing if no table has that id
function WikitextParser.getTableById( wikitext, id )
	for _, candidate in ipairs( WikitextParser.getTables( wikitext ) ) do
		local candidateId = WikitextParser.getTableId( candidate )
		if candidateId == id then
			return candidate
		end
	end
end
-- Extract the cell contents from the given table wikitext
-- @param tableWikitext Required. Wikitext of the table to parse.
-- @return Sequence of rows, each row being a sequence of trimmed cell wikitext
-- @todo Test and make more robust
function WikitextParser.getTableData( tableWikitext )
	local rows = {}

	-- Strip everything that is not a row
	local body = mw.text.trim( tableWikitext )
	body = body:gsub( '^{|.-\n', '' ) -- drop the header line
	body = body:gsub( '\n|}$', '' ) -- drop the footer
	body = body:gsub( '^|%+.-\n', '' ) -- drop a leading caption
	body = body:gsub( '|%-.-\n', '|-\n' ) -- drop row attributes
	body = body:gsub( '^|%-\n', '' ) -- drop a leading empty row
	body = body:gsub( '\n|%-$', '' ) -- drop a trailing empty row

	-- Rows are separated by the literal '|-'
	for rowWikitext in mw.text.gsplit( body, '|-', true ) do
		local cells = {}

		-- Normalize every cell separator ('||', '!!', '!' at line start) to newline + '|'
		local row = rowWikitext:gsub( '||', '\n|' )
		row = row:gsub( '!!', '\n|' )
		row = row:gsub( '\n!', '\n|' )
		row = row:gsub( '^!', '\n|' )
		row = row:gsub( '^\n|', '' ) -- the first cell needs no separator in front of it

		for cellWikitext in mw.text.gsplit( row, '\n|' ) do
			local cell = mw.text.trim( cellWikitext )
			cells[ #cells + 1 ] = cell
		end
		rows[ #rows + 1 ] = cells
	end
	return rows
end
-- Collect the internal links ([[...]]) from the given wikitext; category and file links are included.
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of internal links.
function WikitextParser.getLinks( wikitext )
	local links = {}
	local count = 0
	-- A literal '[' followed by a balanced [...] and a literal ']'
	for link in wikitext:gmatch( '%[%b[]%]' ) do
		count = count + 1
		links[ count ] = link
	end
	return links
end
-- Get the file links from the given wikitext.
-- A link counts as a file link when the text before its first colon is a namespace
-- whose canonical name is 'File'.
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of file links.
function WikitextParser.getFiles( wikitext )
	local files = {}
	for _, link in ipairs( WikitextParser.getLinks( wikitext ) ) do
		-- Stop at the FIRST colon: captions and nested links may contain further colons.
		-- '|' and ']' are excluded so the namespace can only come from the link target.
		local namespace = link:match( '^%[%[%s*([^:|%]]+):' )
		local data = namespace and mw.site.namespaces[ mw.text.trim( namespace ) ]
		if data and data.canonicalName == 'File' then
			table.insert( files, link )
		end
	end
	return files
end
-- Get the category links from the given wikitext.
-- A link counts as a category link when the text before its first colon is a namespace
-- whose canonical name is 'Category'. Links with a leading colon ([[:Category:Foo]]) are not included.
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of category links.
function WikitextParser.getCategories( wikitext )
	local categories = {}
	for _, link in ipairs( WikitextParser.getLinks( wikitext ) ) do
		-- Stop at the FIRST colon: sort keys may contain further colons.
		-- '|' and ']' are excluded so the namespace can only come from the link target.
		local namespace = link:match( '^%[%[%s*([^:|%]]+):' )
		local data = namespace and mw.site.namespaces[ mw.text.trim( namespace ) ]
		if data and data.canonicalName == 'Category' then
			table.insert( categories, link )
		end
	end
	return categories
end
-- Collect the bracketed external links from the given wikitext.
-- A balanced [...] block counts when it starts with '[//', '[http://' or '[https://'.
-- @param wikitext Required. Wikitext to parse.
-- @return Sequence of external links.
function WikitextParser.getExternalLinks( wikitext )
	local links = {}
	for candidate in wikitext:gmatch( '%b[]' ) do
		local protocolRelative = candidate:match( '^%[//' )
		if protocolRelative or candidate:match( '^%[https?://' ) then
			links[ #links + 1 ] = candidate
		end
	end
	return links
end
-- Export the module table
return WikitextParser