User:Polygnotus/Scripts/SourcerySpell.js

Note: After saving, you have to bypass your browser's cache to see the changes. Google Chrome, Firefox, Microsoft Edge and Safari: Hold down the ⇧ Shift key and click the Reload toolbar button. For details and instructions about other browsers, see Wikipedia:Bypass your cache.
// == Wikipedia Source Editor Spell Checker ==
// Detects typos in wikitext using nspell (Hunspell-compatible).
// Adds a "Check spelling" tab on source editor pages.
// Results appear in a floating panel; clicking a result selects the
// word in the textarea and scrolls to it.

( function () {
  'use strict';

  // -----------------------------------------------------------------------
  // Configuration — add your own terms to the whitelist
  // -----------------------------------------------------------------------
  // Words that are never reported as typos. Entries are lower-cased when the
  // whitelist Set is built in initChecker(), so matching is case-insensitive.
  const WHITELIST = [
    // Wikipedia / MediaWiki terminology
    'Wikipedia', 'Wikimedia', 'MediaWiki', 'Wikidata', 'Wikisource',
    'Wiktionary', 'Wikinews', 'Wikivoyage', 'Wikibooks', 'Wikiquote',
    'wikitext', 'wikilink', 'wikitable', 'infobox', 'navbox', 'hatnote',
    'portlet', 'AfD', 'CSD', 'BLP', 'NPOV', 'POV', 'WP', 'MOS',
    'disambiguation', 'redirects', 'transclusion',
    // Add your own:
  ];

  // -----------------------------------------------------------------------
  // CDN endpoints
  // -----------------------------------------------------------------------
  // CDN_NSPELL is loaded with dynamic import(); CDN_AFF / CDN_DIC are fetched
  // as text and handed to nspell as the Hunspell affix and dictionary data.
  // NOTE(review): none of these URLs names a package version, so the CDN
  // presumably serves whatever release is current — consider pinning versions.
  const CDN_NSPELL = 'https://cdn.jsdelivr.net/npm/nspell/+esm';
  const CDN_AFF    = 'https://cdn.jsdelivr.net/npm/dictionary-en/index.aff';
  const CDN_DIC    = 'https://cdn.jsdelivr.net/npm/dictionary-en/index.dic';

  // Only run in the source editor
  // ('edit' and 'submit' are the only wgAction values accepted; anything else
  // exits the script before the tab is added.)
  const action = mw.config.get( 'wgAction' );
  if ( action !== 'edit' && action !== 'submit' ) return;

  // Script-wide state, filled in lazily:
  let checker      = null; // nspell instance, set by initChecker()
  let whitelistSet = null; // lower-cased whitelist Set, set by initChecker()
  let resultsPanel = null; // floating results panel element, or null when closed

  // -----------------------------------------------------------------------
  // Load nspell + dictionary (cached after first load)
  // -----------------------------------------------------------------------
  // Promise for a load that is currently in flight, so that concurrent callers
  // share one download instead of each fetching the dictionary again.
  let checkerLoading = null;

  /**
   * Load nspell and the English dictionary, build the whitelist Set, and
   * return the spell checker. The result is cached in `checker`, so later
   * calls resolve immediately.
   *
   * @return {Promise<Object>} Resolves with the nspell instance.
   * @throws {Error} If the module or either dictionary file cannot be loaded.
   *   The sticky "Loading…" notification is replaced by an error notification
   *   before the rejection propagates, and a later call will retry.
   */
  async function initChecker() {
    if ( checker ) return checker;
    if ( checkerLoading ) return checkerLoading;

    // fetch() only rejects on network failure; an HTTP error status still
    // resolves. Reject explicitly so an error page is never parsed as
    // dictionary data and cached as a "working" checker.
    async function fetchText( url ) {
      const response = await fetch( url );
      if ( !response.ok ) {
        throw new Error( `Could not load ${ url } (HTTP ${ response.status })` );
      }
      return response.text();
    }

    async function load() {
      mw.notify( 'Loading spell checker…', { tag: 'spellcheck', autoHide: false } );

      try {
        const [ { default: nspell }, affText, dicText ] = await Promise.all( [
          import( CDN_NSPELL ),
          fetchText( CDN_AFF ),
          fetchText( CDN_DIC ),
        ] );

        const instance = nspell( { aff: affText, dic: dicText } );
        const words    = new Set( WHITELIST.map( w => w.toLowerCase() ) );

        // Automatically whitelist every word from the article title so that
        // the title word itself is never flagged as a typo.
        const pageTitle = mw.config.get( 'wgTitle' ) || '';
        for ( const word of pageTitle.split( /[\s\-_/()]+/ ) ) {
          const clean = word.replace( /^[^a-zA-Z]+|[^a-zA-Z]+$/g, '' ); // strip leading/trailing non-alpha
          if ( clean.length > 0 ) words.add( clean.toLowerCase() );
        }

        // Publish the shared state only once everything has succeeded.
        whitelistSet = words;
        checker      = instance;
      } catch ( err ) {
        // Same tag as the loading message, so this replaces it rather than
        // leaving "Loading…" on screen indefinitely.
        mw.notify(
          `Could not load the spell checker: ${ err?.message ?? String( err ) }`,
          { type: 'error', tag: 'spellcheck', autoHide: true }
        );
        throw err;
      }

      mw.notify( 'Spell checker ready.', { tag: 'spellcheck', autoHide: true } );
      return checker;
    }

    checkerLoading = load();
    try {
      return await checkerLoading;
    } finally {
      // Success: `checker` is now the cache. Failure: allow a later retry.
      checkerLoading = null;
    }
  }

  // -----------------------------------------------------------------------
  // Strip wikicode from raw wikitext while preserving character offsets.
  //
  // Rather than removing markup (which would shift all subsequent indices),
  // we replace every markup character with a space.  This means the word
  // offsets in the stripped string are identical to those in the original,
  // so we can use them directly with textarea.setSelectionRange().
  // -----------------------------------------------------------------------
  /**
   * Replace wiki markup in `text` with spaces, leaving prose in place.
   *
   * Markup is overwritten rather than removed, so the returned string has
   * exactly the same length as the input and every surviving word sits at the
   * same offset as in the original.
   *
   * @param {string} text Raw wikitext.
   * @return {string} Same-length string with markup blanked out.
   */
  function stripWikicode( text ) {
    // Split into UTF-16 code units (not code points) so that indices stay
    // aligned with string offsets in the original text.
    const chars = text.split( '' );

    // Replace characters in [start, end) with spaces
    function blank( start, end ) {
      for ( let i = start; i < end && i < chars.length; i++ ) {
        chars[ i ] = ' ';
      }
    }

    // Blank every match of a global regex found in the original text
    function blankAll( pattern ) {
      for ( const match of text.matchAll( pattern ) ) {
        blank( match.index, match.index + match[ 0 ].length );
      }
    }

    // Find the closing marker for a nested construct (templates or wikilinks)
    // in `src`. Starts scanning from `pos` (after the opening marker has been
    // consumed). Returns the index of the matching closer, or -1 if unbalanced.
    function findClosing( src, pos, open, close ) {
      let depth = 1;
      let i     = pos;
      const ol  = open.length;
      const cl  = close.length;
      while ( i <= src.length - cl ) {
        if ( src.startsWith( open,  i ) ) { depth++; i += ol; continue; }
        if ( src.startsWith( close, i ) ) {
          if ( --depth === 0 ) return i;
          i += cl;
          continue;
        }
        i++;
      }
      return -1;
    }

    // 1. HTML comments: <!-- ... -->
    blankAll( /<!--[\s\S]*?-->/g );

    // 2. <ref> tags (inline citations are never prose)
    //    The self-closing form must be tried first: otherwise <ref name="x" />
    //    is read as an opening tag and everything up to the next </ref> —
    //    including ordinary prose — is blanked and never checked.
    blankAll( /<ref(?:\s[^>]*)?\/>|<ref(?:\s[^>]*)?>[\s\S]*?<\/ref>/gi );

    // 3. Other HTML tags whose content should not be checked
    //    Alternatives, in order: self-closing tag (with or without a space
    //    before the slash), paired tag with its content, any leftover lone
    //    opening/closing tag.
    const SKIP_HTML = 'nowiki|syntaxhighlight|source|math|score|gallery|poem|'
                    + 'div|span|table|td|th|tr|small|big|sub|sup|s|del|ins|'
                    + 'center|font|abbr|blockquote|br|hr|u';
    const skipHtmlRe = new RegExp(
      `<(?:${ SKIP_HTML })(?:\\s[^>]*)?\\/>`
      + `|<(${ SKIP_HTML })(?:\\s[^>]*)?>[\\s\\S]*?<\\/\\1>`
      + `|<\\/?(?:${ SKIP_HTML })(?:\\s[^>]*)?>`,
      'gi'
    );
    blankAll( skipHtmlRe );

    // 4. Templates: {{ ... }} — blank everything including nested templates
    //    Scan a snapshot taken after steps 1–3 so that braces inside comments,
    //    refs or nowiki cannot pair up with braces in the surrounding prose.
    {
      const src = chars.join( '' );
      let i = 0;
      while ( i < src.length - 1 ) {
        if ( src[ i ] === '{' && src[ i + 1 ] === '{' ) {
          const end = findClosing( src, i + 2, '{{', '}}' );
          if ( end !== -1 ) { blank( i, end + 2 ); i = end + 2; }
          else              { i++; }
        } else {
          i++;
        }
      }
    }

	// <nowiki>
    // 5. Wikilinks: [[ ... ]]
    //    [[File:...]], [[Category:...]], etc. → blank entirely
    //    [[target|label]]suffix → blank "[[target|" and "]]", keep label;
    //                             also blank suffix if label starts uppercase (proper noun)
    //    [[ProperNoun]]suffix   → blank everything including suffix:
    //                             the suffix is just an inflection of the proper noun
    //                             (e.g. [[Belarus]]ian → all blanked; "ian" alone is not a word)
    //    [[target]]             → blank only the brackets, keep target text as prose
    //    [[target]]suffix       → blank brackets and suffix; the suffix is a fragment of
    //                             the compound word and meaningless on its own (e.g. [[woke]]ness)
	// </nowiki>
    //    Scans a snapshot taken after step 4 for the same reason as above.
    {
      const NON_PROSE_NS = /^(?:file|image|category|media|template|help|portal|special|talk|user)\s*:/i;
      const src = chars.join( '' );
      let i = 0;
      while ( i < src.length - 1 ) {
        if ( src[ i ] === '[' && src[ i + 1 ] === '[' ) {
          const end = findClosing( src, i + 2, '[[', ']]' );
          if ( end !== -1 ) {
            const inner    = src.slice( i + 2, end );
            const pipePos  = inner.indexOf( '|' );
            const target   = ( pipePos === -1 ? inner : inner.slice( 0, pipePos ) ).trim();

            // Measure any lowercase suffix immediately after the closing brackets
            // e.g. "ian" after a link to Belarus, "s" after a link to ship
            let suffixEnd = end + 2;
            while ( suffixEnd < src.length && /[a-z'-]/.test( src[ suffixEnd ] ) ) {
              suffixEnd++;
            }
            const hasSuffix = suffixEnd > end + 2;

            if ( NON_PROSE_NS.test( target ) ) {
              // Blank the whole thing including any trailing suffix
              blank( i, suffixEnd );
            } else if ( pipePos !== -1 ) {
              // Piped link — keep label; blank brackets, target, and pipe
              blank( i, i + 2 + pipePos + 1 ); // opening brackets + target + pipe
              blank( end, end + 2 );            // closing brackets
              // If the label starts with a capital (proper noun display text), blank suffix too
              const firstLabelChar = inner[ pipePos + 1 ] || '';
              if ( hasSuffix && /[A-Z]/.test( firstLabelChar ) ) {
                blank( end + 2, suffixEnd );
              }
            } else if ( hasSuffix && /^[A-Z]/.test( target ) ) {
              // Proper-noun link with suffix — blank everything: suffix alone is not a real word
              blank( i, suffixEnd );
            } else {
              // Plain link with no suffix: blank only the brackets, keep target text.
              // Plain link with suffix: blank brackets AND suffix — the suffix is a
              // fragment of the compound word and is not a standalone word; the
              // target itself is kept as a prose token.
              blank( i, i + 2 );
              blank( end, end + 2 );
              if ( hasSuffix ) blank( end + 2, suffixEnd );
            }
            i = suffixEnd;
          } else {
            i++;
          }
        } else {
          i++;
        }
      }
    }

    // 6. External links: [http://url optional label]
    //    Blank the bracket and URL; keep label if present
    for ( const m of text.matchAll( /\[https?:\/\/[^\s\]]*(\s[^\]]*)?\]/g ) ) {
      const full     = m[ 0 ];
      const label    = m[ 1 ];               // captured " label text" or undefined
      const absEnd   = m.index + full.length;
      if ( label ) {
        // The URL part contains no whitespace and the label starts with
        // whitespace, so the first occurrence of `label` is the label itself.
        const labelStart = m.index + full.indexOf( label );
        blank( m.index, labelStart );        // "[url"
        blank( absEnd - 1, absEnd );         // "]"
      } else {
        blank( m.index, absEnd );
      }
    }

    // 7. Bold and italic markup: ''' or ''
    blankAll( /'{2,}/g );

    // 8. Heading markup: = signs at start/end of lines (keep heading text)
    //    The heading text may not cross a newline, and the closing run is
    //    located from the captured groups rather than from the end of the
    //    match, so trailing spaces/tabs do not shift the blanked range.
    for ( const m of text.matchAll( /^(={1,6})([^\n]*?)(={1,6})[ \t]*$/gm ) ) {
      blank( m.index, m.index + m[ 1 ].length );
      const closeStart = m.index + m[ 1 ].length + m[ 2 ].length;
      blank( closeStart, closeStart + m[ 3 ].length );
    }

    // 9. Table markup: lines beginning with {|, |}, |-, |, or !
    //    Blank the entire line so cell syntax / attributes are not checked
    blankAll( /^(?:\{\||\|\}|\|-|[|!])[^\n]*/gm );

    // 10. List / indent prefixes at the start of a line: ; : * #
    blankAll( /^[;:*#]+/gm );

    // 11. Magic words: __TOC__ __NOTOC__ etc.
    blankAll( /__[A-Z_]+__/g );

    // 12. Bare URLs (not inside link brackets — those are handled above)
    blankAll( /https?:\/\/\S+/g );

    // 13. ISBN / ISSN / RFC / PMID / DOI literals
    //     The keyword must not run straight into another letter; without that
    //     guard the case-insensitive match swallows ordinary words that merely
    //     start with "doi" (e.g. "doing"), hiding typos in them.
    blankAll( /\b(?:ISBN|ISSN|RFC|PMID|DOI)(?![a-z])\s*[\w\-]+/gi );

    // 14. Horizontal rules: ---- at start of line
    blankAll( /^-{4,}$/gm );

    // 15. Single-letter case-adjustment brackets: [g]enerally, [T]his etc.
    //     Wikipedia editors use [x] to change the case of the first letter of a
    //     quotation. The bracket pair and the following letters form one word;
    //     blank the entire span so neither "enerally" nor any other fragment
    //     is left as a token.
    blankAll( /\[[a-zA-Z]\][a-zA-ZÀ-ɏ]+/g );

    // 16. HTML entities: &ndash; &amp; &nbsp; &#160; &#x2013; etc.
    blankAll( /&(?:#\d+|#x[\da-fA-F]+|[a-zA-Z]{2,8});/g );

    return chars.join( '' );
  }

  // -----------------------------------------------------------------------
  // Word-level filters — returns true if the word should be skipped
  // -----------------------------------------------------------------------
  /**
   * Decide whether a token should be excluded from spell checking.
   *
   * Rules are evaluated in order and the first match wins; the whitelist
   * lookup comes last.
   *
   * @param {string} word Token with surrounding punctuation already removed.
   * @return {boolean} True when the token should not be spell-checked.
   */
  function shouldIgnore( word ) {
    const skipRules = [
      candidate => candidate.length <= 2,                        // too short
      candidate => /\d/.test( candidate ),                       // contains digit
      candidate => /[^\u0000-\u007F]/.test( candidate ),         // non-ASCII (e.g. words with diacritics)
      candidate => candidate === candidate.toUpperCase(),        // ALL CAPS abbreviation
      candidate => /^[A-Z]/.test( candidate ),                   // likely proper noun
      candidate => /^https?/.test( candidate ),                  // stray URL fragment
      candidate => /^[-']+$/.test( candidate ),                  // punctuation only
      candidate => !/[aeiou]/i.test( candidate ),                // no vowels
      candidate => whitelistSet.has( candidate.toLowerCase() ),  // whitelisted
    ];
    return skipRules.some( rule => rule( word ) );
  }

  // Extract word tokens with their character offsets in the stripped string.
  // The character class includes Latin-1 Supplement and Latin Extended A/B
  // (U+00C0–U+024F) so that accented words like "détente" or "naïve" are
  // captured as a single token rather than being split at the accented character.
  /**
   * Tokenise a string into candidate words.
   *
   * @param {string} text Text to scan.
   * @return {Array<{word: string, index: number, raw: string}>} One entry per
   *   run of letters/apostrophes: `raw` is the matched run, `index` its offset
   *   in `text`, and `word` is `raw` with leading/trailing apostrophes and
   *   hyphens removed (possibly an empty string).
   */
  function extractWords( text ) {
    const tokens = [];
    for ( const match of text.matchAll( /[a-zA-Z\u00C0-\u024F'']+/g ) ) {
      const raw = match[ 0 ];
      tokens.push( {
        word:  raw.replace( /^['-]+|['-]+$/g, '' ),
        index: match.index,
        raw,
      } );
    }
    return tokens;
  }

  // -----------------------------------------------------------------------
  // Floating results panel
  // -----------------------------------------------------------------------
  /**
   * Create the (detached) floating panel: a fixed-position container holding
   * a sticky header with the title text and a close button.
   *
   * Clicking the close button removes the panel and resets `resultsPanel`.
   *
   * @return {HTMLElement} The panel element, not yet attached to the document.
   */
  function buildPanel() {
    // Create an element and apply its inline styles
    const make = ( tag, css ) => {
      const el = document.createElement( tag );
      Object.assign( el.style, css );
      return el;
    };

    const PANEL_CSS = {
      position:     'fixed',
      top:          '60px',
      right:        '16px',
      width:        '340px',
      maxHeight:    '65vh',
      overflowY:    'auto',
      background:   '#fff',
      border:       '1px solid #a2a9b1',
      borderRadius: '4px',
      boxShadow:    '0 4px 12px rgba(0,0,0,.25)',
      zIndex:       '9999',
      fontFamily:   'sans-serif',
      fontSize:     '13px',
      lineHeight:   '1.4',
    };
    const HEADER_CSS = {
      background:     '#36c',
      color:          '#fff',
      padding:        '8px 12px',
      fontWeight:     'bold',
      display:        'flex',
      justifyContent: 'space-between',
      alignItems:     'center',
      position:       'sticky',
      top:            '0',
    };
    const CLOSE_CSS = {
      background:  'transparent',
      border:      'none',
      color:       '#fff',
      cursor:      'pointer',
      fontSize:    '15px',
      lineHeight:  '1',
      padding:     '0 2px',
    };

    const container = make( 'div', PANEL_CSS );
    container.id = 'mw-spellcheck-panel';

    // The title text must be set before the button is appended: assigning
    // textContent afterwards would discard the button.
    const titleBar = make( 'div', HEADER_CSS );
    titleBar.textContent = 'Spell checker';

    const dismiss = make( 'button', CLOSE_CSS );
    dismiss.textContent = '✕';
    dismiss.title = 'Close';
    dismiss.addEventListener( 'click', () => {
      container.remove();
      resultsPanel = null;
    } );

    titleBar.appendChild( dismiss );
    container.appendChild( titleBar );
    return container;
  }

  /**
   * Render the list of possible typos in a fresh floating panel.
   *
   * Any existing panel is removed first. Clicking an entry marks it as the
   * active one, selects the word in the textarea and scrolls it into view.
   *
   * @param {Array<{word: string, index: number, raw: string,
   *   suggestions: string[], context: string}>} typos Results to list.
   * @param {HTMLTextAreaElement} textarea Editor textarea the offsets refer to.
   */
  function showResults( typos, textarea ) {
    const HOVER_BG  = '#f0f4ff';
    const ACTIVE_BG = '#fef6e4';

    // Create panel fresh each run
    if ( resultsPanel ) resultsPanel.remove();
    resultsPanel = buildPanel();

    // The entry most recently clicked in this panel, if any
    let activeItem = null;

    const summary = document.createElement( 'div' );
    Object.assign( summary.style, {
      padding:      '8px 12px',
      borderBottom: '1px solid #eaecf0',
      color:        '#555',
      background:   '#f8f9fa',
    } );
    summary.textContent = typos.length
      ? `${ typos.length } possible typo${ typos.length === 1 ? '' : 's' } found. Click to jump.`
      : 'No typos found.';
    resultsPanel.appendChild( summary );

    for ( const typo of typos ) {
      const item = document.createElement( 'div' );
      Object.assign( item.style, {
        padding:      '7px 12px',
        borderBottom: '1px solid #eaecf0',
        cursor:       'pointer',
      } );
      item.addEventListener( 'mouseenter', () => { item.style.background = HOVER_BG; } );
      // Restore the active colour on the clicked entry instead of clearing it;
      // otherwise the "active" highlight vanishes as soon as the pointer leaves.
      item.addEventListener( 'mouseleave', () => {
        item.style.background = item === activeItem ? ACTIVE_BG : '';
      } );

      // Word in red
      const wordEl = document.createElement( 'span' );
      wordEl.style.cssText = 'font-weight:bold; color:#d33;';
      wordEl.textContent = typo.word;

      // Context snippet in grey
      const ctxEl = document.createElement( 'div' );
      ctxEl.style.cssText = 'color:#72777d; font-size:11px; margin-top:3px; white-space:nowrap; overflow:hidden; text-overflow:ellipsis;';
      ctxEl.textContent = '…' + typo.context + '…';

      // Suggestions
      const sugEl = document.createElement( 'div' );
      sugEl.style.cssText = 'color:#0645ad; font-size:11px; margin-top:2px;';
      sugEl.textContent = typo.suggestions.length
        ? 'Suggestions: ' + typo.suggestions.join( ', ' )
        : 'No suggestions';

      item.append( wordEl, ctxEl, sugEl );

      item.addEventListener( 'click', () => {
        // Un-highlight the previously clicked entry
        if ( activeItem && activeItem !== item ) {
          activeItem.style.background = '';
          activeItem.removeAttribute( 'data-active' );
        }
        activeItem = item;
        item.dataset.active = '1';
        item.style.background = ACTIVE_BG;

        // Select the word in the textarea
        textarea.focus();
        textarea.setSelectionRange( typo.index, typo.index + typo.raw.length );

        // Scroll the textarea to show the selection.
        // Approximate: counts hard line breaks only (soft wrapping is ignored)
        // and leaves about three lines of context above the match.
        const textBefore  = textarea.value.slice( 0, typo.index );
        const linesBefore = textBefore.split( '\n' ).length - 1;
        const lineHeight  = parseFloat( getComputedStyle( textarea ).lineHeight ) || 18;
        textarea.scrollTop = Math.max( 0, ( linesBefore - 3 ) * lineHeight );
      } );

      resultsPanel.appendChild( item );
    }

    document.body.appendChild( resultsPanel );
  }

  // -----------------------------------------------------------------------
  // Main spell check routine
  // -----------------------------------------------------------------------
  /**
   * Spell-check the contents of the source editor and show the results.
   *
   * Failures while loading the checker or scanning the text are reported via
   * a notification instead of rejecting, so callers may invoke this without
   * awaiting it.
   *
   * @return {Promise<void>}
   */
  async function runSpellCheck() {
    const textarea = document.getElementById( 'wpTextbox1' );
    if ( !textarea ) {
      mw.notify(
        'Could not find the source editor textarea (#wpTextbox1). ' +
        'This script only works with the source editor, not the visual editor.',
        { type: 'warn', tag: 'spellcheck', autoHide: false }
      );
      return;
    }

    try {
      const spell        = await initChecker();
      const rawText      = textarea.value;
      const strippedText = stripWikicode( rawText );
      const wordTokens   = extractWords( strippedText );

      // Suggestions per misspelled word, so a typo that occurs many times is
      // only passed to spell.suggest() once.
      const suggestionCache = new Map();
      const typos = [];

      for ( const { word, index, raw } of wordTokens ) {
        if ( !word || shouldIgnore( word ) ) continue;
        if ( spell.correct( word ) )         continue;

        let suggestions = suggestionCache.get( word );
        if ( !suggestions ) {
          suggestions = spell.suggest( word ).slice( 0, 5 );
          suggestionCache.set( word, suggestions );
        }

        // Build a short context snippet from the *raw* wikitext for readability
        const ctxStart = Math.max( 0, index - 30 );
        const ctxEnd   = Math.min( rawText.length, index + raw.length + 30 );
        const context  = rawText.slice( ctxStart, ctxEnd ).replace( /\s+/g, ' ' ).trim();

        typos.push( { word, index, raw, suggestions, context } );
      }

      showResults( typos, textarea );

      mw.notify(
        `Spell check complete — ${ typos.length } possible typo${ typos.length === 1 ? '' : 's' } found.`,
        { tag: 'spellcheck', autoHide: true }
      );
    } catch ( err ) {
      // Same tag as the progress messages, so this replaces a lingering
      // "Loading…" notification rather than leaving it on screen.
      console.error( err );
      mw.notify(
        `Spell check failed: ${ err?.message ?? String( err ) }`,
        { type: 'error', tag: 'spellcheck', autoHide: false }
      );
    }
  }

  // -----------------------------------------------------------------------
  // Add "Check spelling" tab to the editor toolbar
  // -----------------------------------------------------------------------
  // Once mediawiki.util is available, register the "Check spelling" entry in
  // the p-cactions portlet and run the checker when it is clicked.
  mw.loader.using( 'mediawiki.util' ).then( () => {
    const tab = mw.util.addPortletLink(
      'p-cactions',
      '#',
      'Check spelling',
      'ca-spellcheck',
      'Check wikitext for spelling errors'
    );
    // addPortletLink may return a falsy value; nothing to wire up then
    if ( !tab ) return;

    tab.addEventListener( 'click', ( event ) => {
      // Keep the "#" href from changing the URL / scrolling the page
      event.preventDefault();
      runSpellCheck();
    } );
  } );

}() );