/*
 * User:Polygnotus/Scripts/SourcerySpell.js
 *
 * Code that you insert on this page could contain malicious content capable of compromising your account. If you import a script from another page with "importScript", "mw.loader.load", "iusc", or "lusc", take note that this causes you to dynamically load a remote script, which could be changed by others. Editors are responsible for all edits and actions they perform, including by scripts. User scripts are not centrally supported and may malfunction or become inoperable due to software changes. A guide to help you find broken scripts is available. If you are unsure whether code you are adding to this page is safe, you can ask at the appropriate village pump.
 * This code will be executed when previewing this page.
 * Documentation for this user script can be added at User:Polygnotus/Scripts/SourcerySpell.
 * Note: After saving, you have to bypass your browser's cache to see the changes. Google Chrome, Firefox, Microsoft Edge and Safari: Hold down the ⇧ Shift key and click the Reload toolbar button. For details and instructions about other browsers, see Wikipedia:Bypass your cache.
 */
// == Wikipedia Source Editor Spell Checker ==
// Detects typos in wikitext using nspell (Hunspell-compatible).
// Adds a "Check spelling" tab on source editor pages.
// Results appear in a floating panel; clicking a result selects the
// word in the textarea and scrolls to it.

( function () {
  'use strict';

  // -----------------------------------------------------------------------
  // Configuration — add your own terms to the whitelist
  // -----------------------------------------------------------------------
  const WHITELIST = [
    // Wikipedia / MediaWiki terminology
    'Wikipedia', 'Wikimedia', 'MediaWiki', 'Wikidata', 'Wikisource',
    'Wiktionary', 'Wikinews', 'Wikivoyage', 'Wikibooks', 'Wikiquote',
    'wikitext', 'wikilink', 'wikitable', 'infobox', 'navbox', 'hatnote',
    'portlet', 'AfD', 'CSD', 'BLP', 'NPOV', 'POV', 'WP', 'MOS',
    'disambiguation', 'redirects', 'transclusion',
    // Add your own:
  ];

  // -----------------------------------------------------------------------
  // CDN endpoints
  // -----------------------------------------------------------------------
  const CDN_NSPELL = 'https://cdn.jsdelivr.net/npm/nspell/+esm';
  const CDN_AFF    = 'https://cdn.jsdelivr.net/npm/dictionary-en/index.aff';
  const CDN_DIC    = 'https://cdn.jsdelivr.net/npm/dictionary-en/index.dic';

  // Only run in the source editor
  const action = mw.config.get( 'wgAction' );
  if ( action !== 'edit' && action !== 'submit' ) return;

  let checker      = null;
  let whitelistSet = null;
  let resultsPanel = null;

  // -----------------------------------------------------------------------
  // Load nspell + dictionary (cached after first load)
  // -----------------------------------------------------------------------
  async function initChecker() {
    if ( checker ) return checker;

    mw.notify( 'Loading spell checker…', { tag: 'spellcheck', autoHide: false } );

    const [ { default: nspell }, affText, dicText ] = await Promise.all( [
      import( CDN_NSPELL ),
      fetch( CDN_AFF ).then( r => r.text() ),
      fetch( CDN_DIC ).then( r => r.text() ),
    ] );

    checker      = nspell( { aff: affText, dic: dicText } );
    whitelistSet = new Set( WHITELIST.map( w => w.toLowerCase() ) );

    // Automatically whitelist every word from the article title so that
    // e.g. editing [[Phosphatidylinositol]] never flags the title word as a typo.
    const pageTitle = mw.config.get( 'wgTitle' ) || '';
    for ( const word of pageTitle.split( /[\s\-_/()]+/ ) ) {
      const clean = word.replace( /^[^a-zA-Z]+|[^a-zA-Z]+$/g, '' ); // strip leading/trailing non-alpha
      if ( clean.length > 0 ) whitelistSet.add( clean.toLowerCase() );
    }

    mw.notify( 'Spell checker ready.', { tag: 'spellcheck', autoHide: true } );
    return checker;
  }

  // -----------------------------------------------------------------------
  // Strip wikicode from raw wikitext while preserving character offsets.
  //
  // Rather than removing markup (which would shift all subsequent indices),
  // we replace every markup character with a space.  This means the word
  // offsets in the stripped string are identical to those in the original,
  // so we can use them directly with textarea.setSelectionRange().
  // -----------------------------------------------------------------------
  function stripWikicode( text ) {
    const chars = text.split( '' );

    // Replace characters in [start, end) with spaces
    function blank( start, end ) {
      for ( let i = start; i < end && i < chars.length; i++ ) {
        chars[ i ] = ' ';
      }
    }

    // Find the closing marker for a nested construct (e.g. {{ }} or [[ ]]).
    // Starts scanning from `pos` (after the opening marker has been consumed).
    function findClosing( pos, open, close ) {
      let depth = 1;
      let i     = pos;
      const ol  = open.length;
      const cl  = close.length;
      while ( i <= text.length - cl ) {
        if ( text.startsWith( open,  i ) ) { depth++; i += ol; continue; }
        if ( text.startsWith( close, i ) ) {
          if ( --depth === 0 ) return i;
          i += cl;
          continue;
        }
        i++;
      }
      return -1;
    }

    let m;

    // 1. HTML comments: <!-- ... -->
    for ( m of text.matchAll( /<!--[\s\S]*?-->/g ) ) {
      blank( m.index, m.index + m[ 0 ].length );
    }

    // 2. <ref> tags (inline citations are never prose)
    for ( m of text.matchAll( /<ref(\s[^>]*)?>[\s\S]*?<\/ref>|<ref(\s[^>]*)?\s*\/>/gi ) ) {
      blank( m.index, m.index + m[ 0 ].length );
    }

    // 3. Other HTML tags whose content should not be checked
    const SKIP_HTML = 'nowiki|syntaxhighlight|source|math|score|gallery|poem|'
                    + 'div|span|table|td|th|tr|small|big|sub|sup|s|del|ins|'
                    + 'center|font|abbr|blockquote|br|hr|u';
    const skipHtmlRe = new RegExp(
      `<(${ SKIP_HTML })(?:\\s[^>]*)?>[\\s\\S]*?<\\/\\1>|<\\/?(?:${ SKIP_HTML })(?:\\s[^>]*)?>`,
      'gi'
    );
    for ( m of text.matchAll( skipHtmlRe ) ) {
      blank( m.index, m.index + m[ 0 ].length );
    }

    // 4. Templates: {{ ... }} — blank everything including nested templates
    {
      let i = 0;
      while ( i < text.length - 1 ) {
        if ( text[ i ] === '{' && text[ i + 1 ] === '{' ) {
          const end = findClosing( i + 2, '{{', '}}' );
          if ( end !== -1 ) { blank( i, end + 2 ); i = end + 2; }
          else              { i++; }
        } else {
          i++;
        }
      }
    }

	// <nowiki>
    // 5. Wikilinks: [[ ... ]]
    //    [[File:...]], [[Category:...]], etc. → blank entirely
    //    [[target|label]]suffix → blank "[[target|" and "]]", keep label;
    //                             also blank suffix if label starts uppercase (proper noun)
    //    [[ProperNoun]]suffix   → blank everything including suffix:
    //                             the suffix is just an inflection of the proper noun
    //                             (e.g. [[Belarus]]ian → all blanked; "ian" alone is not a word)
    //    [[target]]             → blank only the brackets, keep target text as prose
    //    [[target]]suffix       → blank brackets and suffix; the suffix is a fragment of
    //                             the compound word and meaningless on its own (e.g. [[woke]]ness)
	// </nowiki>
    {
      const NON_PROSE_NS = /^(?:file|image|category|media|template|help|portal|special|talk|user)\s*:/i;
      let i = 0;
      while ( i < text.length - 1 ) {
        if ( text[ i ] === '[' && text[ i + 1 ] === '[' ) {
          const end = findClosing( i + 2, '[[', ']]' );
          if ( end !== -1 ) {
            const inner    = text.slice( i + 2, end );
            const pipePos  = inner.indexOf( '|' );
            const target   = ( pipePos === -1 ? inner : inner.slice( 0, pipePos ) ).trim();

            // Measure any lowercase suffix immediately after the closing ]]
            // e.g. "ian" in [[Belarus]]ian, "s" in [[ship]]s
            let suffixEnd = end + 2;
            while ( suffixEnd < text.length && /[a-z'-]/.test( text[ suffixEnd ] ) ) {
              suffixEnd++;
            }
            const hasSuffix = suffixEnd > end + 2;

            if ( NON_PROSE_NS.test( target ) ) {
              // Blank the whole thing including any trailing suffix
              blank( i, suffixEnd );
            } else if ( pipePos !== -1 ) {
              // [[target|label]]suffix — keep label; blank brackets, target, and pipe
              blank( i, i + 2 + pipePos + 1 ); // "[[target|"
              blank( end, end + 2 );            // "]]"
              // If the label starts with a capital (proper noun display text), blank suffix too
              const firstLabelChar = inner[ pipePos + 1 ] || '';
              if ( hasSuffix && /[A-Z]/.test( firstLabelChar ) ) {
                blank( end + 2, suffixEnd );
              }
            } else if ( hasSuffix && /^[A-Z]/.test( target ) ) {
              // [[ProperNoun]]suffix — blank everything: suffix alone is not a real word
              blank( i, suffixEnd );
            } else {
              // [[target]] with no suffix: blank only the brackets, keep target text
              // [[target]]suffix: blank brackets AND suffix — the suffix is a fragment
              // of the compound word (e.g. [[woke]]ness → wokeness) and is not a
              // standalone word; the target itself is kept as a prose token
              blank( i, i + 2 );
              blank( end, end + 2 );
              if ( hasSuffix ) blank( end + 2, suffixEnd );
            }
            i = suffixEnd;
          } else {
            i++;
          }
        } else {
          i++;
        }
      }
    }

    // 6. External links: [http://url optional label]
    //    Blank the bracket and URL; keep label if present
    for ( m of text.matchAll( /\[https?:\/\/[^\s\]]*(\s[^\]]*)?\]/g ) ) {
      const full     = m[ 0 ];
      const label    = m[ 1 ];               // captured " label text" or undefined
      const absEnd   = m.index + full.length;
      if ( label ) {
        // Blank "[url " — find where label starts inside full match
        const labelStart = m.index + full.indexOf( label );
        blank( m.index, labelStart );        // "[url "
        blank( absEnd - 1, absEnd );         // "]"
      } else {
        blank( m.index, absEnd );
      }
    }

    // 7. Bold and italic markup: ''' or ''
    for ( m of text.matchAll( /'{2,}/g ) ) {
      blank( m.index, m.index + m[ 0 ].length );
    }

    // 8. Heading markup: = signs at start/end of lines (keep heading text)
    for ( m of text.matchAll( /^(={1,6})([\s\S]*?)(={1,6})\s*$/gm ) ) {
      blank( m.index, m.index + m[ 1 ].length );
      const closeStart = m.index + m[ 0 ].length - m[ 3 ].length;
      blank( closeStart, closeStart + m[ 3 ].length );
    }

    // 9. Table markup: lines beginning with {|, |}, |-, |, or !
    //    Blank the entire line so cell syntax / attributes are not checked
    for ( m of text.matchAll( /^(?:\{\||\|\}|\|-|[|!])[^\n]*/gm ) ) {
      blank( m.index, m.index + m[ 0 ].length );
    }

    // 10. List / indent prefixes at the start of a line: ; : * #
    for ( m of text.matchAll( /^[;:*#]+/gm ) ) {
      blank( m.index, m.index + m[ 0 ].length );
    }

    // 11. Magic words: __TOC__ __NOTOC__ etc.
    for ( m of text.matchAll( /__[A-Z_]+__/g ) ) {
      blank( m.index, m.index + m[ 0 ].length );
    }

    // 12. Bare URLs (not inside link brackets — those are handled above)
    for ( m of text.matchAll( /https?:\/\/\S+/g ) ) {
      blank( m.index, m.index + m[ 0 ].length );
    }

    // 13. ISBN / ISSN / RFC / PMID literals
    for ( m of text.matchAll( /\b(?:ISBN|ISSN|RFC|PMID|DOI)\s*[\w\-]+/gi ) ) {
      blank( m.index, m.index + m[ 0 ].length );
    }

    // 14. Horizontal rules: ---- at start of line
    for ( m of text.matchAll( /^-{4,}$/gm ) ) {
      blank( m.index, m.index + m[ 0 ].length );
    }

    // 15. Single-letter case-adjustment brackets: [g]enerally, [T]his etc.
    //     Wikipedia editors use [x] to change the case of the first letter of a
    //     quotation. The bracket pair and the following letters form one word;
    //     blank the entire span so neither "enerally" nor any other fragment
    //     is left as a token.
    for ( m of text.matchAll( /\[[a-zA-Z]\][a-zA-ZÀ-ɏ]+/g ) ) {
      blank( m.index, m.index + m[ 0 ].length );
    }

    // 16. HTML entities: &ndash; &amp; &nbsp; &#160; &#x2013; etc.
    for ( m of text.matchAll( /&(?:#\d+|#x[\da-fA-F]+|[a-zA-Z]{2,8});/g ) ) {
      blank( m.index, m.index + m[ 0 ].length );
    }

    return chars.join( '' );
  }

  // -----------------------------------------------------------------------
  // Word-level filters — returns true if the word should be skipped
  // -----------------------------------------------------------------------
  function shouldIgnore( word ) {
    if ( word.length <= 2 )                        return true; // too short
    if ( /\d/.test( word ) )                       return true; // contains digit
    if ( /[^\u0000-\u007F]/.test( word ) )         return true; // non-ASCII → loanword with diacritics (e.g. détente, naïve)
    if ( word === word.toUpperCase() )             return true; // ALL CAPS abbreviation
    if ( /^[A-Z]/.test( word ) )                  return true; // likely proper noun
    if ( /^https?/.test( word ) )                  return true; // stray URL fragment
    if ( /^[-']+$/.test( word ) )                  return true; // punctuation only
    if ( !/[aeiou]/i.test( word ) )               return true; // no vowels
    if ( whitelistSet.has( word.toLowerCase() ) ) return true; // whitelisted
    return false;
  }

  // Extract word tokens with their character offsets in the stripped string.
  // The character class includes Latin-1 Supplement and Latin Extended A/B
  // (U+00C0–U+024F) so that accented words like "détente" or "naïve" are
  // captured as a single token rather than being split at the accented character.
  function extractWords( text ) {
    return [ ...text.matchAll( /[a-zA-Z\u00C0-\u024F'']+/g ) ].map( m => ( {
      word: m[ 0 ].replace( /^['-]+|['-]+$/g, '' ),
      index: m.index,
      raw:   m[ 0 ],
    } ) );
  }

  // -----------------------------------------------------------------------
  // Floating results panel
  // -----------------------------------------------------------------------
  function buildPanel() {
    const panel = document.createElement( 'div' );
    panel.id = 'mw-spellcheck-panel';
    Object.assign( panel.style, {
      position:     'fixed',
      top:          '60px',
      right:        '16px',
      width:        '340px',
      maxHeight:    '65vh',
      overflowY:    'auto',
      background:   '#fff',
      border:       '1px solid #a2a9b1',
      borderRadius: '4px',
      boxShadow:    '0 4px 12px rgba(0,0,0,.25)',
      zIndex:       '9999',
      fontFamily:   'sans-serif',
      fontSize:     '13px',
      lineHeight:   '1.4',
    } );

    const header = document.createElement( 'div' );
    Object.assign( header.style, {
      background:     '#36c',
      color:          '#fff',
      padding:        '8px 12px',
      fontWeight:     'bold',
      display:        'flex',
      justifyContent: 'space-between',
      alignItems:     'center',
      position:       'sticky',
      top:            '0',
    } );
    header.textContent = 'Spell checker';

    const closeBtn = document.createElement( 'button' );
    closeBtn.textContent = '✕';
    closeBtn.title = 'Close';
    Object.assign( closeBtn.style, {
      background:  'transparent',
      border:      'none',
      color:       '#fff',
      cursor:      'pointer',
      fontSize:    '15px',
      lineHeight:  '1',
      padding:     '0 2px',
    } );
    closeBtn.addEventListener( 'click', () => { panel.remove(); resultsPanel = null; } );

    header.appendChild( closeBtn );
    panel.appendChild( header );
    return panel;
  }

  function showResults( typos, textarea ) {
    // Create panel fresh each run
    if ( resultsPanel ) resultsPanel.remove();
    resultsPanel = buildPanel();

    const summary = document.createElement( 'div' );
    Object.assign( summary.style, {
      padding:      '8px 12px',
      borderBottom: '1px solid #eaecf0',
      color:        '#555',
      background:   '#f8f9fa',
    } );
    summary.textContent = typos.length
      ? `${ typos.length } possible typo${ typos.length === 1 ? '' : 's' } found. Click to jump.`
      : 'No typos found.';
    resultsPanel.appendChild( summary );

    for ( const typo of typos ) {
      const item = document.createElement( 'div' );
      Object.assign( item.style, {
        padding:      '7px 12px',
        borderBottom: '1px solid #eaecf0',
        cursor:       'pointer',
      } );
      item.addEventListener( 'mouseenter', () => { item.style.background = '#f0f4ff'; } );
      item.addEventListener( 'mouseleave', () => { item.style.background = '';        } );

      // Word in red
      const wordEl = document.createElement( 'span' );
      wordEl.style.cssText = 'font-weight:bold; color:#d33;';
      wordEl.textContent = typo.word;

      // Context snippet in grey
      const ctxEl = document.createElement( 'div' );
      ctxEl.style.cssText = 'color:#72777d; font-size:11px; margin-top:3px; white-space:nowrap; overflow:hidden; text-overflow:ellipsis;';
      ctxEl.textContent = '…' + typo.context + '…';

      // Suggestions
      const sugEl = document.createElement( 'div' );
      sugEl.style.cssText = 'color:#0645ad; font-size:11px; margin-top:2px;';
      sugEl.textContent = typo.suggestions.length
        ? 'Suggestions: ' + typo.suggestions.join( ', ' )
        : 'No suggestions';

      item.append( wordEl, ctxEl, sugEl );

      item.addEventListener( 'click', () => {
        // Highlight in previously-clicked item: reset
        resultsPanel.querySelectorAll( '[data-active]' ).forEach( el => {
          el.style.background = '';
          el.removeAttribute( 'data-active' );
        } );
        item.dataset.active = '1';
        item.style.background = '#fef6e4';

        // Select the word in the textarea
        textarea.focus();
        textarea.setSelectionRange( typo.index, typo.index + typo.raw.length );

        // Scroll the textarea to show the selection.
        // Approximate line height to compute scrollTop.
        const textBefore  = textarea.value.slice( 0, typo.index );
        const linesBefore = textBefore.split( '\n' ).length - 1;
        const lineHeight  = parseFloat( getComputedStyle( textarea ).lineHeight ) || 18;
        textarea.scrollTop = Math.max( 0, ( linesBefore - 3 ) * lineHeight );
      } );

      resultsPanel.appendChild( item );
    }

    document.body.appendChild( resultsPanel );
  }

  // -----------------------------------------------------------------------
  // Main spell check routine
  // -----------------------------------------------------------------------
  async function runSpellCheck() {
    const textarea = document.getElementById( 'wpTextbox1' );
    if ( !textarea ) {
      mw.notify(
        'Could not find the source editor textarea (#wpTextbox1). ' +
        'This script only works with the source editor, not the visual editor.',
        { type: 'warn', tag: 'spellcheck', autoHide: false }
      );
      return;
    }

    const spell        = await initChecker();
    const rawText      = textarea.value;
    const strippedText = stripWikicode( rawText );
    const wordTokens   = extractWords( strippedText );

    const typos = [];

    for ( const { word, index, raw } of wordTokens ) {
      if ( !word || shouldIgnore( word ) ) continue;
      if ( spell.correct( word ) )         continue;

      const suggestions = spell.suggest( word ).slice( 0, 5 );

      // Build a short context snippet from the *raw* wikitext for readability
      const ctxStart = Math.max( 0, index - 30 );
      const ctxEnd   = Math.min( rawText.length, index + raw.length + 30 );
      const context  = rawText.slice( ctxStart, ctxEnd ).replace( /\s+/g, ' ' ).trim();

      typos.push( { word, index, raw, suggestions, context } );
    }

    showResults( typos, textarea );

    mw.notify(
      'Spell check complete — ' + typos.length +
      ' possible typo' + ( typos.length === 1 ? '' : 's' ) + ' found.',
      { tag: 'spellcheck', autoHide: true }
    );
  }

  // -----------------------------------------------------------------------
  // Add "Check spelling" tab to the editor toolbar
  // -----------------------------------------------------------------------
  mw.loader.using( 'mediawiki.util' ).then( () => {
    const link = mw.util.addPortletLink(
      'p-cactions',
      '#',
      'Check spelling',
      'ca-spellcheck',
      'Check wikitext for spelling errors'
    );
    if ( link ) {
      link.addEventListener( 'click', e => {
        e.preventDefault();
        runSpellCheck();
      } );
    }
  } );

}() );