{
  "@context": "https://schema.org",
  "@type": "TechArticle",
  "@id": "https://anchorfact.org/kb/kb-2026-00285",
  "headline": "Tokenization in NLP",
  "description": "Tokenization splits text into units (tokens) that a model processes. Methods: word-level (large vocabulary; suffers from the out-of-vocabulary, or OOV, problem), character-level (tiny vocabulary, long sequences), and subword (balanced: BPE, WordPiece, SentencePiece — standard for modern LLMs). Subword tokenization handles rare and unseen words by decomposing them into known subword units.",
  "dateCreated": "2026-05-22T14:59:47.504Z",
  "dateModified": "2026-05-22T14:59:47.504Z",
  "author": {
    "@type": "Organization",
    "name": "AnchorFact"
  },
  "publisher": {
    "@type": "Organization",
    "name": "AnchorFact",
    "url": "https://anchorfact.org"
  },
  "license": "https://creativecommons.org/licenses/by/4.0/",
  "anchorfact:confidence": "high",
  "anchorfact:generationMethod": "human_only",
  "citation": []
}