项目原始demo,不改动
Ви не можете вибрати більше 25 тем Теми мають розпочинатися з літери або цифри, можуть містити дефіси (-) і не повинні перевищувати 35 символів.
Це архівний репозитарій. Ви можете переглядати і клонувати файли, але не можете робити пуш або відкривати питання/запити.
 
 
 
 

877 рядків
25 KiB

  // Public export; `Tokenizer` is a hoisted function declaration further down.
  module.exports = Tokenizer;

  // Entity lookup tables loaded from the bundled JSON files.
  var entityMap = require("./entities/entities.json");
  var legacyMap = require("./entities/legacy.json");
  var xmlMap = require("./entities/xml.json");
  var decodeMap = require("./entities/decode.json");

  // Tokenizer states. Each one receives the next integer from the counter `i`.
  // Where a trailing comment names a character, it is the character whose
  // consumption leads into that state.
  var i = 0;

  var TEXT = i++;
  var BEFORE_TAG_NAME = i++; // after <
  var IN_TAG_NAME = i++;
  var IN_SELF_CLOSING_TAG = i++;
  var BEFORE_CLOSING_TAG_NAME = i++;
  var IN_CLOSING_TAG_NAME = i++;
  var AFTER_CLOSING_TAG_NAME = i++;

  // attributes
  var BEFORE_ATTRIBUTE_NAME = i++;
  var IN_ATTRIBUTE_NAME = i++;
  var AFTER_ATTRIBUTE_NAME = i++;
  var BEFORE_ATTRIBUTE_VALUE = i++;
  var IN_ATTRIBUTE_VALUE_DQ = i++; // "
  var IN_ATTRIBUTE_VALUE_SQ = i++; // '
  var IN_ATTRIBUTE_VALUE_NQ = i++;

  // declarations
  var BEFORE_DECLARATION = i++; // !
  var IN_DECLARATION = i++;

  // processing instructions
  var IN_PROCESSING_INSTRUCTION = i++; // ?

  // comments
  var BEFORE_COMMENT = i++;
  var IN_COMMENT = i++;
  var AFTER_COMMENT_1 = i++;
  var AFTER_COMMENT_2 = i++;

  // cdata
  var BEFORE_CDATA_1 = i++; // [
  var BEFORE_CDATA_2 = i++; // C
  var BEFORE_CDATA_3 = i++; // D
  var BEFORE_CDATA_4 = i++; // A
  var BEFORE_CDATA_5 = i++; // T
  var BEFORE_CDATA_6 = i++; // A
  var IN_CDATA = i++; // [
  var AFTER_CDATA_1 = i++; // ]
  var AFTER_CDATA_2 = i++; // ]

  // special tags (script / style)
  var BEFORE_SPECIAL = i++; // S
  var BEFORE_SPECIAL_END = i++; // S

  var BEFORE_SCRIPT_1 = i++; // C
  var BEFORE_SCRIPT_2 = i++; // R
  var BEFORE_SCRIPT_3 = i++; // I
  var BEFORE_SCRIPT_4 = i++; // P
  var BEFORE_SCRIPT_5 = i++; // T
  var AFTER_SCRIPT_1 = i++; // C
  var AFTER_SCRIPT_2 = i++; // R
  var AFTER_SCRIPT_3 = i++; // I
  var AFTER_SCRIPT_4 = i++; // P
  var AFTER_SCRIPT_5 = i++; // T

  var BEFORE_STYLE_1 = i++; // T
  var BEFORE_STYLE_2 = i++; // Y
  var BEFORE_STYLE_3 = i++; // L
  var BEFORE_STYLE_4 = i++; // E
  var AFTER_STYLE_1 = i++; // T
  var AFTER_STYLE_2 = i++; // Y
  var AFTER_STYLE_3 = i++; // L
  var AFTER_STYLE_4 = i++; // E

  // entities
  var BEFORE_ENTITY = i++; // &
  var BEFORE_NUMERIC_ENTITY = i++; // #
  var IN_NAMED_ENTITY = i++;
  var IN_NUMERIC_ENTITY = i++;
  var IN_HEX_ENTITY = i++; // X

  // Values of `_special`: which raw-text element (if any) is currently open.
  var j = 0;

  var SPECIAL_NONE = j++;
  var SPECIAL_SCRIPT = j++;
  var SPECIAL_STYLE = j++;
  /**
   * Tells whether `c` is one of the five characters the tokenizer treats as
   * whitespace: space, line feed, tab, form feed or carriage return.
   *
   * @param {string} c A single character.
   * @returns {boolean} `true` for whitespace, `false` for anything else.
   */
  function whitespace(c){
    switch(c){
      case " ":
      case "\n":
      case "\t":
      case "\f":
      case "\r":
        return true;
      default:
        return false;
    }
  }
  /**
   * Builds a state handler that does nothing but pick the next state.
   *
   * The returned function must be invoked with the tokenizer as `this`. It sets
   * `this._state` to SUCCESS when the character equals `upper` or its
   * lower-case form, and to FAILURE otherwise. The character is consumed in
   * both cases (the index is not touched).
   *
   * @param {string} upper Character to match (its lower-case form also matches).
   * @param {number} SUCCESS State selected on a match.
   * @param {number} FAILURE State selected on a mismatch.
   * @returns {function(string): void} State handler.
   */
  function ifElseState(upper, SUCCESS, FAILURE){
    var lower = upper.toLowerCase();

    // When `upper` has no distinct lower-case form both comparisons coincide,
    // so a single closure covers the cased and uncased variants alike.
    return function(c){
      var matched = c === lower || c === upper;
      this._state = matched ? SUCCESS : FAILURE;
    };
  }
  /**
   * Builds a state handler for one letter of a special tag name.
   *
   * The returned function must be invoked with the tokenizer as `this`. On a
   * match (`upper` or its lower-case form) it moves to NEXT_STATE. On a
   * mismatch it falls back to IN_TAG_NAME and steps `_index` back so the same
   * character is processed again by that state.
   *
   * @param {string} upper Expected letter (its lower-case form also matches).
   * @param {number} NEXT_STATE State selected on a match.
   * @returns {function(string): void} State handler.
   */
  function consumeSpecialNameChar(upper, NEXT_STATE){
    var lower = upper.toLowerCase();

    return function(c){
      var matches = c === upper || c === lower;
      if(!matches){
        this._index--; // consume the token again
      }
      this._state = matches ? NEXT_STATE : IN_TAG_NAME;
    };
  }
  /**
   * Streaming tokenizer: characters are fed in through `write()` and tokens
   * are reported through the methods of `cbs`.
   *
   * @param {Object} [options] Only `xmlMode` and `decodeEntities` are read;
   *     both are coerced to booleans.
   * @param {Object} cbs Receiver of the token callbacks.
   */
  function Tokenizer(options, cbs){
    var opts = options || {};

    // scanning position
    this._state = TEXT;
    this._buffer = "";
    this._sectionStart = 0;
    this._index = 0;

    // state to return to once an entity has been handled
    this._baseState = TEXT;
    this._special = SPECIAL_NONE;

    this._cbs = cbs;
    this._running = true;

    this._xmlMode = Boolean(opts.xmlMode);
    this._decodeEntities = Boolean(opts.decodeEntities);
  }
  // TEXT state. "<" opens a tag. "&" opens an entity, but only when entity
  // decoding is enabled and no special (script/style) element is open. In both
  // cases any pending text is flushed and a new section starts at this character.
  Tokenizer.prototype._stateText = function(c){
    var opensTag = c === "<";
    var opensEntity = !opensTag &&
      this._decodeEntities &&
      this._special === SPECIAL_NONE &&
      c === "&";

    if(!opensTag && !opensEntity){
      return;
    }

    if(this._index > this._sectionStart){
      this._cbs.ontext(this._getSection());
    }
    if(opensEntity){
      this._baseState = TEXT;
    }
    this._state = opensTag ? BEFORE_TAG_NAME : BEFORE_ENTITY;
    this._sectionStart = this._index;
  };
  // State right after "<": decides what kind of construct is starting.
  Tokenizer.prototype._stateBeforeTagName = function(c){
    if(c === "/"){
      this._state = BEFORE_CLOSING_TAG_NAME;
      return;
    }

    // Inside a special element only "</" is meaningful; everything else,
    // like "<>" or "< ", is ordinary text.
    if(this._special !== SPECIAL_NONE || c === ">" || whitespace(c)){
      this._state = TEXT;
      return;
    }

    switch(c){
      case "!":
        this._state = BEFORE_DECLARATION;
        this._sectionStart = this._index + 1;
        break;
      case "?":
        this._state = IN_PROCESSING_INSTRUCTION;
        this._sectionStart = this._index + 1;
        break;
      case "<":
        // "<<": the section so far is emitted as text, the state is unchanged
        this._cbs.ontext(this._getSection());
        this._sectionStart = this._index;
        break;
      default:
        // outside XML mode a leading "s" may start <script> or <style>
        var maybeSpecial = !this._xmlMode && (c === "s" || c === "S");
        this._state = maybeSpecial ? BEFORE_SPECIAL : IN_TAG_NAME;
        this._sectionStart = this._index;
    }
  };
  // Inside an opening tag name: "/", ">" or whitespace terminates it. The name
  // is emitted and the terminator is processed again as BEFORE_ATTRIBUTE_NAME.
  Tokenizer.prototype._stateInTagName = function(c){
    var endsName = c === "/" || c === ">" || whitespace(c);
    if(!endsName){
      return;
    }
    this._emitToken("onopentagname");
    this._state = BEFORE_ATTRIBUTE_NAME;
    this._index--;
  };
  // State right after "</". Leading whitespace is skipped and "</>" yields nothing.
  Tokenizer.prototype._stateBeforeCloseingTagName = function(c){
    if(whitespace(c)){
      return;
    }
    if(c === ">"){
      this._state = TEXT;
      return;
    }
    if(this._special === SPECIAL_NONE){
      this._state = IN_CLOSING_TAG_NAME;
      this._sectionStart = this._index;
      return;
    }

    // Inside script/style only a closing tag starting with "s" is followed up;
    // anything else is re-read as text.
    if(c === "s" || c === "S"){
      this._state = BEFORE_SPECIAL_END;
    } else {
      this._state = TEXT;
      this._index--;
    }
  };
  // Inside a closing tag name: ">" or whitespace terminates it. The name is
  // emitted and the terminator is processed again as AFTER_CLOSING_TAG_NAME.
  Tokenizer.prototype._stateInCloseingTagName = function(c){
    var endsName = c === ">" || whitespace(c);
    if(!endsName){
      return;
    }
    this._emitToken("onclosetag");
    this._state = AFTER_CLOSING_TAG_NAME;
    this._index--;
  };
  // After a closing tag name: everything up to and including ">" is ignored,
  // then text resumes with the following character.
  Tokenizer.prototype._stateAfterCloseingTagName = function(c){
    if(c !== ">"){
      return;
    }
    this._state = TEXT;
    this._sectionStart = this._index + 1;
  };
  // Between attributes inside an opening tag.
  Tokenizer.prototype._stateBeforeAttributeName = function(c){
    switch(c){
      case ">":
        this._cbs.onopentagend();
        this._state = TEXT;
        this._sectionStart = this._index + 1;
        break;
      case "/":
        this._state = IN_SELF_CLOSING_TAG;
        break;
      default:
        // whitespace is skipped, any other character starts an attribute name
        if(!whitespace(c)){
          this._state = IN_ATTRIBUTE_NAME;
          this._sectionStart = this._index;
        }
    }
  };
  // After "/" inside an opening tag: ">" completes a self-closing tag. Any
  // other non-whitespace character is re-read as the start of an attribute.
  Tokenizer.prototype._stateInSelfClosingTag = function(c){
    if(c === ">"){
      this._cbs.onselfclosingtag();
      this._state = TEXT;
      this._sectionStart = this._index + 1;
      return;
    }
    if(whitespace(c)){
      return;
    }
    this._state = BEFORE_ATTRIBUTE_NAME;
    this._index--;
  };
  // Inside an attribute name: "=", "/", ">" or whitespace terminates it. A
  // non-empty name is emitted and the terminator is processed again as
  // AFTER_ATTRIBUTE_NAME.
  Tokenizer.prototype._stateInAttributeName = function(c){
    var endsName = c === "=" || c === "/" || c === ">" || whitespace(c);
    if(!endsName){
      return;
    }

    if(this._index > this._sectionStart){
      this._cbs.onattribname(this._getSection());
    }
    this._sectionStart = -1; // no open section until a value starts
    this._state = AFTER_ATTRIBUTE_NAME;
    this._index--;
  };
  // After an attribute name: "=" introduces a value and whitespace is skipped.
  // Anything else means the attribute has no value, so it is closed.
  Tokenizer.prototype._stateAfterAttributeName = function(c){
    if(c === "="){
      this._state = BEFORE_ATTRIBUTE_VALUE;
      return;
    }
    if(whitespace(c)){
      return;
    }

    this._cbs.onattribend();
    if(c === "/" || c === ">"){
      // let BEFORE_ATTRIBUTE_NAME handle the end of the tag
      this._state = BEFORE_ATTRIBUTE_NAME;
      this._index--;
    } else {
      // the current character already belongs to the next attribute name
      this._state = IN_ATTRIBUTE_NAME;
      this._sectionStart = this._index;
    }
  };
  // After "=": selects the quoting style of the attribute value. For quoted
  // values the section starts after the quote; for unquoted values it starts
  // at the current character.
  Tokenizer.prototype._stateBeforeAttributeValue = function(c){
    switch(c){
      case "\"":
        this._state = IN_ATTRIBUTE_VALUE_DQ;
        this._sectionStart = this._index + 1;
        break;
      case "'":
        this._state = IN_ATTRIBUTE_VALUE_SQ;
        this._sectionStart = this._index + 1;
        break;
      default:
        if(!whitespace(c)){
          this._state = IN_ATTRIBUTE_VALUE_NQ;
          this._sectionStart = this._index;
        }
    }
  };
  // Inside a double-quoted attribute value. The closing quote ends the
  // attribute. With entity decoding enabled, "&" flushes the value so far and
  // switches to entity parsing, remembering this state as the one to return to.
  Tokenizer.prototype._stateInAttributeValueDoubleQuotes = function(c){
    var closesValue = c === "\"";
    if(!closesValue && !(this._decodeEntities && c === "&")){
      return;
    }

    this._emitToken("onattribdata");
    if(closesValue){
      this._cbs.onattribend();
      this._state = BEFORE_ATTRIBUTE_NAME;
      return;
    }
    this._baseState = this._state;
    this._state = BEFORE_ENTITY;
    this._sectionStart = this._index;
  };
  // Inside a single-quoted attribute value. The closing quote ends the
  // attribute. With entity decoding enabled, "&" flushes the value so far and
  // switches to entity parsing, remembering this state as the one to return to.
  Tokenizer.prototype._stateInAttributeValueSingleQuotes = function(c){
    var closesValue = c === "'";
    if(!closesValue && !(this._decodeEntities && c === "&")){
      return;
    }

    this._emitToken("onattribdata");
    if(closesValue){
      this._cbs.onattribend();
      this._state = BEFORE_ATTRIBUTE_NAME;
      return;
    }
    this._baseState = this._state;
    this._state = BEFORE_ENTITY;
    this._sectionStart = this._index;
  };
  // Inside an unquoted attribute value. Whitespace or ">" ends the attribute
  // and is processed again as BEFORE_ATTRIBUTE_NAME. With entity decoding
  // enabled, "&" flushes the value so far and switches to entity parsing.
  Tokenizer.prototype._stateInAttributeValueNoQuotes = function(c){
    var closesValue = whitespace(c) || c === ">";
    if(!closesValue && !(this._decodeEntities && c === "&")){
      return;
    }

    this._emitToken("onattribdata");
    if(closesValue){
      this._cbs.onattribend();
      this._state = BEFORE_ATTRIBUTE_NAME;
      this._index--;
      return;
    }
    this._baseState = this._state;
    this._state = BEFORE_ENTITY;
    this._sectionStart = this._index;
  };
  // After "<!": "[" may start a CDATA section and "-" may start a comment.
  // Anything else is treated as a declaration.
  Tokenizer.prototype._stateBeforeDeclaration = function(c){
    if(c === "["){
      this._state = BEFORE_CDATA_1;
    } else if(c === "-"){
      this._state = BEFORE_COMMENT;
    } else {
      this._state = IN_DECLARATION;
    }
  };
  // Inside a declaration: ">" ends it. The current section is reported via
  // `ondeclaration` and text resumes after the ">".
  Tokenizer.prototype._stateInDeclaration = function(c){
    if(c !== ">"){
      return;
    }
    this._cbs.ondeclaration(this._getSection());
    this._state = TEXT;
    this._sectionStart = this._index + 1;
  };
  // Inside a processing instruction: ">" ends it. The current section is
  // reported via `onprocessinginstruction` and text resumes after the ">".
  Tokenizer.prototype._stateInProcessingInstruction = function(c){
    if(c !== ">"){
      return;
    }
    this._cbs.onprocessinginstruction(this._getSection());
    this._state = TEXT;
    this._sectionStart = this._index + 1;
  };
  // After "<!-": a second "-" opens a comment whose content starts after it.
  // Anything else turns the construct into a declaration.
  Tokenizer.prototype._stateBeforeComment = function(c){
    if(c !== "-"){
      this._state = IN_DECLARATION;
      return;
    }
    this._state = IN_COMMENT;
    this._sectionStart = this._index + 1;
  };
  // Inside a comment: a "-" may be the start of the closing "-->".
  Tokenizer.prototype._stateInComment = function(c){
    if(c === "-"){
      this._state = AFTER_COMMENT_1;
    }
  };

  // After one "-" inside a comment: a second "-" advances, anything else
  // returns to the comment body.
  Tokenizer.prototype._stateAfterComment1 = function(c){
    this._state = c === "-" ? AFTER_COMMENT_2 : IN_COMMENT;
  };

  // After "--" inside a comment.
  Tokenizer.prototype._stateAfterComment2 = function(c){
    switch(c){
      case ">":
        // the two trailing dashes are not part of the comment data
        var commentEnd = this._index - 2;
        this._cbs.oncomment(this._buffer.substring(this._sectionStart, commentEnd));
        this._state = TEXT;
        this._sectionStart = this._index + 1;
        break;
      case "-":
        // stay in AFTER_COMMENT_2 (`--->`)
        break;
      default:
        this._state = IN_COMMENT;
    }
  };
  // After "<![": each state accepts the next letter of "CDATA" (in either
  // case) and falls back to a declaration on any other character.
  [
    ["C", "_stateBeforeCdata1", BEFORE_CDATA_2],
    ["D", "_stateBeforeCdata2", BEFORE_CDATA_3],
    ["A", "_stateBeforeCdata3", BEFORE_CDATA_4],
    ["T", "_stateBeforeCdata4", BEFORE_CDATA_5],
    ["A", "_stateBeforeCdata5", BEFORE_CDATA_6]
  ].forEach(function(step){
    Tokenizer.prototype[step[1]] = ifElseState(step[0], step[2], IN_DECLARATION);
  });

  // After "<![CDATA": "[" opens the section, whose content starts after it.
  Tokenizer.prototype._stateBeforeCdata6 = function(c){
    if(c !== "["){
      this._state = IN_DECLARATION;
      return;
    }
    this._state = IN_CDATA;
    this._sectionStart = this._index + 1;
  };
  // Inside a CDATA section: a "]" may be the start of the closing "]]>".
  Tokenizer.prototype._stateInCdata = function(c){
    if(c === "]"){
      this._state = AFTER_CDATA_1;
    }
  };

  // After one "]" inside CDATA: a second "]" advances, anything else returns
  // to the CDATA body.
  Tokenizer.prototype._stateAfterCdata1 = function(c){
    this._state = c === "]" ? AFTER_CDATA_2 : IN_CDATA;
  };

  // After "]]" inside CDATA.
  Tokenizer.prototype._stateAfterCdata2 = function(c){
    switch(c){
      case ">":
        // the two trailing brackets are not part of the CDATA content
        var cdataEnd = this._index - 2;
        this._cbs.oncdata(this._buffer.substring(this._sectionStart, cdataEnd));
        this._state = TEXT;
        this._sectionStart = this._index + 1;
        break;
      case "]":
        // stay in AFTER_CDATA_2 (`]]]>`)
        break;
      default:
        this._state = IN_CDATA;
    }
  };
  // After "<s": "c" may continue "script" and "t" may continue "style". Any
  // other character is re-read as part of an ordinary tag name.
  Tokenizer.prototype._stateBeforeSpecial = function(c){
    switch(c){
      case "c":
      case "C":
        this._state = BEFORE_SCRIPT_1;
        break;
      case "t":
      case "T":
        this._state = BEFORE_STYLE_1;
        break;
      default:
        this._state = IN_TAG_NAME;
        this._index--; // consume the token again
    }
  };
  // After "</s" inside a special element: only the letter that continues the
  // name of the currently open element is followed up. Anything else is text.
  Tokenizer.prototype._stateBeforeSpecialEnd = function(c){
    var next = TEXT;
    if(this._special === SPECIAL_SCRIPT && (c === "c" || c === "C")){
      next = AFTER_SCRIPT_1;
    } else if(this._special === SPECIAL_STYLE && (c === "t" || c === "T")){
      next = AFTER_STYLE_1;
    }
    this._state = next;
  };
  // After "<sc": each state accepts the next letter of "script". On a mismatch
  // the character is re-read as part of an ordinary tag name.
  [
    ["R", "_stateBeforeScript1", BEFORE_SCRIPT_2],
    ["I", "_stateBeforeScript2", BEFORE_SCRIPT_3],
    ["P", "_stateBeforeScript3", BEFORE_SCRIPT_4],
    ["T", "_stateBeforeScript4", BEFORE_SCRIPT_5]
  ].forEach(function(step){
    Tokenizer.prototype[step[1]] = consumeSpecialNameChar(step[0], step[2]);
  });

  // After "<script": the tag counts as a script tag only if the name ends
  // here. Either way the character is re-read by IN_TAG_NAME.
  Tokenizer.prototype._stateBeforeScript5 = function(c){
    var nameEnds = c === "/" || c === ">" || whitespace(c);
    if(nameEnds){
      this._special = SPECIAL_SCRIPT;
    }
    this._state = IN_TAG_NAME;
    this._index--; // consume the token again
  };
  // After "</sc" inside a script element: each state accepts the next letter
  // of "script" and returns to text on any other character.
  [
    ["R", "_stateAfterScript1", AFTER_SCRIPT_2],
    ["I", "_stateAfterScript2", AFTER_SCRIPT_3],
    ["P", "_stateAfterScript3", AFTER_SCRIPT_4],
    ["T", "_stateAfterScript4", AFTER_SCRIPT_5]
  ].forEach(function(step){
    Tokenizer.prototype[step[1]] = ifElseState(step[0], step[2], TEXT);
  });

  // After "</script": ">" or whitespace confirms the closing tag.
  Tokenizer.prototype._stateAfterScript5 = function(c){
    var nameEnds = c === ">" || whitespace(c);
    if(!nameEnds){
      this._state = TEXT;
      return;
    }
    this._special = SPECIAL_NONE;
    this._state = IN_CLOSING_TAG_NAME;
    // rewind the section over the six letters of "script"
    this._sectionStart = this._index - 6;
    this._index--; // reconsume the token
  };
  // After "<st": each state accepts the next letter of "style". On a mismatch
  // the character is re-read as part of an ordinary tag name.
  [
    ["Y", "_stateBeforeStyle1", BEFORE_STYLE_2],
    ["L", "_stateBeforeStyle2", BEFORE_STYLE_3],
    ["E", "_stateBeforeStyle3", BEFORE_STYLE_4]
  ].forEach(function(step){
    Tokenizer.prototype[step[1]] = consumeSpecialNameChar(step[0], step[2]);
  });

  // After "<style": the tag counts as a style tag only if the name ends here.
  // Either way the character is re-read by IN_TAG_NAME.
  Tokenizer.prototype._stateBeforeStyle4 = function(c){
    var nameEnds = c === "/" || c === ">" || whitespace(c);
    if(nameEnds){
      this._special = SPECIAL_STYLE;
    }
    this._state = IN_TAG_NAME;
    this._index--; // consume the token again
  };
  // After "</st" inside a style element: each state accepts the next letter
  // of "style" and returns to text on any other character.
  [
    ["Y", "_stateAfterStyle1", AFTER_STYLE_2],
    ["L", "_stateAfterStyle2", AFTER_STYLE_3],
    ["E", "_stateAfterStyle3", AFTER_STYLE_4]
  ].forEach(function(step){
    Tokenizer.prototype[step[1]] = ifElseState(step[0], step[2], TEXT);
  });

  // After "</style": ">" or whitespace confirms the closing tag.
  Tokenizer.prototype._stateAfterStyle4 = function(c){
    var nameEnds = c === ">" || whitespace(c);
    if(!nameEnds){
      this._state = TEXT;
      return;
    }
    this._special = SPECIAL_NONE;
    this._state = IN_CLOSING_TAG_NAME;
    // rewind the section over the five letters of "style"
    this._sectionStart = this._index - 5;
    this._index--; // reconsume the token
  };
  // After "&": "#" announces a numeric entity, anything else a named one.
  // The character is consumed either way.
  Tokenizer.prototype._stateBeforeEntity = function(c){
    this._state = c === "#" ? BEFORE_NUMERIC_ENTITY : IN_NAMED_ENTITY;
  };

  // After "&#": "x" or "X" selects hexadecimal digits, anything else decimal.
  // The character is consumed either way.
  Tokenizer.prototype._stateBeforeNumericEntity = function(c){
    var isHex = c === "x" || c === "X";
    this._state = isHex ? IN_HEX_ENTITY : IN_NUMERIC_ENTITY;
  };
  // Exact-match lookup of a named entity; used for ";"-terminated entities and
  // for entities within attributes. The name is the text between the "&" at
  // `_sectionStart` and `_index`. On a hit the decoded value is emitted and
  // the section restarts one past `_index`; on a miss nothing changes.
  Tokenizer.prototype._parseNamedEntityStrict = function(){
    var nameStart = this._sectionStart + 1; // skip the "&"
    if(nameStart >= this._index){
      return;
    }

    var name = this._buffer.substring(nameStart, this._index);
    var table = this._xmlMode ? xmlMap : entityMap;
    if(!table.hasOwnProperty(name)){
      return;
    }

    this._emitPartial(table[name]);
    this._sectionStart = this._index + 1;
  };
  // Parses legacy entities (without trailing semicolon): tries the longest
  // prefix of the text after "&" first (at most 6 characters, at least 2). On
  // the first hit the decoded value is emitted and `_sectionStart` advances by
  // the matched length plus 2; on a miss nothing changes.
  Tokenizer.prototype._parseLegacyEntity = function(){
    var nameStart = this._sectionStart + 1;
    // the max length of legacy entities is 6
    var longest = Math.min(this._index - nameStart, 6);

    // the min length of legacy entities is 2
    for(var length = longest; length >= 2; length--){
      var candidate = this._buffer.substr(nameStart, length);
      if(legacyMap.hasOwnProperty(candidate)){
        this._emitPartial(legacyMap[candidate]);
        this._sectionStart += length + 2;
        return;
      }
    }
  };
  // Inside a named entity (after "&" and one further character).
  //
  // ";" ends the name: an exact lookup is tried first and, outside XML mode,
  // a legacy (prefix) lookup second. Any other non-alphanumeric character also
  // ends the name and is processed again by the base state; outside XML mode a
  // lookup is attempted (exact inside attribute values unless the character is
  // "=", legacy inside text).
  //
  // `_sectionStart` is only adjusted when a lookup actually decoded something.
  // Decrementing it after a failed lookup would pull the character in front of
  // the "&" - which has already been emitted - into the next section, and not
  // compensating a legacy hit on the ";" path would drop the character that
  // follows the matched prefix.
  Tokenizer.prototype._stateInNamedEntity = function(c){
    var entityStart = this._sectionStart;

    if(c === ";"){
      this._parseNamedEntityStrict();
      if(this._sectionStart + 1 < this._index && !this._xmlMode){
        this._parseLegacyEntity();
        // The legacy parser advances one position past the character that
        // follows the matched prefix. Step back unless the prefix was the
        // whole name, in which case the ";" itself is what gets skipped.
        if(this._sectionStart !== entityStart && this._sectionStart <= this._index){
          this._sectionStart--;
        }
      }
      this._state = this._baseState;
    } else if((c < "a" || c > "z") && (c < "A" || c > "Z") && (c < "0" || c > "9")){
      if(!this._xmlMode){
        if(this._baseState !== TEXT){
          if(c !== "="){
            this._parseNamedEntityStrict();
          }
        } else {
          this._parseLegacyEntity();
        }
        // Both parsers leave the section one position too far for this path
        // (the terminator is reconsumed below), so step back after a hit only.
        if(this._sectionStart !== entityStart){
          this._sectionStart--;
        }
      }
      this._state = this._baseState;
      this._index--;
    }
  };
  // modified version of https://github.com/mathiasbynens/he/blob/master/src/he.js#L94-L119
  /**
   * Converts a numeric character reference to a string.
   *
   * Surrogate code points and values above 0x10FFFF yield U+FFFD. Values
   * listed in `decodeMap` are replaced by their mapped code point first.
   * Code points above 0xFFFF are encoded as a surrogate pair.
   *
   * @param {number} codePoint Parsed value of the reference.
   * @returns {string} The decoded character(s).
   */
  function decodeCodePoint(codePoint){
    var isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
    if(isSurrogate || codePoint > 0x10FFFF){
      return "\uFFFD";
    }

    if(codePoint in decodeMap){
      codePoint = decodeMap[codePoint];
    }

    if(codePoint > 0xFFFF){
      var offset = codePoint - 0x10000;
      var high = String.fromCharCode(((offset >>> 10) & 0x3FF) | 0xD800);
      var low = String.fromCharCode(0xDC00 | (offset & 0x3FF));
      return high + low;
    }

    return String.fromCharCode(codePoint);
  }
  // Decodes the digits between `_sectionStart + offset` and `_index` in the
  // given base. If there are digits and they parse to a number, the decoded
  // character is emitted and the section restarts at `_index`. The tokenizer
  // always returns to the base state.
  Tokenizer.prototype._decodeNumericEntity = function(offset, base){
    var digitsStart = this._sectionStart + offset;

    if(digitsStart !== this._index){
      var digits = this._buffer.substring(digitsStart, this._index);
      var codePoint = parseInt(digits, base);
      if(!isNaN(codePoint)){
        this._emitPartial(decodeCodePoint(codePoint));
        this._sectionStart = this._index;
      }
    }

    this._state = this._baseState;
  };
  // Inside a decimal numeric entity (after "&#" and one further character).
  //
  // ";" ends the entity. Any other non-digit also ends it and is processed
  // again by the base state; outside XML mode the digits seen so far are
  // decoded, in XML mode the text is left untouched.
  Tokenizer.prototype._stateInNumericEntity = function(c){
    if(c === ";"){
      this._decodeNumericEntity(2, 10);
      // `_decodeNumericEntity` moves the section to `_index` only when it
      // emitted a character; only then is the ";" part of the entity and to be
      // skipped. Skipping unconditionally would drop the "&" of an undecodable
      // sequence such as "&#a;" from the text.
      if(this._sectionStart === this._index){
        this._sectionStart++;
      }
    } else if(c < "0" || c > "9"){
      if(!this._xmlMode){
        this._decodeNumericEntity(2, 10);
      } else {
        this._state = this._baseState;
      }
      this._index--;
    }
  };
  // Inside a hexadecimal numeric entity (after "&#x").
  //
  // ";" ends the entity. Any other non-hex character also ends it and is
  // processed again by the base state; outside XML mode the digits seen so far
  // are decoded, in XML mode the text is left untouched.
  Tokenizer.prototype._stateInHexEntity = function(c){
    if(c === ";"){
      this._decodeNumericEntity(3, 16);
      // `_decodeNumericEntity` moves the section to `_index` only when it
      // emitted a character; only then is the ";" part of the entity and to be
      // skipped. Skipping unconditionally would drop the "&" of an undecodable
      // sequence such as "&#x;" from the text.
      if(this._sectionStart === this._index){
        this._sectionStart++;
      }
    } else if((c < "a" || c > "f") && (c < "A" || c > "F") && (c < "0" || c > "9")){
      if(!this._xmlMode){
        this._decodeNumericEntity(3, 16);
      } else {
        this._state = this._baseState;
      }
      this._index--;
    }
  };
  // Trims the buffer after a `write()` pass.
  //
  // - No open section (`_sectionStart < 0`): the buffer is dropped and
  //   `_sectionStart` stays negative.
  // - TEXT state: pending text is emitted and the buffer is dropped.
  // - Any other state: only the unfinished section is kept, and `_index` is
  //   rebased onto it.
  // Whenever a section is open, `_sectionStart` ends up at 0.
  Tokenizer.prototype._cleanup = function () {
    var start = this._sectionStart;
    var hasSection = start >= 0;
    var hasPending = hasSection && start !== this._index;
    var inText = this._state === TEXT;

    if(hasPending && inText){
      this._cbs.ontext(this._buffer.substr(start));
    }

    if(hasPending && !inText){
      // remove everything unnecessary
      this._buffer = this._buffer.substr(start);
      this._index -= start;
    } else {
      this._buffer = "";
      this._index = 0;
    }

    if(hasSection){
      this._sectionStart = 0;
    }
  };
  //TODO make events conditional
  // Appends `chunk` to the buffer and feeds it, one character at a time, to
  // the handler of the current state for as long as the tokenizer is running.
  // A handler may step `_index` back to have a character processed again.
  // An unrecognised state is reported through `onerror`. The buffer is trimmed
  // afterwards.
  Tokenizer.prototype.write = function(chunk){
    this._buffer += chunk;

    while(this._index < this._buffer.length && this._running){
      var c = this._buffer.charAt(this._index);

      switch(this._state){
        case TEXT: this._stateText(c); break;
        case BEFORE_TAG_NAME: this._stateBeforeTagName(c); break;
        case IN_TAG_NAME: this._stateInTagName(c); break;
        case BEFORE_CLOSING_TAG_NAME: this._stateBeforeCloseingTagName(c); break;
        case IN_CLOSING_TAG_NAME: this._stateInCloseingTagName(c); break;
        case AFTER_CLOSING_TAG_NAME: this._stateAfterCloseingTagName(c); break;
        case IN_SELF_CLOSING_TAG: this._stateInSelfClosingTag(c); break;

        // attributes
        case BEFORE_ATTRIBUTE_NAME: this._stateBeforeAttributeName(c); break;
        case IN_ATTRIBUTE_NAME: this._stateInAttributeName(c); break;
        case AFTER_ATTRIBUTE_NAME: this._stateAfterAttributeName(c); break;
        case BEFORE_ATTRIBUTE_VALUE: this._stateBeforeAttributeValue(c); break;
        case IN_ATTRIBUTE_VALUE_DQ: this._stateInAttributeValueDoubleQuotes(c); break;
        case IN_ATTRIBUTE_VALUE_SQ: this._stateInAttributeValueSingleQuotes(c); break;
        case IN_ATTRIBUTE_VALUE_NQ: this._stateInAttributeValueNoQuotes(c); break;

        // declarations
        case BEFORE_DECLARATION: this._stateBeforeDeclaration(c); break;
        case IN_DECLARATION: this._stateInDeclaration(c); break;

        // processing instructions
        case IN_PROCESSING_INSTRUCTION: this._stateInProcessingInstruction(c); break;

        // comments
        case BEFORE_COMMENT: this._stateBeforeComment(c); break;
        case IN_COMMENT: this._stateInComment(c); break;
        case AFTER_COMMENT_1: this._stateAfterComment1(c); break;
        case AFTER_COMMENT_2: this._stateAfterComment2(c); break;

        // cdata
        case BEFORE_CDATA_1: this._stateBeforeCdata1(c); break;
        case BEFORE_CDATA_2: this._stateBeforeCdata2(c); break;
        case BEFORE_CDATA_3: this._stateBeforeCdata3(c); break;
        case BEFORE_CDATA_4: this._stateBeforeCdata4(c); break;
        case BEFORE_CDATA_5: this._stateBeforeCdata5(c); break;
        case BEFORE_CDATA_6: this._stateBeforeCdata6(c); break;
        case IN_CDATA: this._stateInCdata(c); break;
        case AFTER_CDATA_1: this._stateAfterCdata1(c); break;
        case AFTER_CDATA_2: this._stateAfterCdata2(c); break;

        // special tags
        case BEFORE_SPECIAL: this._stateBeforeSpecial(c); break;
        case BEFORE_SPECIAL_END: this._stateBeforeSpecialEnd(c); break;

        // script
        case BEFORE_SCRIPT_1: this._stateBeforeScript1(c); break;
        case BEFORE_SCRIPT_2: this._stateBeforeScript2(c); break;
        case BEFORE_SCRIPT_3: this._stateBeforeScript3(c); break;
        case BEFORE_SCRIPT_4: this._stateBeforeScript4(c); break;
        case BEFORE_SCRIPT_5: this._stateBeforeScript5(c); break;
        case AFTER_SCRIPT_1: this._stateAfterScript1(c); break;
        case AFTER_SCRIPT_2: this._stateAfterScript2(c); break;
        case AFTER_SCRIPT_3: this._stateAfterScript3(c); break;
        case AFTER_SCRIPT_4: this._stateAfterScript4(c); break;
        case AFTER_SCRIPT_5: this._stateAfterScript5(c); break;

        // style
        case BEFORE_STYLE_1: this._stateBeforeStyle1(c); break;
        case BEFORE_STYLE_2: this._stateBeforeStyle2(c); break;
        case BEFORE_STYLE_3: this._stateBeforeStyle3(c); break;
        case BEFORE_STYLE_4: this._stateBeforeStyle4(c); break;
        case AFTER_STYLE_1: this._stateAfterStyle1(c); break;
        case AFTER_STYLE_2: this._stateAfterStyle2(c); break;
        case AFTER_STYLE_3: this._stateAfterStyle3(c); break;
        case AFTER_STYLE_4: this._stateAfterStyle4(c); break;

        // entities
        case BEFORE_ENTITY: this._stateBeforeEntity(c); break;
        case BEFORE_NUMERIC_ENTITY: this._stateBeforeNumericEntity(c); break;
        case IN_NAMED_ENTITY: this._stateInNamedEntity(c); break;
        case IN_NUMERIC_ENTITY: this._stateInNumericEntity(c); break;
        case IN_HEX_ENTITY: this._stateInHexEntity(c); break;

        default:
          this._cbs.onerror(Error("unknown _state"), this._state);
      }

      this._index++;
    }

    this._cleanup();
  };
  // Clears the running flag; the loop in `write()` checks it before every
  // character and stops once it is false.
  Tokenizer.prototype.pause = function(){
    var running = false;
    this._running = running;
  };
  // Sets the running flag again. Nothing is parsed here; characters are only
  // processed inside `write()`.
  Tokenizer.prototype.resume = function(){
    var running = true;
    this._running = running;
  };
  // Finishes the input: writes the optional last chunk, emits whatever is
  // still pending in a way that fits the current state, then signals `onend`.
  Tokenizer.prototype.end = function(chunk){
    if(chunk){
      this.write(chunk);
    }

    // if there is remaining data, emit it in a reasonable way
    var hasTrailingData = this._sectionStart < this._index;
    if(hasTrailingData){
      this._handleTrailingData();
    }

    this._cbs.onend();
  };
  // Emits the unfinished section left at the end of the input through the
  // callback that matches the current state. An unfinished entity is decoded
  // first if possible (outside XML mode); whatever remains is then handled as
  // data of the base state. Anything not covered by a specific state is
  // emitted as text.
  Tokenizer.prototype._handleTrailingData = function(){
    var data = this._buffer.substr(this._sectionStart);

    if(this._state === IN_CDATA || this._state === AFTER_CDATA_1 || this._state === AFTER_CDATA_2){
      this._cbs.oncdata(data);
    } else if(this._state === IN_COMMENT || this._state === AFTER_COMMENT_1 || this._state === AFTER_COMMENT_2){
      this._cbs.oncomment(data);
    } else if(this._state === IN_TAG_NAME){
      this._cbs.onopentagname(data);
    } else if(this._state === BEFORE_ATTRIBUTE_NAME || this._state === BEFORE_ATTRIBUTE_VALUE || this._state === AFTER_ATTRIBUTE_NAME){
      this._cbs.onopentagend();
    } else if(this._state === IN_ATTRIBUTE_NAME){
      this._cbs.onattribname(data);
    } else if(this._state === IN_ATTRIBUTE_VALUE_SQ || this._state === IN_ATTRIBUTE_VALUE_DQ || this._state === IN_ATTRIBUTE_VALUE_NQ){
      this._cbs.onattribdata(data);
      this._cbs.onattribend();
    } else if(this._state === IN_CLOSING_TAG_NAME){
      this._cbs.onclosetag(data);
    } else if(this._state === IN_NAMED_ENTITY && !this._xmlMode){
      var entityStart = this._sectionStart;
      this._parseLegacyEntity();
      // The legacy parser advances one position too far on a hit, so step back
      // in that case only. Stepping back after a miss would make the section
      // start negative, and `substr` would then return just the last character
      // of the buffer instead of the whole undecoded "&..." text.
      if(this._sectionStart !== entityStart){
        this._sectionStart--;
      }
      if(this._sectionStart < this._index){
        this._state = this._baseState;
        this._handleTrailingData();
      }
    } else if(this._state === IN_NUMERIC_ENTITY && !this._xmlMode){
      this._decodeNumericEntity(2, 10);
      if(this._sectionStart < this._index){
        this._state = this._baseState;
        this._handleTrailingData();
      }
    } else if(this._state === IN_HEX_ENTITY && !this._xmlMode){
      this._decodeNumericEntity(3, 16);
      if(this._sectionStart < this._index){
        this._state = this._baseState;
        this._handleTrailingData();
      }
    } else {
      this._cbs.ontext(data);
    }
  };
  // Re-runs the constructor on this instance, keeping the current options and
  // callbacks; every other field returns to its initial value.
  Tokenizer.prototype.reset = function(){
    var options = {
      xmlMode: this._xmlMode,
      decodeEntities: this._decodeEntities
    };
    Tokenizer.call(this, options, this._cbs);
  };
  // Returns the buffer contents from `_sectionStart` up to (not including)
  // `_index`.
  Tokenizer.prototype._getSection = function(){
    var from = this._sectionStart;
    var to = this._index;
    return this._buffer.substring(from, to);
  };
  // Passes the current section to the callback called `name` and marks the
  // section as closed (`_sectionStart = -1`).
  Tokenizer.prototype._emitToken = function(name){
    var section = this._getSection();
    this._cbs[name](section);
    this._sectionStart = -1;
  };
  // Emits a decoded entity value: as text when the entity was found in text,
  // as attribute data otherwise (the base state is then an attribute value).
  Tokenizer.prototype._emitPartial = function(value){
    if(this._baseState === TEXT){
      this._cbs.ontext(value);
    } else {
      this._cbs.onattribdata(value); //TODO implement the new event
    }
  };