Hubbub
detect.c
Go to the documentation of this file.
1 /*
2  * This file is part of Hubbub.
3  * Licensed under the MIT License,
4  * http://www.opensource.org/licenses/mit-license.php
5  * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
6  */
7 
8 #include <assert.h>
9 #include <stdbool.h>
10 #include <string.h>
11 
12 #include <parserutils/charset/mibenum.h>
13 
14 #include <hubbub/types.h>
15 
16 #include "utils/utils.h"
17 
18 #include "detect.h"
19 
/* Forward declarations of the file-local helpers defined further down. */

/* Look for a UTF Byte Order Mark at the beginning of a buffer. */
static uint16_t hubbub_charset_read_bom(
		const uint8_t *data,
		size_t len);

/* Search for a meta charset within a buffer of data. */
static uint16_t hubbub_charset_scan_meta(
		const uint8_t *data,
		size_t len);

/* Parse the attributes on a meta tag. */
static uint16_t hubbub_charset_parse_attributes(
		const uint8_t **pos,
		const uint8_t *end);

/* Extract a single attribute (name and value) from the data stream. */
static bool hubbub_charset_get_attribute(
		const uint8_t **data,
		const uint8_t *end,
		const uint8_t **name,
		uint32_t *namelen,
		const uint8_t **value,
		uint32_t *valuelen);
28 
43 parserutils_error hubbub_charset_extract(const uint8_t *data, size_t len,
44  uint16_t *mibenum, uint32_t *source)
45 {
46  uint16_t charset = 0;
47 
48  if (data == NULL || mibenum == NULL || source == NULL)
49  return PARSERUTILS_BADPARM;
50 
63  /* 1. */
64 
65  /* If the source is dictated, there's nothing for us to do */
66  if (*source == HUBBUB_CHARSET_CONFIDENT ||
67  *source == HUBBUB_CHARSET_TENTATIVE) {
68  return PARSERUTILS_OK;
69  }
70 
71  /* 2. */
72 
75  /* 3. */
76 
77  /* We need at least 3 bytes of data */
78  if (len < 3)
79  goto default_encoding;
80 
81  /* First, look for a BOM */
82  charset = hubbub_charset_read_bom(data, len);
83  if (charset != 0) {
84  *mibenum = charset;
85  *source = HUBBUB_CHARSET_CONFIDENT;
86 
87  return PARSERUTILS_OK;
88  }
89 
90  /* 4. */
91 
92  /* No BOM was found, so we must look for a meta charset within
93  * the document itself. */
94  charset = hubbub_charset_scan_meta(data, len);
95  if (charset != 0) {
96  /* Fix charsets according to HTML5,
97  * section 8.2.2.2. Character encoding requirements */
99 
100  /* If we've encountered a meta charset for a non-ASCII-
101  * compatible encoding, don't trust it.
102  *
103  * Firstly, it should have been sent with a BOM (and thus
104  * detected above).
105  *
106  * Secondly, we've just used an ASCII-only parser to
107  * extract the encoding from the document. Therefore,
108  * the document plainly isn't what the meta charset
109  * claims it is.
110  *
111  * What we do in this case is to ignore the meta charset's
112  * claims and leave the charset determination to the
113  * autodetection routines (or the fallback case if they
114  * fail).
115  */
116  if (charset != parserutils_charset_mibenum_from_name(
117  "UTF-32", SLEN("UTF-32")) &&
118  charset != parserutils_charset_mibenum_from_name(
119  "UTF-32LE", SLEN("UTF-32LE")) &&
120  charset != parserutils_charset_mibenum_from_name(
121  "UTF-32BE", SLEN("UTF-32BE"))) {
122 
123  *mibenum = charset;
124  *source = HUBBUB_CHARSET_TENTATIVE;
125 
126  return PARSERUTILS_OK;
127  }
128  }
129 
130  /* No charset was specified within the document, attempt to
131  * autodetect the encoding from the data that we have available. */
132 
135  /* We failed to autodetect a charset, so use the default fallback */
136 default_encoding:
137 
138  /* 7. */
139 
140  charset = parserutils_charset_mibenum_from_name("Windows-1252",
141  SLEN("Windows-1252"));
142  if (charset == 0)
143  charset = parserutils_charset_mibenum_from_name("ISO-8859-1",
144  SLEN("ISO-8859-1"));
145 
146  *mibenum = charset;
147  *source = HUBBUB_CHARSET_TENTATIVE;
148 
149  return PARSERUTILS_OK;
150 }
151 
152 
/**
 * Inspect the beginning of a buffer of data for the presence of a
 * UTF Byte Order Mark.
 *
 * \param data  Pointer to buffer containing data
 * \param len   Buffer length, in bytes
 * \return MIB enum representing the encoding described by the BOM,
 *         or 0 if no BOM was found (or fewer than 3 bytes are available)
 */
uint16_t hubbub_charset_read_bom(const uint8_t *data, size_t len)
{
	/* Nothing to inspect, or fewer than the 3 bytes we require */
	if (data == NULL || len < 3)
		return 0;

	switch (data[0]) {
	case 0xFE:
		/* FE FF */
		if (data[1] == 0xFF) {
			return parserutils_charset_mibenum_from_name(
					"UTF-16BE", SLEN("UTF-16BE"));
		}
		break;
	case 0xFF:
		/* FF FE */
		if (data[1] == 0xFE) {
			return parserutils_charset_mibenum_from_name(
					"UTF-16LE", SLEN("UTF-16LE"));
		}
		break;
	case 0xEF:
		/* EF BB BF */
		if (data[1] == 0xBB && data[2] == 0xBF) {
			return parserutils_charset_mibenum_from_name(
					"UTF-8", SLEN("UTF-8"));
		}
		break;
	default:
		break;
	}

	return 0;
}
183 
/*
 * Scanner helper macros.
 *
 * PEEK and ADVANCE operate on two pointers named `pos` and `end` that
 * must be in scope where the macro is expanded; `a` must be a string
 * literal. ADVANCE additionally executes `return 0`, so it may only be
 * used inside a function for which that is a valid return statement.
 */

/**
 * True if the literal a is found (ASCII case-insensitively) at pos and
 * at least one further byte lies between the match and end.
 *
 * The bound is expressed as a remaining-length comparison so that no
 * pointer before the start of the buffer is ever formed.
 */
#define PEEK(a) \
	(pos < end && (size_t) (end - pos) > SLEN(a) && \
	strncasecmp((const char *) pos, (a), SLEN(a)) == 0)

/**
 * Move pos forwards until PEEK(a) holds. If the data runs out first,
 * return 0 from the enclosing function.
 */
#define ADVANCE(a) \
	do { \
		while (pos < end && (size_t) (end - pos) > SLEN(a)) { \
			if (PEEK(a)) \
				break; \
			pos++; \
		} \
		\
		if (pos >= end || (size_t) (end - pos) <= SLEN(a)) \
			return 0; \
	} while (0)

/**
 * True if a is one of the separator bytes recognised by the scanner:
 * 0x09, 0x0a, 0x0c, 0x0d, 0x20, or 0x2f ('/').
 *
 * The argument is evaluated more than once; do not pass an expression
 * with side effects.
 */
#define ISSPACE(a) \
	((a) == 0x09 || (a) == 0x0a || (a) == 0x0c || \
	(a) == 0x0d || (a) == 0x20 || (a) == 0x2f)
201 
/**
 * Search for a meta charset within a buffer of data.
 *
 * At most the first 512 bytes of the buffer are examined. Comments,
 * other tags and markup declarations are skipped; the attributes of
 * each <meta> tag are handed to hubbub_charset_parse_attributes().
 *
 * \param data  Pointer to buffer containing data
 * \param len   Length of buffer, in bytes
 * \return MIB enum representing the encoding, or 0 if none found
 */
uint16_t hubbub_charset_scan_meta(const uint8_t *data, size_t len)
{
	const uint8_t *pos = data;
	const uint8_t *end;
	uint16_t mibenum;

	if (data == NULL)
		return 0;

	end = pos + min(512, len);

	/* 1. */
	while (pos < end) {
		/* a */
		if (PEEK("<!--")) {
			/* Comment: skip to the closing "-->" */
			pos += SLEN("<!--");
			ADVANCE("-->");
		/* b */
		} else if (PEEK("<meta")) {
			if (pos + SLEN("<meta") >= end - 1)
				return 0;

			/* Only a separator may follow the tag name; anything
			 * else means this is some other element whose name
			 * merely begins with "meta". */
			if (ISSPACE(*(pos + SLEN("<meta")))) {
				/* 1 */
				pos += SLEN("<meta");

				mibenum = hubbub_charset_parse_attributes(
						&pos, end);
				if (mibenum != 0)
					return mibenum;

				if (pos >= end)
					return 0;
			}
		/* c */
		} else if ((PEEK("</") && (pos < end - 3 &&
				(0x41 <= (*(pos + 2) & ~ 0x20) &&
				(*(pos + 2) & ~ 0x20) <= 0x5A))) ||
				(pos < end - 2 && *pos == '<' &&
				(0x41 <= (*(pos + 1) & ~ 0x20) &&
				(*(pos + 1) & ~ 0x20) <= 0x5A))) {
			/* Start or end tag of some other element: '<' or "</"
			 * followed by an ASCII letter (clearing bit 0x20
			 * folds lower case onto upper case). */

			/* skip '<' */
			pos++;

			/* 1. Skip over the tag name */
			while (pos < end) {
				if (ISSPACE(*pos) ||
						*pos == '>' || *pos == '<')
					break;
				pos++;
			}

			if (pos >= end)
				return 0;

			/* 3 */
			if (*pos != '<') {
				const uint8_t *n;
				const uint8_t *v;
				uint32_t nl, vl;

				/* Consume and discard the tag's attributes */
				while (hubbub_charset_get_attribute(&pos, end,
						&n, &nl, &v, &vl))
					; /* do nothing */
			/* 2 */
			} else {
				/* Reprocess the '<' as the start of new
				 * markup, without advancing past it. */
				continue;
			}
		/* d */
		} else if (PEEK("<!") || PEEK("</") || PEEK("<?")) {
			pos++;
			ADVANCE(">");
		}

		/* e - do nothing */

		/* 2 */
		pos++;
	}

	return 0;
}
291 
/**
 * Parse attributes on a meta tag.
 *
 * Attributes are read one at a time with hubbub_charset_get_attribute().
 * A "charset" attribute is looked up directly; a "content" attribute is
 * passed to hubbub_charset_parse_content(). A UTF-16 result (any byte
 * order) is replaced by UTF-8.
 *
 * \param pos  Pointer to pointer to current location (updated on exit)
 * \param end  Pointer to end of data stream
 * \return MIB enum of detected encoding, or 0 if none found
 */
uint16_t hubbub_charset_parse_attributes(const uint8_t **pos,
		const uint8_t *end)
{
	const uint8_t *name;
	const uint8_t *value;
	uint32_t namelen, valuelen;
	uint16_t mibenum = 0;

	if (pos == NULL || *pos == NULL || end == NULL)
		return 0;

	/* 2 */
	while (hubbub_charset_get_attribute(pos, end,
			&name, &namelen, &value, &valuelen)) {
		/* 3 done by default */

		/* 4 */
		if (namelen == SLEN("charset") && valuelen > 0 &&
				strncasecmp((const char *) name, "charset",
					SLEN("charset")) == 0) {
			/* Strip leading separators. The length bound stops
			 * an all-separator value from running past its end
			 * and wrapping valuelen around. */
			while (valuelen > 0 && ISSPACE(*value)) {
				value++;
				valuelen--;
			}

			/* Strip trailing separators */
			while (valuelen > 0 && ISSPACE(value[valuelen - 1]))
				valuelen--;

			mibenum = parserutils_charset_mibenum_from_name(
					(const char *) value, valuelen);
		/* 5 */
		} else if (namelen == SLEN("content") && valuelen > 0 &&
				strncasecmp((const char *) name, "content",
					SLEN("content")) == 0) {
			mibenum = hubbub_charset_parse_content(value,
					valuelen);
		}

		/* 6. A UTF-16 claim is treated as UTF-8 */
		if (mibenum == parserutils_charset_mibenum_from_name(
				"UTF-16LE", SLEN("UTF-16LE")) ||
				mibenum ==
				parserutils_charset_mibenum_from_name(
				"UTF-16BE", SLEN("UTF-16BE")) ||
				mibenum ==
				parserutils_charset_mibenum_from_name(
				"UTF-16", SLEN("UTF-16"))) {
			mibenum = parserutils_charset_mibenum_from_name(
					"UTF-8", SLEN("UTF-8"));
		}

		/* 7 */
		if (mibenum != 0) {
			/* confidence = tentative; */
			return mibenum;
		}
	}

	return 0;
}
360 
/**
 * Parse a content= attribute's value.
 *
 * Expects the form `<anything>; charset = <name>`, where <name> is
 * either quoted with matching ' or " characters, or unquoted and
 * terminated by a separator byte, a ';' or the end of the value.
 *
 * \param value     Attribute's value
 * \param valuelen  Length of value, in bytes
 * \return MIB enum of detected encoding, or 0 if none found
 */
uint16_t hubbub_charset_parse_content(const uint8_t *value,
		uint32_t valuelen)
{
	const uint8_t *end;
	const uint8_t *tentative = NULL;
	uint32_t tentative_len = 0;

	if (value == NULL)
		return 0;

	end = value + valuelen;

	/* 1. Skip up to and including the first ';' */
	while (value < end) {
		if (*value == ';') {
			value++;
			break;
		}

		value++;
	}

	if (value >= end)
		return 0;

	/* 2 */
	while (value < end && ISSPACE(*value)) {
		value++;
	}

	if (value >= end)
		return 0;

	/* 3. "charset" must come next, with at least one byte after it.
	 * Compare remaining length rather than pointers so that value is
	 * never advanced beyond end. */
	if ((size_t) (end - value) <= SLEN("charset") ||
			strncasecmp((const char *) value,
				"charset", SLEN("charset")) != 0)
		return 0;

	value += SLEN("charset");

	/* 4 */
	while (value < end && ISSPACE(*value)) {
		value++;
	}

	if (value >= end)
		return 0;

	/* 5 */
	if (*value != '=')
		return 0;
	/* skip '=' */
	value++;

	/* 6 */
	while (value < end && ISSPACE(*value)) {
		value++;
	}

	if (value >= end)
		return 0;

	/* 7 */
	if (*value == '"' || *value == '\'') {
		/* a, b: quoted name -- runs up to the matching quote */
		const uint8_t quote = *value;

		tentative = value + 1;

		while (++value < end && *value != quote) {
			tentative_len++;
		}

		/* An unterminated quote yields no charset */
		if (value >= end)
			tentative = NULL;
	} else {
		/* c: unquoted name -- runs up to the first separator
		 * or ';', or the end of the value */
		tentative = value;

		while (value < end && !ISSPACE(*value) && *value != ';') {
			value++;
			tentative_len++;
		}
	}

	/* 8 */
	if (tentative != NULL) {
		return parserutils_charset_mibenum_from_name(
				(const char *) tentative, tentative_len);
	}

	/* 9 */
	return 0;
}
471 
/**
 * Extract an attribute from the data stream.
 *
 * \param data      Pointer to pointer to current location (updated on exit)
 * \param end       Pointer to end of data stream
 * \param name      Pointer to location to receive attribute name
 * \param namelen   Pointer to location to receive attribute name length
 * \param value     Pointer to location to receive attribute value
 * \param valuelen  Pointer to location to receive attribute value length
 * \return true if an attribute was extracted, false otherwise.
 *
 * ::name and ::value point into the data stream (::value points at an
 * empty string literal if the attribute has no value); neither is
 * NUL-terminated. The outputs are only meaningful when true is returned.
 * If the data runs out part-way through an attribute, false is returned.
 */
bool hubbub_charset_get_attribute(const uint8_t **data, const uint8_t *end,
		const uint8_t **name, uint32_t *namelen,
		const uint8_t **value, uint32_t *valuelen)
{
	const uint8_t *pos;

	if (data == NULL || *data == NULL || end == NULL || name == NULL ||
			namelen == NULL || value == NULL || valuelen == NULL)
		return false;

	pos = *data;

	/* 1. Skip leading spaces or '/' characters */
	while (pos < end && (ISSPACE(*pos) || *pos == '/')) {
		pos++;
	}

	if (pos >= end) {
		*data = pos;
		return false;
	}

	/* 2. Invalid element open character. Step back one byte so that
	 * a caller which advances by one after the attribute scan lands
	 * on the '<' itself. */
	if (*pos == '<') {
		*data = pos - 1;
		return false;
	}

	/* 3. End of element */
	if (*pos == '>') {
		*data = pos;
		return false;
	}

	/* 4. Initialise name & value to empty string */
	*name = pos;
	*namelen = 0;
	*value = (const uint8_t *) "";
	*valuelen = 0;

	/* 5. Extract name */
	while (pos < end) {
		/* a, b: '=' or a separator ends the name */
		if (*pos == '=' || ISSPACE(*pos)) {
			break;
		}

		/* c: attribute with no value */
		if (*pos == '/' || *pos == '<' || *pos == '>') {
			*data = pos;
			return true;
		}

		/* d is handled by strncasecmp in _parse_attributes */

		/* e */
		(*namelen)++;

		/* 6 */
		pos++;
	}

	if (pos >= end) {
		*data = pos;
		return false;
	}

	if (ISSPACE(*pos)) {
		/* 7. Skip trailing spaces */
		while (pos < end && ISSPACE(*pos)) {
			pos++;
		}

		if (pos >= end) {
			*data = pos;
			return false;
		}

		/* 8. Must be '='; otherwise the attribute has no value.
		 * Step back onto the preceding separator so the next call
		 * starts cleanly at the following attribute. */
		if (*pos != '=') {
			*data = pos - 1;
			return true;
		}
	}

	/* 9. Skip '=' */
	pos++;

	/* 10. Skip any spaces after '=' */
	while (pos < end && ISSPACE(*pos)) {
		pos++;
	}

	if (pos >= end) {
		*data = pos;
		return false;
	}

	/* 11. Extract value, if quoted */
	/* a */
	if (*pos == '\'' || *pos == '"') {
		/* 1 */
		const uint8_t *quote = pos;

		/* 2 */
		while (++pos < end) {
			/* 3 */
			if (*pos == *quote) {
				*value = (quote + 1);
				*data = ++pos;
				return true;
			}

			/* 4 is handled by strncasecmp */

			/* 5 */
			(*valuelen)++;

			/* 6 */
		}

		/* Ran out of data before the closing quote */
		*data = pos;
		return false;
	}

	/* b: empty value */
	if (*pos == '<' || *pos == '>') {
		*data = pos;
		return true;
	}

	/* c is handled by strncasecmp */

	/* d */
	*value = pos;

	while (pos < end) {
		/* 12. Extract unquoted value */
		/* a */
		if (ISSPACE(*pos) || *pos == '<' || *pos == '>') {
			*data = pos;
			return true;
		}

		/* b is handled by strncasecmp */

		/* c */
		(*valuelen)++;

		/* 13. Advance */
		pos++;
	}

	/* The loop above only falls through once the data is exhausted */
	*data = pos;
	return false;
}
658 
/**
 * Fix charsets, according to the override table in HTML5,
 * section 8.2.2.2. Character encoding requirements.
 *
 * If the charset has an entry in the override table and the replacement
 * is known to the charset lookup, ::charset is overwritten with the
 * replacement; otherwise it is left untouched.
 *
 * \param charset  Pointer to charset value to fix (must be non-zero)
 */
void hubbub_charset_fix_charset(uint16_t *charset)
{
/* MIB enum for a charset name given as a string literal */
#define FIX_CHARSET_MIB(cs) \
	parserutils_charset_mibenum_from_name((cs), SLEN(cs))

	uint16_t replacement = 0;
	uint16_t current;

	assert(*charset != 0);

	current = *charset;

	if (current == FIX_CHARSET_MIB("ISO-8859-1")) {
		/* ISO-8859-1 -> Windows-1252 */
		replacement = FIX_CHARSET_MIB("Windows-1252");
		assert(replacement != 0 && "Windows-1252 MUST be supported");
	} else if (current == FIX_CHARSET_MIB("ISO-8859-9")) {
		/* ISO-8859-9 -> Windows-1254 */
		replacement = FIX_CHARSET_MIB("Windows-1254");
	} else if (current == FIX_CHARSET_MIB("ISO-8859-11")) {
		/* ISO-8859-11 -> Windows-874 */
		replacement = FIX_CHARSET_MIB("Windows-874");
	} else if (current == FIX_CHARSET_MIB("KS_C_5601-1987") ||
			current == FIX_CHARSET_MIB("EUC-KR")) {
		/* KS_C_5601-1987 and EUC-KR -> Windows-949 */
		replacement = FIX_CHARSET_MIB("Windows-949");
	} else if (current == FIX_CHARSET_MIB("TIS-620")) {
		/* TIS-620 -> Windows-874 */
		replacement = FIX_CHARSET_MIB("Windows-874");
	} else if (current == FIX_CHARSET_MIB("x-x-big5")) {
		/* x-x-big5 -> Big5 */
		replacement = FIX_CHARSET_MIB("Big5");
	} else if (current == FIX_CHARSET_MIB("GB2312") ||
			current == FIX_CHARSET_MIB("GB_2312-80")) {
		/* GB2312 and GB_2312-80 -> GBK */
		replacement = FIX_CHARSET_MIB("GBK");
	}

	/* Only override when the replacement charset is actually known */
	if (replacement != 0) {
		*charset = replacement;
	}

#undef FIX_CHARSET_MIB
}
Charset may be changed with further data.
Definition: types.h:24
#define SLEN(s)
Definition: utils.h:34
static uint16_t hubbub_charset_read_bom(const uint8_t *data, size_t len)
Inspect the beginning of a buffer of data for the presence of a UTF Byte Order Mark.
Definition: detect.c:161
#define ADVANCE(a)
Definition: detect.c:188
Charset definite.
Definition: types.h:26
static bool hubbub_charset_get_attribute(const uint8_t **data, const uint8_t *end, const uint8_t **name, uint32_t *namelen, const uint8_t **value, uint32_t *valuelen)
Extract an attribute from the data stream.
Definition: detect.c:486
static uint16_t hubbub_charset_scan_meta(const uint8_t *data, size_t len)
Search for a meta charset within a buffer of data.
Definition: detect.c:209
static uint16_t hubbub_charset_parse_attributes(const uint8_t **pos, const uint8_t *end)
Parse attributes on a meta tag.
Definition: detect.c:299
uint16_t hubbub_charset_parse_content(const uint8_t *value, uint32_t valuelen)
Parse a content= attribute's value.
Definition: detect.c:368
#define min(a, b)
Definition: utils.h:29
const char * name
Definition: initial.c:22
#define ISSPACE(a)
Definition: detect.c:198
size_t len
Definition: initial.c:23
#define PEEK(a)
Definition: detect.c:184
parserutils_error hubbub_charset_extract(const uint8_t *data, size_t len, uint16_t *mibenum, uint32_t *source)
Extract a charset from a chunk of data.
Definition: detect.c:43
void hubbub_charset_fix_charset(uint16_t *charset)
Fix charsets, according to the override table in HTML5, section 8.2.2.2.
Definition: detect.c:666