Hubbub $Id$
Classes | Macros | Typedefs | Enumerations | Functions
tokeniser.c File Reference
#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <stdio.h>
#include <parserutils/charset/utf8.h>
#include "utils/parserutilserror.h"
#include "utils/utils.h"
#include "hubbub/errors.h"
#include "tokeniser/entities.h"
#include "tokeniser/tokeniser.h"

Classes

struct  hubbub_tokeniser_context
 Context for tokeniser. More...
 
struct  hubbub_tokeniser
 Tokeniser data structure. More...
 

Macros

#define state(x)    case x:
 
#define START_BUF(str, cptr, length)
 Various macros for manipulating buffers. More...
 
#define COLLECT(str, cptr, length)
 
#define COLLECT_MS(str, cptr, length)
 
#define DOCTYPE   "DOCTYPE"
 
#define DOCTYPE_LEN   (SLEN(DOCTYPE) - 1)
 
#define PUBLIC   "PUBLIC"
 
#define PUBLIC_LEN   (SLEN(PUBLIC) - 1)
 
#define SYSTEM   "SYSTEM"
 
#define SYSTEM_LEN   (SLEN(SYSTEM) - 1)
 
#define CDATA   "[CDATA["
 
#define CDATA_LEN   (SLEN(CDATA) - 1)
 

Typedefs

typedef enum hubbub_tokeniser_state hubbub_tokeniser_state
 Tokeniser states. More...
 
typedef struct hubbub_tokeniser_context hubbub_tokeniser_context
 Context for tokeniser. More...
 

Enumerations

enum  hubbub_tokeniser_state {
  STATE_DATA , STATE_CHARACTER_REFERENCE_DATA , STATE_TAG_OPEN , STATE_CLOSE_TAG_OPEN ,
  STATE_TAG_NAME , STATE_BEFORE_ATTRIBUTE_NAME , STATE_ATTRIBUTE_NAME , STATE_AFTER_ATTRIBUTE_NAME ,
  STATE_BEFORE_ATTRIBUTE_VALUE , STATE_ATTRIBUTE_VALUE_DQ , STATE_ATTRIBUTE_VALUE_SQ , STATE_ATTRIBUTE_VALUE_UQ ,
  STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE , STATE_AFTER_ATTRIBUTE_VALUE_Q , STATE_SELF_CLOSING_START_TAG , STATE_BOGUS_COMMENT ,
  STATE_MARKUP_DECLARATION_OPEN , STATE_MATCH_COMMENT , STATE_COMMENT_START , STATE_COMMENT_START_DASH ,
  STATE_COMMENT , STATE_COMMENT_END_DASH , STATE_COMMENT_END , STATE_MATCH_DOCTYPE ,
  STATE_DOCTYPE , STATE_BEFORE_DOCTYPE_NAME , STATE_DOCTYPE_NAME , STATE_AFTER_DOCTYPE_NAME ,
  STATE_MATCH_PUBLIC , STATE_BEFORE_DOCTYPE_PUBLIC , STATE_DOCTYPE_PUBLIC_DQ , STATE_DOCTYPE_PUBLIC_SQ ,
  STATE_AFTER_DOCTYPE_PUBLIC , STATE_MATCH_SYSTEM , STATE_BEFORE_DOCTYPE_SYSTEM , STATE_DOCTYPE_SYSTEM_DQ ,
  STATE_DOCTYPE_SYSTEM_SQ , STATE_AFTER_DOCTYPE_SYSTEM , STATE_BOGUS_DOCTYPE , STATE_MATCH_CDATA ,
  STATE_CDATA_BLOCK , STATE_NUMBERED_ENTITY , STATE_NAMED_ENTITY
}
 Tokeniser states. More...
 

Functions

hubbub_error hubbub_tokeniser_create (parserutils_inputstream *input, hubbub_tokeniser **tokeniser)
 Create a hubbub tokeniser. More...
 
hubbub_error hubbub_tokeniser_destroy (hubbub_tokeniser *tokeniser)
 Destroy a hubbub tokeniser. More...
 
hubbub_error hubbub_tokeniser_setopt (hubbub_tokeniser *tokeniser, hubbub_tokeniser_opttype type, hubbub_tokeniser_optparams *params)
 Configure a hubbub tokeniser. More...
 
hubbub_error hubbub_tokeniser_insert_chunk (hubbub_tokeniser *tokeniser, const uint8_t *data, size_t len)
 Insert a chunk of data into the input stream. More...
 
hubbub_error hubbub_tokeniser_run (hubbub_tokeniser *tokeniser)
 Process remaining data in the input stream. More...
 

Macro Definition Documentation

◆ CDATA

#define CDATA   "[CDATA["

◆ CDATA_LEN

#define CDATA_LEN   (SLEN(CDATA) - 1)

◆ COLLECT

#define COLLECT (   str,
  cptr,
  length 
)
Value:
do { \
parserutils_error perror; \
assert(str.len != 0); \
perror = parserutils_buffer_append(tokeniser->buffer, \
(uint8_t *) (cptr), (length)); \
if (perror != PARSERUTILS_OK) \
return hubbub_error_from_parserutils_error(perror); \
(str).len += (length); \
} while (0)
size_t len
Definition: initial.c:23

◆ COLLECT_MS

#define COLLECT_MS (   str,
  cptr,
  length 
)
Value:
do { \
parserutils_error perror; \
perror = parserutils_buffer_append(tokeniser->buffer, \
(uint8_t *) (cptr), (length)); \
if (perror != PARSERUTILS_OK) \
return hubbub_error_from_parserutils_error(perror); \
(str).len += (length); \
} while (0)

◆ DOCTYPE

#define DOCTYPE   "DOCTYPE"

◆ DOCTYPE_LEN

#define DOCTYPE_LEN   (SLEN(DOCTYPE) - 1)

◆ PUBLIC

#define PUBLIC   "PUBLIC"

◆ PUBLIC_LEN

#define PUBLIC_LEN   (SLEN(PUBLIC) - 1)

◆ START_BUF

#define START_BUF (   str,
  cptr,
  length 
)
Value:
do { \
parserutils_error perror; \
perror = parserutils_buffer_append(tokeniser->buffer, \
(uint8_t *) (cptr), (length)); \
if (perror != PARSERUTILS_OK) \
return hubbub_error_from_parserutils_error(perror); \
(str).len = (length); \
} while (0)

Various macros for manipulating buffers.

Todo:

make some of these inline functions (type-safety)

document them properly here

◆ state

#define state (   x)     case x:

◆ SYSTEM

#define SYSTEM   "SYSTEM"

◆ SYSTEM_LEN

#define SYSTEM_LEN   (SLEN(SYSTEM) - 1)

Typedef Documentation

◆ hubbub_tokeniser_context

Context for tokeniser.

◆ hubbub_tokeniser_state

Tokeniser states.

Enumeration Type Documentation

◆ hubbub_tokeniser_state

Tokeniser states.

Enumerator
STATE_DATA 
STATE_CHARACTER_REFERENCE_DATA 
STATE_TAG_OPEN 
STATE_CLOSE_TAG_OPEN 
STATE_TAG_NAME 
STATE_BEFORE_ATTRIBUTE_NAME 
STATE_ATTRIBUTE_NAME 
STATE_AFTER_ATTRIBUTE_NAME 
STATE_BEFORE_ATTRIBUTE_VALUE 
STATE_ATTRIBUTE_VALUE_DQ 
STATE_ATTRIBUTE_VALUE_SQ 
STATE_ATTRIBUTE_VALUE_UQ 
STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE 
STATE_AFTER_ATTRIBUTE_VALUE_Q 
STATE_SELF_CLOSING_START_TAG 
STATE_BOGUS_COMMENT 
STATE_MARKUP_DECLARATION_OPEN 
STATE_MATCH_COMMENT 
STATE_COMMENT_START 
STATE_COMMENT_START_DASH 
STATE_COMMENT 
STATE_COMMENT_END_DASH 
STATE_COMMENT_END 
STATE_MATCH_DOCTYPE 
STATE_DOCTYPE 
STATE_BEFORE_DOCTYPE_NAME 
STATE_DOCTYPE_NAME 
STATE_AFTER_DOCTYPE_NAME 
STATE_MATCH_PUBLIC 
STATE_BEFORE_DOCTYPE_PUBLIC 
STATE_DOCTYPE_PUBLIC_DQ 
STATE_DOCTYPE_PUBLIC_SQ 
STATE_AFTER_DOCTYPE_PUBLIC 
STATE_MATCH_SYSTEM 
STATE_BEFORE_DOCTYPE_SYSTEM 
STATE_DOCTYPE_SYSTEM_DQ 
STATE_DOCTYPE_SYSTEM_SQ 
STATE_AFTER_DOCTYPE_SYSTEM 
STATE_BOGUS_DOCTYPE 
STATE_MATCH_CDATA 
STATE_CDATA_BLOCK 
STATE_NUMBERED_ENTITY 
STATE_NAMED_ENTITY 

Function Documentation

◆ hubbub_tokeniser_create()

hubbub_error hubbub_tokeniser_create ( parserutils_inputstream *  input,
hubbub_tokeniser **  tokeniser 
)

Create a hubbub tokeniser.

Parameters
input: Input stream instance
tokeniser: Pointer to location to receive tokeniser instance
Returns
HUBBUB_OK on success, HUBBUB_BADPARM on bad parameters, HUBBUB_NOMEM on memory exhaustion

◆ hubbub_tokeniser_destroy()

hubbub_error hubbub_tokeniser_destroy ( hubbub_tokeniser *  tokeniser)

Destroy a hubbub tokeniser.

Parameters
tokeniser: The tokeniser instance to destroy
Returns
HUBBUB_OK on success, appropriate error otherwise

◆ hubbub_tokeniser_insert_chunk()

hubbub_error hubbub_tokeniser_insert_chunk ( hubbub_tokeniser *  tokeniser,
const uint8_t *  data,
size_t  len 
)

Insert a chunk of data into the input stream.

Inserts the given data into the input stream ready for parsing but does not cause any additional processing of the input.

Parameters
tokeniser: Tokeniser instance
data: Data to insert (UTF-8 encoded)
len: Length, in bytes, of data
Returns
HUBBUB_OK on success, appropriate error otherwise

◆ hubbub_tokeniser_run()

hubbub_error hubbub_tokeniser_run ( hubbub_tokeniser *  tokeniser)

Process remaining data in the input stream.

Parameters
tokeniser: The tokeniser instance to invoke
Returns
HUBBUB_OK on success, appropriate error otherwise

◆ hubbub_tokeniser_setopt()

hubbub_error hubbub_tokeniser_setopt ( hubbub_tokeniser *  tokeniser,
hubbub_tokeniser_opttype  type,
hubbub_tokeniser_optparams *  params 
)

Configure a hubbub tokeniser.

Parameters
tokeniser: The tokeniser instance to configure
type: The option type to set
params: Option-specific parameters
Returns
HUBBUB_OK on success, appropriate error otherwise