N3Lexer.js

Jump To … +

IRIs.js N3DataFactory.js N3Lexer.js N3Parser.js N3Store.js N3StreamParser.js N3StreamWriter.js N3Util.js N3Writer.js index.js

N3Lexer.js

N3Lexer tokenizes N3 documents.

import queueMicrotask from 'queue-microtask';
import namespaces from './IRIs';

const { xsd } = namespaces;

Regular expression and replacement string to escape N3 strings

// Matches one escape: `\uXXXX` (group 1), `\UXXXXXXXX` (group 2), or a backslash plus any single character (group 3)
const escapeSequence = /\\u([a-fA-F0-9]{4})|\\U([a-fA-F0-9]{8})|\\([^])/g;

// Maps the character that follows a backslash to the character it stands for
const escapeReplacements = {};
// Backslash and both quote characters stand for themselves
for (const selfEscaped of ['\\', "'", '"'])
  escapeReplacements[selfEscaped] = selfEscaped;
// Letter escapes that denote control characters
Object.assign(escapeReplacements, { n: '\n', r: '\r', t: '\t', f: '\f', b: '\b' });
// Punctuation characters that stand for themselves
for (const selfEscaped of '_~.-!$&()*+,;=/?#@%')
  escapeReplacements[selfEscaped] = selfEscaped;

// Characters that must not occur in an IRI once it has been unescaped
const illegalIriChars = /[\x00-\x20<>\\"\{\}\|\^\`]/;

// Names of the lexer's regular-expression properties that remain active in line mode
const lineModeRegExps = {};
for (const patternName of [
  '_iri', '_unescapedIri', '_simpleQuotedString', '_langcode', '_blank',
  '_newline', '_comment', '_whitespace', '_endOfFile',
])
  lineModeRegExps[patternName] = true;

// A pattern that cannot match: it requires a "0" after the end and before the start of the input
const invalidRegExp = /$0^/;

Constructor

export default class N3Lexer {
  /**
   * Creates a lexer: installs the token patterns as instance properties and
   * derives the tokenizing mode from `options`.
   *
   * @param {Object} [options] Settings; a falsy value is treated as `{}`.
   * @param {*} [options.lineMode] When truthy, line mode is enabled: N3 mode is
   *   switched off and every RegExp property whose name is not listed in
   *   `lineModeRegExps` is replaced by `invalidRegExp`.
   * @param {*} [options.n3] Outside line mode, N3 mode is enabled unless this
   *   is exactly `false`.
   * @param {*} [options.comments] When truthy, comment tokens are emitted.
   */
  constructor(options) {

Regular expressions

It’s slightly faster to have these as properties than as in-scope variables

    this._iri = /^<((?:[^ <>{}\\]|\\[uU])+)>[ \t]*/; // IRI with escape sequences; needs sanity check after unescaping
    this._unescapedIri = /^<([^\x00-\x20<>\\"\{\}\|\^\`]*)>[ \t]*/; // IRI without escape sequences; no unescaping
    this._simpleQuotedString = /^"([^"\\\r\n]*)"(?=[^"])/; // string without escape sequences
    this._simpleApostropheString = /^'([^'\\\r\n]*)'(?=[^'])/;
    this._langcode = /^@([a-z]+(?:-[a-z0-9]+)*)(?=[^a-z0-9\-])/i;
    this._prefix = /^((?:[A-Za-z\xc0-\xd6\xd8-\xf6\xf8-\u02ff\u0370-\u037d\u037f-\u1fff\u200c\u200d\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff])(?:\.?[\-0-9A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff])*)?:(?=[#\s<])/;
    this._prefixed = /^((?:[A-Za-z\xc0-\xd6\xd8-\xf6\xf8-\u02ff\u0370-\u037d\u037f-\u1fff\u200c\u200d\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff])(?:\.?[\-0-9A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff])*)?:((?:(?:[0-:A-Z_a-z\xc0-\xd6\xd8-\xf6\xf8-\u02ff\u0370-\u037d\u037f-\u1fff\u200c\u200d\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff]|%[0-9a-fA-F]{2}|\\[!#-\/;=?\-@_~])(?:(?:[\.\-0-:A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff]|%[0-9a-fA-F]{2}|\\[!#-\/;=?\-@_~])*(?:[\-0-:A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff]|%[0-9a-fA-F]{2}|\\[!#-\/;=?\-@_~]))?)?)(?:[ \t]+|(?=\.?[,;!\^\s#()\[\]\{\}"'<>]))/;
    this._variable = /^\?(?:(?:[A-Z_a-z\xc0-\xd6\xd8-\xf6\xf8-\u02ff\u0370-\u037d\u037f-\u1fff\u200c\u200d\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff])(?:[\-0-:A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff])*)(?=[.,;!\^\s#()\[\]\{\}"'<>])/;
    this._blank = /^_:((?:[0-9A-Z_a-z\xc0-\xd6\xd8-\xf6\xf8-\u02ff\u0370-\u037d\u037f-\u1fff\u200c\u200d\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff])(?:\.?[\-0-9A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff])*)(?:[ \t]+|(?=\.?[,;:\s#()\[\]\{\}"'<>]))/;
    this._number = /^[\-+]?(?:(\d+\.\d*|\.?\d+)[eE][\-+]?|\d*(\.)?)\d+(?=\.?[,;:\s#()\[\]\{\}"'<>])/;
    this._boolean = /^(?:true|false)(?=[.,;\s#()\[\]\{\}"'<>])/;
    this._keyword = /^@[a-z]+(?=[\s#<:])/i;
    this._sparqlKeyword = /^(?:PREFIX|BASE|GRAPH)(?=[\s#<])/i;
    this._shortPredicates = /^a(?=[\s#()\[\]\{\}"'<>])/;
    this._newline = /^[ \t]*(?:#[^\n\r]*)?(?:\r\n|\n|\r)[ \t]*/;
    this._comment = /#([^\n\r]*)/;
    this._whitespace = /^[ \t]+/;
    this._endOfFile = /^(?:#[^\n\r]*)?$/;
    options = options || {};

In line mode (N-Triples or N-Quads), only simple features may be parsed

    if (this._lineMode = !!options.lineMode) {
      this._n3Mode = false;

Don’t tokenize special literals

      for (const key in this) {
        if (!(key in lineModeRegExps) && this[key] instanceof RegExp)
          this[key] = invalidRegExp;
      }
    }

When not in line mode, enable N3 functionality by default

    else {
      this._n3Mode = options.n3 !== false;
    }

Don’t output comment tokens by default

    this._comments = !!options.comments;

§

Cache the last tested closing position of long literals
```
    this._literalClosingPos = 0;
  }
```
§

Private methods
§

_tokenizeToEnd tokenizes as far as possible, emitting tokens through the callback
```
  /**
   * Tokenizes `this._input` as far as possible.
   *
   * Every recognized token is passed to `callback(null, token)`, where `token`
   * is `{ type, value, prefix, line, start, end }`. Comment tokens are only
   * emitted when `this._comments` is set.
   *
   * When only whitespace/comment remains: if `inputFinished` is set, an `eof`
   * token is emitted and `this._input` becomes `null`; otherwise the remaining
   * input is stored in `this._input` for a later call.
   *
   * When no token can be recognized: if `inputFinished` is set, or the
   * remaining input contains a line break and does not start with a triple
   * quote, a syntax error is passed to `callback(error)`; otherwise the
   * remaining input is stored in `this._input` so tokenizing can resume once
   * more input has been appended.
   *
   * @param {Function} callback Receives `(error)` or `(null, token)`.
   * @param {boolean} inputFinished Whether no further input will follow.
   * @returns {string|null|undefined} The remaining input, `null` after `eof`,
   *   or `undefined` after a syntax error was reported.
   */
  _tokenizeToEnd(callback, inputFinished) {
```

Continue parsing as far as possible; the loop will return eventually

    let input = this._input;
    let currentLineLength = input.length;
    while (true) {

Count and skip whitespace lines

      let whiteSpaceMatch, comment;
      while (whiteSpaceMatch = this._newline.exec(input)) {

Try to find a comment

        if (this._comments && (comment = this._comment.exec(whiteSpaceMatch[0])))
          emitToken('comment', comment[1], '', this._line, whiteSpaceMatch[0].length);

Advance the input

        input = input.substr(whiteSpaceMatch[0].length, input.length);
        currentLineLength = input.length;
        this._line++;
      }

Skip whitespace on current line

      if (!whiteSpaceMatch && (whiteSpaceMatch = this._whitespace.exec(input)))
        input = input.substr(whiteSpaceMatch[0].length, input.length);

Stop for now if we’re at the end

      if (this._endOfFile.test(input)) {

§

If the input is finished, emit EOF
```
        if (inputFinished) {
```

Try to find a final comment

          if (this._comments && (comment = this._comment.exec(input)))
            emitToken('comment', comment[1], '', this._line, input.length);
          input = null;
          emitToken('eof', '', '', this._line, 0);
        }
        return this._input = input;
      }

Look for specific token types based on the first character

      const line = this._line, firstChar = input[0];
      let type = '', value = '', prefix = '',
          match = null, matchLength = 0, inconclusive = false;
      switch (firstChar) {
      case '^':

§

We need at least 3 tokens lookahead to distinguish ^^ and ^^pre:fixed
```
        if (input.length < 3)
          break;
```

Try to match a type

        else if (input[1] === '^') {
          this._previousMarker = '^^';

Move to type IRI or prefixed name

          input = input.substr(2);
          if (input[0] !== '<') {
            inconclusive = true;
            break;
          }
        }

If no type, it must be a path expression

        else {
          if (this._n3Mode) {
            matchLength = 1;
            type = '^';
          }
          break;
        }

§

Fall through in case the type is an IRI
```
      case '<':
```

Try to find a full IRI without escape sequences

        if (match = this._unescapedIri.exec(input))
          type = 'IRI', value = match[1];

Try to find a full IRI with escape sequences

        else if (match = this._iri.exec(input)) {
          value = this._unescape(match[1]);
          if (value === null || illegalIriChars.test(value))
            return reportSyntaxError(this);
          type = 'IRI';
        }

Try to find a nested triple

        else if (input.length > 1 && input[1] === '<')
          type = '<<', matchLength = 2;

Try to find a backwards implication arrow

        else if (this._n3Mode && input.length > 1 && input[1] === '=')
          type = 'inverse', matchLength = 2, value = '>';
        break;

      case '>':
        if (input.length > 1 && input[1] === '>')
          type = '>>', matchLength = 2;
        break;

      case '_':

Try to find a blank node. Since it can contain (but not end with) a dot, we always need a non-dot character before deciding it is a blank node. Therefore, try inserting a space if we’re at the end of the input.

        if ((match = this._blank.exec(input)) ||
            inputFinished && (match = this._blank.exec(`${input} `)))
          type = 'blank', prefix = '_', value = match[1];
        break;

      case '"':

Try to find a literal without escape sequences

        if (match = this._simpleQuotedString.exec(input))
          value = match[1];

Try to find a literal wrapped in three pairs of quotes

        else {
          ({ value, matchLength } = this._parseLiteral(input));
          if (value === null)
            return reportSyntaxError(this);
        }
        if (match !== null || matchLength !== 0) {
          type = 'literal';
          this._literalClosingPos = 0;
        }
        break;

      case "'":
        if (!this._lineMode) {

Try to find a literal without escape sequences

          if (match = this._simpleApostropheString.exec(input))
            value = match[1];

Try to find a literal wrapped in three pairs of quotes

          else {
            ({ value, matchLength } = this._parseLiteral(input));
            if (value === null)
              return reportSyntaxError(this);
          }
          if (match !== null || matchLength !== 0) {
            type = 'literal';
            this._literalClosingPos = 0;
          }
        }
        break;

      case '?':

Try to find a variable

        if (this._n3Mode && (match = this._variable.exec(input)))
          type = 'var', value = match[0];
        break;

      case '@':

Try to find a language code

        if (this._previousMarker === 'literal' && (match = this._langcode.exec(input)))
          type = 'langcode', value = match[1];

Try to find a keyword

        else if (match = this._keyword.exec(input))
          type = match[0];
        break;

      case '.':

Try to find a dot as punctuation

        if (input.length === 1 ? inputFinished : (input[1] < '0' || input[1] > '9')) {
          type = '.';
          matchLength = 1;
          break;
        }

Fall through to numerical case (could be a decimal dot)

      case '0':
      case '1':
      case '2':
      case '3':
      case '4':
      case '5':
      case '6':
      case '7':
      case '8':
      case '9':
      case '+':
      case '-':

Try to find a number. Since it can contain (but not end with) a dot, we always need a non-dot character before deciding it is a number. Therefore, try inserting a space if we’re at the end of the input.

        // NOTE: unlike the blank-node and prefixed-name cases, the first assignment is not
        // parenthesized, so `match` receives the result of the whole `||` expression (possibly
        // `false`). That is harmless: `type` then stays empty and `match` is not read afterwards.
        if (match = this._number.exec(input) ||
            inputFinished && (match = this._number.exec(`${input} `))) {
          type = 'literal', value = match[0];
          prefix = (typeof match[1] === 'string' ? xsd.double :
                    (typeof match[2] === 'string' ? xsd.decimal : xsd.integer));
        }
        break;

      case 'B':
      case 'b':
      case 'p':
      case 'P':
      case 'G':
      case 'g':

Try to find a SPARQL-style keyword

        if (match = this._sparqlKeyword.exec(input))
          type = match[0].toUpperCase();
        else
          inconclusive = true;
        break;

      case 'f':
      case 't':

Try to match a boolean

        if (match = this._boolean.exec(input))
          type = 'literal', value = match[0], prefix = xsd.boolean;
        else
          inconclusive = true;
        break;

      case 'a':

Try to find an abbreviated predicate

        if (match = this._shortPredicates.exec(input))
          type = 'abbreviation', value = 'a';
        else
          inconclusive = true;
        break;

      case '=':

Try to find an implication arrow or equals sign

        if (this._n3Mode && input.length > 1) {
          type = 'abbreviation';
          if (input[1] !== '>')
            matchLength = 1, value = '=';
          else
            matchLength = 2, value = '>';
        }
        break;

      case '!':
        if (!this._n3Mode)
          break;
      case ',':
      case ';':
      case '[':
      case ']':
      case '(':
      case ')':
      case '}':
        if (!this._lineMode) {
          matchLength = 1;
          type = firstChar;
        }
        break;
      case '{':

§

We need at least 2 tokens lookahead to distinguish “{|” and “{ “
```
        if (!this._lineMode && input.length >= 2) {
```

Try to find a quoted triple annotation start

          if (input[1] === '|')
            type = '{|', matchLength = 2;
          else
            type = firstChar, matchLength = 1;
        }
        break;
      case '|':

We need 2 tokens lookahead to parse “|}” Try to find a quoted triple annotation end

        if (input.length >= 2 && input[1] === '}')
          type = '|}', matchLength = 2;
        break;

      default:
        inconclusive = true;
      }

§

Some first characters do not allow an immediate decision, so inspect more
```
      if (inconclusive) {
```

Try to find a prefix

        if ((this._previousMarker === '@prefix' || this._previousMarker === 'PREFIX') &&
            (match = this._prefix.exec(input)))
          type = 'prefix', value = match[1] || '';

Try to find a prefixed name. Since it can contain (but not end with) a dot, we always need a non-dot character before deciding it is a prefixed name. Therefore, try inserting a space if we’re at the end of the input.

        else if ((match = this._prefixed.exec(input)) ||
                 inputFinished && (match = this._prefixed.exec(`${input} `)))
          type = 'prefixed', prefix = match[1] || '', value = this._unescape(match[2]);
      }

A type token is special: it can only be emitted after an IRI or prefixed name is read

      if (this._previousMarker === '^^') {
        switch (type) {
        case 'prefixed': type = 'type';    break;
        case 'IRI':      type = 'typeIRI'; break;
        default:         type = '';
        }
      }

§

What if nothing of the above was found?
```
      if (!type) {
```

We could be in streaming mode, and then we just wait for more input to arrive. Otherwise, a syntax error has occurred in the input. One exception: error on an unaccounted linebreak (= not inside a triple-quoted literal).

        if (inputFinished || (!/^'''|^"""/.test(input) && /\n|\r/.test(input)))
          return reportSyntaxError(this);
        else
          return this._input = input;
      }

Emit the parsed token

      const length = matchLength || match[0].length;
      const token = emitToken(type, value, prefix, line, length);
      this.previousToken = token;
      this._previousMarker = type;

Advance to next part to tokenize

      input = input.substr(length, input.length);
    }

Emits the token through the callback

    function emitToken(type, value, prefix, line, length) {
      const start = input ? currentLineLength - input.length : currentLineLength;
      const end = start + length;
      const token = { type, value, prefix, line, start, end };
      callback(null, token);
      return token;
    }

Signals the syntax error through the callback

    function reportSyntaxError(self) { callback(self._syntaxError(/^\S*/.exec(input)[0])); }
  }

`_unescape` replaces N3 escape codes by their corresponding characters

  _unescape(item) {
    let invalid = false;
    const replaced = item.replace(escapeSequence, (sequence, unicode4, unicode8, escapedChar) => {

4-digit unicode character

      if (typeof unicode4 === 'string')
        return String.fromCharCode(Number.parseInt(unicode4, 16));

8-digit unicode character

      if (typeof unicode8 === 'string') {
        let charCode = Number.parseInt(unicode8, 16);
        return charCode <= 0xFFFF ? String.fromCharCode(Number.parseInt(unicode8, 16)) :
          String.fromCharCode(0xD800 + ((charCode -= 0x10000) >> 10), 0xDC00 + (charCode & 0x3FF));
      }

fixed escape sequence

      if (escapedChar in escapeReplacements)
        return escapeReplacements[escapedChar];

invalid escape sequence

      invalid = true;
      return '';
    });
    return invalid ? null : replaced;
  }

§

_parseLiteral parses a literal into an unescaped value
```
  _parseLiteral(input) {
```
§

Ensure we have enough lookahead to identify triple-quoted strings
```
    if (input.length >= 3) {
```

Identify the opening quote(s)

      const opening = input.match(/^(?:"""|"|'''|'|)/)[0];
      const openingLength = opening.length;

Find the next candidate closing quotes

      let closingPos = Math.max(this._literalClosingPos, openingLength);
      while ((closingPos = input.indexOf(opening, closingPos)) > 0) {

Count backslashes right before the closing quotes

        let backslashCount = 0;
        while (input[closingPos - backslashCount - 1] === '\\')
          backslashCount++;

§

An even number of backslashes (in particular 0) means these are actual, non-escaped closing quotes
```
        if (backslashCount % 2 === 0) {
```

Extract and unescape the value

          const raw = input.substring(openingLength, closingPos);
          const lines = raw.split(/\r\n|\r|\n/).length - 1;
          const matchLength = closingPos + openingLength;

Only triple-quoted strings can be multi-line

          if (openingLength === 1 && lines !== 0 ||
              openingLength === 3 && this._lineMode)
            break;
          this._line += lines;
          return { value: this._unescape(raw), matchLength };
        }
        closingPos++;
      }
      this._literalClosingPos = input.length - openingLength + 1;
    }
    return { value: '', matchLength: 0 };
  }

`_syntaxError` creates a syntax error for the given issue

  _syntaxError(issue) {
    this._input = null;
    const err = new Error(`Unexpected "${issue}" on line ${this._line}.`);
    err.context = {
      token: undefined,
      line: this._line,
      previousToken: this.previousToken,
    };
    return err;
  }

Strips off any starting UTF BOM mark.

  _readStartingBom(input) {
    return input.startsWith('\ufeff') ? input.substr(1) : input;
  }

§

Public methods
§

tokenize starts the transformation of an N3 document into an array of tokens.

The input can be a string or a stream.
```
  tokenize(input, callback) {
    this._line = 1;
```

If the input is a string, continuously emit tokens through the callback until the end

    if (typeof input === 'string') {
      this._input = this._readStartingBom(input);

If a callback was passed, asynchronously call it

      if (typeof callback === 'function')
        queueMicrotask(() => this._tokenizeToEnd(callback, true));

If no callback was passed, tokenize synchronously and return

      else {
        const tokens = [];
        let error;
        this._tokenizeToEnd((e, t) => e ? (error = e) : tokens.push(t), true);
        if (error) throw error;
        return tokens;
      }
    }

Otherwise, the input must be a stream

    else {
      this._pendingBuffer = null;
      if (typeof input.setEncoding === 'function')
        input.setEncoding('utf8');

Adds the data chunk to the buffer and parses as far as possible

      input.on('data', data => {
        if (this._input !== null && data.length !== 0) {

Prepend any previous pending writes

          if (this._pendingBuffer) {
            data = Buffer.concat([this._pendingBuffer, data]);
            this._pendingBuffer = null;
          }

Hold if the buffer ends in an incomplete unicode sequence

          if (data[data.length - 1] & 0x80) {
            this._pendingBuffer = data;
          }

§

Otherwise, tokenize as far as possible
```
          else {
```

Only read a BOM at the start

            if (typeof this._input === 'undefined')
              this._input = this._readStartingBom(typeof data === 'string' ? data : data.toString());
            else
              this._input += data;
            this._tokenizeToEnd(callback, false);
          }
        }
      });

Parses until the end

      input.on('end', () => {
        if (typeof this._input === 'string')
          this._tokenizeToEnd(callback, true);
      });
      input.on('error', callback);
    }
  }
}

N3Lexer.js

Constructor

Regular expressions

Private methods

`_tokenizeToEnd` tokenizes as far as possible, emitting tokens through the callback

`_unescape` replaces N3 escape codes by their corresponding characters

`_parseLiteral` parses a literal into an unescaped value

`_syntaxError` creates a syntax error for the given issue

Strips off any starting UTF BOM mark.

Public methods

`tokenize` starts the transformation of an N3 document into an array of tokens.

N3Lexer.js

Constructor

Regular expressions

Private methods

_tokenizeToEnd tokenizes as far as possible, emitting tokens through the callback

_unescape replaces N3 escape codes by their corresponding characters

_parseLiteral parses a literal into an unescaped value

_syntaxError creates a syntax error for the given issue

Strips off any starting UTF BOM mark.

Public methods

tokenize starts the transformation of an N3 document into an array of tokens.

`_tokenizeToEnd` tokenizes as far as possible, emitting tokens through the callback

`_unescape` replaces N3 escape codes by their corresponding characters

`_parseLiteral` parses a literal into an unescaped value

`_syntaxError` creates a syntax error for the given issue

`tokenize` starts the transformation of an N3 document into an array of tokens.