import queueMicrotask from 'queue-microtask';
import namespaces from './IRIs';
const { xsd } = namespaces;
// **N3Lexer** tokenizes N3 documents.
import queueMicrotask from 'queue-microtask';
import namespaces from './IRIs';
const { xsd } = namespaces;
// Regular expression and replacement strings to unescape N3 strings
// Matches one escape sequence: `\uXXXX` (group 1), `\UXXXXXXXX` (group 2),
// or a backslash followed by any single character (group 3).
const escapeSequence = /\\u([a-fA-F0-9]{4})|\\U([a-fA-F0-9]{8})|\\([^])/g;

// Maps the character after a backslash to the character it stands for.
const escapeReplacements = {
  '\\': '\\',
  "'": "'",
  '"': '"',
  n: '\n',
  r: '\r',
  t: '\t',
  f: '\f',
  b: '\b',
};
// These characters simply stand for themselves when escaped.
for (const selfEscaped of '_~.-!$&()*+,;=/?#@%')
  escapeReplacements[selfEscaped] = selfEscaped;

// Characters that may not occur in an IRI (checked after unescaping).
const illegalIriChars = /[\x00-\x20<>\\"\{\}\|\^\`]/;

// Names of the lexer's regular expressions that stay active in line mode;
// the constructor replaces every other one by `invalidRegExp`.
const lineModeRegExps = {};
for (const regExpName of [
  '_iri',
  '_unescapedIri',
  '_simpleQuotedString',
  '_langcode',
  '_blank',
  '_newline',
  '_comment',
  '_whitespace',
  '_endOfFile',
])
  lineModeRegExps[regExpName] = true;

// A regular expression that can never match anything.
const invalidRegExp = /$0^/;
// ## N3Lexer
// Splits an N3 / Turtle / TriG / N-Triples / N-Quads document into tokens of
// the shape `{ type, value, prefix, line, start, end }`.
export default class N3Lexer {
  /**
   * @param {object} [options]
   * @param {boolean} [options.lineMode] Only recognize the token kinds listed in `lineModeRegExps`.
   * @param {boolean} [options.n3] Set to `false` to disable N3-only tokens (ignored in line mode).
   * @param {boolean} [options.comments] Emit `comment` tokens instead of skipping comments.
   */
  constructor(options) {
    // These regular expressions are instance properties (rather than shared constants)
    // so that line mode can disable individual ones below.
    this._iri = /^<((?:[^ <>{}\\]|\\[uU])+)>[ \t]*/; // IRI with escape sequences; needs sanity check after unescaping
    this._unescapedIri = /^<([^\x00-\x20<>\\"\{\}\|\^\`]*)>[ \t]*/; // IRI without escape sequences; no unescaping
    this._simpleQuotedString = /^"([^"\\\r\n]*)"(?=[^"])/; // string without escape sequences
    this._simpleApostropheString = /^'([^'\\\r\n]*)'(?=[^'])/;
    this._langcode = /^@([a-z]+(?:-[a-z0-9]+)*)(?=[^a-z0-9\-])/i;
    this._prefix = /^((?:[A-Za-z\xc0-\xd6\xd8-\xf6\xf8-\u02ff\u0370-\u037d\u037f-\u1fff\u200c\u200d\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff])(?:\.?[\-0-9A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff])*)?:(?=[#\s<])/;
    this._prefixed = /^((?:[A-Za-z\xc0-\xd6\xd8-\xf6\xf8-\u02ff\u0370-\u037d\u037f-\u1fff\u200c\u200d\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff])(?:\.?[\-0-9A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff])*)?:((?:(?:[0-:A-Z_a-z\xc0-\xd6\xd8-\xf6\xf8-\u02ff\u0370-\u037d\u037f-\u1fff\u200c\u200d\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff]|%[0-9a-fA-F]{2}|\\[!#-\/;=?\-@_~])(?:(?:[\.\-0-:A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff]|%[0-9a-fA-F]{2}|\\[!#-\/;=?\-@_~])*(?:[\-0-:A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff]|%[0-9a-fA-F]{2}|\\[!#-\/;=?\-@_~]))?)?)(?:[ \t]+|(?=\.?[,;!\^\s#()\[\]\{\}"'<>]))/;
    this._variable = /^\?(?:(?:[A-Z_a-z\xc0-\xd6\xd8-\xf6\xf8-\u02ff\u0370-\u037d\u037f-\u1fff\u200c\u200d\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff])(?:[\-0-:A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff])*)(?=[.,;!\^\s#()\[\]\{\}"'<>])/;
    this._blank = /^_:((?:[0-9A-Z_a-z\xc0-\xd6\xd8-\xf6\xf8-\u02ff\u0370-\u037d\u037f-\u1fff\u200c\u200d\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff])(?:\.?[\-0-9A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff])*)(?:[ \t]+|(?=\.?[,;:\s#()\[\]\{\}"'<>]))/;
    this._number = /^[\-+]?(?:(\d+\.\d*|\.?\d+)[eE][\-+]?|\d*(\.)?)\d+(?=\.?[,;:\s#()\[\]\{\}"'<>])/;
    this._boolean = /^(?:true|false)(?=[.,;\s#()\[\]\{\}"'<>])/;
    this._keyword = /^@[a-z]+(?=[\s#<:])/i;
    this._sparqlKeyword = /^(?:PREFIX|BASE|GRAPH)(?=[\s#<])/i;
    this._shortPredicates = /^a(?=[\s#()\[\]\{\}"'<>])/;
    this._newline = /^[ \t]*(?:#[^\n\r]*)?(?:\r\n|\n|\r)[ \t]*/;
    this._comment = /#([^\n\r]*)/;
    this._whitespace = /^[ \t]+/;
    this._endOfFile = /^(?:#[^\n\r]*)?$/;
    options = options || {};

    // In line mode (N-Triples or N-Quads), only simple features may be parsed
    this._lineMode = !!options.lineMode;
    if (this._lineMode) {
      this._n3Mode = false;
      // Don't tokenize special literals: disable every regular expression
      // that is not explicitly allowed in line mode
      for (const key in this) {
        if (!(key in lineModeRegExps) && this[key] instanceof RegExp)
          this[key] = invalidRegExp;
      }
    }
    // When not in line mode, enable N3 functionality by default
    else {
      this._n3Mode = options.n3 !== false;
    }
    // Don't output comment tokens by default
    this._comments = !!options.comments;
    // Cache the last tested closing position of long literals
    this._literalClosingPos = 0;
  }

  // ### `_tokenizeToEnd` tokenizes as far as possible, emitting tokens through the callback.
  // `callback(error, token)` receives either an error or a token.
  // `inputFinished` tells whether `this._input` is complete; if it is not,
  // the unconsumed remainder is stored back in `this._input` for the next call.
  _tokenizeToEnd(callback, inputFinished) {
    // Continue parsing as far as possible; the loop will return eventually
    let input = this._input;
    let currentLineLength = input.length;

    // Emits the token through the callback
    const emitToken = (type, value, prefix, line, length) => {
      const start = input ? currentLineLength - input.length : currentLineLength;
      const end = start + length;
      const token = { type, value, prefix, line, start, end };
      callback(null, token);
      return token;
    };
    // Signals the syntax error through the callback
    const reportSyntaxError = () => {
      callback(this._syntaxError(/^\S*/.exec(input)[0]));
    };

    while (true) {
      // Count and skip whitespace lines
      let whiteSpaceMatch, comment;
      while ((whiteSpaceMatch = this._newline.exec(input))) {
        // Try to find a comment
        if (this._comments && (comment = this._comment.exec(whiteSpaceMatch[0])))
          emitToken('comment', comment[1], '', this._line, whiteSpaceMatch[0].length);
        // Advance the input
        input = input.slice(whiteSpaceMatch[0].length);
        currentLineLength = input.length;
        this._line++;
      }
      // Skip whitespace on current line
      const sameLineWhitespace = this._whitespace.exec(input);
      if (sameLineWhitespace)
        input = input.slice(sameLineWhitespace[0].length);

      // Stop for now if we're at the end
      if (this._endOfFile.test(input)) {
        // If the input is finished, emit EOF
        if (inputFinished) {
          // Try to find a final comment
          if (this._comments && (comment = this._comment.exec(input)))
            emitToken('comment', comment[1], '', this._line, input.length);
          input = null;
          emitToken('eof', '', '', this._line, 0);
        }
        this._input = input;
        return input;
      }

      // Look for specific token types based on the first character
      const line = this._line;
      const firstChar = input[0];
      let type = '';
      let value = '';
      let prefix = '';
      let match = null;
      let matchLength = 0;
      let inconclusive = false;
      switch (firstChar) {
        case '^':
          // We need at least 3 characters to tell a datatype marker (`^^` followed
          // by an IRI or prefixed name) from a single `^`
          if (input.length < 3)
            break;
          // If no type, it must be a path expression
          if (input[1] !== '^') {
            if (this._n3Mode) {
              matchLength = 1;
              type = '^';
            }
            break;
          }
          // Try to match a type
          this._previousMarker = '^^';
          // Move to type IRI or prefixed name
          input = input.slice(2);
          if (input[0] !== '<') {
            inconclusive = true;
            break;
          }
        // Fall through in case the type is an IRI
        case '<':
          // Try to find a full IRI without escape sequences
          if ((match = this._unescapedIri.exec(input))) {
            type = 'IRI';
            value = match[1];
          }
          // Try to find a full IRI with escape sequences
          else if ((match = this._iri.exec(input))) {
            value = this._unescape(match[1]);
            if (value === null || illegalIriChars.test(value))
              return reportSyntaxError();
            type = 'IRI';
          }
          // Try to find a nested triple
          else if (input.length > 1 && input[1] === '<') {
            type = '<<';
            matchLength = 2;
          }
          // Try to find a backwards implication arrow
          else if (this._n3Mode && input.length > 1 && input[1] === '=') {
            type = 'inverse';
            matchLength = 2;
            value = '>';
          }
          break;

        case '>':
          if (input.length > 1 && input[1] === '>') {
            type = '>>';
            matchLength = 2;
          }
          break;

        case '_':
          // Try to find a blank node.
          // Since it can contain (but not end with) a dot,
          // we always need a non-dot character before deciding it is a blank node.
          // Therefore, try inserting a space if we're at the end of the input.
          if ((match = this._blank.exec(input)) ||
              inputFinished && (match = this._blank.exec(`${input} `))) {
            type = 'blank';
            prefix = '_';
            value = match[1];
          }
          break;

        case '"':
          // Try to find a literal without escape sequences
          if ((match = this._simpleQuotedString.exec(input))) {
            value = match[1];
          }
          // Try to find a literal with escape sequences or wrapped in three pairs of quotes
          else {
            ({ value, matchLength } = this._parseLiteral(input));
            if (value === null)
              return reportSyntaxError();
          }
          if (match !== null || matchLength !== 0) {
            type = 'literal';
            this._literalClosingPos = 0;
          }
          break;

        case "'":
          if (!this._lineMode) {
            // Try to find a literal without escape sequences
            if ((match = this._simpleApostropheString.exec(input))) {
              value = match[1];
            }
            // Try to find a literal with escape sequences or wrapped in three pairs of quotes
            else {
              ({ value, matchLength } = this._parseLiteral(input));
              if (value === null)
                return reportSyntaxError();
            }
            if (match !== null || matchLength !== 0) {
              type = 'literal';
              this._literalClosingPos = 0;
            }
          }
          break;

        case '?':
          // Try to find a variable
          if (this._n3Mode && (match = this._variable.exec(input))) {
            type = 'var';
            value = match[0];
          }
          break;

        case '@':
          // Try to find a language code
          if (this._previousMarker === 'literal' && (match = this._langcode.exec(input))) {
            type = 'langcode';
            value = match[1];
          }
          // Try to find a keyword
          else if ((match = this._keyword.exec(input))) {
            type = match[0];
          }
          break;

        case '.':
          // Try to find a dot as punctuation
          if (input.length === 1 ? inputFinished : (input[1] < '0' || input[1] > '9')) {
            type = '.';
            matchLength = 1;
            break;
          }
        // Fall through to numerical case (could be a decimal dot)
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
        case '+':
        case '-':
          // Try to find a number.
          // Since it can contain (but not end with) a dot,
          // we always need a non-dot character before deciding it is a number.
          // Therefore, try inserting a space if we're at the end of the input.
          if ((match = this._number.exec(input)) ||
              inputFinished && (match = this._number.exec(`${input} `))) {
            type = 'literal';
            value = match[0];
            // Capture group 1 is set for exponent notation, group 2 for a decimal dot
            if (typeof match[1] === 'string')
              prefix = xsd.double;
            else if (typeof match[2] === 'string')
              prefix = xsd.decimal;
            else
              prefix = xsd.integer;
          }
          break;

        case 'B':
        case 'b':
        case 'p':
        case 'P':
        case 'G':
        case 'g':
          // Try to find a SPARQL-style keyword
          if ((match = this._sparqlKeyword.exec(input)))
            type = match[0].toUpperCase();
          else
            inconclusive = true;
          break;

        case 'f':
        case 't':
          // Try to match a boolean
          if ((match = this._boolean.exec(input))) {
            type = 'literal';
            value = match[0];
            prefix = xsd.boolean;
          }
          else {
            inconclusive = true;
          }
          break;

        case 'a':
          // Try to find an abbreviated predicate
          if ((match = this._shortPredicates.exec(input))) {
            type = 'abbreviation';
            value = 'a';
          }
          else {
            inconclusive = true;
          }
          break;

        case '=':
          // Try to find an implication arrow or equals sign
          if (this._n3Mode && input.length > 1) {
            type = 'abbreviation';
            if (input[1] !== '>') {
              matchLength = 1;
              value = '=';
            }
            else {
              matchLength = 2;
              value = '>';
            }
          }
          break;

        case '!':
          if (!this._n3Mode)
            break;
        // Fall through: in N3 mode, `!` is single-character punctuation
        case ',':
        case ';':
        case '[':
        case ']':
        case '(':
        case ')':
        case '}':
          if (!this._lineMode) {
            matchLength = 1;
            type = firstChar;
          }
          break;

        case '{':
          // We need at least 2 characters lookahead to distinguish "{|" and "{ "
          if (!this._lineMode && input.length >= 2) {
            // Try to find a quoted triple annotation start
            if (input[1] === '|') {
              type = '{|';
              matchLength = 2;
            }
            else {
              type = firstChar;
              matchLength = 1;
            }
          }
          break;

        case '|':
          // We need 2 characters lookahead to parse "|}"
          // Try to find a quoted triple annotation end
          if (input.length >= 2 && input[1] === '}') {
            type = '|}';
            matchLength = 2;
          }
          break;

        default:
          inconclusive = true;
      }

      // Some first characters do not allow an immediate decision, so inspect more
      if (inconclusive) {
        // Try to find a prefix
        if ((this._previousMarker === '@prefix' || this._previousMarker === 'PREFIX') &&
            (match = this._prefix.exec(input))) {
          type = 'prefix';
          value = match[1] || '';
        }
        // Try to find a prefixed name. Since it can contain (but not end with) a dot,
        // we always need a non-dot character before deciding it is a prefixed name.
        // Therefore, try inserting a space if we're at the end of the input.
        else if ((match = this._prefixed.exec(input)) ||
                 inputFinished && (match = this._prefixed.exec(`${input} `))) {
          type = 'prefixed';
          prefix = match[1] || '';
          value = this._unescape(match[2]);
        }
      }

      // A type token is special: it can only be emitted after an IRI or prefixed name is read
      if (this._previousMarker === '^^') {
        switch (type) {
          case 'prefixed': type = 'type'; break;
          case 'IRI': type = 'typeIRI'; break;
          default: type = '';
        }
      }

      // What if nothing of the above was found?
      if (!type) {
        // We could be in streaming mode, and then we just wait for more input to arrive.
        // Otherwise, a syntax error has occurred in the input.
        // One exception: error on an unaccounted linebreak (= not inside a triple-quoted literal).
        if (inputFinished || (!/^'''|^"""/.test(input) && /\n|\r/.test(input)))
          return reportSyntaxError();
        this._input = input;
        return input;
      }

      // Emit the parsed token
      const length = matchLength || match[0].length;
      const token = emitToken(type, value, prefix, line, length);
      this.previousToken = token;
      this._previousMarker = type;
      // Advance to next part to tokenize
      input = input.slice(length);
    }
  }

  // ### `_unescape` replaces N3 escape codes by their corresponding characters.
  // Returns `null` if `item` contains an escape sequence that is not recognized.
  _unescape(item) {
    let invalid = false;
    const replaced = item.replace(escapeSequence, (sequence, unicode4, unicode8, escapedChar) => {
      // 4-digit unicode character
      if (typeof unicode4 === 'string')
        return String.fromCharCode(Number.parseInt(unicode4, 16));
      // 8-digit unicode character
      if (typeof unicode8 === 'string') {
        const codePoint = Number.parseInt(unicode8, 16);
        if (codePoint <= 0xFFFF)
          return String.fromCharCode(codePoint);
        // Encode as a surrogate pair
        const offset = codePoint - 0x10000;
        return String.fromCharCode(0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF));
      }
      // fixed escape sequence
      if (escapedChar in escapeReplacements)
        return escapeReplacements[escapedChar];
      // invalid escape sequence
      invalid = true;
      return '';
    });
    return invalid ? null : replaced;
  }

  // ### `_parseLiteral` parses a literal into an unescaped value.
  // Returns `{ value, matchLength }`; `matchLength` is 0 when no complete literal
  // was found, and `value` is `null` when the literal has an invalid escape sequence.
  _parseLiteral(input) {
    // Ensure we have enough lookahead to identify triple-quoted strings
    if (input.length >= 3) {
      // Identify the opening quote(s)
      const opening = input.match(/^(?:"""|"|'''|'|)/)[0];
      const openingLength = opening.length;

      // Find the next candidate closing quotes,
      // resuming from the position cached by a previous unsuccessful attempt
      let closingPos = Math.max(this._literalClosingPos, openingLength);
      while ((closingPos = input.indexOf(opening, closingPos)) > 0) {
        // Count backslashes right before the closing quotes
        let backslashCount = 0;
        while (input[closingPos - backslashCount - 1] === '\\')
          backslashCount++;

        // An even number of backslashes (in particular 0)
        // means these are actual, non-escaped closing quotes
        if (backslashCount % 2 === 0) {
          // Extract and unescape the value
          const raw = input.substring(openingLength, closingPos);
          const lines = raw.split(/\r\n|\r|\n/).length - 1;
          const matchLength = closingPos + openingLength;
          // Only triple-quoted strings can be multi-line
          if (openingLength === 1 && lines !== 0 ||
              openingLength === 3 && this._lineMode)
            break;
          this._line += lines;
          return { value: this._unescape(raw), matchLength };
        }
        closingPos++;
      }
      this._literalClosingPos = input.length - openingLength + 1;
    }
    return { value: '', matchLength: 0 };
  }

  // ### `_syntaxError` creates a syntax error for the given issue.
  // Also sets `this._input` to `null`, which makes the stream handlers ignore further data.
  _syntaxError(issue) {
    this._input = null;
    const err = new Error(`Unexpected "${issue}" on line ${this._line}.`);
    err.context = {
      token: undefined,
      line: this._line,
      previousToken: this.previousToken,
    };
    return err;
  }

  // ### Strips off any starting BOM mark.
  _readStartingBom(input) {
    return input.startsWith('\ufeff') ? input.slice(1) : input;
  }

  // ## Public methods

  // ### `tokenize` starts the transformation of an N3 document into an array of tokens.
  // The input can be a string or a stream.
  // With a string and no callback, the tokens are returned synchronously
  // (and the first error is thrown); otherwise tokens and errors are passed
  // to `callback(error, token)`.
  tokenize(input, callback) {
    this._line = 1;

    // If the input is a string, continuously emit tokens through the callback until the end
    if (typeof input === 'string') {
      this._input = this._readStartingBom(input);
      // If a callback was passed, asynchronously call it
      if (typeof callback === 'function') {
        queueMicrotask(() => this._tokenizeToEnd(callback, true));
      }
      // If no callback was passed, tokenize synchronously and return
      else {
        const tokens = [];
        let error;
        this._tokenizeToEnd((e, t) => e ? (error = e) : tokens.push(t), true);
        if (error) throw error;
        return tokens;
      }
    }
    // Otherwise, the input must be a stream
    else {
      // Start from a clean state: a previous run leaves `_input` at `null`,
      // which would make the handlers below ignore all data
      this._input = undefined;
      this._pendingBuffer = null;
      if (typeof input.setEncoding === 'function')
        input.setEncoding('utf8');

      // Appends a chunk to the buffered input
      const appendChunk = chunk => {
        // Only read a BOM at the start
        if (typeof this._input === 'undefined')
          this._input = this._readStartingBom(typeof chunk === 'string' ? chunk : chunk.toString());
        else
          this._input += chunk;
      };

      // Adds the data chunk to the buffer and parses as far as possible
      input.on('data', data => {
        if (this._input !== null && data.length !== 0) {
          // Prepend any previous pending writes
          if (this._pendingBuffer) {
            data = Buffer.concat([this._pendingBuffer, data]);
            this._pendingBuffer = null;
          }
          // Hold if the buffer ends in a non-ASCII byte,
          // since that could be an incomplete unicode sequence
          if (data[data.length - 1] & 0x80) {
            this._pendingBuffer = data;
          }
          // Otherwise, tokenize as far as possible
          else {
            appendChunk(data);
            this._tokenizeToEnd(callback, false);
          }
        }
      });
      // Parses until the end
      input.on('end', () => {
        // Flush bytes that were held back, so input whose last chunk
        // ends in a non-ASCII byte is not silently dropped
        if (this._pendingBuffer && this._input !== null) {
          appendChunk(this._pendingBuffer);
          this._pendingBuffer = null;
        }
        if (typeof this._input === 'string')
          this._tokenizeToEnd(callback, true);
      });
      input.on('error', callback);
    }
  }
}