The Mercury Library Reference Manual: parsing

%--------------------------------------------------%
% vim: ft=mercury ts=4 sw=4 et
%--------------------------------------------------%
% Copyright (C) 2009-2012 The University of Melbourne.
% Copyright (C) 2014-2020, 2022 The Mercury team.
% This file is distributed under the terms specified in COPYING.LIB.
%--------------------------------------------------%
%
% File: parsing_utils.m
% Authors: Ralph Becket <rafe@csse.unimelb.edu.au>, maclarty
% Stability: low
%
% Utilities for recursive descent parsers. Parsers take at least three
% arguments: a source (src) containing the input string, and an input/output
% pair of parser states (ps) tracking the current offset into the input.
%
% Call parse(InputString, SkipWS, Parser, Result) to parse an input string
% and return an error context and message if parsing failed.
% The SkipWS predicate is used by the primitive parsers to skip over any
% following whitespace (providing a skipping predicate allows users to define
% comments as whitespace).
% Alternatively, a new src and ps can be constructed by calling
% new_src_and_ps(InputString, SkipWS, Src, !:PS).
%
% Parsing predicates are semidet and typically take the form
% p(...parameters..., Src, Result, !PS). A parser matching variable
% assignments of the form `x = 42' might be defined like this:
%
%   var_assignment(Src, {Var, Value}, !PS) :-
%       var(Src, Var, !PS),
%       punct(Src, "=", !PS),
%       expr(Src, Expr, !PS).
%
% where var/4 and expr/4 are parsers for variables and expressions respectively
% and punct/4 is provided by this module for matching punctuation.
%
%--------------------------------------------------%
%--------------------------------------------------%

:- module parsing_utils.
:- interface.

:- import_module char.
:- import_module list.
:- import_module maybe.
:- import_module unit.

%--------------------------------------------------%

    % The parser source (input string).
    %
:- type src.

    % The parser "state", passed around in DCG arguments.
    %
:- type ps.

    % These types and insts are useful for specifying "standard" parser
    % signatures.
    %
:- type parser(T) == pred(src, T, ps, ps).
:- inst parser == (pred(in, out, in, out) is semidet).

    % The following are for parsers that also transform a separate state value.
    %
:- type parser_with_state(T, S) == pred(src, T, S, S, ps, ps).
:- inst parser_with_state == (pred(in, out, in, out, in, out) is semidet).

    % Predicates of this type are used to skip whitespace in the primitive
    % parsers provided by this module.
    %
:- type skip_whitespace_pred == parser(unit).

:- type parse_result(T)
    --->    ok(T)
    ;       error(
                error_message :: maybe(string),
                error_line    :: int,
                error_col     :: int
            ).

    % parse(Input, SkipWS, Parser, Result).
    % Try to parse Input using Parser and SkipWS to consume whitespace.
    % If Parser succeeds then return ok with the parsed value,
    % otherwise return error. If there were any calls to fail_with_message
    % without any subsequent progress being made, then the error message
    % passed to the last call to fail_with_message will be returned in the
    % error result. Otherwise no message is returned and the furthest
    % position the parser got in the input string is returned.
    %
:- pred parse(string::in, skip_whitespace_pred::in(parser),
    parser(T)::in(parser), parse_result(T)::out) is cc_multi.

    % As above but using the default whitespace parser, whitespace/4.
    %
:- pred parse(string::in, parser(T)::in(parser), parse_result(T)::out)
    is cc_multi.

%--------------------------------------------------%

    % Construct a new parser source and state from a string, also specifying
    % a predicate for skipping over whitespace (several primitive parsers
    % use this predicate to consume whitespace after a token; this argument
    % allows the user to specify a predicate for, say, skipping over comments
    % as well).
    %
:- pred new_src_and_ps(string::in, skip_whitespace_pred::in(parser),
    src::out, ps::out) is det.

    % Construct a new parser source and state from a string.
    % The default whitespace parser, whitespace/4, is used.
    %
:- pred new_src_and_ps(string::in, src::out, ps::out) is det.

%--------------------------------------------------%

    % Return the input string and its length from the parser source.
    %
:- pred input_string(src::in, string::out, int::out) is det.

    % Obtain the current offset from the start of the input string
    % (the first character in the input has offset 0).
    %
:- pred current_offset(src::in, int::out, ps::in, ps::out) is det.

    % Return the parser to skip over whitespace from the parser source.
    %
:- pred get_skip_whitespace_pred(src::in, skip_whitespace_pred::out(parser))
    is det.

%--------------------------------------------------%

    % input_substring(Src, StartOffset, EndOffsetPlusOne, Substring):
    % Copy the substring from the input occupying the offsets
    % [StartOffset, EndOffsetPlusOne).
    %
:- pred input_substring(src::in, int::in, int::in, string::out) is semidet.

%--------------------------------------------------%

:- type line_numbers.

    % Compute a structure from the parser source which can be used to
    % convert offsets into line numbers and positions in the file (this
    % is useful for error reporting).
    %
:- func src_to_line_numbers(src) = line_numbers.

    % Convert an offset into a line number and position within the line
    % (the first line is number 1; the first character in a line is
    % position 1).
    %
:- pred offset_to_line_number_and_position(line_numbers::in, int::in,
    int::out, int::out) is det.

%--------------------------------------------------%

    % Read the next char.
    %
:- pred next_char(src::in, char::out, ps::in, ps::out) is semidet.

    % Read the next char but do not record progress information.
    % This is more efficient than next_char, but may produce less informative
    % error messages in case of a parse error.
    %
:- pred next_char_no_progress(src::in, char::out, ps::in, ps::out) is semidet.

%--------------------------------------------------%

    % Match a char from the given string.
    %
:- pred char_in_class(string::in, src::in, char::out,
    ps::in, ps::out) is semidet.

%--------------------------------------------------%

    % Match a string exactly and any subsequent whitespace.
    %
:- pred punct(string::in, src::in, unit::out, ps::in, ps::out) is semidet.

    % keyword(IdChars, Keyword, Src, _, !PS) matches Keyword exactly (i.e., it
    % must not be followed by any character in IdChars) and any subsequent
    % whitespace.
    %
:- pred keyword(string::in, string::in, src::in, unit::out,
    ps::in, ps::out) is semidet.

    % ikeyword(IdChars, Keyword, Src, _, !PS)
    % Case-insensitive version of keyword/6.
    % Only uppercase and lowercase letters in the ASCII range (A-Z, a-z)
    % are compared case insensitively.
    %
:- pred ikeyword(string::in, string::in, src::in, unit::out,
    ps::in, ps::out) is semidet.

    % identifier(InitIdChars, IdChars, Src, Identifier, !PS) matches the next
    % identifier (result in Identifier) comprising a char from InitIdChars
    % followed by zero or more chars from IdChars. Any subsequent whitespace
    % is consumed.
    %
:- pred identifier(string::in, string::in, src::in, string::out,
    ps::in, ps::out) is semidet.

    % Consume any whitespace (defined as a sequence of characters
    % satisfying char.is_whitespace).
    %
:- pred whitespace(src::in, unit::out,
    ps::in, ps::out) is semidet.

%--------------------------------------------------%

    % Consume any input up to, and including, the next newline character
    % marking the end of the current line.
    %
:- pred skip_to_eol(src::in, unit::out,
    ps::in, ps::out) is semidet.

    % Succeed if we have reached the end of the input.
    %
:- pred eof(src::in, unit::out, ps::in, ps::out) is semidet.

%--------------------------------------------------%

    % Parse a float literal matching [-][0-9]+[.][0-9]+([Ee][-+][0-9]+)?
    % followed by any whitespace. The float_literal_as_string version simply
    % returns the matched string. The float_literal version uses
    % string.to_float to convert the output of float_literal_as_string; this
    % may return an approximate answer since not all floating point numbers
    % can be perfectly represented as Mercury floats.
    %
:- pred float_literal_as_string(src::in, string::out,
    ps::in, ps::out) is semidet.
:- pred float_literal(src::in, float::out,
    ps::in, ps::out) is semidet.

    % Parse an int literal matching [-][0-9]+, not followed by [.][0-9]+,
    % followed by any whitespace. The int_literal_as_string version simply
    % returns the matched string. The int_literal version uses string.to_int
    % to convert the output of int_literal_as_string; this may fail if the
    % number in question cannot be represented as a Mercury int.
    %
:- pred int_literal_as_string(src::in, string::out,
    ps::in, ps::out) is semidet.
:- pred int_literal(src::in, int::out,
    ps::in, ps::out) is semidet.

    % Parse a string literal. The string argument is the quote character.
    % A backslash (\) character in the string makes the next character
    % literal (e.g., for embedding quotes). These 'escaped' characters
    % are included as-is in the result, along with the preceding backslash.
    % Any following whitespace is also consumed.
    %
:- pred string_literal(char::in, src::in, string::out,
    ps::in, ps::out) is semidet.

%--------------------------------------------------%

% Each basic parser combinators has a version that has a separate state
% argument is threaded through the computation, for parsers that e.g.
% incrementally construct a symbol table.

    % optional(P, Src, Result, !PS) returns Result = yes(X) if P(Src, X, !PS),
    % or Result = no if P does not succeed.
    %
:- pred optional(parser(T)::in(parser), src::in, maybe(T)::out,
    ps::in, ps::out) is semidet.

    % optional(P, Src, Result, !S, !PS) returns Result = yes(X)
    % if P(Src, X, !S, !PS), or Result = no if P does not succeed.
    %
:- pred optional(parser_with_state(T, S)::in(parser_with_state), src::in,
    maybe(T)::out, S::in, S::out, ps::in, ps::out) is semidet.

%--------------------------------------------------%

    % zero_or_more(P, Src, Xs, !PS) returns the list of results Xs obtained
    % by repeatedly applying P until P fails. The nth item in Xs is
    % the result from the nth application of P.
    %
:- pred zero_or_more(parser(T)::in(parser), src::in, list(T)::out,
    ps::in, ps::out) is semidet.

    % zero_or_more(P, Src, Xs, !S, !PS) returns the list of results Xs obtained
    % by repeatedly applying P until P fails. The nth item in Xs is
    % the result from the nth application of P.
    %
:- pred zero_or_more(parser_with_state(T, S)::in(parser_with_state), src::in,
    list(T)::out, S::in, S::out, ps::in, ps::out) is semidet.

%--------------------------------------------------%

    % one_or_more(P, Src, Xs, !PS) returns the list of results Xs obtained
    % by repeatedly applying P until P fails. The nth item in Xs is
    % the result from the nth application of P. P must succeed at least once.
    %
:- pred one_or_more(parser(T)::in(parser), src::in, list(T)::out,
    ps::in, ps::out) is semidet.

    % one_or_more(P, Src, Xs, !S, !PS) returns the list of results Xs obtained
    % by repeatedly applying P until P fails. The nth item in Xs is
    % the result from the nth application of P. P must succeed at least once.
    %
:- pred one_or_more(parser_with_state(T, S)::in(parser_with_state), src::in,
    list(T)::out, S::in, S::out, ps::in, ps::out) is semidet.

%--------------------------------------------------%

    % brackets(L, R, P, Src, X, !PS) is equivalent to
    %   punct(L, Src, _, !PS), P(Src, X, !PS), punct(R, Src, _, !PS).
    %
:- pred brackets(string::in, string::in, parser(T)::in(parser), src::in,
    T::out, ps::in, ps::out) is semidet.

    % brackets(L, R, P, Src, X, !S, !PS) is equivalent to
    %   punct(L, Src, _, !PS), P(Src, X, !S, !PS), punct(R, Src, _, !PS).
    %
:- pred brackets(string::in, string::in,
    parser_with_state(T, S)::in(parser_with_state), src::in,
    T::out, S::in, S::out, ps::in, ps::out) is semidet.

%--------------------------------------------------%

    % separated_list(Separator, P, Src, Xs, !PS) is like
    % zero_or_more(P, Src, Xs, !PS) except that successive applications of
    % P must be separated by punct(Separator, Src, _, !PS).
    %
:- pred separated_list(string::in, parser(T)::in(parser), src::in,
    list(T)::out, ps::in, ps::out) is semidet.

    % separated_list(Separator, P, Src, Xs, !S, !PS) is like
    % zero_or_more(P, Src, Xs, !S, !PS) except that successive applications of
    % P must be separated by punct(Separator, Src, _, !PS).
    %
:- pred separated_list(string::in,
    parser_with_state(T, S)::in(parser_with_state),
    src::in, list(T)::out, S::in, S::out, ps::in, ps::out) is semidet.

%--------------------------------------------------%

    % comma_separated_list(P, Src, Xs) is the same as
    %   separated_list(",", P, Src, Xs).
    %
:- pred comma_separated_list(parser(T)::in(parser), src::in, list(T)::out,
    ps::in, ps::out) is semidet.

    % comma_separated_list(P, Src, Xs, !S, !PS) is the same as
    %   separated_list(",", P, Src, Xs, !S, !PS).
    %
:- pred comma_separated_list(parser_with_state(T, S)::in(parser_with_state),
    src::in, list(T)::out, S::in, S::out, ps::in, ps::out) is semidet.

%--------------------------------------------------%

    % Declaratively this predicate is equivalent to false. Operationally,
    % it will record an error message that will be returned by parse/4
    % if no further progress is made and then fail.
    %
:- pred fail_with_message(string::in, src::in, T::out, ps::in, ps::out)
    is semidet.

    % As above, but use the given offset for the context of the message.
    %
:- pred fail_with_message(string::in, int::in, src::in, T::out,
    ps::in, ps::out) is semidet.

%--------------------------------------------------%
%--------------------------------------------------%
60 parsing_utils