Next: , Previous: parser, Up: Top


47 parsing_utils

     %--------------------------------------------------%
     % vim: ft=mercury ts=4 sw=4 et wm=0 tw=0
     %--------------------------------------------------%
     % Copyright (C) 2009-2012 The University of Melbourne.
     % This file may only be copied under the terms of the GNU Library General
     % Public License - see the file COPYING.LIB in the Mercury distribution.
     %--------------------------------------------------%
     %
     % File: parsing_utils.m
     % Authors: Ralph Becket <rafe@csse.unimelb.edu.au>, maclarty
     % Stability: low
     %
     % Utilities for recursive descent parsers.  Parsers take at least three
     % arguments: a source (src) containing the input string and a parser state (ps)
     % input/output pair tracking the current offset into the input.
     %
     % Call parse(InputString, SkipWS, Parser, Result) to parse an input string
     % and return an error context and message if parsing failed.
     % The SkipWS predicate is used by the primitive parsers to skip over any
     % following whitespace (providing a skipping predicate allows users to define
     % comments as whitespace).
     % Alternatively a new src and ps can be constructed by calling
     % new_src_and_ps(InputString, SkipWS, Src, !:PS).
     % Parsing predicates are semidet and typically take the form
     % p(...parameters..., Src, Result, !PS).  A parser matching variable
     % assignments of the form `x = 42' might be defined like this:
     %
     %   var_assignment(Src, {Var, Value}, !PS) :-
     %       var(Src, Var, !PS),
     %       punct("=", Src, _, !PS),
     %       expr(Src, Value, !PS).
     %
     % where var/4 and expr/4 are parsers for variables and expressions
     % respectively and punct/5 is provided by this module for matching
     % punctuation.
     %
     %--------------------------------------------------%
     %--------------------------------------------------%
     
     :- module parsing_utils.
     :- interface.
     
     :- import_module char.
     :- import_module list.
     :- import_module maybe.
     :- import_module unit.
     
     %--------------------------------------------------%
     
         % The parser source (input string).
         %
     :- type src.
     
         % The parser "state", passed around in DCG arguments.
         %
     :- type ps.
     
         % These types and insts are useful for specifying "standard" parser
         % signatures.
         %
     :- type parser(T) == pred(src, T, ps, ps).
     :- inst parser == ( pred(in, out, in, out) is semidet ).
     
         % The following are for parsers that also transform a separate state value.
         %
     :- type parser_with_state(T, S) == pred(src, T, S, S, ps, ps).
     :- inst parser_with_state == ( pred(in, out, in, out, in, out) is semidet ).
     
         % Predicates of this type are used to skip whitespace in the primitive
         % parsers provided by this module.
         %
     :- type skip_whitespace_pred == parser(unit).
     
     :- type parse_result(T)
         --->    ok(T)
                 % Parsing succeeded; the argument is the parsed value.
         ;       error(
                     % Parsing failed.  The message is the one passed to
                     % fail_with_message when parse/4 reports one (see the
                     % description of parse/4); otherwise it is `no'.
                     error_message :: maybe(string),
                     % The error context.  NOTE(review): presumably the line
                     % and column of the failure position in the input --
                     % confirm against the implementation.
                     error_line    :: int,
                     error_col     :: int
                 ).
     
         % parse(Input, SkipWS, Parser, Result).
         % Try to parse Input using Parser and SkipWS to consume whitespace.
         % If Parser succeeds then return ok with the parsed value,
         % otherwise return error.  If there were any calls to fail_with_message
         % without any subsequent progress being made, then the error message
         % passed to the last call to fail_with_message will be returned in the
         % error result.  Otherwise no message is returned and the furthest
         % position the parser got in the input string is returned.
         %
     :- pred parse(string::in, skip_whitespace_pred::in(parser),
         parser(T)::in(parser), parse_result(T)::out) is cc_multi.
     
         % As above but using the default whitespace parser, whitespace/4.
         %
     :- pred parse(string::in, parser(T)::in(parser), parse_result(T)::out)
         is cc_multi.
     
         % Construct a new parser source and state from a string, also specifying
         % a predicate for skipping over whitespace (several primitive parsers
         % use this predicate to consume whitespace after a token; this argument
         % allows the user to specify a predicate for, say, skipping over comments
         % as well).
         %
     :- pred new_src_and_ps(string::in, skip_whitespace_pred::in(parser),
         src::out, ps::out) is det.
     
         % Construct a new parser source and state from a string.
         % The default whitespace parser, whitespace/4, is used.
         %
     :- pred new_src_and_ps(string::in, src::out, ps::out) is det.
     
         % Return the input string and its length from the parser source.
         %
     :- pred input_string(src::in, string::out, int::out) is det.
     
         % Return the parser to skip over whitespace from the parser source.
         %
     :- pred get_skip_whitespace_pred(src::in, skip_whitespace_pred::out(parser))
         is det.
     
         % Obtain the current offset from the start of the input string
         % (the first character in the input has offset 0).
         %
     :- pred current_offset(src::in, int::out, ps::in, ps::out) is det.
     
         % Compute a structure from the parser source which can be used to
         % convert offsets into line numbers and positions in the file (this
         % is useful for error reporting).
         %
     :- type line_numbers.
     
     :- func src_to_line_numbers(src) = line_numbers.
     
         % Convert an offset into a line number and position within the line
         % (the first line is number 1; the first character in a line is
         % position 1).
         %
     :- pred offset_to_line_number_and_position(line_numbers::in, int::in,
         int::out, int::out) is det.
     
         % input_substring(Src, StartOffset, EndOffsetPlusOne, Substring):
         % Copy the substring from the input occupying the offsets
         % [StartOffset, EndOffsetPlusOne).
         %
     :- pred input_substring(src::in, int::in, int::in, string::out) is semidet.
     
         % Read the next char.
         %
     :- pred next_char(src::in, char::out, ps::in, ps::out) is semidet.
     
         % Read the next char but do not record progress information.
         % This is more efficient than next_char, but may produce less informative
         % error messages in case of a parse error.
         %
     :- pred next_char_no_progress(src::in, char::out, ps::in, ps::out) is semidet.
     
         % Match a char from the given string.
         %
     :- pred char_in_class(string::in, src::in, char::out,
         ps::in, ps::out) is semidet.
     
         % Match a string exactly and any subsequent whitespace.
         %
     :- pred punct(string::in, src::in, unit::out, ps::in, ps::out) is semidet.
     
         % keyword(IdChars, Keyword, Src, _, !PS) matches Keyword exactly (i.e., it
         % must not be followed by any character in IdChars) and any subsequent
         % whitespace.
         %
     :- pred keyword(string::in, string::in, src::in, unit::out,
         ps::in, ps::out) is semidet.
     
         % ikeyword(IdChars, Keyword, Src, _, !PS)
         % Case-insensitive version of keyword/6.
         % Only upper and lowercase unaccented Latin letters are treated specially.
         %
     :- pred ikeyword(string::in, string::in, src::in, unit::out,
         ps::in, ps::out) is semidet.
     
         % identifier(InitIdChars, IdChars, Src, Identifier, !PS) matches the next
         % identifier (result in Identifier) comprising a char from InitIdChars
         % followed by zero or more chars from IdChars.  Any subsequent whitespace
         % is consumed.
         %
     :- pred identifier(string::in, string::in, src::in, string::out,
         ps::in, ps::out) is semidet.
     
         % Consume any whitespace (defined as a sequence of characters
         % satisfying char.is_whitespace).
         %
     :- pred whitespace(src::in, unit::out,
         ps::in, ps::out) is semidet.
     
         % Consume any input up to, and including, the next newline character
         % marking the end of the current line.
         %
     :- pred skip_to_eol(src::in, unit::out,
         ps::in, ps::out) is semidet.
     
         % Succeed if we have reached the end of the input.
         %
     :- pred eof(src::in, unit::out, ps::in, ps::out) is semidet.
     
         % Parse a float literal matching [-][0-9]+[.][0-9]+([Ee][-+][0-9]+)?
         % followed by any whitespace.  The float_literal_as_string version simply
         % returns the matched string.  The float_literal version uses
         % string.to_float to convert the output of float_literal_as_string; this
         % may return an approximate answer since not all floating point numbers
         % can be perfectly represented as Mercury floats.
         %
     :- pred float_literal_as_string(src::in, string::out,
         ps::in, ps::out) is semidet.
     :- pred float_literal(src::in, float::out,
         ps::in, ps::out) is semidet.
     
         % Parse an int literal matching [-][0-9]+, not followed by [.][0-9]+,
         % followed by any whitespace.  The int_literal_as_string version simply
         % returns the matched string.  The int_literal version uses string.to_int
         % to convert the output of int_literal_as_string; this may fail if the
         % number in question cannot be represented as a Mercury int.
         %
     :- pred int_literal_as_string(src::in, string::out,
         ps::in, ps::out) is semidet.
     :- pred int_literal(src::in, int::out,
         ps::in, ps::out) is semidet.
     
         % Parse a string literal.  The char argument is the quote character.
         % A backslash (\) character in the string makes the next character
         % literal (e.g., for embedding quotes).  These 'escaped' characters
         % are included as-is in the result, along with the preceding backslash.
         % Any following whitespace is also consumed.
         %
     :- pred string_literal(char::in, src::in, string::out,
         ps::in, ps::out) is semidet.
     
         % optional(P, Src, Result, !PS) returns Result = yes(X), if P(Src, X, !PS),
         % or Result = no if P does not succeed.
         %
     :- pred optional(parser(T)::in(parser), src::in, maybe(T)::out,
         ps::in, ps::out) is semidet.
     
         % zero_or_more(P, Src, Xs, !PS) returns the list of results Xs obtained
         % by repeatedly applying P until P fails.  The nth item in Xs is
         % the result from the nth application of P.
         %
     :- pred zero_or_more(parser(T)::in(parser), src::in, list(T)::out,
         ps::in, ps::out) is semidet.
     
         % one_or_more(P, Src, Xs, !PS) returns the list of results Xs obtained
         % by repeatedly applying P until P fails.  The nth item in Xs is
         % the result from the nth application of P.  P must succeed at
         % least once.
         %
     :- pred one_or_more(parser(T)::in(parser), src::in, list(T)::out,
         ps::in, ps::out) is semidet.
     
         % brackets(L, R, P, Src, X, !PS) is equivalent to
         %   punct(L, Src, _, !PS), P(Src, X, !PS), punct(R, Src, _, !PS).
         %
     :- pred brackets(string::in, string::in, parser(T)::in(parser), src::in,
         T::out, ps::in, ps::out) is semidet.
     
         % separated_list(Separator, P, Src, Xs, !PS) is like
         % zero_or_more(P, Src, Xs, !PS) except that successive applications of
         % P must be separated by punct(Separator, Src, _, !PS).
         %
     :- pred separated_list(string::in, parser(T)::in(parser), src::in,
         list(T)::out, ps::in, ps::out) is semidet.
     
         % comma_separated_list(P, Src, Xs, !PS) is the same as
         %   separated_list(",", P, Src, Xs, !PS).
         %
     :- pred comma_separated_list(parser(T)::in(parser), src::in, list(T)::out,
         ps::in, ps::out) is semidet.
     
         % Declaratively this predicate is equivalent to false.  Operationally
         % it will record an error message that will be returned by parse/4
         % if no further progress is made and then fail.
         %
     :- pred fail_with_message(string::in, src::in, T::out, ps::in, ps::out)
         is semidet.
     
         % As above, but use the given offset for the context of the message.
         %
     :- pred fail_with_message(string::in, int::in, src::in, T::out,
         ps::in, ps::out) is semidet.
     
     % The following parser combinators are equivalent to the above, except that
     % a separate state argument is threaded through the computation (e.g., for
     % parsers that incrementally construct a symbol table).
     
         % optional(P, Src, Result, !S, !PS) returns Result = yes(X),
         % if P(Src, X, !S, !PS), or Result = no if P does not succeed.
         %
     :- pred optional(parser_with_state(T, S)::in(parser_with_state), src::in,
         maybe(T)::out, S::in, S::out, ps::in, ps::out) is semidet.
     
         % zero_or_more(P, Src, Xs, !S, !PS) returns the list of results Xs obtained
         % by repeatedly applying P until P fails.  The nth item in Xs is
         % the result from the nth application of P.
         %
     :- pred zero_or_more(parser_with_state(T, S)::in(parser_with_state), src::in,
         list(T)::out, S::in, S::out, ps::in, ps::out) is semidet.
     
         % one_or_more(P, Src, Xs, !S, !PS) returns the list of results Xs obtained
         % by repeatedly applying P until P fails.  The nth item in Xs is
         % the result from the nth application of P.  P must succeed at
         % least once.
         %
     :- pred one_or_more(parser_with_state(T, S)::in(parser_with_state), src::in,
         list(T)::out, S::in, S::out, ps::in, ps::out) is semidet.
     
         % brackets(L, R, P, Src, X, !S, !PS) is equivalent to
         %   punct(L, Src, _, !PS), P(Src, X, !S, !PS), punct(R, Src, _, !PS).
         %
     :- pred brackets(string::in, string::in,
         parser_with_state(T, S)::in(parser_with_state), src::in,
         T::out, S::in, S::out, ps::in, ps::out) is semidet.
     
         % separated_list(Separator, P, Src, Xs, !S, !PS) is like
         % zero_or_more(P, Src, Xs, !S, !PS) except that successive applications of
         % P must be separated by punct(Separator, Src, _, !PS).
         %
     :- pred separated_list(string::in,
         parser_with_state(T, S)::in(parser_with_state),
         src::in, list(T)::out, S::in, S::out, ps::in, ps::out) is semidet.
     
         % comma_separated_list(P, Src, Xs, !S, !PS) is the same as
         %   separated_list(",", P, Src, Xs, !S, !PS).
         %
     :- pred comma_separated_list(parser_with_state(T, S)::in(parser_with_state),
         src::in, list(T)::out, S::in, S::out, ps::in, ps::out) is semidet.
     
     %--------------------------------------------------%
     %--------------------------------------------------%