Next: mercury_term_parser, Previous: maybe, Up: Top [Contents]
%--------------------------------------------------%
% vim: ft=mercury ts=4 sw=4 et
%--------------------------------------------------%
% Copyright (C) 1993-2000, 2003-2008, 2011-2012 The University of Melbourne.
% Copyright (C) 2014-2018 The Mercury team.
% This file is distributed under the terms specified in COPYING.LIB.
%--------------------------------------------------%
%
% File: mercury_term_lexer.m.
% Main author: fjh.
% Stability: high.
%
% Lexical analysis. This module defines the representation of tokens
% and exports predicates for reading in tokens from an input stream.
%
% See ISO Prolog 6.4. Also see the comments at the top of
% mercury_term_parser.m.
%
%--------------------------------------------------%
%--------------------------------------------------%
:- module mercury_term_lexer.
:- interface.
:- import_module char.
:- import_module io.
:- import_module integer.
%--------------------------------------------------%
% The tokens that the lexer produces. Each constructor below is one
% kind of token; the trailing comments show the corresponding input text
% where that is a fixed piece of punctuation.
%
:- type token
---> name(string)
; variable(string)
; integer(integer_base, integer, signedness, integer_size)
; float(float)
; string(string) % "...."
; implementation_defined(string) % $name
; open % '('
; open_ct % '(' without any preceding whitespace
; close % ')'
; open_list % '['
; close_list % ']'
; open_curly % '{'
; close_curly % '}'
; ht_sep % '|'
; comma % ','
; end % '.'
; junk(char) % junk character in the input stream
; error(string) % some other invalid token
; io_error(io.error) % error reading from the input stream
; eof % end-of-file
; integer_dot(integer).
% The lexer will never return integer_dot. This token is used
% internally in the lexer, to keep the grammar LL(1) so that
% only one character of pushback is needed. But the lexer will
% convert integer_dot/1 tokens to integer/4 tokens before
% returning them.
% The base of an integer token: 2, 8, 10 or 16.
% This is the first argument of the integer/4 token.
%
:- type integer_base
---> base_2
; base_8
; base_10
; base_16.
% Whether an integer token is signed or unsigned.
% This is the third argument of the integer/4 token.
%
:- type signedness
---> signed
; unsigned.
% The size of an integer token.
% This is the fourth argument of the integer/4 token.
%
% NOTE(review): size_word presumably denotes the default, word-sized
% integer as opposed to the fixed-width sizes -- confirm against the
% implementation.
%
:- type integer_size
---> size_word
; size_8_bit
; size_16_bit
; size_32_bit
; size_64_bit.
% For every token, we record the line number of the line on
% which the token occurred. Each token_cons cell in a token_list
% carries one token_context alongside its token.
%
:- type token_context == int. % line number
% A list of tokens, each stored together with its token_context.
% token_cons(Token, Context, Rest) holds one token, its context, and
% the rest of the list; token_nil is the empty list.
%
% This "fat list" representation is more efficient than a list of pairs.
%
:- type token_list
---> token_cons(token, token_context, token_list)
; token_nil.
% A line_context and a line_posn together contain exactly the same
% fields as a posn, with the same semantics. The difference is that
% stepping past a single character requires no memory allocation
% whatsoever *unless* that character is a newline.
%
% XXX We should consider making both fields of line_context into uint32s,
% to allow them to fit into a single 64-bit word. Simplicity would then
% require line_posn's argument to be a uint32 as well.
%
:- type line_context
---> line_context(
% The number of the current line.
line_context_current_line_number :: int,
% The offset at which the current line starts.
line_context_offset_of_start_of_line :: int
).
% The current offset in the file. This is the part of the position
% that is not held in a line_context; see the comment on line_context.
%
:- type line_posn
---> line_posn(
line_posn_current_offset_in_file :: int
).
% get_token_list(Tokens, !IO):
% get_token_list(Stream, Tokens, !IO):
%
% Read a list of tokens either from the current input stream
% (the first form) or from the specified input stream (the second form).
% Keep reading until we encounter either an `end' token
% (i.e. a full stop followed by whitespace) or the end-of-file.
%
% See `char.is_whitespace' for the definition of whitespace characters
% used by this predicate.
%
:- pred get_token_list(token_list::out, io::di, io::uo) is det.
:- pred get_token_list(io.text_input_stream::in, token_list::out,
io::di, io::uo) is det.
% The type `offset' represents a (zero-based) offset into a string.
% It is the type of the MaxOffset argument of string_get_token_list_max
% and linestr_get_token_list_max.
%
:- type offset == int.
% string_get_token_list_max(String, MaxOffset, Tokens,
% InitialPos, FinalPos):
% linestr_get_token_list_max(String, MaxOffset, Tokens,
% InitialLineContext, FinalLineContext, InitialPos, FinalPos):
%
% Scan a list of tokens from a string, starting at the current offset
% specified by InitialPos. Keep scanning until we either encounter
% an `end' token (i.e. a full stop followed by whitespace) or
% reach MaxOffset. (MaxOffset must be =< the length of the string.)
% Return the tokens scanned in Tokens, and return the position one
% character past the end of the last token in FinalPos.
%
% The two predicates differ in how they represent the position:
% string_get_token_list_max uses a posn, while
% linestr_get_token_list_max uses a line_context and line_posn pair.
%
% See `char.is_whitespace' for the definition of whitespace characters
% used by this predicate.
%
:- pred string_get_token_list_max(string::in, offset::in, token_list::out,
posn::in, posn::out) is det.
:- pred linestr_get_token_list_max(string::in, offset::in, token_list::out,
line_context::in, line_context::out, line_posn::in, line_posn::out) is det.
% string_get_token_list(String, Tokens, InitialPos, FinalPos):
%
% Calls string_get_token_list_max above with MaxOffset = length of String.
%
:- pred string_get_token_list(string::in, token_list::out,
posn::in, posn::out) is det.
% token_to_string(Token, String):
%
% Convert a token to a human-readable string describing the token.
%
:- pred token_to_string(token::in, string::out) is det.
%--------------------------------------------------%
%--------------------------------------------------%
Next: mercury_term_parser, Previous: maybe, Up: Top [Contents]