#!/usr/bin/env python

"""
Parsing of textual content.

Copyright (C) 2014, 2015, 2016 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

import codecs
import re

# Parsing of lines to obtain functions and arguments.

line_pattern_str = r"(?:" \
                   r"(?:'(.*?)')" \
                   r"|" \
                   r'(?:"(.*?)")' \
                   r"|" \
                   r"([^\s]+)" \
                   r")+" \
                   r"(?:\s+|$)"
line_pattern = re.compile(line_pattern_str)

# A single component of a word: a single-quoted section, a double-quoted section
# or a run of non-whitespace characters. These are the same alternatives as
# those repeated within the line pattern above.

_component_pattern_str = r"(?:'(.*?)')" \
                         r"|" \
                         r'(?:"(.*?)")' \
                         r"|" \
                         r"([^\s]+)"
_component_pattern = re.compile(_component_pattern_str)

def parse_line(text):

    """
    Parse the given 'text', returning a list of words separated by whitespace in
    the input, where whitespace may occur inside words if quoted using single or
    double quotes.

    A word may consist of several adjacent components, each being a quoted
    section or a run of non-whitespace characters. The quotes delimiting a
    quoted section are removed and the components are joined in the order in
    which they appear in the input.

    Quoted sections are only recognised at the start of a word or directly after
    another quoted section. Quote characters found elsewhere, or lacking a
    closing quote on the same line, are treated as ordinary characters.
    """

    parts = []

    # Match each complete word together with any trailing whitespace.

    for match in line_pattern.finditer(text):

        # Visit the components of the word individually. The groups of the
        # word-level match cannot be combined directly: a repeated group only
        # retains its final capture, and group order is not input order.

        components = []

        for component in _component_pattern.finditer(match.group()):

            # Only one alternative, and thus one group, participates in each
            # component match; the others are None.

            components.append("".join([g or "" for g in component.groups()]))

        parts.append("".join(components))

    return parts

# Parsing of tabular files.

def set_defaults(t, empty_defaults):

    """
    Apply the 'empty_defaults', a collection of (index, value) entries, to the
    list 't': wherever the indexed position is absent or holds an empty (false)
    value, the given default value is stored there instead.

    The list 't' is modified in place, being padded with None where it is too
    short to provide an indexed position, and is also returned.
    """

    for position, fallback in empty_defaults:

        # Extend the list so that the position exists.

        shortfall = position + 1 - len(t)
        if shortfall > 0:
            t += [None] * shortfall

        # Keep any existing non-empty value.

        if t[position]:
            continue

        t[position] = fallback

    return t

def get_table(filename, empty_defaults=None, tab_separated=True):

    """
    Open the file having the given 'filename', decoding it as UTF-8, and return
    a list of tuples representing its contents, one tuple for each line.

    The 'empty_defaults' is a list of (index, value) tuples indicating the
    default value where a column either does not exist or provides an empty
    value.

    If 'tab_separated' is specified and is a false value, line parsing using
    the imiptools.text.parse_line function will be performed instead of
    splitting each line of the file using tab characters as separators.
    """

    # The file is closed on leaving the block, even if reading fails.

    with codecs.open(filename, "rb", encoding="utf-8") as stream:
        return get_table_from_stream(stream, empty_defaults, tab_separated)

def get_table_from_stream(f, empty_defaults=None, tab_separated=True):

    """
    Read all lines from the stream 'f' and return a list of tuples, one for
    each line, with each tuple holding the columns found in that line.

    The 'empty_defaults' is a list of (index, value) tuples indicating the
    default value where a column either does not exist or provides an empty
    value.

    If 'tab_separated' is specified and is a false value, line parsing using
    the imiptools.text.parse_line function will be performed instead of
    splitting each line of the file using tab characters as separators.
    """

    # Choose how each line is to be divided into columns.

    if tab_separated:
        split = lambda s: s.split("\t")
    else:
        split = parse_line

    rows = []

    for raw in f.readlines():

        # Remove spaces and line endings, but not tabs, from both ends so that
        # empty leading or trailing tab-separated columns are retained.

        columns = split(raw.strip(" \r\n"))

        if empty_defaults:
            columns = set_defaults(columns, empty_defaults)

        rows.append(tuple(columns))

    return rows

# vim: tabstop=4 expandtab shiftwidth=4
