Source code for sir.querying

# Copyright (c) 2014, 2015 Lukas Lalinsky, Wieland Hoffmann
# License: MIT, see LICENSE for details
import logging


from sqlalchemy import func
from sqlalchemy.orm.attributes import InstrumentedAttribute
from sqlalchemy.orm.interfaces import ONETOMANY, MANYTOONE
from sqlalchemy.orm.properties import RelationshipProperty

logger = logging.getLogger("sir")


[docs]def iterate_path_values(path, obj):
    """
    Return an iterator over all values for `path` on `obj`, an instance of
    a :ref:`declarative <sqla:declarative_toplevel>` class by first splitting
    the path into its elements by splitting it on the dots, resulting in a list
    of path elements. Then, for each element, a call to :func:`getattr` is made
    - the arguments will be the current model (which initially is the **model**
    assigned to the :class:`~sir.schema.searchentities.SearchEntity`) and the
    current path element. After doing this, there are two cases:

    1. The path element is not the last one in the path. In this case, the
       :func:`getattr` call returns one or more objects of another model which
       will replace the current one.

    2. The path element is the last one in the path. In this case, the value
       returned by the :func:`getattr` call will be returned and added to the
       list of values for this field.

    To give an example, lets presume the object we're starting with is an
    instance of :class:`~mbdata.models.Artist` and the path is
    "begin_area.name". The first :func:`getattr` call will be::

      getattr(obj, "begin_area")

    which returns an :class:`~mbdata.models.Area` object, on which the call::

      getattr(obj, "name")

    will return the final value::

        >>> from mbdata.models import Artist, Area
        >>> artist = Artist(name="Johan de Meij")
        >>> area = Area(name="Netherlands")
        >>> artist.begin_area = area
        >>> list(iterate_path_values("begin_area.name", artist))
        ['Netherlands']

    One-to-many relationships will of course be handled as well::

        >>> from mbdata.models import Recording, ISRC
        >>> recording = Recording(name="Fortuna Imperatrix Mundi: O Fortuna")
        >>> recording.isrcs.append(ISRC(isrc="DEF056730100"))
        >>> recording.isrcs.append(ISRC(isrc="DEF056730101"))
        >>> list(iterate_path_values("isrcs.isrc", recording))
        ['DEF056730100', 'DEF056730101']

    """
    if obj is None:
        return

    if '.' in path:
        pathelem, rest = path.split('.', 1)
    else:
        pathelem, rest = path, None

    column = getattr(obj.__class__, pathelem)
    if isinstance(column, InstrumentedAttribute):
        prop = column.property
        if isinstance(column.property, RelationshipProperty):
            if prop.direction == ONETOMANY:
                for sub_obj in getattr(obj, pathelem):
                    for val in iterate_path_values(rest, sub_obj):
                        yield val
            elif prop.direction == MANYTOONE:
                for val in iterate_path_values(rest, getattr(obj, pathelem)):
                    yield val
        else:
            yield getattr(obj, pathelem)
    else:
        yield getattr(obj, pathelem)


[docs]def iter_bounds(db_session, column, batch_size, importlimit):
    """
    Return a list of (lower bound, upper bound) tuples which contain row ids to
    iterate through a table in batches of ``batch_size``. If ``importlimit`` is
    greater than zero, return only enough tuples to contain ``importlimit``
    rows. The second element of the last tuple in the returned list may be
    ``None``. This happens if the last batch will contain less than
    ``batch_size`` rows.

    :param sqlalchemy.orm.session.Session db_session:
    :param sqlalchemy.Column column:
    :param int batch_size:
    :param int importlimit:
    :rtype: [(int, int)]
    """
    q = db_session.query(
        column,
        func.row_number().
        over(order_by=column).
        label('rownum')
    ).\
        from_self(column)

    if batch_size > 1:
        q = q.filter("rownum %% %d=1" % batch_size)

    if importlimit:
        q = q.filter("rownum <= %d" % (importlimit))

    intervals = [id for id in q]
    bounds = []

    while intervals:
        start = intervals.pop(0)[0]
        if intervals:
            end = intervals[0][0]
        elif importlimit:
            # If there's an importlimit, just add a noop bound. This way,
            # :func:`sir.indexing.index_entity` doesn't require any
            # information about the limit
            end = start
        else:
            end = None
        bounds.append((start, end))

    return bounds