Source code for onetick.py.sources

import datetime
import datetime as dt
import inspect
import operator
import os
import sys
import warnings
import io
import math
import string

from functools import partial
from typing import Optional, Union, Type, Iterable

import onetick.py as otp
import onetick.query as otq
import pandas as pd

import onetick.py.core._source
import onetick.py.functions
import onetick.py.db._inspection
from onetick.py.core._internal._param_column import _ParamColumn
from onetick.py.core._source._symbol_param_column import _SymbolParamColumn
from onetick.py.core._source._symbol_param_source import _SymbolParamSource
from onetick.py.core._source.tmp_otq import TmpOtq
from onetick.py.core.column import _Column
from onetick.py.core.eval_query import _QueryEvalWrapper
from onetick.py.core.source import Source, _Source  # _Source for backward compatibility
from onetick.py.core.column_operations.base import Raw

from . import types as ott
from . import utils, configuration
from .core import _csv_inspector, query_inspector
from .core.column_operations._methods.methods import is_arithmetical
from .core.column_operations.base import _Operation
from .db.db import DB
from .db._inspection import DB as inspect_DB
from .compatibility import is_supported_bucket_units_for_tick_generator

from .aggregations.order_book import (
    OB_SNAPSHOT_DOC_PARAMS, OB_SNAPSHOT_WIDE_DOC_PARAMS, OB_SNAPSHOT_FLAT_DOC_PARAMS
)
from .aggregations._docs import _bucket_interval_doc, _bucket_time_doc
from onetick.py.docs.utils import docstring, param_doc


_QUERY_PARAM_SPECIAL_CHARACTERS = "=,"

AdaptiveTickType = Union[str, Type[utils.adaptive]]


def update_node_tick_type(node: "Source", tick_type: AdaptiveTickType, db: Optional[str] = None):
    """Update node tick_type according to db name and tick_type.

    The tick type is left unchanged if ``tick_type`` is adaptive.

    Parameters
    ----------
    node: Source
        node to set tick_type on
    tick_type: AdaptiveTickType
        string tick type or :py:class:`onetick.py.adaptive`
    db: Optional[str]
        optional db name
    """
    # do not change tick type for adaptive `tick_type`
    if not isinstance(tick_type, type) and tick_type is not utils.adaptive:
        if db:
            node.tick_type(db + "::" + tick_type)
        else:
            node.tick_type(tick_type)
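
# A minimal usage sketch for `update_node_tick_type`; the field definition and database
# name are assumptions chosen for illustration, mirroring how the sources below call it:
#
#     src = Source(otq.TickGenerator(fields="long A=1"))
#     update_node_tick_type(src, "TICK_GENERATOR", db="LOCAL")  # sets "LOCAL::TICK_GENERATOR"
#     update_node_tick_type(src, "TICK_GENERATOR")              # sets "TICK_GENERATOR" (no db prefix)
#     update_node_tick_type(src, utils.adaptive)                # leaves the node's tick type unchanged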


[docs]class Tick(Source): @docstring(parameters=[_bucket_interval_doc, _bucket_time_doc], add_self=True) def __init__( self, offset=0, offset_part='millisecond', time: ott.datetime = None, timezone_for_time=None, symbol=utils.adaptive_to_default, db=utils.adaptive_to_default, start=utils.adaptive, end=utils.adaptive, tick_type: Optional[AdaptiveTickType] = None, bucket_time: str = "start", bucket_interval: int = 0, bucket_units: str = utils.adaptive, **kwargs, ): """ Generate single tick object Parameters ---------- offset: int, default=0 tick timestamp offset from query start time in `offset_part` offset_part: one of [nanosecond, millisecond, second, minute, hour, day, dayofyear, weekday, week, month, quarter, year], default=millisecond #noqa unit of time to calculate ``offset`` from. time: :py:class:`otp.datetime <onetick.py.datetime>` fixed time to set to all ticks. Note that this time should be inside time interval set by ``start`` and ``end`` parameters or by query time range. timezone_for_time: str timezone of the ``time`` symbol: str, list of str, :class:`Source`, :class:`query`, :py:func:`eval query <onetick.py.eval>` Symbol(s) from which data should be taken. db: str Database to use for tick generation start: :py:class:`otp.datetime <onetick.py.datetime>` start time for tick generation. By default the start time of the query will be used. end: :py:class:`otp.datetime <onetick.py.datetime>` end time for tick generation. By default the end time of the query will be used. tick_type: str Special tick_type `TICK_GENERATOR` will be used by default. You can use :py:class:`onetick.py.adaptive` for the value if you want to use sink node tick type instead of defining your own. bucket_units: 'seconds', 'days' or 'months' Unit for value in ``bucket_interval``. Default is 'seconds'. kwargs: dictionary of columns names with their values. See also -------- | **TICK_GENERATOR** OneTick event processor | :py:class:`otp.Ticks <onetick.py.Ticks>` Examples -------- Simple usage, generate single tick: >>> t = otp.Tick(A=1, B='string', C=3.14, D=otp.dt(2000, 1, 1, 1, 1, 1, 1)) >>> otp.run(t) Time A B C D 0 2003-12-01 1 string 3.14 2000-01-01 01:01:01.000001 Generate one tick for each day in a week: >>> t = otp.Tick(A=1, start=otp.dt(2023, 1, 1), end=otp.dt(2023, 1, 8), bucket_interval=24 * 60 * 60) >>> otp.run(t) Time A 0 2023-01-01 1 1 2023-01-02 1 2 2023-01-03 1 3 2023-01-04 1 4 2023-01-05 1 5 2023-01-06 1 6 2023-01-07 1 Generate tick every hour and add 1 minute offset to ticks' timestamps: >>> t = otp.Tick(A=1, offset=1, offset_part='minute', bucket_interval=60 * 60) >>> t.head(5) Time A 0 2003-12-01 00:01:00 1 1 2003-12-01 01:01:00 1 2 2003-12-01 02:01:00 1 3 2003-12-01 03:01:00 1 4 2003-12-01 04:01:00 1 Generate tick every hour and set fixed time: >>> t = otp.Tick(A=1, time=otp.dt(2023, 1, 2, 3, 4, 5, 6), bucket_interval=60 * 60, ... 
start=otp.dt(2023, 1, 1), end=otp.dt(2023, 1, 8)) >>> t.head(5) Time A 0 2023-01-02 03:04:05.000006 1 1 2023-01-02 03:04:05.000006 1 2 2023-01-02 03:04:05.000006 1 3 2023-01-02 03:04:05.000006 1 4 2023-01-02 03:04:05.000006 1 """ if self._try_default_constructor(**kwargs): return if len(kwargs) == 0: raise ValueError("It is not allowed to have a tick without fields") if time is not None and offset != 0: raise ValueError("It's not allowed to set parameter 'datetime' and set non-zero offset at the same time") bucket_time = self._get_bucket_time(bucket_time) if tick_type is None: tick_type = "TICK_GENERATOR" columns = {} for key, value in kwargs.items(): # the way to skip a field if value is None: continue if inspect.isclass(value): raise TypeError(f"Tick constructor expects values but not types, {value}") else: value_type = ott.get_object_type(value) if value_type is str: if isinstance(value, _Column) or is_arithmetical(value): if value.dtype is not str: value_type = value.dtype elif len(value) > ott.string.DEFAULT_LENGTH: value_type = ott.string[len(value)] if value_type is bool: value_type = float if issubclass(value_type, (ott.datetime, ott.date, dt.datetime, dt.date, pd.Timestamp)): value_type = ott.nsectime columns[key] = value_type super().__init__( _symbols=symbol, _start=start, _end=end, _base_ep_func=lambda: self.base_ep(db=db, tick_type=tick_type, offset=offset, offset_part=offset_part, time=time, timezone_for_time=timezone_for_time, columns=columns, bucket_time=bucket_time, bucket_interval=bucket_interval, bucket_units=bucket_units, **kwargs), **columns, ) def base_ep(self, db=utils.adaptive_to_default, tick_type="TICK_GENERATOR", offset=0, offset_part='millisecond', time=None, timezone_for_time=None, columns=None, bucket_time="start", bucket_interval=0, bucket_units=utils.adaptive, **kwargs): if columns is None: columns = {} if db is utils.adaptive_to_default: # if default database is not set, tick type will be set without it # and symbols will have to be specified in otp.run db = configuration.config.get('default_db') params = ",".join( ott.type2str(columns[key]) + " " + str(key) + "=" + ott.value2str(value) for key, value in kwargs.items() if value is not None ) tick_generator_kwargs = {} if bucket_units is not utils.adaptive and is_supported_bucket_units_for_tick_generator(throw_warning=True): tick_generator_kwargs['bucket_interval_units'] = bucket_units.upper() src = Source( otq.TickGenerator( bucket_interval=bucket_interval, bucket_time=bucket_time, fields=params, **tick_generator_kwargs, ), **columns ) update_node_tick_type(src, tick_type, db) # TIMESTAMP += offset will add redundant nodes to sort the timestamps. # No sorting needed for a single tick. if offset: src.sink(otq.UpdateField(field="TIMESTAMP", value=f"dateadd('{offset_part}', {offset}, TIMESTAMP, _TIMEZONE)")) elif time: src.sink(otq.UpdateField(field="TIMESTAMP", value=ott.datetime2expr(time, timezone_naive=timezone_for_time))) return src @staticmethod def _get_bucket_time(bucket_time): if bucket_time == "BUCKET_START": warnings.warn("BUCKET_START value is deprecated. Please, use 'start' instead", DeprecationWarning) elif bucket_time == "BUCKET_END": warnings.warn("BUCKET_END value is deprecated. Please, use 'end' instead", DeprecationWarning) elif bucket_time == "start": bucket_time = "BUCKET_START" elif bucket_time == "end": bucket_time = "BUCKET_END" else: raise ValueError(f"Only 'start' and 'end' values supported as bucket time, but you've passed {bucket_time}") return bucket_time
[docs]def Ticks(data=None, symbol=utils.adaptive_to_default, db=utils.adaptive_to_default, start=utils.adaptive, end=utils.adaptive, tick_type: Optional[AdaptiveTickType] = None, timezone_for_time=None, **inplace_data): """ Data source that generates ticks. Ticks are placed with the 1 millisecond offset from each other starting from the start of the query interval. It has ability to change `distance` between ticks using the special reserved field name ``offset``, that specify time offset from a previous tick. Parameters ---------- data: dict, list or pandas.DataFrame, optional Ticks values * ``dict`` -- <field_name>: <values> * ``list`` -- [[<field_names>], [<first_tick_values>], ..., [<n_tick_values>]] * :pandas:`DataFrame <pandas.DataFrame>` -- DataFrame with ``Time`` column * ``None`` -- ``inplace_data`` will be used symbol: str, list of str, :class:`Source`, :class:`query`, :py:func:`eval query <onetick.py.eval>` Symbol(s) from which data should be taken. db: str Database to use for tick generation start, end: :py:class:`datetime.datetime`, :py:class:`otp.datetime <onetick.py.datetime>`, \ :py:class:`onetick.py.adaptive` Timestamp for data generation tick_type: str tick type for data generation timezone_for_time: str timezone for data generation **inplace_data: list <field_name>: list(<field_values>) See also -------- | **TICK_GENERATOR** OneTick event processor | :py:class:`otp.Tick <onetick.py.Tick>` Examples -------- Pass data in ``dict`` >>> d = otp.Ticks({'A': [1, 2, 3], 'B': [4, 5, 6]}) >>> otp.run(d) Time A B 0 2003-12-01 00:00:00.000 1 4 1 2003-12-01 00:00:00.001 2 5 2 2003-12-01 00:00:00.002 3 6 Pass ``inplace_data`` >>> d = otp.Ticks(A=[1, 2, 3], B=[4, 5, 6]) >>> otp.run(d) Time A B 0 2003-12-01 00:00:00.000 1 4 1 2003-12-01 00:00:00.001 2 5 2 2003-12-01 00:00:00.002 3 6 Pass data in ``list`` >>> d = otp.Ticks([['A', 'B'], ... [1, 4], ... [2, 5], ... [3, 6]]) >>> otp.run(d) Time A B 0 2003-12-01 00:00:00.000 1 4 1 2003-12-01 00:00:00.001 2 5 2 2003-12-01 00:00:00.002 3 6 Using the ``offset`` example >>> data = otp.Ticks(X=[1, 2, 3], offset=[0, otp.Nano(1), 1]) >>> otp.run(data) Time X 0 2003-12-01 00:00:00.000000000 1 1 2003-12-01 00:00:00.000000001 2 2 2003-12-01 00:00:00.001000000 3 Using pandas.DataFrame >>> start_datetime = datetime(2023, 1, 1, 12) >>> time_array = [start_datetime + otp.Hour(1) + otp.Nano(1)] >>> a_array = [start_datetime - otp.Day(15) - otp.Nano(7)] >>> df = pd.DataFrame({'Time': time_array,'A': a_array}) >>> data = otp.Ticks(df) >>> otp.run(data, start=start_datetime, end=start_datetime + otp.Day(1)) Time A 0 2023-01-01 13:00:00.000000001 2022-12-17 11:59:59.999999993 """ if tick_type is None: tick_type = "TICK_GENERATOR" if db is utils.adaptive_to_default: db = configuration.config.get('default_db') if isinstance(data, pd.DataFrame): if 'Time' not in data.columns: raise ValueError('Field `Time` is required for constructing an `otp.Source` from `pandas.DataFrame`') data = data.rename(columns={"Time": "time"}) data = data.to_dict('list') if data and len(inplace_data) != 0: raise ValueError("Data can be passed only using either the `data` parameter " "or inplace through the key-value args") if isinstance(data, list): reform = {} for inx, key in enumerate(data[0]): reform[key] = [sub_list[inx] for sub_list in data[1:]] data = reform if data is None: if inplace_data: data = inplace_data else: raise ValueError("You don't specify any date to create ticks from. 
" "Please, use otp.Empty for creating empty data source") else: data = data.copy() value_len = -1 for key, value in data.items(): if value_len == -1: value_len = len(value) else: if value_len != len(value): # TODO: write test to cover that case raise ValueError( f"It is not allowed to have different columns of different lengths, " f"some of columns have {value_len} length, but column '{key}', as instance, has {len(value)}" ) use_absolute_time = False if "offset" in data: if "time" in data: raise ValueError("You cannot specify offset and time at the same time") else: if "time" in data: use_absolute_time = True else: data["offset"] = list(range(value_len)) if not use_absolute_time: offset_values = [] offset_parts = [] for ofv in data['offset']: if isinstance(ofv, ott.offsets.Tick): offset_values.append(ofv.n) offset_parts.append(str(ofv.datepart)[1:-1]) else: offset_values.append(ofv) offset_parts.append('millisecond') data['offset'] = offset_values data['offset_part'] = offset_parts if value_len == 1: columns = {key: value[0] for key, value in data.items()} return Tick(db=db, symbol=symbol, tick_type=tick_type, start=start, end=end, timezone_for_time=timezone_for_time, **columns) else: # select only columns that do not contain None there to support # heterogeneous data not_none_columns = [] for key in data.keys(): data[key] = [float(elem) if isinstance(elem, bool) else elem for elem in data[key]] for key, value in data.items(): add = True for v in value: # we need it, because can't use _Column instances in if-clauses if isinstance(v, _Column): continue if v is None: add = False break if add: not_none_columns.append(key) # if a field depends on a symbol parameter, it cannot be csv'd (it's dynamic) # likewise for otq parameters # if there's a better way to check whether a value is constant, # will be glad to hear about it is_outside_data_dependent = False for key, value in data.items(): for v in value: str_rep = str(v) if ("_SYMBOL_NAME" in str_rep) or ("_SYMBOL_PARAM" in str_rep) or ("$" in str_rep): is_outside_data_dependent = True break # infinity() and (on windows) nan() cannot be natively read from a csv has_special_values = False for key, value in data.items(): for v in value: if isinstance(v, ott._inf) or \ (isinstance(v, ott._nan) or isinstance(v, float) and math.isnan(v)) \ and sys.platform.startswith("win"): has_special_values = True break if (len(not_none_columns) == len(data)) and (not is_outside_data_dependent) and (not has_special_values): # Data is homogenous; CSV backing can be used return _DataCSV(data, value_len, db=db, symbol=symbol, tick_type=tick_type, start=start, end=end, timezone_for_time=timezone_for_time, use_absolute_time=use_absolute_time) else: # Fallback is a merge of individual ticks ticks = [] for inx in range(value_len): columns = {key: value[inx] for key, value in data.items()} ticks.append(Tick(db=db, symbol=symbol, tick_type=tick_type, start=start, end=end, timezone_for_time=timezone_for_time, **columns)) return onetick.py.functions.merge(ticks, align_schema=not_none_columns)
class _DataCSV(Source): def __init__( self, data=None, length=None, db=utils.adaptive_to_default, symbol=utils.adaptive_to_default, tick_type=None, start=utils.adaptive, end=utils.adaptive, use_absolute_time=False, timezone_for_time=None, **kwargs, ): if self._try_default_constructor(**kwargs): return if data is None or length is None: raise ValueError("'data' and 'length' parameters can't be None") if db is utils.adaptive_to_default: db = configuration.config.get('default_db') def datetime_to_expr(v): if ott.is_time_type(v): return ott.datetime2expr(v, timezone_naive=timezone_for_time) if isinstance(v, ott.nsectime): # TODO: change to ott.value2str after PY-441 return f'NSECTIME({v})' if isinstance(v, ott.msectime): return ott.value2str(v) raise ValueError(f"Can't convert value {v} to datetime expression") if use_absolute_time: # converting values of "time" column to onetick expressions converted_times = [] for d in data["time"]: converted_times.append(datetime_to_expr(d)) data["time"] = converted_times def csv_rep(value): if issubclass(type(value), str): return '"' + value.replace("\\", "\\\\").replace('"', '\\"') + '"' else: return str(value) def get_type_of_column(key): def get_type_of_value(value): t = ott.get_object_type(value) if ott.is_time_type(t): return ott.nsectime elif t is str: if len(value) <= ott.string.DEFAULT_LENGTH: return str else: return ott.string[len(value)] else: return t types = [get_type_of_value(v) for v in data[key]] res, _ = utils.get_type_that_includes(types) return res columns = {key: get_type_of_column(key) for key in data} expression_columns = [] header_columns = {} for key in list(columns): header_columns[key] = columns[key] # converting values of datetime columns to onetick expressions if columns[key] is ott.nsectime: data[key] = [datetime_to_expr(v) for v in data[key]] header_columns[key] = get_type_of_column(key) expression_columns.append(key) transposed_data = [[csv_rep(value[i]) for key, value in data.items()] for i in range(length)] text_header = ",".join(f"{ott.type2str(v)} {k}" for k, v in header_columns.items()) text_data = "\n".join([",".join(data_row) for data_row in transposed_data]) if use_absolute_time: del columns["time"] else: del columns["offset"] del columns["offset_part"] super().__init__( _symbols=symbol, _start=start, _end=end, _base_ep_func=lambda: self.base_ep(columns=columns, db=db, tick_type=tick_type, use_absolute_time=use_absolute_time, text_header=text_header, text_data=text_data, expression_columns=expression_columns), **columns, ) def base_ep(self, columns, db, tick_type, use_absolute_time, text_header, text_data, expression_columns=None): node = Source( otq.CsvFileListing( discard_timestamp_column=True, time_assignment="_START_TIME", field_delimiters="','", quote_chars='"""', handle_escaped_chars=True, file_contents=text_data, first_line_is_title=False, fields=text_header, ), **columns, ) update_node_tick_type(node, tick_type, db) if use_absolute_time: # don't trust UpdateField node.sink(otq.AddField(field='____TMP____', value="EVAL_EXPRESSION(time, 'datetime')")) node.sink(otq.UpdateField(field="TIMESTAMP", value="____TMP____")) node.sink(otq.Passthrough(fields="time,____TMP____", drop_fields="True")) node.sink(otq.OrderBy(order_by="TIMESTAMP ASC")) else: node.sink(otq.OrderBy(order_by="offset ASC")) node.sink(otq.UpdateField(field="TIMESTAMP", value="dateadd(offset_part, offset, TIMESTAMP, _TIMEZONE)")) node.sink(otq.Passthrough(fields="offset,offset_part", drop_fields="True")) node.sink(otq.OrderBy(order_by="TIMESTAMP 
ASC")) for column in expression_columns or []: # don't trust UpdateField node.sink(otq.RenameFields(f'{column}=____TMP____')) node.sink(otq.AddField(field=column, value="EVAL_EXPRESSION(____TMP____, 'datetime')")) node.sink(otq.Passthrough(fields='____TMP____', drop_fields=True)) node.sink(otq.Table(keep_input_fields=True, fields=', '.join(f'nsectime {column}' for column in expression_columns))) return node def TTicks(data): """ .. deprecated:: 1.3.101 Transposed Ticks format. Parameters ---------- data: list list of list, where the first sublist is the header, and other are values """ warnings.warn("The nice and helpful function `TTicks` is going to be deprecated. " "You could use the `Ticks` to pass data in the same format there", DeprecationWarning) dt = {} for inx, key in enumerate(data[0]): dt[key] = [sub_list[inx] for sub_list in data[1:]] return Ticks(dt)
[docs]class Empty(Source): """ Empty data source Parameters ---------- db: str Name of the database from which to take schema. symbol: str, list of str, :class:`Source`, :class:`query`, :py:func:`eval query <onetick.py.eval>` Symbol(s) from which data should be taken. tick_type: str, Name of the tick_type from which to take schema. start, end: :py:class:`datetime.datetime`, :py:class:`otp.datetime <onetick.py.datetime>`, \ :py:class:`onetick.py.adaptive` Time interval from which the data should be taken. schema: schema to use in case db and/or tick_type are not set Examples -------- We can define schema: >>> data = otp.Empty(A=str, B=int) >>> otp.run(data) Empty DataFrame Columns: [] Index: [] >>> data.columns() {'A': <class 'str'>, 'B': <class 'int'>, 'TIMESTAMP': <class 'onetick.py.types.nsectime'>, '_START_TIME': <class 'onetick.py.types.nsectime'>, '_END_TIME': <class 'onetick.py.types.nsectime'>, '_SYMBOL_NAME': <class 'str'>, '_DBNAME': <class 'str'>, '_TICK_TYPE': <class 'str'>, '_TIMEZONE': <class 'str'>} Or we can get schema from the database: >>> data = otp.Empty(db='SOME_DB', tick_type='TT') >>> data.columns() {'X': <class 'int'>, 'TIMESTAMP': <class 'onetick.py.types.nsectime'>, '_START_TIME': <class 'onetick.py.types.nsectime'>, '_END_TIME': <class 'onetick.py.types.nsectime'>, '_SYMBOL_NAME': <class 'str'>, '_DBNAME': <class 'str'>, '_TICK_TYPE': <class 'str'>, '_TIMEZONE': <class 'str'>} """ def __init__( self, db=utils.adaptive_to_default, symbol=utils.adaptive_to_default, tick_type=None, start=utils.adaptive, end=utils.adaptive, **schema, ): if self._try_default_constructor(**schema): return columns = {} if tick_type and db != configuration.config.get('default_db') and db is not utils.adaptive_to_default: try: db_obj = onetick.py.db._inspection.DB(db) params = {'tick_type': tick_type} if end is not utils.adaptive: params['end'] = end columns = db_obj.schema(**params) except Exception: pass # do not raise an exception if no data found, because it is empty _source and does not matter else: columns = schema super().__init__( _symbols=symbol, _start=start, _end=end, _base_ep_func=lambda: self.base_ep(db), **columns ) def base_ep(self, db): if db is utils.adaptive_to_default: db = configuration.config.get('default_db') src = Source(otq.TickGenerator(fields="long ___NOTHING___=0")) if db is None: src.tick_type('TICK_GENERATOR') else: src.tick_type(db + "::TICK_GENERATOR") return src
[docs]def CSV( filepath_or_buffer=None, timestamp_name: Union[str, None] = "Time", first_line_is_title: bool = True, names: Union[list, None] = None, dtype: dict = None, converters: dict = None, order_ticks=False, drop_index=True, change_date_to=None, auto_increase_timestamps=True, db='LOCAL', field_delimiter=',', handle_escaped_chars=False, quote_char='"', **kwargs, ): """ Construct source based on CSV file. There are several steps determining column types. 1. Initially, all column treated as ``str``. 2. If column name in CSV title have format ``type COLUMNNAME``, it will change type from ``str`` to specified type. 3. All column type are determined automatically from its data. 4. You could override determined types in ``dtype`` argument explicitly. 5. ``converters`` argument is applied after ``dtype`` and could also change column type. NOTE: Double quotes are not supported in CSV files for escaping quotes in strings, you should use escape character ``\\`` before the quote instead, for example: ``"I'm a string with a \\"quotes\\" inside"``. And then set `handle_escaped_chars=True`. Parameters ---------- filepath_or_buffer: str, os.PathLike, FileBuffer, optional Path to CSV file or :class:`file buffer <FileBuffer>`. If None value is taken through symbol. When taken from symbol, symbol must have ``LOCAL::`` prefix. In that case you should set the columns otherwise schema will be empty. timestamp_name: str, default "Time" Name of TIMESTAMP column used for ticks. Used only if it is exists in CSV columns, otherwise ignored. Output data will be sorted by this column. first_line_is_title: bool Use first line of CSV file as a source for column names and types. If CSV file is started with # symbol, this parameter **must** be ``True``. - If ``True``, column names are inferred from the first line of the file, it is not allowed to have empty name for any column. - If ``False``, first line is processed as data, column names will be COLUMN_1, ..., COLUMN_N. You could specify column names in ``names`` argument. names: list, optional List of column names to use, or None. Length must be equal to columns number in file. Duplicates in this list are not allowed. dtype: dict, optional Data type for columns, as dict of pairs {column_name: type}. Will convert column type from ``str`` to specified type, before applying converters. converters: dict, optional Dict of functions for converting values in certain columns. Keys are column names. Function must be valid callable with ``onetick.py`` syntax, example:: converters={ "time_number": lambda c: c.apply(otp.nsectime), "stock": lambda c: c.str.lower(), } Converters applied *after* ``dtype`` conversion. order_ticks: bool, optional If ``True`` and ``timestamp_name`` column are used, then source will order tick by time. Note, that if ``False`` and ticks are not ordered in sequence, then OneTick will raise Exception in runtime. drop_index: bool, optional if ``True`` and 'Index' column is in the csv file then this column will be removed. change_date_to: datetime, date, optional change date from a timestamp column to a specific date. Default is None, means not changing timestamp column. auto_increase_timestamps: bool, optional Only used if provided CSV file does not have a TIMESTAMP column. If ``True``, timestamps of loaded ticks would start at ``start_time`` and on each next tick, would increase by 1 millisecond. 
If ``False``, timestamps of all loaded ticks would be equal to ``start_time`` db: str, optional Name of a database to define a destination where the csv file will be transported for processing. ``LOCAL`` is default value that means OneTick will process it on the site where a query runs. field_delimiter: str, optional A character that is used to tokenize each line of the CSV file. For a tab character \t (back-slash followed by t) should be specified. handle_escaped_chars: bool, optional If set, the backslash char ``\\`` gets a special meaning and everywhere in the input text the combinations ``\\'``, ``\\"`` and ``\\\\`` are changed correspondingly by ``'``, ``"`` and ``\\``, which are processed then as regular chars. Besides, combinations like ``\\x??``, where ?-s are hexadecimal digits (0-9, a-f or A-F), are changed by the chars with the specified ASCII code. For example, ``\\x0A`` will be replaced by a newline character, ``\\x09`` will be replaced by tab, and so on. Default: False quote_char: str Character used to denote the start and end of a quoted item. Quoted items can include the delimiter, and it will be ignored. The same character cannot be marked both as the quote character and as the field delimiter. Besides, space characters cannot be used as quote. Default: " (double quotes) See also -------- **CSV_FILE_LISTING** OneTick event processor Examples -------- Simple CSV file reading >>> data = otp.CSV(os.path.join(csv_path, "data.csv")) >>> otp.run(data) Time time_number px side 0 2003-12-01 00:00:00.000 1656690986953602371 30.89 Buy 1 2003-12-01 00:00:00.001 1656667706281508365 682.88 Buy Read CSV file and get timestamp for ticks from specific field. You need to specify query start/end interval including all ticks. >>> data = otp.CSV(os.path.join(csv_path, "data.csv"), ... timestamp_name="time_number", ... converters={"time_number": lambda c: c.apply(otp.nsectime)}, ... start=otp.dt(2010, 8, 1), ... end=otp.dt(2022, 9, 2)) >>> otp.run(data) Time px side 0 2022-07-01 05:28:26.281508365 682.88 Buy 1 2022-07-01 11:56:26.953602371 30.89 Buy Path to csv can be passed via symbol with `LOCAL::` prefix: >>> data = otp.CSV() >>> otp.run(data, symbols=f"LOCAL::{os.path.join(csv_path, 'data.csv')}") Time time_number px side 0 2003-12-01 00:00:00.000 1656690986953602371 30.89 Buy 1 2003-12-01 00:00:00.001 1656667706281508365 682.88 Buy Field delimiters can be set via ``field_delimiters`` parameter: >>> data = otp.CSV(os.path.join(csv_path, 'data_diff_delimiters.csv'), ... field_delimiter=' ', ... first_line_is_title=False) >>> otp.run(data) Time COLUMN_0 COLUMN_1 0 2003-12-01 00:00:00.000 1,2 3 1 2003-12-01 00:00:00.001 4 5,6 Quote char can be set via ``quote_char`` parameter: >>> data = otp.CSV(os.path.join(csv_path, 'data_diff_quote_chars.csv'), ... quote_char="'", ... first_line_is_title=False) >>> otp.run(data) Time COLUMN_0 COLUMN_1 0 2003-12-01 00:00:00.000 1,"2 3" 1 2003-12-01 00:00:00.001 "1 2",3 """ csv_source = _CSV( filepath_or_buffer=filepath_or_buffer, timestamp_name=timestamp_name, first_line_is_title=first_line_is_title, names=names, dtype=dtype, converters=converters, order_ticks=order_ticks, drop_index=drop_index, change_date_to=change_date_to, auto_increase_timestamps=auto_increase_timestamps, db=db, field_delimiter=field_delimiter, handle_escaped_chars=handle_escaped_chars, quote_char=quote_char, **kwargs, ) csv_source = csv_source.sort(csv_source['Time']) return otp.merge([csv_source, otp.Empty(db=db)])
class _CSV(Source): _PROPERTIES = Source._PROPERTIES + [ "_dtype", "_names", "_columns", "_forced_title", "_default_types", "_has_time", "_to_drop", "_start", "_end", "_ep_fields", "_symbols", "_field_delimiter", "_converters", "_order_ticks", "_auto_increase_timestamps", "_db", "_drop_index", "_change_date_to", "_timestamp_name", "_filepath_or_buffer", "_first_line_is_title", "_handle_escaped_chars", "_quote_char", ] def __init__(self, filepath_or_buffer=None, timestamp_name: Union[str, None] = "Time", first_line_is_title: bool = True, names: Union[list, None] = None, dtype: dict = None, converters: dict = None, order_ticks=False, drop_index=True, change_date_to=None, auto_increase_timestamps=True, db='LOCAL', field_delimiter=',', handle_escaped_chars=False, quote_char='"', **kwargs): self._dtype = dtype or {} self._names = names self._converters = converters or {} if (len(field_delimiter) != 1 and field_delimiter != '\t') or field_delimiter == '"' or field_delimiter == "'": raise ValueError(f'`field_delimiter` can be single character (except quotes) ' f'or "\t" but "{field_delimiter}" was passed') self._field_delimiter = field_delimiter if len(quote_char) > 1: raise ValueError(f'quote_char should be single char but `{quote_char}` was passed') if self._field_delimiter == quote_char: raise ValueError(f'`{self._field_delimiter}` is both field_delimiter and quote_char') if quote_char in string.whitespace: raise ValueError('Whitespace can not be a quote_char') self._quote_char = quote_char self._order_ticks = order_ticks self._auto_increase_timestamps = auto_increase_timestamps self._db = db self._drop_index = drop_index self._change_date_to = change_date_to self._timestamp_name = timestamp_name self._filepath_or_buffer = filepath_or_buffer self._first_line_is_title = first_line_is_title self._handle_escaped_chars = handle_escaped_chars if self._try_default_constructor(**kwargs): return if self._filepath_or_buffer is not None and not isinstance(self._filepath_or_buffer, _SymbolParamSource): self._columns, self._default_types, self._forced_title, self._symbols = self._parse_file() else: self._filepath_or_buffer = None names = self._names or [] self._columns = {name: str for name in names} self._default_types = {} # we don't know it is actually forced, but otherwise we would ignore the first not commented-out line self._forced_title = self._first_line_is_title self._symbols = None self._check_time_column() for t in self._dtype: if t not in self._columns: raise ValueError(f"dtype '{t}' not found in columns list") self._columns[t] = self._dtype[t] self._ep_fields = ",".join( f'{ott.type2str(dtype)} {column}' if issubclass(dtype, otp.string) else column for column, dtype in self._columns.items() ) self._to_drop = self._get_to_drop() self._has_time, self._start, self._end = self._get_start_end(**kwargs) super().__init__( _symbols=self._symbols, _start=self._start, _end=self._end, _base_ep_func=self.base_ep, **self._columns, ) # fake run converters to set proper schema if self._converters: for column, converter in self._converters.items(): self.schema[column] = converter(self[column]).dtype if self._has_time and self._timestamp_name in self.schema: if self.schema[self._timestamp_name] not in [ott.nsectime, ott.msectime]: raise ValueError(f"CSV converter for {self._timestamp_name} is converting to " f"{self.schema[timestamp_name]} type, but expected resulted type is " f"ott.msectime or ott.nsectime") # remove timestamp_name column, if we use it as TIMESTAMP source if self._has_time and 
self._timestamp_name != "Time": del self[self._timestamp_name] def _check_time_column(self): if "TIMESTAMP" in self._columns: raise ValueError( "It is not allowed to have 'TIMESTAMP' columns, because it is reserved name in OneTick" ) if "Time" in self._columns and self._timestamp_name != "Time": raise ValueError( "It is not allowed to have 'Time' column not used as timestamp field." ) def _get_to_drop(self): to_drop = [] if "TICK_STATUS" in self._columns: del self._columns["TICK_STATUS"] to_drop.append("TICK_STATUS") if "Index" in self._columns and self._drop_index: del self._columns["Index"] to_drop.append("Index") return to_drop def _get_start_end(self, **kwargs): start = kwargs.get("start", utils.adaptive) end = kwargs.get("end", utils.adaptive) has_time = False if self._timestamp_name in self._columns: has_time = True # remove to resolve exception in Source.__init__ if self._timestamp_name == "Time": del self._columns["Time"] # redefine start/end time for change_date_to if self._change_date_to: start = dt.datetime(self._change_date_to.year, self._change_date_to.month, self._change_date_to.day) end = ott.next_day(start) return has_time, start, end def _parse_file(self): """ This function finds the file and get columns names, default types and checks if first line is title via pandas. Is also sets the correct value for symbols. """ obj_to_inspect = self._filepath_or_buffer if isinstance(obj_to_inspect, utils.FileBuffer): obj_to_inspect = io.StringIO(obj_to_inspect.get()) if isinstance(obj_to_inspect, str) and not os.path.exists(obj_to_inspect): # if not found, probably, CSV file is located in OneTick CSV_FILE_PATH, check it for inspect_by_pandas() csv_paths = otp.utils.get_config_param(os.environ["ONE_TICK_CONFIG"], "CSV_FILE_PATH", default="") if csv_paths: for csv_path in csv_paths.split(","): csv_path = os.path.join(csv_path, obj_to_inspect) if os.path.exists(csv_path): obj_to_inspect = csv_path break columns, default_types, forced_title = _csv_inspector.inspect_by_pandas( obj_to_inspect, self._first_line_is_title, self._names, self._field_delimiter, self._quote_char, ) if isinstance(self._filepath_or_buffer, utils.FileBuffer): symbols = 'DUMMY' else: # str, because there might passed an os.PathLike object symbols = str(obj_to_inspect) return columns, default_types, forced_title, symbols def base_ep(self): # initialize Source and set schema to columns. file_contents = '' if isinstance(self._filepath_or_buffer, utils.FileBuffer): file_contents = self._filepath_or_buffer.get() csv = Source( otq.CsvFileListing( field_delimiters=f"'{self._field_delimiter}'", time_assignment="_START_TIME", # we use EP's first_line_is_title only when file path is passed through symbol # otherwise we don't use EP's first_line_is_title, because EP raise error on empty column name, # and we explicitly define name for such columns in FIELDS arg. # but if first line started with # (forced_title=True), then this param ignored :( first_line_is_title=self._filepath_or_buffer is None and self._first_line_is_title, fields=self._ep_fields, file_contents=file_contents, handle_escaped_chars=self._handle_escaped_chars, quote_chars=f"'{self._quote_char}'", ), **self._columns, ) if self._first_line_is_title and not self._forced_title: # remove first line with titles for columns. 
csv.sink(otq.DeclareStateVariables(variables="long __TICK_INDEX=0")) csv.sink(otq.PerTickScript("STATE::__TICK_INDEX = STATE::__TICK_INDEX + 1;")) csv.sink(otq.WhereClause(discard_on_match=False, where="STATE::__TICK_INDEX > 1")) # set tick type to ANY csv.tick_type(f"{self._db}::ANY") # check whether need to update types, because if column type is not specified in header # then by default column has string type in OneTick update_columns = {} for name, dtype in self._columns.items(): if not issubclass(dtype, str) and name not in self._default_types: update_columns[name] = dtype for name, dtype in update_columns.items(): if dtype is int: # BE-142 - workaround for converting string to int # OneTick first convert string to float, and then to int, which leeds to losing precision csv.sink(otq.AddField(field=f"_TMP_{name}", value="atol(" + name + ")")) csv.sink(otq.Passthrough(fields=name, drop_fields=True)) csv.sink(otq.AddField(field=f"{name}", value=f"_TMP_{name}")) csv.sink(otq.Passthrough(fields=f"_TMP_{name}", drop_fields=True)) elif dtype is float: csv.sink(otq.UpdateField(field=name, value="atof(" + name + ")")) elif dtype is ott.msectime: csv.sink(otq.UpdateField(field=name, value='"1970/01/01 00:00:00.000"', where=name + '=""')) csv.sink(otq.UpdateField(field=name, value=f'parse_time("%Y/%m/%d %H:%M:%S.%q",{name},_TIMEZONE)')) elif dtype is ott.nsectime: csv.sink(otq.UpdateField(field=name, value='"1970/1/1 00:00:00.000"', where=name + '=""')) csv.sink(otq.UpdateField(field=name, value=f'parse_nsectime("%Y/%m/%d %H:%M:%S.%J",{name},_TIMEZONE)')) else: raise TypeError(f"Unsupported type '{dtype}'") # run converters if self._converters: for column, converter in self._converters.items(): if csv[column].dtype is not otp.nsectime and converter(csv[column]).dtype is otp.nsectime: # workaround for resolve bug on column type changing: # https://onemarketdata.atlassian.net/browse/PY-416 csv[f'_T_{name}'] = converter(csv[column]) del csv[column] csv[column] = csv[f'_T_{name}'] del csv[f'_T_{name}'] else: csv[column] = converter(csv[column]) if self._has_time: # if timestamp_name column is defined in the csv, then apply tick time adjustment if self._timestamp_name in self._converters: # we assume that if timestamp_name field in converters, # then it is already converted to otp.dt csv.sink( otq.UpdateField( field="TIMESTAMP", value=self._timestamp_name, allow_unordered_output_times=True, ) ) else: if self._change_date_to: self._change_date_to = self._change_date_to.strftime("%Y/%m/%d") csv.sink(otq.UpdateField(field="Time", value=f'"{self._change_date_to}" + substr({self._timestamp_name}, 10)')) # by default we parse timestamp_name into TIMESTAMP field # from typical/default Time format from OneTick dump csv.sink( otq.UpdateField( field="TIMESTAMP", value=f'parse_nsectime("%Y/%m/%d %H:%M:%S.%J", {self._timestamp_name}, _TIMEZONE)', allow_unordered_output_times=True, ) ) # drop source timestamp_name field in favor of new TIMESTAMP field self._to_drop.append(self._timestamp_name) elif self._auto_increase_timestamps: # default time for ticks are increasing from 0 csv.sink(otq.DeclareStateVariables(variables="long __TIMESTAMP_INC__ = 0")) csv.sink(otq.UpdateField( field="TIMESTAMP", value='DATEADD("millisecond",STATE::__TIMESTAMP_INC__,TIMESTAMP,_TIMEZONE)')) csv.sink(otq.UpdateField(field="STATE::__TIMESTAMP_INC__", value="STATE::__TIMESTAMP_INC__ + 1")) if self._order_ticks: csv.sort('TIMESTAMP', inplace=True) if self._to_drop: csv.sink(otq.Passthrough(fields=",".join(self._to_drop), 
drop_fields="True")) return csv class Trades(Source): """ Trade source object. add 'PRICE' and 'SIZE' fields to schema """ def __init__(self, db=utils.adaptive_to_default, symbol=utils.adaptive, date=None, start=utils.adaptive, end=utils.adaptive, **kwargs): if db is utils.adaptive_to_default: db = configuration.config.default_db if date: start, end = date.start, date.end super().__init__( _symbols=symbol, _start=start, _end=end, _base_ep_func=lambda: self.base_ep(db), **kwargs ) self.schema['PRICE'] = float self.schema['SIZE'] = int def base_ep(self, db): db = str(db) src = Source(otq.Passthrough(fields="SYMBOL_NAME,TICK_TYPE", drop_fields=True)) src.tick_type(db + "::TRD") return src class Quotes(Source): def __init__(self, db=utils.adaptive_to_default, symbol=utils.adaptive, start=utils.adaptive, end=utils.adaptive, **kwargs): if db is utils.adaptive_to_default: db = configuration.config.default_db super().__init__( _symbols=symbol, _start=start, _end=end, _base_ep_func=lambda: self.base_ep(db), **kwargs ) self.schema['ASK_PRICE'] = float self.schema['BID_PRICE'] = float self.schema['ASK_SIZE'] = int self.schema['BID_SIZE'] = int def base_ep(self, db): db = str(db) src = Source(otq.Passthrough(fields="SYMBOL_NAME,TICK_TYPE", drop_fields=True)) src.tick_type(db + "::QTE") return src class NBBO(Source): def __init__(self, db="TAQ_NBBO", symbol=utils.adaptive, start=utils.adaptive, end=utils.adaptive, **kwargs): super().__init__( _symbols=symbol, _start=start, _end=end, _base_ep_func=lambda: self.base_ep(db), **kwargs ) self.schema['ASK_PRICE'] = float self.schema['BID_PRICE'] = float self.schema['ASK_SIZE'] = int self.schema['BID_SIZE'] = int def base_ep(self, db): db = str(db) src = Source(otq.Passthrough(fields="SYMBOL_NAME,TICK_TYPE", drop_fields=True)) src.tick_type(db + "::NBBO") return src
class Query(Source):

    def __init__(
        self,
        query_object=None,
        out_pin=utils.adaptive,
        symbol=utils.adaptive,
        start=utils.adaptive,
        end=utils.adaptive,
        params=None,
        **kwargs,
    ):
        """
        Create data source object from .otq file or query object

        Parameters
        ----------
        query_object: path or :class:`query`
            query to use as a data source
        out_pin: str
            query output pin name
        symbol: str, list of str, :class:`Source`, :class:`query`, :py:func:`eval query <onetick.py.eval>`
            Symbol(s) from which data should be taken.
        start, end : :py:class:`datetime.datetime`, :py:class:`otp.datetime <onetick.py.datetime>` or utils.adaptive
            Time interval from which the data should be taken.
        params: dict
            params to pass to query. Only applicable to string ``query_object``
        """
        if self._try_default_constructor(**kwargs):
            return

        if params is None:
            params = {}

        # Ignore because of the "Only @runtime_checkable protocols can be used with instance and class checks"
        if isinstance(query_object, (str, os.PathLike)):  # type: ignore
            query_object = query(str(query_object), **params)
        elif isinstance(query_object, query):
            if len(params) > 0:
                raise ValueError("Cannot pass both params and a query() (not str) query_object parameter")
        else:
            raise ValueError("query_object parameter has to be either a str (path to the query) or a query object")

        if symbol == utils.adaptive:
            if not query_object.graph_info.has_unbound_sources:
                symbol = None

        super().__init__(
            _symbols=symbol,
            _start=start,
            _end=end,
            _base_ep_func=lambda: self.base_ep(query_object, out_pin),
            **kwargs
        )

    def base_ep(self, query, out_pin):
        nested = otq.NestedOtq(query.path, query.str_params)
        graph = query.graph_info

        if out_pin is utils.adaptive:
            if len(graph.nested_outputs) == 1:
                return Source(nested[graph.nested_outputs[0].NESTED_OUTPUT])
            elif len(graph.nested_outputs) > 1:
                raise Exception(
                    f'Query "{query.query_name}" has multiple outputs, but you have not '
                    "specified which one should be used. You could specify it"
                    ' using "out_pin" parameter of the Query constructor.'
                )
            else:
                # no output
                return Source(nested, _has_output=False)
        else:
            existed_out_pins = set(map(operator.attrgetter("NESTED_OUTPUT"), graph.nested_outputs))
            if out_pin not in existed_out_pins:
                raise Exception(
                    f'Query "{query.query_name}" does not have the "{out_pin}" output, there are only following '
                    f"output pins exist: {','.join(existed_out_pins)}"
                )
            return Source(nested[out_pin])
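
# A minimal usage sketch for `Query` (assuming it is exposed as `otp.Query`; the path,
# output pin and parameter names are placeholders for illustration):
#
#     src = otp.Query('/otqs/some.otq::some_query', out_pin='OUT', params={'PARAM1': 'val1'})
#     df = otp.run(src, start=otp.dt(2022, 1, 1), end=otp.dt(2022, 1, 2))
#
# In contrast, the `query` class below only describes an .otq query and its parameters;
# it is the object `Query` builds internally when a string path is passed.
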
class query:
    """
    Constructs a query object with a certain path. Keyword arguments specify query parameters.
    You also can pass an instance of ``otp.query.config`` class as the second positional
    argument to specify a query.

    Parameters
    ----------
    path : str
        path to an .otq file. If path is relative, then it's assumed that file is located
        in one of the directories specified in OneTick ``OTQ_FILE_PATH`` configuration variable.
        If there are more than one query in the file, then its name should be specified
        in the format ``<path>::<query-name>``.
        Also prefix ``remote://<database-name>::`` can be used to specify if query is located
        on the remote server.
    config: optional
        ``otp.query.config`` object.
    params:
        parameters for the query.

    Raises
    ------
    ValueError, TypeError

    Examples
    --------
    >>> otp.query('/otqs/some.otq::some_query', PARAM1='val1', PARAM2=3.14)  # doctest: +SKIP

    >>> otp.query('remote://DATABASE::/otqs/some.otq::some_query', PARAM1='val1', PARAM2=3.14)  # doctest: +SKIP
    """

    class config:
        """
        The config allows to specify different query options.
        """

        special_values = {"input"}

        def __init__(self, output_columns=None):
            """
            Parameters
            ----------
            output_columns : str, list, dict, optional
                The parameter defines what the outputs columns are.

                Default value is ``None`` that means no output fields after applying query
                for every output pin.

                The ``input`` value means that output columns are the same as inputs
                for every output pin.

                A list of tuples allows to define output columns with their types;
                for example [('x', int), ('y', float), ...]. Applicable for every output pin.

                A dict allows to specify output columns for every output pin.

            Raises
            ------
            TypeError, ValueError
            """
            if output_columns is not None:
                if isinstance(output_columns, list):
                    self.validate_columns(output_columns)
                elif isinstance(output_columns, dict):
                    for pin, columns in output_columns.items():
                        if not isinstance(pin, str):
                            raise TypeError(f"Name of pin '{type(pin)}' is of non-str type '%s'")
                        else:
                            self.validate_columns(columns)
                elif not isinstance(output_columns, str):
                    raise TypeError(f'"output_columns" does not support value of the "{type(output_columns)}" type')

                if isinstance(output_columns, str):
                    if output_columns not in self.special_values:
                        raise ValueError(f'Config does not support "{output_columns}" value')

            self.output_columns = output_columns

        def validate_list_item(self, item):
            if isinstance(item, str):
                if item not in self.special_values:
                    raise ValueError(f"Value {item} is not supported.")
            else:
                if not isinstance(item, (tuple, list)) or (len(item) != 2) or not isinstance(item[0], str):
                    raise TypeError("Value %s is not a name-type tuple.")

        def validate_columns(self, columns):
            if isinstance(columns, str):
                if columns not in self.special_values:
                    raise ValueError(f"A pin has invalid output columns definition: '{columns}'")
            elif isinstance(columns, list):
                if columns.count("input") > 1:
                    raise ValueError(f"More than one 'input' value in {columns}")
                for item in columns:
                    self.validate_list_item(item)
            else:
                raise TypeError(f"A pin's columns definition is of unsupported type '{type(columns)}'")

        def get_output_columns_for_pin(self, out_pin_name):
            if isinstance(self.output_columns, dict):
                if out_pin_name not in self.output_columns:
                    raise ValueError(f"Pin {out_pin_name} wasn't declared in the config")
                else:
                    return self.output_columns[out_pin_name]
            else:
                return self.output_columns

        def apply(self, out_pin_name, src):
            """
            Applying specified logic on a certain object.
            Used internally in the functions.apply_query
            """
            columns_descriptor = self.get_output_columns_for_pin(out_pin_name)

            if columns_descriptor is None:
                # drop columns by default, because we don't know
                # how an external query changes data schema
                src.drop_columns()
            elif columns_descriptor == "input":
                pass
            else:
                if "input" not in columns_descriptor:
                    src.drop_columns()
                for item in columns_descriptor:
                    if item != "input":
                        src[item]
    def __init__(self, path, *config, **params):
        path = str(path)
        if path.startswith('remote://'):
            self.path = path
            remote, path = path.split('::', maxsplit=1)
        else:
            self.path = f"remote://{configuration.config.get('default_db', 'LOCAL')}::" + path

        self.query_path, self.query_name = utils.query_to_path_and_name(path)

        # if query_path does not exist, then we try
        # to resolve it with OTQ_PATH assuming that
        # a relative path is passed
        if not os.path.exists(self.query_path):
            otq_path = utils.get_config_param(os.environ["ONE_TICK_CONFIG"], "OTQ_FILE_PATH", "")
            self.query_path = utils.abspath_to_query_by_otq_path(otq_path, self.query_path)

        if self.query_name is None:
            # it seems that query name was not passed, then try to find it
            queries = query_inspector.get_queries(self.query_path)
            if len(queries) > 1:
                raise Exception(f"{self.query_path} has more than one query, "
                                f"but you have not specified which one to use.")
            self.query_name = queries[0]

        # prepare parameters
        self._str_params = None
        self.params = params
        self.update_params()

        # prepare configs
        if len(config) > 1:
            raise ValueError(f"It is allowed to specify only one config object, but passed {len(config)}")
        elif len(config) == 1:
            if not isinstance(config[0], self.config):
                raise TypeError(
                    f'It is expected to see config of the "query.config" type, but got "{type(config[0])}"'
                )
            self.config = config[0]
        else:
            self.config = self.config()

        self.graph_info = query_inspector.get_query_info(self.query_path, self.query_name)

    def __call__(self, *ticks, **pins):
        for key, value in pins.items():
            if not isinstance(value, Source):
                raise ValueError(f'Input "{key}" pin does not support "{type(value)}" type')

        if len(pins) == 0 and len(ticks) == 1:
            if len(self.graph_info.nested_inputs) != 1:
                raise Exception(
                    f'It is expected the query "{self.query_path}" to have one input, but it'
                    f" has {len(self.graph_info.nested_inputs)}"
                )
            pins[self.graph_info.nested_inputs[0].NESTED_INPUT] = ticks[0]
        elif len(pins) > 0 and len(ticks) == 0:
            pass
        elif len(pins) == 0 and len(ticks) == 0:
            # it is the valid case, when query has no input pins
            pass
        else:
            raise ValueError("It is allowed to pass only one non-specified input")

        outputs = self._outputs()
        outputs.query = self
        outputs.in_sources = pins

        return outputs

    class _outputs(object):
        def __getitem__(self, key):
            output_pins = []
            if type(key) is tuple:
                output_pins = list(key)
            elif isinstance(key, str):
                output_pins = [key]
            elif key is None:
                # No output
                pass
            else:
                raise ValueError(f'Output pins can not be of "{type(key)}" type')

            return onetick.py.functions.apply_query(
                self.query, in_sources=self.in_sources, output_pins=output_pins, **self.query.params
            )

    def to_eval_string(self):
        """Converts query object to `eval` string"""
        res = '"' + self.path + '"'
        if self.params:
            res += f', "{self._params_to_str(self.params, with_expr=True)}"'
        return "eval(" + res + ")"

    def update_params(self, **new_params):
        if new_params:
            self.params.update(new_params)

    @property
    def str_params(self):
        """Query parameters converted to string"""
        if self._str_params is None:
            self._str_params = self._params_to_str(self.params)
        return self._str_params

    @staticmethod
    def _params_to_str(params, *, with_expr=False):
        """
        converts param to str

        Parameters
        ----------
        params: dict
            Parameters as dict(name=value)
        with_expr:
            If true return all expression in expr() function

        Returns
        -------
        result: str
            string representation of parameters ready for query evaluation
        """
        def to_str(v):
            if isinstance(v, list):
                return "\\,".join(map(to_str, v))
            else:
                if with_expr:
                    is_dt = ott.is_time_type(v)
                    if is_dt:
                        v = ott.value2str(v)
                    result = query._escape_quotes_in_eval(v)
                    if isinstance(v, _Operation) and getattr(v, "name", None) != "_SYMBOL_NAME" or is_dt:
                        result = f"expr({result})"
                else:
                    result = query._escape_characters_in_query_param(str(v))
                return result

        return ",".join(key + "=" + to_str(value) for key, value in params.items())

    @staticmethod
    def _escape_quotes_in_eval(v):
        return str(v).translate(str.maketrans({"'": r"\'", '"': r'\"'}))

    @staticmethod
    def _escape_characters_in_query_param(result):
        # 0 - no need to add backslash, 1 - need to add
        char_map = [0] * len(result)

        # put 1 between two quotes symbols
        open_char = None
        last_inx = 0
        for inx, c in enumerate(result):
            if open_char == c:
                open_char = None
                continue
            if not open_char and c == "'" or c == '"':
                open_char = c
                last_inx = inx + 1
                continue
            if open_char:
                char_map[inx] = 1

        # clean open tail if necessary
        if open_char:
            char_map[last_inx:] = [0] * (len(result) - last_inx)

        # apply mapping
        res = []
        last_esc = False  # do not add esc if the previous one is already esc
        n_brackets_in_expr_block = 0  # do not escape in expr(...)
        for inx, c in enumerate(result):
            if c == "(":
                if n_brackets_in_expr_block:
                    n_brackets_in_expr_block += 1
                elif result[inx - 4:inx] == "expr":
                    n_brackets_in_expr_block = 1
            if c == ")" and n_brackets_in_expr_block:
                n_brackets_in_expr_block -= 1
            if c in _QUERY_PARAM_SPECIAL_CHARACTERS and char_map[inx] == 0:
                if not last_esc and not n_brackets_in_expr_block:
                    c = "\\" + c
            last_esc = c == "\\"
            res.append(c)

        return "".join(res)
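
# A sketch of how query parameters are stringified by `_params_to_str` above.  The exact
# output is inferred from the code, not from a verified run, so treat it as approximate:
#
#     query._params_to_str({'A': 1, 'B': 'x,y'})
#     # -> 'A=1,B=x\,y'   (special characters "=" and "," in values are backslash-escaped,
#     #                    except inside quoted sections and expr(...) blocks)
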
class Orders(Source): def __init__(self, db="S_ORDERS_FIX", symbol=utils.adaptive, start=utils.adaptive, end=utils.adaptive, **kwargs): super().__init__( _symbols=symbol, _start=start, _end=end, _base_ep_func=lambda: self.base_ep(db), **kwargs ) self.schema['ID'] = str self.schema['BUY_FLAG'] = int self.schema['SIDE'] = str self.schema['STATE'] = str self.schema['ORDTYPE'] = str self.schema['PRICE'] = float self.schema['PRICE_FILLED'] = float self.schema['QTY'] = int self.schema['QTY_FILLED'] = int def base_ep(self, db): db = str(db) src = Source(otq.Passthrough(fields="SYMBOL_NAME,TICK_TYPE", drop_fields=True)) src.tick_type(db + "::ORDER") return src _db_doc = param_doc( name='db', desc=""" Name(s) of the database or the database object(s). """, str_annotation='str, list of str, :class:`otp.DB <onetick.py.DB>`', default=None, str_default='None', ) _symbol_doc = param_doc( name='symbol', desc=""" Symbol(s) from which data should be taken. """, str_annotation='str, list of str, :class:`Source`, :class:`query`, :py:func:`eval query <onetick.py.eval>`', default=utils.adaptive, str_default=' :py:class:`onetick.py.adaptive`', ) _symbols_doc = param_doc( name='symbols', desc=""" Symbol(s) from which data should be taken. Alias for ``symbol`` parameter. Will take precedence over it. """, str_annotation='str, list of str, :class:`Source`, :class:`query`, :py:func:`eval query <onetick.py.eval>`', default=None, ) _tick_type_doc = param_doc( name='tick_type', desc=""" Tick type of the data. If not specified, all ticks from `db` will be taken. If ticks can't be found or there are many databases specified in `db` then default is "TRD". """, str_annotation='str, list of str', default=utils.adaptive, str_default=' :py:class:`onetick.py.adaptive`', ) _start_doc = param_doc( name='start', desc=""" Start of the interval from which the data should be taken. Default is :py:class:`onetick.py.adaptive`, making the final query deduce the time limits from the rest of the graph. """, str_annotation=( ':py:class:`datetime.datetime`, :py:class:`otp.datetime <onetick.py.datetime>`,' ' :py:class:`onetick.py.adaptive`' ), default=utils.adaptive, str_default=' :py:class:`onetick.py.adaptive`', ) _end_doc = param_doc( name='end', desc=""" End of the interval from which the data should be taken. Default is :py:class:`onetick.py.adaptive`, making the final query deduce the time limits from the rest of the graph. """, str_annotation=( ':py:class:`datetime.datetime`, :py:class:`otp.datetime <onetick.py.datetime>`,' ' :py:class:`onetick.py.adaptive`' ), default=utils.adaptive, str_default=' :py:class:`onetick.py.adaptive`', ) _date_doc = param_doc( name='date', desc=""" Allows to specify a whole day instead of passing explicitly ``start`` and ``end`` parameters. If it is set along with the ``start`` and ``end`` parameters then last two are ignored. """, str_annotation=":class:`datetime.datetime`, :class:`otp.datetime <onetick.py.datetime>`", default=None, ) _schema_policy_doc = param_doc( name='schema_policy', desc=""" Schema deduction policy: - 'manual' The resulting schema is a combination of ``desired_schema`` and database schema. Compatibility with database schema will not be checked. - 'manual_strict' The resulting schema will be exactly ``desired_schema``. Compatibility with database schema will not be checked. - 'tolerant' The resulting schema is a combination of ``desired_schema`` and database schema. 
If the database schema can be deduced, it's checked to be type-compatible with a ``desired_schema``, and ValueError is raised if checks are failed. Also, with this policy database is scanned 5 days back to find the schema. It is useful when database is misconfigured or in case of holidays. - 'tolerant_strict' The resulting schema will be ``desired_schema`` if it's not empty. Otherwise, database schema is used. If the database schema can be deduced, it's checked if it lacks fields from the ``desired_schema`` and it's checked to be type-compatible with a ``desired_schema`` and ValueError is raised if checks are failed. Also, with this policy database is scanned 5 days back to find the schema. It is useful when database is misconfigured or in case of holidays. - 'fail' The same as 'tolerant', but if the database schema can't be deduced, raises an Exception. - 'fail_strict' The same as 'tolerant_strict', but if the database schema can't be deduced, raises an Exception. """, str_annotation="'tolerant', 'tolerant_strict', 'fail', 'fail_strict', 'manual', 'manual_strict'", default=None, ) _guess_schema_doc = param_doc( name='guess_schema', desc=""" .. deprecated:: 1.3.16 Use ``schema_policy`` parameter instead. """, annotation=bool, default=None, ) _identify_input_ts_doc = param_doc( name='identify_input_ts', desc=""" If set to False, the fields SYMBOL_NAME and TICK_TYPE are not appended to the output ticks. """, annotation=bool, default=False, ) _back_to_first_tick_doc = param_doc( name='back_to_first_tick', desc=""" Determines how far back to go looking for the latest tick before ``start`` time. If one is found, it is inserted into the output time series with the timestamp set to ``start`` time. Note: it will be rounded to int, so otp.Millis(999) will be 0 seconds. """, str_annotation=('int, :ref:`offset <datetime_offsets>`, ' ':class:`otp.expr <onetick.py.expr>`, ' ':py:class:`~onetick.py.Operation`'), default=0, ) _keep_first_tick_timestamp_doc = param_doc( name='keep_first_tick_timestamp', desc=""" If set, new field with this name will be added to source. This field contains original timestamp of the tick that was taken from before the start time of the query. For all other ticks value in this field will be equal to the value of Time field. This parameter is ignored if ``back_to_first_tick`` is not set. """, annotation=str, default=None, ) _presort_doc = param_doc( name='presort', desc=""" Add the presort EP in case of bound symbols. Applicable only when ``symbols`` is not None. By default, it is set to True if ``symbols`` are set and to False otherwise. """, annotation=bool, default=utils.adaptive, str_default=' :py:class:`onetick.py.adaptive`', ) _concurrency_doc = param_doc( name='concurrency', desc=""" Specifies number of CPU cores to utilize for the ``presort`` By default, the value from otp.config.default_concurrency is used. """, annotation=int, default=None, ) _batch_size_doc = param_doc( name='batch_size', desc=""" Specifies the query batch size for the ``presort``. By default, the value from otp.config.default_batch_size is used. """, annotation=int, default=None, ) _desired_schema_doc = param_doc( name='desired_schema', desc=""" List of <column name> -> <column type> pairs that the source is expected to have. If the type is irrelevant, provide None as the type in question. 
""", str_annotation='type[str]', kind=inspect.Parameter.VAR_KEYWORD, ) _max_back_ticks_to_prepend_doc = param_doc( name='max_back_ticks_to_prepend', desc=""" When the ``back_to_first_tick`` interval is specified, this parameter determines the maximum number of the most recent ticks before start_time that will be prepended to the output time series. Their timestamp will be changed to start_time. """, annotation=int, default=1, ) _where_clause_for_back_ticks_doc = param_doc( name='where_clause_for_back_ticks', desc=""" A logical expression that is computed only for the ticks encountered when a query goes back from the start time, in search of the ticks to prepend. If it returns false, a tick is ignored. """, annotation=Raw, default=None, ) DATA_SOURCE_DOC_PARAMS = [ _db_doc, _symbol_doc, _tick_type_doc, _start_doc, _end_doc, _date_doc, _schema_policy_doc, _guess_schema_doc, _identify_input_ts_doc, _back_to_first_tick_doc, _keep_first_tick_timestamp_doc, _max_back_ticks_to_prepend_doc, _where_clause_for_back_ticks_doc, _symbols_doc, _presort_doc, _batch_size_doc, _concurrency_doc, _desired_schema_doc, ]
[docs]class DataSource(Source): POLICY_MANUAL = "manual" POLICY_MANUAL_STRICT = "manual_strict" POLICY_TOLERANT = "tolerant" POLICY_TOLERANT_STRICT = "tolerant_strict" POLICY_FAIL = "fail" POLICY_FAIL_STRICT = "fail_strict" _VALID_POLICIES = frozenset([POLICY_MANUAL, POLICY_MANUAL_STRICT, POLICY_TOLERANT, POLICY_TOLERANT_STRICT, POLICY_FAIL, POLICY_FAIL_STRICT]) _PROPERTIES = Source._PROPERTIES + ["_p_db", "_p_strict", "_p_schema"] def __get_schema(self, db, start, schema_policy): schema = {} if start is utils.adaptive: start = None # means that use the last date with data if isinstance(db, list): ''' This case of a merge, since we need to get combined schema across different tick types and dbs ''' for t_db in db: _db = t_db.split(':')[0] _tt = t_db.split(':')[-1] db_obj = onetick.py.db._inspection.DB(_db) if schema_policy == self.POLICY_TOLERANT and start: # repeating the same logic as in db_obj.last_date start = db_obj.last_not_empty_date(start, days_back=5, tick_type=_tt) schema.update(db_obj.schema(date=start, tick_type=_tt)) if db is None or isinstance(db, _SymbolParamColumn): ''' In this case we can't get schema, because db is calculated dynamically. Set to empty to indicate that in this case we expect the manually set schema. ''' schema = {} return schema def __prepare_schema(self, db, start, end, schema_policy, guess_schema, desired_schema): if guess_schema is not None: warnings.warn( "guess_schema flag is deprecated; use schema_policy argument instead", DeprecationWarning, ) if schema_policy is not None: raise ValueError("guess_schema and schema_policy cannot be set at the same time") if guess_schema: schema_policy = self.POLICY_FAIL else: schema_policy = self.POLICY_MANUAL if schema_policy is None: schema_policy = self.POLICY_TOLERANT if schema_policy not in self._VALID_POLICIES: raise ValueError(f"Invalid schema_policy; allowed values are: {self._VALID_POLICIES}") actual_schema = {} if schema_policy not in (self.POLICY_MANUAL, self.POLICY_MANUAL_STRICT): actual_schema = self.__get_schema(db, start, schema_policy) dbs = ', '.join(db if isinstance(db, list) else []) if len(actual_schema) == 0: if schema_policy in (self.POLICY_FAIL, self.POLICY_FAIL_STRICT): raise Exception(f'No ticks found in database(-s) {dbs}') # lets try to use at least something return desired_schema.copy() for k, v in desired_schema.items(): field_type = actual_schema.get(k, None) incompatible_types = False if field_type is None: if self._p_strict or schema_policy in (self.POLICY_TOLERANT, self.POLICY_FAIL): raise ValueError(f"Database(-s) {dbs} schema has no {k} field") elif issubclass(field_type, str) and issubclass(v, str): field_length = ott.string.DEFAULT_LENGTH if issubclass(field_type, ott.string): field_length = field_type.length v_length = ott.string.DEFAULT_LENGTH if issubclass(v, ott.string): v_length = v.length if issubclass(field_type, ott.varstring): if not issubclass(v, ott.varstring): incompatible_types = True elif not issubclass(v, ott.varstring) and v_length < field_length: incompatible_types = True elif not issubclass(field_type, v): incompatible_types = True if incompatible_types: error_message = f"Database(-s) {dbs} schema field {k} has type {field_type}, but {v} was requested" if field_type in (str, ott.string) or v in (str, ott.string): error_message = f"{error_message}. 
Note that `str` and `otp.string` lengths are 64" raise ValueError(error_message) if not self._p_strict: desired_schema.update(actual_schema) table_schema = desired_schema.copy() if not self._p_strict: # in this case we table only the user-specified fields that are not present in the database schema table_schema = { k: v for k, v in table_schema.items() if k not in actual_schema } return table_schema def __prepare_dates(self, date): if isinstance(date, ott.datetime) or isinstance(date, ott.date): start = date.start end = date.end if isinstance(date, dt.datetime) or isinstance(date, dt.date): start = dt.datetime(date.year, date.month, date.day) end = start + dt.timedelta(days=1, milliseconds=-1) return start, end def __prepare_db_tick_type(self, db, tick_type, symbol, start, end): if isinstance(db, list): ''' If everything is correct then this branch should leave the `db` var as a list of databases with tick types and the `tick_type` var as None. Valid cases: - Fully defined case. The `db` parameter has a list of databases where every database has a tick type, while the `tick_type` parameter has the default value or None (for backward compatibility) - Partially defined case. The `db` parameter has a list of databases but not every database has a tick type, and meanwhile the `tick_type` is set to a not-None value. In that case databases without a tick type are extended with the tick type from the `tick_type` parameter - Undefined case. The `db` parameter has a list of databases where no database has a tick type, and the `tick_type` is set to a not-None value. In that case every database is extended with the tick type from the `tick_type`. ''' def db_converter(_db): if isinstance(_db, DB): return _db.name else: return _db db = [db_converter(_db) for _db in db] res = all(('::' in _db and _db[-1] != ':' for _db in db)) if res: if tick_type is utils.adaptive or tick_type is None: tick_type = None # tick type is specified for all databases else: raise Exception('The `tick_type` is set as a parameter ' 'and also as a part of the `db` parameter ' 'for every database') else: dbs_without_tt = [_db.split(':')[0] for _db in db if '::' not in _db or _db[-1] == ':'] if tick_type is utils.adaptive: tick_type = 'TRD' # default one for backward compatibility and the testing use case if tick_type is None: raise Exception('The tick type is not set for databases: ' + ', '.join(dbs_without_tt)) else: # extend databases with missing tick types from the `tick_type` parameter dbs_with_tt = [_db for _db in db if '::' in _db and _db[-1] != ':'] db = dbs_with_tt + [_db + '::' + tick_type for _db in dbs_without_tt] tick_type = None if isinstance(db, (DB, inspect_DB)): db = db.name # ... and we go to the next branch if isinstance(db, str): ''' The resulting `db` var contains a list with a single string value that has the `db` concatenated with the `tick_type`. 
''' if '::' in db: if tick_type is utils.adaptive or tick_type is None: tick_type = db.split(':')[-1] db = db.split('::')[0] else: raise Exception('The `tick_type` is set as a parameter ' 'and also as a part of the `db` parameter') else: if tick_type is utils.adaptive or tick_type is None: db_obj = onetick.py.db._inspection.DB(db) # try to find at least one common tick type # through all days tick_types = None if start is utils.adaptive: start = db_obj.last_date end = db_obj.last_date if start and end: # could be None if there is no data t_start = start while t_start <= end: t_tts = set(db_obj.tick_types(t_start)) t_start += dt.timedelta(days=1) if len(t_tts) == 0: continue if tick_types is None: tick_types = t_tts else: tick_types &= t_tts if len(tick_types) == 0: raise Exception(f'It seems that there are no common ' f'tick types for dates from {start} ' f'to {end}. Please specify a tick ' 'type') if tick_types is None: if tick_type is utils.adaptive: tick_types = ['TRD'] # the default one else: raise Exception(f'Could not find any data from {start} ' f'to {end}. Please check that the tick type, ' 'database and date range are correct.') if len(tick_types) != 1: raise Exception('The tick type is not specified, found ' 'multiple tick types in the database: ' + ', '.join(tick_types)) tick_type = tick_types.pop() if not isinstance(tick_type, str) and isinstance(tick_type, Iterable): db = [f'{db}::{tt}' for tt in tick_type] else: db = [db + '::' + tick_type] tick_type = None if isinstance(db, _SymbolParamColumn): ''' Do nothing, because we don't know whether db will come with the tick type or not. The only thing we know for sure is that tick_type cannot be utils.adaptive ''' if tick_type is utils.adaptive: # TODO: need to test this case raise Exception('The `db` is set to the symbol param, in that case ' 'the `tick_type` should be set explicitly to some value ' 'or to None') if db is None: ''' This case means that the database comes with the symbol name, so the tick type should be defined ''' if tick_type is utils.adaptive or tick_type is None: raise Exception('The `db` is not specified, which means the database is ' 'expected to be defined with the symbol name. ' 'In that case the `tick_type` should be defined.') if not isinstance(tick_type, str) and isinstance(tick_type, Iterable): tick_type = '+'.join(tick_type) return db, tick_type @docstring(parameters=DATA_SOURCE_DOC_PARAMS, add_self=True) def __init__( self, db=None, symbol=utils.adaptive, tick_type=utils.adaptive, start=utils.adaptive, end=utils.adaptive, date=None, schema_policy=None, guess_schema=None, identify_input_ts=False, back_to_first_tick=0, keep_first_tick_timestamp=None, max_back_ticks_to_prepend=1, where_clause_for_back_ticks=None, symbols=None, presort=utils.adaptive, batch_size=None, concurrency=None, **desired_schema, ): """ Construct a source providing data from a given ``db``. 
Examples --------- Symbol can be a collection >>> # OTdirective: snippet-name:fetch data.simple; >>> data = otp.DataSource(db='SOME_DB', tick_type='TT', symbols=['S1', 'S2']) >>> otp.run(data) Time X 0 2003-12-01 00:00:00.000 1 1 2003-12-01 00:00:00.000 -3 2 2003-12-01 00:00:00.001 2 3 2003-12-01 00:00:00.001 -2 4 2003-12-01 00:00:00.002 3 5 2003-12-01 00:00:00.002 -1 Source also can be passed as symbols, in such case magic named column SYMBOL_NAME will be transform to symbol and all other columns will be symbol parameters >>> # OTdirective: snippet-name:fetch data.symbols as a source; >>> symbols = otp.Ticks(SYMBOL_NAME=['S1', 'S2']) >>> data = otp.DataSource(db='SOME_DB', symbols=symbols, tick_type='TT') >>> otp.run(data) Time X 0 2003-12-01 00:00:00.000 1 1 2003-12-01 00:00:00.000 -3 2 2003-12-01 00:00:00.001 2 3 2003-12-01 00:00:00.001 -2 4 2003-12-01 00:00:00.002 3 5 2003-12-01 00:00:00.002 -1 Default schema policy is **tolerant**. >>> data = otp.DataSource(db='NYSE_TAQ', tick_type='TRD', symbols='AAPL', PRICE=float, date=otp.dt(2022, 3, 1)) >>> data.schema {'PRICE': <class 'float'>, 'SIZE': <class 'int'>} >>> data = otp.DataSource(db='NYSE_TAQ', tick_type='TRD', symbols='AAPL', PRICE=int, date=otp.dt(2022, 3, 1)) Traceback (most recent call last): ... ValueError: Database(-s) NYSE_TAQ::TRD schema field PRICE has type <class 'float'>, but <class 'int'> was requested Schema policy **manual** uses exactly ``desired_schema``: >>> data = otp.DataSource(db='NYSE_TAQ', tick_type='TRD', symbols='AAPL', PRICE=float, ... date=otp.dt(2022, 3, 1), schema_policy='manual') >>> data.schema {'PRICE': <class 'float'>} Schema policy **fail** raises an exception if the schema cannot be deduced: >>> data = otp.DataSource(db='NYSE_TAQ', tick_type='TRD', symbols='AAPL', date=otp.dt(2021, 3, 1), ... schema_policy='fail') Traceback (most recent call last): ... Exception: No ticks found in database(-s) NYSE_TAQ::TRD ``back_to_first_tick`` sets how far back to go looking for the latest tick before ``start`` time: >>> data = otp.DataSource(db='NYSE_TAQ', tick_type='TRD', symbols='AAPL', date=otp.dt(2022, 3, 2), ... back_to_first_tick=otp.Day(1)) >>> otp.run(data) Time PRICE SIZE 0 2022-03-02 00:00:00.000 1.4 50 1 2022-03-02 00:00:00.000 1.0 100 2 2022-03-02 00:00:00.001 1.1 101 3 2022-03-02 00:00:00.002 1.2 102 ``keep_first_tick_timestamp`` allows to show the original timestamp of the tick that was taken from before the start time of the query: >>> data = otp.DataSource(db='NYSE_TAQ', tick_type='TRD', symbols='AAPL', date=otp.dt(2022, 3, 2), ... back_to_first_tick=otp.Day(1), keep_first_tick_timestamp='ORIGIN_TIMESTAMP') >>> otp.run(data) Time PRICE SIZE ORIGIN_TIMESTAMP 0 2022-03-02 00:00:00.000 1.4 50 2022-03-01 00:00:00.002 1 2022-03-02 00:00:00.000 1.0 100 2022-03-02 00:00:00.000 2 2022-03-02 00:00:00.001 1.1 101 2022-03-02 00:00:00.001 3 2022-03-02 00:00:00.002 1.2 102 2022-03-02 00:00:00.002 ``max_back_ticks_to_prepend`` is used with ``back_to_first_tick`` if more than 1 ticks before start time should be retrieved: >>> data = otp.DataSource(db='NYSE_TAQ', tick_type='TRD', symbols='AAPL', date=otp.dt(2022, 3, 2), ... max_back_ticks_to_prepend=2, back_to_first_tick=otp.Day(1), ... 
keep_first_tick_timestamp='ORIGIN_TIMESTAMP') >>> otp.run(data) Time PRICE SIZE ORIGIN_TIMESTAMP 0 2022-03-02 00:00:00.000 1.4 10 2022-03-01 00:00:00.001 1 2022-03-02 00:00:00.000 1.4 50 2022-03-01 00:00:00.002 2 2022-03-02 00:00:00.000 1.0 100 2022-03-02 00:00:00.000 3 2022-03-02 00:00:00.001 1.1 101 2022-03-02 00:00:00.001 4 2022-03-02 00:00:00.002 1.2 102 2022-03-02 00:00:00.002 ``where_clause_for_back_ticks`` is used to filter out ticks before the start time: >>> data = otp.DataSource(db='NYSE_TAQ', tick_type='TRD', symbols='AAPL', date=otp.dt(2022, 3, 2), ... where_clause_for_back_ticks=otp.raw('SIZE>=50', dtype=bool), ... back_to_first_tick=otp.Day(1), max_back_ticks_to_prepend=2, ... keep_first_tick_timestamp='ORIGIN_TIMESTAMP') # doctest: +SKIP >>> otp.run(data) # doctest: +SKIP Time PRICE SIZE ORIGIN_TIMESTAMP 0 2022-03-02 00:00:00.000 1.3 100 2022-03-01 00:00:00.000 1 2022-03-02 00:00:00.000 1.4 50 2022-03-01 00:00:00.002 2 2022-03-02 00:00:00.000 1.0 100 2022-03-02 00:00:00.000 3 2022-03-02 00:00:00.001 1.1 101 2022-03-02 00:00:00.001 4 2022-03-02 00:00:00.002 1.2 102 2022-03-02 00:00:00.002 """ if self._try_default_constructor(**desired_schema): return # for cases when we want to explicitly convert into string, # it might be symbol param or join_with_query parameter if isinstance(tick_type, _ParamColumn): tick_type = str(tick_type)[1:-1] if date: # TODO: write a warning in that case start, end = self.__prepare_dates(date) db, tick_type = self.__prepare_db_tick_type(db, tick_type, symbol, start, end) self._p_db = db self._p_strict = schema_policy in (self.POLICY_FAIL_STRICT, self.POLICY_TOLERANT_STRICT, self.POLICY_MANUAL_STRICT) self._p_schema = self.__prepare_schema(db, # tick type is embedded into the db start, end, schema_policy, guess_schema, desired_schema) if symbols is not None: if symbol is utils.adaptive or symbol is None: symbol = symbols else: # TODO: test it raise Exception('You have set the `symbol` and `symbols` parameters' 'together, it is not allowed. Please, clarify parameters') if isinstance(symbol, Symbols) and symbol._p_db is None: symbol = Symbols.duplicate(symbol, db=db) if identify_input_ts: if "SYMBOL_NAME" in desired_schema: raise Exception() # TODO: think about how user could workaround it desired_schema["SYMBOL_NAME"] = str if "TICK_TYPE" in desired_schema: raise Exception() desired_schema["TICK_TYPE"] = str # unobvious way to convert otp.Minute/Hour/... 
to number of seconds if type(back_to_first_tick).__name__ == '_DatePartCls': back_to_first_tick = int((ott.dt(0) + back_to_first_tick).timestamp()) if isinstance(back_to_first_tick, _Operation): back_to_first_tick = otp.expr(back_to_first_tick) if back_to_first_tick != 0 and keep_first_tick_timestamp: desired_schema[keep_first_tick_timestamp] = ott.nsectime if max_back_ticks_to_prepend < 1: raise ValueError(f'`max_back_ticks_to_prepend` must be at least 1 ' f'but {max_back_ticks_to_prepend} was passed') if where_clause_for_back_ticks is not None: if not isinstance(where_clause_for_back_ticks, Raw): raise ValueError(f'Currently only otp.raw is supported for `where_clause_for_back_ticks` ' f'but {type(where_clause_for_back_ticks)} was passed') if where_clause_for_back_ticks.dtype is not bool: raise ValueError(f'Only bool dtype for otp.raw in `where_clause_for_back_ticks` is supported ' f'but {where_clause_for_back_ticks.dtype} was passed') where_clause_for_back_ticks = str(where_clause_for_back_ticks) if ( isinstance(symbol, Source) or hasattr(symbol, "__iter__") and not isinstance(symbol, dict) and not isinstance(symbol, str) or isinstance(symbol, query) or isinstance(symbol, _QueryEvalWrapper) ): super().__init__( _start=start, _end=end, _base_ep_func=lambda: self._base_ep_for_cross_symbol( db, symbol, tick_type, identify_input_ts=identify_input_ts, back_to_first_tick=back_to_first_tick, keep_first_tick_timestamp=keep_first_tick_timestamp, presort=presort, batch_size=batch_size, concurrency=concurrency, max_back_ticks_to_prepend=max_back_ticks_to_prepend, where_clause_for_back_ticks=where_clause_for_back_ticks, ), **desired_schema ) else: super().__init__( _symbols=symbol, _start=start, _end=end, _base_ep_func=lambda: self.base_ep( db, tick_type, identify_input_ts=identify_input_ts, back_to_first_tick=back_to_first_tick, keep_first_tick_timestamp=keep_first_tick_timestamp, max_back_ticks_to_prepend=max_back_ticks_to_prepend, where_clause_for_back_ticks=where_clause_for_back_ticks, ), **desired_schema ) @property def db(self): return self._p_db @staticmethod def _create_source(passthrough_ep, back_to_first_tick=0, keep_first_tick_timestamp=None): """Create graph that save original timestamp of first tick if needed""" if back_to_first_tick != 0 and keep_first_tick_timestamp: src = Source(otq.Passthrough()) src.sink(otq.AddField(field=keep_first_tick_timestamp, value='TIMESTAMP')) src.sink(passthrough_ep) return src return Source(passthrough_ep) def _table_schema(self, src): return src.table(**self._p_schema, strict=self._p_strict) def base_ep( self, db, tick_type, identify_input_ts, back_to_first_tick=0, keep_first_tick_timestamp=None, max_back_ticks_to_prepend=1, where_clause_for_back_ticks=None, ): if db is not None: if isinstance(db, list): str_db = "+".join(db) else: str_db = str(db) if tick_type: if isinstance(db, _SymbolParamColumn): str_db = f"expr({str_db} + '::{tick_type}')" # TODO: test else: if "::" not in str_db: str_db += "::" + tick_type else: if isinstance(db, _SymbolParamColumn): str_db = f"expr({str_db})" # TODO: test else: str_db = tick_type params = dict( go_back_to_first_tick=back_to_first_tick, max_back_ticks_to_prepend=max_back_ticks_to_prepend, ) if where_clause_for_back_ticks is not None: params['where_clause_for_back_ticks'] = where_clause_for_back_ticks if isinstance(db, list) or isinstance(db, _SymbolParamColumn): src = self._create_source(otq.Passthrough(**params), back_to_first_tick=back_to_first_tick, keep_first_tick_timestamp=keep_first_tick_timestamp) 
src.sink(otq.Merge(identify_input_ts=identify_input_ts)) else: if identify_input_ts: params["fields"] = "SYMBOL_NAME,TICK_TYPE" params["drop_fields"] = True src = self._create_source(otq.Passthrough(**params), back_to_first_tick=back_to_first_tick, keep_first_tick_timestamp=keep_first_tick_timestamp) src.tick_type(str_db) src = self._table_schema(src) return src def _base_ep_for_cross_symbol( self, db, symbol, tick_type, identify_input_ts, back_to_first_tick=0, keep_first_tick_timestamp=None, presort=utils.adaptive, batch_size=None, concurrency=None, max_back_ticks_to_prepend=1, where_clause_for_back_ticks=None, ): tmp_otq = TmpOtq() if isinstance(symbol, _QueryEvalWrapper): symbol = symbol.to_eval_string(tmp_otq=tmp_otq) elif isinstance(symbol, query): symbol = symbol.to_eval_string() elif isinstance(symbol, Source): symbol = self._convert_symbol_to_string(symbol, tmp_otq) if db is not None: if isinstance(db, list): tick_type = "+".join(db) else: tick_type = f"{db}::{tick_type}" kwargs = dict( go_back_to_first_tick=back_to_first_tick, max_back_ticks_to_prepend=max_back_ticks_to_prepend, ) if where_clause_for_back_ticks is not None: kwargs['where_clause_for_back_ticks'] = where_clause_for_back_ticks src = self._create_source(otq.Passthrough(**kwargs), back_to_first_tick=back_to_first_tick, keep_first_tick_timestamp=keep_first_tick_timestamp) if presort is utils.adaptive: presort = True if presort: if batch_size is None: batch_size = otp.config.default_batch_size if concurrency is None: concurrency = ( otp.config.default_concurrency if otp.config.default_concurrency is not None # otq.Presort does not support None else '' ) src.sink( otq.Presort(batch_size=batch_size, max_concurrency=concurrency).symbols(symbol).tick_type(tick_type) ) src.sink(otq.Merge(identify_input_ts=identify_input_ts)) else: src.sink( otq.Merge(identify_input_ts=identify_input_ts).symbols(symbol).tick_type(tick_type) ) src._tmp_otq.merge(tmp_otq) src = self._table_schema(src) return src
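# Illustrative sketch (not used by the library): the ``presort``, ``batch_size`` and
# ``concurrency`` parameters handled by _base_ep_for_cross_symbol above apply only when
# bound ``symbols`` are passed. The database and tick type below are hypothetical.
def _example_data_source_with_presort():
    symbols = otp.Ticks(SYMBOL_NAME=['S1', 'S2'])
    data = otp.DataSource(
        db='SOME_DB',       # hypothetical database
        tick_type='TT',     # hypothetical tick type
        symbols=symbols,    # bound symbols -> PRESORT EP is added by default
        presort=True,
        batch_size=100,     # overrides otp.config.default_batch_size
        concurrency=8,      # overrides otp.config.default_concurrency
    )
    return otp.run(data)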
Custom = DataSource  # for backward compatibility; previously we had only Custom
[docs]class Symbols(Source): """ Construct a source that returns ticks with information about symbols in a database. The SYMBOL_NAME field is populated with symbol names. The TICK_TYPE field contains the corresponding tick type (enabled by the ``show_tick_type`` parameter). Parameters ---------- db: str Name of the database in which to search for symbols tick_type: str Tick type to use. Default is `ANY` start, end: :py:class:`datetime.datetime`, :py:class:`otp.datetime <onetick.py.datetime>`, \ :py:class:`onetick.py.adaptive` Time interval from which the data should be taken. date: :py:class:`datetime.date` Alternative way of setting the time interval instead of start/end times keep_db: bool Flag that indicates whether symbols should have a db prefix. pattern: str SQL-syntax pattern for symbols. Default is '%' for_tick_type: str Fetch only symbols belonging to this tick type, if specified. show_tick_type: bool Add the TICK_TYPE column with information about the tick type symbology: str The destination symbology for a symbol name translation. Translation is performed if the destination symbology is not empty and is different from that of the queried database. show_original_symbols: bool Switches original symbol name propagation as a tick field ORIGINAL_SYMBOL_NAME if symbol name translation is performed (if `symbology` is set). Note that if this parameter is set to True, database symbols with missing translations are also propagated. Note ---- Additional fields that can be added to Symbols will be converted to symbol parameters. See also -------- | :ref:`Symbols guide <Symbols>` | **FIND_DB_SYMBOLS** OneTick event processor Examples -------- This class can be used to get a list of all symbols in the database: >>> symbols = otp.Symbols('NYSE_TAQ', date=otp.dt(2022, 3, 1)) >>> otp.run(symbols) Time SYMBOL_NAME 0 2022-03-01 AAP 1 2022-03-01 AAPL This class can also be used to specify symbols for the main query: >>> symbols = otp.Symbols('NYSE_TAQ', date=otp.dt(2022, 3, 1)) >>> data = otp.DataSource('NYSE_TAQ', tick_type='TRD', date=otp.dt(2022, 3, 1)) >>> result = otp.run(data, symbols=symbols) >>> result['AAPL'] Time PRICE SIZE 0 2022-03-01 00:00:00.000 1.3 100 1 2022-03-01 00:00:00.001 1.4 10 2 2022-03-01 00:00:00.002 1.4 50 >>> result['AAP'] Time PRICE 0 2022-03-01 00:00:00.000 45.37 1 2022-03-01 00:00:00.001 45.41 Additional fields of ``otp.Symbols`` can be used in the main query as symbol parameters: >>> symbols = otp.Symbols('SOME_DB', show_tick_type=True, keep_db=True) >>> symbols['PARAM'] = symbols['SYMBOL_NAME'] + '__' + symbols['TICK_TYPE'] >>> data = otp.DataSource('SOME_DB') >>> data['S_PARAM'] = data.Symbol.PARAM >>> data = otp.merge([data], symbols=symbols) >>> otp.run(data) Time X S_PARAM 0 2003-12-01 00:00:00.000 1 SOME_DB::S1__TT 1 2003-12-01 00:00:00.000 -3 SOME_DB::S2__TT 2 2003-12-01 00:00:00.001 2 SOME_DB::S1__TT 3 2003-12-01 00:00:00.001 -2 SOME_DB::S2__TT 4 2003-12-01 00:00:00.002 3 SOME_DB::S1__TT 5 2003-12-01 00:00:00.002 -1 SOME_DB::S2__TT """ _PROPERTIES = Source._PROPERTIES + ["_p_db", "_p_pattern", "_p_start", "_p_end", "_p_for_tick_type", "_p_keep_db"] def __init__( self, db=None, tick_type="ANY", start=utils.adaptive, end=utils.adaptive, date=None, find_params=None, keep_db=False, pattern='%', for_tick_type=None, show_tick_type=False, symbology='', show_original_symbols=False, **kwargs ): if self._try_default_constructor(**kwargs): return self._p_db = db self._p_pattern = pattern self._p_start = start self._p_end = end self._p_keep_db = keep_db self._p_for_tick_type = for_tick_type if date: if 
isinstance(date, ott.datetime) or isinstance(date, ott.date): start = date.start end = date.end _symbol = utils.adaptive if db: if isinstance(db, list): _symbol = [f"{str(_db).split(':')[0]}::" for _db in db] # noqa else: _symbol = f"{str(db).split(':')[0]}::" # noqa _find_params = find_params if find_params is not None else {} _find_params.setdefault('pattern', pattern) if for_tick_type: _find_params['tick_type_field'] = for_tick_type _find_params.setdefault('show_tick_type', show_tick_type) _find_params.setdefault('symbology', symbology) _find_params.setdefault('show_original_symbols', show_original_symbols) super().__init__( _symbols=_symbol, _start=start, _end=end, _base_ep_func=lambda: self.base_ep(ep_tick_type=tick_type, keep_db=keep_db, **_find_params), ) self.schema['SYMBOL_NAME'] = str if _find_params['show_tick_type']: self.schema['TICK_TYPE'] = str if _find_params['symbology'] and _find_params['show_original_symbols']: self.schema['ORIGINAL_SYMBOL_NAME'] = str def base_ep(self, ep_tick_type, keep_db, **params): src = Source(otq.FindDbSymbols(**params)) src.tick_type(ep_tick_type) src.schema['SYMBOL_NAME'] = str if not keep_db: src["SYMBOL_NAME"] = src["SYMBOL_NAME"].str.regex_replace('.*::', '') return src @staticmethod def duplicate(obj, db=None): return Symbols(db=obj._p_db if db is None else db, pattern=obj._p_pattern, start=obj._p_start, end=obj._p_end, keep_db=obj._p_keep_db, for_tick_type=obj._p_for_tick_type)
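# Illustrative sketch (not used by the library): ``pattern`` narrows the symbol search
# with an SQL-style pattern, ``for_tick_type`` restricts it to one tick type, and
# ``Symbols.duplicate`` re-creates the source for another database. The database names
# below are hypothetical.
def _example_symbols_usage():
    symbols = otp.Symbols('SOME_DB', pattern='AA%', for_tick_type='TRD',
                          show_tick_type=True, keep_db=True)
    other = Symbols.duplicate(symbols, db='OTHER_DB')
    return otp.run(symbols), otp.run(other)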
def default_date_converter(date): return pd.to_datetime(date, format='%Y%m%d%H%M%S.%f') def to_timestamp_nanos(date, date_converter, tz): date = date_converter(date) if isinstance(date, ott.dt): date = date.ts else: date = pd.to_datetime(date) return date.tz_localize(tz) def LocalCSVTicks(path, start=utils.adaptive, end=utils.adaptive, date_converter=default_date_converter, additional_date_columns=None, converters=None, tz=None, ): """ Loads ticks from csv file, and creating otp.Ticks object from them Parameters ---------- path: str Absolute path to csv file start: datetime object Start of the query interval end: datetime object End of the query interval date_converter: A converter from string to datetime format, by default used only to TIMESTAMP column additional_date_columns: Other columns to convert to datetime format converters: Non default converters to columns from strings tz: timezone Returns ------- otp.Ticks """ if tz is None: tz = configuration.config.tz c = {'TIMESTAMP': partial(to_timestamp_nanos, date_converter=date_converter, tz=tz)} if converters is not None: c.update(converters) if additional_date_columns is not None: c.update({column: partial(to_timestamp_nanos, date_converter=date_converter, tz=tz, ) for column in additional_date_columns}) df = pd.read_csv(path, converters=c) df['TS_'] = df['TIMESTAMP'] df['SYMBOL_NAME'] = df['#SYMBOL_NAME'] d = df.to_dict(orient='list') del d['TIMESTAMP'] del d['#SYMBOL_NAME'] ticks = Ticks(d, start=start, end=end) ticks['TIMESTAMP'] = ticks['TS_'] ticks = ticks.drop('TS_') return ticks class SymbologyMapping(Source): _PROPERTIES = Source._PROPERTIES + ["_p_dest_symbology"] def __init__(self, dest_symbology: str = None, tick_type: str = None, start=utils.adaptive, end=utils.adaptive, symbols=utils.adaptive, **desired_schema): if self._try_default_constructor(**desired_schema): return if not dest_symbology or not tick_type: raise TypeError("Missing required argument: 'dest_symbology' or 'tick_type'") self._p_dest_symbology = dest_symbology super().__init__( _symbols=symbols, _start=start, _end=end, _base_ep_func=lambda: self.base_ep(dest_symbology, tick_type), **desired_schema ) self.schema['MAPPED_SYMBOL_NAME'] = str self.schema['END_DATETIME'] = ott.nsectime @property def dest_symbology(self): return self._p_dest_symbology def base_ep(self, dest_symbology, tick_type): src = Source(otq.SymbologyMapping(dest_symbology=dest_symbology)) src.tick_type(tick_type) return src class SplitQueryOutputBySymbol(Source): def __init__(self, query=None, symbol_field=None, single_invocation=False, db=utils.adaptive_to_default, tick_type=utils.adaptive, start=utils.adaptive, end=utils.adaptive, symbols=utils.adaptive, **desired_schema): if self._try_default_constructor(**desired_schema): return if isinstance(query, Source): # TODO: support already existing queries query = query.copy() otq_query = query._save_as_tmp_otq() q_start, q_end, _ = query._get_date_range() if start is utils.adaptive and end is utils.adaptive: start, end = q_start, q_end else: raise Exception('Non supported type of the `query` is specified') if db is utils.adaptive_to_default: db = configuration.config.get('default_db') if tick_type is utils.adaptive: tick_type = 'SPLIT_BY_SYMBOL' super().__init__( _symbols=symbols, _start=start, _end=end, _base_ep_func=partial(self.build, db, tick_type, symbol_field, otq_query, single_invocation), **desired_schema ) def build(self, db, tick_type, symbol_field_name, otq_query, single_invocation): src = 
Source(otq.SplitQueryOutputBySymbol(otq_query=otq_query, symbol_field_name=str(symbol_field_name), ensure_single_invocation=single_invocation)) if db: tick_type = str(db) + f'::{tick_type}' src.tick_type(tick_type) return src
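# Illustrative sketch (not used by the library) for the LocalCSVTicks helper defined
# above: the CSV file is expected to contain a '#SYMBOL_NAME' column and a 'TIMESTAMP'
# column in the '%Y%m%d%H%M%S.%f' format handled by default_date_converter. The file path
# and the extra datetime column name are hypothetical.
def _example_local_csv_ticks():
    ticks = LocalCSVTicks(
        '/tmp/trades.csv',                      # hypothetical path
        start=otp.dt(2022, 3, 1),
        end=otp.dt(2022, 3, 2),
        additional_date_columns=['EXCH_TIME'],  # hypothetical extra datetime column
    )
    return otp.run(ticks)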
[docs]def by_symbol(src: Source, symbol_field, single_invocation=False, db=utils.adaptive_to_default, tick_type=utils.adaptive, start=utils.adaptive, end=utils.adaptive, ) -> Source: """ Create a separate data series for each unique value of ``symbol_field`` in the output of ``src``. ``src`` must specify enough parameters to be run (e.g., symbols, query range). A typical use case is to split a single data series (e.g., from a CSV file) into separate data series by symbol. This method is a source. Parameters ---------- src: Source a query which output is to be split by ``symbol_field`` symbol_field: str the name of the field carrying symbol name in the ``src`` query single_invocation: bool, optional ``True`` means that the ``src`` query is run once and the result stored in memory speeding up the execution. ``False`` means that the ``src`` query is run for every symbol of the query saving memory but slowing down query execution. Default: ``False`` db: str, optional Database for running the query. Doesn't affect the ``src`` query. The default value is ``otp.config['default_db']``. tick_type: str, optional Tick type for the query. Doesn't affect the ``src`` query. start: otp.dt, optional By default it is taken from the ``src`` start time end: otp.dt, optional By default it is taken from the ``src`` end time See also -------- **SPLIT_QUERY_OUTPUT_BY_SYMBOL** OneTick event processor Examples -------- >>> executions = otp.CSV( # doctest: +SKIP ... otp.utils.file(os.path.join(cur_dir, 'data', 'example_events.csv')), ... converters={"time_number": lambda c: c.apply(otp.nsectime)}, ... timestamp_name="time_number", ... start=otp.dt(2022, 7, 1), ... end=otp.dt(2022, 7, 2), ... order_ticks=True ... )[['stock', 'px']] >>> csv = otp.by_symbol(executions, 'stock') # doctest: +SKIP >>> trd = otp.DataSource( # doctest: +SKIP ... db='NYSE_TAQ', ... tick_type='TRD', ... start=otp.dt(2022, 7, 1), ... end=otp.dt(2022, 7, 2) ... )[['PRICE', 'SIZE']] >>> data = otp.funcs.join_by_time([csv, trd]) # doctest: +SKIP >>> result = otp.run(data, symbols=executions.distinct(keys='stock')[['stock']], concurrency=8) # doctest: +SKIP >>> result['THG'] # doctest: +SKIP Time stock px PRICE SIZE 0 2022-07-01 11:37:56.432947200 THG 148.02 146.48 1 >>> result['TFX'] # doctest: +SKIP Time stock px PRICE SIZE 0 2022-07-01 11:39:45.882808576 TFX 255.61 251.97 1 >>> result['BURL'] # doctest: +SKIP Time stock px PRICE SIZE 0 2022-07-01 11:42:35.125718016 BURL 137.53 135.41 2 """ result = SplitQueryOutputBySymbol(src, symbol_field=symbol_field, single_invocation=single_invocation, db=db, tick_type=tick_type, start=start, end=end) result.schema.set(**src.schema) return result
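# Illustrative sketch (not used by the library): a minimal ``by_symbol`` round trip where
# an in-memory source is split by its ``SYM`` column. The column values and the symbol
# list passed to ``otp.run`` are hypothetical, and default query time limits from the
# configuration are assumed.
def _example_by_symbol():
    src = otp.Ticks(SYM=['A', 'B', 'A'], PX=[1.0, 2.0, 3.0])
    data = otp.by_symbol(src, 'SYM')
    # one data series per distinct value of SYM
    return otp.run(data, symbols=['A', 'B'])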
[docs]@docstring(parameters=OB_SNAPSHOT_DOC_PARAMS + DATA_SOURCE_DOC_PARAMS) def ObSnapshot(*args, **kwargs): """ Construct a source providing order book snapshot for a given ``db``. This is just a shortcut for otp.DataSource + otp.agg.ob_snapshot. See also -------- | :class:`onetick.py.DataSource` | :meth:`onetick.py.Source.ob_snapshot` | :func:`onetick.py.agg.ob_snapshot` | **OB_SNAPSHOT** OneTick event processor Examples --------- >>> data = otp.ObSnapshot(db='SOME_DB', tick_type='PRL', symbols='AA', max_levels=1) # doctest: +SKIP >>> otp.run(data) # doctest: +SKIP Time PRICE UPDATE_TIME SIZE LEVEL BUY_SELL_FLAG 0 2003-12-04 2.0 2003-12-01 00:00:00.003 6 1 1 1 2003-12-04 5.0 2003-12-01 00:00:00.004 7 1 0 """ aggregation_params = { param.name: kwargs.pop(param.name, param.default) for _, param in OB_SNAPSHOT_DOC_PARAMS } src = otp.DataSource(*args, **kwargs) return otp.agg.ob_snapshot(**aggregation_params).apply(src)
[docs]@docstring(parameters=OB_SNAPSHOT_WIDE_DOC_PARAMS + DATA_SOURCE_DOC_PARAMS) def ObSnapshotWide(*args, **kwargs): """ Construct a source providing order book wide snapshot for a given ``db``. This is just a shortcut for otp.DataSource + otp.agg.ob_snapshot_wide. See also -------- | :class:`onetick.py.DataSource` | :meth:`onetick.py.Source.ob_snapshot_wide` | :func:`onetick.py.agg.ob_snapshot_wide` | **OB_SNAPSHOT_WIDE** OneTick event processor Examples --------- >>> data = otp.ObSnapshotWide(db='SOME_DB', tick_type='PRL', symbols='AA', max_levels=1) # doctest: +SKIP >>> otp.run(data) # doctest: +SKIP Time BID_PRICE BID_UPDATE_TIME BID_SIZE ASK_PRICE ASK_UPDATE_TIME ASK_SIZE LEVEL 0 2003-12-03 5.0 2003-12-01 00:00:00.004 7 2.0 2003-12-01 00:00:00.003 6 1 """ aggregation_params = { param.name: kwargs.pop(param.name, param.default) for _, param in OB_SNAPSHOT_WIDE_DOC_PARAMS } src = otp.DataSource(*args, **kwargs) return otp.agg.ob_snapshot_wide(**aggregation_params).apply(src)
[docs]@docstring(parameters=OB_SNAPSHOT_FLAT_DOC_PARAMS + DATA_SOURCE_DOC_PARAMS) def ObSnapshotFlat(*args, **kwargs): """ Construct a source providing order book flat snapshot for a given ``db``. This is just a shortcut for otp.DataSource + otp.agg.ob_snapshot_flat. See also -------- | :class:`onetick.py.DataSource` | :meth:`onetick.py.Source.ob_snapshot_flat` | :func:`onetick.py.agg.ob_snapshot_flat` | **OB_SNAPSHOT_FLAT** OneTick event processor Examples --------- >>> data = otp.ObSnapshotFlat(db='SOME_DB', tick_type='PRL', symbols='AA', max_levels=1) # doctest: +SKIP >>> otp.run(data) # doctest: +SKIP Time BID_PRICE1 BID_UPDATE_TIME1 BID_SIZE1 ASK_PRICE1 ASK_UPDATE_TIME1 ASK_SIZE1 0 2003-12-03 5.0 2003-12-01 00:00:00.004 7 2.0 2003-12-01 00:00:00.003 6 """ aggregation_params = { param.name: kwargs.pop(param.name, param.default) for _, param in OB_SNAPSHOT_FLAT_DOC_PARAMS } src = otp.DataSource(*args, **kwargs) return otp.agg.ob_snapshot_flat(**aggregation_params).apply(src)
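# Illustrative sketch (not used by the library): the ObSnapshot* shortcuts above are
# equivalent to constructing a DataSource and applying the corresponding order-book
# aggregation manually, as their bodies show. The database, tick type and symbol below
# are hypothetical.
def _example_ob_snapshot_equivalence():
    shortcut = otp.ObSnapshot(db='SOME_DB', tick_type='PRL', symbols='AA', max_levels=1)
    manual = otp.agg.ob_snapshot(max_levels=1).apply(
        otp.DataSource(db='SOME_DB', tick_type='PRL', symbols='AA')
    )
    return otp.run(shortcut), otp.run(manual)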