|
""" |
|
DSV data can be surprisingly diverse. While Python's `csv` module offers out-of-the-box support |
|
for the basic formatting parameters, CSVW recognizes a couple more, like `skipColumns` or |
|
`skipRows`. |
|
|
|
.. seealso:: |
|
|
|
- `<https://www.w3.org/TR/2015/REC-tabular-metadata-20151217/#dialect-descriptions>`_ |
|
- `<https://docs.python.org/3/library/csv.html#dialects-and-formatting-parameters>`_ |
|
- `<https://specs.frictionlessdata.io/csv-dialect/>`_ |
|
""" |
|
import attr |
|
import warnings |
|
import functools |
|
|
|
from . import utils |
|
|
|
__all__ = ['Dialect'] |
|
|
|
# Encoding labels which are translated to a different codec name before being handed to Python.
# Lookups are done as `ENCODING_MAP.get(label, label)`, i.e. labels not listed here are used as is.
ENCODING_MAP = {'UTF-8-BOM': 'utf-8-sig'}
|
|
|
|
|
|
|
def _non_negative(instance, attribute, value):
    """
    Validator with the `(instance, attribute, value)` signature of `attrs` validators.

    :param instance: The object being validated (not inspected).
    :param attribute: Descriptor of the validated attribute; only its `name` is used.
    :param value: The value to check.
    :raises ValueError: If `value` is less than zero.
    """
    if not value < 0:
        return
    raise ValueError('{0} is not a valid {1}'.format(value, attribute.name))
|
|
|
|
|
# Validator chain shared by the counting properties of a dialect: the value must be an `int`
# and must not be less than zero.
non_negative_int = [
    attr.validators.instance_of(int),
    _non_negative,
]
|
|
|
|
|
def convert_encoding(s):
    """
    Converter for the `encoding` property of a dialect.

    The input is first passed through `utils.converter(str, 'utf-8', s)`. The result is then
    probed by encoding a one-character string with it (after translation via `ENCODING_MAP`).

    :param s: The encoding label as given in the dialect description.
    :return: The label as returned by `utils.converter` if Python knows a matching codec; \
    otherwise a warning is issued and `'utf-8'` is returned.
    """
    label = utils.converter(str, 'utf-8', s)
    try:
        # Only the codec lookup matters here, the encoded bytes are thrown away.
        'x'.encode(ENCODING_MAP.get(label, label))
    except LookupError:
        warnings.warn('Invalid value for property: {}'.format(label))
        return 'utf-8'
    return label
|
|
|
|
|
@attr.s
class Dialect:
    """
    A CSV dialect specification.

    The attributes carry the names of the dialect description properties. Arguments passed to
    the constructor are first run through the converter of the respective attribute; the
    result is then checked by the attribute's validator (if there is one).

    .. note::

        `escape_character`, `line_terminators` and `trimmer` are cached on first access, so
        they do not reflect later assignments to the attributes they are derived from. Use
        :meth:`Dialect.updated` to derive a modified dialect rather than mutating one.

    .. seealso:: `<https://www.w3.org/TR/2015/REC-tabular-metadata-20151217/#dialect-descriptions>`_
    """
    # NOTE(review): `utils.converter` is defined elsewhere. From its use below, the positional
    # arguments look like (expected type, fallback value), with `allow_none` and `cond` as
    # optional extras - confirm against `utils`.

    # Character encoding of the data; unknown encodings are replaced with 'utf-8'
    # (see `convert_encoding`).
    encoding = attr.ib(
        default='utf-8',
        converter=convert_encoding,
        validator=attr.validators.instance_of(str))

    # Line terminators. `line_terminators` (below) also copes with a single `str` here.
    lineTerminators = attr.ib(
        default=attr.Factory(lambda: ['\r\n', '\n']),
        converter=functools.partial(utils.converter, list, ['\r\n', '\n']))

    # Quote character; `None` is let through by the converter (`allow_none=True`).
    quoteChar = attr.ib(
        default='"',
        converter=functools.partial(utils.converter, str, '"', allow_none=True),
    )

    # Selects the escape character: '"' if true, '\\' if false (see `escape_character`).
    doubleQuote = attr.ib(
        default=True,
        converter=functools.partial(utils.converter, bool, True),
        validator=attr.validators.instance_of(bool))

    skipRows = attr.ib(
        default=0,
        converter=functools.partial(utils.converter, int, 0, cond=lambda s: s >= 0),
        validator=non_negative_int)

    commentPrefix = attr.ib(
        default='#',
        converter=functools.partial(utils.converter, str, '#', allow_none=True),
        validator=attr.validators.optional(attr.validators.instance_of(str)))

    header = attr.ib(
        default=True,
        converter=functools.partial(utils.converter, bool, True),
        validator=attr.validators.instance_of(bool))

    headerRowCount = attr.ib(
        default=1,
        converter=functools.partial(utils.converter, int, 1, cond=lambda s: s >= 0),
        validator=non_negative_int)

    delimiter = attr.ib(
        default=',',
        converter=functools.partial(utils.converter, str, ','),
        validator=attr.validators.instance_of(str))

    skipColumns = attr.ib(
        default=0,
        converter=functools.partial(utils.converter, int, 0, cond=lambda s: s >= 0),
        validator=non_negative_int)

    skipBlankRows = attr.ib(
        default=False,
        converter=functools.partial(utils.converter, bool, False),
        validator=attr.validators.instance_of(bool))

    skipInitialSpace = attr.ib(
        default=False,
        converter=functools.partial(utils.converter, bool, False),
        validator=attr.validators.instance_of(bool))

    # One of 'true', 'false', 'start', 'end'. Python booleans are accepted as well: they are
    # turned into 'true'/'false' before the value is handed to `utils.converter`.
    trim = attr.ib(
        default='false',
        converter=lambda v: utils.converter(
            (str, bool), 'false', str(v).lower() if isinstance(v, bool) else v),
        validator=attr.validators.in_(['true', 'false', 'start', 'end']))

    def updated(self, **kw):
        """
        Create a copy of the dialect with the properties given as keyword arguments replaced.

        Keyword arguments naming dialect properties are passed to the constructor, so they
        are converted and validated exactly like at instantiation. (Assigning them to the
        copy afterwards would bypass both, e.g. leaving `trim=True` as a `bool` which
        `trimmer` cannot handle.) Any other keyword argument is set as plain attribute on
        the copy.

        :param kw: New values, keyed by attribute name.
        :return: A new instance of the same class; `self` is not modified.
        """
        field_names = {field.name for field in attr.fields(self.__class__)}
        init_kw = attr.asdict(self)
        extra = {}
        for key, value in kw.items():
            target = init_kw if key in field_names else extra
            target[key] = value
        res = self.__class__(**init_kw)
        for key, value in extra.items():
            setattr(res, key, value)
        return res

    @functools.cached_property
    def escape_character(self):
        """
        `None` if there is no quote character, otherwise '"' if `doubleQuote` is set and
        '\\\\' if not.
        """
        if self.quoteChar is None:
            return None
        return '"' if self.doubleQuote else '\\'

    @functools.cached_property
    def line_terminators(self):
        """
        `lineTerminators` as list, i.e. a single `str` is wrapped in a one-element list.
        """
        if isinstance(self.lineTerminators, str):
            return [self.lineTerminators]
        return self.lineTerminators

    @functools.cached_property
    def trimmer(self):
        """
        A callable taking a `str` and returning it stripped according to `trim`.
        """
        return {
            'true': lambda s: s.strip(),
            'false': lambda s: s,
            'start': lambda s: s.lstrip(),
            'end': lambda s: s.rstrip(),
        }[self.trim]

    def asdict(self, omit_defaults=True):
        """
        Delegate to `utils.attr_asdict`, passing `omit_defaults` through.
        """
        return utils.attr_asdict(self, omit_defaults=omit_defaults)

    @property
    def python_encoding(self):
        """
        `encoding`, translated via `ENCODING_MAP` where the map has an entry for it.
        """
        return ENCODING_MAP.get(self.encoding, self.encoding)

    def as_python_formatting_parameters(self):
        """
        The dialect expressed in terms of the formatting parameters of Python's `csv` module.

        :return: `dict` with the keys `delimiter`, `doublequote`, `escapechar`, \
        `lineterminator`, `quotechar`, `skipinitialspace` and `strict`.
        """
        return {
            'delimiter': self.delimiter,
            'doublequote': self.doubleQuote,
            # With `doubleQuote` set, `escape_character` is the quote character itself.
            # Doubling is signalled to `csv` via `doublequote`, so no `escapechar` is passed
            # in that case.
            'escapechar': self.escape_character if not self.doubleQuote else None,
            # Only the first line terminator is passed on.
            'lineterminator': self.line_terminators[0],
            'quotechar': self.quoteChar,
            'skipinitialspace': self.skipInitialSpace,
            'strict': True,
        }
|
|