Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Some missing linkages #77

Merged
merged 4 commits into from
Aug 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions src/akimbo/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ def __init__(self, accessor) -> None:

# listed below https://arrow.apache.org/docs/python/generated/
# pyarrow.compute.ceil_temporal.html
cast = dec(pc.cast) # TODO: move to .ak
ceil_temporal = dec_t(pc.ceil_temporal)
floor_temporal = dec_t(pc.floor_temporal)
round_temporal = dec_t(pc.round_temporal)
# The method was originally published under the misspelled name below; keep it
# as an alias of the same wrapped function so existing callers do not break.
reound_temporal = round_temporal
Expand Down Expand Up @@ -62,7 +61,7 @@ def __init__(self, accessor) -> None:
weeks_between = dec_t(pc.weeks_between)
years_between = dec_t(pc.years_between)

# TODO: strftime, strptime
strftime = dec_t(pc.strftime)

# TODO: timezone conversion

Expand Down
13 changes: 13 additions & 0 deletions src/akimbo/mixin.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
from typing import Callable, Iterable

import awkward as ak
import pyarrow.compute as pc

from akimbo.apply_tree import dec

methods = [
_ for _ in (dir(ak)) if not _.startswith(("_", "ak_")) and not _[0].isupper()
Expand Down Expand Up @@ -142,6 +145,16 @@ def __init__(self, obj, behavior=None):
self._obj = obj
self._behavior = behavior

def __call__(self, *args, behavior=None, **kwargs):
    """Build a fresh ``Accessor`` over the same wrapped object.

    Only ``behavior`` is used; positional arguments and any other keyword
    arguments are accepted but ignored.
    """
    rebound = Accessor(self._obj, behavior=behavior)
    return rebound

cast = dec(pc.cast)

@property
def accessor(self):
    """Return this object itself.

    NOTE(review): presumably this exists so that ``dec``-wrapped methods
    (such as ``cast``), which appear to expect an object exposing an
    ``.accessor`` attribute like the sub-accessors do, also work when
    defined directly on this class -- confirm against ``dec``.
    """
    return self

@classmethod
def is_series(cls, data):
return isinstance(data, cls.series_type)
Expand Down
11 changes: 10 additions & 1 deletion src/akimbo/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from collections.abc import Callable

import awkward as ak
import pyarrow.compute as pc

from akimbo.apply_tree import dec
from akimbo.mixin import Accessor
Expand Down Expand Up @@ -50,6 +51,12 @@ def _decode(layout):
if not aname.startswith(("_", "akstr_")) and not aname[0].isupper()
]

# Make sensible defaults for strptime: "%FT%T" (ISO-8601 style) format, second
# resolution, and error_is_null=True so parse failures yield nulls.
# ``functools.wraps`` copies pc.strptime's name and docstring onto this wrapper,
# so the notes live in comments rather than in a docstring that would be replaced.
@functools.wraps(pc.strptime)
def strptime(*args, format="%FT%T", unit="s", error_is_null=True, **kw):
    # Forward remaining keyword arguments (e.g. memory_pool) to pyarrow; they
    # were previously accepted and then silently discarded.
    return pc.strptime(
        *args, format=format, unit=unit, error_is_null=error_is_null, **kw
    )


class StringAccessor:
def __init__(self, accessor):
Expand Down Expand Up @@ -92,8 +99,10 @@ def f(*args, **kwargs):

return f

strptime = dec(strptime, match=match_string)

def __dir__(self) -> list[str]:
return sorted(methods)
return sorted(methods + ["strptime"])


Accessor.register_accessor("str", StringAccessor)
24 changes: 17 additions & 7 deletions tests/test_dt.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
import datetime
import os

import pytest

import akimbo.pandas # noqa

pd = pytest.importorskip("pandas")
WIN = os.name == "nt"


def test_cast():
s = pd.Series([[0, 1], [1, 0], [2]])
out = s.ak.dt.cast("timestamp[s]")
out = s.ak.cast("timestamp[s]")
assert str(out.dtype) == "list<item: timestamp[s]>[pyarrow]"
assert out.to_list() == [
[datetime.datetime(1970, 1, 1, 0, 0), datetime.datetime(1970, 1, 1, 0, 0, 1)],
Expand All @@ -20,7 +22,7 @@ def test_cast():

def test_unary_unit():
s = pd.Series([[0, 1], [1, 0], [2]])
ts = s.ak.dt.cast("timestamp[s]")
ts = s.ak.cast("timestamp[s]")
s2 = ts.ak.dt.second()
assert s.to_list() == s2.to_list()

Expand All @@ -35,8 +37,8 @@ def test_bad_type():
def test_binary():
s = pd.Series([[0, 1], [1, 0], [2]])
s2 = s.ak + 1
ts1 = s.ak.dt.cast("timestamp[s]")
ts2 = s2.ak.dt.cast("timestamp[s]")
ts1 = s.ak.cast("timestamp[s]")
ts2 = s2.ak.cast("timestamp[s]")

out = ts1.ak.dt.nanoseconds_between(ts2)
assert out.tolist() == [
Expand All @@ -50,8 +52,8 @@ def test_binary():
def test_binary_with_kwargs():
s = pd.Series([[0, 1], [1, 0], [2]])
s2 = s.ak + int(24 * 3600 * 7 * 2.5)
ts1 = s.ak.dt.cast("timestamp[s]")
ts2 = s2.ak.dt.cast("timestamp[s]")
ts1 = s.ak.cast("timestamp[s]")
ts2 = s2.ak.cast("timestamp[s]")

out = ts1.ak.dt.weeks_between(ts2, count_from_zero=False, week_start=2)
assert out.tolist() == [[2, 2], [2, 2], [2]]
Expand All @@ -64,8 +66,16 @@ def test_mixed_record():
s = pd.Series(data)

# explicit select of where to apply transform
ts = s.ak.dt.cast("timestamp[s]", where="a")
ts = s.ak.cast("timestamp[s]", where="a")

# implicit selection of timestamps
s2 = ts.ak.dt.second()
assert s2.to_list() == data


@pytest.mark.skipif(WIN, reason="arrow on windows needs a timezone database")
def test_text_conversion():
    """Parse timestamp strings with the default strptime settings, then format
    them back with strftime and check the text (including the null) survives."""
    expected = [["2024-08-01T01:00:00", None, "2024-08-01T01:01:00"]]
    text = pd.Series([["2024-08-01T01:00:00", None, "2024-08-01T01:01:00"]])
    parsed = text.ak.str.strptime()
    formatted = parsed.ak.dt.strftime("%FT%T")
    assert formatted.tolist() == expected
6 changes: 3 additions & 3 deletions tests/test_str.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,11 @@ def test_encode_decode():


def test_split():
s = pd.Series(["hello world", "oio", ""])
s = pd.Series(["hello world", "oio", pd.NA, ""])
s2 = s.ak.str.split_whitespace()
assert s2.tolist() == [["hello", "world"], ["oio"], [""]]
assert s2.tolist() == [["hello", "world"], ["oio"], pd.NA, [""]]
s2 = s.ak.str.split_pattern("i")
assert s2.tolist() == [["hello world"], ["o", "o"], [""]]
assert s2.tolist() == [["hello world"], ["o", "o"], pd.NA, [""]]

s = pd.Series([b"hello world", b"oio", b""])
s2 = s.ak.str.split_whitespace()
Expand Down
Loading