|
37 | 37 | trim_labels, |
38 | 38 | calculate_correlations, |
39 | 39 | plot_store_correlation_matrices, |
| 40 | + bin_categorical, |
40 | 41 | ) |
41 | 42 | from mostlyai.qa.sampling import pull_data_for_accuracy, sample_two_consecutive_rows |
42 | 43 | from mostlyai.qa.common import ( |
@@ -496,6 +497,14 @@ def test_num_col_nans_only(self): |
496 | 497 | df_counts = df["nans"].value_counts().to_dict() |
497 | 498 | assert df_counts["(n/a)"] == 10 |
498 | 499 |
|
| 500 | + def test_bin_categorical(self): |
| 501 | + x = pd.Series(["a", "b"] * 50 + ["x"]) |
| 502 | + col, _ = bin_categorical(x, 5) |
| 503 | + assert len(col) == 101 |
| 504 | + x = pd.Series([True, False] * 50 + [np.nan] * 100, dtype="object") |
| 505 | + col, _ = bin_categorical(x, 5) |
| 506 | + assert len(col) == 200 |
| 507 | + |
499 | 508 | def test_bin_numeric(self): |
500 | 509 | # test several edge cases |
501 | 510 | cases = [ |
@@ -534,6 +543,10 @@ def test_bin_datetime(self): |
534 | 543 | ), |
535 | 544 | ["⪰ 2023-01-30 13:00:00.333000"] * 20, |
536 | 545 | ), # two values |
| 546 | + ( |
| 547 | + pd.Series([pd.NaT, "2024-11-20"], dtype="datetime64[ns]"), |
| 548 | + ["(n/a)", "⪰ 2024-Nov-20"], |
| 549 | + ), # single value with leading N/A |
537 | 550 | ] |
538 | 551 |
|
539 | 552 | for col, expected in cases: |
|
0 commit comments