diff-diff/diff_diff/linalg.py at aeb6ecced23f4a413f2839669b809c4d6715a7d8 · igerber/diff-diff · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
"""
Unified linear algebra backend for diff-diff.

This module provides optimized OLS and variance estimation with an optional
Rust backend for maximum performance.

The key optimizations are:
1. scipy.linalg.lstsq with 'gelsd' driver (SVD-based, handles rank-deficient matrices)
2. Vectorized cluster-robust SE via groupby (eliminates O(n*clusters) loop)
3. Single interface for all estimators (reduces code duplication)
4. Optional Rust backend for additional speedup (when available)
5. R-style rank deficiency handling: detect, warn, and set NA for dropped columns

The Rust backend is automatically used when available, with transparent
fallback to NumPy/SciPy implementations.

Rank Deficiency Handling
------------------------
When a design matrix is rank-deficient (has linearly dependent columns), the OLS
solution is not unique. This module follows R's `lm()` approach:

1. Detect rank deficiency using pivoted QR decomposition
2. Identify which columns are linearly dependent
3. Drop redundant columns from the solve
4. Set NA (NaN) for coefficients of dropped columns
5. Warn with clear message listing dropped columns
6. Compute valid SEs for remaining (identified) coefficients

This is controlled by the `rank_deficient_action` parameter:
- "warn" (default): Emit warning, set NA for dropped coefficients
- "error": Raise ValueError with dropped column information
- "silent": No warning, but still set NA for dropped coefficients
"""

import warnings
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd
from scipy import stats
from scipy.linalg import lstsq as scipy_lstsq
from scipy.linalg import qr

# Import Rust backend if available (from _backend to avoid circular imports)
from diff_diff._backend import (
    HAS_RUST_BACKEND,
    _rust_compute_robust_vcov,
    _rust_solve_ols,
)


# =============================================================================
# Utility Functions
# =============================================================================


def _factorize_cluster_ids(cluster_ids: np.ndarray) -> np.ndarray:
    """
    Convert cluster IDs to contiguous integer codes for Rust backend.

    Handles string, categorical, or non-contiguous integer cluster IDs by
    mapping them to contiguous integers starting from 0.

    Parameters
    ----------
    cluster_ids : np.ndarray
        Cluster identifiers (can be strings, integers, or categorical).

    Returns
    -------
    np.ndarray
        Integer cluster codes (dtype int64) suitable for Rust backend.
    """
    # Use pandas factorize for efficient conversion of any dtype
    codes, _ = pd.factorize(cluster_ids)
    return codes.astype(np.int64)


# =============================================================================
# Rank Deficiency Detection and Handling
# =============================================================================


def _detect_rank_deficiency(
    X: np.ndarray,
    rcond: Optional[float] = None,
) -> Tuple[int, np.ndarray, np.ndarray]:
    """
    Detect rank deficiency using pivoted QR decomposition.

    This follows R's lm() approach of using pivoted QR to detect which columns
    are linearly dependent. The pivoting ensures we drop the "least important"
    columns (those with smallest contribution to the column space).

    Parameters
    ----------
    X : ndarray of shape (n, k)
        Design matrix.
    rcond : float, optional
        Relative condition number threshold for determining rank.
        Diagonal elements of R smaller than rcond * max(|R_ii|) are treated
        as zero. If None, uses 1e-07 to match R's qr() default tolerance.

    Returns
    -------
    rank : int
        Numerical rank of the matrix.
    dropped_cols : ndarray of int
        Indices of columns that are linearly dependent (should be dropped).
        Empty if matrix is full rank.
    pivot : ndarray of int
        Column permutation from QR decomposition.
    """
    n, k = X.shape

    # Compute pivoted QR decomposition: X @ P = Q @ R
    # P is a permutation matrix, represented as pivot indices
    Q, R, pivot = qr(X, mode="economic", pivoting=True)

    # Determine rank tolerance
    # R's qr() uses tol = 1e-07 by default, which is sqrt(eps) ≈ 1.49e-08
    # We use 1e-07 to match R's lm() behavior for consistency
    if rcond is None:
        rcond = 1e-07

    # The diagonal of R contains information about linear independence
    # After pivoting, |R[i,i]| is decreasing
    r_diag = np.abs(np.diag(R))

    # Find numerical rank: count singular values above threshold
    # The threshold is relative to the largest diagonal element
    if r_diag[0] == 0:
        rank = 0
    else:
        tol = rcond * r_diag[0]
        rank = int(np.sum(r_diag > tol))

    # Columns after rank position (in pivot order) are linearly dependent
    # We need to map back to original column indices
    if rank < k:
        dropped_cols = np.sort(pivot[rank:])
    else:
        dropped_cols = np.array([], dtype=int)

    return rank, dropped_cols, pivot


def _format_dropped_columns(
    dropped_cols: np.ndarray,
    column_names: Optional[List[str]] = None,
) -> str:
    """
    Format dropped column information for error/warning messages.

    Parameters
    ----------
    dropped_cols : ndarray of int
        Indices of dropped columns.
    column_names : list of str, optional
        Names for the columns. If None, uses indices.

    Returns
    -------
    str
        Formatted string describing dropped columns.
    """
    if len(dropped_cols) == 0:
        return ""

    if column_names is not None:
        names = [column_names[i] if i < len(column_names) else f"column {i}" for i in dropped_cols]
        if len(names) == 1:
            return f"'{names[0]}'"
        elif len(names) <= 5:
            return ", ".join(f"'{n}'" for n in names)
        else:
            shown = ", ".join(f"'{n}'" for n in names[:5])
            return f"{shown}, ... and {len(names) - 5} more"
    else:
        if len(dropped_cols) == 1:
            return f"column {dropped_cols[0]}"
        elif len(dropped_cols) <= 5:
            return ", ".join(f"column {i}" for i in dropped_cols)
        else:
            shown = ", ".join(f"column {i}" for i in dropped_cols[:5])
            return f"{shown}, ... and {len(dropped_cols) - 5} more"


def _expand_coefficients_with_nan(
    coef_reduced: np.ndarray,
    k_full: int,
    kept_cols: np.ndarray,
) -> np.ndarray:
    """
    Expand reduced coefficients to full size, filling dropped columns with NaN.

    Parameters
    ----------
    coef_reduced : ndarray of shape (rank,)
        Coefficients for kept columns only.
    k_full : int
        Total number of columns in original design matrix.
    kept_cols : ndarray of int
        Indices of columns that were kept.

    Returns
    -------
    ndarray of shape (k_full,)
        Full coefficient vector with NaN for dropped columns.
    """
    coef_full = np.full(k_full, np.nan)
    coef_full[kept_cols] = coef_reduced
    return coef_full


def _expand_vcov_with_nan(
    vcov_reduced: np.ndarray,
    k_full: int,
    kept_cols: np.ndarray,
) -> np.ndarray:
    """
    Expand reduced vcov matrix to full size, filling dropped entries with NaN.

    Parameters
    ----------
    vcov_reduced : ndarray of shape (rank, rank)
        Variance-covariance matrix for kept columns only.
    k_full : int
        Total number of columns in original design matrix.
    kept_cols : ndarray of int
        Indices of columns that were kept.

    Returns
    -------
    ndarray of shape (k_full, k_full)
        Full vcov matrix with NaN for dropped rows/columns.
    """
    vcov_full = np.full((k_full, k_full), np.nan)
    # Use advanced indexing to fill in the kept entries
    ix = np.ix_(kept_cols, kept_cols)
    vcov_full[ix] = vcov_reduced
    return vcov_full


def _solve_ols_rust(
    X: np.ndarray,
    y: np.ndarray,
    *,
    cluster_ids: Optional[np.ndarray] = None,
    return_vcov: bool = True,
    return_fitted: bool = False,
) -> Optional[
    Union[
        Tuple[np.ndarray, np.ndarray, Optional[np.ndarray]],
        Tuple[np.ndarray, np.ndarray, np.ndarray, Optional[np.ndarray]],
    ]
]:
    """
    Solve OLS through the Rust backend (intended for full-rank designs).

    Callers are expected to invoke this only when the Rust backend is
    available and the design matrix is full rank. Rank-deficient designs are
    routed to the Python backend, which uses pivoted QR to identify and drop
    dependent columns and report R-style NA coefficients; the Rust solver is
    SVD-based and cannot say which specific columns to drop. For full-rank
    matrices both backends are expected to agree.

    Parameters
    ----------
    X : np.ndarray
        Design matrix of shape (n, k), must be full rank.
    y : np.ndarray
        Response vector of shape (n,).
    cluster_ids : np.ndarray, optional
        Cluster identifiers for cluster-robust SEs. Converted to int64 codes
        before being handed to Rust, so any label dtype is accepted.
    return_vcov : bool
        Whether to compute the variance-covariance matrix.
    return_fitted : bool
        Whether to include fitted values in the result.

    Returns
    -------
    tuple or None
        ``(coefficients, residuals, vcov)``, or
        ``(coefficients, residuals, fitted, vcov)`` when ``return_fitted`` is
        True. ``vcov`` is passed through as None when the backend returns
        None for it. Returns None (after emitting a ``UserWarning``) when the
        backend raises a ``ValueError`` whose message mentions
        "numerically unstable" or "singular", signalling the caller to fall
        back to the Python backend.

    Raises
    ------
    ValueError
        Any other ``ValueError`` from the Rust backend is re-raised unchanged.
    """
    # Rust expects integer cluster codes; leave None untouched.
    cluster_codes = None if cluster_ids is None else _factorize_cluster_ids(cluster_ids)

    try:
        coefficients, residuals, vcov = _rust_solve_ols(
            X, y, cluster_ids=cluster_codes, return_vcov=return_vcov
        )
    except ValueError as exc:
        lowered = str(exc).lower()
        recoverable = "numerically unstable" in lowered or "singular" in lowered
        if not recoverable:
            raise
        warnings.warn(
            f"Rust backend detected numerical instability: {exc}. "
            "Falling back to Python backend.",
            UserWarning,
            stacklevel=3,
        )
        # None tells the caller to retry with the Python implementation.
        return None

    # Normalise whatever the backend returned into NumPy arrays.
    beta = np.asarray(coefficients)
    resid = np.asarray(residuals)
    cov = None if vcov is None else np.asarray(vcov)

    if not return_fitted:
        return beta, resid, cov

    # Fitted values are recomputed here from the returned coefficients.
    return beta, resid, np.dot(X, beta), cov


def solve_ols(
    X: np.ndarray,
    y: np.ndarray,
    *,
    cluster_ids: Optional[np.ndarray] = None,
    return_vcov: bool = True,
    return_fitted: bool = False,
    check_finite: bool = True,
    rank_deficient_action: str = "warn",
    column_names: Optional[List[str]] = None,
    skip_rank_check: bool = False,
) -> Union[
    Tuple[np.ndarray, np.ndarray, Optional[np.ndarray]],
    Tuple[np.ndarray, np.ndarray, np.ndarray, Optional[np.ndarray]],
]:
    """
    Fit an OLS regression, optionally with cluster-robust standard errors.

    This is the single OLS entry point for all diff-diff estimators. It
    validates the inputs, checks the design matrix for rank deficiency, and
    dispatches to the Rust backend (full-rank designs, when available) or to
    the NumPy/SciPy backend (everything else).

    Parameters
    ----------
    X : ndarray of shape (n, k)
        Design matrix (include an intercept column if one is wanted).
    y : ndarray of shape (n,)
        Response vector.
    cluster_ids : ndarray of shape (n,), optional
        Cluster identifiers for cluster-robust standard errors.
        If None, HC1 (heteroskedasticity-robust) SEs are computed.
    return_vcov : bool, default True
        Whether to compute and return the variance-covariance matrix.
        Pass False to skip that work when SEs are not needed.
    return_fitted : bool, default False
        Whether to return fitted values in addition to residuals.
    check_finite : bool, default True
        Whether to verify that X and y contain only finite values.
        Pass False to skip the check when the data is known to be clean.
    rank_deficient_action : str, default "warn"
        What to do when the design matrix is rank-deficient:
        - "warn": emit a warning and set NaN for dropped coefficients (R-style)
        - "error": raise ValueError naming the dropped columns
        - "silent": set NaN for dropped coefficients without warning
    column_names : list of str, optional
        Column names used in warning/error messages. Columns are referred to
        by index when this is None.
    skip_rank_check : bool, default False
        If True, skip the pivoted QR rank check (saving O(nk²) work) and go
        straight to the Rust backend when it is available, otherwise to the
        Python backend without rank detection. Use only when the design
        matrix is known to be full rank: a rank-deficient matrix will not be
        detected and results may be incorrect (minimum-norm solution instead
        of R-style NA handling).

    Returns
    -------
    coefficients : ndarray of shape (k,)
        OLS coefficient estimates. For rank-deficient matrices, coefficients
        of linearly dependent columns are NaN.
    residuals : ndarray of shape (n,)
        Residuals (y - fitted). For rank-deficient matrices the fitted values
        use only the identified coefficients.
    fitted : ndarray of shape (n,), optional
        Fitted values; only returned if return_fitted=True. For rank-deficient
        matrices these come from the reduced system
        (X_reduced @ coefficients_reduced).
    vcov : ndarray of shape (k, k) or None
        Variance-covariance matrix (HC1 or cluster-robust), with NaN rows and
        columns for dropped coefficients. None if return_vcov=False.

    Raises
    ------
    ValueError
        If X is not 2-D, y is not 1-D, their lengths differ, there are fewer
        observations than parameters, ``rank_deficient_action`` is not a
        recognised value, or (with ``check_finite``) X or y contains NaN/Inf.

    Notes
    -----
    Rank-deficient matrices are handled following R's lm():

    1. Detect linearly dependent columns via pivoted QR
    2. Drop redundant columns and solve the reduced system
    3. Set NaN for coefficients of dropped columns
    4. Compute valid SEs for identified coefficients only
    5. Expand the vcov matrix with NaN for dropped rows/columns

    The cluster-robust standard errors use the sandwich estimator with the
    standard small-sample adjustment: (G/(G-1)) * ((n-1)/(n-k)).

    Examples
    --------
    >>> import numpy as np
    >>> from diff_diff.linalg import solve_ols
    >>> X = np.column_stack([np.ones(100), np.random.randn(100)])
    >>> y = 2 + 3 * X[:, 1] + np.random.randn(100)
    >>> coef, resid, vcov = solve_ols(X, y)
    >>> print(f"Intercept: {coef[0]:.2f}, Slope: {coef[1]:.2f}")

    For rank-deficient matrices with collinear columns:

    >>> X = np.random.randn(100, 3)
    >>> X[:, 2] = X[:, 0] + X[:, 1]  # Perfect collinearity
    >>> y = np.random.randn(100)
    >>> coef, resid, vcov = solve_ols(X, y)  # Emits warning
    >>> print(np.isnan(coef[2]))  # Dropped column has NaN coefficient
    True
    """
    # ---- Input validation -------------------------------------------------
    X = np.asarray(X, dtype=np.float64)
    y = np.asarray(y, dtype=np.float64)

    if X.ndim != 2:
        raise ValueError(f"X must be 2-dimensional, got shape {X.shape}")
    if y.ndim != 1:
        raise ValueError(f"y must be 1-dimensional, got shape {y.shape}")

    n_obs, n_params = X.shape
    if n_obs != y.shape[0]:
        raise ValueError(
            f"X and y must have same number of observations: {n_obs} vs {y.shape[0]}"
        )
    if n_obs < n_params:
        raise ValueError(
            f"Fewer observations ({n_obs}) than parameters ({n_params}). "
            "Cannot solve underdetermined system."
        )

    valid_actions = {"warn", "error", "silent"}
    if rank_deficient_action not in valid_actions:
        raise ValueError(
            f"rank_deficient_action must be one of {valid_actions}, "
            f"got '{rank_deficient_action}'"
        )

    if check_finite:
        # X is checked before y so the first offending input is the one named.
        for label, values in (("X", X), ("y", y)):
            if not np.isfinite(values).all():
                raise ValueError(
                    f"{label} contains NaN or Inf values. "
                    "Clean your data or set check_finite=False to skip this check."
                )

    # ---- Shared call arguments -------------------------------------------
    # The solver calls below are made directly from this function (no wrapper)
    # so the stacklevel of warnings raised inside the backends is unaffected.
    rust_usable = HAS_RUST_BACKEND and _rust_solve_ols is not None
    solver_kwargs = dict(
        cluster_ids=cluster_ids,
        return_vcov=return_vcov,
        return_fitted=return_fitted,
    )
    numpy_kwargs = dict(
        solver_kwargs,
        rank_deficient_action=rank_deficient_action,
        column_names=column_names,
    )

    # ---- Fast path: caller vouches for full rank --------------------------
    if skip_rank_check:
        if rust_usable:
            rust_result = _solve_ols_rust(X, y, **solver_kwargs)
            if rust_result is not None:
                return rust_result
            # None means Rust reported numerical instability: use Python below.
        return _solve_ols_numpy(X, y, _skip_rank_check=True, **numpy_kwargs)

    # ---- Rank detection ---------------------------------------------------
    # Pivoted QR costs O(nk²) but is what identifies which columns to drop and
    # lets rank-deficient designs be routed to the Python backend.
    rank, dropped_cols, pivot = _detect_rank_deficiency(X)
    is_rank_deficient = len(dropped_cols) > 0

    # Rank-deficient designs, or no Rust backend: Python handles it, reusing
    # the rank information computed above.
    if is_rank_deficient or not rust_usable:
        return _solve_ols_numpy(
            X,
            y,
            _precomputed_rank_info=(rank, dropped_cols, pivot),
            **numpy_kwargs,
        )

    # ---- Full rank with Rust available ------------------------------------
    rust_result = _solve_ols_rust(X, y, **solver_kwargs)

    if rust_result is None:
        # Rust signalled numerical instability; Python re-detects rank itself.
        return _solve_ols_numpy(X, y, _precomputed_rank_info=None, **numpy_kwargs)

    # vcov is always the last element of the backend result. NaN entries mean
    # the Rust solve judged the matrix ill-conditioned even though the QR
    # check above reported full rank.
    rust_vcov = rust_result[-1]
    vcov_has_nan = return_vcov and rust_vcov is not None and np.any(np.isnan(rust_vcov))
    if not vcov_has_nan:
        return rust_result

    warnings.warn(
        "Rust backend detected ill-conditioned matrix (NaN in variance-covariance). "
        "Re-running with Python backend for proper rank detection.",
        UserWarning,
        stacklevel=2,
    )
    # Do not pass the cached rank info: the Python backend runs its own
    # detection for this retry.
    return _solve_ols_numpy(X, y, _precomputed_rank_info=None, **numpy_kwargs)


def _solve_ols_numpy(
    X: np.ndarray,
    y: np.ndarray,
    *,
    cluster_ids: Optional[np.ndarray] = None,
    return_vcov: bool = True,
    return_fitted: bool = False,
    rank_deficient_action: str = "warn",
    column_names: Optional[List[str]] = None,
    _precomputed_rank_info: Optional[Tuple[int, np.ndarray, np.ndarray]] = None,
    _skip_rank_check: bool = False,
) -> Union[
    Tuple[np.ndarray, np.ndarray, Optional[np.ndarray]],
    Tuple[np.ndarray, np.ndarray, np.ndarray, Optional[np.ndarray]],
]:
    """
    NumPy/SciPy implementation of solve_ols with R-style rank deficiency handling.

    Detects rank-deficient matrices using pivoted QR decomposition and handles
    them following R's lm() approach: drop redundant columns, set NA (NaN) for
    their coefficients, and compute valid SEs for identified coefficients only.

    Parameters
    ----------
    X : np.ndarray
        Design matrix of shape (n, k).
    y : np.ndarray
        Response vector of shape (n,).
    cluster_ids : np.ndarray, optional
        Cluster identifiers for cluster-robust SEs.
    return_vcov : bool
        Whether to compute variance-covariance matrix.
    return_fitted : bool
        Whether to return fitted values.
    rank_deficient_action : str
        How to handle rank deficiency: "warn", "error", or "silent".
    column_names : list of str, optional
        Names for the columns (used in warning/error messages).
    _precomputed_rank_info : tuple, optional
        Pre-computed (rank, dropped_cols, pivot) from _detect_rank_deficiency.
        Used internally to avoid redundant computation when called from solve_ols.
    _skip_rank_check : bool, default False
        If True, skip rank detection entirely and assume full rank.
        Used when caller has already determined matrix is full rank.

    Returns
    -------
    coefficients : np.ndarray
        OLS coefficients of shape (k,). NaN for dropped columns.
    residuals : np.ndarray
        Residuals of shape (n,).
    fitted : np.ndarray, optional
        Fitted values if return_fitted=True.
    vcov : np.ndarray, optional
        Variance-covariance matrix if return_vcov=True. NaN for dropped rows/cols.
    """
    n, k = X.shape

    # Determine rank deficiency status
    if _skip_rank_check:
        # Caller guarantees full rank - skip expensive QR decomposition
        is_rank_deficient = False
        dropped_cols = np.array([], dtype=int)
    elif _precomputed_rank_info is not None:
        # Use pre-computed rank info
        rank, dropped_cols, pivot = _precomputed_rank_info
        is_rank_deficient = len(dropped_cols) > 0
    else:
        # Compute rank via pivoted QR
        rank, dropped_cols, pivot = _detect_rank_deficiency(X)
        is_rank_deficient = len(dropped_cols) > 0

    if is_rank_deficient:
        # Format dropped column information for messages
        dropped_str = _format_dropped_columns(dropped_cols, column_names)

        if rank_deficient_action == "error":
            raise ValueError(
                f"Design matrix is rank-deficient. {k - rank} of {k} columns are "
                f"linearly dependent and cannot be uniquely estimated: {dropped_str}. "
                "This indicates multicollinearity in your model specification."
            )
        elif rank_deficient_action == "warn":
            warnings.warn(
                f"Rank-deficient design matrix: dropping {k - rank} of {k} columns "
                f"({dropped_str}). Coefficients for these columns are set to NA. "
                "This may indicate multicollinearity in your model specification.",
                UserWarning,
                stacklevel=3,  # Point to user code that called solve_ols
            )
        # else: "silent" - no warning

        # Extract kept columns for the reduced solve
        kept_cols = np.array([i for i in range(k) if i not in dropped_cols])
        X_reduced = X[:, kept_cols]

        # Solve the reduced system (now full-rank)
        # Use cond=1e-07 for consistency with Rust backend and QR rank tolerance
        coefficients_reduced = scipy_lstsq(
            X_reduced, y, lapack_driver="gelsd", check_finite=False, cond=1e-07
        )[0]

        # Expand coefficients to full size with NaN for dropped columns
        coefficients = _expand_coefficients_with_nan(coefficients_reduced, k, kept_cols)

        # Compute residuals using only the identified coefficients
        # Note: Dropped coefficients are NaN, so we use the reduced form
        fitted = np.dot(X_reduced, coefficients_reduced)
        residuals = y - fitted

        # Compute variance-covariance matrix for reduced system, then expand
        vcov = None
        if return_vcov:
            vcov_reduced = _compute_robust_vcov_numpy(X_reduced, residuals, cluster_ids)
            vcov = _expand_vcov_with_nan(vcov_reduced, k, kept_cols)
    else:
        # Full-rank case: proceed normally
        # Use cond=1e-07 for consistency with Rust backend and QR rank tolerance
        coefficients = scipy_lstsq(X, y, lapack_driver="gelsd", check_finite=False, cond=1e-07)[0]

        # Compute residuals and fitted values
        fitted = np.dot(X, coefficients)
        residuals = y - fitted

        # Compute variance-covariance matrix if requested
        vcov = None
        if return_vcov:
            vcov = _compute_robust_vcov_numpy(X, residuals, cluster_ids)

    if return_fitted:
        return coefficients, residuals, fitted, vcov
    else:
        return coefficients, residuals, vcov


def compute_robust_vcov(
    X: np.ndarray,
    residuals: np.ndarray,
    cluster_ids: Optional[np.ndarray] = None,
) -> np.ndarray:
    """
    Robust variance-covariance matrix via the sandwich estimator.

    Dispatches to the Rust backend when ``HAS_RUST_BACKEND`` is truthy and
    otherwise to ``_compute_robust_vcov_numpy``.

    Parameters
    ----------
    X : ndarray of shape (n, k)
        Design matrix.
    residuals : ndarray of shape (n,)
        OLS residuals.
    cluster_ids : ndarray of shape (n,), optional
        Cluster identifiers. When None, the heteroskedasticity-robust (HC1)
        form is computed; otherwise the cluster-robust form.

    Returns
    -------
    vcov : ndarray of shape (k, k)
        Variance-covariance matrix.

    Raises
    ------
    ValueError
        If the Rust backend reports "Matrix inversion failed" (re-raised with
        a rank-deficiency message), or for any other ``ValueError`` from the
        backend that is not a numerical-instability report.

    Notes
    -----
    The estimator has the form ``(X'X)^{-1} * meat * (X'X)^{-1}``.

    In the NumPy implementation:

    - HC1: ``meat = X' diag(u^2) X`` scaled by ``n / (n - k)``.
    - Cluster-robust: ``meat = sum_g (X_g' u_g)(X_g' u_g)'`` scaled by
      ``(G / (G-1)) * ((n-1) / (n-k))``.

    When the Rust backend reports numerical instability, a ``UserWarning`` is
    emitted and the NumPy implementation is used instead.
    """
    if not HAS_RUST_BACKEND:
        # No compiled backend: go straight to the NumPy implementation
        return _compute_robust_vcov_numpy(X, residuals, cluster_ids)

    # Hand the backend C-contiguous float64 buffers
    X = np.ascontiguousarray(X, dtype=np.float64)
    residuals = np.ascontiguousarray(residuals, dtype=np.float64)

    # Arbitrary cluster labels are recoded to int64 codes for the backend
    cluster_codes = (
        None if cluster_ids is None else pd.factorize(cluster_ids)[0].astype(np.int64)
    )

    try:
        return _rust_compute_robust_vcov(X, residuals, cluster_codes)
    except ValueError as e:
        message = str(e)

        # Singular X'X: surface the same message the NumPy path raises
        if "Matrix inversion failed" in message:
            raise ValueError(
                "Design matrix is rank-deficient (singular X'X matrix). "
                "This indicates perfect multicollinearity. Check your fixed effects "
                "and covariates for linear dependencies."
            ) from e

        # Anything other than a numerical-instability report is propagated as-is
        if "numerically unstable" not in message.lower():
            raise

        # stacklevel=2 attributes the warning to whoever called this function
        warnings.warn(
            f"Rust backend detected numerical instability: {e}. "
            "Falling back to Python backend for variance computation.",
            UserWarning,
            stacklevel=2,
        )
        # The fallback keeps the original cluster labels, not the int codes
        return _compute_robust_vcov_numpy(X, residuals, cluster_ids)


def _compute_robust_vcov_numpy(
    X: np.ndarray,
    residuals: np.ndarray,
    cluster_ids: Optional[np.ndarray] = None,
) -> np.ndarray:
    """
    Pure NumPy/pandas sandwich estimator (HC1 or cluster-robust).

    Parameters
    ----------
    X : np.ndarray
        Design matrix of shape (n, k).
    residuals : np.ndarray
        OLS residuals of shape (n,).
    cluster_ids : np.ndarray, optional
        Cluster identifiers. None selects HC1 with the ``n / (n - k)``
        adjustment. Otherwise the cluster-robust form is used with the
        ``(G / (G-1)) * ((n-1) / (n-k))`` adjustment, where G is the number
        of distinct cluster labels.

    Returns
    -------
    vcov : np.ndarray
        Variance-covariance matrix of shape (k, k).

    Raises
    ------
    ValueError
        If fewer than 2 distinct clusters are supplied, or if solving against
        X'X fails with a "Singular" ``LinAlgError``.

    Notes
    -----
    Per-cluster score sums are obtained with a single pandas groupby
    aggregation instead of an explicit Python loop over clusters.
    """
    n_obs, n_params = X.shape
    gram = X.T @ X  # X'X, the "bread" of the sandwich

    if cluster_ids is not None:
        # --- Cluster-robust branch ---
        cluster_ids = np.asarray(cluster_ids)
        n_clusters = np.unique(cluster_ids).size

        if n_clusters < 2:
            raise ValueError(f"Need at least 2 clusters for cluster-robust SEs, got {n_clusters}")

        # Small-sample correction: cluster-count factor times dof factor
        cluster_factor = n_clusters / (n_clusters - 1)
        dof_factor = (n_obs - 1) / (n_obs - n_params)
        scale = cluster_factor * dof_factor

        # Row i of the score matrix is X[i] * residuals[i]; shape (n, k)
        obs_scores = X * residuals[:, None]

        # One row per cluster holding the within-cluster sum of scores; (G, k)
        per_cluster = pd.DataFrame(obs_scores).groupby(cluster_ids).sum().to_numpy()

        # sum_g s_g s_g' expressed as a single matrix product; (k, k)
        meat = per_cluster.T @ per_cluster
    else:
        # --- HC1 branch ---
        scale = n_obs / (n_obs - n_params)
        sq_resid = residuals**2
        # X' diag(u^2) X without materialising the diagonal matrix
        meat = np.dot(X.T, X * sq_resid[:, None])

    # (X'X)^{-1} meat (X'X)^{-1} via two linear solves rather than an
    # explicit inverse: first (X'X) H = meat, then (X'X) V' = H'.
    try:
        half = np.linalg.solve(gram, meat)
        sandwich = np.linalg.solve(gram, half.T).T
    except np.linalg.LinAlgError as e:
        if "Singular" not in str(e):
            raise
        raise ValueError(
            "Design matrix is rank-deficient (singular X'X matrix). "
            "This indicates perfect multicollinearity. Check your fixed effects "
            "and covariates for linear dependencies."
        ) from e

    return scale * sandwich


# Empirical threshold: coefficients above this magnitude suggest near-separation
# in the logistic model (predicted probabilities collapse to 0/1).
_LOGIT_SEPARATION_COEF_THRESHOLD = 10
# NOTE(review): presumably the distance from 0 or 1 below which a predicted
# probability is treated as collapsed; its use is not visible from here --
# confirm against the separation check that consumes it.
_LOGIT_SEPARATION_PROB_THRESHOLD = 1e-5


def solve_logit(
    X: np.ndarray,
    y: np.ndarray,
    max_iter: int = 25,
    tol: float = 1e-8,
    check_separation: bool = True,
    rank_deficient_action: str = "warn",
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Fit logistic regression via IRLS (Fisher scoring).

    Matches R's ``glm(family=binomial)`` algorithm: iteratively reweighted
    least squares with working weights ``mu*(1-mu)`` and working response
    ``eta + (y-mu)/(mu*(1-mu))``.

    Parameters
    ----------
    X : np.ndarray
        Feature matrix (n_samples, n_features). Intercept added automatically.
    y : np.ndarray
        Binary outcome (0/1).
    max_iter : int, default 25
        Maximum IRLS iterations (R's ``glm`` default).
    tol : float, default 1e-8
        Convergence tolerance on coefficient change (R's ``glm`` default).
    check_separation : bool, default True
        Whether to check for near-separation and emit warnings.
    rank_deficient_action : str, default "warn"
        How to handle rank-deficient design matrices:
        - "warn": Emit warning and drop columns (default)
        - "error": Raise ValueError
        - "silent": Drop columns silently

    Returns
    -------
    beta : np.ndarray
        Fitted coefficients (including intercept as element 0).
    probs : np.ndarray
        Predicted probabilities.
    """
    n, p = X.shape
    X_with_intercept = np.column_stack([np.ones(n), X])
    k = p + 1  # number of parameters including intercept

    # Validate rank_deficient_action
    valid_actions = {"warn", "error", "silent"}
    if rank_deficient_action not in valid_actions:
        raise ValueError(
            f"rank_deficient_action must be one of {valid_actions}, "
            f"got '{rank_deficient_action}'"
        )

    # Check rank deficiency once before iterating
    rank_info = _detect_rank_deficiency(X_with_intercept)
    rank, dropped_cols, _ = rank_info
    if len(dropped_cols) > 0:
        col_desc = _format_dropped_columns(dropped_cols)
        if rank_deficient_action == "error":
            raise ValueError(
                f"Rank-deficient design matrix in logistic regression: "
                f"dropping {col_desc}. Propensity score estimates may be unreliable."
            )
        elif rank_deficient_action == "warn":
            warnings.warn(
                f"Rank-deficient design matrix in logistic regression: "
                f"dropping {col_desc}. Propensity score estimates may be unreliable.",
                UserWarning,
                stacklevel=2,
            )
        kept_cols = np.array([i for i in range(k) if i not in dropped_cols])
        X_solve = X_with_intercept[:, kept_cols]
    else:
        kept_cols = np.arange(k)
        X_solve = X_with_intercept

    # IRLS (Fisher scoring)
    beta_solve = np.zeros(X_solve.shape[1])
    converged = False

    for iteration in range(max_iter):
        eta = X_solve @ beta_solve
        # Clip to prevent overflow in exp
        eta = np.clip(eta, -500, 500)
        mu = 1.0 / (1.0 + np.exp(-eta))
        # Clip mu to prevent zero working weights
        mu = np.clip(mu, 1e-10, 1 - 1e-10)

        # Working weights and working response
        w = mu * (1.0 - mu)
        z = eta + (y - mu) / w

        # Weighted least squares: solve (X'WX) beta = X'Wz
        sqrt_w = np.sqrt(w)
        Xw = X_solve * sqrt_w[:, None]
        zw = z * sqrt_w
        beta_new, _, _, _ = np.linalg.lstsq(Xw, zw, rcond=None)

        # Check convergence
        if np.max(np.abs(beta_new - beta_solve)) < tol:
            beta_solve = beta_new
            converged = True
            break
        beta_solve = beta_new

    # Final predicted probabilities
    eta_final = X_solve @ beta_solve
    eta_final = np.clip(eta_final, -500, 500)
    probs = 1.0 / (1.0 + np.exp(-eta_final))

    # Warnings
    if not converged:
        warnings.warn(
            f"Logistic regression did not converge in {max_iter} iterations. "
            f"Propensity score estimates may be unreliable.",
            UserWarning,
            stacklevel=2,
        )