22
22
from __future__ import annotations
23
23
24
24
import ast
25
+ import copy
25
26
import dataclasses
26
27
import datetime
27
28
import functools
30
31
import textwrap
31
32
import typing
32
33
from typing import (
34
+ Any ,
33
35
Iterable ,
34
36
List ,
35
37
Literal ,
49
51
import pyarrow as pa
50
52
51
53
from bigframes import session
52
- import bigframes ._config . sampling_options as sampling_options
54
+ from bigframes ._config import sampling_options
53
55
import bigframes .constants
54
56
import bigframes .core as core
55
57
import bigframes .core .compile .googlesql as googlesql
@@ -535,19 +537,9 @@ def to_pandas(
535
537
Returns:
536
538
pandas.DataFrame, QueryJob
537
539
"""
538
- if (sampling_method is not None ) and (sampling_method not in _SAMPLING_METHODS ):
539
- raise NotImplementedError (
540
- f"The downsampling method { sampling_method } is not implemented, "
541
- f"please choose from { ',' .join (_SAMPLING_METHODS )} ."
542
- )
543
-
544
- sampling = bigframes .options .sampling .with_max_download_size (max_download_size )
545
- if sampling_method is not None :
546
- sampling = sampling .with_method (sampling_method ).with_random_state ( # type: ignore
547
- random_state
548
- )
549
- else :
550
- sampling = sampling .with_disabled ()
540
+ sampling = self ._get_sampling_option (
541
+ max_download_size , sampling_method , random_state
542
+ )
551
543
552
544
df , query_job = self ._materialize_local (
553
545
materialize_options = MaterializationOptions (
@@ -559,6 +551,27 @@ def to_pandas(
559
551
df .set_axis (self .column_labels , axis = 1 , copy = False )
560
552
return df , query_job
561
553
554
def _get_sampling_option(
    self,
    max_download_size: Optional[int] = None,
    sampling_method: Optional[str] = None,
    random_state: Optional[int] = None,
) -> sampling_options.SamplingOptions:
    """Derive the sampling options to use for a download request.

    Starts from the global ``bigframes.options.sampling`` settings, applies
    ``max_download_size`` to them, and then either switches sampling off
    (no method requested) or selects the requested method and random state.

    Args:
        max_download_size:
            Passed to ``with_max_download_size`` on the global sampling
            options.
        sampling_method:
            Name of the downsampling method. ``None`` yields options with
            sampling disabled.
        random_state:
            Passed to ``with_random_state``; only used when a sampling
            method is given.

    Returns:
        The sampling options derived from the global settings.

    Raises:
        NotImplementedError:
            If ``sampling_method`` is given but is not one of
            ``_SAMPLING_METHODS``.
    """
    method_requested = sampling_method is not None

    # Validate before touching the global options so that an unknown method
    # fails fast.
    if method_requested and sampling_method not in _SAMPLING_METHODS:
        raise NotImplementedError(
            f"The downsampling method { sampling_method } is not implemented, "
            f"please choose from { ',' .join (_SAMPLING_METHODS )} ."
        )

    base_options = bigframes.options.sampling.with_max_download_size(
        max_download_size
    )
    if method_requested:
        with_method = base_options.with_method(sampling_method)  # type: ignore
        return with_method.with_random_state(random_state)

    return base_options.with_disabled()
574
+
562
575
def try_peek (
563
576
self , n : int = 20 , force : bool = False , allow_large_results = None
564
577
) -> typing .Optional [pd .DataFrame ]:
@@ -798,11 +811,73 @@ def split(
798
811
return [sliced_block .drop_columns (drop_cols ) for sliced_block in sliced_blocks ]
799
812
800
813
def _compute_dry_run(
    self,
    value_keys: Optional[Iterable[str]] = None,
    *,
    ordered: bool = True,
    max_download_size: Optional[int] = None,
    sampling_method: Optional[str] = None,
    random_state: Optional[int] = None,
) -> typing.Tuple[pd.Series, bigquery.QueryJob]:
    """Dry-run this block's query and summarize the outcome as a Series.

    The returned Series combines local metadata about the block (column
    count and dtypes, index levels and dtypes) with fields read from the
    dry-run job's API representation (job reference, job type, selected
    query configuration and statistics, creation time).

    Args:
        value_keys:
            Forwarded to ``_apply_value_keys_to_expr`` to build the
            expression that is dry-run.
        ordered:
            Forwarded positionally to the session executor's ``dry_run``.
        max_download_size:
            Used only to resolve the sampling options.
        sampling_method:
            Used only to resolve the sampling options.
        random_state:
            Used only to resolve the sampling options.

    Returns:
        A ``(stats, query_job)`` tuple, where ``stats`` is the summary Series
        and ``query_job`` is the object returned by the executor's dry run.

    Raises:
        NotImplementedError:
            If the resolved sampling options have downsampling enabled, or
            if ``sampling_method`` is rejected by ``_get_sampling_option``.
    """
    sampling = self._get_sampling_option(
        max_download_size, sampling_method, random_state
    )
    if sampling.enable_downsampling:
        raise NotImplementedError("Dry run with sampling is not supported")

    # Ordered (label, value) pairs; split into index/values at the very end.
    # A list of pairs (rather than a dict) keeps any repeated labels intact.
    column_count = len(self.value_columns)
    column_dtypes = {
        label: self.expr.get_column_type(self.resolve_label_exact_or_error(label))
        for label in self.column_labels
    }
    stats: List[typing.Tuple[Any, Any]] = [
        ("columnCount", column_count),
        ("columnDtypes", column_dtypes),
        ("indexLevel", self.index.nlevels),
        ("indexDtypes", self.index.dtypes),
    ]

    expr = self._apply_value_keys_to_expr(value_keys=value_keys)
    query_job = self.session._executor.dry_run(expr, ordered)
    # Work on a copy so the job's own property mapping is never shared with
    # the values placed in the result.
    job_api_repr = copy.deepcopy(query_job._properties)

    # Every entry of the job reference becomes its own row.
    stats.extend(job_api_repr["jobReference"].items())

    configuration = job_api_repr["configuration"]
    stats.append(("jobType", configuration["jobType"]))

    query_config = configuration["query"]
    stats.extend(
        (field, query_config.get(field))
        for field in ("destinationTable", "useLegacySql")
    )

    statistics = job_api_repr["statistics"]
    query_stats = statistics["query"]
    stats.extend(
        (field, query_stats.get(field))
        for field in (
            "referencedTables",
            "totalBytesProcessed",
            "cacheHit",
            "statementType",
        )
    )

    # creationTime is interpreted as milliseconds since the epoch, in UTC.
    created = pd.Timestamp(statistics["creationTime"], unit="ms", tz="UTC")
    stats.append(("creationTime", created))

    index = [label for label, _ in stats]
    values = [value for _, value in stats]
    return pd.Series(values, index=index), query_job
806
881
807
882
def _apply_value_keys_to_expr (self , value_keys : Optional [Iterable [str ]] = None ):
808
883
expr = self ._expr
@@ -2703,11 +2778,18 @@ def to_pandas(
2703
2778
"Cannot materialize index, as this object does not have an index. Set index column(s) using set_index."
2704
2779
)
2705
2780
ordered = ordered if ordered is not None else True
2781
+
2706
2782
df , query_job = self ._block .select_columns ([]).to_pandas (
2707
- ordered = ordered , allow_large_results = allow_large_results
2783
+ ordered = ordered ,
2784
+ allow_large_results = allow_large_results ,
2708
2785
)
2709
2786
return df .index , query_job
2710
2787
2788
def _compute_dry_run(
    self, *, ordered: bool = True
) -> Tuple[pd.Series, bigquery.QueryJob]:
    """Dry-run the query for this index alone.

    Selects an empty list of columns from the underlying block and delegates
    to that block's ``_compute_dry_run``.

    Args:
        ordered:
            Forwarded unchanged to the block's ``_compute_dry_run``.

    Returns:
        The ``(stats, query_job)`` tuple produced by the block's dry run.
    """
    index_only_block = self._block.select_columns([])
    return index_only_block._compute_dry_run(ordered=ordered)
2792
+
2711
2793
def resolve_level (self , level : LevelsType ) -> typing .Sequence [str ]:
2712
2794
if utils .is_list_like (level ):
2713
2795
levels = list (level )
0 commit comments