@@ -28,6 +28,7 @@
     DtypeArg,
     FilePath,
     IndexLabel,
+    JSONEngine,
     JSONSerializable,
     ReadBuffer,
     StorageOptions,
@@ -66,6 +67,7 @@
     build_table_schema,
     parse_table_schema,
 )
+from pandas.io.json.arrow_json_parser_wrapper import ArrowJsonParserWrapper
 from pandas.io.parsers.readers import validate_integer

 if TYPE_CHECKING:
@@ -389,6 +391,7 @@ def read_json(
     date_unit: str | None = ...,
     encoding: str | None = ...,
     encoding_errors: str | None = ...,
+    engine: JSONEngine = ...,
     lines: bool = ...,
     chunksize: int,
     compression: CompressionOptions = ...,
@@ -417,6 +420,7 @@ def read_json(
     compression: CompressionOptions = ...,
     nrows: int | None = ...,
     storage_options: StorageOptions = ...,
+    engine: JSONEngine = ...,
 ) -> JsonReader[Literal["series"]]:
     ...

@@ -440,6 +444,7 @@ def read_json(
     compression: CompressionOptions = ...,
     nrows: int | None = ...,
     storage_options: StorageOptions = ...,
+    engine: JSONEngine = ...,
 ) -> Series:
     ...

@@ -463,6 +468,7 @@ def read_json(
     compression: CompressionOptions = ...,
     nrows: int | None = ...,
     storage_options: StorageOptions = ...,
+    engine: JSONEngine = ...,
 ) -> DataFrame:
     ...

@@ -489,6 +495,7 @@ def read_json(
     compression: CompressionOptions = "infer",
     nrows: int | None = None,
     storage_options: StorageOptions = None,
+    engine: JSONEngine = "ujson",
 ) -> DataFrame | Series | JsonReader:
     """
     Convert a JSON string to pandas object.
@@ -605,6 +612,9 @@ def read_json(

         .. versionadded:: 1.3.0

+    engine : {{'ujson', 'pyarrow'}}, default "ujson"
+        Parser engine to use.
+
     lines : bool, default False
         Read the file as a json object per line.

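Note that the pyarrow engine is only usable together with lines=True; the guard added to JsonReader.__init__ further down in this diff rejects anything else. A minimal usage sketch (the file name data.jsonl is hypothetical, and pyarrow must be installed):

import pandas as pd

# two records of newline-delimited JSON
with open("data.jsonl", "w") as f:
    f.write('{"a": 1, "b": "x"}\n{"a": 2, "b": "y"}\n')

df_ujson = pd.read_json("data.jsonl", lines=True)                    # default engine="ujson"
df_arrow = pd.read_json("data.jsonl", lines=True, engine="pyarrow")  # new pyarrow-backed path
print(df_arrow.equals(df_ujson))  # both engines should yield the same frame for this input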
@@ -740,6 +750,7 @@ def read_json(
         precise_float=precise_float,
         date_unit=date_unit,
         encoding=encoding,
+        engine=engine,
         lines=lines,
         chunksize=chunksize,
         compression=compression,
@@ -782,6 +793,7 @@ def __init__(
         nrows: int | None,
         storage_options: StorageOptions = None,
         encoding_errors: str | None = "strict",
+        engine: JSONEngine = "ujson",
     ) -> None:

         self.orient = orient
@@ -793,6 +805,7 @@ def __init__(
         self.precise_float = precise_float
         self.date_unit = date_unit
         self.encoding = encoding
+        self.engine = engine
         self.compression = compression
         self.storage_options = storage_options
         self.lines = lines
@@ -810,9 +823,46 @@ def __init__(
             self.nrows = validate_integer("nrows", self.nrows, 0)
             if not self.lines:
                 raise ValueError("nrows can only be passed if lines=True")
+        if self.engine == "pyarrow":
+            if not self.lines:
+                raise ValueError(
+                    "currently pyarrow engine only supports "
+                    "the line-delimited JSON format"
+                )
+        if self.engine not in ["pyarrow", "ujson"]:
+            raise ValueError("This engine type is currently not supported.")
+
+        if self.engine == "pyarrow":
+            self._engine = self._make_engine(filepath_or_buffer)
+        if self.engine == "ujson":
+            data = self._get_data_from_filepath(filepath_or_buffer)
+            self.data = self._preprocess_data(data)
+
+    def _make_engine(
+        self,
+        filepath_or_buffer: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
+    ) -> ArrowJsonParserWrapper:
+
+        if not isinstance(filepath_or_buffer, list):
+            is_text = False
+            mode = "rb"
+            self.handles = get_handle(
+                self._get_data_from_filepath(filepath_or_buffer),
+                mode=mode,
+                encoding=self.encoding,
+                is_text=is_text,
+                compression=self.compression,
+                storage_options=self.storage_options,
+                errors=self.encoding_errors,
+            )
+            filepath_or_buffer = self.handles.handle

-        data = self._get_data_from_filepath(filepath_or_buffer)
-        self.data = self._preprocess_data(data)
+        try:
+            return ArrowJsonParserWrapper(filepath_or_buffer)
+        except Exception:
+            if self.handles is not None:
+                self.handles.close()
+            raise

     def _preprocess_data(self, data):
         """
@@ -896,20 +946,23 @@ def read(self) -> DataFrame | Series:
         Read the whole JSON input into a pandas object.
         """
         obj: DataFrame | Series
-        if self.lines:
-            if self.chunksize:
-                obj = concat(self)
-            elif self.nrows:
-                lines = list(islice(self.data, self.nrows))
-                lines_json = self._combine_lines(lines)
-                obj = self._get_object_parser(lines_json)
+        if self.engine == "pyarrow":
+            obj = self._engine.read()
+        if self.engine == "ujson":
+            if self.lines:
+                if self.chunksize:
+                    obj = concat(self)
+                elif self.nrows:
+                    lines = list(islice(self.data, self.nrows))
+                    lines_json = self._combine_lines(lines)
+                    obj = self._get_object_parser(lines_json)
+                else:
+                    data = ensure_str(self.data)
+                    data_lines = data.split("\n")
+                    obj = self._get_object_parser(self._combine_lines(data_lines))
             else:
-                data = ensure_str(self.data)
-                data_lines = data.split("\n")
-                obj = self._get_object_parser(self._combine_lines(data_lines))
-        else:
-            obj = self._get_object_parser(self.data)
-        self.close()
+                obj = self._get_object_parser(self.data)
+            self.close()
         return obj

     def _get_object_parser(self, json) -> DataFrame | Series:
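The read() dispatch above leaves the chunked and nrows code paths on the ujson side and routes a plain pyarrow read through the wrapper's read(). The one behaviour pinned down directly by this diff is the line-delimited restriction; a quick check of that guard (hedged sketch, assumes this branch of pandas and that pyarrow is available):

import io

import pandas as pd

try:
    pd.read_json(io.StringIO('{"a": 1}'), engine="pyarrow")  # lines defaults to False
except ValueError as err:
    print(err)  # "currently pyarrow engine only supports the line-delimited JSON format"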