28
28
DtypeArg ,
29
29
FilePath ,
30
30
IndexLabel ,
31
+ JSONEngine ,
31
32
JSONSerializable ,
32
33
ReadBuffer ,
33
34
StorageOptions ,
66
67
build_table_schema ,
67
68
parse_table_schema ,
68
69
)
70
+ from pandas .io .json .arrow_json_parser_wrapper import ArrowJsonParserWrapper
69
71
from pandas .io .parsers .readers import validate_integer
70
72
71
73
if TYPE_CHECKING :
@@ -389,6 +391,7 @@ def read_json(
389
391
date_unit : str | None = ...,
390
392
encoding : str | None = ...,
391
393
encoding_errors : str | None = ...,
394
+ engine : JSONEngine = ...,
392
395
lines : bool = ...,
393
396
chunksize : int ,
394
397
compression : CompressionOptions = ...,
@@ -417,6 +420,7 @@ def read_json(
417
420
compression : CompressionOptions = ...,
418
421
nrows : int | None = ...,
419
422
storage_options : StorageOptions = ...,
423
+ engine : JSONEngine = ...,
420
424
) -> JsonReader [Literal ["series" ]]:
421
425
...
422
426
@@ -440,6 +444,7 @@ def read_json(
440
444
compression : CompressionOptions = ...,
441
445
nrows : int | None = ...,
442
446
storage_options : StorageOptions = ...,
447
+ engine : JSONEngine = ...,
443
448
) -> Series :
444
449
...
445
450
@@ -463,6 +468,7 @@ def read_json(
463
468
compression : CompressionOptions = ...,
464
469
nrows : int | None = ...,
465
470
storage_options : StorageOptions = ...,
471
+ engine : JSONEngine = ...,
466
472
) -> DataFrame :
467
473
...
468
474
@@ -489,6 +495,7 @@ def read_json(
489
495
compression : CompressionOptions = "infer" ,
490
496
nrows : int | None = None ,
491
497
storage_options : StorageOptions = None ,
498
+ engine : JSONEngine = "ujson" ,
492
499
) -> DataFrame | Series | JsonReader :
493
500
"""
494
501
Convert a JSON string to pandas object.
@@ -605,6 +612,9 @@ def read_json(
605
612
606
613
.. versionadded:: 1.3.0
607
614
615
+ engine : {{'ujson', 'pyarrow'}}, default "ujson"
616
+ Parser engine to use.
617
+
608
618
lines : bool, default False
609
619
Read the file as a json object per line.
610
620
@@ -746,6 +756,7 @@ def read_json(
746
756
nrows = nrows ,
747
757
storage_options = storage_options ,
748
758
encoding_errors = encoding_errors ,
759
+ engine = engine ,
749
760
)
750
761
751
762
if chunksize :
@@ -782,6 +793,7 @@ def __init__(
782
793
nrows : int | None ,
783
794
storage_options : StorageOptions = None ,
784
795
encoding_errors : str | None = "strict" ,
796
+ engine : JSONEngine = "ujson" ,
785
797
) -> None :
786
798
787
799
self .orient = orient
@@ -793,6 +805,7 @@ def __init__(
793
805
self .precise_float = precise_float
794
806
self .date_unit = date_unit
795
807
self .encoding = encoding
808
+ self .engine = engine
796
809
self .compression = compression
797
810
self .storage_options = storage_options
798
811
self .lines = lines
@@ -810,9 +823,48 @@ def __init__(
810
823
self .nrows = validate_integer ("nrows" , self .nrows , 0 )
811
824
if not self .lines :
812
825
raise ValueError ("nrows can only be passed if lines=True" )
826
+ if self .engine == "pyarrow" :
827
+ if not self .lines :
828
+ raise ValueError (
829
+ "currently pyarrow engine only supports "
830
+ "the line-delimited JSON format"
831
+ )
832
+ if self .engine not in ["pyarrow" , "ujson" ]:
833
+ raise ValueError (
834
+ f"The engine type { self .engine } is currently not supported."
835
+ )
836
+
837
+ if self .engine == "pyarrow" :
838
+ self ._engine = self ._make_engine (filepath_or_buffer )
839
+ if self .engine == "ujson" :
840
+ data = self ._get_data_from_filepath (filepath_or_buffer )
841
+ self .data = self ._preprocess_data (data )
842
+
def _make_engine(
    self,
    filepath_or_buffer: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
) -> ArrowJsonParserWrapper:
    """
    Build the pyarrow parsing engine for this reader.

    Opens ``filepath_or_buffer`` (unless it is already a list of lines)
    and wraps the resulting stream in an ``ArrowJsonParserWrapper``.

    Parameters
    ----------
    filepath_or_buffer : FilePath or ReadBuffer
        Path or open buffer pointing at line-delimited JSON input.

    Returns
    -------
    ArrowJsonParserWrapper
        Wrapper around the opened handle, ready for ``.read()``.
    """
    if not isinstance(filepath_or_buffer, list):
        # Open in binary mode: the pyarrow reader consumes raw bytes,
        # so no text decoding layer is wanted here.
        self.handles = get_handle(
            self._get_data_from_filepath(filepath_or_buffer),
            mode="rb",
            encoding=self.encoding,
            is_text=False,
            compression=self.compression,
            storage_options=self.storage_options,
            errors=self.encoding_errors,
        )
        filepath_or_buffer = self.handles.handle

    try:
        return ArrowJsonParserWrapper(filepath_or_buffer)
    except Exception:
        # Do not leak the handle we just opened if wrapper
        # construction fails.
        if self.handles is not None:
            self.handles.close()
        raise
817
869
def _preprocess_data (self , data ):
818
870
"""
def read(self) -> DataFrame | Series:
    """
    Read the whole JSON input into a pandas object.

    Dispatches on ``self.engine``: the pyarrow engine delegates to the
    ``ArrowJsonParserWrapper`` built in ``_make_engine``; the ujson
    engine parses ``self.data`` directly (optionally line-delimited,
    chunked, or truncated to ``nrows``).

    Returns
    -------
    DataFrame or Series
        The parsed pandas object produced by the selected engine.
    """
    obj: DataFrame | Series
    # ``engine`` is validated in __init__, so exactly one branch runs;
    # ``elif`` makes the mutual exclusion explicit (the original used
    # two independent ``if`` tests).
    if self.engine == "pyarrow":
        obj = self._engine.read()
        # Fix: the original only closed on the ujson path, leaking the
        # handle opened by _make_engine for the pyarrow engine.
        # NOTE(review): close() is assumed to be a no-op when no handle
        # was opened (list input) — confirm against JsonReader.close().
        self.close()
    elif self.engine == "ujson":
        if self.lines:
            if self.chunksize:
                # Chunked mode: concatenate the chunks yielded by
                # iterating this reader itself.
                obj = concat(self)
            elif self.nrows:
                # Parse only the first ``nrows`` line-delimited records.
                lines = list(islice(self.data, self.nrows))
                lines_json = self._combine_lines(lines)
                obj = self._get_object_parser(lines_json)
            else:
                # Whole input: merge every line into one JSON array.
                data = ensure_str(self.data)
                data_lines = data.split("\n")
                obj = self._get_object_parser(self._combine_lines(data_lines))
        else:
            obj = self._get_object_parser(self.data)
        self.close()
    return obj
914
969
915
970
def _get_object_parser (self , json ) -> DataFrame | Series :
0 commit comments