添加链接
link管理
链接快照平台
  • 输入网页链接,自动生成快照
  • 标签化管理网页链接
Databricks Community
Py4JJavaError Traceback (most recent call last) <command-3204971640072140> in <cell line: 2>() 1 # Read excel file ----> 2 sample1DF = (spark.read.format("com.crealytics.spark.excel") 3 .option("Header", "true") 4 .option("inferSchema", "false") 5 .option("treatEmptyValuesAsNulls", "false") /databricks/spark/python/pyspark/instrumentation_utils.py in wrapper(*args, **kwargs) 46 start = time.perf_counter() 47 try: ---> 48 res = func(*args, **kwargs) 49 logger.log_success( 50 module_name, class_name, function_name, time.perf_counter() - start, signature /databricks/spark/python/pyspark/sql/readwriter.py in load(self, path, format, schema, **options) 175 self.options(**options) 176 if isinstance(path, str): --> 177 return self._df(self._jreader.load(path)) 178 elif path is not None: 179 if type(path) != list: /databricks/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py in __call__(self, *args) 1320 answer = self.gateway_client.send_command(command) -> 1321 return_value = get_return_value( 1322 answer, self.gateway_client, self.target_id, self.name) /databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw) 194 def deco(*a: Any, **kw: Any) -> Any: 195 try: --> 196 return f(*a, **kw) 197 except Py4JJavaError as e: 198 converted = convert_exception(e.java_exception) /databricks/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name) 324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client) 325 if answer[1] == REFERENCE_TYPE: --> 326 raise Py4JJavaError( 327 "An error occurred while calling {0}{1}{2}.\n". 328 format(target_id, ".", name), value) Py4JJavaError: An error occurred while calling o918.load. 
: java.lang.NoClassDefFoundError: shadeio/poi/schemas/vmldrawing/XmlDocument at shadeio.poi.xssf.usermodel.XSSFVMLDrawing.read(XSSFVMLDrawing.java:135) at shadeio.poi.xssf.usermodel.XSSFVMLDrawing.<init>(XSSFVMLDrawing.java:123) at shadeio.poi.ooxml.POIXMLFactory.createDocumentPart(POIXMLFactory.java:61) at shadeio.poi.ooxml.POIXMLDocumentPart.read(POIXMLDocumentPart.java:661) at shadeio.poi.ooxml.POIXMLDocumentPart.read(POIXMLDocumentPart.java:678) at shadeio.poi.ooxml.POIXMLDocument.load(POIXMLDocument.java:165) at shadeio.poi.xssf.usermodel.XSSFWorkbook.<init>(XSSFWorkbook.java:274) at shadeio.poi.xssf.usermodel.XSSFWorkbookFactory.createWorkbook(XSSFWorkbookFactory.java:118) at shadeio.poi.xssf.usermodel.XSSFWorkbookFactory.create(XSSFWorkbookFactory.java:98) at shadeio.poi.xssf.usermodel.XSSFWorkbookFactory.create(XSSFWorkbookFactory.java:36) at shadeio.poi.ss.usermodel.WorkbookFactory.lambda$create$2(WorkbookFactory.java:224) at shadeio.poi.ss.usermodel.WorkbookFactory.wp(WorkbookFactory.java:329) at shadeio.poi.ss.usermodel.WorkbookFactory.create(WorkbookFactory.java:224) at shadeio.poi.ss.usermodel.WorkbookFactory.create(WorkbookFactory.java:185) at com.crealytics.spark.excel.DefaultWorkbookReader.$anonfun$openWorkbook$1(WorkbookReader.scala:55) at scala.Option.fold(Option.scala:251) at com.crealytics.spark.excel.DefaultWorkbookReader.openWorkbook(WorkbookReader.scala:55) at com.crealytics.spark.excel.WorkbookReader.withWorkbook(WorkbookReader.scala:16) at com.crealytics.spark.excel.WorkbookReader.withWorkbook$(WorkbookReader.scala:15) at com.crealytics.spark.excel.DefaultWorkbookReader.withWorkbook(WorkbookReader.scala:50) at com.crealytics.spark.excel.ExcelRelation.excerpt$lzycompute(ExcelRelation.scala:32) at com.crealytics.spark.excel.ExcelRelation.excerpt(ExcelRelation.scala:32) at com.crealytics.spark.excel.ExcelRelation.headerColumns$lzycompute(ExcelRelation.scala:104) at com.crealytics.spark.excel.ExcelRelation.headerColumns(ExcelRelation.scala:103) 
at com.crealytics.spark.excel.ExcelRelation.$anonfun$inferSchema$1(ExcelRelation.scala:172) at scala.Option.getOrElse(Option.scala:189) at com.crealytics.spark.excel.ExcelRelation.inferSchema(ExcelRelation.scala:171) at com.crealytics.spark.excel.ExcelRelation.<init>(ExcelRelation.scala:36) at com.crealytics.spark.excel.DefaultSource.createRelation(DefaultSource.scala:36) at com.crealytics.spark.excel.DefaultSource.createRelation(DefaultSource.scala:13) at com.crealytics.spark.excel.DefaultSource.createRelation(DefaultSource.scala:8) at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:385) at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:368) at org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:324) at scala.Option.getOrElse(Option.scala:189) at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:324) at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:237) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:498) at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380) at py4j.Gateway.invoke(Gateway.java:306) at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) at py4j.commands.CallCommand.execute(CallCommand.java:79) at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:195) at py4j.ClientServerConnection.run(ClientServerConnection.java:115) at java.lang.Thread.run(Thread.java:750)

I installed "com.crealytics:spark-excel_2.12:3.2.1_0.16.4" on the cluster following this thread, but I am still getting the error.

Environment:

  • Single node cluster
  • 11.2 ML (includes Apache Spark 3.3.0, Scala 2.12)

Any idea to solve this issue?

@davide.anghileri

Thanks a lot. It worked.

I tried to read another Excel file (with several sheets & multi-row header), and this time I get the error:

org.apache.poi.ooxml.POIXMLException: Strict OOXML isn't currently supported, please see bug #57699
---------------------------------------------------------------------------
Py4JJavaError                             Traceback (most recent call last)
<command-496434324351845> in <cell line: 4>()
      3 # Read excel file
----> 4 sampleDF_xlsx = (spark.read.format("com.crealytics.spark.excel")
      5   .option("Header", "true")
      6   .option("inferSchema", "false")
/databricks/spark/python/pyspark/instrumentation_utils.py in wrapper(*args, **kwargs)
     46             start = time.perf_counter()
     47             try:
---> 48                 res = func(*args, **kwargs)
     49                 logger.log_success(
     50                     module_name, class_name, function_name, time.perf_counter() - start, signature
/databricks/spark/python/pyspark/sql/readwriter.py in load(self, path, format, schema, **options)
    175         self.options(**options)
    176         if isinstance(path, str):
--> 177             return self._df(self._jreader.load(path))
    178         elif path is not None:
    179             if type(path) != list:
/databricks/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py in __call__(self, *args)
   1320         answer = self.gateway_client.send_command(command)
-> 1321         return_value = get_return_value(
   1322             answer, self.gateway_client, self.target_id, self.name)
/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
    194     def deco(*a: Any, **kw: Any) -> Any:
    195         try:
--> 196             return f(*a, **kw)
    197         except Py4JJavaError as e:
    198             converted = convert_exception(e.java_exception)
/databricks/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
    324             value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
    325             if answer[1] == REFERENCE_TYPE:
--> 326                 raise Py4JJavaError(
    327                     "An error occurred while calling {0}{1}{2}.\n".
    328                     format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling o634.load.
: org.apache.poi.ooxml.POIXMLException: Strict OOXML isn't currently supported, please see bug #57699
	at org.apache.poi.ooxml.POIXMLDocumentPart.getPartFromOPCPackage(POIXMLDocumentPart.java:757)
	at org.apache.poi.ooxml.POIXMLDocumentPart.<init>(POIXMLDocumentPart.java:151)
	at org.apache.poi.ooxml.POIXMLDocumentPart.<init>(POIXMLDocumentPart.java:141)
	at org.apache.poi.ooxml.POIXMLDocument.<init>(POIXMLDocument.java:60)
	at org.apache.poi.xssf.usermodel.XSSFWorkbook.<init>(XSSFWorkbook.java:254)
	at org.apache.poi.xssf.usermodel.XSSFWorkbookFactory.createWorkbook(XSSFWorkbookFactory.java:118)
	at org.apache.poi.xssf.usermodel.XSSFWorkbookFactory.create(XSSFWorkbookFactory.java:98)
	at org.apache.poi.xssf.usermodel.XSSFWorkbookFactory.create(XSSFWorkbookFactory.java:36)
	at org.apache.poi.ss.usermodel.WorkbookFactory.lambda$create$2(WorkbookFactory.java:224)
	at org.apache.poi.ss.usermodel.WorkbookFactory.wp(WorkbookFactory.java:329)
	at org.apache.poi.ss.usermodel.WorkbookFactory.create(WorkbookFactory.java:224)
	at org.apache.poi.ss.usermodel.WorkbookFactory.create(WorkbookFactory.java:185)
	at com.crealytics.spark.excel.DefaultWorkbookReader.$anonfun$openWorkbook$3(WorkbookReader.scala:107)
	at scala.Option.fold(Option.scala:251)
	at com.crealytics.spark.excel.DefaultWorkbookReader.openWorkbook(WorkbookReader.scala:107)
	at com.crealytics.spark.excel.WorkbookReader.withWorkbook(WorkbookReader.scala:34)
	at com.crealytics.spark.excel.WorkbookReader.withWorkbook$(WorkbookReader.scala:33)
	at com.crealytics.spark.excel.DefaultWorkbookReader.withWorkbook(WorkbookReader.scala:92)
	at com.crealytics.spark.excel.ExcelRelation.excerpt$lzycompute(ExcelRelation.scala:48)
	at com.crealytics.spark.excel.ExcelRelation.excerpt(ExcelRelation.scala:48)
	at com.crealytics.spark.excel.ExcelRelation.headerColumns$lzycompute(ExcelRelation.scala:121)
	at com.crealytics.spark.excel.ExcelRelation.headerColumns(ExcelRelation.scala:120)
	at com.crealytics.spark.excel.ExcelRelation.$anonfun$inferSchema$1(ExcelRelation.scala:189)
	at scala.Option.getOrElse(Option.scala:189)
	at com.crealytics.spark.excel.ExcelRelation.inferSchema(ExcelRelation.scala:188)
	at com.crealytics.spark.excel.ExcelRelation.<init>(ExcelRelation.scala:52)
	at com.crealytics.spark.excel.DefaultSource.createRelation(DefaultSource.scala:52)
	at com.crealytics.spark.excel.DefaultSource.createRelation(DefaultSource.scala:29)
	at com.crealytics.spark.excel.DefaultSource.createRelation(DefaultSource.scala:24)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:385)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:368)
	at org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:324)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:324)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:237)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380)
	at py4j.Gateway.invoke(Gateway.java:306)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:195)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:115)
	at java.lang.Thread.run(Thread.java:750)
						

For this dataset, I also tried reading the file as binary, as shown below:

# Read the workbook as raw bytes with Spark's binaryFile source, then hand the
# bytes to pandas/openpyxl. NOTE: the original snippet never closed the
# parenthesized reader chain, so the `excel_content` assignment was trapped
# inside the parentheses — a syntax error. Fixed by closing the chain first.
xldf_xlsx = (
    spark.read.format("binaryFile")
    .option("pathGlobFilter", "*.xls*")  # match .xls, .xlsx, .xlsm, ...
    .load(filepath_xlsx)
)
# binaryFile yields one row per file; `content` holds the file's raw bytes.
excel_content = xldf_xlsx.head(1)[0].content
# Wrap the bytes in an in-memory buffer so pandas can treat it like a file.
file_like_obj = io.BytesIO(excel_content)
xl = pd.ExcelFile(file_like_obj, engine="openpyxl")

And the last line gives the error:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<command-496434324351842> in <cell line: 3>()
      1 excel_content = xldf_xlsx.head(1)[0].content
      2 file_like_obj = io.BytesIO(excel_content)
----> 3 xl = pd.ExcelFile(file_like_obj, engine="openpyxl")
/databricks/python/lib/python3.9/site-packages/pandas/io/excel/_base.py in __init__(self, path_or_buffer, engine, storage_options)
   1231         self.storage_options = storage_options
-> 1233         self._reader = self._engines[engine](self._io, storage_options=storage_options)
   1235     def __fspath__(self):
/databricks/python/lib/python3.9/site-packages/pandas/io/excel/_openpyxl.py in __init__(self, filepath_or_buffer, storage_options)
    520         """
    521         import_optional_dependency("openpyxl")
--> 522         super().__init__(filepath_or_buffer, storage_options=storage_options)
    524     @property
/databricks/python/lib/python3.9/site-packages/pandas/io/excel/_base.py in __init__(self, filepath_or_buffer, storage_options)
    418             self.handles.handle.seek(0)
    419             try:
--> 420                 self.book = self.load_workbook(self.handles.handle)
    421             except Exception:
    422                 self.close()
/databricks/python/lib/python3.9/site-packages/pandas/io/excel/_openpyxl.py in load_workbook(self, filepath_or_buffer)
    531         from openpyxl import load_workbook
--> 533         return load_workbook(
    534             filepath_or_buffer, read_only=True, data_only=True, keep_links=False
    535         )
/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/openpyxl/reader/excel.py in load_workbook(filename, read_only, keep_vba, data_only, keep_links)
    315     reader = ExcelReader(filename, read_only, keep_vba,
    316                         data_only, keep_links)
--> 317     reader.read()
    318     return reader.wb
/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/openpyxl/reader/excel.py in read(self)
    276         self.read_manifest()
    277         self.read_strings()
--> 278         self.read_workbook()
    279         self.read_properties()
    280         self.read_theme()
/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/openpyxl/reader/excel.py in read_workbook(self)
    148         wb_part = _find_workbook_part(self.package)
    149         self.parser = WorkbookParser(self.archive, wb_part.PartName[1:], keep_links=self.keep_links)
--> 150         self.parser.parse()
    151         wb = self.parser.wb
    152         wb._sheets = []
/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/openpyxl/reader/workbook.py in parse(self)
     47         src = self.archive.read(self.workbook_part_name)
     48         node = fromstring(src)
---> 49         package = WorkbookPackage.from_tree(node)
     50         if package.properties.date1904:
     51             self.wb.epoch = CALENDAR_MAC_1904
/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/openpyxl/descriptors/serialisable.py in from_tree(cls, node)
     81             if hasattr(desc, 'from_tree'):
     82                 #descriptor manages conversion
---> 83                 obj = desc.from_tree(el)
     84             else:
     85                 if hasattr(desc.expected_type, "from_tree"):
/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/openpyxl/descriptors/sequence.py in from_tree(self, node)
     84     def from_tree(self, node):
---> 85         return [self.expected_type.from_tree(el) for el in node]
/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/openpyxl/descriptors/sequence.py in <listcomp>(.0)
     84     def from_tree(self, node):
---> 85         return [self.expected_type.from_tree(el) for el in node]
/local_disk0/.ephemeral_nfs/cluster_libraries/python/lib/python3.9/site-packages/openpyxl/descriptors/serialisable.py in from_tree(cls, node)
    101                 attrib[tag] = obj
--> 103         return cls(**attrib)
TypeError: __init__() missing 1 required positional argument: 'id'

If there isn’t a group near you, start one and help create a community that brings people together. Request a New Group Terraform Destroy not able to prune Databricks Provisioned GKE Cluster on GCP in Administration & Architecture [INSUFFICIENT_PERMISSIONS] Insufficient privileges: in Data Engineering Duplicates in CSV Export to ADLS in Data Engineering © Databricks 2024. All rights reserved. Apache, Apache Spark, Spark and the Spark logo are trademarks of the Apache Software Foundation.

  • Privacy Notice
  • Terms of Use
  • Your Privacy Choices
  • Your California Privacy Rights
  •