I have installed Apache Spark, Java, and Python, and I am working in a Jupyter notebook. I get an error when creating a DataFrame.
I write:
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import findspark
findspark.init()
import pyspark
spark = (SparkSession
.builder
.appName("AuthorsAges")
.getOrCreate())
This runs without any errors. But then, when I write:
data_df = spark.createDataFrame([("Brooke", 20), ("Denny", 31), ("Jules", 30), ("TD", 35), ("Brooke", 25)], ["name", "age"])
I get this error:
---------------------------------------------------------------------------
Py4JError Traceback (most recent call last)
Cell In[5], line 2
1 #Create a DataFrame
----> 2 data_df = spark.createDataFrame([("Brooke", 20), ("Denny", 31), ("Jules", 30), ("TD", 35), ("Brooke", 25)], ["name", "age"])
File C:\anaconda\lib\site-packages\pyspark\sql\session.py:1276, in SparkSession.createDataFrame(self, data, schema, samplingRatio, verifySchema)
1271 if has_pandas and isinstance(data, pd.DataFrame):
1272 # Create a DataFrame from pandas DataFrame.
1273 return super(SparkSession, self).createDataFrame( # type: ignore[call-overload]
1274 data, schema, samplingRatio, verifySchema
1275 )
-> 1276 return self._create_dataframe(
1277 data, schema, samplingRatio, verifySchema # type: ignore[arg-type]
1278 )
File C:\anaconda\lib\site-packages\pyspark\sql\session.py:1318, in SparkSession._create_dataframe(self, data, schema, samplingRatio, verifySchema)
1316 rdd, struct = self._createFromRDD(data.map(prepare), schema, samplingRatio)
1317 else:
-> 1318 rdd, struct = self._createFromLocal(map(prepare, data), schema)
1319 assert self._jvm is not None
1320 jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd())
File C:\anaconda\lib\site-packages\pyspark\sql\session.py:962, in SparkSession._createFromLocal(self, data, schema)
959 data = list(data)
961 if schema is None or isinstance(schema, (list, tuple)):
--> 962 struct = self._inferSchemaFromList(data, names=schema)
963 converter = _create_converter(struct)
964 tupled_data: Iterable[Tuple] = map(converter, data)
File C:\anaconda\lib\site-packages\pyspark\sql\session.py:834, in SparkSession._inferSchemaFromList(self, data, names)
832 raise ValueError("can not infer schema from empty dataset")
833 infer_dict_as_struct = self._jconf.inferDictAsStruct()
--> 834 infer_array_from_first_element = self._jconf.legacyInferArrayTypeFromFirstElement()
835 prefer_timestamp_ntz = is_timestamp_ntz_preferred()
836 schema = reduce(
837 _merge_type,
838 (
(...)
847 ),
848 )
File C:\anaconda\lib\site-packages\py4j\java_gateway.py:1322, in JavaMember.__call__(self, *args)
1316 command = proto.CALL_COMMAND_NAME +\
1317 self.command_header +\
1318 args_command +\
1319 proto.END_COMMAND_PART
1321 answer = self.gateway_client.send_command(command)
-> 1322 return_value = get_return_value(
1323 answer, self.gateway_client, self.target_id, self.name)
1325 for temp_arg in temp_args:
1326 if hasattr(temp_arg, "_detach"):
File C:\anaconda\lib\site-packages\pyspark\errors\exceptions\captured.py:169, in capture_sql_exception.<locals>.deco(*a, **kw)
167 def deco(*a: Any, **kw: Any) -> Any:
168 try:
--> 169 return f(*a, **kw)
170 except Py4JJavaError as e:
171 converted = convert_exception(e.java_exception)
File C:\anaconda\lib\site-packages\py4j\protocol.py:330, in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
329 else:
--> 330 raise Py4JError(
331 "An error occurred while calling {0}{1}{2}. Trace:\n{3}\n".
332 format(target_id, ".", name, value))
333 else:
334 raise Py4JError(
335 "An error occurred while calling {0}{1}{2}".
336 format(target_id, ".", name))
Py4JError: An error occurred while calling o28.legacyInferArrayTypeFromFirstElement. Trace:
py4j.Py4JException: Method legacyInferArrayTypeFromFirstElement([]) does not exist
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)
at py4j.Gateway.invoke(Gateway.java:274)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.base/java.lang.Thread.run(Thread.java:1623)
How do I solve this Py4JError when using spark.createDataFrame?
It looks like you have mismatched Spark versions installed.
Your PySpark code tries to call the legacyInferArrayTypeFromFirstElement method of the underlying SQLConf object, a method that was only introduced in Spark 3.4.0. Since your error says that this method does not exist, I would think that the underlying Spark installation (the one findspark points your session at) is not on version 3.4.0, even though the pyspark package you import is. How exactly this happens depends on how you installed Spark, so it is hard to say precisely. Verify which version of PySpark you are using (it should be 3.4.0) and which version of Spark the JVM/executors actually start up with, and make sure the two match.
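A quick way to check this from the same notebook is to print the version on both sides of the Py4J bridge. This is only a diagnostic sketch; the appName is a placeholder and the values printed depend entirely on your own installation:

import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# Version of the pip-installed pyspark package (the Python side)
print("pyspark package:", pyspark.__version__)

# Spark installation that findspark put on the path (SPARK_HOME)
print("SPARK_HOME in use:", findspark.find())

# Version of the JVM-side Spark that the session actually starts
spark = SparkSession.builder.appName("VersionCheck").getOrCreate()
print("Spark JVM version:", spark.version)

If pyspark.__version__ reports 3.4.x but spark.version reports something older, that mismatch is exactly what produces the "Method ... does not exist" Py4JError. Either upgrade the standalone Spark installation that SPARK_HOME points to, or pin the pip package to the same release, for example pip install pyspark==3.3.2 if your Spark installation reports 3.3.2 (substitute whatever version your install actually shows).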