import os

from autofaiss import build_index
from pyspark.sql import SparkSession  # pylint: disable=import-outside-toplevel
from pyspark import SparkConf, SparkContext


def create_spark_session():
    """Create a SparkSession configured for distributed autofaiss indexing.

    Points every executor's Python interpreter at a pre-built autofaiss
    .pex environment so that worker nodes can run the indexing tasks.

    Returns:
        SparkSession: an active session (created or reused via getOrCreate).
    """
    # This must be a path that is available on all worker nodes.
    os.environ["PYSPARK_PYTHON"] = "/opt/module/spark/demo/py_demo/image_search/autofaiss.pex"
    spark = (
        SparkSession.builder
        .config("spark.executorEnv.PEX_ROOT", "./.pex")
        .config("spark.executor.cores", "4")
        # Make sure to increase this if you're using more cores per executor.
        .config("spark.executor.memory", "20G")
        # Fixed: "spark.num.executors" is not a recognized Spark property and
        # was silently ignored; the correct key for requesting a fixed number
        # of executors is "spark.executor.instances".
        .config("spark.executor.instances", "10")
        .config("spark.yarn.queue", "spark")
        # This should point to your master node; if using the tunnelling
        # version, keep this to localhost.
        .master("local")
        .appName("autofaiss-create-index")
        .getOrCreate()
    )
    return spark


spark = create_spark_session()

# Build the distributed FAISS index over the embeddings stored on HDFS.
# NOTE(review): the "# 16G" / "# 24G" annotations look like alternative
# values for a smaller cluster — confirm against the deployment notes.
index, index_infos = build_index(
    embeddings="hdfs://nameservice1:8020/home/image_search/parquet",
    distributed="pyspark",
    file_format="parquet",
    max_index_memory_usage="80G",  # 16G
    current_memory_available="120G",  # 24G
    temporary_indices_folder="hdfs://nameservice1:8020/home/image_search/tmp/distributed_autofaiss_indices",
    index_path="hdfs://nameservice1:8020/home/image_search/index/knn.index",
    index_infos_path="hdfs://nameservice1:8020/home/image_search/index/infos.json",
)
print("index, index_infos:", index, index_infos)