Commit 8d26273e by chenyuanjie

利润率每日更新

parent 72393bc8
...@@ -7,29 +7,27 @@ from utils.spark_util import SparkUtil ...@@ -7,29 +7,27 @@ from utils.spark_util import SparkUtil
from utils.es_util import EsUtils from utils.es_util import EsUtils
from pyspark.sql import functions as F from pyspark.sql import functions as F
from utils.common_util import CommonUtil from utils.common_util import CommonUtil
from datetime import datetime, timedelta
class EsAsinProfitRate(object):
    """Daily incremental sync of ASIN profit-rate data into Elasticsearch.

    Loads profit-rate rows updated since the previous day and writes them to a
    dedicated `*_profit_rate_extra_v2` index, then back-fills the nested
    profit fields into the monthly `st_detail` history indexes.
    """

    def __init__(self, site_name, date_info):
        """
        :param site_name: site identifier (e.g. "us"); prefixes index and table names.
        :param date_info: run date, "YYYY-MM-DD".
        """
        self.site_name = site_name
        self.date_info = date_info
        # Incremental window starts one day before the run date.
        self.last_date_info = (datetime.strptime(date_info, "%Y-%m-%d").date() - timedelta(days=1)).strftime("%Y-%m-%d")
        self.spark = SparkUtil.get_spark_session(f"{self.__class__.__name__}")
        # ES-related configuration
        self.es_client = EsUtils.get_es_client()
        self.es_profit_rate_index = f"{self.site_name}_profit_rate_extra_v2"
        self.es_profit_rate_body = self.get_es_profit_rate_body()
        self.es_profit_rate_options = self.get_es_profit_rate_options(self.es_profit_rate_index)
        # Placeholder DataFrame; replaced by the real data in save_profit_rate_add().
        # (f-prefix removed: the query is a constant with no placeholders)
        self.df_asin_profit_rate = self.spark.sql("select 1+1;")
@staticmethod @staticmethod
def get_es_body(): def get_es_profit_rate_body():
return { return {
"settings": { "settings": {
"number_of_shards": "3", "number_of_shards": "3",
...@@ -51,13 +49,17 @@ class EsAsinProfitRate(object): ...@@ -51,13 +49,17 @@ class EsAsinProfitRate(object):
}, },
"air_profit": { "air_profit": {
"type": "float" "type": "float"
},
"update_time": {
"type": "date",
"format": "yyyy-MM-dd"
} }
} }
} }
} }
@staticmethod @staticmethod
def get_es_options(index_name): def get_es_profit_rate_options(index_name):
return { return {
"es.nodes": EsUtils.__es_ip__, "es.nodes": EsUtils.__es_ip__,
"es.port": EsUtils.__es_port__, "es.port": EsUtils.__es_port__,
...@@ -74,173 +76,121 @@ class EsAsinProfitRate(object): ...@@ -74,173 +76,121 @@ class EsAsinProfitRate(object):
} }
def run(self): def run(self):
self.read_data() self.save_profit_rate_add()
self.es_save() self.update_history_index()
self.create_enrich_policy()
self.create_enrich_pipeline()
def read_data(self): def save_profit_rate_add(self):
# 读取利润率整合数据(增量数据)
sql = f""" sql = f"""
select asin, price, ocean_profit, air_profit, package_length, package_width, package_height, weight select asin, price, ocean_profit, air_profit, updated_time from dim_asin_profit_rate_info
from dim_asin_profit_rate_info where site_name = '{self.site_name}' where site_name = '{self.site_name}' and updated_time >= '{self.last_date_info}'
""" """
self.df_asin_profit_rate = self.spark.sql(sqlQuery=sql).repartition(40, 'asin') self.df_asin_profit_rate = self.spark.sql(sqlQuery=sql).repartition(40, 'asin')
self.df_asin_profit_rate = self.df_asin_profit_rate.withColumn(
sql = f"""
select asin, package_length, package_width, package_height, weight
from dim_keepa_asin_info where site_name = '{self.site_name}'
"""
self.df_keepa_asin = self.spark.sql(sqlQuery=sql).repartition(40, 'asin')
# 因为 dim_asin_profit_rate_info 存在重复计算利润率的情况,保留与keepa最新数据所对应的数据行
self.df_asin_profit_rate = self.df_asin_profit_rate.join(
self.df_keepa_asin, on=['asin', 'package_length', 'package_width', 'package_height', 'weight'], how='inner'
).select(
'asin', 'price', 'ocean_profit', 'air_profit'
).withColumn(
'profit_key', F.concat_ws("_", F.col("asin"), F.col("price")) 'profit_key', F.concat_ws("_", F.col("asin"), F.col("price"))
).withColumn(
"update_time",
F.when(
F.col("updated_time").isNotNull(),
F.substring(F.col("updated_time"), 1, 10)
).otherwise(F.lit("1970-01-01"))
).select(
'profit_key', 'asin', 'price', 'ocean_profit', 'air_profit', 'update_time'
).cache() ).cache()
print(f"增量利润率数据如下:")
self.df_asin_profit_rate.show(10, False)
def es_save(self): print(f"创建利润率索引:{self.es_profit_rate_index}!")
print(f"创建富集索引:{self.es_index}!") EsUtils.create_index(self.es_profit_rate_index, self.es_client, self.es_profit_rate_body)
EsUtils.create_index(self.es_index, self.es_client, self.es_body)
try: try:
self.df_asin_profit_rate.write.format("org.elasticsearch.spark.sql") \ # self.df_asin_profit_rate.write.format("org.elasticsearch.spark.sql") \
.options(**self.es_options) \ # .options(**self.es_profit_rate_options) \
.mode("append") \ # .mode("append") \
.save() # .save()
print(f"ES {self.es_index} 索引更新完毕!") print(f"ES {self.es_profit_rate_index} 索引更新完毕!")
except Exception as e: except Exception as e:
print("An error occurred while writing to Elasticsearch:", str(e)) print("An error occurred while writing to Elasticsearch:", str(e))
CommonUtil.send_wx_msg(['chenyuanjie'], '\u26A0 ES数据更新失败', f'失败索引:{self.es_index}') CommonUtil.send_wx_msg(['chenyuanjie'], '\u26A0 ES数据更新失败', f'失败索引:{self.es_profit_rate_index}')
def create_enrich_policy(self): def update_history_index(self):
# self.es_client.ingest.delete_pipeline(id=self.pipeline_id) """更新历史月度索引的利润率数据"""
# self.es_client.enrich.delete_policy(name=self.policy_name) # 从 2025-05 开始,遍历到最新索引
# print(f"创建富集策略:{self.policy_name}!") start_date = datetime(2025, 5, 1)
# policy_body = { current_date = start_date
# "match": {
# "indices": f"{self.es_index}", while True:
# "match_field": "profit_key", year = current_date.year
# "enrich_fields": ["ocean_profit", "air_profit"] month = current_date.month
# } month_str = f"{year}-{month:02d}"
# } index_name = f"{self.site_name}_st_detail_month_{year}_{month:02d}"
# self.es_client.enrich.put_policy(name=self.policy_name, body=policy_body)
# 检查索引是否存在
print(f"刷新富集策略:{self.profit_rate_policy}!") if not self.es_client.indices.exists(index=index_name):
self.es_client.enrich.execute_policy(self.profit_rate_policy, request_timeout=1800) print(f"索引 {index_name} 不存在,停止遍历")
break
def create_enrich_pipeline(self):
print(f"创建富集管道:{self.pipeline_id}!") print(f"\n{'='*60}")
pipeline_body = { print(f"开始处理索引: {index_name}")
"description": "asin profit_rate and user_mask pipeline", print(f"{'='*60}")
"processors": [
{ try:
"enrich": { self.update_single_history_index(index_name, month_str)
"policy_name": self.profit_rate_policy, except Exception as e:
"field": "profit_key", print(f"更新索引 {index_name} 失败: {str(e)}")
"target_field": "profit_rate_extra",
"max_matches": 1, # 移动到下个月
"ignore_missing": True if month == 12:
}, current_date = datetime(year + 1, 1, 1)
}, else:
{ current_date = datetime(year, month + 1, 1)
"enrich": {
"policy_name": f"{self.user_mask_asin_policy}", def update_single_history_index(self, index_name, month_str):
"field": "asin", """更新单个历史索引"""
"target_field": "policy_add_1", hive_sql = f"""
"max_matches": 1, select asin, asin_price as price from dwt_flow_asin where site_name = '{self.site_name}' and date_type = 'month'
"ignore_missing": True and date_info = '{month_str}' and asin_price is not null
}, """
}, df_hive = self.spark.sql(hive_sql)
{
"enrich": {
"policy_name": f"{self.user_mask_category_policy}",
"field": "category_id",
"target_field": "policy_add_2",
"max_matches": 1,
"ignore_missing": True
},
},
{
"set": {
"field": "usr_mask_type",
"value": "{{policy_add_1.usr_mask_type}}",
"ignore_empty_value": True
}
},
{
"set": {
"field": "usr_mask_progress",
"value": "{{policy_add_1.usr_mask_progress}}",
"ignore_empty_value": True
}
},
{
"set": {
"field": "package_quantity",
"value": "{{policy_add_1.package_quantity}}",
"ignore_empty_value": True
}
},
{
"set": {
"field": "usr_mask_type",
"value": "{{policy_add_2.usr_mask_type}}",
"ignore_empty_value": True
}
},
{
"remove": {
"field": "policy_add_1",
"ignore_missing": True
}
},
{
"remove": {
"field": "policy_add_2",
"ignore_missing": True
}
},
{
"convert": {
"field": "package_quantity",
"type": "integer",
"ignore_missing": True
}
}
]
}
self.es_client.ingest.put_pipeline(id=self.pipeline_id, body=pipeline_body) df_update = self.df_asin_profit_rate.join(
df_hive, on=['asin', 'price'], how='inner'
# 刷新ES数据,使pipeline生效 ).withColumn(
# body = { "profit_rate_extra",
# "query": { F.struct(
# "bool": { F.col("ocean_profit").alias("ocean_profit"),
# "must_not": { F.col("air_profit").alias("air_profit")
# "exists": { )
# "field": "profit_rate_extra" ).select("asin", "profit_rate_extra")
# }
# } es_options = {
# } "es.nodes": EsUtils.__es_ip__,
# } "es.port": EsUtils.__es_port__,
# } "es.net.http.auth.user": EsUtils.__es_user__,
body = { "es.net.http.auth.pass": EsUtils.__es_passwd__,
"es.mapping.id": "asin",
"es.resource": f"{index_name}/_doc",
"es.batch.write.refresh": "false",
"es.batch.size.entries": "5000",
"es.write.operation": "update",
"es.batch.write.retry.count": "3",
"es.batch.write.retry.wait": "10s",
"es.internal.es.version.ignore": "true" # 忽略版本检查
} }
self.es_client.update_by_query(
index="us_st_detail_month_2025_11", print(f"索引 {index_name} 待更新数据量: {df_update.count()}")
body=body, df_update.show(5, False)
pipeline=self.pipeline_id,
refresh=True, df_update.write.format("org.elasticsearch.spark.sql") \
wait_for_completion=False, .options(**es_options) \
request_timeout=600 .mode("append") \
) .save()
pass
print(f"索引 {index_name} 更新完毕!")
if __name__ == "__main__":
    # Robustness: fail with a clear usage message instead of an IndexError
    # when invoked without both arguments.
    if len(sys.argv) < 3:
        print("Usage: python <script> <site_name> <date_info YYYY-MM-DD>")
        sys.exit(1)
    site_name = sys.argv[1]
    date_info = sys.argv[2]
    handle_obj = EsAsinProfitRate(site_name, date_info)
    handle_obj.run()
    print("success!!!")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment