Commit 42d87c58 by wangjing
parents ce835ee7 dd03cbe7
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.spark_util import SparkUtil
from pyspark.sql.functions import col
if __name__ == '__main__':
spark = SparkUtil.get_spark_session("ABA_2023_10_12_export")
sql1 = """
select
date_info,
search_term,
st_bsr_cate_1_id_new as category_id,
market_cycle_type,
is_first_text,
is_ascending_text,
is_high_return_text,
is_search_text,
st_movie_label,
st_brand_label,
bsr_orders,
st_word_num,
st_num,
rank
from dwt_aba_st_analytics
where site_name = 'us'
and date_type = 'month'
and date_info in ('2023-10','2023-11','2023-12');
"""
df_dwt_aba_st_analytics = spark.sql(sql1).cache()
sql2 = """
select
category_id,
en_name
from dim_bsr_category_tree
where site_name = 'us'
and category_parent_id = 0;
"""
df_dim_bsr_category_tree = spark.sql(sql2).cache()
sql3 = """
select
search_term,
rank_change_rate,
rank_rate_of_change,
date_info
from dwt_aba_last_change_rate
where site_name = 'us'
and date_type = 'month'
and date_info in ('2023-10','2023-11','2023-12');
"""
df_dwt_aba_last_change_rate = spark.sql(sql3).cache()
    # Filter for search terms that meet the conditions (new, ascending, or market cycle 1/2)
df_dwt_aba_st_analytics = df_dwt_aba_st_analytics.filter(
"(is_first_text = 1) or (is_ascending_text = 1) or (market_cycle_type in (1, 2))"
)
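    # Enrich with top-level category names and the latest rank change-rate metrics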
df_save = df_dwt_aba_st_analytics.join(
df_dim_bsr_category_tree, on='category_id', how='left'
).join(
df_dwt_aba_last_change_rate, on=['date_info', 'search_term'], how='left'
)
df_save = df_save.select(
col('date_info').alias('year_month'),
col('search_term'),
col('en_name').alias('category'),
col('market_cycle_type'),
col('is_first_text'),
col('is_ascending_text'),
col('is_high_return_text'),
col('is_search_text'),
col('st_movie_label').alias('movie_label'),
col('st_brand_label').alias('brand_label'),
col('bsr_orders'),
col('st_word_num').alias('word_counts'),
col('st_num').alias('word_frequency'),
col('rank'),
col('rank_change_rate').alias('year_on_year'),
col('rank_rate_of_change').alias('month_on_month')
)
df_save.repartition(5).show(10, truncate=True)
df_save.write.saveAsTable(name='tmp_aba_2023_export', format='hive', mode='append')
spark.stop()
import os
import re
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.spark_util import SparkUtil
from pyspark.sql.functions import count, explode, split, udf, lit
from pyspark.sql.types import ArrayType, StringType
if __name__ == '__main__':
date_info = CommonUtil.get_sys_arg(1, None)
spark = SparkUtil.get_spark_session("ABA_2023_10_12_word_frequency")
    # UDF: split the phrase into groups of two consecutive words (bigrams)
    def split_two_by_two(search_term):
        words = search_term.split()
        pairs = []
        for i in range(len(words) - 1):
            pairs.append(words[i] + ' ' + words[i + 1])
        return pairs
    u_split_two_by_two = udf(split_two_by_two, ArrayType(StringType()))
    # UDF: split the phrase into groups of three consecutive words (trigrams)
def split_three_by_three(search_term):
words = search_term.split()
triplets = []
for i in range(len(words) - 2):
triplets.append(words[i] + ' ' + words[i + 1] + ' ' + words[i + 2])
return triplets
u_split_three_by_three = udf(split_three_by_three, ArrayType(StringType()))
    # UDF: strip extraneous characters from the search term
def characters_to_remove(search_term):
pattern = r'\s[^\w\s%\']+?\s'
cleaned_text = re.sub(pattern, ' ', search_term)
cleaned_text = cleaned_text.replace('\n', ' ')
return cleaned_text
u_characters_to_remove = udf(characters_to_remove, StringType())
sql = f"""
select
search_term
from dwt_aba_st_analytics
where site_name = 'us'
and date_type = 'month'
and date_info = '{date_info}';
"""
df_aba = spark.sql(sql).cache()
df_aba = df_aba.select(
u_characters_to_remove(df_aba['search_term']).alias('search_term')
)
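    # Unigram frequency: explode each search term into single words, count them, and keep words seen at least 50 times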
df_one_word = df_aba.select(
explode(split(df_aba['search_term'], ' ')).alias('word')
).groupby(
['word']
).agg(
count('word').alias('word_frequency')
).filter(
'word_frequency >= 50'
).withColumn(
'date_info',
lit(f'{date_info}-1')
)
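    # Bigram frequency: same aggregation over two-word groups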
    df_two_word = df_aba.select(
        explode(u_split_two_by_two(df_aba['search_term'])).alias('word')
).groupby(
['word']
).agg(
count('word').alias('word_frequency')
).filter(
'word_frequency >= 50'
).withColumn(
'date_info',
lit(f'{date_info}-2')
)
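    # Trigram frequency: same aggregation over three-word groups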
df_three_word = df_aba.select(
explode(u_split_three_by_three(df_aba['search_term'])).alias('word')
).groupby(
['word']
).agg(
count('word').alias('word_frequency')
).filter(
'word_frequency >= 50'
).withColumn(
'date_info',
lit(f'{date_info}-3')
)
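    # Append all three frames to the same Hive table; the date_info suffix (-1/-2/-3) records the n-gram size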
df_one_word.write.saveAsTable(name='tmp_word_frequency', format='hive', mode='append', partitionBy='date_info')
    df_two_word.write.saveAsTable(name='tmp_word_frequency', format='hive', mode='append', partitionBy='date_info')
df_three_word.write.saveAsTable(name='tmp_word_frequency', format='hive', mode='append', partitionBy='date_info')
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.spark_util import SparkUtil
from pyspark.sql.functions import count, explode, lit, desc, sum
from pyspark.sql.types import ArrayType, StringType
from textblob import Word
from googletrans import Translator
class ABA2023YearWordFrequency(object):
def __init__(self):
self.spark = SparkUtil.get_spark_session("spark_task: aba_2023_year_word_frequency")
self.df_aba_2023 = self.spark.sql(f"select 1+1;")
self.df_beside_category = self.spark.sql(f"select 1+1;")
self.df_translate = self.spark.sql(f"select 1+1;")
self.df_save = self.spark.sql(f"select 1+1;")
self.df_save1 = self.spark.sql(f"select 1+1;")
self.df_save2 = self.spark.sql(f"select 1+1;")
self.df_agg = self.spark.sql(f"select 1+1;")
        # Custom UDF registrations
self.u_get_singular_form = self.spark.udf.register('get_singular_form', self.get_singular_form, StringType())
self.u_word_tokenize = self.spark.udf.register('word_tokenize', self.word_tokenize, ArrayType(StringType()))
# self.u_word_translate = self.spark.udf.register('word_translate', self.word_translate, StringType())
@staticmethod
def get_singular_form(word: str):
"""
        Convert every word to its singular form
"""
if word:
singular_form = Word(word).lemmatize("n")
# word_object = Word(word)
# singular_form = word_object.singularize()
return singular_form
return word
@staticmethod
def word_tokenize(title: str):
"""
        Tokenizer
"""
from nltk.tokenize import word_tokenize
result = word_tokenize(title, "english")
return result
# @staticmethod
# def word_translate(word: str):
# if word:
# try:
# translator = Translator()
# result = translator.translate(word, src='en', dest='zh-cn')
# return result.text
# except Exception as e:
    # # Handle any other unexpected error
# print(f"An unexpected error occurred: {e}")
# return None
# return None
def read_data(self):
sql1 = f"""
select
search_term,
category_id
from dwt_aba_last365
where site_name = 'us'
and date_type = 'last365day'
and date_info = '2023-12';
"""
self.df_aba_2023 = self.spark.sql(sql1).cache()
print("df_aba_2023的数量:")
print(self.df_aba_2023.count())
sql2 = f"""
select
category_id
from dim_bsr_category_tree
where site_name = 'us'
and en_name in ('Audible Books & Originals', 'Books', 'Kindle Store', 'Apps & Games', 'Movies & TV', 'CDs & Vinyl', 'Software', 'Video Games')
and category_parent_id = 0;
"""
self.df_beside_category = self.spark.sql(sql2).cache()
print("df_beside_category的数量:")
print(self.df_beside_category.count())
sql3 = f"""
select
word,
simple_cn as cn
from tmp_en_dict;
"""
self.df_translate = self.spark.sql(sql3).cache()
print("df_translate的数量:")
print(self.df_translate.count())
def handle_data(self):
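        # Exclude search terms whose top-level category is in the media exclusion list (left_anti join), then tokenize and count word frequencies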
self.df_save = self.df_aba_2023.join(
self.df_beside_category, on='category_id', how='left_anti'
).select('search_term')
self.df_save = self.df_save.select(explode(self.u_word_tokenize(self.df_save['search_term'])).alias('word'))
self.df_save = self.df_save.groupby(['word']).agg(
count('word').alias('word_frequency')
)
self.df_save = self.df_save.join(
self.df_translate, on='word', how='left'
).withColumn(
'word_singular_form',
self.u_get_singular_form(self.df_save['word'])
).cache()
self.df_save1 = self.df_save.select(
'word', 'word_frequency', 'cn'
).orderBy(
desc('word_frequency')
).withColumn(
'date_info',
lit('2023')
)
print("df_save1的数量:")
print(self.df_save1.count())
self.df_save1.write.saveAsTable(name='tmp_word_frequency', format='hive', mode='append', partitionBy='date_info')
print("df_save1存储完成!")
self.df_agg = self.df_save.groupby(['word_singular_form']).agg(
sum('word_frequency').alias('word_frequency')
)
self.df_save2 = self.df_save.select('word', 'cn', 'word_singular_form').join(
self.df_agg, on='word_singular_form', how='left'
).select(
'word', 'word_frequency', 'cn'
).orderBy(
desc('word_frequency')
).withColumn(
'date_info',
lit('2023-merge')
)
print("df_save2的数量:")
print(self.df_save2.count())
self.df_save2.write.saveAsTable(name='tmp_word_frequency', format='hive', mode='append', partitionBy='date_info')
print("df_save2存储完成!")
if __name__ == '__main__':
obj = ABA2023YearWordFrequency()
obj.read_data()
obj.handle_data()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.DolphinschedulerHelper import DolphinschedulerHelper
from utils.common_util import CommonUtil
from utils.spark_util import SparkUtil
if __name__ == '__main__':
start_date = CommonUtil.get_sys_arg(1, None)
end_date = CommonUtil.get_sys_arg(2, None)
spark_session = SparkUtil.get_spark_session("re-run-aba-month")
sql = f"""
select distinct year_month as date_info from dim_date_20_to_30 where year_month >= '{start_date}' and year_month < '{end_date}';
"""
date_df = spark_session.sql(sql)
    date_df.show()
date_list = sorted([d.asDict().get("date_info") for d in date_df.collect()])
print(date_list)
for date_info in date_list:
startParams = {
"site_name": "us",
"date_type": "month",
"date_info": date_info
}
print(startParams)
        DolphinschedulerHelper.start_and_watch_process_instance(
            "big_data_selection",
            process_df_name='月-重跑ABA四分位',
            startParams=startParams
        )
CommonUtil.send_wx_msg(["huangjian", "chenyuanjie"], "【月-重跑ABA四分位】重跑完成", "")
pass
def asin_to_number(asin):
"""
Convert a 10-character ASIN string to a unique number.
This function assumes that ASIN consists of uppercase letters and digits.
"""
def char_to_number(char):
if char.isdigit():
return int(char)
else:
return ord(char) - 55 # 'A' -> 10, 'B' -> 11, ..., 'Z' -> 35
if len(asin) != 10:
raise ValueError("ASIN must be 10 characters long")
base = 36
asin_number = 0
for i, char in enumerate(reversed(asin)):
asin_number += char_to_number(char) * (base ** i)
    # Take the result modulo 1 billion so the value fits in the range 0 to 999,999,999
    return asin_number % 1000000000
if __name__ == '__main__':
x = asin_to_number('B0CGY4LZQ3')
print(x)
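    # Bucket the number into image shard tables of 10 million values each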
s = f'us_asin_image_part{int(x / 1000_0000) + 1}'
print(s)
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from utils.templates import Templates
from utils.hdfs_utils import HdfsUtils
from utils.spark_util import SparkUtil
from pyspark.sql.window import Window
from pyspark.storagelevel import StorageLevel
from pyspark.sql import functions as F
class DwtMerchantwordsStDetailMerge(Templates):
def __init__(self, site_name='us'):
super().__init__()
self.site_name = site_name
self.batch = '2024-1'
self.db_save = 'dwt_merchantwords_st_detail_merge'
self.spark = self.create_spark_object(
app_name=f"DwtMerchantwordsStDetailMerge: {self.site_name}, {self.batch}")
self.partitions_num = 15
self.partitions_by = ['site_name', 'batch']
self.df = self.spark.sql(f"select 1+1;")
self.df_save = self.spark.sql(f"select 1+1;")
hdfs_path = f"/home/{SparkUtil.DEF_USE_DB}/dwt/{self.db_save}/site_name={self.site_name}/batch={self.batch}"
print(f"清除hdfs目录中.....{hdfs_path}")
HdfsUtils.delete_hdfs_file(hdfs_path)
def read_data(self):
print("读取dwt_merchantwords_st_detail数据")
sql = f"""
select
keyword,
volume,
avg_3m,
avg_12m,
depth,
results_count,
sponsored_ads_count,
page_1_reviews,
appearance,
last_seen,
update_time,
lang,
batch as last_batch
from dwt_merchantwords_st_detail
where site_name = '{self.site_name}'
and batch in ('2023-1', '2024-1');
"""
self.df = self.spark.sql(sqlQuery=sql)
self.df = self.df.repartition(80).persist(StorageLevel.MEMORY_ONLY)
self.df.show(10, truncate=True)
def handle_data(self):
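        # Deduplicate by keyword: order each keyword's rows by batch (newest first) and keep only the first row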
window = Window.partitionBy('keyword').orderBy(
F.desc_nulls_last('last_batch')
)
self.df = self.df.withColumn("u_rank", F.row_number().over(window=window))
self.df = self.df.filter('u_rank=1').drop('u_rank')
self.df_save = self.df.withColumn(
'site_name',
F.lit(self.site_name)
).withColumn(
'batch',
F.lit(self.batch)
)
if __name__ == '__main__':
site_name = sys.argv[1]
handle_obj = DwtMerchantwordsStDetailMerge(site_name=site_name)
handle_obj.run()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.hdfs_utils import HdfsUtils
from utils.common_util import CommonUtil
from utils.templates import Templates
from pyspark.sql import functions as F
class FlowAsinLast30days(Templates):
def __init__(self):
super().__init__()
self.db_save = "tmp_flow_asin_last30days"
self.spark = self.create_spark_object(app_name="FlowAsinLast30days")
self.partitions_num = 20
self.partition_dict = {}
self.df_es = self.spark.sql(f"select 1+1;")
self.df_parent = self.spark.sql(f"select 1+1;")
self.df_joined = self.spark.sql(f"select 1+1;")
self.df_save = self.spark.sql(f"select 1+1;")
def read_data(self):
self.df_es = self.spark.read.format("org.elasticsearch.spark.sql")\
.option("es.nodes", "192.168.10.217")\
.option("es.port", "9200")\
.option("es.net.http.auth.user", "elastic")\
.option("es.net.http.auth.pass", "selection2021.+")\
.option("es.resource", "us_st_detail_last_4_week")\
.option("es.query", '{"query": {"match_all": {}}}')\
.load()
columns = ["asin", "first_category_rank", "asin_bought_month", "total_comments", "variation_num", "site_name", "account_name"]
self.df_es = self.df_es.select(columns).cache()
self.df_es.show()
sql = f"""
select
asin,
parent_asin
from
ods_asin_variat;
"""
self.df_parent = self.spark.sql(sqlQuery=sql).cache()
def handle_data(self):
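        # Left-join parent ASINs from Hive onto the ES data and flag whether each ASIN has a parent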
# self.df_parent = self.df_parent.groupby(["parent_asin"]).agg(F.count("asin").alias("variation_num"))
self.df_joined = self.df_es.join(self.df_parent, "asin", "left")
self.df_joined = self.df_joined\
.withColumn("parent_asin_is_null", F.when(F.col("parent_asin").isNull(), F.lit(1)).otherwise(F.lit(0)))\
.withColumn("parent_asin_exist", F.when(F.col("parent_asin").isNotNull(), F.lit(1)).otherwise(F.lit(0)))
def save_data(self):
self.df_save = self.df_joined
hdfs_path_asin_info = CommonUtil.build_hdfs_path(self.db_save, partition_dict=self.partition_dict)
print(f"清除hdfs目录中:{hdfs_path_asin_info}")
HdfsUtils.delete_file_in_folder(hdfs_path_asin_info)
print(f"当前存储的表名为:{self.db_save}")
self.df_save.write.saveAsTable(name=self.db_save, format='hive', mode='append')
print("success")
if __name__ == '__main__':
obj = FlowAsinLast30days()
obj.run()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.DolphinschedulerHelper import DolphinschedulerHelper
from utils.common_util import CommonUtil
if __name__ == '__main__':
date_list = ["2024-02","2024-01","2023-12","2023-11","2023-10","2023-09"]
for date_info in date_list:
startParams = {
"site_name": "us",
"date_type": "month",
"date_info": date_info
}
print(startParams)
DolphinschedulerHelper.start_and_watch_process_instance(
"big_data_selection",
process_df_name='export_dwt_flow_asin_api',
startParams=startParams
)
CommonUtil.send_wx_msg(["chenyuanjie", "wangrui4"], "【export_dwt_flow_asin_api】导出完成", "")
pass
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.spark_util import SparkUtil
from pyspark.sql.functions import row_number, lit
from pyspark.sql.window import Window
from pyspark.sql.types import StringType, ArrayType
from urllib.parse import quote
from datetime import datetime
if __name__ == '__main__':
date_info = CommonUtil.get_sys_arg(1, None)
n = CommonUtil.get_sys_arg(2, 0)
hive_tb = "dwt_merchantwords_st_detail"
export_tb = "us_merchantwords_search_term_month_syn_2024"
spark = SparkUtil.get_spark_session(f"export: {hive_tb}")
    # Export 4,000,000 rows per batch
batch_size = (int(n)-1) * 4000000
start_index = 1 + batch_size
end_index = 4000000 + batch_size
    # Build the first three Amazon search-result page URLs for a search term
    def build_urls(search_term):
        url_template = "https://www.amazon.com/s?k={search_term}&page={page_number}"
        # Percent-encode the term, then substitute the characters expected in the query string
        encoded = quote(search_term, 'utf-8').replace("'", '%27').replace("/", '%2F')
        encoded = encoded.replace(' ', '+').replace('&', '%26').replace('#', '%23').replace('(', '%28').replace(')', '%29')
        # Pages 1-3 of the search results
        return [url_template.format(search_term=encoded, page_number=page) for page in (1, 2, 3)]
    # Register the Python function as a Spark SQL UDF
spark.udf.register("build_urls", build_urls, ArrayType(StringType()))
    # Read existing data from the PostgreSQL database
# df_pg = spark.read.format("jdbc") \
# .option("url", "jdbc:postgresql://192.168.10.225:5432/selection") \
# .option("dbtable", export_tb) \
# .option("user", "yswg_postgres") \
# .option("password", "yswg_postgres") \
# .load()
# df_pg = df_pg\
# .select("search_term") \
# .drop_duplicates(["search_term"]) \
# .repartition(70) \
# .cache()
    # Read data from the Hive table
df_hive = spark.sql(f"SELECT keyword FROM {hive_tb}")
df_hive = df_hive\
.withColumn("row_num", row_number().over(Window.orderBy("keyword")))\
.filter(f"row_num BETWEEN {start_index} AND {end_index}")\
.select("keyword")\
.repartition(10) \
.cache()
    # Drop keywords that contain Chinese characters
df_hive = df_hive.filter(~df_hive["keyword"].rlike("[\u4e00-\u9fff]"))
    # Drop keywords that already exist in the target database
# df_hive = df_hive.join(df_pg, df_hive["keyword"] == df_pg["search_term"], "leftanti")
    # Exit if there is no data left to export
if df_hive.count() == 0:
print("-------数据已全部导出!-------")
quit()
df_hive = df_hive.selectExpr("keyword AS search_term")
df_hive = df_hive.selectExpr("search_term", "explode(build_urls(search_term)) AS url")
df_hive = df_hive.withColumn("date_info", lit(date_info))
    # Export the data to the PostgreSQL database
df_hive.write.format("jdbc") \
.option("url", "jdbc:postgresql://192.168.10.225:5432/selection") \
.option("dbtable", export_tb) \
.option("user", "yswg_postgres") \
.option("password", "yswg_postgres") \
.mode("append") \
.save()
spark.stop()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.spark_util import SparkUtil
from pyspark.sql.functions import row_number, lit, length
from pyspark.sql.window import Window
from pyspark.sql.types import StringType, ArrayType
from urllib.parse import quote
if __name__ == '__main__':
date_info = CommonUtil.get_sys_arg(1, None)
n = CommonUtil.get_sys_arg(2, 0)
import_tb = "search_term_result_year"
export_tb = "us_merchantwords_search_term_month_syn_2024"
spark = SparkUtil.get_spark_session("MerchantwordsSRToPG16")
    # Export 4,000,000 rows per batch
batch_size = (int(n)-1) * 4000000
start_index = 1 + batch_size
end_index = 4000000 + batch_size
    # Build the first three Amazon search-result page URLs for a search term
    def build_urls(search_term):
        url_template = "https://www.amazon.com/s?k={search_term}&page={page_number}"
        # Percent-encode the term, then substitute the characters expected in the query string
        encoded = quote(search_term, 'utf-8').replace("'", '%27').replace("/", '%2F')
        encoded = encoded.replace(' ', '+').replace('&', '%26').replace('#', '%23').replace('(', '%28').replace(')', '%29')
        # Pages 1-3 of the search results
        return [url_template.format(search_term=encoded, page_number=page) for page in (1, 2, 3)]
    # Register the Python function as a Spark SQL UDF
spark.udf.register("build_urls", build_urls, ArrayType(StringType()))
    # Read existing data from the SR (StarRocks) database
df = spark.read.format("jdbc") \
.option("url", "jdbc:mysql://192.168.10.151:19030/test") \
.option("dbtable", import_tb) \
.option("user", "chenyuanjie") \
.option("password", "chenyuanjie12345") \
.load()
df = df.withColumn(
"row_num",
row_number().over(Window.orderBy("search_term"))
).filter(f"row_num BETWEEN {start_index} AND {end_index}").repartition(20).cache()
    # Drop rows whose keyword contains Chinese characters
df = df.filter(~df["search_term"].rlike("[\u4e00-\u9fff]"))
    # Exit if there is no data left to export
if df.count() == 0:
print("-------数据已全部导出!-------")
quit()
df = df.selectExpr("search_term", "explode(build_urls(search_term)) AS url")
df = df.filter(length(df['url']) <= 450)
df = df.withColumn("date_info", lit(date_info))
    # Export the data to the PostgreSQL database
df.write.format("jdbc") \
.option("url", "jdbc:postgresql://192.168.10.225:5432/selection") \
.option("dbtable", export_tb) \
.option("user", "yswg_postgres") \
.option("password", "yswg_postgres") \
.mode("append") \
.save()
spark.stop()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.spark_util import SparkUtil
from utils.StarRocksHelper import StarRocksHelper
from utils.common_util import CommonUtil
from utils.hdfs_utils import HdfsUtils
if __name__ == '__main__':
spark = SparkUtil.get_spark_session("ods_asin_detail_sr_to_hive")
partition_dict = {
"site_name": 'us',
"date_type": 'month',
"date_info": '2024-03'
}
hdfs_path = CommonUtil.build_hdfs_path('ods_asin_detail_test', partition_dict=partition_dict)
HdfsUtils.delete_hdfs_file(hdfs_path)
connection_info = StarRocksHelper.get_connection_info('selection')
df_sr = spark.read.format("starrocks") \
.option("starrocks.fe.http.url", f"{connection_info['ip']}:{connection_info['http_port']}") \
.option("starrocks.fe.jdbc.url", f"jdbc:mysql://{connection_info['ip']}:{connection_info['jdbc_port']}") \
.option("starrocks.table.identifier", "test.ods_asin_detail_test2") \
.option("starrocks.user", connection_info['user']) \
.option("starrocks.password", connection_info['pwd']) \
.option("starrocks.request.tablet.size", "1") \
.option("starrocks.batch.size", "40960") \
.option("starrocks.exec.mem.limit", "21474836480") \
.load()
print("读取完毕")
    df_sr = df_sr.repartition(50)  # repartition returns a new DataFrame, so reassign it
partitions_by = ['site_name', 'date_type', 'date_info']
df_sr.write.saveAsTable(name='ods_asin_detail_test', format='hive', mode='append', partitionBy=partitions_by)
spark.stop()
    # Build the LZO index and repair the Hive metadata
CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb='ods_asin_detail_test')
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.spark_util import SparkUtil
from utils.StarRocksHelper import StarRocksHelper
if __name__ == '__main__':
spark = SparkUtil.get_spark_session("ods_asin_detail_to_sr_test")
sql = """
select
*
from ods_asin_detail
where site_name = 'us'
and date_type = 'month'
and date_info = '2024-03'
"""
df_hive = spark.sql(sql).repartition(40)
connection_info = StarRocksHelper.get_connection_info('selection')
df_hive.write.format("starrocks") \
.option("starrocks.fe.http.url", f"{connection_info['ip']}:{connection_info['http_port']}") \
.option("starrocks.fe.jdbc.url", f"jdbc:mysql://{connection_info['ip']}:{connection_info['jdbc_port']}") \
.option("starrocks.table.identifier", "test.ods_asin_detail_test") \
.option("starrocks.user", connection_info['user']) \
.option("starrocks.password", connection_info['pwd']) \
.option("starrocks.write.flush.interval.ms", "10000") \
.option("starrocks.write.properties.column_separator", "~!@#$%^&*~!@#$%^&*") \
.mode("append") \
.save()
print("导出完毕")
spark.stop()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.DolphinschedulerHelper import DolphinschedulerHelper
from utils.common_util import CommonUtil
if __name__ == '__main__':
date_list = ['2022-02', '2022-03', '2022-04', '2022-05', '2022-06',
'2022-07', '2022-08', '2022-09', '2022-10', '2022-11', '2022-12']
for date_info in date_list:
print(f"当前执行的分区为:{date_info}")
success_flag = DolphinschedulerHelper.start_and_watch_process_instance(
"big_data_selection",
process_df_name='ABA品牌标签调整重跑_api',
startParams={
"site_name": "us",
"date_type": "month",
"date_info": date_info,
"wx_user": "chenyuanjie"
}
)
if success_flag:
continue
else:
CommonUtil.send_wx_msg(["chenyuanjie"], f"ABA品牌标签调整重跑_api {date_info} 执行失败")
break
CommonUtil.send_wx_msg(["chenyuanjie"], "ABA品牌标签调整重跑_api 2022年 执行结束")
pass
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.DolphinschedulerHelper import DolphinschedulerHelper
from utils.common_util import CommonUtil
if __name__ == '__main__':
date_list = ['2023-11', '2023-12']
for date_info in date_list:
print(f"当前执行的分区为:{date_info}")
success_flag = DolphinschedulerHelper.start_and_watch_process_instance(
"big_data_selection",
process_df_name='ABA品牌标签调整重跑_api',
startParams={
"site_name": "us",
"date_type": "month",
"date_info": date_info,
"wx_user": "chenyuanjie"
}
)
if success_flag:
continue
else:
CommonUtil.send_wx_msg(["chenyuanjie"], f"ABA品牌标签调整重跑_api {date_info} 执行失败")
break
CommonUtil.send_wx_msg(["chenyuanjie"], "ABA品牌标签调整重跑_api 2023年 执行结束")
pass
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.DolphinschedulerHelper import DolphinschedulerHelper
from utils.common_util import CommonUtil
if __name__ == '__main__':
date_list = ['2024-01', '2024-02', '2024-03', '2024-04',
'2024-05', '2024-06', '2024-07', '2024-08']
for date_info in date_list:
print(f"当前执行的分区为:{date_info}")
success_flag = DolphinschedulerHelper.start_and_watch_process_instance(
"big_data_selection",
process_df_name='ABA品牌标签调整重跑_api',
startParams={
"site_name": "us",
"date_type": "month",
"date_info": date_info,
"wx_user": "chenyuanjie"
}
)
if success_flag:
continue
else:
CommonUtil.send_wx_msg(["chenyuanjie"], f"ABA品牌标签调整重跑_api {date_info} 执行失败")
break
CommonUtil.send_wx_msg(["chenyuanjie"], "ABA品牌标签调整重跑_api 2024年 执行结束")
pass
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.spark_util import SparkUtil
from pyspark.sql.functions import row_number, lit
from pyspark.sql.window import Window
from pyspark.sql.types import StringType, ArrayType
from urllib.parse import quote
from datetime import datetime
if __name__ == '__main__':
date_info = CommonUtil.get_sys_arg(1, None)
year, month, day = date_info.split("-")
table = f"us_merchantwords_brand_analytics_2024_{month}_{day}"
spark = SparkUtil.get_spark_session(f"us_merchantwords_brand_analytics_2024:pg2pg,{date_info}")
df = spark.read.format("jdbc") \
.option("url", "jdbc:postgresql://113.100.143.162:5432/selection") \
.option("dbtable", table) \
.option("user", "yswg_postgres") \
.option("password", "yswg_postgres") \
.load()
df.write.format("jdbc") \
.option("url", "jdbc:postgresql://113.100.143.162:5443/selection") \
.option("dbtable", table) \
.option("user", "yswg_postgres") \
.option("password", "yswg_postgres") \
.mode("append") \
.save()
spark.stop()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.spark_util import SparkUtil
from pyspark.sql.functions import lit, col
from pyspark.sql.types import StringType, ArrayType
from urllib.parse import quote
if __name__ == '__main__':
export_tb = "de_merchantwords_search_term_month_syn_2024"
spark = SparkUtil.get_spark_session("MerchantwordsSupplement")
    # Build the first three Amazon search-result page URLs for a search term
    def build_urls(search_term):
        url_template = "https://www.amazon.de/s?k={search_term}&page={page_number}"
        # Percent-encode the term, then substitute the characters expected in the query string
        encoded = quote(search_term, 'utf-8').replace("'", '%27').replace("/", '%2F')
        encoded = encoded.replace(' ', '+').replace('&', '%26').replace('#', '%23').replace('(', '%28').replace(')', '%29')
        # Pages 1-3 of the search results
        return [url_template.format(search_term=encoded, page_number=page) for page in (1, 2, 3)]
    # Register the Python function as a Spark SQL UDF
spark.udf.register("build_urls", build_urls, ArrayType(StringType()))
sql1 = """
select
keyword,
volume,
st_monthly_sales,
greatest(results_count, asin_total_num) as asin_total_num,
st_sp_counts,
st_zr_counts
from dwt_merchantwords_merge
where site_name = 'de'
and batch = '2024-07-01'
"""
df_dwt_merchantwords_merge = spark.sql(sql1)
# sql2 = """
# select
# keyword
# from dwt_merchantwords_st_detail
# where site_name = 'de'
# and batch = '2024-1'
# """
# df_dwt_merchantwords_st_detail = spark.sql(sql2)
    # Keywords with more than 80 products but no monthly sales
df1 = df_dwt_merchantwords_merge.filter('asin_total_num > 80 and st_monthly_sales <= 0').select('keyword')
print("产品总数大于80且没有月销:" + str(df1.count()))
    # Keywords with noticeable search volume but no SP ad count
df2 = df_dwt_merchantwords_merge.filter('volume >= 1 and st_sp_counts <= 0').select('keyword')
print("搜索量较大且没有sp广告词:" + str(df2.count()))
    # Keywords whose organic (ZR) result count is <= 0
df3 = df_dwt_merchantwords_merge.filter('st_zr_counts <= 0').select('keyword')
print("自然词总数 <= 0的部分:" + str(df3.count()))
    # # Drop keywords that contain Chinese characters
# df_hive = df_hive.filter(~df_hive["keyword"].rlike("[\u4e00-\u9fff]"))
df_save = df1.union(df2).union(df3).drop_duplicates(['keyword'])
df_save = df_save.selectExpr("keyword AS search_term")
df_save = df_save.selectExpr("search_term", "explode(build_urls(search_term)) AS url")
df_save = df_save.withColumn("date_info", lit('2024-06-26'))
    # Export the data to the PostgreSQL database
df_save.write.format("jdbc") \
.option("url", "jdbc:postgresql://192.168.10.225:5433/selection_de") \
.option("dbtable", export_tb) \
.option("user", "yswg_postgres") \
.option("password", "yswg_postgres") \
.mode("append") \
.save()
spark.stop()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.spark_util import SparkUtil
from pyspark.sql.functions import lit, col
from pyspark.sql.types import StringType, ArrayType
from urllib.parse import quote
if __name__ == '__main__':
export_tb = "us_merchantwords_search_term_month_syn_2024"
spark = SparkUtil.get_spark_session("MerchantwordsSupplement")
    # Build the first three Amazon search-result page URLs for a search term
    def build_urls(search_term):
        url_template = "https://www.amazon.com/s?k={search_term}&page={page_number}"
        # Percent-encode the term, then substitute the characters expected in the query string
        encoded = quote(search_term, 'utf-8').replace("'", '%27').replace("/", '%2F')
        encoded = encoded.replace(' ', '+').replace('&', '%26').replace('#', '%23').replace('(', '%28').replace(')', '%29')
        # Pages 1-3 of the search results
        return [url_template.format(search_term=encoded, page_number=page) for page in (1, 2, 3)]
    # Register the Python function as a Spark SQL UDF
spark.udf.register("build_urls", build_urls, ArrayType(StringType()))
sql1 = """
select
keyword,
volume,
st_zr_counts,
st_sp_counts
from dwt_merchantwords_merge
where site_name = 'us'
and batch = '2024-07-01'
"""
df_dwt_merchantwords_merge = spark.sql(sql1)
    # Keywords with noticeable search volume but no SP ad count
df1 = df_dwt_merchantwords_merge.filter('volume >= 1 and st_sp_counts <= 0').select('keyword')
print("搜索量较大且没有sp广告词:" + str(df1.count()))
    # Keywords whose organic (ZR) result count is <= 0
df2 = df_dwt_merchantwords_merge.filter('st_zr_counts <= 0').select('keyword')
print("自然词总数 <= 0的部分:" + str(df2.count()))
    # # Drop keywords that contain Chinese characters
# df_hive = df_hive.filter(~df_hive["keyword"].rlike("[\u4e00-\u9fff]"))
df_save = df1.union(df2).drop_duplicates(['keyword'])
df_save = df_save.selectExpr("keyword AS search_term")
df_save = df_save.selectExpr("search_term", "explode(build_urls(search_term)) AS url")
df_save = df_save.withColumn("date_info", lit('2024-06-26'))
    # Export the data to the PostgreSQL database
df_save.write.format("jdbc") \
.option("url", "jdbc:postgresql://192.168.10.225:5433/selection") \
.option("dbtable", export_tb) \
.option("user", "yswg_postgres") \
.option("password", "yswg_postgres") \
.mode("append") \
.save()
spark.stop()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from utils.DorisHelper import DorisHelper
from utils.spark_util import SparkUtil
from pyspark.sql import functions as F
if __name__ == '__main__':
spark = SparkUtil.get_spark_session('aba_to_doris_test')
sql = f"""
select *
from dwt_aba_last365
where site_name = 'us'
and date_type = 'month'
and date_info = '2024-10';
"""
df_aba = spark.sql(sql).drop('site_name', 'date_type').cache()
df_aba = df_aba.withColumn(
'date_info', F.concat(F.regexp_replace('date_info', '-', ''), F.lit('01'))
)
df_aba.show(10, True)
columns = df_aba.columns
columns_str = ",".join(columns)
DorisHelper.spark_export_with_columns(df_aba, 'test', 'dwt_aba_last365', columns_str)
print('导出完成')
from openai import OpenAI
api_key = "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"
client = OpenAI(api_key=api_key)
completion = client.chat.completions.create(
model="gpt-4o-mini",
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{
"role": "user",
"content": "Write a haiku about recursion in programming."
}
]
)
print(completion.choices[0].message)
import requests
response = requests.post(
f"https://api.stability.ai/v2beta/stable-image/generate/ultra",
headers={
"authorization": f"sk-f2iOAkResIloOY3yE6xk2LlQbVrtQi3EczZDjA3n9ns7bmeR",
"accept": "image/*"
},
files={"none": ''},
data={
"prompt": "A little cat is in a bedroom with a bed, TV, and sofa",
"output_format": "webp",
},
)
if response.status_code == 200:
with open("./cat01.webp", 'wb') as file:
file.write(response.content)
else:
raise Exception(str(response.json()))
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.hdfs_utils import HdfsUtils
from utils.spark_util import SparkUtil
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType, StructType, StructField, BooleanType, MapType
"""
merchantwords: word frequency of tokenized search terms
"""
def is_number(str):
"""
    Check whether a string is a number
:param str:
:return:
"""
import re
return re.match(r"^-?\d+\.?\d+$", str) is not None
def word_tokenize(keyword: str):
import re
    keyword = re.sub(r'\d+\.?\d*|-|"|,|,|?|\?|/|、', '', keyword).strip()
from nltk.tokenize import word_tokenize
result = word_tokenize(keyword, "english")
    # Punctuation tokens filtered out below
filter_arr = [
" ", "\t", "\r", "\n", "(", ")", ",", ",", "[", "]", "、", "-", ":", "&", "|", "+", "``", "'", "'", "\""
]
return list(filter(lambda x: not is_number(x) and x not in filter_arr, result))
def run():
spark = SparkUtil.get_spark_session("app_name")
udf_word_tokenize = F.udf(word_tokenize, ArrayType(StringType()))
keywords_all = spark.sql("select keyword from dwt_merchantwords_st_detail where site_name='us'").cache()
df_all = keywords_all.withColumn("word", F.explode(udf_word_tokenize(F.col("keyword"))))
df_all = df_all.groupby(F.col("word")) \
.agg(F.count("word").alias("frequency")) \
.orderBy(F.col("frequency").desc()) \
.select(
F.col("word"),
F.col("frequency"),
F.lit("us").alias("site_name")
)
hive_tb = 'tmp_word_frequency'
    # Dedup: clear the existing HDFS partition before re-writing
partition_dict = {
"site_name": "us"
}
hdfs_path = CommonUtil.build_hdfs_path(hive_tb, partition_dict)
HdfsUtils.delete_hdfs_file(hdfs_path)
partition_by = list(partition_dict.keys())
print(f"当前存储的表名为:{hive_tb},分区为{partition_by}", )
df_all.write.saveAsTable(name=hive_tb, format='hive', mode='append', partitionBy=partition_by)
def word_pluralize(keyword: str):
from textblob import Word
    # singular form
    singularize = Word(keyword).singularize().string
    # plural form
    pluralize = Word(singularize).pluralize().string
result = {
"text": keyword,
"singularize": singularize,
"pluralize": pluralize,
"pluralizeFlag": keyword == pluralize,
"not_regular": keyword not in [singularize, pluralize]
}
return result
def word_stem(keyword: str):
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=False)
return stemmer.stem(keyword)
def word_test():
spark = SparkUtil.get_spark_session("word_test")
udf_word_pluralize = F.udf(word_pluralize, StructType(
[
StructField('text', StringType(), True),
StructField('singularize', StringType(), True),
StructField('pluralize', StringType(), True),
StructField('pluralizeFlag', BooleanType(), True),
StructField('not_regular', BooleanType(), True),
]
))
udf_word_stem = F.udf(word_stem, StringType())
keywords_all = spark.sql("select word,frequency from tmp_word_frequency").cache()
keywords_all = keywords_all.withColumn("resultMap", udf_word_pluralize(F.col("word"))).select(
F.col("word"),
F.col("frequency"),
F.col("resultMap").getField("singularize").alias("singularize"),
F.col("resultMap").getField("pluralize").alias("pluralize"),
F.col("resultMap").getField("pluralizeFlag").alias("pluralizeFlag"),
F.col("resultMap").getField("not_regular").alias("not_regular"),
).where("(pluralizeFlag == true) or (not_regular == true)")
    # Compute word stems
keywords_all = keywords_all.withColumn("word_stem", udf_word_stem(F.col("word")))
keywords_all = keywords_all.withColumn("singularize_stem", udf_word_stem(F.col("singularize")))
keywords_all = keywords_all.withColumn("pluralize_stem", udf_word_stem(F.col("pluralize")))
hive_tb = 'tmp_word_not_regular_v2'
keywords_all.write.saveAsTable(name=hive_tb, format='hive', mode='append')
print("success")
def word_for_download():
spark = SparkUtil.get_spark_session("word_for_calc")
keywords_all = spark.sql("""
select word
from tmp_for_market
order by volume desc
""")
CommonUtil.df_export_csv(spark, keywords_all, csv_name='word_for_calc', limit=200 * 10000)
print("success")
pass
if __name__ == '__main__':
# word_for_calc()
word_for_download()
print("success")
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from pyspark.sql.types import StringType
from utils.templates import Templates
from google.cloud import translate_v2 as translate
class Test(Templates):
def __init__(self):
super().__init__()
self.spark = self.create_spark_object(app_name=f"test")
self.df_st = self.spark.sql(f"select 1+1;")
self.translate_client = translate.Client()
        # Register custom UDF
self.u_translate_text = self.spark.udf.register('translate_text', self.translate_text, StringType())
def translate_text(self, word: str, target_language='zh'):
result = self.translate_client.translate(word, target_language=target_language)
return result['translatedText']
def read_data(self):
sql1 = f"""
select
search_term
from dwt_aba_last365
where site_name = 'us'
and date_type = 'last365day'
and date_info = '2023-12';
"""
self.df_st = self.spark.sql(sql1).limit(20).cache()
def handle_data(self):
self.df_st = self.df_st.withColumn(
'translate_text',
self.u_translate_text(self.df_st['search_term'])
)
self.df_st.show(20, False)
if __name__ == '__main__':
handle_obj = Test()
handle_obj.run()
import os
import sys
import json
sys.path.append(os.path.dirname(sys.path[0]))
from utils.db_util import DBUtil
from utils.common_util import CommonUtil
from utils.spark_util import SparkUtil
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
if __name__ == '__main__':
site_name = CommonUtil.get_sys_arg(1, None)
date_type = CommonUtil.get_sys_arg(2, None)
date_info = CommonUtil.get_sys_arg(3, None)
assert site_name is not None, "site_name 不能为空!"
assert date_type is not None, "date_type 不能为空!"
assert date_info is not None, "date_info 不能为空!"
hive_table = f"dwt_flow_asin"
partition_dict = {
"site_name": site_name,
"date_type": date_type,
"date_info": date_info
}
    # Determine the partition to validate
msg_params = ""
    # Parse partition_dict into partition filter conditions
partition_conditions = []
for key, value in partition_dict.items():
if value is not None:
msg_params += f"{value} "
partition_conditions.append(f"{key} = '{value}'")
base_msg = f"{hive_table} {msg_params} "
site_name = partition_dict.get("site_name")
date_type = partition_dict.get("date_type")
spark_session = SparkUtil.get_spark_sessionV3("check_fields_rule")
    # Load the field-verification rules maintained in the config table
config_table_query = f"""select * from hive_field_verify_config
where table_name ='{hive_table}'
and site_name = '{site_name}'
and use_flag = 1 """
conn_info = DBUtil.get_connection_info('postgresql', 'us')
check_field_df = SparkUtil.read_jdbc_query(
session=spark_session,
url=conn_info["url"],
pwd=conn_info["pwd"],
username=conn_info["username"],
query=config_table_query
)
    # Collect the verification entries
check_field_list = check_field_df.select('field_name', 'verify_desc', 'verify_type', 'config_json',
'msg_usr_list').collect()
if not check_field_list:
print("============================无验证匹配条件跳过验证===================================")
exit()
    # Create a DataFrame to store the verification results
    # Define the column schema
schema = StructType([
StructField("验证描述", StringType(), True),
StructField("验证类型", StringType(), True),
StructField("校验字段", StringType(), True),
StructField("校验条件查询占比", StringType(), True),
StructField("验证占比临界值上限", StringType(), True),
StructField("验证占比临界值下限", StringType(), True),
StructField("是否验证通过", IntegerType(), True),
])
    # Create an empty DataFrame with the schema defined above
check_df = spark_session.createDataFrame([], schema)
    # Assemble the verification SQL
query = f"""
SELECT COUNT(1) AS total_count
FROM {hive_table}
"""
    # Append the WHERE clause
if partition_conditions:
query_total = query + f" WHERE {' AND '.join(partition_conditions)}"
    # Run the SQL to get the total row count
total_df = spark_session.sql(query_total).cache()
total_count = int(total_df.collect()[0]['total_count'])
for row in check_field_list:
vertify_flag = True
field_name = row['field_name']
verify_type = row['verify_type']
config_json = json.loads(row['config_json'])
msg_usr = row['msg_usr_list']
msg_usr_list = [user.strip() for user in msg_usr.split(",")] if msg_usr else []
sql_condition = config_json['sql_condition']
partition_conf_list = config_json['partition_conf']
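        # Look for a partition config entry that matches the current site_name and date_type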
for conf in partition_conf_list:
conf_site_name = conf["site_name"]
conf_date_type = conf["date_type"]
if site_name == conf_site_name and date_type == conf_date_type:
vertify_flag = True
break
else:
vertify_flag = False
        # No matching dimension configuration
if not vertify_flag:
break
        # Append the extra query condition
if sql_condition:
query_field_check = query_total + f" AND {sql_condition} "
check_count_df = spark_session.sql(query_field_check).cache()
check_count = int(check_count_df.collect()[0]['total_count'])
calcult_rate = round((check_count / total_count), 3)
waring_max = conf['max_rate']
waring_min = conf['min_rate']
verify_flag = 1 if (calcult_rate <= waring_max) and (calcult_rate >= waring_min) else 0
ratio_df = spark_session.createDataFrame([(row['verify_desc'],verify_type,field_name,calcult_rate,waring_max,waring_min,verify_flag)],schema).repartition(1)
check_df = check_df.unionByName(ratio_df, False)
if check_df.count() < 1 :
print("无验证项验证")
exit()
check_df.show(50, truncate=False)
    # Check whether any of the verification results failed
schema_flag = bool(check_df.select(F.min("是否验证通过").alias("result")).first().asDict()['result'])
if not schema_flag:
msg = f"数据表:{hive_table} {msg_params},计算数据存在验证不通过,请检查数据是否异常!!具体信息请查看日志!!"
CommonUtil.send_wx_msg(['chenjianyun'], f"\u26A0 {hive_table} {msg_params}流程数据导出前验证异常", msg)
spark_session.stop()
pass
def word_tokenize(title: str):
"""
    Tokenizer
"""
from nltk.tokenize import word_tokenize
result = word_tokenize(title, "english")
return result
if __name__ == '__main__':
aba = "nation's bravest tales of courage and heroism"
print(word_tokenize(aba))
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.spark_util import SparkUtil
if __name__ == '__main__':
export_tb = "de_brand_analytics_month"
spark = SparkUtil.get_spark_session("update_de_brand_analytics_month_2024_05")
sql1 = """
select
search_term
from ods_st_quantity_being_sold
where site_name = 'de'
and date_type = 'month'
and date_info = '2024-05'
and quantity_being_sold in (16, 48)
"""
df_aba = spark.sql(sql1)
sql2 = """
select
search_term,
quantity_being_sold
from dwt_merchantwords_merge
where site_name = 'de'
"""
df_me = spark.sql(sql2)
df_save = df_aba.join(
df_me, on='search_term', how='inner'
)
    # Export the data to the PostgreSQL database
df_save.write.format("jdbc") \
.option("url", "jdbc:postgresql://192.168.10.223:5433/selection_de") \
.option("dbtable", export_tb) \
.option("user", "yswg_postgres") \
.option("password", "yswg_postgres") \
.mode("append") \
.save()
spark.stop()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from pyspark.sql.functions import row_number
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from utils.common_util import CommonUtil
from utils.spark_util import SparkUtil
if __name__ == '__main__':
spark = SparkUtil.get_spark_session("UpdateMerchantwords")
hive_tb = 'dwd_merchantwords_measure'
partition_dict = {
"site_name": 'us',
"batch": '2023-01'
}
sql1 = f"""
select
keyword,
lang,
st_ao_val,
st_zr_flow_proportion,
min_bid,
max_bid,
suggested_bid,
volume,
avg_3m,
avg_12m,
asin_total_num,
asin_num,
self_asin_num,
self_asin_proportion,
st_sp_counts,
st_zr_counts,
st_monthly_sales,
listing_sales_avg,
reviews_avg,
rating_avg,
price_avg,
depth
from dwd_merchantwords_measure
where site_name = 'us'
and batch = '2023-01';
"""
df_dwd = spark.sql(sqlQuery=sql1).cache()
    df_dwd = df_dwd.repartition(80)  # repartition returns a new DataFrame, so reassign it
sql2 = f"""
select
keyword,
results_count,
sponsored_ads_count,
page_1_reviews,
appearance,
last_seen,
update_time
from dwt_merchantwords_st_detail
where site_name = 'us'
and batch = '2023-1';
"""
df_merchantwords_detail = spark.sql(sqlQuery=sql2)
df_merchantwords_detail = df_merchantwords_detail \
.withColumn("row_num", row_number().over(Window.orderBy("keyword"))) \
.filter("row_num BETWEEN 1 AND 12000000") \
.repartition(80) \
.drop("row_num") \
.cache()
df = df_dwd.join(df_merchantwords_detail, 'keyword', 'left')
df = df.withColumn(
'site_name',
F.lit('us')
).withColumn(
'batch',
F.lit('2023-01')
)
CommonUtil.save_or_update_table(spark_session=spark,
hive_tb_name=hive_tb,
partition_dict=partition_dict,
df_save=df,
drop_exist_tmp_flag=True)
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.spark_util import SparkUtil
from pyspark.sql.functions import count, col
class WordFrequency(object):
def __init__(self):
self.spark = SparkUtil.get_spark_session("us_aba_last365_word_frequency")
def run(self):
sql1 = f"""
select search_term, date_info
from dwt_aba_st_analytics
where site_name = 'us'
and date_type = 'month'
and date_info in
('2024-10', '2024-09', '2024-08', '2024-07', '2024-06', '2024-05',
'2024-04', '2024-03', '2024-02', '2024-01', '2023-12', '2023-11')
and rank <= 1000000
and st_brand_label = 1;
"""
df_st = self.spark.sql(sql1).cache()
print("df_st数量是:")
print(df_st.count())
sql2 = f"""
select search_term, first_match_brand as brand, date_info
from dws_st_brand_info
where site_name = 'us'
and date_type = 'month'
and date_info in
('2024-10', '2024-09', '2024-08', '2024-07', '2024-06', '2024-05',
'2024-04', '2024-03', '2024-02', '2024-01', '2023-12', '2023-11')
and st_brand_label = 1;
"""
df_brand = self.spark.sql(sql2).cache()
print("df_brand数量是:")
print(df_brand.count())
df_save = df_st.join(
df_brand, on=['date_info', 'search_term'], how='left'
).drop('date_info')
print("df_save数量是:")
print(df_save.count())
df_save = df_save.groupby(['brand']).agg(
count('brand').alias('frequency')
).orderBy('frequency', ascending=False)
df_save.show(20, False)
df_save = df_save.withColumn("frequency", col("frequency").cast("int"))
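        # Sanity check: the summed brand frequencies should equal the number of brand-labelled search terms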
total_sum = df_save.select("frequency").groupBy().sum().collect()[0][0]
if total_sum == df_st.count():
print('验证成功')
else:
print('验证失败')
output_path = "hdfs:///user/chenyuanjie/test1/"
df_save.write.mode("overwrite").format("csv").option("delimiter", "^").option("lineSep", "\n").option("header", "false").option("compression", "none").save(output_path)
if __name__ == '__main__':
obj = WordFrequency()
obj.run()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.ssh_util import SSHUtil
from utils.common_util import CommonUtil
from utils.hdfs_utils import HdfsUtils
if __name__ == '__main__':
hive_tb = "tmp_us_st_keepa_syn_2024"
hdfs_path = "/home/big_data_selection/tmp/tmp_us_st_keepa_syn_2024"
print(f"hdfs_path is {hdfs_path}")
query = f"""
select
asin
from us_st_keepa_syn_2024
where 1 = 1
and \$CONDITIONS
"""
db_type = "postgresql"
empty_flag, check_flag = CommonUtil.check_schema_before_import(db_type=db_type,
site_name='us',
query=query,
hive_tb_name=hive_tb,
msg_usr=['chenyuanjie']
)
assert check_flag, f"导入hive表{hive_tb}表结构检查失败!请检查query是否异常!!"
if not empty_flag:
sh = CommonUtil.build_import_sh(site_name='us',
db_type=db_type,
query=query,
hdfs_path=hdfs_path)
        # Delete the HDFS path before importing
HdfsUtils.delete_hdfs_file(hdfs_path)
client = SSHUtil.get_ssh_client()
SSHUtil.exec_command_async(client, sh, ignore_err=False)
CommonUtil.after_import(hdfs_path=hdfs_path, hive_tb=hive_tb)
client.close()
pass
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.spark_util import SparkUtil
from pyspark.sql.functions import col, lit
from utils.StarRocksHelper import StarRocksHelper
if __name__ == '__main__':
spark = SparkUtil.get_spark_session("us_st_keepa_syn_2024_export")
    # Read existing data from the StarRocks database
sql = """
select distinct asin from selection.us_asin_latest_detail where date_info = '2024-06'
"""
df_sr = StarRocksHelper.spark_import_with_sql(spark, sql).repartition(80, 'asin').cache()
print("starrocks读取:")
df_sr.show(10)
sql = """
select asin from tmp_us_st_keepa_syn_2024;
"""
df_pg = spark.sql(sql).drop_duplicates(['asin']).repartition(80, 'asin').cache()
print("pg读取:")
df_pg.show(10)
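    # Keep only ASINs present in StarRocks but missing from tmp_us_st_keepa_syn_2024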
df = df_sr.subtract(df_pg)
df_sr.unpersist()
df_pg.unpersist()
df = df.withColumn(
'state',
lit(7)
).withColumn(
'asin_trun_4',
col('asin').substr(1, 4)
)
df.show(10)
print(df.count())
df.write.format("jdbc") \
.option("url", "jdbc:postgresql://192.168.10.224:5433/selection") \
.option("dbtable", "us_st_keepa_syn_2024") \
.option("user", "yswg_postgres") \
.option("password", "yswg_postgres") \
.mode("append") \
.save()
spark.stop()
import os
import sys
from sqlalchemy.dialects.postgresql import pypostgresql
sys.path.append(os.path.dirname(sys.path[0]))
from utils.spark_util import SparkUtil
from utils.db_util import DBUtil
from utils.StarRocksHelper import StarRocksHelper
if __name__ == '__main__':
spark = SparkUtil.get_spark_session("us_st_keepa_syn_2024_export")
    # Read existing data from the StarRocks database
sql = """
select distinct asin from selection.us_asin_latest_detail where date_info = '2024-06' and (asin_launch_time>'2024-07-19' or asin_launch_time<'1990-01-01')
"""
df_sr = StarRocksHelper.spark_import_with_sql(spark, sql).repartition(80, 'asin').cache()
print("starrocks读取:")
df_sr.show(10)
sql = """
select asin from tmp_us_st_keepa_syn_2024;
"""
df_pg = spark.sql(sql).drop_duplicates(['asin']).repartition(80, 'asin').cache()
print("pg读取:")
df_pg.show(10)
df = df_sr.subtract(df_pg)
print(df.count())
df_sr.unpersist()
df_pg.unpersist()
update_asin = df.select("asin").rdd.map(lambda row: row[0]).collect()
print(update_asin)
pg_engine = DBUtil.get_db_engine('postgresql', 'us')
with pg_engine.begin() as conn:
update_query = f"""
UPDATE us_st_keepa_syn_2024 SET state = 5 WHERE asin IN {tuple(update_asin)}
"""
conn.execute(update_query)
spark.stop()
import os
import sys
from pyspark.sql.types import ArrayType, FloatType, StructType, StructField, StringType
sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.spark_util import SparkUtil
from pyspark.sql import functions as F
class VerifyRank(object):
def __init__(self):
        self.spark = SparkUtil.get_spark_session(f"{self.__class__.__name__}")
def run(self):
sql = f"""
select
search_term,
rank,
date_info
from ods_brand_analytics
where site_name = 'us'
and date_type = 'week'
and date_info >= '2024-01'
and rank < 100000
"""
df_all = self.spark.sql(sql).repartition(40, 'search_term').cache()
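        # For each search term, compute the leave-one-out mean of its weekly ranks (the mean of all other weeks' ranks)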
def leave_one_out_means(structs):
ranks = [x['rank'] for x in structs]
date_infos = [x['date_info'] for x in structs]
total_sum = sum(ranks)
n = len(ranks)
if n > 1:
means = [round((total_sum - rank) / (n - 1), 2) for rank in ranks]
else:
means = [ranks[0]]
result = [{"means": mean, "date_info": date_info} for mean, date_info in zip(means, date_infos)]
return result
leave_one_out_means_udf = F.udf(leave_one_out_means, ArrayType(StructType([
StructField("means", FloatType(), True),
StructField("date_info", StringType(), True)
])))
df_agg = df_all.groupBy("search_term").agg(
F.collect_list(F.struct("rank", "date_info")).alias("collect_row")
# F.collect_list("rank").alias("values")
)
df_agg = df_agg.withColumn(
"collect_row", leave_one_out_means_udf(F.col("collect_row"))
)
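        # Derive per-term outlier bounds from the leave-one-out means via the 25%/75% quantiles and a wide IQR multiplier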
def calc_quantiles(structs):
values = [x['means'] for x in structs]
            values = sorted(values)  # sort the values within the group
            n = len(values)
            # Compute the Q1 and Q3 index positions (at the 25% and 75% marks)
            q1_index = int(n * 0.25)
            q3_index = int(n * 0.75)
if n > 1:
q1 = values[q1_index]
q3 = values[q3_index]
else:
q1 = values[0]
q3 = values[0]
return [float(q1), float(q3)]
quantile_udf = F.udf(calc_quantiles, ArrayType(FloatType()))
df_agg = df_agg.withColumn(
"quantiles", quantile_udf(F.col("collect_row"))
).withColumn(
"q1", F.col("quantiles")[0]
).withColumn(
"q3", F.col("quantiles")[1]
).withColumn(
"iqr", F.expr("q3 - q1")
).withColumn(
"lower_bound", F.expr("q1 - 100 * iqr")
).withColumn(
"upper_bound", F.expr("q3 + 100 * iqr")
).select(
'search_term', 'collect_row', 'lower_bound', 'upper_bound'
).repartition(40, 'search_term')
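        # Keep terms with at least one out-of-bounds mean, then drop any term whose outliers include 2024-08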
df_save = df_agg.withColumn(
"filtered_collect_row",
F.filter(
"collect_row",
lambda x: (x["means"] < F.col("lower_bound")) | (x["means"] > F.col("upper_bound"))
)
).filter(
F.size(F.col("filtered_collect_row")) > 0
).withColumn(
"has_2024_08",
F.exists(
"filtered_collect_row",
lambda x: x["date_info"].like("2024-08%")
)
).filter(
            ~F.col("has_2024_08")  # filter out rows that contain '2024-08'
).select(
'search_term', 'filtered_collect_row', 'lower_bound', 'upper_bound'
)
df_save.show(20, False)
if __name__ == '__main__':
site_name = CommonUtil.get_sys_arg(1, None)
date_type = CommonUtil.get_sys_arg(2, None)
date_info = CommonUtil.get_sys_arg(3, None)
obj = VerifyRank()
obj.run()
@@ -9,8 +9,6 @@ from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
# 导入udf公共方法
from yswg_utils.common_udf import udf_parse_bs_category
# from ..yswg_utils.common_udf import udf_parse_bs_category
from utils.spark_util import SparkUtil
from utils.hdfs_utils import HdfsUtils
@@ -39,7 +37,7 @@ class DimAsinAmordersInfo(Templates):
# us month, month_week, 4_week, week
# uk/de month, 4_week, week
if self.site_name in ['us', 'uk', 'de']:
if self.date_type in ['month', 'month_week']:
if self.date_type in ['month', 'month_week', 'month_aba_me']:
if (self.site_name == 'us') or (self.site_name in ['uk', 'de'] and self.date_info >= '2024-05'):
params = f"date_type='{self.date_type}' and date_info = '{self.date_info}'"
else:
@@ -69,6 +67,8 @@ class DimAsinAmordersInfo(Templates):
results_list = re.findall(pattern, str(asin_amazon_orders_str).lower())
if len(results_list) == 1:
result = int(results_list[0].replace("k", "000").replace(" ", ""))
if 'week' in asin_amazon_orders_str:
result = result * 4
else:
result = None
return result
@@ -106,7 +106,7 @@ class DimAsinDetail(object):
df_loc = df.loc[df.id == int(cur_id) - 21]
week4 = list(df_loc.year_week)[0]
complete_date_info_tuple = (week1, week2, week3, week4)
elif self.date_type in ['month', 'month_week']:
elif self.date_type in ['month', 'month_week', 'month_aba_me']:
df_loc = df.loc[(df.year_month == f"{self.date_info}") & (df.week_day == 1)]
complete_date_info_tuple = tuple(df_loc.year_week)
print("self.complete_date_info_tuple:", complete_date_info_tuple)
@@ -118,6 +118,8 @@ class DimAsinDetail(object):
date_sql = f" and date_type='{self.date_type}' and date_info = '{self.date_info}'"
elif self.date_type == DateTypes.month.name and self.date_info >= '2023-10':
date_sql = f" and date_type='{self.date_type}' and date_info = '{self.date_info}'"
elif self.date_type == 'month_aba_me':
date_sql = f" and date_type='{self.date_type}' and date_info = '{self.date_info}'"
else:
date_sql = f"and date_type='week' and date_info in {self.complete_date_info_tuple}"
elif self.site_name in ['uk', 'de']:
@@ -154,8 +156,9 @@ class DimAsinDetail(object):
five_star, low_star, together_asin, ac_name, node_id, data_type as asin_data_type, variat_list,
`describe` as asin_describe, follow_sellers as asin_follow_sellers, product_description,
image_view as asin_image_view, spider_int as asin_spider_num, buy_sales, lob_asin_json as asin_lob_info,
REGEXP_REPLACE(seller_json, chr(10), '') as seller_json, buy_box_seller_type as asin_buy_box_seller_type, customer_reviews_json, parent_asin, img_list,
created_at as created_time, updated_at as updated_time, updated_at as dt, variat_num as variation_num
REGEXP_REPLACE(seller_json, chr(10), '') as seller_json, buy_box_seller_type as asin_buy_box_seller_type,
customer_reviews_json, parent_asin, img_list, created_at as created_time, updated_at as updated_time,
updated_at as dt, variat_num as variation_num, fbm_delivery_price as asin_fbm_price
from ods_asin_detail where site_name='{self.site_name}' {self.date_sql}"""
print(sql)
self.df_asin_detail = self.spark.sql(sqlQuery=sql)
......@@ -208,7 +211,7 @@ class DimAsinDetail(object):
"category_first_name", F.lower("category_first_name")
).repartition(100).persist(StorageLevel.DISK_ONLY)
self.df_asin_category.show(10, truncate=False)
if self.date_type in ['month', 'month_week'] and self.date_info < '2024-06':
if self.date_type in ['month', 'month_week', 'month_aba_me'] and self.date_info < '2024-06':
sql = f"""
SELECT asin, parent_asin, color as asin_color, `size` as asin_size, style as asin_style,
CASE WHEN state = 1 THEN 1 WHEN state = 2 THEN 0 ElSE NULL END as asin_is_sale, updated_time
......@@ -259,7 +262,7 @@ class DimAsinDetail(object):
# handle asin variation info
def handle_asin_variation_attribute(self):
if self.date_type in ['month', 'month_week'] and self.date_info >= '2024-06':
if self.date_type in ['month', 'month_week', 'month_aba_me'] and self.date_info >= '2024-06':
print("执行新版的变体信息整合")
variat_schema = ArrayType(ArrayType(StringType()))
self.df_asin_detail = self.df_asin_detail.withColumn("variat_list_change", F.from_json(F.col("variat_list"), variat_schema))
......@@ -278,7 +281,7 @@ class DimAsinDetail(object):
self.df_asin_detail = self.df_asin_detail.join(
df_asin_with_variation, on=['asin'], how='left'
)
elif self.date_type in ['month', 'month_week'] and self.date_info < '2024-06':
elif self.date_type in ['month', 'month_week', 'month_aba_me'] and self.date_info < '2024-06':
print("执行历史数据的变体信息整合")
window = Window.partitionBy(self.df_asin_variat.asin).orderBy(
self.df_asin_variat.updated_time.desc_nulls_last())
......@@ -298,7 +301,7 @@ class DimAsinDetail(object):
# handle asin fulfillment (buy-box seller type) info
def handle_asin_buy_box_seller_type(self):
if (self.date_type in ['month', 'month_week'] and self.date_info >= '2024-05') \
if (self.date_type in ['month', 'month_week', 'month_aba_me'] and self.date_info >= '2024-05') \
or (self.date_type == '4_week' and self.date_info >= '2024-21'):
self.df_asin_detail = self.df_asin_detail.withColumn(
'seller_json',
......@@ -440,7 +443,7 @@ class DimAsinDetail(object):
# handle asin thumbnail image info
def handle_asin_img_info(self):
if self.date_type in ['month', 'month_week'] and self.date_info >= '2024-06':
if self.date_type in ['month', 'month_week', 'month_aba_me'] and self.date_info >= '2024-06':
img_schema = ArrayType(ArrayType(StringType()))
df_asin_with_img = self.df_asin_detail.withColumn("img_list", F.from_json(F.col("img_list"), img_schema)).filter(F.size("img_list") > 0).\
select("asin", F.explode("img_list").alias("img_attributes")).\
......@@ -456,7 +459,7 @@ class DimAsinDetail(object):
# handle the latest variation info under each parent_asin
def handle_latest_variation_info(self):
if self.date_type in ['month', 'month_week'] and self.date_info >= '2024-06':
if self.date_type in ['month', 'month_week', 'month_aba_me'] and self.date_info >= '2024-06':
max_report_sql = f"""
SELECT MAX(date_info) as table_date_info FROM {self.doris_db}.{self.parent_asin_latest_detail_table}
"""
......@@ -541,6 +544,7 @@ class DimAsinDetail(object):
"package_quantity", "is_package_quantity_abnormal", "asin_quantity_variation_type", "seller_json",
"asin_bought_month", "asin_length", "asin_width", "asin_height", "asin_is_self",
"customer_reviews_json", "img_list", "variat_list",
F.round("asin_fbm_price", 2).alias("asin_fbm_price"),
F.lit(self.site_name).alias('site_name'),
F.lit(self.date_type).alias('date_type'),
F.lit(self.date_info).alias('date_info')).persist(StorageLevel.MEMORY_ONLY)
......
......@@ -47,6 +47,8 @@ class DimAsinLabel(object):
date_sql = f" and date_type='{self.date_type}' and date_info = '{self.date_info}'"
elif self.date_type == DateTypes.month.name and self.date_info >= '2023-10':
date_sql = f" and date_type='{self.date_type}' and date_info = '{self.date_info}'"
elif self.date_type == 'month_aba_me':
date_sql = f" and date_type='{self.date_type}' and date_info = '{self.date_info}'"
else:
date_sql = f"and date_type='week' and date_info in {self.complete_date_info_tuple}"
elif self.site_name in ['uk', 'de']:
......
......@@ -84,7 +84,7 @@ class DimAsinLaunchtimeInfo(object):
else:
print("==================执行分区数据整合=================")
# check each partition for newly added asins
if self.date_type in (DateTypes.week.name,DateTypes.month.name,DateTypes.month_week.name):
if self.date_type in (DateTypes.week.name,DateTypes.month.name,DateTypes.month_week.name, 'month_aba_me'):
# fetch the asin data under the st (search term) dimension
sql = f"""
select
......
......@@ -189,7 +189,9 @@ class DimAsinStableInfo(Templates):
def read_data(self):
if self.site_name == 'us':
params = f" and (date_type='week' or (date_type='month' and date_info='2023-10') or (date_type in ('month_week', 'month') and date_info>='2023-11'))"
# use roughly the most recent year of detail data for the calculation
params = f" and date_type in ('month_week', 'month', 'month_aba_me') and date_info>='2025-01';"
# params = f" and (date_type='week' or (date_type='month' and date_info='2023-10') or (date_type in ('month_week', 'month') and date_info>='2023-11'))"
else:
params = f" and (date_type='week' or (date_type in ('month_week', 'month') and date_info>='2023-05'))"
......
......@@ -95,7 +95,7 @@ class DimCalAsinDetail(object):
and date_type='month' ;
"""
self.date_type = 'day_all'
elif self.date_type in (DateTypes.week.name, DateTypes.month.name, DateTypes.month_week.name):
elif self.date_type in (DateTypes.week.name, DateTypes.month.name, DateTypes.month_week.name, 'month_aba_me'):
sql = f"""select
asin,
asin_img_url,
......
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.spark_util import SparkUtil
from pyspark.sql import functions as F
from utils.common_util import CommonUtil
from utils.hdfs_utils import HdfsUtils
from utils.db_util import DBUtil
class DwdAiAsinAdd(object):
def __init__(self, site_name="us", date_type="month", date_info="2024-10"):
self.site_name = site_name
self.date_type = date_type
self.date_info = date_info
app_name = f"{self.__class__.__name__}:{site_name}:{date_type}:{date_info}"
self.spark = SparkUtil.get_spark_session(app_name)
# derive the historical date_info values
# previous month (month-on-month)
self.date_info_last_month = CommonUtil.get_month_offset(self.date_info, -1)
# same month last year (year-on-year)
self.date_info_last_year = CommonUtil.get_month_offset(self.date_info, -12)
self.df_base_asin = self.spark.sql(f"select 1+1;")
self.df_asin_last_month = self.spark.sql(f"select 1+1;")
self.df_ai_asin = self.spark.sql(f"select 1+1;")
self.df_save1 = self.spark.sql(f"select 1+1;")
self.df_save2 = self.spark.sql(f"select 1+1;")
def run(self):
self.read_data()
self.handle_data()
self.save_data()
def read_data(self):
# select the asins needed for the ASIN library from dwt_flow_asin
sql1 = f"""
select
asin,
asin_bought_month,
category_id,
asin_category_desc
from dwt_flow_asin
where site_name = '{self.site_name}'
and date_type = '{self.date_type}'
and date_info = '{self.date_info}'
and asin_type in (0, 1)
and asin_bought_month >= 50
"""
self.df_base_asin = self.spark.sql(sqlQuery=sql1).repartition(40, 'asin')
# categories that need to be filtered out
sql2 = f"""
select distinct category_id as category_id from category_full_name a
where EXISTS (
select 1 from category_disable_config b where b.id_path is not null and a.id_path like concat(b.id_path, '%') and a.site = b.site
) and a.site = '{self.site_name}'
"""
conn_info = DBUtil.get_connection_info("mysql", "us")
df_filter_category_id = SparkUtil.read_jdbc_query(
session=self.spark,
url=conn_info["url"],
pwd=conn_info["pwd"],
username=conn_info["username"],
query=sql2
)
# second pass on category names, in case any disabled category was missed
sql3 = f"""
select distinct name_path as asin_category_desc from category_disable_config where site = '{self.site_name}'
"""
df_filter_category_desc = SparkUtil.read_jdbc_query(
session=self.spark,
url=conn_info["url"],
pwd=conn_info["pwd"],
username=conn_info["username"],
query=sql3
)
# read historical flow-asin data
sql4 = f"""
select
asin,
asin_bought_month,
date_info
from dwt_flow_asin
where site_name = '{self.site_name}'
and date_type = '{self.date_type}'
and date_info in ('{self.date_info_last_month}', '{self.date_info_last_year}')
"""
df_flow_asin = self.spark.sql(sqlQuery=sql4).cache()
df_flow_asin_last_month = df_flow_asin.filter(f"date_info = '{self.date_info_last_month}'").withColumnRenamed(
'asin_bought_month', 'asin_bought_last_month'
).drop('date_info').repartition(40, 'asin')
df_flow_asin_last_year = df_flow_asin.filter(f"date_info = '{self.date_info_last_year}'").withColumnRenamed(
'asin_bought_month', 'asin_bought_last_year'
).drop('date_info').repartition(40, 'asin')
# join the historical data
self.df_base_asin = self.df_base_asin.join(
df_filter_category_id, 'category_id', 'left_anti'
).join(
df_filter_category_desc, 'asin_category_desc', 'left_anti'
).join(
df_flow_asin_last_month, 'asin', 'left'
).join(
df_flow_asin_last_year, 'asin', 'left'
).drop('category_id', 'asin_category_desc').cache()
df_flow_asin.unpersist()
print("ASIN信息库基础数据如下:")
self.df_base_asin.show(10, truncate=True)
# read last month's asins to determine which asins are newly added
sql5 = f"""
select asin, 0 as asin_is_new_flag
from dwd_ai_asin_add
where site_name = '{self.site_name}'
and date_type = '{self.date_type}'
and date_info = '{self.date_info_last_month}'
"""
self.df_asin_last_month = self.spark.sql(sqlQuery=sql5).repartition(40, 'asin')
# read the historical ASIN library data
sql6 = f"""
select asin from dim_ai_asin_base where site_name = '{self.site_name}'
"""
self.df_ai_asin = self.spark.sql(sqlQuery=sql6).repartition(40, 'asin').cache()
print(f"ASIN信息库历史数量:{self.df_ai_asin.count()}")
def handle_data(self):
# compute year-on-year and month-on-month change of monthly bought
self.df_base_asin = self.df_base_asin.withColumn(
'asin_bought_yoy',
F.when(
F.col("asin_bought_month").isNull() & F.col("asin_bought_last_year").isNull(), F.lit(None)
).when(
F.col("asin_bought_month").isNull(), F.lit(-1000.0000)
).when(
F.col("asin_bought_last_year").isNull(), F.lit(1000.0000)
).otherwise(
F.round((F.col("asin_bought_month") - F.col("asin_bought_last_year")) / F.col("asin_bought_last_year"), 4)
)
).withColumn(
'asin_bought_mom',
F.when(
F.col("asin_bought_month").isNull() & F.col("asin_bought_last_month").isNull(), F.lit(None)
).when(
F.col("asin_bought_month").isNull(), F.lit(-1000.0000)
).when(
F.col("asin_bought_last_month").isNull(), F.lit(1000.0000)
).otherwise(
F.round((F.col("asin_bought_month") - F.col("asin_bought_last_month")) / F.col("asin_bought_last_month"), 4)
)
)
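# Worked example (hypothetical values) for the change-rate logic above:
# asin_bought_month=125, asin_bought_last_month=100 -> asin_bought_mom = round((125-100)/100, 4) = 0.25
# asin_bought_month=125, asin_bought_last_year=NULL  -> asin_bought_yoy = 1000.0  (sentinel: only present now)
# asin_bought_month=NULL, asin_bought_last_month=80  -> asin_bought_mom = -1000.0 (sentinel: dropped out this month)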
# tag each asin with a sales trend flag: 1 = up, 2 = unchanged, 3 = down, 0 = default
self.df_base_asin = self.df_base_asin.withColumn(
'asin_bought_month_flag',
F.when(
F.col("asin_bought_month").isNull() | F.col("asin_bought_last_month").isNull(), F.lit(0)
).when(
F.col("asin_bought_month") - F.col("asin_bought_last_month") > 0, F.lit(1)
).when(
F.col("asin_bought_month") - F.col("asin_bought_last_month") == 0, F.lit(2)
).when(
F.col("asin_bought_month") - F.col("asin_bought_last_month") < 0, F.lit(3)
).otherwise(F.lit(0))
)
# tag newly added asins (relative to last month's ASIN library): 1 = new, 0 = default
self.df_base_asin = self.df_base_asin.join(
self.df_asin_last_month, 'asin', 'left'
).fillna({
'asin_is_new_flag': 1
}).cache()
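# How the new-asin flag works (restating the join above): asins already present in last month's
# dwd_ai_asin_add come back with asin_is_new_flag = 0 from the left join; asins with no match stay
# NULL and the fillna marks them as 1, i.e. newly added this month.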
def save_data(self):
# standardize output columns
self.df_save1 = self.df_base_asin.select(
F.col("asin"),
F.col("asin_bought_month"),
F.col("asin_bought_last_month"),
F.col("asin_bought_last_year"),
F.col("asin_bought_mom"),
F.col("asin_bought_yoy"),
F.col("asin_bought_month_flag"),
F.col("asin_is_new_flag"),
F.lit(self.site_name).alias("site_name"),
F.lit(self.date_type).alias("date_type"),
F.lit(self.date_info).alias("date_info")
).repartition(1)
# persist dwd_ai_asin_add
partition_by1 = ["site_name", "date_type", "date_info"]
hive_tb1 = "dwd_ai_asin_add"
hdfs_path1 = CommonUtil.build_hdfs_path(
hive_tb1,
partition_dict={
"site_name": self.site_name,
"date_type": self.date_type,
"date_info": self.date_info,
}
)
HdfsUtils.delete_file_in_folder(hdfs_path1)
print(f"正在进行数据存储,当前存储的表名为:{hive_tb1},存储路径:{hdfs_path1}")
self.df_save1.write.saveAsTable(name=hive_tb1, format='hive', mode='append', partitionBy=partition_by1)
if self.date_info >= '2024-10':
self.df_save2 = self.df_base_asin.select(F.col("asin")).unionByName(
self.df_ai_asin
).select(
F.col("asin"),
F.lit(self.site_name).alias("site_name")
).distinct().repartition(1)
# persist dim_ai_asin_base
partition_by2 = ["site_name"]
hive_tb2 = "dim_ai_asin_base"
hdfs_path2 = CommonUtil.build_hdfs_path(
hive_tb2,
partition_dict={
"site_name": self.site_name,
}
)
HdfsUtils.delete_file_in_folder(hdfs_path2)
print(f"正在进行数据存储,当前存储的表名为:{hive_tb2},存储路径:{hdfs_path2}")
self.df_save2.write.saveAsTable(name=hive_tb2, format='hive', mode='append', partitionBy=partition_by2)
print("success!")
if __name__ == "__main__":
site_name = sys.argv[1]
date_type = sys.argv[2]
date_info = sys.argv[3]
handle_obj = DwdAiAsinAdd(site_name=site_name, date_type=date_type, date_info=date_info)
handle_obj.run()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from utils.hdfs_utils import HdfsUtils
from utils.spark_util import SparkUtil
from utils.templates import Templates
from pyspark.sql import functions as F
class DwdAmazonReport(Templates):
def __init__(self, site_name='us', date_type="month", date_info='2021-10'):
super().__init__()
self.site_name = site_name
self.date_type = date_type
self.date_info = date_info
self.db_save = f'dwd_amazon_report'
self.spark = self.create_spark_object(
app_name=f"{self.db_save}: {self.site_name}, {self.date_type}, {self.date_info}")
self.reset_partitions(partitions_num=5)
self.partitions_by = ['site_name', 'date_type', 'date_info']
self.df_buy_data = self.spark.sql(f"select 1+1;")
self.df_st_count = self.spark.sql(f"select 1+1;")
self.df_save = self.spark.sql(f"select 1+1;")
def read_data(self):
# read the asin monthly sales data
sql1 = f"""
select
asin,
asin_amazon_orders as monthly_sales
from
dim_asin_amorders_info
where
site_name = '{self.site_name}'
and date_type = '{self.date_type}'
and date_info = '{self.date_info}';
"""
print(sql1)
self.df_buy_data = self.spark.sql(sqlQuery=sql1).repartition(15, 'asin').cache()
self.df_buy_data.show(10, truncate=True)
sql2 = f"""
select
asin,
asin_zr_counts as zr_count,
asin_sp_counts as sp_count,
asin_st_counts as total_count
from
dwd_asin_measure
where
site_name = '{self.site_name}'
and date_type = '{self.date_type}'
and date_info = '{self.date_info}';
"""
print(sql2)
self.df_st_count = self.spark.sql(sqlQuery=sql2).repartition(15, 'asin').cache()
self.df_st_count.show(10, truncate=True)
def handle_data(self):
hdfs_path = f"/home/{SparkUtil.DEF_USE_DB}/dwd/{self.db_save}/site_name={self.site_name}/date_type={self.date_type}/date_info={self.date_info}"
print(f"清除hdfs目录中.....{hdfs_path}")
HdfsUtils.delete_hdfs_file(hdfs_path)
self.df_save = self.df_buy_data.join(
self.df_st_count, on='asin', how='full'
)
columns = self.df_save.columns
for col_name in columns:
self.df_save = self.df_save.withColumn(
col_name, self.df_save[col_name].cast('string')
)
self.df_save = self.df_save.fillna('-1')
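# Note (an inference from the downstream dwt job): every column is cast to string and missing values
# are filled with the '-1' placeholder, presumably so the dwt layer can later concat month-by-month
# histories with concat_ws without losing positions to NULLs.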
self.df_save = self.df_save.withColumn(
"weekly_sales", F.lit(None)
).withColumn(
"weekly_views", F.lit(None)
).withColumn(
"monthly_views", F.lit(None)
).withColumn(
"site_name", F.lit(self.site_name)
).withColumn(
"date_type", F.lit(self.date_type)
).withColumn(
"date_info", F.lit(self.date_info)
)
if __name__ == '__main__':
site_name = sys.argv[1]
date_type = sys.argv[2]
date_info = sys.argv[3]
if (site_name in ['us', 'uk', 'de']) and (date_type == 'month') and (date_info >= '2024-04'):
handle_obj = DwdAmazonReport(site_name=site_name, date_type=date_type, date_info=date_info)
handle_obj.run()
else:
print("暂不计算该维度数据!")
quit()
......@@ -217,7 +217,7 @@ class DwdAsinToPg(Templates):
self.df_save = self.df_save.fillna({"asin_is_variation": 0})
self.df_save.show(10, truncate=False)
print("self.df_save.count:", self.df_save.count())
users = ["fangxingjun", "wangrui4", "pengyanbing"]
users = ["fangxingjun", "chenyuanjie", "pengyanbing"]
title = f"dwd_asin_to_pg: {self.site_name}, {self.date_type}, {self.date_info}"
content = f"整合asin完成--等待导出到pg提供爬虫使用--self.df_save.count: {self.df_save.count()}"
CommonUtil().send_wx_msg(users=users, title=title, content=content)
......
......@@ -126,6 +126,8 @@ class DwdStMeasure(Templates):
if date_type in ['month', 'month_week'] and ((self.site_name == 'us' and date_info >= '2023-10') or (self.site_name in ['uk', 'de'] and self.date_info >= '2024-05')):
sql = f"select * from dim_st_asin_info where site_name='{self.site_name}' and date_type='month' and date_info ='{self.date_info}'"
elif date_type in ['month_aba_me']:
sql = f"select * from dim_st_asin_info where site_name='{self.site_name}' and date_type='month_aba_me' and date_info ='{self.date_info}'"
# else:
# if (int(self.year) == 2022 and int(self.month) < 10) or int(self.year) <= 2021:
......@@ -175,7 +177,7 @@ class DwdStMeasure(Templates):
self.df_asin_bs = self.spark.sql(sql).cache()
self.df_asin_bs.show(10)
sql = f"select asin, asin_title, asin_price, parent_asin " \
sql = f"select asin, asin_title, asin_price, parent_asin, asin_bought_month " \
f"from dim_asin_detail where site_name='{self.site_name}' and date_type='{self.date_type.replace('_old', '')}' and date_info='{self.date_info}';"
print("sql:", sql)
self.df_asin_detail = self.spark.sql(sql).cache()
......@@ -239,6 +241,8 @@ class DwdStMeasure(Templates):
self.handle_join()
self.df_save_asin = self.handle_st_asin_counts(cal_type="asin", df_templates=self.df_asin_templates, page=3)
self.df_save_st = self.handle_st_asin_counts(cal_type="st", df_templates=self.df_st_templates)
self.df_save_st.show(10, truncate=False)
print(f"self.df_save_st: {self.df_save_st.count()}")
# self.handle_st_zr_page1_title_rate()
self.handle_st_zr_sp_page123_title_rate(data_type='zr', page_type='page1')
self.handle_st_zr_sp_page123_title_rate(data_type='zr', page_type='page123')
......@@ -413,17 +417,34 @@ class DwdStMeasure(Templates):
df_st_asin_agg, on=['asin'], how='left'
)
elif cal_type == "st":
# compute the DD ratios per search term (share of asins at or above the 50/100/200 monthly-bought thresholds)
df_asin_bought_month = self.df_asin_detail.select('asin', 'asin_bought_month').join(
self.df_asin_amazon_orders, on=['asin'], how='left'
).withColumn(
"asin_bought_month", F.coalesce(F.col("asin_bought_month"), F.col("asin_amazon_orders"))
).drop("asin_amazon_orders")
df_st_asin_agg = self.df_st_asin_duplicated.select("search_term", "asin").join(
self.df_asin_self, on='asin', how='left'
).join(
df_asin_bought_month, on='asin', how='left'
).withColumn(
"is_self_asin",
F.when(F.col("is_self_asin").isNotNull(), F.col("is_self_asin")).otherwise(F.lit(0))
).groupby(['search_term']).agg(
F.sum('is_self_asin').alias("st_self_asin_counts"),
F.count('asin').alias("st_total_asin_counts")
F.count('asin').alias("st_total_asin_counts"),
F.sum(F.when(F.col("asin_bought_month") >= 50, 1).otherwise(0)).alias("st_dd50_counts"),
F.sum(F.when(F.col("asin_bought_month") >= 100, 1).otherwise(0)).alias("st_dd100_counts"),
F.sum(F.when(F.col("asin_bought_month") >= 200, 1).otherwise(0)).alias("st_dd200_counts")
).withColumn(
'st_self_asin_proportion', F.round(F.col('st_self_asin_counts') / F.col('st_total_asin_counts'), 4)
).withColumn(
'st_dd50_proportion', F.round(F.col('st_dd50_counts') / F.col('st_total_asin_counts'), 4)
).withColumn(
'st_dd100_proportion', F.round(F.col('st_dd100_counts') / F.col('st_total_asin_counts'), 4)
).withColumn(
'st_self_asin_proportion',
F.round(F.col('st_self_asin_counts') / F.col('st_total_asin_counts'), 4)
'st_dd200_proportion', F.round(F.col('st_dd200_counts') / F.col('st_total_asin_counts'), 4)
)
df = df.join(
df_st_asin_agg, on=['search_term'], how='left'
......@@ -708,6 +729,7 @@ class DwdStMeasure(Templates):
).join(
df_st_flow_proportion_matrix, on=['search_term'], how='left'
)
print(f"{'===' * 10}, self.df_save_st: {self.df_save_st.count()}")
self.df_save_st.show(10, truncate=False)
def handle_st_num(self):
......
......@@ -181,7 +181,10 @@ class DwtAbaStAnalytics(Templates):
st_zr_counts,
st_sp_counts,
st_self_asin_counts,
st_self_asin_proportion
st_self_asin_proportion,
st_dd50_proportion,
st_dd100_proportion,
st_dd200_proportion
from dwd_st_measure
where site_name = '{self.site_name}'
and date_type = '{self.date_type}'
......@@ -903,7 +906,10 @@ class DwtAbaStAnalytics(Templates):
"st_self_asin_proportion",
"lang",
"asin_movie_type_count",
"is_hidden_cate"
"is_hidden_cate",
"st_dd50_proportion",
"st_dd100_proportion",
"st_dd200_proportion"
)
# null handling
......
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.spark_util import SparkUtil
from pyspark.sql import functions as F, Window
from utils.common_util import CommonUtil
from utils.hdfs_utils import HdfsUtils
class DwtAiAsinAdd(object):
def __init__(self, site_name="us", date_type="month", date_info="2024-10"):
self.site_name = site_name
self.date_type = date_type
self.date_info = date_info
app_name = f"{self.__class__.__name__}:{site_name}:{date_type}:{date_info}"
self.spark = SparkUtil.get_spark_session(app_name)
# list of the most recent 6 months
self.last_6_month = []
for i in range(0, 6):
self.last_6_month.append(CommonUtil.get_month_offset(self.date_info, -i))
self.df_base_asin = self.spark.sql(f"select 1+1;")
self.df_flow_asin_detail = self.spark.sql(f"select 1+1;")
self.df_fb_info = self.spark.sql(f"select 1+1;")
self.df_ods_asin_detail = self.spark.sql(f"select 1+1;")
self.df_ai_asin_detail = self.spark.sql(f"select 1+1;")
self.df_asin_bought_flag = self.spark.sql(f"select 1+1;")
self.df_save = self.spark.sql(f"select 1+1;")
def run(self):
self.read_data()
self.handle_data()
self.save_data()
def read_data(self):
# read the ASIN library base data
sql1 = f"""
select
asin,
asin_bought_month,
asin_bought_mom,
asin_bought_yoy,
asin_bought_month_flag,
asin_is_new_flag
from dwd_ai_asin_add
where site_name = '{self.site_name}'
and date_type = '{self.date_type}'
and date_info = '{self.date_info}'
"""
self.df_base_asin = self.spark.sql(sqlQuery=sql1).repartition(40, 'asin').cache()
print("ASIN信息库基础数据如下:")
self.df_base_asin.show(10, truncate=True)
# read the flow-asin detail data
sql2 = f"""
select
asin,
asin_weight,
asin_category_desc,
asin_img_url,
asin_title,
asin_brand_name,
account_name,
asin_buy_box_seller_type,
asin_launch_time,
asin_img_num,
case when variation_num > 0 then 1 else 0 end as variation_flag,
variation_num,
asin_ao_val,
category_first_id,
category_id,
parent_asin,
first_category_rank,
asin_price,
asin_rating,
asin_total_comments,
asin_launch_time_type,
asin_describe
from dwt_flow_asin
where site_name = '{self.site_name}'
and date_type = '{self.date_type}'
and date_info = '{self.date_info}'
and asin_type in (0, 1)
and asin_bought_month >= 50
"""
self.df_flow_asin_detail = self.spark.sql(sqlQuery=sql2).repartition(40, 'asin').cache()
print("流量选品详情数据如下:")
self.df_flow_asin_detail.show(10, truncate=True)
# read the seller (store) data
sql3 = f"""
select
account_name,
seller_id,
fb_country_name,
business_addr
from dwt_fb_base_report
where site_name = '{self.site_name}'
and date_type = '{self.date_type}'
and date_info = '{self.date_info}'
"""
self.df_fb_info = self.spark.sql(sqlQuery=sql3).dropDuplicates(['account_name']).cache()
print("店铺详情数据如下:")
self.df_fb_info.show(10, truncate=True)
# read review_json_list and other detail fields
sql4 = f"""
select
asin,
review_json_list,
product_json,
product_detail_json,
updated_at
from ods_asin_detail
where site_name = '{self.site_name}'
and date_type = '{self.date_type}'
and date_info = '{self.date_info}'
"""
self.df_ods_asin_detail = self.spark.sql(sqlQuery=sql4)
window = Window.partitionBy(['asin']).orderBy(
self.df_ods_asin_detail.updated_at.desc_nulls_last()
)
self.df_ods_asin_detail = self.df_ods_asin_detail.withColumn(
'rank', F.row_number().over(window=window)
).filter('rank = 1').drop('rank', 'updated_at').repartition(40, 'asin').cache()
print("ods详情数据如下:")
self.df_ods_asin_detail.show(10, truncate=True)
# join the dataframes together
self.df_ai_asin_detail = self.df_base_asin.join(
self.df_flow_asin_detail, 'asin', 'left'
).join(
self.df_ods_asin_detail, 'asin', 'left'
).join(
self.df_fb_info, 'account_name', 'left'
).cache()
self.df_base_asin.unpersist()
self.df_flow_asin_detail.unpersist()
self.df_fb_info.unpersist()
self.df_ods_asin_detail.unpersist()
# read the monthly-sales flags from dwd_ai_asin_add
sql5 = f"""
select
asin,
asin_bought_month_flag
from dwd_ai_asin_add
where site_name = '{self.site_name}'
and date_type = '{self.date_type}'
and date_info in ({CommonUtil.list_to_insql(self.last_6_month)})
"""
self.df_asin_bought_flag = self.spark.sql(sqlQuery=sql5).repartition(40, 'asin').cache()
print("dwd_ai_asin_add月销标识数据如下:")
self.df_asin_bought_flag.show(10, truncate=True)
def handle_data(self):
# ascending-product flag: sales rose in each of the last 6 consecutive months
self.df_asin_bought_flag = self.df_asin_bought_flag.groupBy('asin').agg(
F.sum(F.when(F.col('asin_bought_month_flag') == 1, 1).otherwise(0)).alias('sum_flag')
).withColumn(
'is_ascending_flag', F.when(F.col('sum_flag') == 6, 1).otherwise(0)
)
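# Example of the ascending flag (hypothetical): with self.last_6_month covering '2024-05' to '2024-10',
# an asin gets is_ascending_flag = 1 only if asin_bought_month_flag == 1 (sales up) in all six of those
# months, i.e. sum_flag == 6; any missing month or non-"up" month yields 0.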
def save_data(self):
# standardize output columns
self.df_save = self.df_ai_asin_detail.join(
self.df_asin_bought_flag, 'asin', 'left'
).select(
F.col("asin"),
F.col("asin_weight").alias("weight"),
F.col("asin_bought_month").alias("bought_month"),
F.col("asin_category_desc").alias("category"),
F.col("asin_img_url").alias("img"),
F.col("asin_title").alias("title"),
F.col("asin_brand_name").alias("brand"),
F.col("account_name"),
F.col("business_addr").alias("account_addr"),
F.col("asin_buy_box_seller_type").alias("buy_box_seller_type"),
F.col("asin_launch_time").alias("launch_time"),
F.col("asin_img_num").alias("img_num"),
F.col("variation_flag"),
F.col("variation_num"),
F.col("asin_ao_val").alias("ao_val"),
F.col("category_first_id").alias("category_id"),
F.col("category_id").alias("category_current_id"),
F.col("parent_asin"),
F.col("first_category_rank").alias("bsr_rank"),
F.col("asin_price").alias("price"),
F.col("asin_rating").alias("rating"),
F.col("asin_total_comments").alias("total_comments"),
F.col("seller_id"),
F.col("fb_country_name"),
F.col("review_json_list"),
F.col("asin_launch_time_type").alias("launch_time_type"),
F.col("asin_describe").alias("describe"),
F.col("product_json"),
F.col("product_detail_json"),
F.col("asin_bought_mom").alias("bought_month_mom"),
F.col("asin_bought_yoy").alias("bought_month_yoy"),
F.col("asin_is_new_flag").alias("is_new_flag"),
F.col("is_ascending_flag"),
F.lit(self.site_name).alias("site_name"),
F.lit(self.date_type).alias("date_type"),
F.lit(self.date_info).alias("date_info")
).repartition(100).cache()
# persist the data
partition_by = ["site_name", "date_type", "date_info"]
hive_tb = "dwt_ai_asin_add"
hdfs_path = CommonUtil.build_hdfs_path(
hive_tb,
partition_dict={
"site_name": self.site_name,
"date_type": self.date_type,
"date_info": self.date_info,
}
)
HdfsUtils.delete_file_in_folder(hdfs_path)
print(f"正在进行数据存储,当前存储的表名为:{hive_tb},存储路径:{hdfs_path}")
self.df_save.write.saveAsTable(name=hive_tb, format='hive', mode='append', partitionBy=partition_by)
print("success!")
if __name__ == "__main__":
site_name = sys.argv[1]
date_type = sys.argv[2]
date_info = sys.argv[3]
handle_obj = DwtAiAsinAdd(site_name=site_name, date_type=date_type, date_info=date_info)
handle_obj.run()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
sys.path.append(os.path.dirname(sys.path[0]))
from utils.hdfs_utils import HdfsUtils
from utils.spark_util import SparkUtil
from utils.common_util import CommonUtil
......@@ -17,40 +18,37 @@ class DwtAmazonReport(Templates):
self.site_name = site_name
self.date_type = date_type
self.date_info = date_info
self.db_save = f'dwt_amazon_report'
self.date_info_pre = CommonUtil.get_month_offset(self.date_info, -1)
self.db_save = 'dwt_amazon_report'
self.spark = self.create_spark_object(
app_name=f"{self.db_save}: {self.site_name}, {self.date_type}, {self.date_info}")
self.reset_partitions(partitions_num=120)
self.reset_partitions(partitions_num=200)
self.partitions_by = ['site_name', 'date_type', 'date_info']
self.df_dwd_new = self.spark.sql(f"select 1+1;")
self.df_dwd_old = self.spark.sql(f"select 1+1;")
self.df_asin_detail_new = self.spark.sql(f"select 1+1;")
self.df_asin_detail_old = self.spark.sql(f"select 1+1;")
self.df_joined = self.spark.sql(f"select 1+1;")
self.df_save = self.spark.sql(f"select 1+1;")
def read_data(self):
# read this month's data from the dwd layer
# read this month's monthly sales from dwt_flow_asin
sql1 = f"""
select
asin,
monthly_sales as new_monthly_sales,
zr_count as new_zr_count,
sp_count as new_sp_count,
total_count as new_total_count,
asin_bought_month as new_monthly_sales,
asin_zr_counts as new_zr_count,
asin_sp_counts as new_sp_count,
asin_st_counts as new_total_count,
date_info as new_date_info_list
from
dwd_amazon_report
where
site_name = '{self.site_name}'
from dwt_flow_asin
where site_name = '{self.site_name}'
and date_type = '{self.date_type}'
and date_info = '{self.date_info}';
"""
print(sql1)
self.df_dwd_new = self.spark.sql(sqlQuery=sql1).repartition(15, 'asin').cache()
self.df_dwd_new.show(10, truncate=True)
self.df_asin_detail_new = self.spark.sql(sqlQuery=sql1).repartition(15, 'asin').fillna('-1').cache()
self.df_asin_detail_new.show(10, truncate=True)
# read last month's data from the dwt layer
date_info_pre = CommonUtil.get_month_offset(self.date_info, -1)
# read the historical data
sql2 = f"""
select
asin,
......@@ -59,25 +57,19 @@ class DwtAmazonReport(Templates):
sp_count as old_sp_count,
total_count as old_total_count,
date_info_list as old_date_info_list
from
dwt_amazon_report
where
site_name = '{self.site_name}'
from dwt_amazon_report
where site_name = '{self.site_name}'
and date_type = '{self.date_type}'
and date_info = '{date_info_pre}';
and date_info = '{self.date_info_pre}';
"""
print(sql2)
self.df_dwd_old = self.spark.sql(sqlQuery=sql2).repartition(15, 'asin').cache()
self.df_dwd_old.show(10, truncate=True)
self.df_asin_detail_old = self.spark.sql(sqlQuery=sql2).repartition(15, 'asin').cache()
self.df_asin_detail_old.show(10, truncate=True)
def handle_data(self):
hdfs_path = f"/home/{SparkUtil.DEF_USE_DB}/dwt/{self.db_save}/site_name={self.site_name}/date_type={self.date_type}/date_info={self.date_info}"
print(f"清除hdfs目录中.....{hdfs_path}")
HdfsUtils.delete_hdfs_file(hdfs_path)
# column names after the join
join_columns = ['monthly_sales', 'zr_count', 'sp_count', 'total_count', 'date_info_list']
# count how many date_info values the historical df carries, to know how many '-1' placeholders unmatched historical asins need
old_date_info_first = self.df_dwd_old.select('old_date_info_list').distinct().first()
old_date_info_first = self.df_asin_detail_old.select('old_date_info_list').distinct().first()
if old_date_info_first is None:
old_date_info_list = None
old_date_info_list_len = 0
......@@ -88,9 +80,9 @@ class DwtAmazonReport(Templates):
# if this month's side has no match, pad a single '-1'
fillna_new = '-1'
# join the dfs and fill null values
self.df_joined = self.df_dwd_new.join(
self.df_dwd_old, on='asin', how='full'
)
self.df_joined = self.df_asin_detail_new.join(
self.df_asin_detail_old, on='asin', how='full'
).cache()
for col in join_columns:
self.df_joined = self.df_joined.fillna({'old_' + col: fillna_old})
self.df_joined = self.df_joined.fillna({'new_' + col: fillna_new})
......@@ -104,25 +96,17 @@ class DwtAmazonReport(Templates):
if old_date_info_first is None:
for col in join_columns:
self.df_joined = self.df_joined.withColumn(
col,
self.df_joined['new_' + col]
col, self.df_joined['new_' + col]
)
else:
for col in join_columns:
self.df_joined = self.df_joined.withColumn(
col,
concat_ws(',', self.df_joined['old_' + col], self.df_joined['new_' + col])
col, concat_ws(',', self.df_joined['old_' + col], self.df_joined['new_' + col])
)
# select the required columns
selected_columns = ['asin'] + join_columns
self.df_save = self.df_joined.select(selected_columns)
self.df_save = self.df_save.withColumn(
"weekly_sales", F.lit(None)
).withColumn(
"weekly_views", F.lit(None)
).withColumn(
"monthly_views", F.lit(None)
).withColumn(
"site_name", F.lit(self.site_name)
).withColumn(
"date_type", F.lit(self.date_type)
......@@ -130,14 +114,14 @@ class DwtAmazonReport(Templates):
"date_info", F.lit(self.date_info)
)
hdfs_path = f"/home/{SparkUtil.DEF_USE_DB}/dwt/{self.db_save}/site_name={self.site_name}/date_type={self.date_type}/date_info={self.date_info}"
print(f"清除hdfs目录中.....{hdfs_path}")
HdfsUtils.delete_hdfs_file(hdfs_path)
if __name__ == '__main__':
site_name = sys.argv[1]
date_type = sys.argv[2]
date_info = sys.argv[3]
if (site_name in ['us', 'uk', 'de']) and (date_type == 'month') and (date_info >= '2024-04'):
handle_obj = DwtAmazonReport(site_name=site_name, date_type=date_type, date_info=date_info)
handle_obj.run()
else:
print("暂不计算该维度数据!")
quit()
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from pyspark.sql import functions as F
from utils.spark_util import SparkUtil
from utils.common_util import CommonUtil
from utils.hdfs_utils import HdfsUtils
class DwtAsinRelatedTraffic(object):
def __init__(self, site_name, date_type, date_info):
super().__init__()
self.site_name = site_name
self.date_type = date_type
self.date_info = date_info
self.hive_tb = f'dwt_asin_related_traffic'
self.partition_dict = {
"site_name": site_name,
"date_type": date_type,
"date_info": date_info
}
self.hdfs_path = CommonUtil.build_hdfs_path(self.hive_tb, partition_dict=self.partition_dict)
app_name = f"{self.__class__.__name__}:{site_name}:{date_type}:{date_info}"
self.spark = SparkUtil.get_spark_session(app_name)
self.partitions_by = ['site_name', 'date_type', 'date_info']
self.df_dim_asin_related_traffic = self.spark.sql(f"select 1+1;")
self.df_save = self.spark.sql(f"select 1+1;")
self.col_num_index = {
"four_star_above": 1,
"brand_recommendation": 2,
"similar_items": 3,
"look_and_look": 4,
"look_also_look": 5,
"look_but_bought": 6,
"bundle_bought": 7,
"combination_bought": 8,
"more_relevant": 9,
"bought_and_bought": 10,
"product_adv": 11,
"brand_adv": 12
}
def read_data(self):
print("读取dim_asin_related_traffic流量数据")
sql = f"""
select
asin,
four_star_above,
brand_recommendation,
similar_items,
look_and_look,
look_also_look,
look_but_bought,
bundle_bought,
combination_bought,
more_relevant,
bought_and_bought,
product_adv,
brand_adv,
updated_at as related_time
from dim_asin_related_traffic where site_name='{self.site_name}' and date_type='{self.date_type}' and date_info='{self.date_info}';
"""
self.df_dim_asin_related_traffic = self.spark.sql(sqlQuery=sql).cache()
print("dim_asin_related_traffic数据如下:")
self.df_dim_asin_related_traffic.show(10, True)
# aggregation
def handle_data(self):
cols = [col for col in self.df_dim_asin_related_traffic.columns if col != 'asin' and col != 'related_time']
for col in cols:
self.df_dim_asin_related_traffic = self.df_dim_asin_related_traffic.withColumn(
col, F.concat_ws(",", F.filter(F.split(F.col(col), ","), lambda x: (F.length(F.trim(x)) == 10)))
).withColumn(
col, F.when(F.col(col) == "", None).otherwise(F.col(col))
)
# concatenate the related-traffic asins of all types
self.df_dim_asin_related_traffic = self.df_dim_asin_related_traffic.withColumn(
"related_asin", F.concat_ws(",", *[F.col(col) for col in cols])
)
# use the type map to build a code column with one type code per related asin
for col in cols:
num = self.col_num_index[col]
self.df_dim_asin_related_traffic = self.df_dim_asin_related_traffic.withColumn(
f"{col}_num", F.when(
F.col(col).isNull(), F.lit(None)
).otherwise(
F.concat_ws(",", F.array_repeat(F.lit(num), F.size(F.split(F.col(col), ","))))
)
)
# concatenate all the code columns
self.df_dim_asin_related_traffic = self.df_dim_asin_related_traffic.withColumn(
"related_type", F.concat_ws(",", *[F.col(f"{col}_num") for col in cols])
).cache()
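# Illustrative (hypothetical) row: similar_items='B0ABCD1234,B0ABCD1235', product_adv='B0ABCD1236',
# all other type columns NULL. With col_num_index (similar_items -> 3, product_adv -> 11) the result is
# related_asin = 'B0ABCD1234,B0ABCD1235,B0ABCD1236' and related_type = '3,3,11', i.e. one type code per
# related asin, in the same order.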
# count how many asins reference each related asin
df_related = self.df_dim_asin_related_traffic.select(
'asin', F.explode(F.split(F.col('related_asin'), ',')).alias('related_asin')
).drop_duplicates(['asin', 'related_asin']).groupBy(
'related_asin'
).agg(
F.count('asin').alias('related_count')
).withColumnRenamed(
'related_asin', 'asin'
)
self.df_dim_asin_related_traffic = self.df_dim_asin_related_traffic.join(
df_related, on='asin', how='left'
).fillna({
'related_count': 0
})
# persist the data
def save_data(self):
self.df_save = self.df_dim_asin_related_traffic.select(
'asin',
'related_asin',
'related_type',
'related_time',
'related_count',
F.lit(self.site_name).alias('site_name'),
F.lit(self.date_type).alias('date_type'),
F.lit(self.date_info).alias('date_info')
)
print(f"清除hdfs目录中:{self.hdfs_path}")
HdfsUtils.delete_file_in_folder(self.hdfs_path)
print(f"当前存储的表名为:{self.hive_tb},分区为:{self.partitions_by}")
self.df_save.repartition(40).write.saveAsTable(name=self.hive_tb, format='hive', mode='append', partitionBy=self.partitions_by)
print("success")
def run(self):
# read data
self.read_data()
# aggregate
self.handle_data()
# persist
self.save_data()
if __name__ == '__main__':
site_name = sys.argv[1]
date_type = sys.argv[2]
date_info = sys.argv[3]
handle_obj = DwtAsinRelatedTraffic(site_name=site_name, date_type=date_type, date_info=date_info)
handle_obj.run()
......@@ -65,33 +65,62 @@ class DwtFbBaseReport(object):
# initialize UDF functions
self.udf_new_asin_flag = F.udf(udf_new_asin_flag, IntegerType())
self.u_judge_package_quantity = F.udf(udf_get_package_quantity, IntegerType())
self.u_get_business_val = F.udf(self.get_business_val, StringType())
# parse the seller_address field to extract the seller's business info
@staticmethod
def get_business_val(seller_address, key):
if not seller_address:
return None
parts = [p.strip() for p in seller_address.split("|-|")]
for i, p in enumerate(parts):
if p.startswith(key):
# Business Address: join everything that follows
if key in ("Business Address", "Geschäftsadresse"):
return " ".join(parts[i + 1:]).strip()
# other keys: take only the next segment
elif i + 1 < len(parts):
return parts[i + 1].strip()
return None
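# Usage sketch (hypothetical seller_address value, not real data):
#   addr = "Business Name:|-|ACME Ltd|-|Business Address:|-|12 Main St|-|London"
#   get_business_val(addr, "Business Name")    -> "ACME Ltd"
#   get_business_val(addr, "Business Address") -> "12 Main St London"   # joins everything after the key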
def read_data(self):
# ods_seller_account_feedback: main table of the monthly store feedback report
print("获取 ods_seller_account_feedback")
sql = f"""select cur_fd.seller_id,
sql = f"""
select
cur_fd.seller_id,
cur_fd.fb_web_asin_num,
cur_fd.fb_country_name,
cur_fd.count_30_day_num,
cur_fd.count_1_year_num,
cur_fd.count_lifetime_num,
cur_fd.seller_address,
cur_fd.fb_crawl_date,
round((count_30_day_num - last_30_day_num) / last_30_day_num, 4) as count_30_day_rate,
round((count_1_year_num - last_1_year_num) / last_1_year_num, 4) as count_1_year_rate,
round((count_lifetime_num - last_lifetime_num) / last_lifetime_num, 4) as count_life_time_rate
from (select seller_id,
from
(
select
seller_id,
num as fb_web_asin_num,
count_30_day as count_30_day_num,
count_1_year as count_1_year_num,
count_lifetime as count_lifetime_num,
country_name as fb_country_name,
seller_address,
date_format(updated_at, 'yyyy-MM-dd HH:mm:ss') as fb_crawl_date
from ods_seller_account_feedback
where site_name = '{self.site_name}'
and date_type = '{self.date_type}'
and date_info = '{self.date_info}'
and length(seller_id) > 2 ) cur_fd
left join (select seller_id,
and length(seller_id) > 2
) cur_fd
left join
(
select
seller_id,
count_30_day as last_30_day_num,
count_1_year as last_1_year_num,
count_lifetime as last_lifetime_num
......@@ -99,8 +128,10 @@ from (select seller_id,
where site_name = '{self.site_name}'
and date_type = '{self.date_type}'
and date_info = '{self.last_month}'
and length(seller_id) > 2 ) last_fd
on cur_fd.seller_id = last_fd.seller_id"""
and length(seller_id) > 2
) last_fd
on cur_fd.seller_id = last_fd.seller_id
"""
self.df_fb_feedback = self.spark.sql(sqlQuery=sql)
self.df_fb_feedback = self.df_fb_feedback.drop_duplicates(['seller_id']).cache()
print(sql)
......@@ -108,9 +139,8 @@ from (select seller_id,
# fetch our internal store-to-asin relation table (scraped via search terms)
print("获取 ods_seller_asin_account")
sql = f"""
select seller_id,asin from ods_seller_asin_account
where site_name='{self.site_name}'
and date_format(created_at,'yyyy-MM-dd') <= '{self.cal_date}'
select seller_id, asin from ods_seller_asin_account
where site_name='{self.site_name}' and date_format(created_at,'yyyy-MM-dd') <= '{self.cal_date}'
"""
self.df_fb_asin = self.spark.sql(sqlQuery=sql)
self.df_fb_asin = self.df_fb_asin.drop_duplicates(['seller_id', 'asin'])
......@@ -157,16 +187,17 @@ from (select seller_id,
# extract parent_asin (from the variation data) to flag multi-variation asins
print("获取 dim_asin_variation_info")
sql = f"select asin,parent_asin from dim_asin_variation_info " \
f"where site_name='{self.site_name}'" \
f" and asin != parent_asin "
sql = f"""
select asin, parent_asin from dim_asin_variation_info where site_name='{self.site_name}' and asin != parent_asin
"""
self.df_asin_parent = self.spark.sql(sqlQuery=sql)
print(sql)
# fetch ods_seller_account_syn to extract account_name
print("获取 ods_seller_account_syn")
sql = f"select seller_id,account_name,id from ods_seller_account_syn " \
f"where site_name='{self.site_name}'"
sql = f"""
select seller_id, account_name, id from ods_seller_account_syn where site_name='{self.site_name}'
"""
self.df_seller_account = self.spark.sql(sqlQuery=sql)
# deduplicate
self.df_seller_account = self.df_seller_account.orderBy(self.df_seller_account.id.desc())
......@@ -177,8 +208,7 @@ from (select seller_id,
# fetch mysql selection.accounts, used to exclude the company's own stores
print("获取 selection.accounts")
sql = f"""
select seller_id, 1 as is_self_fb from
(select distinct seller_id from selection.accounts) t1
select seller_id, 1 as is_self_fb from (select distinct seller_id from selection.accounts) t1
"""
conn_info = DBUtil.get_connection_info("mysql", "us")
self.df_self_seller_id = SparkUtil.read_jdbc_query(
......@@ -291,6 +321,22 @@ from (select seller_id,
# unmatched rows are set to 0, i.e. not an internal store
self.df_fb_agg = self.df_fb_agg.na.fill({"is_self_fb": 0})
# parse the seller business info, handled differently per site language
if self.site_name in ("us", "uk"):
self.df_fb_agg = self.df_fb_agg.withColumn(
"business_name", self.u_get_business_val(F.col("seller_address"), F.lit("Business Name"))
).withColumn(
"business_addr", self.u_get_business_val(F.col("seller_address"), F.lit("Business Address"))
)
elif self.site_name == "de":
self.df_fb_agg = self.df_fb_agg.withColumn(
"business_name", self.u_get_business_val(F.col("seller_address"), F.lit("Geschäftsname"))
).withColumn(
"business_addr", self.u_get_business_val(F.col("seller_address"), F.lit("Geschäftsadresse"))
)
else:
pass
# output dataset: report
def save_data_report(self):
# join ods_seller_account_syn to bring back account_name; the inner join drops sellers without a store name
......@@ -358,6 +404,8 @@ from (select seller_id,
F.date_format(F.current_timestamp(), 'yyyy-MM-dd HH:mm:SS').alias('updated_time'),
F.lit(None).alias('usr_mask_type'),
F.lit(None).alias('usr_mask_progress'),
F.col('business_name'),
F.col('business_addr'),
F.lit(self.site_name).alias('site_name'),
F.lit(self.date_type).alias('date_type'),
F.lit(self.date_info).alias('date_info')
......
......@@ -56,7 +56,11 @@ class DwtFlowAsin(Templates):
self.df_save = self.spark.sql(f"select 1+1;")
self.partitions_by = ['site_name', 'date_type', 'date_info']
self.reset_partitions(60)
self.launch_time_interval_dict = self.get_launch_time_interval_dict()
launch_time_base_date = self.spark.sql(
f"""SELECT max(`date`) AS last_day FROM dim_date_20_to_30 WHERE year_month = '{self.date_info}'"""
).collect()[0]['last_day']
self.launch_time_interval_dict = self.get_launch_time_interval_dict(launch_time_base_date)
# initialize global dataframes
self.df_asin_detail = self.spark.sql(f"select 1+1;")
self.df_asin_measure = self.spark.sql(f"select 1+1;")
......@@ -123,15 +127,15 @@ class DwtFlowAsin(Templates):
return str(datetime.now().date())
@staticmethod
def get_launch_time_interval_dict():
cur_date = datetime.now().date()
def get_launch_time_interval_dict(base_date):
base_date = datetime.strptime(base_date, '%Y-%m-%d')
return {
"one_month": (cur_date + timedelta(days=-30)).strftime('%Y-%m-%d'),
"three_month": (cur_date + timedelta(days=-90)).strftime('%Y-%m-%d'),
"six_month": (cur_date + timedelta(days=-180)).strftime('%Y-%m-%d'),
"twelve_month": (cur_date + timedelta(days=-360)).strftime('%Y-%m-%d'),
"twenty_four_month": (cur_date + timedelta(days=-720)).strftime('%Y-%m-%d'),
"thirty_six_month": (cur_date + timedelta(days=-1080)).strftime('%Y-%m-%d')
"one_month": (base_date + timedelta(days=-30)).strftime('%Y-%m-%d'),
"three_month": (base_date + timedelta(days=-90)).strftime('%Y-%m-%d'),
"six_month": (base_date + timedelta(days=-180)).strftime('%Y-%m-%d'),
"twelve_month": (base_date + timedelta(days=-360)).strftime('%Y-%m-%d'),
"twenty_four_month": (base_date + timedelta(days=-720)).strftime('%Y-%m-%d'),
"thirty_six_month": (base_date + timedelta(days=-1080)).strftime('%Y-%m-%d')
}
@staticmethod
......@@ -167,7 +171,7 @@ class DwtFlowAsin(Templates):
date_format(created_time, 'yyyy-MM-dd HH:mm:ss') as asin_crawl_date, asin_bought_month, asin_image_view,
case when product_description is not null then 1 else 0 end as is_with_product_description, asin_describe,
category_id as top_category_id, category_first_id as top_category_first_id, customer_reviews_json, img_list as img_info,
asin_follow_sellers as follow_sellers_count
asin_follow_sellers as follow_sellers_count, asin_fbm_price
from dim_asin_detail where site_name='{self.site_name}' and date_type='{self.date_type}' and date_info='{self.date_info}'"""
print("sql:" + sql)
self.df_asin_detail = self.spark.sql(sqlQuery=sql)
......@@ -295,7 +299,10 @@ class DwtFlowAsin(Templates):
WHEN asin_price >= 15 AND asin_price < 20 THEN 3
WHEN asin_price >= 20 AND asin_price < 30 THEN 4
WHEN asin_price >= 30 AND asin_price < 50 THEN 5
WHEN asin_price >= 50 THEN 6 ELSE 0 END"""))
WHEN asin_price >= 50 AND asin_price < 70 THEN 6
WHEN asin_price >= 70 AND asin_price < 100 THEN 7
WHEN asin_price >= 100 AND asin_price < 150 THEN 8
WHEN asin_price >= 150 THEN 9 ELSE 0 END"""))
# handle asin category, rank, rank type and whether the rank is valid
def handle_asin_category_info(self):
......@@ -396,7 +403,7 @@ class DwtFlowAsin(Templates):
"img_num_rating") + F.col("img_enlarge_rating")
)
self.df_asin_detail = self.df_asin_detail.\
drop("is_with_product_description", "asin_describe", "asin_image_view", "category_node_rating", "zr_rating",
drop("is_with_product_description", "asin_image_view", "category_node_rating", "zr_rating",
"sp_rating", "a_add_rating", "video_rating", "brand_rating", "product_describe_rating",
"highlight_rating", "title_len_rating", "title_brand_rating", "img_num_rating", "img_enlarge_rating")
......@@ -500,7 +507,7 @@ class DwtFlowAsin(Templates):
F.lit(None).alias("buy_data_viewed_month"), F.lit(None).alias("buy_data_viewed_week"),
F.lit(None).alias("theme_en"), F.lit(None).alias("theme_label_en"), "asin_lqs_rating",
"asin_lqs_rating_detail", "title_matching_degree", "zr_flow_proportion", "matrix_flow_proportion",
"matrix_ao_val", "follow_sellers_count", "seller_json",
"matrix_ao_val", "follow_sellers_count", "seller_json", "asin_describe", "asin_fbm_price",
F.lit(self.site_name).alias("site_name"), F.lit(self.date_type).alias("date_type"),
F.lit(self.date_info).alias("date_info"))
self.df_save = self.df_save.na.fill(
......@@ -514,7 +521,7 @@ class DwtFlowAsin(Templates):
"package_quantity": 1, "is_movie_label": 0, "is_brand_label": 0, "is_alarm_brand": 0,
"title_matching_degree": 0.0, "asin_lqs_rating": 0.0, "follow_sellers_count": -1})
self.df_save = self.df_save.repartition(60).persist(StorageLevel.DISK_ONLY)
self.df_save = self.df_save.drop_duplicates(['asin']).filter(F.length(F.col("asin"))<=10)
self.df_save = self.df_save.drop_duplicates(['asin']).filter((F.col("asin").isNotNull()) & (F.col("asin") != "") & (F.length(F.col("asin")) <= 10))
print("数据量为:", self.df_save.count())
self.df_save.show(10, truncate=False)
......@@ -550,14 +557,14 @@ class DwtFlowAsin(Templates):
F.col("current_category_rank").alias("category_current_rank"), "asin_type",
"bsr_orders", F.col("sales").alias("bsr_orders_sale"),
F.col("asin_page_inventory").alias("page_inventory"), "asin_bought_month", "seller_json",
F.col("asin_buy_box_seller_type").alias("buy_box_seller_type")
F.col("asin_buy_box_seller_type").alias("buy_box_seller_type"), "asin_describe", "asin_fbm_price"
)
table_columns = """asin, asin_ao_val, asin_title, asin_title_len, asin_category_desc, asin_volume,
asin_weight, asin_launch_time, asin_brand_name, one_star, two_star, three_star, four_star, five_star, low_star,
account_name, account_id, seller_country_name, category_first_id, parent_asin, variation_num, img_info,
asin_crawl_date, asin_price, asin_rating, asin_total_comments, matrix_ao_val, zr_flow_proportion, matrix_flow_proportion,
date_info, img_url, category_current_id, category_first_rank, category_current_rank, asin_type, bsr_orders, bsr_orders_sale,
page_inventory, asin_bought_month, seller_json, buy_box_seller_type"""
page_inventory, asin_bought_month, seller_json, buy_box_seller_type, asin_describe, asin_fbm_price"""
DorisHelper.spark_export_with_columns(df_save=df_doris, db_name=self.doris_db, table_name=self.asin_latest_detail_table, table_columns=table_columns)
print("save asin_latest_detail success")
else:
......
......@@ -95,18 +95,18 @@ def handle_new_store_collections(new_collect_store_id):
except:
pass
print("推送失败")
CommonUtil.send_wx_msg(['wangrui4'], f"\u26A0店铺收藏消息推送失败\u26A0", f"任务信息: {cmd} 请注意检查!")
CommonUtil.send_wx_msg(['chenyuanjie'], f"\u26A0店铺收藏消息推送失败\u26A0", f"任务信息: {cmd} 请注意检查!")
else:
print("dwt执行失败")
print("错误信息为:============")
print(error.decode())
CommonUtil.send_wx_msg(['wangrui4'], f"\u26A0店铺收藏更新失败\u26A0", f"任务信息: {cmd} 请注意检查!")
CommonUtil.send_wx_msg(['chenyuanjie'], f"\u26A0店铺收藏更新失败\u26A0", f"任务信息: {cmd} 请注意检查!")
else:
print("dws执行失败")
print("错误信息为:============")
print(error.decode())
CommonUtil.send_wx_msg(['wangrui4'], f"\u26A0店铺收藏更新失败\u26A0", f"任务信息: {cmd} 请注意检查!")
CommonUtil.send_wx_msg(['chenyuanjie'], f"\u26A0店铺收藏更新失败\u26A0", f"任务信息: {cmd} 请注意检查!")
if __name__ == '__main__':
......
......@@ -81,5 +81,82 @@ def handle_calc():
print("success")
def handle_calc_new():
day_end = CommonUtil.format_now("%Y-%m-%d")
CommonUtil.orctable_concatenate(
hive_table="dim_st_pcp_history",
partition_dict={
"date_info": CommonUtil.get_day_offset(day_end, -1)
},
innerFlag=False,
min_part_num=10,
max_retry_time=5
)
spark = SparkUtil.get_spark_session("dwt_st_pcp_current")
day_start = CommonUtil.get_day_offset(day_end, -90)
df_all = spark.sql(f"""
select site_id,
group_id,
keyword_id,
keyword,
match_type,
created_at,
min_bid,
max_bid,
suggested_bid,
date_info
from dim_st_pcp_history
where date_info >= '{day_start}'
and date_info <= '{day_end}'
""")
window = Window.partitionBy(['site_id', 'match_type', 'keyword'])
df_all = df_all.where("site_id is not null and created_at is not null")
# deduplicate
df_all = df_all.dropDuplicates(['site_id', 'match_type', 'keyword', 'date_info'])
# keep only the most recent day per (site_id, match_type, keyword)
df_save = df_all.withColumn("day_row_number",
F.row_number().over(window.orderBy(F.col("date_info").desc())))
df_save = df_save.where("day_row_number == 1")
# within that latest day, keep the row with the smallest suggested_bid
df_save = df_save.withColumn("min_row_number",
F.row_number().over(window.orderBy(F.col("suggested_bid").asc())))
df_save = df_save.where("min_row_number == 1")
df_history = df_all.groupby([F.col("site_id"), F.col("keyword"), F.col("match_type")]).agg(
F.collect_list(F.struct(F.col("min_bid"), F.col("max_bid"), F.col("suggested_bid"), F.col("created_at"))).alias("list")
)
df_history = df_history.withColumn("history_json",
F.when(F.size(F.col("list")) <= 1, F.lit(None)).otherwise(F.to_json(F.col("list"))))
df_save = df_save.join(df_history, on=['site_id', 'keyword', 'match_type'], how='left').select(
df_save['site_id'],
F.col('group_id'),
F.col('keyword_id'),
df_save['keyword'],
df_save['match_type'],
F.col('created_at'),
F.col('min_bid'),
F.col('max_bid'),
F.col('suggested_bid'),
F.col('history_json'),
F.lit("90").alias("day")
)
# upsert the result table
CommonUtil.save_or_update_table(
spark_session=spark,
hive_tb_name="dwt_st_pcp_current_v2",
partition_dict={
"day": "90"
},
df_save=df_save
)
print("success")
if __name__ == '__main__':
handle_calc()
handle_calc_new()
......@@ -42,10 +42,13 @@ class DwtThemeBsOrders(Templates):
# f"union all " \
# f"select asin, asin_title, bsr_orders, date_info from dwt_flow_asin where site_name='{self.site_name}' and " \
# f"date_type='month' and date_info in ('2023-01', '2023-02', '2023-03', '2023-04', '2023-05', '2023-06');"
sql = f"select asin, asin_title, bsr_orders, date_info from dwt_flow_asin where site_name='{self.site_name}' and " \
f"date_type='month' and date_info >= '2023-01' and date_info <= '2023-12';"
# sql = f"select asin, asin_title, bsr_orders, date_info from dwt_flow_asin where site_name='{self.site_name}' and " \
# f"date_type='month' and date_info >= '2023-01' and date_info <= '2023-12';"
# f"date_type='month' and date_info >= '2023-01' and date_info <= '2023-01' limit 1000000;"
sql = f"select asin, asin_title, bsr_orders, date_info from dwt_flow_asin where site_name='{self.site_name}' and " \
f"date_type='month' and date_info >= '2024-08' and date_info <= '2025-07';"
print("sql:", sql)
self.df_flow = self.spark.sql(sql).cache()
self.df_flow.show(10, truncate=False)
......@@ -186,7 +189,8 @@ class DwtThemeBsOrders(Templates):
self.df_save = self.df_save.cache()
self.df_save.show(50, truncate=False)
df = self.df_save.toPandas()
df.to_csv("/root/theme_new_2023.csv", index=False)
# df.to_csv("/root/theme_new_2023.csv", index=False)
df.to_csv("/home/fangxingjun/theme_new_202408-202507.csv", index=False)
if __name__ == '__main__':
......
......@@ -90,11 +90,11 @@ class DwtTop100(Templates):
def save_data(self):
self.df_save = self.df_flow_asin
self.df_save = self.df_save.toPandas()
self.df_save.to_csv(f"/root/asin_bsr_{self.site_name}_{self.date_info}.csv", index=False)
self.df_save.to_csv(f"/home/fangxingjun/asin_bsr_{self.site_name}_{self.date_info}.csv", index=False)
def save_data_old(self):
self.df_save = self.df_save.toPandas()
self.df_save.to_csv(f"/root/asin_bsr_{self.site_name}_{self.date_info}.csv", index=False)
self.df_save.to_csv(f"/home/fangxingjun/asin_bsr_{self.site_name}_{self.date_info}.csv", index=False)
if __name__ == '__main__':
......
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.spark_util import SparkUtil
from utils.db_util import DBUtil
from utils.common_util import CommonUtil
from utils.DorisHelper import DorisHelper
from pyspark.sql import functions as F
__es_ip__ = "192.168.10.217"
__es_port__ = "9200"
__es_user__ = "elastic"
__es_passwd__ = "Selection20251#+"
class EsUpdate(object):
def __init__(self, site_name, date_info):
self.site_name = site_name
self.date_info = date_info
self.spark = SparkUtil.get_spark_session(f"{self.__class__.__name__}")
year, month = self.date_info.split('-')
self.index_name = f"{site_name}_st_detail_month_{year}_{month}"
self.es_options = {
"es.nodes": __es_ip__,
"es.port": __es_port__,
"es.net.http.auth.user": __es_user__,
"es.net.http.auth.pass": __es_passwd__,
"es.mapping.id": "asin",
"es.resource": f"{self.index_name}/_doc",
"es.batch.write.refresh": "false",
"es.batch.write.retry.wait": "60s",
"es.batch.size.entries": "5000",
"es.nodes.wan.only": "false",
"es.batch.write.concurrency": "60",
"es.write.operation": "upsert"
}
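# Note: with "es.write.operation": "upsert" and "es.mapping.id": "asin", the elasticsearch-hadoop
# connector updates existing documents in the index keyed by asin instead of creating duplicates,
# which is what the later partial update of asin_bought_month relies on.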
self.df_asin = self.spark.sql(f"select 1+1;")
self.df_es_asin = self.spark.sql(f"select 1+1;")
self.df_need_update = self.spark.sql(f"select 1+1;")
def run(self):
self.get_update_asin()
self.update_es_filed()
def get_update_asin(self):
sql = f"""
select asin from {self.site_name}_asin_detail_2025_buysales_err where date_info = '{self.date_info}'
"""
pg_con_info = DBUtil.get_connection_info("postgresql_14", self.site_name)
self.df_asin = SparkUtil.read_jdbc_query(
session=self.spark,
url=pg_con_info['url'],
username=pg_con_info['username'],
pwd=pg_con_info['pwd'],
query=sql
)
self.df_asin = self.df_asin.dropDuplicates(['asin']).repartition(40, 'asin').cache()
print("爬虫表数据量为:", self.df_asin.count())
def update_es_filed(self):
es_asin_sql = f"""
SELECT asin from es_selection.default_db.{self.index_name}
"""
self.df_es_asin = DorisHelper.spark_import_with_sql(self.spark, es_asin_sql).repartition(40, 'asin')
self.df_need_update = self.df_asin.join(
self.df_es_asin, on=['asin'], how='inner'
).withColumn(
'asin_bought_month', F.lit(0)
).cache()
print("ES待更新的数据量为:", self.df_need_update.count())
print(f"正在更新ES数据,更新索引:{self.index_name}")
try:
self.df_need_update.write.format("org.elasticsearch.spark.sql") \
.options(**self.es_options) \
.mode("append") \
.save()
print(f"ES {self.index_name} 索引更新完毕!")
except Exception as e:
print("An error occurred while writing to Elasticsearch:", str(e))
CommonUtil.send_wx_msg(['chenyuanjie'], '\u26A0 ES月销数据更新失败', f'失败索引:{self.index_name}')
if __name__ == "__main__":
site_name = sys.argv[1]
date_info = sys.argv[2]
handle_obj = EsUpdate(site_name, date_info)
handle_obj.run()
......@@ -11,7 +11,7 @@ from utils.DorisHelper import DorisHelper
__es_ip__ = "192.168.10.217"
__es_port__ = "9200"
__es_user__ = "elastic"
__es_passwd__ = "selection2021.+"
__es_passwd__ = "Selection20251#+"
__warehouse_dir__ = "hdfs://nameservice1:8020/home/big_data_selection"
__metastore_uris__ = "thrift://hadoop16:9083"
......
......@@ -17,7 +17,7 @@ from utils.DorisHelper import DorisHelper
__es_ip__ = "192.168.10.217"
__es_port__ = "9200"
__es_user__ = "elastic"
__es_passwd__ = "selection2021.+"
__es_passwd__ = "Selection20251#+"
__warehouse_dir__ = "hdfs://nameservice1:8020/home/big_data_selection"
__metastore_uris__ = "thrift://hadoop16:9083"
......@@ -81,7 +81,7 @@ def update_es_fileds(spark, df_main, date_info_list, site_name, run_type):
print(f"elasticsearch {index_name} 更新完毕!")
except Exception as e:
print("An error occurred while writing to Elasticsearch:", str(e))
CommonUtil.send_wx_msg(['wujicang', 'wangrui4'], '\u26A0 es用户标记信息更新失败', f'es更新用户标记信息失败:{site_name}, {date_info}')
CommonUtil.send_wx_msg(['wujicang', 'chenyuanjie'], '\u26A0 es用户标记信息更新失败', f'es更新用户标记信息失败:{site_name}, {date_info}')
pass
print("elasticsearch 所有数据全部更新完毕")
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.common_util import CommonUtil
from utils.spark_util import SparkUtil
from utils.es_util import EsUtils
from utils.db_util import DBUtil
from datetime import datetime, timedelta
from pyspark.sql import functions as F
class EsAiAsinAdd(object):
def __init__(self, site_name, date_type, date_info):
self.site_name = site_name
self.date_type = date_type
self.date_info = date_info
self.spark = SparkUtil.get_spark_session(f"{self.__class__.__name__}")
if self.site_name == 'us':
self.pg_tb = "ai_asin_analyze_detail"
else:
self.pg_tb = f"{self.site_name}_ai_asin_analyze_detail"
launch_time_base_date = self.spark.sql(
f"""SELECT max(`date`) AS last_day FROM dim_date_20_to_30 WHERE year_month = '{self.date_info}'"""
).collect()[0]['last_day']
self.launch_time_interval_dict = self.get_launch_time_interval_dict(launch_time_base_date)
self.es_client = EsUtils.get_es_client()
self.es_index = f"{self.site_name}_ai_asin_analyze_detail_{self.date_info.replace('-', '_')}"
self.es_pipeline = f"{self.site_name}_ai_analyze_pipeline"
self.es_options = self.get_es_options(self.es_index, self.es_pipeline)
self.df_ai_asin_detail = self.spark.sql(f"select 1+1;")
self.df_ai_asin_analyze = self.spark.sql(f"select 1+1;")
self.df_save = self.spark.sql(f"select 1+1;")
@staticmethod
def get_launch_time_interval_dict(base_date):
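        # Cut-off dates counted backwards from the last day of the month, used to bucket launch_time into age ranges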
base_date = datetime.strptime(base_date, '%Y-%m-%d')
return {
"one_month": (base_date + timedelta(days=-30)).strftime('%Y-%m-%d'),
"three_month": (base_date + timedelta(days=-90)).strftime('%Y-%m-%d'),
"six_month": (base_date + timedelta(days=-180)).strftime('%Y-%m-%d'),
"twelve_month": (base_date + timedelta(days=-360)).strftime('%Y-%m-%d'),
"twenty_four_month": (base_date + timedelta(days=-720)).strftime('%Y-%m-%d'),
"thirty_six_month": (base_date + timedelta(days=-1080)).strftime('%Y-%m-%d')
}
@staticmethod
def get_es_options(index_name, pipeline_id):
return {
"es.nodes": EsUtils.__es_ip__,
"es.port": EsUtils.__es_port__,
"es.net.http.auth.user": EsUtils.__es_user__,
"es.net.http.auth.pass": EsUtils.__es_passwd__,
"es.mapping.id": "asin",
"es.resource": f"{index_name}/_doc",
"es.batch.write.refresh": "false",
"es.batch.write.retry.wait": "60s",
"es.batch.size.entries": "5000",
"es.nodes.wan.only": "false",
"es.batch.write.concurrency": "40",
"es.write.operation": "index",
"es.ingest.pipeline": f"{pipeline_id}"
}
def run(self):
self.read_data()
self.handle_data()
self.save_data()
def read_data(self):
        # Read the monthly ASIN detail data
sql1 = f"""
select
site_name,
asin,
weight,
bought_month,
category,
img,
title,
brand,
account_name,
account_addr,
buy_box_seller_type,
launch_time,
img_num,
variation_flag,
variation_num,
ao_val,
category_id,
category_current_id,
parent_asin,
bsr_rank,
price,
rating,
total_comments,
seller_id,
fb_country_name,
bought_month_mom,
bought_month_yoy,
is_new_flag,
is_ascending_flag
from dwt_ai_asin_add
where site_name = '{self.site_name}'
and date_type = '{self.date_type}'
and date_info = '{self.date_info}'
"""
self.df_ai_asin_detail = self.spark.sql(sqlQuery=sql1).repartition(40, 'asin').cache()
print("ASIN信息库数据如下:")
self.df_ai_asin_detail.show(10, True)
        # Read the AI analysis results
sql2 = f"""
select
asin,
id as analyze_id,
package_quantity,
material,
color,
appearance,
size,
function,
shape,
scene_title,
scene_comment,
uses,
theme,
crowd,
short_desc,
title_pic_flag,
title_word_flag,
title_pic_content,
title_word_content,
array_to_string(package_quantity_arr, ',') as package_quantity_arr,
package_quantity_flag,
label_content
from {self.pg_tb}
"""
conn_info = DBUtil.get_connection_info("postgresql", "us")
self.df_ai_asin_analyze = SparkUtil.read_jdbc_query(
session=self.spark,
url=conn_info["url"],
pwd=conn_info["pwd"],
username=conn_info["username"],
query=sql2
).withColumn(
'package_quantity_arr', F.split(F.col('package_quantity_arr'), ',')
).withColumn(
'package_quantity_arr', F.expr('transform(package_quantity_arr, x -> cast(x as int))')
).repartition(40, 'asin').cache()
print("AI分析数据如下:")
self.df_ai_asin_analyze.show(10, True)
def handle_data(self):
        # Derive the launch_time_type field
one_month = self.launch_time_interval_dict['one_month']
three_month = self.launch_time_interval_dict['three_month']
six_month = self.launch_time_interval_dict['six_month']
twelve_month = self.launch_time_interval_dict['twelve_month']
twenty_four_month = self.launch_time_interval_dict['twenty_four_month']
thirty_six_month = self.launch_time_interval_dict['thirty_six_month']
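        # Bucket launch_time into codes 1-7 (newest to oldest); 0 when launch_time is null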
expr_str = f"""
CASE WHEN launch_time >= '{one_month}' THEN 1
WHEN launch_time >= '{three_month}' AND launch_time < '{one_month}' THEN 2
WHEN launch_time >= '{six_month}' AND launch_time < '{three_month}' THEN 3
WHEN launch_time >= '{twelve_month}' AND launch_time < '{six_month}' THEN 4
WHEN launch_time >= '{twenty_four_month}' AND launch_time < '{twelve_month}' THEN 5
WHEN launch_time >= '{thirty_six_month}' AND launch_time < '{twenty_four_month}' THEN 6
WHEN launch_time < '{thirty_six_month}' THEN 7
ELSE 0 END
"""
self.df_ai_asin_detail = self.df_ai_asin_detail.withColumn('launch_time_type', F.expr(expr_str))
def save_data(self):
self.df_save = self.df_ai_asin_detail.join(
self.df_ai_asin_analyze, 'asin', 'inner'
).select(
'account_addr',
'account_name',
'analyze_id',
'ao_val',
'appearance',
'asin',
'bought_month',
'bought_month_mom',
'bought_month_yoy',
'brand',
'bsr_rank',
'buy_box_seller_type',
'category',
'category_current_id',
'category_id',
'color',
'crowd',
'fb_country_name',
'function',
'img',
'img_num',
'is_ascending_flag',
'is_new_flag',
'label_content',
'launch_time',
'launch_time_type',
'material',
'package_quantity',
'package_quantity_arr',
'package_quantity_flag',
'parent_asin',
'price',
'rating',
'scene_comment',
'scene_title',
'seller_id',
'shape',
'short_desc',
'site_name',
'size',
'theme',
'title',
'title_pic_content',
'title_pic_flag',
'title_word_content',
'title_word_flag',
'total_comments',
'uses',
'variation_flag',
'variation_num',
'weight'
).cache()
try:
self.df_save.write.format("org.elasticsearch.spark.sql") \
.options(**self.es_options) \
.mode("append") \
.save()
print(f"ES {self.es_index} 索引更新完毕!")
except Exception as e:
print("An error occurred while writing to Elasticsearch:", str(e))
CommonUtil.send_wx_msg(['chenyuanjie'], '\u26A0 ES数据更新失败', f'失败索引:{self.es_index}')
if __name__ == "__main__":
site_name = sys.argv[1]
date_type = sys.argv[2]
date_info = sys.argv[3]
print("开始执行时间:", datetime.now().strftime("%Y-%m-%d %H:%M"))
handle_obj = EsAiAsinAdd(site_name, date_type, date_info)
handle_obj.run()
print("执行结束时间:", datetime.now().strftime("%Y-%m-%d %H:%M"))
print("success!!!")
import os
import sys
sys.path.append(os.path.dirname(sys.path[0]))
from utils.spark_util import SparkUtil
from utils.es_util import EsUtils
from pyspark.sql import functions as F
from utils.common_util import CommonUtil
class EsAiAsinAll(object):
def __init__(self, site_name):
self.site_name = site_name
self.spark = SparkUtil.get_spark_session(f"{self.__class__.__name__}")
        # Elasticsearch-related configuration
self.es_client = EsUtils.get_es_client()
self.es_index = f"{site_name}_ai_analyze_extra"
self.es_body = self.get_es_body()
self.es_options = self.get_es_options(self.es_index)
self.policy_name = f"{site_name}_ai_analyze_policy"
self.pipeline_id = f"{site_name}_ai_analyze_pipeline"
self.df_asin_detail = self.spark.sql(f"select 1+1;")
@staticmethod
def get_es_body():
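        # Settings and mapping for the enrich source index: keyword asin plus the trend flags and the max bought-month array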
return {
"settings": {
"number_of_shards": "3",
"number_of_replicas": "1"
},
"mappings": {
"properties": {
"asin": {
"type": "keyword"
},
"is_stable_flag": {
"type": "short"
},
"is_periodic_flag": {
"type": "short"
},
"is_ascending_flag": {
"type": "short"
},
"max_bought_month_arr": {
"type": "integer"
}
}
}
}
@staticmethod
def get_es_options(index_name):
return {
"es.nodes": EsUtils.__es_ip__,
"es.port": EsUtils.__es_port__,
"es.net.http.auth.user": EsUtils.__es_user__,
"es.net.http.auth.pass": EsUtils.__es_passwd__,
"es.mapping.id": "asin",
"es.resource": f"{index_name}/_doc",
"es.batch.write.refresh": "false",
"es.batch.write.retry.wait": "60s",
"es.batch.size.entries": "5000",
"es.nodes.wan.only": "false",
"es.batch.write.concurrency": "40",
"es.write.operation": "index"
}
def run(self):
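        # Write the per-ASIN trend flags to the enrich source index, then refresh the enrich policy and (re)create the ingest pipeline that attaches them by asin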
self.read_data()
self.es_save()
self.create_enrich_policy()
self.create_enrich_pipeline()
def read_data(self):
sql = f"""
select
asin,
is_stable_flag,
is_periodic_flag,
is_ascending_flag,
max_month_last_12_month as max_bought_month_arr
from dwt_ai_asin_all
where site_name = '{self.site_name}'
"""
self.df_asin_detail = self.spark.sql(sqlQuery=sql).repartition(40, 'asin').withColumn(
"max_bought_month_arr", F.split(F.col("max_bought_month_arr"), ",")
).withColumn(
"max_bought_month_arr", F.expr("transform(max_bought_month_arr, x -> cast(x as int))")
).cache()
print("ASIN信息库数据如下:")
self.df_asin_detail.show(10, True)
def es_save(self):
print(f"创建富集索引:{self.es_index}!")
EsUtils.create_index(self.es_index, self.es_client, self.es_body)
try:
self.df_asin_detail.write.format("org.elasticsearch.spark.sql") \
.options(**self.es_options) \
.mode("append") \
.save()
print(f"ES {self.es_index} 索引更新完毕!")
except Exception as e:
print("An error occurred while writing to Elasticsearch:", str(e))
CommonUtil.send_wx_msg(['chenyuanjie'], '\u26A0 ES数据更新失败', f'失败索引:{self.es_index}')
def create_enrich_policy(self):
# print(f"创建富集策略:{self.policy_name}!")
# policy_body = {
# "match": {
# "indices": f"{self.es_index}",
# "match_field": "asin",
# "enrich_fields": ["is_stable_flag", "is_periodic_flag", "is_ascending_flag", "max_bought_month_arr"]
# }
# }
# self.es_client.enrich.put_policy(name=self.policy_name, body=policy_body)
print(f"刷新富集策略:{self.policy_name}!")
self.es_client.enrich.execute_policy(self.policy_name, request_timeout=1800)
def create_enrich_pipeline(self):
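        # Ingest pipeline with a single enrich processor that looks up the asin and writes the match into last_year_extra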
print(f"创建富集管道:{self.pipeline_id}!")
pipeline_body = {
"description": "ai asin analyze pipeline",
"processors": [
{
"enrich": {
"policy_name": self.policy_name,
"field": "asin",
"target_field": "last_year_extra",
"max_matches": 1,
"ignore_missing": True
},
}
]
}
self.es_client.ingest.put_pipeline(id=self.pipeline_id, body=pipeline_body)
pass
if __name__ == "__main__":
site_name = sys.argv[1]
handle_obj = EsAiAsinAll(site_name)
handle_obj.run()
print("success!!!")
@@ -48,13 +48,17 @@ class EsStDetail(TemplatesMysql):
self.record_table_name_field = f'{self.site_name}_flow_asin_last_month' if self.date_type == 'month' else f'{self.site_name}_flow_asin_last30day'
        # elasticsearch-related configuration
self.client = EsUtils.get_es_client()
self.es_options = EsUtils.get_es_options(self.es_index_name)
        # Enrich-policy configuration, used to update the usr_mask_type field
self.policy_name1 = "user_mask_asin_policy"
self.policy_name2 = "user_mask_category_policy"
self.pipeline_id = "user_asin_mask_enrich_pipeline"
self.es_options = EsUtils.get_es_options(self.es_index_name, self.pipeline_id)
self.es_body = EsUtils.get_es_body()
        # Formal exports must be recorded in the export record table
if result_type == 'formal':
CommonUtil.judge_is_work_hours(site_name=site_name, date_type=date_type, date_info=date_info,
principal='wangrui4', priority=3, export_tools_type=2,
principal='chenyuanjie', priority=3, export_tools_type=2,
belonging_to_process='流量选品')
def get_date_from_week(self):
@@ -93,7 +97,7 @@ class EsStDetail(TemplatesMysql):
current_category_rank, asin_weight_ratio, asin_bought_month, asin_lqs_rating, asin_lqs_rating_detail,
title_matching_degree, asin_lob_info, is_contains_lob_info, is_package_quantity_abnormal, zr_flow_proportion,
matrix_flow_proportion, matrix_ao_val, customer_reviews_json as product_features, img_info,
coalesce(parent_asin, asin) as collapse_asin, follow_sellers_count
coalesce(parent_asin, asin) as collapse_asin, follow_sellers_count, asin_describe, asin_fbm_price as fbm_price
from {self.table_name} where site_name='{self.site_name}' and date_type='{self.date_type}' and date_info='{self.date_info}'
"""
print("sql:", sql)
@@ -105,6 +109,10 @@
def es_prepare(self):
print("当前链接的es节点信息为:" + str(EsUtils.__es_ip__))
EsUtils.create_index(self.es_index_name, self.client, self.es_body)
        # Execute the enrich policies
self.client.enrich.execute_policy(name=self.policy_name1)
self.client.enrich.execute_policy(name=self.policy_name2)
# EsUtils.user_enrich_pipeline(self.client, self.pipeline_id, self.policy_name1, self.policy_name2)
if self.date_type != 'month':
if not EsUtils.exist_index_alias(self.alias_name, self.client):
EsUtils.create_index_alias(self.es_index_name, self.alias_name, self.client)
@@ -97,7 +97,9 @@ def save_to_doris(df_all: DataFrame):
"other_seller_name",
"buy_sales",
"updated_at",
"img_num"
"img_num",
"online_time",
"is_high_return_rate"
)
write_fields = ",".join(df_all.schema.fieldNames())
@@ -123,10 +125,13 @@ def save_to_doris(df_all: DataFrame):
def export():
spark = SparkUtil.get_spark_session("self_asin_redis:export")
day = CommonUtil.get_sys_arg(1, CommonUtil.format_now("%Y-%m-%d"))
export_type = CommonUtil.get_sys_arg(2, "redis&&doris")
last_day = CommonUtil.get_day_offset(day, -1)
next_day = CommonUtil.get_day_offset(day, 1)
    # Delete the existing key first
redis_key = f"self_asin_detail:{day}"
if "redis" in export_type:
client = RedisUtils.get_redis_client_by_type(db_type='microservice')
if client.exists(redis_key):
client.delete(redis_key)
@@ -161,7 +166,9 @@ def export():
other_seller_name,
buy_sales,
img_num,
        date_format(updated_at, '%Y-%m-%d %H:%i:%S') updated_at
        date_format(updated_at, '%Y-%m-%d %H:%i:%S') updated_at,
        date_format(online_time, '%Y-%m-%d %H:%i:%S') online_time,
returns
from {site_name}_self_asin_detail
where updated_at >= '{last_day}'
and updated_at <= '{next_day}'
@@ -175,13 +182,27 @@ def export():
.where("row_number == 1") \
.drop("row_number")
asin_df = asin_df.withColumn("is_high_return_rate", F.expr("""
case
when returns = 'Customers usually keep this item' then 1
when returns = 'Frequently returned item' then 2
else 0
end
""")).drop("returns")
    # Fill in default values
asin_df = na_fill(asin_df).cache()
if "redis" in export_type:
asin_df.toJSON().foreachPartition(functools.partial(save_to_redis_list, batch=5000, redis_key=redis_key, ttl=3600 * 24))
print(f"{site_name}:redis:success")
if "doris" in export_type:
save_to_doris(asin_df)
print(f"{site_name}:doris:success")
print("success all")
if "redis" in export_type:
check_total()
pass
@@ -212,7 +233,9 @@ def na_fill(asin_df):
"account_name": "",
"other_seller_name": "",
"buy_sales": "",
"img_num": 0
"img_num": 0,
"online_time": "",
"is_high_return_rate": 0
})
pass
@@ -284,7 +307,9 @@ def export_all():
other_seller_name,
buy_sales,
img_num,
        date_format(updated_at, '%Y-%m-%d %H:%i:%S') updated_at
        date_format(updated_at, '%Y-%m-%d %H:%i:%S') updated_at,
        date_format(online_time, '%Y-%m-%d %H:%i:%S') online_time,
returns
from (
select max(id) as max_id
from {site_name}_self_asin_detail
@@ -293,6 +318,13 @@
inner join {site_name}_self_asin_detail tmp2 on tmp1.max_id = tmp2.id
"""
asin_df = SparkUtil.read_jdbc(spark, DbTypes.mysql.name, site_name, query=query)
asin_df = asin_df.withColumn("is_high_return_rate", F.expr("""
case
when returns = 'Customers usually keep this item' then 1
when returns = 'Frequently returned item' then 2
else 0
end
""")).drop("returns")
    # Fill in default values
asin_df = na_fill(asin_df)
asin_df.toJSON().foreachPartition(functools.partial(save_to_redis_list, batch=1000, redis_key=redis_key, ttl=3600 * 24 * 7))
@@ -157,6 +157,6 @@ if __name__ == '__main__':
# print(schema_flag)
if not schema_flag:
msg = f"数据表:{hive_table} {msg_params},计算数据存在验证不通过,请检查数据是否异常!!具体信息请查看日志!!"
CommonUtil.send_wx_msg(['chenjianyun', 'wangrui4'], f"\u26A0 {hive_table} {msg_params}流程数据导出前验证异常", msg)
CommonUtil.send_wx_msg(['chenjianyun'], f"\u26A0 {hive_table} {msg_params}流程数据导出前验证异常", msg)
spark_session.stop()
pass
\ No newline at end of file
"""
Execution order for image-to-image search:
# 1. Update images and download them locally (run on h7 and h5)
/mnt/opt/module/anaconda3/envs/pyspark/bin/python3.8 /opt/module/spark-3.2.0-bin-hadoop3.2/demo/py_demo/img_search/img_download.py us amazon_inv 200 1
# 2. Register newly added images - defaults to the last 7 days
/mnt/opt/module/anaconda3/envs/pyspark/bin/python3.8 /opt/module/spark-3.2.0-bin-hadoop3.2/demo/py_demo/img_search/img_local_path.py us amazon_inv
# 3. Extract image features -- h5/h6/h7 -- can run on several machines at once (currently on h5)
/opt/module/anaconda3/envs/pyspark/bin/python3.8 /opt/module/spark-3.2.0-bin-hadoop3.2/demo/py_demo/img_search/img_extract_features.py us amazon_inv 1000 5
# 4. Import the image feature data into ods
/mnt/run_shell/sqoop_shell/import/img_features.sh us amazon_inv
# 5. Slice into dim
/mnt/run_shell/spark_shell/dim/img_dim_features_slice.sh us amazon_inv
# 6.1 Build the index mapping -- doris img_hdfs_index, load the copy table first
/opt/module/anaconda3/envs/pyspark/bin/python3.8 /opt/module/spark-3.2.0-bin-hadoop3.2/demo/py_demo/img_search/img_hdfs_index.py us amazon_inv
# 6.2 Build the index mapping -- hive img_dwd_id_index
/opt/module/spark/bin/spark-submit --master yarn --driver-memory 2g --executor-memory 4g --executor-cores 1 --num-executors 1 --queue spark /opt/module/spark/demo/py_demo/img_search/img_dwd_id_index_multiprocess.py us amazon_inv 3
# 7. Export the id/index mapping to doris (copy table)
/opt/module/spark/bin/spark-submit --master yarn --driver-memory 20g --executor-memory 20g --executor-cores 4 --num-executors 2 --queue spark /opt/module/spark/demo/py_demo/img_search/img_id_index_to_doris.py us amazon_inv
# 8. Delete the index-related files on HDFS
hdfs dfs -rm -r /home/img_search/img_parquet/${site_name}/${img_type}/* 2>/dev/null || true
hdfs dfs -rm -r /home/img_search/img_tmp/${site_name}/${img_type}/* 2>/dev/null || true
hdfs dfs -rm -r /home/img_search/img_index/${site_name}/${img_type}/* 2>/dev/null || true
# 9. Upload the parquet files to HDFS
hdfs dfs -put /mnt/data/img_data/img_parquet/${site_name}/${img_type}/*/*.parquet /home/img_search/img_parquet/${site_name}/${img_type}/
# 10. Build the index
/mnt/opt/module/anaconda3/envs/pyspark/bin/python3.8 /opt/module/spark/demo/py_demo/img_search/img_create_index.py
# 11. Pull the index file from HDFS to local disk
rm -rf /mnt/data/img_data/img_index/${site_name}/${img_type}/* 2>/dev/null || true
hdfs dfs -get /home/img_search/img_index/${site_name}/${img_type}/knn.index /mnt/data/img_data/img_index/${site_name}/${img_type}/
# 12. Restart the search API service
ssh hadoop7 systemctl restart img_search.service
# 13. Swap the table names
/opt/module/anaconda3/envs/pyspark/bin/python3.8 /opt/module/spark/demo/py_demo/img_search/img_alter_table_name.py ${site_name} ${img_type}
"""
\ No newline at end of file
import os
import sys
import time
import pandas as pd
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from utils.templates import Templates
from utils.db_util import DbTypes, DBUtil
from datetime import date
class ImgAlterTableName(Templates):
def __init__(self, site_name='us', img_type="amazon_inv"):
super(ImgAlterTableName, self).__init__()
self.site_name = site_name
self.img_type = img_type
self.engine_doris = DBUtil.get_db_engine(db_type=DbTypes.doris.name, site_name=self.site_name)
self.spark = self.create_spark_object(app_name=f"{self.db_save}: {self.site_name}")
def read_data(self):
pass
def handle_data(self):
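        # Swap img_id_index and img_id_index_copy via a three-way rename, then verify the swapped-in table was created today; otherwise wait 10 s and retry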
while True:
with self.engine_doris.begin() as conn:
sql1 = "ALTER TABLE img_id_index RENAME img_id_index_temp;"
conn.execute(sql1)
sql2 = "ALTER TABLE img_id_index_copy RENAME img_id_index;"
conn.execute(sql2)
sql3 = "ALTER TABLE img_id_index_temp RENAME img_id_index_copy;"
conn.execute(sql3)
print(f"交换表名称完成--sql1: {sql1}\nsql2: {sql2}\nsql3: {sql3}")
sql_read = "select * from img_id_index limit 1"
df_read = pd.read_sql(sql_read, con=self.engine_doris)
created_time = str(list(df_read.created_time)[0])
created_date = created_time[:10] if len(created_time) > 10 else None
                # Get today's date
today_str = date.today().strftime('%Y-%m-%d') # '2025-08-20'
if created_date == today_str:
print(f"日期校验通过: created_date--{created_date}, today_str--{today_str}")
break
else:
print(f"日期校验不通过,等待10s继续: created_date--{created_date}, today_str--{today_str}")
self.engine_doris = DBUtil.get_db_engine(db_type=DbTypes.doris.name, site_name=self.site_name)
time.sleep(10)
continue
def save_data(self):
pass
if __name__ == '__main__':
site_name = sys.argv[1]
img_type = sys.argv[2]
handle_obj = ImgAlterTableName(site_name=site_name, img_type=img_type)
handle_obj.run()
\ No newline at end of file
import os
from autofaiss import build_index
from pyspark.sql import SparkSession # pylint: disable=import-outside-toplevel
from pyspark import SparkConf, SparkContext
def create_spark_session():
# this must be a path that is available on all worker nodes
# os.environ['PYSPARK_PYTHON'] = "/opt/module/spark/demo/py_demo/img_search/autofaiss.pex"
spark = (
SparkSession.builder
.config("spark.executorEnv.PEX_ROOT", "./.pex")
.config("spark.executor.cores", "4")
.config("spark.executor.memory", "20G") # make sure to increase this if you're using more cores per executor
.config("spark.num.executors", "10")
.config("spark.yarn.queue", "spark")
.master("local") # this should point to your master node, if using the tunnelling version, keep this to localhost
.appName("autofaiss-create-index")
.getOrCreate()
)
return spark
spark = create_spark_session()
index, index_infos = build_index(
# embeddings="hdfs://nameservice1:8020/home/img_search/us/amazon_inv/parquet",
embeddings="hdfs://nameservice1:8020/home/img_search/img_parquet/us/amazon_inv",
distributed="pyspark",
file_format="parquet",
max_index_memory_usage="80G", # 16G
current_memory_available="120G", # 24G
temporary_indices_folder="hdfs://nameservice1:8020/home/img_search/img_tmp/us/amazon_inv//distributed_autofaiss_indices",
index_path="hdfs://nameservice1:8020/home/img_search/img_index/us/amazon_inv/knn.index",
index_infos_path="hdfs://nameservice1:8020/home/img_search/img_index/us/amazon_inv/infos.json",
)
print("index, index_infos:", index, index_infos)
import ast
import os
import sys
import pandas as pd
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from utils.templates import Templates
# from ..utils.templates import Templates
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, FloatType
class PicturesDimFeaturesSlice(Templates):
def __init__(self, site_name='us', img_type='amazon_inv'):
super(PicturesDimFeaturesSlice, self).__init__()
self.site_name = site_name
self.img_type = img_type
self.db_save = f'img_dim_features_slice'
self.spark = self.create_spark_object(
app_name=f"{self.db_save}: {self.site_name}")
self.df_asin_features = self.spark.sql(f"select 1+1;")
self.df_save = self.spark.sql(f"select 1+1;")
# self.partitions_by = ['site_name', 'block']
self.partitions_by = ['site_name', 'img_type']
self.partitions_num = 10
def read_data(self):
# sql = f"select id, asin, img_vector as embedding from ods_asin_extract_features;"
sql = f"select id, img_unique, features, img_type from img_ods_features where site_name='{self.site_name}' and img_type='{self.img_type}';"
print("sql:", sql)
self.df_save = self.spark.sql(sql).cache()
self.df_save.show(10)
print(f"self.df_save.count(): {self.df_save.count()}")
        # The array type does not need to be built at this step
# partitions_num = self.df_asin_features.rdd.getNumPartitions()
# print("分区数量:", partitions_num) # 642
# # self.partitions_num = 1000
# self.df_save = self.df_save.repartition(self.partitions_num)
# print("重置分区数量:", self.partitions_num) # 642
def handle_data(self):
        # UDF that converts the string column into a list of floats
        # str_to_list_udf = F.udf(lambda s: ast.literal_eval(s), ArrayType(FloatType()))
        # # Apply the UDF to the DataFrame column
        # self.df_save = self.df_save.withColumn("embedding", str_to_list_udf(self.df_save["embedding"]))
self.df_save = self.df_save.withColumn('site_name', F.lit(self.site_name))
if __name__ == '__main__':
    site_name = sys.argv[1]  # arg 1: site
    # arg 2: image source type; falls back to the previous default when not supplied
    img_type = sys.argv[2] if len(sys.argv) > 2 else 'amazon_inv'
    handle_obj = PicturesDimFeaturesSlice(site_name=site_name, img_type=img_type)
    handle_obj.run()
import os
import socket
import sys
import threading
import logging
import time
import traceback
import uuid
import pandas as pd
import redis
import requests
from sqlalchemy import text
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s %(message)s', level=logging.INFO)
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from utils.db_util import DbTypes, DBUtil, get_redis_h14
class ImgDownload(object):
def __init__(self, site_name='us', img_type="amazon_inv", thread_num=10, limit=200):
self.site_name = site_name
self.img_type = img_type
self.thread_num = thread_num
self.limit = limit
self.engine_mysql = DBUtil.get_db_engine(db_type=DbTypes.mysql.name, site_name=self.site_name)
self.client_redis = get_redis_h14()
self.hostname = socket.gethostname()
self.first_local_dir, self.read_table = self.get_first_local_dir()
# self.read_table = f"{self.site_name}_inv_img_info"
self.local_name = self.read_table
def get_first_local_dir(self):
if self.img_type == 'amazon_self':
first_local_dir = f"/mnt/data/img_data/amazon_self/{self.site_name}"
image_table = f'{self.site_name}_self_asin_image'
elif self.img_type == 'amazon':
first_local_dir = f"/mnt/data/img_data/amazon/{self.site_name}"
image_table = f'{self.site_name}_amazon_image'
elif self.img_type == 'amazon_inv':
first_local_dir = f"/mnt/data/img_data/amazon_inv/{self.site_name}"
image_table = f'{self.site_name}_inv_img_info'
else:
first_local_dir = ""
image_table = ""
return first_local_dir, image_table
def acquire_lock(self, lock_name, timeout=100):
"""
        Try to acquire the distributed lock; returns a truthy value when the lock is set, otherwise None
        lock_name: key of the lock, recommended to keep it consistent with the task name
"""
lock_value = str(uuid.uuid4())
        lock_acquired = self.client_redis.set(lock_name, lock_value, nx=True, ex=timeout)  # the expiry is optional
# lock_acquired = self.client_redis.set(lock_name, lock_value, nx=True)
return lock_acquired, lock_value
def release_lock(self, lock_name, lock_value):
"""释放分布式锁"""
script = """
if redis.call("get", KEYS[1]) == ARGV[1] then
return redis.call("del", KEYS[1])
else
return 0
end
"""
result = self.client_redis.eval(script, 1, lock_name, lock_value)
return result
@staticmethod
def img_download(img_url, img_path, img_name):
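        # Retry the download up to 5 times; create the directory when the path is missing, sleep briefly and try again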
file_path = f"{img_path}{img_name}"
for d_num in range(5):
try:
response = requests.get(img_url)
if response.status_code == 200:
# Open a file in binary write mode
with open(file_path, 'wb') as file:
file.write(response.content)
# print("Image downloaded successfully.")
return True
else:
continue
except Exception as e:
error = "No such file or directory"
if error in str(e):
os.makedirs(img_path)
print(f"{d_num}次--下载图片失败, 图片路径: {file_path}, 图片url: {img_url}, \n错误信息: {e, traceback.format_exc()}")
time.sleep(2)
return False
def update_state(self, id_list, state, state_value="success"):
if id_list:
while True:
try:
with self.engine_mysql.begin() as conn:
id_tuple = tuple(id_list)
print(f"{state_value}--id_tuple: {len(id_tuple)}, {id_tuple[:10]}", )
if id_tuple:
id_tuple_str = f"('{id_tuple[0]}')" if len(id_tuple) == 1 else f"{id_tuple}"
sql_update = f"UPDATE {self.read_table} SET state={state} WHERE id IN {id_tuple_str};"
print("sql_update:", sql_update[:150])
conn.execute(sql_update)
break
except Exception as e:
print(f"读取数据错误: {e}", traceback.format_exc())
time.sleep(20)
self.engine_mysql = DBUtil.get_db_engine(db_type=DbTypes.mysql.name, site_name=self.site_name)
self.client_redis = get_redis_h14()
continue
def read_data(self):
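        # Claim a batch under a Redis lock: read rows with state=1, mark them state=2 so other workers skip them, release the lock, return the batch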
while True:
try:
lock_acquired, lock_value = self.acquire_lock(lock_name=self.local_name)
if lock_acquired:
print("self.hostname:", self.hostname)
with self.engine_mysql.begin() as conn:
sql_read = text(f"SELECT id, img_id, img_type, img_url, id_segment FROM {self.read_table} WHERE state=1 LIMIT {self.limit};")
df = pd.read_sql(sql=sql_read, con=self.engine_mysql)
id_tuple = tuple(df.id)
print(f"sql_read: {sql_read}, {df.shape}", id_tuple[:10])
if id_tuple:
id_tuple_str = f"('{id_tuple[0]}')" if len(id_tuple) == 1 else f"{id_tuple}"
sql_update = f"UPDATE {self.read_table} SET state=2 WHERE id IN {id_tuple_str};"
print("sql_update:", sql_update[:150])
conn.execute(sql_update)
self.release_lock(lock_name=self.local_name, lock_value=lock_value)
return df
else:
print(f"当前有其它进程占用redis的锁, 等待5秒继续获取数据")
time.sleep(10) # 等待5s继续访问锁
continue
except Exception as e:
print(f"读取数据错误: {e}", traceback.format_exc())
time.sleep(20)
self.engine_mysql = DBUtil.get_db_engine(db_type=DbTypes.mysql.name, site_name=self.site_name)
self.client_redis = get_redis_h14()
continue
def handle_data(self, df, thread_id):
        # 1. Download the images
img_success_id_list = []
img_failed_id_list = []
id_list = list(df.id)
id_len = len(id_list)
for id_segment, id, img_id, img_type, img_url in zip(df.id_segment, df.id, df.img_id, df.img_type, df.img_url):
img_path = f"{self.first_local_dir}/{id_segment}/"
img_name = f"{id_segment}_{id}_{img_id}_{img_type}.jpg"
if self.hostname not in ['hadoop5', 'hadoop6', 'hadoop7', 'hadoop8']:
img_path = img_path.replace("/mnt", "/home")
d_flag = self.img_download(img_url=img_url, img_path=img_path, img_name=img_name)
id_index = id_list.index(id)
print(f"self.hostname: {self.hostname}, 线程: {thread_id}, 是否成功: {d_flag}, id_index: {id_index}, 进度: {round(id_index/id_len * 100, 2)}%, img_path: {img_path}{img_name}")
if d_flag:
img_success_id_list.append(id)
else:
img_failed_id_list.append(id)
        # 2. Update the state -- 3 for success, 4 for failure
print(f"success: {len(img_success_id_list)}, failed: {len(img_failed_id_list)}")
self.update_state(id_list=img_success_id_list, state=3, state_value="success")
self.update_state(id_list=img_failed_id_list, state=4, state_value="failed")
def save_data(self):
pass
def run(self, thread_id=1):
while True:
try:
df = self.read_data()
if df.shape[0]:
self.handle_data(df=df, thread_id=thread_id)
self.save_data()
# break
else:
break
except Exception as e:
print(e, traceback.format_exc())
self.engine_mysql = DBUtil.get_db_engine(db_type=DbTypes.mysql.name, site_name=self.site_name)
self.client_redis = get_redis_h14()
time.sleep(20)
continue
def run_thread(self):
logging.info("所有线程处理开始")
thread_list = []
for thread_id in range(self.thread_num):
thread = threading.Thread(target=self.run, args=(thread_id, ))
thread_list.append(thread)
thread.start()
for thread in thread_list:
thread.join()
logging.info("所有线程处理完成")
if __name__ == '__main__':
# handle_obj = PicturesFeatures(self_flag='_self')
# site_name = int(sys.argv[1]) # 参数1:站点
# site_name = 'us'
# img_type = "amazon_inv"
# limit = 100
# thread_num = 1
    site_name = sys.argv[1]  # arg 1: site
    img_type = sys.argv[2]  # arg 2: image source type
    limit = int(sys.argv[3])  # arg 3: rows to read per batch -- 1000
    thread_num = int(sys.argv[4])  # arg 4: number of threads -- 5
handle_obj = ImgDownload(site_name=site_name, img_type=img_type, thread_num=thread_num, limit=limit)
# handle_obj.run()
handle_obj.run_thread()
\ No newline at end of file
import multiprocessing
import os
import sys
import time
import traceback
import pandas as pd
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
from utils.templates import Templates
from utils.db_util import DbTypes, DBUtil
class JudgeFinished(Templates):
def __init__(self, site_name='us', img_type="amazon_inv"):
super(JudgeFinished, self).__init__()
self.site_name = site_name
self.img_type = img_type
self.engine_doris = DBUtil.get_db_engine(db_type=DbTypes.doris.name, site_name=self.site_name)
self.tn_pics_hdfs_index = f"img_hdfs_index"
def judge(self):
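        # Rows still in state 1 or 2 mean there are unprocessed blocks left for this site and img_type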
sql = f"select * from {self.tn_pics_hdfs_index} where state in (1, 2) and site_name='{self.site_name}' and img_type='{self.img_type}';"
df = pd.read_sql(sql, con=self.engine_doris)
print(f"sql: {sql}, {df.shape}")
result_flag = True if df.shape[0] else False
return result_flag
def main(site_name='us', img_type='amazon_inv', p_num=0):
while True:
try:
judge_obj = JudgeFinished(site_name=site_name, img_type=img_type)
result_flag = judge_obj.judge()
if result_flag:
print(f"继续, result_flag: {result_flag}")
os.system(f"/opt/module/spark/bin/spark-submit --master yarn --driver-memory 5g --executor-memory 10g --executor-cores 2 --num-executors 1 --queue spark /opt/module/spark/demo/py_demo/img_search/img_dwd_id_index.py {site_name} {img_type}")
else:
print(f"结束, result_flag: {result_flag}")
break
except Exception as e:
print(e, traceback.format_exc())
time.sleep(20)
error = "ValueError: Length mismatch: Expected axis has 0 elements"
            if error in str(e):
print(f"当前已经跑完所有block块id对应的index关系,退出进程-{p_num}")
quit()
continue
if __name__ == "__main__":
site_name = sys.argv[1]
img_type = sys.argv[2]
    process_num = int(sys.argv[3])  # arg 3: number of processes
processes = []
    for p_num in range(process_num):  # spawn the configured number of processes
process = multiprocessing.Process(target=main, args=(site_name, img_type, p_num))
process.start()
processes.append(process)
    # Wait for all processes to finish
for process in processes:
process.join()
import os
import sys
import threading
import time
import traceback
import socket
import uuid
import numpy as np
import pandas as pd
import redis
import logging
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
sys.path.append(os.path.dirname(sys.path[0]))  # parent directory
# from utils.templates import Templates
from sqlalchemy import text
from vgg_model import VGGNet
from utils.db_util import DbTypes, DBUtil, get_redis_h14
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s %(message)s', level=logging.INFO)
class ImgExtractFeatures(object):
def __init__(self, site_name='us', img_type="amazon_inv", thread_num=10, limit=1000):
# super(ImgFeatures, self).__init__()
self.site_name = site_name
self.img_type = img_type
self.thread_num = thread_num
self.limit = limit
self.engine_doris = DBUtil.get_db_engine(db_type=DbTypes.doris.name, site_name=self.site_name)
self.client_redis = get_redis_h14()
self.local_name = f"{self.site_name}_img_features"
self.vgg_model = VGGNet()
self.hostname = socket.gethostname()
self.read_table = f"img_local_path"
self.save_table = f"img_features"
def acquire_lock(self, lock_name, timeout=100):
"""
        Try to acquire the distributed lock; returns a truthy value when the lock is set, otherwise None
        lock_name: key of the lock, recommended to keep it consistent with the task name
"""
lock_value = str(uuid.uuid4())
        lock_acquired = self.client_redis.set(lock_name, lock_value, nx=True, ex=timeout)  # the expiry is optional
# lock_acquired = self.client_redis.set(lock_name, lock_value, nx=True)
return lock_acquired, lock_value
def release_lock(self, lock_name, lock_value):
"""释放分布式锁"""
script = """
if redis.call("get", KEYS[1]) == ARGV[1] then
return redis.call("del", KEYS[1])
else
return 0
end
"""
result = self.client_redis.eval(script, 1, lock_name, lock_value)
return result
def read_data(self):
while True:
try:
lock_acquired, lock_value = self.acquire_lock(lock_name=self.local_name)
if lock_acquired:
print("self.hostname:", self.hostname)
with self.engine_doris.begin() as conn:
sql_read = text(f"SELECT id, img_unique, local_path, img_type FROM selection.{self.read_table} WHERE site_name='{self.site_name}' and img_type='{self.img_type}' and state=1 LIMIT {self.limit};")
# result = conn.execute(sql_read)
# df = pd.DataFrame(result.fetchall())
df = pd.read_sql(sql=sql_read, con=self.engine_doris)
img_unique_tuple = tuple(df.img_unique)
print(f"sql_read: {sql_read}, {df.shape}", img_unique_tuple[:10])
if img_unique_tuple:
img_unique_tuple_str = f"('{img_unique_tuple[0]}')" if len(img_unique_tuple) == 1 else f"{img_unique_tuple}"
sql_update = text(f"UPDATE selection.{self.read_table} SET state=2 WHERE img_unique IN {img_unique_tuple_str};")
print("sql_update:", sql_update)
conn.execute(sql_update)
self.release_lock(lock_name=self.local_name, lock_value=lock_value)
return df
else:
print(f"当前有其它进程占用redis的锁, 等待5秒继续获取数据")
                    time.sleep(5)  # wait 5 s, then try to acquire the lock again
continue
except Exception as e:
print(f"读取数据错误: {e}", traceback.format_exc())
time.sleep(5)
self.engine_doris = DBUtil.get_db_engine(db_type=DbTypes.doris.name, site_name=self.site_name)
continue
def handle_data(self, df, thread_id):
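        # Extract a VGG feature vector per image, falling back to a 512-dim zero vector when extraction fails, and collect rows for Doris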
id_list = list(df.id)
img_unique_list = list(df.img_unique)
local_path_list = list(df.local_path)
data_list = []
for id, img_unique, local_path in zip(id_list, img_unique_list, local_path_list):
index = id_list.index(id)
print(f"thread_id, index, id, img_unique, local_path: {thread_id, index, id, img_unique, local_path}")
if self.hostname not in ['hadoop5', 'hadoop6', 'hadoop7', 'hadoop8']:
local_path = local_path.replace("/mnt", "/home")
try:
features = self.vgg_model.vgg_extract_feat(file=local_path)
except Exception as e:
print(e, traceback.format_exc())
features = list(np.zeros(shape=(512,)))
data_list.append([id, img_unique, str(features), self.img_type, self.site_name])
columns = ['id', 'img_unique', 'features', 'img_type', 'site_name']
df_save = pd.DataFrame(data_list, columns=columns)
return df_save
def save_data(self, df):
df.to_sql(self.save_table, con=self.engine_doris, if_exists="append", index=False)
with self.engine_doris.begin() as conn:
img_unique_tuple = tuple(df.img_unique)
if img_unique_tuple:
img_unique_tuple_str = f"('{img_unique_tuple[0]}')" if len(img_unique_tuple) == 1 else f"{img_unique_tuple}"
sql_update = f"update selection.{self.read_table} set state=3 where img_unique in {img_unique_tuple_str};"
print(f"sql_update: {sql_update}")
conn.execute(sql_update)
def run(self, thread_id=1):
while True:
try:
df = self.read_data()
if df.shape[0]:
df_save = self.handle_data(df=df, thread_id=thread_id)
self.save_data(df=df_save)
# break
else:
break
except Exception as e:
print(e, traceback.format_exc())
self.engine_doris = DBUtil.get_db_engine(db_type=DbTypes.doris.name, site_name=self.site_name)
self.client_redis = get_redis_h14()
self.vgg_model = VGGNet()
time.sleep(20)
continue
def run_thread(self):
thread_list = []
for thread_id in range(self.thread_num):
thread = threading.Thread(target=self.run, args=(thread_id, ))
thread_list.append(thread)
thread.start()
for thread in thread_list:
thread.join()
logging.info("所有线程处理完成")
if __name__ == '__main__':
# handle_obj = PicturesFeatures(self_flag='_self')
# site_name = int(sys.argv[1]) # 参数1:站点
# site_name = 'us'
# img_type = "amazon_inv"
# limit = 100
# thread_num = 1
    site_name = sys.argv[1]  # arg 1: site
    img_type = sys.argv[2]  # arg 2: image source type
    limit = int(sys.argv[3])  # arg 3: rows to read per batch -- 1000
    thread_num = int(sys.argv[4])  # arg 4: number of threads -- 5
handle_obj = ImgExtractFeatures(site_name=site_name, img_type=img_type, thread_num=thread_num, limit=limit)
# handle_obj.run()
handle_obj.run_thread()
\ No newline at end of file