Commit cc38b788 by hejiangming

Merge branch 'developer' of http://47.106.101.75/abel_cjy/Amazon-Selection-Data into developer

parents d27cbb47 621ded14
......@@ -150,6 +150,13 @@ class DwtAsinSync(Templates):
# Asins from the previous month's dwt_flow_asin with asin_bought_month > 0, tagged asin_flag=4.
sql_asin_flow = f"select asin, 4 as asin_flag from dwt_flow_asin where site_name='{self.site_name}' and date_type='{self.date_type}' and date_info='{prev_month}' and asin_bought_month>0"
self.df_asin_flow = self.read_data_common(sql=sql_asin_flow, content="1.4. 读取dwt_flow_asin表的asin数据")
# Asins from ods_asin_detail_product for the current period, also tagged asin_flag=4
# (per the content label these are presumably the shops' first-page asins — confirm upstream).
sql_fb_top_asin = f"select asin, 4 as asin_flag from ods_asin_detail_product where site_name='{self.site_name}' and date_type='{self.date_type}' and date_info='{self.date_info}'"
df_fb_top_asin = self.read_data_common(sql=sql_fb_top_asin, content="1.5. 读取店铺第一页asin数据")
# Union the two asin sources (tolerating column differences) and keep one row per asin.
self.df_asin_flow = self.df_asin_flow.unionByName(
df_fb_top_asin, allowMissingColumns=True
).drop_duplicates(['asin'])
# Read asin variation attribute: 1 marks an asin present in dim_asin_variation_info.
sql_asin_variation = f"""select asin, 1 as asin_is_variation from dim_asin_variation_info where site_name="{self.site_name}";"""
self.df_asin_variation = self.read_data_common(sql=sql_asin_variation, content="2.1 读取dim_asin_variation_info表的asin变体属性")
......
import os
import requests
from contextlib import ExitStack

# Image-search service endpoint.
url = 'http://192.168.10.217:10001/img_search'

# Local image files to upload (several files per request are supported).
file_paths = [
    'D:\Amazon-Selection\pyspark_job\image_search\img/1.jpg',
    # 'D:\Amazon-Selection\pyspark_job\image_search\img/1.png',
    'D:\Amazon-Selection\pyspark_job\image_search\img/2.png',
    # r'D:\Amazon-Selection\pyspark_job\image_search\img\t1_1.png',
    r'D:\Amazon-Selection\pyspark_job\image_search\img\t1_2.png',
]

# Form fields sent alongside the files.
# NOTE(fix): this dict previously had a missing comma (SyntaxError) and
# duplicated 'search_key'/'search_value'/'top_k' keys from a bad merge.
data = {
    'site_name': 'us',
    'img_type': 'amazon_inv',
    'search_key': 'file',   # query by uploaded file
    'search_value': '',     # ignored for file-based queries
    'top_k': '5',           # number of results to return
}

# Open every file inside an ExitStack so all handles are closed even if the
# request raises. (The old code opened unnamed handles, never closed them,
# and issued a duplicate requests.post before this one.)
with ExitStack() as stack:
    files = []
    for idx, file_path in enumerate(file_paths, start=1):
        files.append(
            (
                f'file{idx}',
                (
                    os.path.basename(file_path),
                    stack.enter_context(open(file_path, 'rb')),
                    'image/png',
                )
            )
        )
    # Single multipart POST; timeout keeps a dead server from hanging us.
    response = requests.post(url, files=files, data=data, timeout=120)

if response.status_code == 200:
    # Expected payload: a list of match dicts; print each hit.
    result_list = response.json()
    for item in result_list:
        print("id:", item.get("id"))
        print("img_id:", item.get("img_id"))
        print("img_type:", item.get("img_type"))
        print("similarity:", item.get("similarity"))
        print("img_url:", item.get("img_url"))
        print("-" * 80)
else:
    print(f"Error: {response.status_code}, {response.text}")
\ No newline at end of file
import os
import sys
import time
import traceback
sys.path.append(os.path.dirname(sys.path[0]))
from utils.secure_db_client import get_remote_engine
......@@ -42,10 +43,10 @@ def update_workflow_manager(site_name, date_type, date_info):
'month',
'{date_info}',
{priority},
'us_spider_asin',
'{site_name}_spider_asin',
'yes',
1,
'us_all_cal',
'{site_name}_all_cal',
1
)
ON DUPLICATE KEY UPDATE
......@@ -54,9 +55,10 @@ def update_workflow_manager(site_name, date_type, date_info):
"""
print(f"workflow_manager进度表---重置爬虫的asin抓取进度: {update_sql_workflow}")
conn.execute(update_sql_workflow)
break
except Exception as e:
print(f"失败:workflow_manager进度表---重置爬虫的asin抓取进度: {update_sql_workflow}, {e}, 报错信息: {traceback.format_exc()}")
time.sleep(300)
continue
......@@ -97,4 +99,5 @@ if __name__ == '__main__':
date_type = sys.argv[2]  # arg 2: period type: week/4_week/month/quarter/day
date_info = sys.argv[3]  # arg 3: year-week/year-month/year-quarter/year-month-day, e.g. 2022-1
export_data(site_name, date_type, date_info)
# Now runs for every site/date_type; the commented call below shows it was
# previously limited to us/uk/de monthly runs.
update_workflow_manager(site_name, date_type, date_info)
# if site_name in ["us", "uk", "de"] and date_type == "month":
# update_workflow_manager(site_name, date_type, date_info)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment