Commit cc38b788 by hejiangming

Merge branch 'developer' of http://47.106.101.75/abel_cjy/Amazon-Selection-Data into developer

parents d27cbb47 621ded14
...@@ -150,6 +150,13 @@ class DwtAsinSync(Templates): ...@@ -150,6 +150,13 @@ class DwtAsinSync(Templates):
sql_asin_flow = f"select asin, 4 as asin_flag from dwt_flow_asin where site_name='{self.site_name}' and date_type='{self.date_type}' and date_info='{prev_month}' and asin_bought_month>0" sql_asin_flow = f"select asin, 4 as asin_flag from dwt_flow_asin where site_name='{self.site_name}' and date_type='{self.date_type}' and date_info='{prev_month}' and asin_bought_month>0"
self.df_asin_flow = self.read_data_common(sql=sql_asin_flow, content="1.4. 读取dwt_flow_asin表的asin数据") self.df_asin_flow = self.read_data_common(sql=sql_asin_flow, content="1.4. 读取dwt_flow_asin表的asin数据")
sql_fb_top_asin = f"select asin, 4 as asin_flag from ods_asin_detail_product where site_name='{self.site_name}' and date_type='{self.date_type}' and date_info='{self.date_info}'"
df_fb_top_asin = self.read_data_common(sql=sql_fb_top_asin, content="1.5. 读取店铺第一页asin数据")
self.df_asin_flow = self.df_asin_flow.unionByName(
df_fb_top_asin, allowMissingColumns=True
).drop_duplicates(['asin'])
# 读取asin属性值 # 读取asin属性值
sql_asin_variation = f"""select asin, 1 as asin_is_variation from dim_asin_variation_info where site_name="{self.site_name}";""" sql_asin_variation = f"""select asin, 1 as asin_is_variation from dim_asin_variation_info where site_name="{self.site_name}";"""
self.df_asin_variation = self.read_data_common(sql=sql_asin_variation, content="2.1 读取dim_asin_variation_info表的asin变体属性") self.df_asin_variation = self.read_data_common(sql=sql_asin_variation, content="2.1 读取dim_asin_variation_info表的asin变体属性")
......
import os
import requests import requests
from contextlib import ExitStack
# 服务器地址
url = 'http://192.168.10.217:10001/img_search' url = 'http://192.168.10.217:10001/img_search'
# 文件路径列表,上传多个文件
file_paths = [ file_paths = [
'D:\Amazon-Selection\pyspark_job\image_search\img/1.jpg', # r'D:\Amazon-Selection\pyspark_job\image_search\img\t1_1.png',
# 'D:\Amazon-Selection\pyspark_job\image_search\img/1.png', r'D:\Amazon-Selection\pyspark_job\image_search\img\t1_2.png',
'D:\Amazon-Selection\pyspark_job\image_search\img/2.png',
] ]
# 将多个文件添加到请求中
files = [('file', (open(file_path, 'rb'))) for file_path in file_paths]
# 其他表单数据
data = { data = {
'site_name': 'us', 'site_name': 'us',
'img_type': 'amazon_inv', 'img_type': 'amazon_inv',
'search_key': 'file', # 使用文件方式进行查询 'search_key': 'file',
'search_value': '', # 在图片查询中可忽略 'search_value': '',
'top_k': 5 # 设置查询结果的返回数量 'top_k': '5'
} }
# 发送请求 with ExitStack() as stack:
response = requests.post(url, files=files, data=data) files = []
for idx, file_path in enumerate(file_paths, start=1):
files.append(
(
f'file{idx}',
(
os.path.basename(file_path),
stack.enter_context(open(file_path, 'rb')),
'image/png'
)
)
)
response = requests.post(url, files=files, data=data, timeout=120)
# 打印响应 # print(response.status_code)
# print(response.text)
# if response.status_code == 200:
# print(response.json())
if response.status_code == 200: if response.status_code == 200:
print(response.json()) result_list = response.json()
for item in result_list:
print("id:", item.get("id"))
print("img_id:", item.get("img_id"))
print("img_type:", item.get("img_type"))
print("similarity:", item.get("similarity"))
print("img_url:", item.get("img_url"))
print("-" * 80)
else: else:
print(f"Error: {response.status_code}, {response.text}") print(f"Error: {response.status_code}, {response.text}")
\ No newline at end of file
import os import os
import sys import sys
import time import time
import traceback
sys.path.append(os.path.dirname(sys.path[0])) sys.path.append(os.path.dirname(sys.path[0]))
from utils.secure_db_client import get_remote_engine from utils.secure_db_client import get_remote_engine
...@@ -42,10 +43,10 @@ def update_workflow_manager(site_name, date_type, date_info): ...@@ -42,10 +43,10 @@ def update_workflow_manager(site_name, date_type, date_info):
'month', 'month',
'{date_info}', '{date_info}',
{priority}, {priority},
'us_spider_asin', '{site_name}_spider_asin',
'yes', 'yes',
1, 1,
'us_all_cal', '{site_name}_all_cal',
1 1
) )
ON DUPLICATE KEY UPDATE ON DUPLICATE KEY UPDATE
...@@ -54,9 +55,10 @@ def update_workflow_manager(site_name, date_type, date_info): ...@@ -54,9 +55,10 @@ def update_workflow_manager(site_name, date_type, date_info):
""" """
print(f"workflow_manager进度表---重置爬虫的asin抓取进度: {update_sql_workflow}") print(f"workflow_manager进度表---重置爬虫的asin抓取进度: {update_sql_workflow}")
conn.execute(update_sql_workflow) conn.execute(update_sql_workflow)
break
except Exception as e: except Exception as e:
print(f"失败:workflow_manager进度表---重置爬虫的asin抓取进度: {update_sql_workflow}, {e}, 报错信息: {traceback.format_exc()}")
time.sleep(300) time.sleep(300)
continue continue
...@@ -97,4 +99,5 @@ if __name__ == '__main__': ...@@ -97,4 +99,5 @@ if __name__ == '__main__':
date_type = sys.argv[2] # 参数2:类型:week/4_week/month/quarter/day date_type = sys.argv[2] # 参数2:类型:week/4_week/month/quarter/day
date_info = sys.argv[3] # 参数3:年-周/年-月/年-季/年-月-日, 比如: 2022-1 date_info = sys.argv[3] # 参数3:年-周/年-月/年-季/年-月-日, 比如: 2022-1
export_data(site_name, date_type, date_info) export_data(site_name, date_type, date_info)
update_workflow_manager(site_name, date_type, date_info) # if site_name in ["us", "uk", "de"] and date_type == "month":
# update_workflow_manager(site_name, date_type, date_info)
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment