Commit cc38b788 by hejiangming

Merge branch 'developer' of http://47.106.101.75/abel_cjy/Amazon-Selection-Data into developer

parents d27cbb47 621ded14
......@@ -150,6 +150,13 @@ class DwtAsinSync(Templates):
# Asins from the previous month's dwt_flow_asin with asin_bought_month > 0, tagged asin_flag=4.
sql_asin_flow = f"select asin, 4 as asin_flag from dwt_flow_asin where site_name='{self.site_name}' and date_type='{self.date_type}' and date_info='{prev_month}' and asin_bought_month>0"
self.df_asin_flow = self.read_data_common(sql=sql_asin_flow, content="1.4. 读取dwt_flow_asin表的asin数据")
# Asins from ods_asin_detail_product for the current period, also tagged asin_flag=4
# (per the content label these are presumably the shops' first-page asins — confirm upstream).
sql_fb_top_asin = f"select asin, 4 as asin_flag from ods_asin_detail_product where site_name='{self.site_name}' and date_type='{self.date_type}' and date_info='{self.date_info}'"
df_fb_top_asin = self.read_data_common(sql=sql_fb_top_asin, content="1.5. 读取店铺第一页asin数据")
# Union the two asin sources (tolerating column differences) and keep one row per asin.
self.df_asin_flow = self.df_asin_flow.unionByName(
df_fb_top_asin, allowMissingColumns=True
).drop_duplicates(['asin'])
# Read asin variation attribute: 1 marks an asin present in dim_asin_variation_info.
sql_asin_variation = f"""select asin, 1 as asin_is_variation from dim_asin_variation_info where site_name="{self.site_name}";"""
self.df_asin_variation = self.read_data_common(sql=sql_asin_variation, content="2.1 读取dim_asin_variation_info表的asin变体属性")
......
import os
import requests
from contextlib import ExitStack

# Image-search service endpoint.
url = 'http://192.168.10.217:10001/img_search'

# Local image files to upload (several files per request are supported).
file_paths = [
    'D:\Amazon-Selection\pyspark_job\image_search\img/1.jpg',
    # 'D:\Amazon-Selection\pyspark_job\image_search\img/1.png',
    'D:\Amazon-Selection\pyspark_job\image_search\img/2.png',
    # r'D:\Amazon-Selection\pyspark_job\image_search\img\t1_1.png',
    r'D:\Amazon-Selection\pyspark_job\image_search\img\t1_2.png',
]

# Form fields sent alongside the files.
# NOTE(fix): this dict previously had a missing comma (SyntaxError) and
# duplicated 'search_key'/'search_value'/'top_k' keys from a bad merge.
data = {
    'site_name': 'us',
    'img_type': 'amazon_inv',
    'search_key': 'file',   # query by uploaded file
    'search_value': '',     # ignored for file-based queries
    'top_k': '5',           # number of results to return
}

# Open every file inside an ExitStack so all handles are closed even if the
# request raises. (The old code opened unnamed handles, never closed them,
# and issued a duplicate requests.post before this one.)
with ExitStack() as stack:
    files = []
    for idx, file_path in enumerate(file_paths, start=1):
        files.append(
            (
                f'file{idx}',
                (
                    os.path.basename(file_path),
                    stack.enter_context(open(file_path, 'rb')),
                    'image/png',
                )
            )
        )
    # Single multipart POST; timeout keeps a dead server from hanging us.
    response = requests.post(url, files=files, data=data, timeout=120)

if response.status_code == 200:
    # Expected payload: a list of match dicts; print each hit.
    result_list = response.json()
    for item in result_list:
        print("id:", item.get("id"))
        print("img_id:", item.get("img_id"))
        print("img_type:", item.get("img_type"))
        print("similarity:", item.get("similarity"))
        print("img_url:", item.get("img_url"))
        print("-" * 80)
else:
    print(f"Error: {response.status_code}, {response.text}")
\ No newline at end of file
import os
import sys
import time
import traceback
sys.path.append(os.path.dirname(sys.path[0]))
from utils.secure_db_client import get_remote_engine
......@@ -42,10 +43,10 @@ def update_workflow_manager(site_name, date_type, date_info):
'month',
'{date_info}',
{priority},
'us_spider_asin',
'{site_name}_spider_asin',
'yes',
1,
'us_all_cal',
'{site_name}_all_cal',
1
)
ON DUPLICATE KEY UPDATE
......@@ -54,9 +55,10 @@ def update_workflow_manager(site_name, date_type, date_info):
"""
print(f"workflow_manager进度表---重置爬虫的asin抓取进度: {update_sql_workflow}")
conn.execute(update_sql_workflow)
break
except Exception as e:
print(f"失败:workflow_manager进度表---重置爬虫的asin抓取进度: {update_sql_workflow}, {e}, 报错信息: {traceback.format_exc()}")
time.sleep(300)
continue
......@@ -97,4 +99,5 @@ if __name__ == '__main__':
date_type = sys.argv[2]  # arg 2: period type: week/4_week/month/quarter/day
date_info = sys.argv[3]  # arg 3: year-week/year-month/year-quarter/year-month-day, e.g. 2022-1
export_data(site_name, date_type, date_info)
# Now runs for every site/date_type; the commented call below shows it was
# previously limited to us/uk/de monthly runs.
update_workflow_manager(site_name, date_type, date_info)
# if site_name in ["us", "uk", "de"] and date_type == "month":
# update_workflow_manager(site_name, date_type, date_info)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment