import json
import random
import time
import traceback
import urllib.parse

import pandas as pd
from playwright.sync_api import sync_playwright
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL
# Opportunity Explorer crawler: searches niches by keyword.


class One688LoginSpider(object):
    """Crawl Seller Central Opportunity Explorer niches for stored keywords.

    Keywords are read from the MySQL table
    ``seller_theme_labe_opportunity_syn`` (rows with ``state=1``). Each keyword
    is opened in a Chrome persistent context, the page's ``/ox-api/graphql``
    response is captured, its niches are appended to the PostgreSQL table
    ``seller_search_term_opportunity``, and the keyword row's ``state`` is
    updated (3 when niches were stored, 4 when nothing was captured).
    """

    # Substring identifying the requests whose responses are captured.
    _GRAPHQL_PATH = '/ox-api/graphql'
    # Search page URL; the URL-encoded keyword is appended to it.
    _SEARCH_URL = ("https://sellercentral.amazon.com/opportunity-explorer/search"
                   "?mons_redirect=stck_reroute&search=")
    # (target table column, ``nicheSummary`` key) pairs, in table column order.
    _SUMMARY_FIELDS = (
        ('product_count', 'productCount'),
        ('search_volume_t360', 'searchVolumeT360'),
        ('search_volume_t90', 'searchVolumeT90'),
        ('search_volume_growth_t180', 'searchVolumeGrowthT180'),
        ('search_volume_growth_t90', 'searchVolumeGrowthT90'),
        ('minimum_units_sold_t90', 'minimumUnitsSoldT90'),
        ('maximum_units_sold_t90', 'maximumUnitsSoldT90'),
        ('minimum_units_sold_t360', 'minimumUnitsSoldT360'),
        ('maximum_units_sold_t360', 'maximumUnitsSoldT360'),
        ('minimum_average_units_sold_t360', 'minimumAverageUnitsSoldT360'),
        ('maximum_average_units_sold_t360', 'maximumAverageUnitsSoldT360'),
        ('minimum_price', 'minimumPrice'),
        ('maximum_price', 'maximumPrice'),
        ('avg_price', 'avgPrice'),
        ('avg_price_t360', 'avgPriceT360'),
        ('demand', 'demand'),
        ('category', 'category'),
        ('return_rate_t360', 'returnRateT360'),
        ('search_volume_growth_t360', 'searchVolumeGrowthT360'),
    )

    def __init__(self, site='us'):
        """Store the site code, compute the ``YYYY-MM`` tag and connect.

        Args:
            site: Site code; ``'us'`` selects database ``selection``, any other
                value selects ``selection_<site>``.
        """
        self.site = site
        # Last captured GraphQL payload, or None when nothing was captured.
        self.data = None
        # Playwright page; created in main().
        self.page = None
        # URL substring matched by print_request_finished().
        self.post_url = self._GRAPHQL_PATH
        # Single clock read so year and month cannot straddle a boundary.
        self.y_w = time.strftime("%Y-%m")
        self.mysql_connect()

    def mysql_connect(self):
        """(Re)create the MySQL and PostgreSQL engines for the current site.

        Engines left over from an earlier call are disposed first so repeated
        calls do not accumulate connection pools.

        Returns:
            The MySQL engine (also stored as ``self.engine_us_mysql``); the
            PostgreSQL engine is stored as ``self.engine_pg``.
        """
        for attr in ("engine_us_mysql", "engine_pg"):
            stale_engine = getattr(self, attr, None)
            if stale_engine is not None:
                stale_engine.dispose()

        if self.site == 'us':
            db = 'selection'
        else:
            db = f'selection_{self.site}'
        # NOTE(review): credentials are hard-coded in source; they should be
        # moved to configuration/secret storage and rotated.
        DB_CONN_DICT = {
            "mysql_port": 3306,
            "mysql_user": "XP_Yswg2025_PY",
            "mysql_pwd": "Gd1pGJog1ysLMLBdML8w81",
            "mysql_host": "rm-wz9yg9bsb2zf01ea4yo.mysql.rds.aliyuncs.com",
        }
        # URL.create takes the raw password, so special characters such as
        # '@' need no manual escaping.
        mysql_url = URL.create(
            drivername="mysql+pymysql",
            username=DB_CONN_DICT["mysql_user"],
            password=DB_CONN_DICT["mysql_pwd"],
            host=DB_CONN_DICT["mysql_host"],
            port=int(DB_CONN_DICT["mysql_port"]),
            database=db,
            query={"charset": "utf8mb4"}
        )
        self.engine_us_mysql = create_engine(mysql_url)

        # Built with URL.create for the same reason as above. The legacy
        # ``encoding`` argument is not passed: it was deprecated in
        # SQLAlchemy 1.4 and removed in 2.0.
        pg_url = URL.create(
            drivername="postgresql+psycopg2",
            username="postgres",
            password="F9kL2sXe81rZq",
            host="113.100.143.162",
            port=5432,
            database=db,
        )
        self.engine_pg = create_engine(pg_url)
        return self.engine_us_mysql

    def is_target_response(self, data):
        """Return True when ``data`` is non-empty and its text contains '.jpg'.

        This is a heuristic on the stringified payload; adjust it to the real
        response structure if needed.
        """
        return bool(data) and '.jpg' in str(data)

    def print_request_finished(self, request):
        """``requestfinished`` handler that captures the target payload.

        Only requests whose URL contains ``self.post_url`` and whose response
        status is 200 are inspected. A payload accepted by
        ``is_target_response`` is stored in ``self.data``. A response that
        cannot be parsed is logged and ignored, so it cannot discard a payload
        captured earlier.
        """
        if self.post_url not in request.url:
            return
        response = request.response()
        # Ignore missing responses and anything that is not a plain 200.
        if not response or response.status != 200:
            return
        try:
            payload = response.json()
        except Exception as e:
            print('拦截url报错：', e, f"\n{traceback.format_exc()}")
            return
        if self.is_target_response(payload):
            self.data = payload
            print("捕获的数据：", self.data)

    def select_category_json(self):
        """Load pending keywords (``state=1``) from MySQL.

        Returns:
            A list of ``[search_term, id]`` pairs with ``id`` as a string.
            Rows whose ``search_term`` is NULL are skipped.
        """
        sql = 'SELECT search_term,id FROM seller_theme_labe_opportunity_syn where state=1'
        engine_mysql = self.mysql_connect()
        df_keywords = pd.read_sql(sql, con=engine_mysql)
        df_keywords = df_keywords.dropna(subset=['search_term'])
        # Pair the columns directly instead of joining and re-splitting on a
        # delimiter, which would break on terms containing that delimiter.
        return [[str(term), str(row_id)]
                for term, row_id in zip(df_keywords['search_term'], df_keywords['id'])]

    def get_category(self):
        """Crawl every pending keyword, store its niches and update its state.

        Each keyword is attempted up to three times. On success the niches are
        appended to ``seller_search_term_opportunity`` and the keyword row is
        set to state 3; when no payload was captured the row is set to
        state 4. An attempt that raises is logged and retried after a random
        3-7 s pause.
        """
        columns = ['key_word', 'category_data_json', 'niche_title',
                   *(column for column, _ in self._SUMMARY_FIELDS),
                   'site', 'date_info', 'search_term']
        for data in self.select_category_json():
            print('搜索词：', data)
            key_word, row_id = data
            print('搜索关键词：', key_word)
            for _attempt in range(3):
                try:
                    full_url = f"{self._SEARCH_URL}{urllib.parse.quote(key_word.strip())}"
                    print('分类url：', full_url)
                    self.post_url = self._GRAPHQL_PATH
                    # Drop any payload left from an earlier keyword or failed
                    # attempt so it cannot be stored under this keyword.
                    self.data = None
                    self.crawl(full_url)
                    if self.data:
                        self.mysql_connect()
                        rows = [self._niche_to_row(key_word, niche)
                                for niche in self.data['data']['niches']]
                        df_category_data = pd.DataFrame(data=rows, columns=columns)
                        df_category_data.to_sql('seller_search_term_opportunity', con=self.engine_pg,
                                                if_exists='append', index=False)
                        print('存储成功：', len(rows))
                        self._set_keyword_state(row_id, 3)
                        self.data = None
                    else:
                        self._set_keyword_state(row_id, 4)
                    break
                except Exception as e:
                    print("并等待5s继续", e, f"\n{traceback.format_exc()}")
                    time.sleep(random.uniform(3, 7))
                    print('报错重试')
            # Random pause between keywords.
            time.sleep(random.uniform(3, 7))

    def _niche_to_row(self, key_word, niche):
        """Flatten one niche dict into a row matching the target table columns."""
        niche_json = json.dumps(niche)
        print('json_data:::', niche_json)
        summary = niche['nicheSummary']
        search_terms = [metric['searchTerm'] for metric in niche['topSearchTermMetrics']]
        popular_search = ' && '.join(search_terms) if search_terms else None
        return [key_word, niche_json, niche['nicheTitle'],
                *(summary[key] for _, key in self._SUMMARY_FIELDS),
                self.site, self.y_w, popular_search]

    def _set_keyword_state(self, row_id, state):
        """Set ``state`` on the keyword row ``row_id`` in MySQL."""
        row_id = int(row_id)
        print('修改状态：',
              f"update seller_theme_labe_opportunity_syn set state={state} where id={row_id};")
        # text() with bound parameters is accepted by both SQLAlchemy 1.4 and
        # 2.0; passing a plain string to execute() is not supported in 2.0.
        statement = text(
            "update seller_theme_labe_opportunity_syn set state=:state where id=:row_id")
        with self.engine_us_mysql.begin() as conn:
            conn.execute(statement, {"state": state, "row_id": row_id})

    def crawl(self, url):
        """Open ``url`` and wait 15 s while finished requests are inspected."""
        # NOTE(review): the handler is registered on every call; confirm that
        # Playwright does not invoke it more than once per request.
        self.page.on("requestfinished", self.print_request_finished)
        self.page.goto(url)
        # Fixed wait to give the page time to issue its API requests.
        self.page.wait_for_timeout(15000)

    def run(self):
        """Open the Seller Central home page, wait 20 s, then crawl keywords."""
        self.page.goto('https://sellercentral.amazon.com/gp/homepage.html/ref=xx_home_logo_xx?')
        self.page.wait_for_timeout(20000)
        self.get_category()

    def main(self):
        """Launch Chrome with the local user profile and run the crawl.

        The browser context is closed even when the crawl raises.
        """
        with sync_playwright() as _playwright:
            context = _playwright.chromium.launch_persistent_context(
                # Local Chrome user-data directory.
                user_data_dir=r"C:\Users\Administrator\AppData\Local\Google\Chrome\User Data",
                # Path of the local Chrome executable.
                executable_path=r"C:\Program Files\Google\Chrome\Application\chrome.exe",
                # Must be enabled to download files; the default is False.
                accept_downloads=True,
                # Run with a visible window (not headless).
                headless=False,
                bypass_csp=True,
                locale='en-GB',
                ignore_https_errors=True,
                no_viewport=True,
                slow_mo=10,
                # Reduce automation detection.
                args=['--disable-blink-features=AutomationControlled', '--remote-debugging-port=9222']
            )
            try:
                self.page = context.new_page()
                # One registration is enough: an init script is evaluated in
                # every new document of this page.
                js = """
                    Object.defineProperties(navigator, {webdriver:{get:()=>undefined}});
                    """
                self.page.add_init_script(js)
                # NOTE(review): evaluate_handle runs once against the currently
                # loaded document; presumably these two patches do not survive
                # the next navigation — confirm whether they should be init
                # scripts as well.
                self.page.evaluate_handle('''() =>{ window.chrome = { runtime: {}, }; }''')
                self.page.evaluate_handle(
                    '''() =>{ Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5,6], }); }''')
                self.page.locator("body").click()

                self.run()
            finally:
                # Closing the context closes all pages that belong to it.
                context.close()


if __name__ == '__main__':
    # Script entry point: build the spider for the default site and run it.
    One688LoginSpider().main()
