import sys
import os

sys.path.append(os.path.dirname(sys.path[0]))  # 上级目录
from curl_cffi import requests
from utils.db_connect import BaseUtils
import time
import random
import pandas as pd
import json
from lxml import etree
from urllib.parse import urlencode
import html
import re
from html import unescape
import urllib.parse
from sqlalchemy import text

class recall_cases():

    def __init__(self):
        """
        Initialise crawl targets, result buffers, the DB handle and the week tag.

        Attributes set here:
          - ec_europa_uk_url / uk_drug_device_url: gov.uk listing URLs.
          - ec_europa_uk_data_list / uk_drug_device: empty result buffers.
          - mysql_db: assigned by ``mysql_connect1``.
          - yer_week: ``"<year>_<week>"`` built from the local clock, where
            week is ``%W`` (week of the year, Monday as first day).
        """
        self.ec_europa_uk_url = 'https://www.gov.uk/product-safety-alerts-reports-recalls'
        self.ec_europa_uk_data_list = []
        self.uk_drug_device = []
        self.uk_drug_device_url = 'https://www.gov.uk/drug-device-alerts'
        self.mysql_connect1()
        # Take year and week from a single strftime call: the year used to be
        # hard-coded to 2025, which mislabels every run in any other year.
        self.yer_week = time.strftime("%Y_%W")

    def mysql_connect1(self):
        """Create a MySQL handle through ``BaseUtils`` and keep it on ``self.mysql_db``."""
        db_utils = BaseUtils()
        self.mysql_db = db_utils.mysql_connect()

    def _parse_date_str(self, date_str):
        """
        Normalise a human-readable date to ``YYYY-MM-DD``.

        Accepted layouts (commas optional, English month names matched
        case-insensitively, three-letter abbreviations allowed):
          - month first: "十月 04, 2011" or "September 28, 2011"
          - day first:   "18 February 2025"

        :param date_str: date text made of exactly three parts separated by
            whitespace and/or commas.
        :return: the date as a zero-padded ``YYYY-MM-DD`` string.
        :raises ValueError: if the input is not a string, does not have three
            parts, names an unknown month, has a non-numeric day/year, or is
            not a real calendar date (e.g. 31 February).
        """
        # Function-scope import so the block brings its own dependency.
        from datetime import date as _calendar_date

        english_months = (
            "January", "February", "March", "April", "May", "June",
            "July", "August", "September", "October", "November", "December",
        )
        chinese_months = (
            "一月", "二月", "三月", "四月", "五月", "六月",
            "七月", "八月", "九月", "十月", "十一月", "十二月",
        )
        # Map every accepted spelling to its month number.
        month_mapping = {}
        for number, (en_name, zh_name) in enumerate(zip(english_months, chinese_months), start=1):
            month_mapping[en_name.lower()] = number
            month_mapping[en_name[:3].lower()] = number
            month_mapping[zh_name] = number

        if not isinstance(date_str, str):
            raise ValueError(f"无法解析日期格式: {date_str}")

        # Treat commas (ASCII and full-width) as separators so that
        # "September 28,2011" still splits into three parts.
        date_parts = date_str.replace(",", " ").replace("，", " ").split()
        if len(date_parts) != 3:
            raise ValueError(f"无法解析日期格式: {date_str}")

        # A leading number means "day month year"; otherwise "month day year".
        if date_parts[0].isdigit():
            day, month, year = date_parts
        else:
            month, day, year = date_parts

        month_num = month_mapping.get(month.lower().rstrip("."))
        if month_num is None:
            raise ValueError(f"未知月份: {month}")

        # Building a real date rejects non-numeric parts and impossible days
        # instead of returning a malformed string.
        try:
            parsed = _calendar_date(int(year), month_num, int(day))
        except ValueError as exc:
            raise ValueError(f"无法解析日期格式: {date_str}") from exc
        return parsed.isoformat()

    def _request(self, headers=None, url=None, data_type=None):
        """
        GET ``url`` with up to five attempts and return the response.

        After each download the body is parsed as HTML and queried with XPath
        as a sanity check, so an unparseable body counts as a failed attempt.
        The check is skipped only when ``data_type`` is set and the body
        contains the text ``country``.

        :param headers: request headers passed straight to ``requests.get``.
        :param url: address to fetch.
        :param data_type: ``None`` for HTML pages; any other value enables the
            ``country`` shortcut described above.
        :return: the response object, or ``None`` when all attempts failed.
            Callers must handle ``None``.
        """
        max_attempts = 5
        for attempt in range(1, max_attempts + 1):
            try:
                response = requests.get(url, headers=headers, impersonate="chrome120", timeout=120)
                if data_type is None or 'country' not in response.text:
                    # Raises on an empty or unparseable body, which triggers a retry.
                    etree.HTML(response.text).xpath('//div')
                return response
            except Exception as exc:
                # ``Exception`` (not a bare except) keeps Ctrl-C and SystemExit working.
                print(f'request failed ({attempt}/{max_attempts}): {url} -> {exc!r}')
                if attempt < max_attempts:
                    time.sleep(random.uniform(1, 5))
        return None

    def _parse_details(self, headers, a_href):
        """
        Fetch one CPSC recall detail page and extract its fields.

        :param headers: headers of the listing request. They are reused
            without ``Referer``; the caller's dict is left untouched.
        :param a_href: absolute URL of the detail page.
        :return: ``['us_recalls_product', recall_date, product_title, hazard,
            image_url, a_href, brand]`` where missing fields are ``None``, or
            ``None`` when the page could not be fetched or has no content.
        :raises ValueError: propagated from ``_parse_date_str`` when the
            recall date text is not in a supported format.
        """
        # Send a copy without Referer instead of deleting the key from the caller's dict.
        detail_headers = {key: value for key, value in headers.items() if key != 'Referer'}
        response = self._request(headers=detail_headers, url=a_href)
        if response is None:
            # _request gave up after its retries.
            return None

        response_detail = etree.HTML(response.text)
        # Explicit form of the former ``if response_detail:`` test: lxml
        # elements are falsy when they have no children, and that implicit
        # truth test is deprecated.
        if response_detail is None or len(response_detail) == 0:
            return None

        recall_date_list = response_detail.xpath("//div[contains(text(),'Recall Date:')]/parent::div/text()")
        product_title_list = response_detail.xpath("//div[contains(text(),'Name of Product:')]/parent::div/text()")
        hazard_list = response_detail.xpath("//div[contains(text(),'危险:')]/parent::div//p//text()")
        image_url_list = response_detail.xpath("//div[@id='recall-gallery-img']//li/img/@src")

        # Brand: the runs of Latin words (letters, '-' and '®') found in the
        # product name, joined with commas.
        brand = None
        if product_title_list:
            matches = re.findall(r'[A-Za-z\-®]+(?: [A-Za-z\-®]+)*', product_title_list[-1].strip())
            if matches:
                brand = ','.join(matches)

        recall_date = recall_date_list[-1].strip() if recall_date_list else None  # recall date
        product_title = product_title_list[-1].strip() if product_title_list else None  # product title
        hazard = ''.join(hazard_list) if hazard_list else None  # hazard description
        image_url = ('https://www.cpsc.gov' + image_url_list[0].strip()) if image_url_list else None  # first image
        if recall_date:
            recall_date = self._parse_date_str(recall_date)
        return ['us_recalls_product', recall_date, product_title, hazard, image_url, a_href, brand]

    def us_recalls(self):
        """
        Crawl the CPSC recall listing (zh-CN) page by page and store each recall.

        For every entry on a listing page the detail page is parsed with
        ``_parse_details`` and the row is appended to ``recall_cases_data``.
        The crawl stops when a listing page cannot be fetched, when there is
        no "Next page" link, or on the first failed insert.

        NOTE(review): stopping on the first insert error presumably relies on
        a unique key rejecting rows that are already stored. Confirm against
        the table schema.
        """
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Encoding': 'gzip, deflate, br, zstd',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Pragma': 'no-cache',
            'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
        }
        list_url = 'https://www.cpsc.gov/zh-CN/Recalls'
        columns = ['data_type', 'recall_date', 'product_title', 'hazard',
                   'image_url',
                   'ext_url', 'brand', 'recall_title', 'country']

        response = self._request(headers=headers, url=list_url)
        # _request returns None after exhausting its retries; stop instead of crashing.
        while response is not None:
            resp = etree.HTML(response.text)
            next_url_list = resp.xpath("//a[@aria-label='Next page']/@href")
            div_a_list = resp.xpath("//div[@class='recall-list']//div[@class='recall-list__title']/a/@href")
            div_a_title_list = resp.xpath("//div[@class='recall-list']//div[@class='recall-list__title']/a/@title")

            insert_failed = False
            for index, div_a in enumerate(div_a_list):
                time.sleep(random.uniform(1, 5))
                # A link without a title attribute must not abort the crawl.
                a_title = div_a_title_list[index] if index < len(div_a_title_list) else None
                a_href = 'https://www.cpsc.gov' + div_a
                print('详情url：', a_href)
                data_list = self._parse_details(headers, a_href)
                if not data_list:
                    continue
                data_list.extend([a_title, 'us'])
                print(data_list)
                rows = [data_list]
                try:
                    print('存储数据：', len(rows))
                    df = pd.DataFrame(data=rows, columns=columns)
                    df.to_sql('recall_cases_data', con=self.mysql_db, if_exists="append", index=False)
                except Exception as exc:
                    print('insert failed, stopping crawl:', repr(exc))
                    insert_failed = True
                    break
            if insert_failed:
                break

            print('请求下一页url:', next_url_list)
            if not next_url_list:
                break
            next_href = next_url_list[0]
            next_url = list_url + next_href
            print('22请求下一页url:', next_url)

            # Referer is the page we are currently on: the bare listing when
            # moving to page 1, otherwise "?page=<next - 1>".
            page_match = re.search(r'page=(\d+)', next_href)
            if next_href == '?page=1' or page_match is None:
                headers['Referer'] = list_url
            else:
                headers['Referer'] = f'https://www.cpsc.gov/zh-CN/Recalls?page={int(page_match.group(1)) - 1}'
                print("headers['Referer']::", headers['Referer'])
            response = self._request(headers=headers, url=next_url)
            time.sleep(random.uniform(1, 5))

    def us_fda_gov(self):
        """
        Crawl the FDA recalls datatable endpoint and store each row.

        Up to ten listing pages of 100 rows are requested. For each row the
        date, detail link, product name, recall title, product category and
        hazard are taken from the listing; the brand and first photo come from
        the detail page. Rows that fail to parse are reported and skipped. The
        crawl stops on an empty page, an unreachable listing page, or the
        first failed insert.

        NOTE(review): stopping on the first insert error presumably relies on
        a unique key rejecting rows that are already stored. Confirm against
        the table schema.
        """
        base_url = "https://www.fda.gov/datatables/views/ajax?"
        headers = {
            'authority': 'www.fda.gov',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'zh-CN,zh;q=0.9',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
        }

        # Fixed query parameters; start/length/draw are filled in per page.
        params = {
            "search_api_fulltext": "",
            "field_regulated_product_field": "All",
            "field_terminated_recall": "All",
            "draw": 1,  # value of the initial request
            "_drupal_ajax": "1",
            "_wrapper_format": "drupal_ajax",
            "pager_element": "0",
            "view_base_path": "safety/recalls-market-withdrawals-safety-alerts/datatables-data",
            "view_display_id": "recall_datatable_block_1",
            "view_name": "recall_solr_index",
            "view_path": "/safety/recalls-market-withdrawals-safety-alerts",
            "total_items": "912"
        }
        columns = ['data_type', 'recall_date', 'product_title', 'hazard',
                   'image_url',
                   'ext_url', 'recall_title', 'country', 'product_category', 'brand']

        def get_page_url(start, draw):
            """Build the listing URL for the page that begins at row ``start``."""
            # Copy instead of mutating the shared params; ``draw`` increases by one per page.
            page_params = {**params, "start": start, "length": 100, "draw": draw}
            return base_url + urlencode(page_params)

        for start in range(0, 1000, 100):
            detail_url = get_page_url(start, start // 100 + 1)
            print('当前请求url:', detail_url)
            response = self._request(headers=headers, url=detail_url)
            if response is None or not response.text:
                break
            data_lists = response.json()['data']
            if not data_lists:
                # An empty page means the listing is exhausted.
                break

            insert_failed = False
            for data in data_lists:
                print(data, '344444444')
                try:
                    # Decode every cell: backslash escapes first, then HTML entities.
                    decoded_data = [html.unescape(item.encode().decode("unicode_escape")) for item in data]
                    # Date comes from the <time datetime="..."> attribute.
                    time_match = re.search(r'<time datetime="([\d-]+)T', decoded_data[0])
                    date = time_match.group(1) if time_match else None
                    print('时间', date)
                    # Detail link from the <a href="..."> tag.
                    link_match = re.search(r'<a href="(.*?)">', decoded_data[1])
                    if link_match is None:
                        raise ValueError('no detail link in row')
                    url = 'https://www.fda.gov' + unescape(link_match.group(1)).replace("\\/", "/")
                    print('链接', url)
                    # Link text is read from the raw cell, not the decoded one.
                    # NOTE(review): confirm whether decoded_data[1] was intended here.
                    text_match = re.search(r'>(.*?)<', data[1])
                    link_text = unescape(text_match.group(1)) if text_match else None
                    print('产品名称：', link_text)
                    print('召回标题', decoded_data[2])
                    recall_title = decoded_data[2]
                    print('产品类型', decoded_data[3])
                    product_category = decoded_data[3]
                    print('风险', decoded_data[4])
                    hazard = decoded_data[4]
                    print('\n')
                    response2 = self._request(headers=headers, url=url)
                    if response2 is None:
                        raise ValueError(f'detail page could not be fetched: {url}')
                    response_detail = etree.HTML(response2.text)
                    src_list = response_detail.xpath("//div[@id='recall-photos']//img/@src")
                    Brand_list = response_detail.xpath("//div[contains(text(),'Brand Name')]/following-sibling::div//text()")
                    brand = ''.join(Brand_list).strip() if Brand_list else None
                    print(brand, 'Brand_list::', Brand_list)
                    image_url = 'https://www.fda.gov' + src_list[0] if src_list else None
                    print('image_url:', image_url)
                except Exception as exc:
                    # A malformed row is reported and skipped.
                    print('解析报错', repr(exc))
                    continue

                row = ['us_fba_recalls', date, link_text, hazard, image_url, url, recall_title, 'us',
                       product_category, brand]
                try:
                    df = pd.DataFrame(data=[row], columns=columns)
                    df.drop_duplicates(['recall_date', 'product_title', 'ext_url'], inplace=True)
                    df.to_sql('recall_cases_data', con=self.mysql_db, if_exists="append", index=False)
                except Exception as exc:
                    print('insert failed, stopping crawl:', repr(exc))
                    insert_failed = True
                    break
            if insert_failed:
                break

    def ec_europa_eu(self):
        """
        Crawl EU Safety Gate alerts (EU recalls) and upsert them.

        Listing pages 1..32 are requested with POST from the carousel API. For
        each notification id the detail JSON is fetched and one row is written
        to ``recall_cases_data``; on a duplicate key the stored recall_date,
        product_title and ext_url are overwritten.

        Detail pages that cannot be fetched are reported and skipped. Missing
        keys in the detail JSON and database errors are not caught here and
        propagate to the caller.
        """
        list_url = 'https://ec.europa.eu/safety-gate-alerts/public/api/notification/carousel/?'
        # Headers for the detail API; kept separate from the listing headers.
        detail_headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            # Was 'ggzip', which is not a valid content-coding.
            'Accept-Encoding': 'gzip, deflate, br, zstd',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'No-Cache',
            'Host': 'ec.europa.eu',
            'Connection': 'keep-alive',
            'Content-Type': 'application/json',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        }
        keys = [
            "recall_date", "product_category", "product_title", "recall_title",
            "hazard", "data_type", "image_url", "country", "ext_url", "data_json", "brand"
        ]
        insert_sql = text("""
            INSERT INTO recall_cases_data
            (recall_date, product_category, product_title, recall_title, hazard, data_type, image_url, country, ext_url, data_json, brand)
            VALUES (:recall_date, :product_category, :product_title, :recall_title, :hazard, :data_type, :image_url, :country, :ext_url, :data_json, :brand)
            ON DUPLICATE KEY UPDATE
                recall_date = VALUES(recall_date),
                product_title = VALUES(product_title),
                ext_url = VALUES(ext_url)
        """)

        for i in range(1, 33):
            data = {"language": "en", "page": f"{i}"}
            headers = {
                'Accept': 'application/json, text/plain, */*',
                'Accept-Encoding': 'gzip, deflate, br, zstd',
                'Accept-Language': 'zh-CN,zh;q=0.9',
                'Cache-Control': 'No-Cache',
                'Connection': 'keep-alive',
                'Content-Type': 'application/json',
                'Host': 'ec.europa.eu',
                'Origin': 'https://ec.europa.eu',
                'Pragma': 'no-cache',
                'Referer': 'https://ec.europa.eu/safety-gate-alerts/screen/search?resetSearch=true',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'

            }
            if i > 9:
                # From page 10 on a captured session cookie and X-Dtpc header are sent.
                # NOTE(review): these values are hard-coded and presumably expire - confirm.
                headers[
                    'cookie'] = 'dtCookie=v_4_srv_-2D75_sn_09DKOP0KLAKFPP43DLPEUMVNBCEPU72F; rxVisitor=1736736746251Q036364T31CSNJ20PFTFC3EJCVTG8JO6; SGR_lng=en; dtSa=-; rxvt=1739950709830|1739946650982; dtPC=-75$548510459_53h31vEAUWSKPPJUEIKTFEGODSHLDWUPRMCUMR-0e0'
                headers['X-Dtpc'] = '-75$548510459_53h31vEAUWSKPPJUEIKTFEGODSHLDWUPRMCUMR-0e0'
            print(data, '请求列表页url:', list_url)
            response = requests.post(list_url, headers=headers, impersonate="chrome120", timeout=120, json=data)
            print(response.url)
            if not response:
                break

            content = response.json()['content']
            for ids in content:
                time.sleep(random.uniform(1, 3))
                detail_url = f"https://ec.europa.eu/safety-gate-alerts/public/api/notification/{ids['id']}?language=en"
                print('请求详情url:', detail_url)
                resp_detail = self._request(headers=detail_headers, url=detail_url, data_type=1)
                if resp_detail is None:
                    # _request gave up after its retries; skip this notification.
                    print('detail request failed, skipped:', detail_url)
                    continue
                items = resp_detail.json()
                data_json = json.dumps(items)
                date = items['publicationDate']
                product_category = items['product']['productCategory']['name']
                print(date)
                print(product_category)
                product_title = items['product']['versions'][0]['name']
                recall_title = items['product']['versions'][0]['description']
                print(product_title)
                print(recall_title)
                brands = items['product']['brands']
                brand = brands[0].get('brand') if brands else None
                print('brand::1', brand)
                # A notification may carry no risk version; store None then.
                risk_versions = items['risk']['versions']
                hazard = risk_versions[0]['riskDescription'] if risk_versions else None
                print(hazard)
                ext_url = 'https://ec.europa.eu/safety-gate-alerts/screen/webReport/alertDetail/' + str(
                    items['id']) + '?lang=en'
                print('ext_url::', ext_url)
                photos = items['product']['photos']
                if photos:
                    image_url = f"https://ec.europa.eu/safety-gate-alerts/public/api/notification/image/{photos[0]['id']}"
                else:
                    image_url = None
                print(image_url)

                row = [date, product_category, product_title, recall_title, hazard, 'eu_recall', image_url, 'eu',
                       ext_url, data_json, brand]
                # One parameter dict per row, keyed by column name.
                dict_list = [dict(zip(keys, row))]
                with self.mysql_db.begin() as conn:
                    conn.execute(insert_sql, dict_list)
            time.sleep(random.uniform(2, 8))

    def ec_europa_uk(self):
        """
        Crawl gov.uk product safety alerts, reports and recalls and store them.

        Starts at https://www.gov.uk/product-safety-alerts-reports-recalls and
        follows the "next" pagination link (``?page=2`` and so on) for at most
        50 listing pages. Each entry's detail page supplies the published
        date, category, product, hazard, brand and first attachment link.

        The crawl stops when there is no next page or on the first failed
        insert. Detail pages that cannot be fetched are reported and skipped.

        NOTE(review): stopping on the first insert error presumably relies on
        a unique key rejecting rows that are already stored. Confirm against
        the table schema.

        :raises ValueError: propagated from ``_parse_date_str`` when the
            published date is not in a supported format.
        """
        url = 'https://www.gov.uk/product-safety-alerts-reports-recalls'
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signe',
            'Accept-Encoding': 'gzip, deflate, br, zstd',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'No-Cache',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'

        }
        columns = ['recall_title', 'ext_url', 'recall_date', 'product_category',
                   'product_title',
                   'hazard', 'image_url', 'data_type', 'country', 'brand']

        for _ in range(50):
            print('请求列表页：', url)
            response = requests.get(url, headers=headers, impersonate="chrome120", timeout=120)
            listing = etree.HTML(response.text)
            next_url_list = listing.xpath("//div[@class='govuk-pagination__next']/a/@href")
            li_list = listing.xpath("//div[@id='js-results']//ul//li")

            insert_failed = False
            for li in li_list:
                time.sleep(random.uniform(1, 3))
                href_list = li.xpath('./div/a/@href')
                product_title_list = li.xpath('./div/a/text()')
                if not (href_list and product_title_list):
                    continue
                recall_title = product_title_list[0].replace('Product Safety Report:', '').replace(
                    'Product Recall:', '')
                print('召回标题', recall_title)
                detail_url = 'https://www.gov.uk' + href_list[0]
                print('详情页url,', detail_url)
                detail_resp = self._request(url=detail_url, headers=headers)
                if detail_resp is None:
                    # _request gave up after its retries; skip this entry.
                    print('detail request failed, skipped:', detail_url)
                    continue
                resp_html = etree.HTML(detail_resp.text)

                published = resp_html.xpath("//dt[contains(text(),'Published')]/following-sibling::dd/text()")
                # A page without a "Published" entry yields None, as in gov_uk.
                recall_date = self._parse_date_str(published[0]) if published else None
                print('召回时间：', recall_date)
                category_list = resp_html.xpath(
                    "//dt[contains(text(),'Product category:')]/following-sibling::dd/a/text()")
                print('产品类型：', category_list)
                title_list = resp_html.xpath("//p[contains(text(),'Product: ')]/text()")
                print('产品标题：', title_list)
                hazard_list = resp_html.xpath("//p[contains(text(),'Hazard:')]/text()")
                print('风险：', hazard_list)
                Brand_list = resp_html.xpath("//td[contains(text(),'Brand')]/following-sibling::td/text()")
                image_url_list = resp_html.xpath("//span[@class='attachment-inline']/a/@href")

                brand = Brand_list[0].strip() if Brand_list else None
                product_category = category_list[0].strip() if category_list else None
                product_title = title_list[0].strip().replace('Product:', '') if title_list else None
                hazard = hazard_list[0].strip().replace('Hazard:', '') if hazard_list else None
                image_url = image_url_list[0].strip() if image_url_list else None

                row = [recall_title, detail_url, recall_date, product_category, product_title,
                       hazard, image_url, 'uk_recall', 'uk', brand]
                try:
                    df = pd.DataFrame(data=[row], columns=columns)
                    df.drop_duplicates(['recall_date', 'product_title', 'ext_url'], inplace=True)
                    df.to_sql('recall_cases_data', con=self.mysql_db, if_exists="append", index=False)
                except Exception as exc:
                    print('insert failed, stopping crawl:', repr(exc))
                    insert_failed = True
                    break
            if insert_failed:
                break
            if not next_url_list:
                break
            url = 'https://www.gov.uk' + next_url_list[0]
            print('下一页url:', url)
            time.sleep(random.uniform(2, 8))

    def gov_uk(self):
        """
        Crawl gov.uk drug and device alerts and store them.

        Starts at https://www.gov.uk/drug-device-alerts and follows the "next"
        pagination link for at most 50 listing pages. Entries whose title
        contains "Field Safety Notices" are ignored. The recall date is the
        listing's ``<time datetime>`` value when present, otherwise the detail
        page's "Published" entry, otherwise ``None``.

        The crawl stops when there is no next page or on the first failed
        insert. Detail pages that cannot be fetched are reported and skipped.

        NOTE(review): stopping on the first insert error presumably relies on
        a unique key rejecting rows that are already stored. Confirm against
        the table schema.

        :raises ValueError: propagated from ``_parse_date_str`` when the
            published date is not in a supported format.
        """
        url = 'https://www.gov.uk/drug-device-alerts'
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signe',
            'Accept-Encoding': 'gzip, deflate, br, zstd',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'No-Cache',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'

        }
        columns = ['recall_title', 'ext_url', 'recall_date', 'product_category',
                   'product_title',
                   'hazard', 'image_url', 'data_type', 'country']

        for _ in range(50):
            print('请求列表页：', url)
            response = requests.get(url, headers=headers, impersonate="chrome120", timeout=120)
            listing = etree.HTML(response.text)
            next_url_list = listing.xpath("//div[@class='govuk-pagination__next']/a/@href")
            li_list = listing.xpath("//div[@id='js-results']//ul//li")

            insert_failed = False
            for li in li_list:
                time.sleep(random.uniform(1, 3))
                href_list = li.xpath('./div/a/@href')
                product_title_list = li.xpath('./div/a/text()')
                _datetime = li.xpath('./time/@datetime')
                if not (href_list and product_title_list) or 'Field Safety Notices' in product_title_list[0]:
                    continue

                recall_title = product_title_list[0]
                print('召回标题', recall_title)
                # The product title is whatever follows the first ':' in the alert title.
                if ':' in recall_title:
                    product_title = re.findall(r':(.*)', recall_title)[0].strip()
                else:
                    product_title = recall_title
                print('产品标题：', product_title)
                detail_url = 'https://www.gov.uk' + href_list[0]
                print('详情页url,', detail_url)
                detail_resp = self._request(url=detail_url, headers=headers)
                if detail_resp is None:
                    # _request gave up after its retries; skip this entry.
                    print('detail request failed, skipped:', detail_url)
                    continue
                resp_html = etree.HTML(detail_resp.text)

                published = resp_html.xpath("//dt[contains(text(),'Published')]/following-sibling::dd/text()")
                if _datetime:
                    recall_date = _datetime[0]
                elif published:
                    recall_date = self._parse_date_str(published[0])
                else:
                    recall_date = None
                print('召回时间：', recall_date)

                category_list = resp_html.xpath(
                    "//dt[contains(text(),'Message type:')]/following-sibling::dd/a/text()")
                print('产品类型：', category_list)
                hazard_list = resp_html.xpath(
                    "//h2[contains(text(),'Brief')]/following-sibling::p[1]/text()|//h2[contains(text(),'Background')]/following-sibling::p[1]/text()")
                print('风险：', hazard_list)
                image_url_list = resp_html.xpath(
                    "//div[@id='contents']//img/@src|//span[@class='attachment-inline']/a/@href")

                product_category = category_list[0].strip() if category_list else None
                hazard = hazard_list[0].strip().replace('Hazard:', '') if hazard_list else None
                image_url = image_url_list[0].strip() if image_url_list else None

                row = [recall_title, detail_url, recall_date, product_category,
                       product_title.replace('Class 4 Medicines Notification,', ''),
                       hazard, image_url, 'uk_drug_device', 'uk']
                try:
                    df = pd.DataFrame(data=[row], columns=columns)
                    df.drop_duplicates(['recall_date', 'product_title', 'ext_url'], inplace=True)
                    df.to_sql('recall_cases_data', con=self.mysql_db, if_exists="append", index=False)
                except Exception as exc:
                    print('insert failed, stopping crawl:', repr(exc))
                    insert_failed = True
                    break
            if insert_failed:
                break
            if not next_url_list:
                break
            url = 'https://www.gov.uk' + next_url_list[0]
            print('下一页url:', url)
            time.sleep(random.uniform(2, 8))

    def global_recalls(self):
        """
        One-off backfill of the complete OECD GlobalRecalls history.

        Not meant for the daily update run. Walks all ``num_pages`` result
        pages (``page_size`` items each) and appends the raw JSON of every
        page, together with its page number, to ``global_recalls_data``.

        Each page is attempted up to five times with a growing pause
        (2, 4, 6, ... seconds); the database connection is re-created after
        every failed attempt. A page that still fails is reported and the
        crawl moves on to the next one.
        """
        num_pages = 2418  # total number of result pages
        page_size = 20  # items per page
        max_attempts = 5
        headers = {

            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, br, zstd',
            'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
            'Cache-Control': 'no-cache',
            'Pragma': 'no-cache',
            'Referer': 'https://globalrecalls.oecd.org/',
            'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'

        }
        print('请求总数：', num_pages)

        # A stray ``break`` used to end this loop after the first page, so the
        # backfill never got past page 1.
        for page in range(1, num_pages + 1):
            url = (f"https://globalrecalls.oecd.org/ws/search.xqy?end={page * page_size}"
                   f"&lang=en&order=desc&q=&sort=date&start={(page - 1) * page_size}&uiLang=en")
            for attempt in range(max_attempts):
                try:
                    print('url,', url)
                    resp = requests.get(url, headers=headers, timeout=60)
                    data_list = [[json.dumps(resp.json()), page]]
                    print('page:', page)
                    df = pd.DataFrame(data=data_list,
                                      columns=['data_json', 'page'])
                    df.to_sql('global_recalls_data', con=self.mysql_db, if_exists="append", index=False)
                    break
                except Exception as e:
                    wait_time = (attempt + 1) * 2
                    print(f"错误: {e}, {wait_time}秒后重试...")
                    time.sleep(wait_time)
                    # Report the page number, not the retry counter.
                    print(f'请求报错。第{page}页', e)

                    # The failure may be a dropped DB connection; reconnect before retrying.
                    self.mysql_connect1()
            else:
                print(f'page {page} skipped after {max_attempts} failed attempts')

    def get_globalrecalls(self):
        """Scrape the first page of OECD GlobalRecalls into ``recall_cases_data``.

        Requests the GlobalRecalls search endpoint (``start=0``, ``end=20``,
        sorted by date, descending) and keeps only entries whose ``countryId``
        is in a fixed allow-list. For every kept entry:

        * if ``extUrl`` ends in a numeric id, the detail record is fetched
          from the EU Safety Gate notification API using that id (entries the
          API reports as not found are skipped);
        * otherwise the detail record is fetched from the GlobalRecalls
          ``getrecall.xqy`` endpoint.

        Each record is appended as one row to ``recall_cases_data`` through
        ``self.mysql_db``. A failed insert is printed and skipped so the
        remaining entries are still processed.

        Returns:
            None.
        """
        # Only the first page is fetched. Further pages use the same URL with
        # shifted offsets, e.g. ``start=180&end=200``.
        list_url = 'https://globalrecalls.oecd.org/ws/search.xqy?end=20&lang=en&order=desc&q=&sort=date&start=0&uiLang=en'
        print('请求url', list_url)
        list_headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, br, zstd',
            'Accept-Language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8',
            'Cache-Control': 'no-cache',
            'Pragma': 'no-cache',
            'Referer': 'https://globalrecalls.oecd.org/',
            'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        }
        # Detail requests send the same headers minus the Referer; build them
        # once instead of on every loop iteration.
        detail_headers = {key: value for key, value in list_headers.items() if key != 'Referer'}
        wanted_countries = {'us', 'ca', 'mx', 'nl', 'sa', 'se', 'pl', 'tr', 'be', 'uk', 'de', 'es', 'fr',
                            'it', 'jp'}

        print('请求列表第一页list_url：', list_url)
        resp = requests.get(list_url, headers=list_headers, timeout=60)
        for result in resp.json()['results']:
            countryId = result['countryId']
            imageUri = result['imageUri']
            if countryId.lower() not in wanted_countries:
                continue

            date_time = result['date']
            extUrl = result['extUrl']
            title_name = result['product.name']
            # A trailing numeric path segment is used as the Safety Gate id.
            _id = re.findall(r'/(\d+)$', extUrl)

            if _id:
                items = {}
                imaurl = "https://globalrecalls.oecd.org/ws/getdocument.xqy?uri=" + urllib.parse.quote(imageUri)
                url = f'https://ec.europa.eu/safety-gate-alerts/public/api/notification/{_id[0]}?language=en'
                print('请求url:', url)
                resp = requests.get(url, headers=detail_headers, timeout=60)
                if 'ENTITY_NOT_FOUN' in resp.text:
                    continue
                items_data = resp.json()
                product = items_data['product']
                brands = product['brands']
                brand = brands[0].get('brand') if brands else None
                time.sleep(random.uniform(1, 3))  # throttle between detail requests
                risk_versions = items_data['risk']['versions']

                items['country'] = countryId
                items['reacll_time'] = date_time
                items['recall_title'] = product['versions'][0]['description']
                items['productCategory'] = product['productCategory']['name']
                items['riskDescription'] = risk_versions[0]['riskDescription'] if risk_versions else None
                items['image_url'] = imaurl
                items['data_type'] = 'global_recalls'
                # Keep only the text before the first ';' of the product name.
                items['product_title'] = re.findall(r'^(.*?)\s*;', title_name + ';')[0]
                items['ext_url'] = extUrl
                items['brand'] = brand

                data_json = json.dumps(items_data)
                print('itemsitems::', items)
                self._store_globalrecall(items, data_json, '数据重复=====')
            else:
                print('没有解析到id')
                items = {}
                items['country'] = countryId  # site / country code
                _url = 'https://globalrecalls.oecd.org/ws/getrecall.xqy?uiLang=en&uri=' + urllib.parse.quote(result['uri'])
                print('_url::', _url)
                resp = requests.get(_url, headers=detail_headers, timeout=60)
                # Use a separate name so the loop variable ``result`` is not clobbered.
                detail = resp.json()
                print("result::", detail)
                time.sleep(random.uniform(1, 3))  # throttle between detail requests

                recall = detail['recall']
                extUrl = recall['extUrl']  # detail page URL
                # A recall without images must not abort the whole run.
                images = recall.get('images') or []
                first_image = images[0] if images else {}
                image_uri = first_image.get('imageUri')
                if image_uri:
                    encode_imageUri = urllib.parse.quote(image_uri)
                    imaurl = f"https://globalrecalls.oecd.org/ws/getdocument.xqy?uri={encode_imageUri}"
                else:
                    imaurl = None

                title_name = recall['product.name']
                recall_title = recall['product.desc']
                if recall_title is None:
                    # Fall back to the image alt text when there is no description.
                    recall_title = first_image.get('alt.text')
                if recall_title:
                    # str.replace returns a new string; the result must be kept.
                    recall_title = recall_title.replace('Image of ', '')

                items['reacll_time'] = recall['date']
                items['recall_title'] = recall_title
                items['productCategory'] = recall['product.type']
                items['riskDescription'] = recall['hazard']
                items['image_url'] = imaurl
                items['data_type'] = 'global_recalls'
                # Keep only the text before the first ';' of the product name.
                items['product_title'] = re.findall(r'^(.*?)\s*;', title_name + ';')[0]
                items['ext_url'] = extUrl

                data_json = json.dumps(detail)
                print('没有解析到id的数据：', items)
                self._store_globalrecall(items, data_json, '没有解析到id 存储 数据重复=====')

    def _store_globalrecall(self, items, data_json, failure_msg):
        """Append one GlobalRecalls record to ``recall_cases_data``, best effort.

        Args:
            items: Parsed record as built by ``get_globalrecalls``. The
                ``brand`` column is written only when the key is present.
            data_json: JSON dump of the raw detail response.
            failure_msg: Text printed, followed by the exception, when the
                insert fails.
        """
        row = {
            'data_type': items['data_type'],
            'product_title': items['product_title'],
            'product_category': items['productCategory'],
            'recall_date': items['reacll_time'],
            'hazard': items['riskDescription'],
            'country': items['country'],
            'image_url': items['image_url'],
            'recall_title': items['recall_title'],
            'ext_url': items['ext_url'],
            'data_json': data_json,
        }
        if 'brand' in items:
            row['brand'] = items['brand']
        try:
            df = pd.DataFrame(data=[list(row.values())], columns=list(row))
            df.to_sql('recall_cases_data', con=self.mysql_db, if_exists="append", index=False)
        except Exception as exc:
            # Usually a duplicate row; print the real error so other failures
            # are not silently mislabelled, and let Ctrl-C propagate.
            print(failure_msg, exc)

    def run(self):
        # self.global_recalls()
        self.get_globalrecalls()
        self.us_recalls()
        self.us_fda_gov()
        self.ec_europa_eu()
        self.ec_europa_uk()
        self.gov_uk()
        # """

#         Data types: which country/source each record belongs to
# eu_recall
# global_recalls
# uk_drug_device 1
# uk_recall 2
# us_fba_recalls 3
# us_recalls_product
#         """
#         with self.mysql_db.begin() as conn:
#             sql = "SELECT id,product_title FROM recall_cases_data WHERE data_type='us_recalls_product'"
#             df_data = pd.read_sql(sql, con=self.mysql_db)
#             df_data['id'] = df_data['id'].fillna('').astype(str)
#             df_data['product_title'] = df_data['product_title'].fillna('').astype(str)
#             data_json_list = list(df_data.id+ "|=||+||" + df_data.product_title)
#             for data_json_id in data_json_list:
#                 if data_json_id:
#                     data_json_id_list = data_json_id.split('|=||+||')
#                     id = data_json_id_list[0]
#                     product_title = data_json_id_list[1]
#                     print(product_title)
#                     if bool(re.search(r'[\u4e00-\u9fff]', product_title)):
#                         # print(product_title,'23333333')
#                         matches = re.findall(r'[A-Za-z\-®]+(?: [A-Za-z\-®]+)*', product_title.strip())
#                         if matches:
#                             brand = ','.join(matches)
#                         else:
#                             brand = None
#                         print(id, brand,'23444444444')
#                         if brand:
#                             brand = brand.replace('"','').replace("'","")
#                             up_sql = f"""update recall_cases_data set brand ="{brand}" WHERE id={id}"""
#                             print(up_sql)
#                             conn.execute(up_sql)


if __name__ == '__main__':
    # Script entry point: build the scraper and run all enabled sources.
    # The instance is not bound to the name ``recall_cases`` so the class
    # itself stays reachable under that name.
    recall_cases().run()
