
from all_connect import ConnectSpider
# Shared ConnectSpider instance, created once at import time; HomeDepot.save_data
# hands the scraped items to its save_project_items method.
Con = ConnectSpider()
from lxml import etree
import requests
import json
import datetime
from urllib.parse import quote


class HomeDepot:
    """Scrape Home Depot search-result pages for a list of keywords.

    Each result page is fetched with ``requests``, the embedded structured-data
    JSON is extracted with ``lxml`` and the listed products can be turned into
    flat records and handed to the module-level ``Con`` storage client.
    """

    # Seconds to wait for the server before giving up on a request; without a
    # timeout ``requests`` may block forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    # Text rendered by the site when a search yields no results.
    NOT_FOUND_MARKER = "Hmm...we couldn't find"
    NOT_FOUND_XPATH = '//*[@id="root"]/div/div/div/div[2]/div/div/div/span[1]/text()'

    # <script> element holding the search results as structured-data JSON.
    STRUCTURED_DATA_XPATH = '//script[@id="thd-helmet__script--browseSearchStructuredData"]/text()'

    def __init__(self):
        """Prepare the browser-like headers and cookies sent with every request."""
        self.headers = {
            'Referer': 'https://www.homedepot.com/s/02%20sensor%20downstream?NCNI-5',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
            'sec-ch-ua': '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
        }
        self.cookies = {
            'AMCV_F6421253512D2C100A490D45%40AdobeOrg': 'MCMID|16129746417315936432497630021679731371',
        }

    @staticmethod
    def _search_url(keyword, page):
        """Return the search URL for *keyword* starting at result offset *page*."""
        # safe='' also escapes '/', so keywords such as "1/2 in pipe" stay in a
        # single path segment instead of changing the URL structure.
        encoded_keyword = quote(keyword, safe='')
        return f'https://www.homedepot.com/s/{encoded_keyword}?NCNI-5&Nao={page}'

    def fetch_page(self, keyword, page):
        """Download one search-result page.

        Args:
            keyword: Search term; it is percent-encoded into the URL path.
            page: Result offset sent as the ``Nao`` query parameter.

        Returns:
            The response body as text, or ``None`` when the request fails or the
            server answers with a status other than 200.
        """
        url = self._search_url(keyword, page)
        try:
            response = requests.get(
                url,
                headers=self.headers,
                cookies=self.cookies,
                timeout=self.REQUEST_TIMEOUT,
            )
        except requests.RequestException as exc:
            # Network failures are reported the same way as bad status codes so
            # the caller only has to deal with ``None``.
            print(f"Error fetching page {page}: {exc}")
            return None

        if response.status_code == 200:
            return response.text
        print(f"Error fetching page {page}: Status code {response.status_code}")
        return None

    def parse_html(self, html_content, page):
        """Extract the product entries from a search-result page.

        Args:
            html_content: HTML text of the page.
            page: Result offset of the page; only used for diagnostics.

        Returns:
            The ``itemOffered`` value of the page's structured data, or an empty
            list when the page reports no results or cannot be parsed.
        """
        # Deliberate best-effort boundary: any parsing problem is reported and
        # treated as "no products" instead of aborting the crawl.
        try:
            tree = etree.HTML(html_content)
            if tree is None:
                return []

            banner = tree.xpath(self.NOT_FOUND_XPATH)
            if banner and self.NOT_FOUND_MARKER in banner[0]:
                print(banner[0], page)
                if page == 0:
                    # No results on the very first page: the keyword is invalid.
                    print('无效')
                return []

            scripts = tree.xpath(self.STRUCTURED_DATA_XPATH)
            if not scripts:
                print("Parsing error: structured data script not found")
                return []

            data_list = json.loads(scripts[0])
            if isinstance(data_list, list) and data_list:
                return data_list[0]['mainEntity']['offers']['itemOffered']
            return []
        except Exception as e:
            print(f"Parsing error: {e}")
            return []

    @staticmethod
    def _parse_price(price):
        """Normalise a raw price value to a number, or ``None`` if unusable.

        Numeric values are returned unchanged. Strings are accepted when, after
        stripping surrounding whitespace, they consist of digits with at most
        one decimal point. Everything else (including booleans) yields ``None``.
        """
        if isinstance(price, bool):
            # bool is a subclass of int but never a meaningful price.
            return None
        if isinstance(price, (int, float)):
            return price
        if isinstance(price, str):
            text = price.strip()
            if text.replace('.', '', 1).isdigit():
                try:
                    return float(text)
                except ValueError:
                    # isdigit() also accepts characters float() rejects.
                    return None
        return None

    def _build_item(self, project, keyword):
        """Convert one structured-data product entry into a flat record."""
        offers = project.get('offers')
        if not isinstance(offers, dict):
            # A missing or malformed offers section must not abort the batch.
            offers = {}
        return {
            'title': project.get('name', ''),
            'price': self._parse_price(offers.get('price')),
            'url': offers.get('url', ''),
            'image': project.get('image', ''),
            'sku': project.get('sku', ''),
            'search_term': keyword,
            'state': 1,
            'created_at': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        }

    def save_data(self, project_list, keyword):
        """Build records for *project_list* and pass them to the storage client.

        Args:
            project_list: Product entries as returned by ``parse_html``.
            keyword: Search term the products were found with.

        Entries that are not dicts are skipped. The resulting list is printed
        and handed to ``Con.save_project_items``.
        """
        data_list = [
            self._build_item(project, keyword)
            for project in project_list
            if isinstance(project, dict)
        ]
        print(data_list)

        Con.save_project_items(data_list)

    def process_keyword(self, keyword, max_pages=3, page_size=24):
        """Crawl the result pages of one keyword.

        Args:
            keyword: Search term to crawl.
            max_pages: Maximum number of result pages to request.
            page_size: Step between the result offsets of consecutive pages.

        Crawling stops early when a page cannot be fetched or yields no
        products.
        """
        for page in range(0, max_pages * page_size, page_size):
            print(f"爬取{keyword}第{page // page_size + 1}页...")
            html_content = self.fetch_page(keyword, page)
            if html_content is None:
                break

            project_list = self.parse_html(html_content, page)
            if not project_list:
                print(f"{keyword} Reached the last page.")
                break

            # NOTE(review): persisting results is currently disabled; call
            # self.save_data(project_list, keyword) here to store them.

    def main(self, keywords=None):
        """Crawl every keyword in *keywords* (a built-in default list if omitted)."""
        if keywords is None:
            keywords = ['apple watch bands for women red']
        for keyword in keywords:
            self.process_keyword(keyword)


if __name__ == "__main__":
    # Script entry point: create the scraper and run its main loop.
    HomeDepot().main()



