Commit 5438dab7 by Peng
parents 849cc1d0 7b5a1b06
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.8" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="module" module-name="other_job" />
</component>
<component name="PyDocumentationSettings">
<option name="format" value="PLAIN" />
<option name="myDocStringFormat" value="Plain" />
</component>
</module>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/amazon_spider.iml" filepath="$PROJECT_DIR$/.idea/amazon_spider.iml" />
<module fileurl="file://$PROJECT_DIR$/../other_job/.idea/other_job.iml" filepath="$PROJECT_DIR$/../other_job/.idea/other_job.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="AutoImportSettings">
<option name="autoReloadType" value="SELECTIVE" />
</component>
<component name="ChangeListManager">
<list default="true" id="b1be6619-1ed1-424a-a094-c2f1da810449" name="Changes" comment="" />
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
<option name="LAST_RESOLUTION" value="IGNORE" />
</component>
<component name="ProjectColorInfo">{
&quot;associatedIndex&quot;: 3
}</component>
<component name="ProjectId" id="2zrKWwnFl3ueJcI77QJE6KS1Q52" />
<component name="ProjectViewState">
<option name="hideEmptyMiddlePackages" value="true" />
<option name="showLibraryContents" value="true" />
</component>
<component name="PropertiesComponent"><![CDATA[{
"keyToString": {
"Python.1688_Api.executor": "Run",
"Python.get_cookie.executor": "Run",
"Python.real_new_spider.executor": "Run",
"Python.text_spider.executor": "Run",
"RunOnceActivity.ShowReadmeOnStart": "true",
"last_opened_file_path": "C:/Users/Administrator/Desktop/spider",
"node.js.detected.package.eslint": "true",
"node.js.detected.package.tslint": "true",
"node.js.selected.package.eslint": "(autodetect)",
"node.js.selected.package.tslint": "(autodetect)",
"nodejs_package_manager_path": "npm",
"settings.editor.selected.configurable": "com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable",
"vue.rearranger.settings.migration": "true"
}
}]]></component>
<component name="RunManager" selected="Python.1688_Api">
<configuration name="1688_Api" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="other_job" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/../other_job/spider" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/../other_job/spider/1688_Api.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="get_cookie" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="amazon_spider" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/amazon_spider/spiders" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/amazon_spider/spiders/get_cookie.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="real_new_spider" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="amazon_spider" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/amazon_spider/spiders" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/amazon_spider/spiders/real_new_spider.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="text_spider" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="amazon_spider" />
<option name="ENV_FILES" value="" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/amazon_spider/spiders" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/amazon_spider/spiders/text_spider.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<recent_temporary>
<list>
<item itemvalue="Python.1688_Api" />
<item itemvalue="Python.text_spider" />
<item itemvalue="Python.real_new_spider" />
<item itemvalue="Python.get_cookie" />
</list>
</recent_temporary>
</component>
<component name="SharedIndexes">
<attachedChunks>
<set>
<option value="bundled-js-predefined-1d06a55b98c1-0b3e54e931b4-JavaScript-PY-241.17890.14" />
<option value="bundled-python-sdk-5b207ade9991-7e9c3bbb6e34-com.jetbrains.pycharm.pro.sharedIndexes.bundled-PY-241.17890.14" />
</set>
</attachedChunks>
</component>
<component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
<component name="TaskManager">
<task active="true" id="Default" summary="Default task">
<changelist id="b1be6619-1ed1-424a-a094-c2f1da810449" name="Changes" comment="" />
<created>1752483789512</created>
<option name="number" value="Default" />
<option name="presentableId" value="Default" />
<updated>1752483789512</updated>
<workItem from="1752483793068" duration="8302000" />
<workItem from="1752546543687" duration="1436000" />
</task>
<servers />
</component>
<component name="TypeScriptGeneratedFilesManager">
<option name="version" value="3" />
</component>
<component name="XDebuggerManager">
<breakpoint-manager>
<breakpoints>
<line-breakpoint enabled="true" suspend="THREAD" type="python-line">
<url>file://$PROJECT_DIR$/amazon_spider/spiders/text_spider.py</url>
<line>167</line>
<option name="timeStamp" value="1" />
</line-breakpoint>
</breakpoints>
</breakpoint-manager>
</component>
<component name="com.intellij.coverage.CoverageDataManagerImpl">
<SUITE FILE_PATH="coverage/amazon_spider$text_spider.coverage" NAME="text_spider Coverage Results" MODIFIED="1752546561145" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/amazon_spider/spiders" />
<SUITE FILE_PATH="coverage/amazon_spider$1688_Api.coverage" NAME="1688_Api Coverage Results" MODIFIED="1752546914648" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/../other_job/spider" />
<SUITE FILE_PATH="coverage/amazon_spider$get_cookie.coverage" NAME="get_cookie Coverage Results" MODIFIED="1752545852691" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/amazon_spider/spiders" />
<SUITE FILE_PATH="coverage/amazon_spider$real_new_spider.coverage" NAME="real_new_spider Coverage Results" MODIFIED="1752545904717" SOURCE_PROVIDER="com.intellij.coverage.DefaultCoverageFileProvider" RUNNER="coverage.py" COVERAGE_BY_TEST_ENABLED="false" COVERAGE_TRACING_ENABLED="false" WORKING_DIRECTORY="$PROJECT_DIR$/amazon_spider/spiders" />
</component>
</project>
\ No newline at end of file
__title__ = 'amazoncaptcha'
__description__ = "Pure Python, lightweight, Pillow-based solver for the Amazon text captcha."
__url__ = 'https://github.com/a-maliarov/amazoncaptcha'
__version__ = '0.5.0'
__author__ = 'Anatolii Maliarov'
__author_email__ = 'tly.mov@gmail.com'
__license__ = 'MIT'
__copyright__ = 'Copyright 2020 Anatolii Maliarov'
\ No newline at end of file
# -*- coding: utf-8 -*-
"""
amazoncaptcha.devtools
~~~~~~~~~~~~~~~~~~~~~~
This module contains the set of amazoncaptcha's devtools.
"""
from .solver import AmazonCaptcha
from .exceptions import NotFolderError
from .__version__ import __version__
from io import BytesIO
import multiprocessing
import requests
import os
#--------------------------------------------------------------------------------------------------------------
class AmazonCaptchaCollector(object):
def __init__(self, output_folder_path, keep_logs=True, accuracy_test=False):
"""
Initializes the AmazonCaptchaCollector instance.
Args:
output_folder_path (str): Folder where images or logs should be stored.
keep_logs (bool, optional): If set to True, unsolved captcha links
will be stored separately.
accuracy_test (bool, optional): If set to True, AmazonCaptchaCollector
will not download images but just solve them and log the results.
"""
self.output_folder = output_folder_path
self.keep_logs = keep_logs
self.accuracy_test = accuracy_test
if not os.path.exists(self.output_folder):
os.mkdir(self.output_folder)
elif not os.path.isdir(self.output_folder):
raise NotFolderError(self.output_folder)
self.collector_logs = os.path.join(self.output_folder, f'collector-logs-{__version__.replace(".", "")}.log')
self.test_results = os.path.join(self.output_folder, 'test-results.log')
self.not_solved_logs = os.path.join(self.output_folder, 'not-solved-captcha.log')
def _extract_captcha_link(self, captcha_page):
"""Extracts a captcha link from an html page.
Args:
captcha_page (requests.Response): Response whose text contains the captcha page's HTML.
Returns:
str: Captcha link.
"""
return captcha_page.text.split('<img src="')[1].split('">')[0]
def _extract_captcha_id(self, captcha_link):
"""
Extracts a captcha id from a captcha link.
Args:
captcha_link (str): A link to the captcha image.
Returns:
str: Captcha ID.
"""
return ''.join(captcha_link.split('/captcha/')[1].replace('.jpg', '').split('/Captcha_'))
def get_captcha_image(self):
"""
Requests the page with Amazon's captcha and gets a random captcha.
Creates an AmazonCaptcha instance and stores the original image before solving.
If it is not an accuracy test, the image will be stored in a specified
folder with the solution within its name. Otherwise, only the logs
will be stored, mentioning the captcha link being processed and the result.
"""
captcha_page = requests.get('https://www.amazon.com/errors/validateCaptcha')
captcha_link = self._extract_captcha_link(captcha_page)
response = requests.get(captcha_link)
captcha = AmazonCaptcha(BytesIO(response.content))
captcha._image_link = captcha_link
original_image = captcha.img
solution = captcha.solve(keep_logs=self.keep_logs, logs_path=self.not_solved_logs)
log_message = f'{captcha.image_link}::{solution}'
if solution != 'Not solved' and not self.accuracy_test:
print(log_message)
captcha_name = 'dl_' + self._extract_captcha_id(captcha.image_link) + '_' + solution + '.png'
original_image.save(os.path.join(self.output_folder, captcha_name))
else:
print(log_message)
with open(self.collector_logs, 'a', encoding='utf-8') as f:
f.write(log_message + '\n')
def _distribute_collecting(self, milestone):
"""Distribution function for multiprocessing."""
for step in milestone:
self.get_captcha_image()
def start(self, target, processes):
"""
Starts the process of collecting captchas or conducting a test.
Args:
target (int): Number of captchas to be processed.
processes (int): Number of simultaneous processes.
"""
goal = list(range(target))
milestones = [goal[x: x + target // processes] for x in range(0, len(goal), target // processes)]
jobs = list()
for j in range(processes):
p = multiprocessing.Process(target=self._distribute_collecting, args=(milestones[j], ))
jobs.append(p)
p.start()
for proc in jobs:
proc.join()
if self.accuracy_test:
with open(self.collector_logs, 'r', encoding='utf-8') as f:
output = f.readlines()
all_captchas = len(output)
solved_captchas = len([i for i in output if 'Not solved' not in i])
success_percentage = round((solved_captchas / all_captchas) * 100, 5)
result = f'::Test::Ver{__version__}::Cap{all_captchas}::Per{success_percentage}::'
with open(self.test_results, 'w', encoding='utf-8') as f:
print(result)
f.write(result)
#--------------------------------------------------------------------------------------------------------------
\ No newline at end of file
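Below is a minimal usage sketch for the collector defined above, assuming the package is importable as amazoncaptcha; the output folder name and the counts are illustrative, not taken from this commit.
# Minimal usage sketch for AmazonCaptchaCollector (folder name and counts are illustrative).
from amazoncaptcha.devtools import AmazonCaptchaCollector

if __name__ == '__main__':
    # accuracy_test=True only logs links and solutions instead of saving images
    collector = AmazonCaptchaCollector('./captchas', keep_logs=True, accuracy_test=True)
    # fetch 100 captchas across 4 worker processes; start() joins all workers
    collector.start(target=100, processes=4)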
# -*- coding: utf-8 -*-
"""
amazoncaptcha.exceptions
~~~~~~~~~~~~~~~~~~~~~~~~
This module contains the set of amazoncaptcha's exceptions.
"""
#--------------------------------------------------------------------------------------------------------------
class ContentTypeError(Exception):
"""
Requested url, which was supposed to be the url to the captcha image,
contains an unsupported content type in its response headers.
"""
def __init__(self, content_type, message='is not supported as a Content-Type. Cannot extract the image.'):
self.content_type = content_type
self.message = message
def __str__(self):
return f'"{self.content_type}" {self.message}'
class NotFolderError(Exception):
"""
Given path, which was supposed to be the path to the folder where
the script can store images, is not a folder.
"""
def __init__(self, path, message='is not a folder. Cannot store images there.'):
self.path = path
self.message = message
def __str__(self):
return f'"{self.path}" {self.message}'
#--------------------------------------------------------------------------------------------------------------
\ No newline at end of file
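For reference, a hedged sketch of how these exceptions surface when building a captcha from a link; the URL below is a placeholder, not from this repository.
# Illustrative error handling; the URL is a placeholder.
from amazoncaptcha.solver import AmazonCaptcha
from amazoncaptcha.exceptions import ContentTypeError

try:
    captcha = AmazonCaptcha.fromlink('https://example.com/not-an-image', proxy={})
except ContentTypeError as error:
    print(error)  # e.g. '"text/html" is not supported as a Content-Type. Cannot extract the image.'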
# -*- coding: utf-8 -*-
"""
amazoncaptcha.solver
~~~~~~~~~~~~~~~~~~~~
This module contains the AmazonCaptcha instance and all the requirements for it.
Attributes:
MONOWEIGHT (int): The bigger this number, the thicker the monochromed picture
MAXIMUM_LETTER_LENGTH (int): Maximum letter length by X axis
MINIMUM_LETTER_LENGTH (int): Minimum letter length by X axis
SUPPORTED_CONTENT_TYPES (list of str): Used when requesting a captcha url
to check if Content-Type in the headers is valid
"""
from .utils import cut_the_white, merge_horizontally, find_letter_boxes
from .exceptions import ContentTypeError
from PIL import Image, ImageChops
from io import BytesIO
import warnings
# import requests
import json
import zlib
import os
from curl_cffi import requests
os.environ['NO_PROXY'] = 'stackoverflow.com'
MONOWEIGHT = 1
MAXIMUM_LETTER_LENGTH = 33
MINIMUM_LETTER_LENGTH = 14
SUPPORTED_CONTENT_TYPES = ['image/jpeg']
#--------------------------------------------------------------------------------------------------------------
class AmazonCaptcha(object):
def __init__(self, img, image_link=None, devmode=False):
"""
Initializes the AmazonCaptcha instance.
Args:
img (str or io.BytesIO): Path to an input image OR an instance
of BytesIO representing this image.
image_link (str, optional): Used if `AmazonCaptcha` was created
using `from_webdriver` class method. Defaults to None.
devmode (bool, optional): If set to True, instead of 'Not solved',
unrecognised letters will be replaced with dashes.
"""
self.img = Image.open(img, 'r')
self._image_link = image_link
self.devmode = devmode
self.letters = dict()
self.result = dict()
package_directory_path = os.path.abspath(os.path.dirname(os.path.abspath(__file__)))
self.training_data_folder = os.path.join(package_directory_path, 'training_data')
self.alphabet = [filename.split('.')[0] for filename in os.listdir(self.training_data_folder)]
@property
def image_link(self):
"""
Image link property is being assigned only if the instance was
created using `fromdriver` or `fromlink` class methods.
If you have created an AmazonCaptcha instance using the constructor,
the property will be equal to None which triggers the warning.
"""
if not self._image_link:
warnings.warn("Seems like you are trying to pull out the image link while not having it.", Warning, stacklevel=2)
return self._image_link
def _monochrome(self):
"""
Makes a captcha pure monochrome.
Literally says: "for each pixel of an image turn codes 0, 1 to a 0,
while everything in range from 2 to 255 should be replaced with 255".
*All the numbers stand for color codes.
"""
self.img = self.img.convert('L')
self.img = Image.eval(self.img, lambda a: 0 if a <= MONOWEIGHT else 255)
def _find_letters(self):
"""
Extracts letters from an image using found letter boxes.
Populates 'self.letters' with extracted letters being PIL.Image instances.
"""
letter_boxes = find_letter_boxes(self.img, MAXIMUM_LETTER_LENGTH)
letters = [self.img.crop((letter_box[0], 0, letter_box[1], self.img.height)) for letter_box in letter_boxes]
if (len(letters) == 6 and letters[0].width < MINIMUM_LETTER_LENGTH) or (len(letters) != 6 and len(letters) != 7):
letters = [Image.new('L', (200, 70)) for i in range(6)]
if len(letters) == 7:
letters[6] = merge_horizontally(letters[6], letters[0])
del letters[0]
letters = [cut_the_white(letter) for letter in letters]
self.letters = {str(k): v for k, v in zip(range(1, 7), letters)}
def _save_letters(self):
"""
Transforms separated letters into pseudo binary.
Populates 'self.letters' with pseudo binaries.
"""
for place, letter in self.letters.items():
letter_data = list(letter.getdata())
letter_data_string = ''.join(['1' if pix == 0 else '0' for pix in letter_data])
pseudo_binary = str(zlib.compress(letter_data_string.encode('utf-8')))
self.letters[place] = pseudo_binary
def _translate(self):
"""
Matches each extracted pseudo-binary string against the letter patterns stored in the training data folder.
Literally says: "for each pseudo binary scan every stored letter
pattern and find a match".
Returns:
str: a solution if there is one OR
'Not solved' if devmode is set to False OR
a solution where unrecognised letters are replaced with dashes
"""
for place, pseudo_binary in self.letters.items():
for letter in self.alphabet:
with open(os.path.join(self.training_data_folder, letter + '.json'), 'r', encoding = 'utf-8') as js:
data = json.loads(js.read())
if pseudo_binary in data:
self.result[place] = letter
break
else:
self.result[place] = '-'
if not self.devmode:
return 'Not solved'
return ''.join(self.result.values())
def solve(self, keep_logs=False, logs_path='not-solved-captcha.log'):
"""
Runs the sequence of solving a captcha.
Args:
keep_logs (bool): Not solved captchas will be logged if True.
Defaults to False.
logs_path (str): Path to the file where not solved captcha
links will be stored. Defaults to "not-solved-captcha.log".
Returns:
str: Result of the sequence.
"""
self._monochrome()
self._find_letters()
self._save_letters()
solution = self._translate()
if solution == 'Not solved' and keep_logs and self.image_link:
with open(logs_path, 'a', encoding='utf-8') as f:
f.write(self.image_link + '\n')
return solution
@classmethod
def fromdriver(cls, driver, devmode=False):
"""
Takes a screenshot from your webdriver, crops the captcha, and stores
it in a bytes array, which is then used to create an AmazonCaptcha instance.
This also means nothing is saved locally.
Args:
driver (selenium.webdriver.*): Webdriver with opened captcha page.
devmode (bool, optional): If set to True, instead of 'Not solved',
unrecognised letters will be replaced with dashes.
Returns:
AmazonCaptcha: Instance created based on webdriver.
"""
png = driver.get_screenshot_as_png()
element = driver.find_element_by_tag_name('img')
image_link = element.get_attribute('src')
location = element.location
size = element.size
left = location['x']
top = location['y']
right = location['x'] + size['width']
bottom = location['y'] + size['height']
img = Image.open(BytesIO(png))
img = img.crop((left, top, right, bottom))
bytes_array = BytesIO()
img.save(bytes_array, format='PNG')
image_bytes_array = BytesIO(bytes_array.getvalue())
return cls(image_bytes_array, image_link, devmode)
@classmethod
def from_webdriver(cls, driver, devmode=False):
warnings.warn("from_webdriver() is deprecated; use fromdriver() instead.", DeprecationWarning, stacklevel=2)
return cls.fromdriver(driver, devmode)
@classmethod
def fromlink(cls, image_link, proxy, devmode=False):
"""
Requests the given link and stores the content of the response
as `io.BytesIO`, which is then used to create an AmazonCaptcha instance.
This also means nothing is saved locally.
Args:
image_link (str): Link to Amazon's captcha image.
proxy (dict): Proxies mapping passed to the underlying request.
devmode (bool, optional): If set to True, instead of 'Not solved',
unrecognised letters will be replaced with dashes.
Returns:
AmazonCaptcha: Instance created based on the image link.
Raises:
ContentTypeError: If response headers contain unsupported
content type.
"""
response = requests.get(image_link, proxies=proxy, verify=False)
if response.headers['Content-Type'] not in SUPPORTED_CONTENT_TYPES:
raise ContentTypeError(response.headers['Content-Type'])
image_bytes_array = BytesIO(response.content)
return cls(image_bytes_array, image_link, devmode)
#--------------------------------------------------------------------------------------------------------------
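A minimal sketch of driving the solver above from a local image file; the file path is a placeholder, and devmode=True replaces unrecognised letters with dashes instead of returning 'Not solved'.
# Minimal solver sketch; 'captcha.jpg' is a placeholder path.
from amazoncaptcha.solver import AmazonCaptcha

captcha = AmazonCaptcha('captcha.jpg', devmode=True)
solution = captcha.solve(keep_logs=False)
print(solution)  # e.g. 'KRJNBY', or 'KR-NBY' if one letter could not be matched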
(15 source diffs could not be displayed because they are too large. You can view the blobs instead.)
# -*- coding: utf-8 -*-
"""
amazoncaptcha.utils
~~~~~~~~~~~~~~~~~~~
This module contains the set of amazoncaptcha's utilities.
"""
from PIL import Image, ImageChops
#--------------------------------------------------------------------------------------------------------------
def cut_the_white(letter):
"""
Cuts white spaces/borders to get a clear letter.
Args:
letter (PIL.Image): Letter to be processed.
Returns:
PIL.Image: The letter without white spaces.
"""
background = Image.new(letter.mode, letter.size, 255)
diff = ImageChops.difference(letter, background)
bbox = diff.getbbox()
return letter.crop(bbox)
def merge_horizontally(img1, img2):
"""
Merges two letters horizontally.
Created in case an image is corrupted and the last letter ends at the
beginning of the image, causing the letter to be unreadable.
Args:
img1 (PIL.Image): First letter.
img2 (PIL.Image): Second letter.
Returns:
PIL.Image: Two merged letters.
"""
merged = Image.new('L', (img1.width + img2.width, img1.height))
merged.paste(img1, (0, 0))
merged.paste(img2, (img1.width, 0))
return merged
def find_letter_boxes(img, maxlength):
"""
Finds and separates letters from a captcha image.
Args:
img (PIL.Image): Monochromed captcha.
maxlength (int): Maximum letter length by X axis.
Returns:
letter_boxes (:obj:`list` of :obj:`tuple`): List with X coords of each letter.
"""
image_columns = [[img.getpixel((x, y)) for y in range(img.height)] for x in range(img.width)]
image_code = [1 if 0 in column else 0 for column in image_columns]
xpoints = [d for d, s in zip(range(len(image_code)), image_code) if s]
xcoords = [x for x in xpoints if x - 1 not in xpoints or x + 1 not in xpoints]
if len(xcoords) % 2:
xcoords.insert(1, xcoords[0])
letter_boxes = list()
for s, e in zip(xcoords[0::2], xcoords[1::2]):
start, end = s, min(e + 1, img.width - 1)
if end - start <= maxlength:
letter_boxes.append((start, end))
else:
two_letters = {k: v.count(0) for k, v in enumerate(image_columns[start + 5:end - 5])}
divider = min(two_letters, key=two_letters.get) + 5
letter_boxes.extend([(start, start + divider), (start + divider + 1, end)])
return letter_boxes
#--------------------------------------------------------------------------------------------------------------
\ No newline at end of file
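A quick synthetic check of the helpers above: drawing two black bars on a white strip should yield two letter boxes. The image sizes are arbitrary test values, not constants from the solver.
# Synthetic sanity check for find_letter_boxes() and cut_the_white().
from PIL import Image, ImageDraw
from amazoncaptcha.utils import find_letter_boxes, cut_the_white

img = Image.new('L', (60, 20), 255)
draw = ImageDraw.Draw(img)
draw.rectangle((5, 3, 14, 16), fill=0)   # first "letter"
draw.rectangle((30, 3, 44, 16), fill=0)  # second "letter"

boxes = find_letter_boxes(img, maxlength=33)
print(boxes)  # [(5, 15), (30, 45)]
letters = [cut_the_white(img.crop((x1, 0, x2, img.height))) for x1, x2 in boxes]
print([letter.size for letter in letters])  # trimmed letter sizes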
import json
import time
import redis
import logging
import sys, os
import platform
from scrapy.utils.project import get_project_settings
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(sys.path[0]))))
from amazon_spider.utils.read_db_data import ReadDb
if "Windows" == platform.system():
print("windows")
else:
time.tzset()
REDIS_CONF = get_project_settings().get('REDIS')
# logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s %(message)s',
# level=logging.INFO)
def get_redis_client():
client = redis.StrictRedis(**REDIS_CONF)
return client
redis_client = get_redis_client()
def seeds(site="us"):
# seeds = pd.read_excel('./comment_error_asin.xlsx', dtype={'asin': str})
# seed = list(list(i) for i in seeds.values)
seed = list(list(i) for i in ReadDb(site).asin_comment_seeds().values)
pipe = redis_client.pipeline(transaction=False)
for job in seed:
meta = {
"asin": job[0],
# "history_comment_count": job[2]
}
k = json.dumps(meta)
pipe.zadd(f"{site}_asin_comment_seed", {k: int(time.time())})
# pipe.sadd(f"{site}_real_seed", k)
pipe.execute()
if __name__ == '__main__':
for i in ["us"]:
# for i in ["us", "uk", "fr", "de", "it", "es"]:
logging.info(f"推送{i}任务——————")
seeds(i)
logging.info(f"推送{i}任务完成——————")
time.sleep(5)
# spiders = {}
# module_content = import_module(f'amazon_spider.spiders.test_redis')
# # print(module_content)
# # print(vars(module_content))
# # obj = getattr(module_content, 'TestSpider')
# for block_name, block_type in vars(module_content).items():
# if block_name == 'TestSpider':
# print(block_name, block_type)
# spiders.update({block_type.name: block_type})
# print("-----")
# print(spiders)
# spider = spiders['test']().custom_settings
# seeds_generate = seeds(spider)
# count = 0
# for seed in seeds_generate:
# try:
# spider.push_request(seed)
# count += 1
# except Exception as e:
# logging.exception(f'error push')
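The scripts in this commit only fill the Redis sorted sets. As a hedged sketch (the batch size, connection settings, and consumption pattern are assumptions; the actual spiders may read the keys differently), a consumer could pop the oldest seeds like this:
# Hedged consumer-side sketch for the zset queues filled above.
import json
import redis

redis_client = redis.StrictRedis(host='localhost', port=6379, db=0)  # placeholder connection

def pop_seeds(site='us', batch=100):
    # ZPOPMIN returns (member, score) pairs with the lowest scores first,
    # i.e. the seeds pushed earliest by the producer script.
    for member, score in redis_client.zpopmin(f'{site}_asin_comment_seed', batch):
        yield json.loads(member)

for seed in pop_seeds():
    print(seed['asin'])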
import json
import time
import redis
import logging
import sys, os
import platform
from scrapy.utils.project import get_project_settings
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(sys.path[0]))))
from amazon_spider.utils.read_db_data import ReadDb
if "Windows" == platform.system():
print("windows")
else:
time.tzset()
REDIS_CONF = get_project_settings().get('REDIS')
def get_redis_client():
client = redis.StrictRedis(**REDIS_CONF)
return client
redis_client = get_redis_client()
def seeds(site="us"):
r_db = ReadDb(site)
seed = list(list(i) for i in r_db.read_db_variat().values)
pipe = redis_client.pipeline(transaction=False)
for job in seed:
meta = {
"asin": job[0],
"comment_new_time": job[1]
}
k = json.dumps(meta)
pipe.zadd(f"{site}_day_comment_seed", {k: int(time.time())})
# pipe.sadd(f"{site}_real_seed", k)
pipe.execute()
if __name__ == '__main__':
for i in ["us"]:
# for i in ["us", "uk", "fr", "de", "it", "es"]:
logging.info(f"推送{i}任务——————")
seeds(i)
logging.info(f"推送{i}任务完成——————")
time.sleep(5)
# spiders = {}
# module_content = import_module(f'amazon_spider.spiders.test_redis')
# # print(module_content)
# # print(vars(module_content))
# # obj = getattr(module_content, 'TestSpider')
# for block_name, block_type in vars(module_content).items():
# if block_name == 'TestSpider':
# print(block_name, block_type)
# spiders.update({block_type.name: block_type})
# print("-----")
# print(spiders)
# spider = spiders['test']().custom_settings
# seeds_generate = seeds(spider)
# count = 0
# for seed in seeds_generate:
# try:
# spider.push_request(seed)
# count += 1
# except Exception as e:
# logging.exception(f'error push')
import json
import time
import redis
import logging
import sys, os
import platform
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(sys.path[0]))))
from amazon_spider.utils.read_db_data import ReadDb
from scrapy.utils.project import get_project_settings
if "Windows" == platform.system():
print("windows")
else:
time.tzset()
REDIS_CONF = get_project_settings().get('REDIS')
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s %(message)s',
level=logging.INFO)
def get_redis_client():
client = redis.StrictRedis(**REDIS_CONF)
return client
redis_client = get_redis_client()
def seeds(site="us"):
r_db = ReadDb(site)
seed = list(list(i) for i in r_db.read_db_real_collect_keepa().values)
print(len(seed))
pipe = redis_client.pipeline(transaction=False)
for job in seed:
meta = {
"asin": job[0],
"asin_type": job[2],
"is_variation": job[3],
"date_info": job[4],
"site": job[5],
"account_id": job[6],
"priority": job[7]
}
k = json.dumps(meta)
pipe.zadd(f"{site}_real_zset_seed", {k: job[7]})
# pipe.sadd(f"{site}_real_seed", k)
pipe.execute()
if __name__ == '__main__':
for i in ["us"]:
# for i in ["us", "uk", "fr", "de", "it", "es"]:
logging.info(f"推送{i}任务——————")
seeds(i)
logging.info(f"推送{i}任务完成——————")
time.sleep(5)
# spiders = {}
# module_content = import_module(f'amazon_spider.spiders.test_redis')
# # print(module_content)
# # print(vars(module_content))
# # obj = getattr(module_content, 'TestSpider')
# for block_name, block_type in vars(module_content).items():
# if block_name == 'TestSpider':
# print(block_name, block_type)
# spiders.update({block_type.name: block_type})
# print("-----")
# print(spiders)
# spider = spiders['test']().custom_settings
# seeds_generate = seeds(spider)
# count = 0
# for seed in seeds_generate:
# try:
# spider.push_request(seed)
# count += 1
# except Exception as e:
# logging.exception(f'error push')
import sys, os, platform
import json
import logging
import time
import redis
from scrapy.utils.project import get_project_settings
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(sys.path[0]))))
from amazon_spider.utils.read_db_data import ReadDb
if "Windows" == platform.system():
print("windows")
else:
time.tzset()
REDIS_CONF = get_project_settings().get('REDIS')
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s %(message)s',
level=logging.INFO)
def get_redis_client():
client = redis.StrictRedis(**REDIS_CONF)
return client
redis_client = get_redis_client()
def seeds(site="us"):
r_db = ReadDb(site)
seeds = r_db.read_db_seller_asin()
seeds = seeds[~seeds["updated_at"].isna()].fillna("")
seeds = [list(v) for v in seeds.values]
pipe = redis_client.pipeline(transaction=False)
for job in seeds[0:200000]:
meta = {
"asin": job[1],
"asin_type": job[3],
"is_variation": job[4],
"date_info": job[5],
"site": job[2],
"account_id": job[6],
"priority": job[7],
"updated_at": str(job[8]),
"other_sellers_id": job[9],
"other_seller_name": job[10],
"other_seller_buy_boy_type": job[11],
}
k = json.dumps(meta)
pipe.zadd(f"{site}_real_zset_seed", {k: job[7]})
# pipe.sadd(f"{site}_real_seed", k)
pipe.execute()
if __name__ == '__main__':
for i in ["us"]:
# for i in ["us", "uk", "fr", "de", "it", "es"]:
logging.info(f"推送{i}任务——————")
seeds(i)
logging.info(f"推送{i}任务完成——————")
time.sleep(5)
import json
import time
import redis
import logging
import sys, os
import platform
from scrapy.utils.project import get_project_settings
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(sys.path[0]))))
from amazon_spider.utils.read_db_data import ReadDb
if "Windows" == platform.system():
print("windows")
else:
time.tzset()
REDIS_CONF = get_project_settings().get('REDIS')
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s %(message)s',
level=logging.INFO)
def get_redis_client():
client = redis.StrictRedis(**REDIS_CONF)
return client
redis_client = get_redis_client()
def seeds(site="us"):
seed = list(list(i) for i in ReadDb(site).read_db_self_asin_temporary().values)
# seeds = pd.read_excel('./aaaa.xlsx', dtype={'asin': str})
# seed = list(list(i) for i in seeds.values)
pipe = redis_client.pipeline(transaction=False)
for job in seed[10000: 10300]:
print(job)
meta = {
"asin": job[0],
"site": site,
}
k = json.dumps(meta)
pipe.zadd(f"{site}_self_asin_temporary_seed", {k: int(time.time())})
pipe.execute()
if __name__ == '__main__':
for i in ["us"]:
# for i in ["us", "uk", "fr", "de", "it", "es"]:
logging.info(f"推送{i}任务——————")
seeds(i)
logging.info(f"推送{i}任务完成——————")
time.sleep(5)
# spiders = {}
# module_content = import_module(f'amazon_spider.spiders.test_redis')
# # print(module_content)
# # print(vars(module_content))
# # obj = getattr(module_content, 'TestSpider')
# for block_name, block_type in vars(module_content).items():
# if block_name == 'TestSpider':
# print(block_name, block_type)
# spiders.update({block_type.name: block_type})
# print("-----")
# print(spiders)
# spider = spiders['test']().custom_settings
# seeds_generate = seeds(spider)
# count = 0
# for seed in seeds_generate:
# try:
# spider.push_request(seed)
# count += 1
# except Exception as e:
# logging.exception(f'error push')
import sys, os
import json
import logging
import time
import redis
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(sys.path[0]))))
from amazon_spider.utils.read_db_data import ReadDb, ReadCookie
from scrapy.utils.project import get_project_settings
REDIS_CONF = get_project_settings().get('REDIS')
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s %(message)s',
level=logging.INFO)
def get_redis_client():
client = redis.StrictRedis(**REDIS_CONF)
return client
redis_client = get_redis_client()
def seeds(site="us"):
seed = list(list(i) for i in ReadDb(site).read_db_self_return().values)
pipe = redis_client.pipeline(transaction=False)
for job in seed:
meta = {
"asin": job[1],
"site": job[2],
}
k = json.dumps(meta)
# pipe.sadd(f"{site}_variat_seed", k)
pipe.zadd(f"{site}_detail_returns_seed", {k: int(time.time())})
pipe.execute()
if __name__ == '__main__':
for i in ["us"]:
# for i in ["us", "fr", "de", "es"]:
logging.info(f"推送{i}任务——————")
seeds(i)
logging.info(f"推送{i}任务完成——————")
time.sleep(5)
import json
import time
import redis
import logging
import sys, os
import platform
from scrapy.utils.project import get_project_settings
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(sys.path[0]))))
from amazon_spider.utils.utils import send_mg
from amazon_spider.utils.read_db_data import ReadDb
if "Windows" == platform.system():
print("windows")
else:
time.tzset()
REDIS_CONF = get_project_settings().get('REDIS')
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s %(message)s',
level=logging.INFO)
def get_redis_client():
client = redis.StrictRedis(**REDIS_CONF)
return client
redis_client = get_redis_client()
def seeds(site="us"):
seed = list(list(i) for i in ReadDb(site).read_self_asin_seeds().values)
logging.info(f"{site} 站点 数据量为 {len(seed)}")
pipe = redis_client.pipeline(transaction=False)
for job in seed:
meta = {
"asin": job[0],
}
k = json.dumps(meta)
# pipe.sadd(f"{site}_variat_seed", k)
pipe.zadd(f"{site}_variat_seed", {k: int(time.time())})
pipe.execute()
if __name__ == '__main__':
for i in ["us"]:
# for i in ["us", "uk", "fr", "de", "it", "es"]:
logging.info(f"推送{i}任务——————")
for num in range(0, 3):
try:
seeds(i)
logging.info(f"推送{i}任务完成——————")
time.sleep(5)
break
except Exception as e:
print(e)
account = 'hezhe'
title = f'Site {i}: internal asin scheduling data insert failed'
content = f"Site {i}: internal asin scheduling data insert failed, time: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}"
send_mg(account, title, content)
continue
import json
import time
import redis
import logging
import sys, os
import platform
from scrapy.utils.project import get_project_settings
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(sys.path[0]))))
from amazon_spider.utils.read_db_data import ReadDb
if "Windows" == platform.system():
print("windows")
else:
time.tzset()
REDIS_CONF = get_project_settings().get('REDIS')
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s %(message)s',
level=logging.INFO)
def get_redis_client():
client = redis.StrictRedis(**REDIS_CONF)
return client
redis_client = get_redis_client()
def seeds(site="us"):
seed = list(list(i) for i in ReadDb(site).read_db_tautology_keepa().values)
pipe = redis_client.pipeline(transaction=False)
for job in seed:
meta = {
"asin": job[0],
"asin_type": job[2],
"is_variation": job[3],
"date_info": job[4],
"site": job[5],
"priority": job[6],
}
k = json.dumps(meta)
pipe.zadd(f"{site}_day_seed", {k: job[6]})
# pipe.sadd(f"{site}_day_seed", k)
pipe.execute()
if __name__ == '__main__':
# for i in ["us"]:
for i in ["us", "uk", "fr", "de", "it", "es"]:
logging.info(f"推送{i}任务——————")
seeds(i)
logging.info(f"推送{i}任务完成——————")
time.sleep(5)
import sys, os
import platform
import json
import logging
import time
import redis
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(sys.path[0]))))
if "Windows" == platform.system():
print("windows")
sys.path = [
'C:\\Users\\Administrator\\Desktop\\Amazon-Selection\\amazon_spider\\amazon_spider\\amazon_spider\\spiders',
'D:\\Program Files\\JetBrains\\PyCharm 2024.1.3\\plugins\\python\\helpers\\pycharm_display',
'D:\\Users\\Administrator\\AppData\\Local\\Programs\\Python\\Python38\\python38.zip',
'D:\\Users\\Administrator\\AppData\\Local\\Programs\\Python\\Python38\\DLLs',
'D:\\Users\\Administrator\\AppData\\Local\\Programs\\Python\\Python38\\lib',
'D:\\Users\\Administrator\\AppData\\Local\\Programs\\Python\\Python38',
'D:\\Users\\Administrator\\AppData\\Local\\Programs\\Python\\Python38',
'D:\\Users\\Administrator\\AppData\\Local\\Programs\\Python\\Python38\\Lib\\site-packages',
'D:\\Program Files\\JetBrains\\PyCharm 2024.1.3\\plugins\\python\\helpers\\pycharm_matplotlib_backend',
'C:\\Users\\Administrator\\Desktop\\Amazon-Selection\\amazon_spider\\amazon_spider']
else:
time.tzset()
from amazon_spider.utils.read_db_data import ReadDb, ReadCookie
from scrapy.utils.project import get_project_settings
REDIS_CONF = get_project_settings().get('REDIS')
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s %(message)s',
level=logging.INFO)
def get_redis_client():
client = redis.StrictRedis(**REDIS_CONF)
return client
redis_client = get_redis_client()
def seeds(site="us"):
r_db = ReadDb(site)
seed = list(list(i) for i in r_db.read_db_keepa().values)
pipe = redis_client.pipeline(transaction=False)
for job in seed:
meta = {
"asin": job[0],
"asin_type": job[2],
"is_variation": job[3],
"date_info": job[4],
"site": job[5],
"priority": job[6],
}
k = json.dumps(meta)
pipe.zadd(f"{site}_day_seed", {k: job[6]})
# pipe.sadd(f"{site}_day_seed", k)
pipe.execute()
if __name__ == '__main__':
for i in ["us"]:
# for i in ["us", "uk", "fr", "de", "it", "es"]:
logging.info(f"推送{i}任务——————")
seeds(i)
logging.info(f"推送{i}任务完成——————")
time.sleep(5)
# spiders = {}
# module_content = import_module(f'amazon_spider.spiders.test_redis')
# # print(module_content)
# # print(vars(module_content))
# # obj = getattr(module_content, 'TestSpider')
# for block_name, block_type in vars(module_content).items():
# if block_name == 'TestSpider':
# print(block_name, block_type)
# spiders.update({block_type.name: block_type})
# print("-----")
# print(spiders)
# spider = spiders['test']().custom_settings
# seeds_generate = seeds(spider)
# count = 0
# for seed in seeds_generate:
# try:
# spider.push_request(seed)
# count += 1
# except Exception as e:
# logging.exception(f'error push')
import json
import time
import redis
import logging
import sys, os
import platform
from scrapy.utils.project import get_project_settings
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(sys.path[0]))))
from amazon_spider.utils.read_db_data import ReadDb
if "Windows" == platform.system():
print("windows")
else:
time.tzset()
REDIS_CONF = get_project_settings().get('REDIS')
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s %(message)s',
level=logging.INFO)
def get_redis_client():
client = redis.StrictRedis(**REDIS_CONF)
return client
redis_client = get_redis_client()
def seeds(site="us"):
seed = ReadDb(site).read_db_self_other()
pipe_count = redis_client.zcard(f"other_self_asin_seed")
if pipe_count:
logging.info("队列内数据未消费完")
return
for name, seed in seed.groupby("site")['asin']:
name = name.split(".")[-1]
# priority = 1 if name in ['ca', 'mx'] else int(seed.count())
priority = int(seed.count())
pipe = redis_client.pipeline(transaction=False)
print(name, seed.count())
if name != 'jp':
for job in seed:
meta = {
"asin": job,
"site": name,
"priority": priority,
}
k = json.dumps(meta)
pipe.zadd(f"other_self_asin_seed", {k: meta['priority']})
else:
print(f"非需要站点直接跳过{name}")
continue
pipe.execute()
if __name__ == '__main__':
for i in ["us"]:
# for i in ["us", "uk", "fr", "de", "it", "es"]:
logging.info(f"推送{i}任务——————")
seeds(i)
logging.info(f"推送{i}任务完成——————")
time.sleep(5)
# spiders = {}
# module_content = import_module(f'amazon_spider.spiders.test_redis')
# # print(module_content)
# # print(vars(module_content))
# # obj = getattr(module_content, 'TestSpider')
# for block_name, block_type in vars(module_content).items():
# if block_name == 'TestSpider':
# print(block_name, block_type)
# spiders.update({block_type.name: block_type})
# print("-----")
# print(spiders)
# spider = spiders['test']().custom_settings
# seeds_generate = seeds(spider)
# count = 0
# for seed in seeds_generate:
# try:
# spider.push_request(seed)
# count += 1
# except Exception as e:
# logging.exception(f'error push')
import json
import time
import redis
import logging
import sys, os
import platform
from scrapy.utils.project import get_project_settings
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(sys.path[0]))))
from amazon_spider.utils.read_db_data import ReadDb
if "Windows" == platform.system():
print("windows")
else:
time.tzset()
REDIS_CONF = get_project_settings().get('REDIS')
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s %(message)s',
level=logging.INFO)
def get_redis_client():
client = redis.StrictRedis(**REDIS_CONF)
return client
redis_client = get_redis_client()
def seeds(site="us"):
pipe_count = redis_client.scard(f"{site}_search_seed")
if pipe_count:
logging.info("队列内数据未消费完")
return
seed = list(list(i) for i in ReadDb(site).read_db_other_search_asin_spider().values)
pipe = redis_client.pipeline(transaction=False)
for job in seed:
# id, asin, username, sku, site
if job[1]:
meta = {
"asin": job[1],
"sku": job[2],
"site": 'us' if job[3].split('.')[-1] == 'com' else job[3].split('.')[-1],
}
k = json.dumps(meta)
pipe.sadd(f"{site}_search_seed", k)
else:
continue
# pipe.zadd(f"{site}_search_seed", {k: int(time.time())})
pipe.execute()
if __name__ == '__main__':
for i in ["us"]:
# for i in ["us", "fr", "de", "es"]:
logging.info(f"推送{i}任务——————")
seeds(i)
logging.info(f"推送{i}任务完成——————")
time.sleep(5)
import json
import time
import redis
import logging
import sys, os
import platform
from scrapy.utils.project import get_project_settings
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(sys.path[0]))))
from amazon_spider.utils.read_db_data import ReadDb
if "Windows" == platform.system():
print("windows")
else:
time.tzset()
REDIS_CONF = get_project_settings().get('REDIS')
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s %(message)s',
level=logging.INFO)
def get_redis_client():
client = redis.StrictRedis(**REDIS_CONF)
return client
redis_client = get_redis_client()
def seeds(site="us"):
r_db = ReadDb(site)
seed = list(list(i) for i in r_db.read_db_temu_detail().values)
pipe = redis_client.pipeline(transaction=False)
for job in seed:
meta = {
"goodid": job[1],
}
k = json.dumps(meta)
pipe.sadd(f"{site}_temu_detail_seed", k)
pipe.execute()
if __name__ == '__main__':
for i in ["us"]:
# for i in ["us", "uk", "fr", "de", "it", "es"]:
logging.info(f"推送{i}任务——————")
seeds(i)
logging.info(f"推送{i}任务完成——————")
time.sleep(5)
import json
import time
import redis
import logging
import sys, os
import platform
from scrapy.utils.project import get_project_settings
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(sys.path[0]))))
from amazon_spider.utils.read_db_data import ReadDb
if "Windows" == platform.system():
print("windows")
else:
time.tzset()
REDIS_CONF = get_project_settings().get('REDIS')
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s %(message)s',
level=logging.INFO)
def get_redis_client():
client = redis.StrictRedis(**REDIS_CONF)
return client
redis_client = get_redis_client()
def seeds(site="us"):
r_db = ReadDb(site)
seed = list(list(i) for i in r_db.read_db_temu_img_search_keyword().values)
pipe = redis_client.pipeline(transaction=False)
for job in seed:
meta = {
"asin": job[1],
"search_term": job[2],
"site": job[3],
}
k = json.dumps(meta)
pipe.sadd(f"{site}_temu_img_search_seed", k)
pipe.execute()
if __name__ == '__main__':
for i in ["us"]:
# for i in ["us", "uk", "fr", "de", "it", "es"]:
logging.info(f"推送{i}任务——————")
seeds(i)
logging.info(f"推送{i}任务完成——————")
time.sleep(5)
import json
import time
import redis
import logging
import sys, os
import platform
from scrapy.utils.project import get_project_settings
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(sys.path[0]))))
from amazon_spider.utils.read_db_data import ReadDb
if "Windows" == platform.system():
print("windows")
else:
time.tzset()
REDIS_CONF = get_project_settings().get('REDIS')
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s %(message)s',
level=logging.INFO)
def get_redis_client():
client = redis.StrictRedis(**REDIS_CONF)
return client
redis_client = get_redis_client()
def search_seeds(site="us"):
r_db = ReadDb(site)
seed = list(list(i) for i in r_db.read_db_temu_search_keyword().values)
pipe = redis_client.pipeline(transaction=False)
for job in seed:
meta = {
"search_term": job[1],
}
k = json.dumps(meta)
pipe.sadd(f"{site}_temu_search_seed", k)
pipe.execute()
if __name__ == '__main__':
for i in ["us"]:
# for i in ["us", "uk", "fr", "de", "it", "es"]:
logging.info(f"推送{i}任务——————")
search_seeds(i)
logging.info(f"推送{i}任务完成——————")
time.sleep(5)
import sys, os
import json
import logging
import time
import redis
import platform
from scrapy.utils.project import get_project_settings
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(sys.path[0]))))
from amazon_spider.utils.read_db_data import ReadDb
if "Windows" == platform.system():
print("windows")
else:
time.tzset()
REDIS_CONF = get_project_settings().get('REDIS')
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s %(message)s',
level=logging.INFO)
def get_redis_client():
client = redis.StrictRedis(**REDIS_CONF)
return client
redis_client = get_redis_client()
def seeds(site="us"):
r_db = ReadDb(site)
seed = list(list(i) for i in r_db.read_db_top_asin().values)
pipe = redis_client.pipeline(transaction=False)
for job in seed:
meta = {
"asin": job[1],
"site": job[2],
}
k = json.dumps(meta)
pipe.sadd(f"{site}_top_asin_spider", k)
pipe.execute()
if __name__ == '__main__':
for i in ["us"]:
# for i in ["us", "uk", "fr", "de", "it", "es"]:
logging.info(f"推送{i}任务——————")
seeds(i)
logging.info(f"推送{i}任务完成——————")
time.sleep(5)
# spiders = {}
# module_content = import_module(f'amazon_spider.spiders.test_redis')
# # print(module_content)
# # print(vars(module_content))
# # obj = getattr(module_content, 'TestSpider')
# for block_name, block_type in vars(module_content).items():
# if block_name == 'TestSpider':
# print(block_name, block_type)
# spiders.update({block_type.name: block_type})
# print("-----")
# print(spiders)
# spider = spiders['test']().custom_settings
# seeds_generate = seeds(spider)
# count = 0
# for seed in seeds_generate:
# try:
# spider.push_request(seed)
# count += 1
# except Exception as e:
# logging.exception(f'error push')
import json
import time
import redis
import logging
import sys, os
import platform
from scrapy.utils.project import get_project_settings
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(sys.path[0]))))
from amazon_spider.utils.read_db_data import ReadDb
if "Windows" == platform.system():
print("windows")
else:
time.tzset()
REDIS_CONF = get_project_settings().get('REDIS')
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s %(message)s',
level=logging.INFO)
def get_redis_client():
client = redis.StrictRedis(**REDIS_CONF)
return client
redis_client = get_redis_client()
def seeds(site="us"):
seed = list(list(i) for i in ReadDb("us").read_db_asin_video_bsr_spider(site).values)
pipe = redis_client.pipeline(transaction=False)
for job in seed:
meta = {
"asin": job[0],
"site": job[2],
"source_asin": job[3],
"bsr_url": job[4],
}
k = json.dumps(meta)
pipe.sadd(f"{site}_upc_video", k)
pipe.execute()
if __name__ == '__main__':
for i in ["us"]:
# for i in ["us", "uk", "fr", "de", "it", "es"]:
logging.info(f"推送{i}任务——————")
seeds(i)
logging.info(f"推送{i}任务完成——————")
time.sleep(5)
# spiders = {}
# module_content = import_module(f'amazon_spider.spiders.test_redis')
# # print(module_content)
# # print(vars(module_content))
# # obj = getattr(module_content, 'TestSpider')
# for block_name, block_type in vars(module_content).items():
# if block_name == 'TestSpider':
# print(block_name, block_type)
# spiders.update({block_type.name: block_type})
# print("-----")
# print(spiders)
# spider = spiders['test']().custom_settings
# seeds_generate = seeds(spider)
# count = 0
# for seed in seeds_generate:
# try:
# spider.push_request(seed)
# count += 1
# except Exception as e:
# logging.exception(f'error push')
import json
import time
import redis
import logging
import sys, os
import platform
from scrapy.utils.project import get_project_settings
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(sys.path[0]))))
from amazon_spider.utils.read_db_data import ReadDb
if "Windows" == platform.system():
print("windows")
else:
time.tzset()
REDIS_CONF = get_project_settings().get('REDIS')
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s %(message)s',
level=logging.INFO)
def get_redis_client():
client = redis.StrictRedis(**REDIS_CONF)
return client
redis_client = get_redis_client()
def seeds(site="us"):
seed = list(list(i) for i in ReadDb("us").read_db_video_asin(site).values)
pipe = redis_client.pipeline(transaction=False)
for job in seed:
meta = {
"asin": job[0],
"site": job[2],
"sku": job[3],
}
k = json.dumps(meta)
pipe.sadd(f"{site}_upc_video", k)
pipe.execute()
if __name__ == '__main__':
for i in ["us"]:
# for i in ["us", "uk", "fr", "de", "it", "es"]:
logging.info(f"推送{i}任务——————")
seeds(i)
logging.info(f"推送{i}任务完成——————")
time.sleep(5)
# spiders = {}
# module_content = import_module(f'amazon_spider.spiders.test_redis')
# # print(module_content)
# # print(vars(module_content))
# # obj = getattr(module_content, 'TestSpider')
# for block_name, block_type in vars(module_content).items():
# if block_name == 'TestSpider':
# print(block_name, block_type)
# spiders.update({block_type.name: block_type})
# print("-----")
# print(spiders)
# spider = spiders['test']().custom_settings
# seeds_generate = seeds(spider)
# count = 0
# for seed in seeds_generate:
# try:
# spider.push_request(seed)
# count += 1
# except Exception as e:
# logging.exception(f'error push')
import json
import time
import redis
import logging
import sys, os
import platform
from scrapy.utils.project import get_project_settings
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(sys.path[0]))))
from amazon_spider.utils.read_db_data import ReadDb
if "Windows" == platform.system():
print("windows")
else:
time.tzset()
REDIS_CONF = get_project_settings().get('REDIS')
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s %(message)s',
level=logging.INFO)
def get_redis_client():
client = redis.StrictRedis(**REDIS_CONF)
return client
redis_client = get_redis_client()
def seeds(site="us"):
seed = list(list(i) for i in ReadDb("us").read_db_video_bsr_cate_asin(site).values)
pipe = redis_client.pipeline(transaction=False)
for job in seed:
# 'asin', 'id', 'site', 'rank', 'bsr_url'
meta = {
"asin": job[0],
"site": job[2],
"rank": job[3],
"bsr_url": job[4],
}
k = json.dumps(meta)
pipe.sadd(f"{site}_upc_cate_video", k)
pipe.execute()
if __name__ == '__main__':
for i in ["us"]:
# for i in ["us", "uk", "fr", "de", "it", "es"]:
logging.info(f"推送{i}任务——————")
seeds(i)
logging.info(f"推送{i}任务完成——————")
time.sleep(5)
# spiders = {}
# module_content = import_module(f'amazon_spider.spiders.test_redis')
# # print(module_content)
# # print(vars(module_content))
# # obj = getattr(module_content, 'TestSpider')
# for block_name, block_type in vars(module_content).items():
# if block_name == 'TestSpider':
# print(block_name, block_type)
# spiders.update({block_type.name: block_type})
# print("-----")
# print(spiders)
# spider = spiders['test']().custom_settings
# seeds_generate = seeds(spider)
# count = 0
# for seed in seeds_generate:
# try:
# spider.push_request(seed)
# count += 1
# except Exception as e:
# logging.exception(f'error push')
from scrapy.statscollectors import StatsCollector
class YswgCollector(StatsCollector):
def del_key(self, key):
del self._stats[key]
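YswgCollector only adds a del_key helper on top of Scrapy's StatsCollector. A hedged sketch of wiring it in follows; the settings module path is an assumption about where this class lives in the project.
# Hedged sketch: enable the collector via Scrapy settings and use del_key.
# settings.py (module path below is assumed, adjust to the real location)
STATS_CLASS = 'amazon_spider.yswg_collector.YswgCollector'

# Inside a spider callback or extension, after a stat has been recorded:
#   self.crawler.stats.inc_value('custom/parsed_items')
#   self.crawler.stats.del_key('custom/parsed_items')  # drop it before the final stats dump
# Note: del_key() raises KeyError if the stat was never set, so check get_value() first if unsure.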
selection_table_name = {
"us_asin_table": "us_self_asin_spider",
"us_asin_variat": "us_self_variat",
"us_comment_num_table": "us_asin_comment_num",
"us_comment_cookies_table": "us_comment_cookies",
"us_comment_table": "us_asin_comment",
"us_asin_detail": "us_asin_detail",
"us_all_syn_st_hezhe_test": "us_all_syn_st_hezhe_test",
"de_asin_table": "de_self_asin_spider",
"de_asin_variat": "de_self_variat",
"de_comment_num_table": "de_asin_comment_num",
"de_comment_cookies_table": "de_comment_cookies",
"de_comment_table": "de_asin_comment",
"de_asin_detail": "de_asin_detail",
"de_all_syn_st_hezhe_test": "de_all_syn_st_hezhe_test",
"uk_asin_table": "uk_self_asin_spider",
"uk_asin_variat": "uk_self_variat",
"uk_comment_num_table": "uk_asin_comment_num",
"uk_comment_cookies_table": "uk_comment_cookies",
"uk_comment_table": "uk_asin_comment",
"uk_asin_detail": "uk_asin_detail",
"uk_all_syn_st_hezhe_test": "uk_all_syn_st_hezhe_test",
"it_asin_table": "it_self_asin_spider",
"it_asin_variat": "it_self_variat",
"it_comment_num_table": "it_asin_comment_num",
"it_comment_cookies_table": "it_comment_cookies",
"it_comment_table": "it_asin_comment",
"it_asin_detail": "it_asin_detail",
"it_all_syn_st_hezhe_test": "it_all_syn_st_hezhe_test",
"es_asin_table": "es_self_asin_spider",
"es_asin_variat": "es_self_variat",
"es_comment_num_table": "es_asin_comment_num",
"es_comment_cookies_table": "es_comment_cookies",
"es_comment_table": "es_asin_comment",
"es_asin_detail": "es_asin_detail",
"es_all_syn_st_hezhe_test": "es_all_syn_st_hezhe_test",
"fr_asin_table": "fr_self_asin_spider",
"fr_asin_variat": "fr_self_variat",
"fr_comment_num_table": "fr_asin_comment_num",
"fr_comment_cookies_table": "fr_comment_cookies",
"fr_comment_table": "fr_asin_comment",
"fr_asin_detail": "fr_asin_detail",
"fr_all_syn_st_hezhe_test": "fr_all_syn_st_hezhe_test",
}
\ No newline at end of file
inner_item_fidle = {
"title": 800,
"img_url": 300,
"category": 200,
"volume": 50,
"video_url": 400,
"add_url": 400,
"material": 150,
"brand": 100,
"ac_name": 100,
"mpn": 1000,
"weight_str": 100,
"account_name": 200,
"other_seller_name": 200,
}
fidle_count_monitor = {
'img_url': 0.8,
'title': 0.8,
'price': 0.8,
'rating': 0.8,
'total_comments': 0.8,
'buy_box_seller_type': 0.8,
'page_inventory': 0.8,
'category': 0.8,
'volume': 0.8,
'weight': 0.8,
'rank': 0.8,
'launch_time': 0.8,
'video_url': 0.8,
'add_url': 0.8,
'material': 0.8,
'img_num': 0.8,
'img_type': 0.8,
'qa_num': 0.8,
'brand': 0.8,
'ac_name': 0.8,
'node_id': 0.8,
'sp_num': 0.8,
'mpn': 0.8,
'online_time': 0.8,
'describe': 0.8,
'one_star': 0.8,
'two_star': 0.8,
'three_star': 0.8,
'four_star': 0.8,
'five_star': 0.8,
'low_star': 0.8,
'asin_type': 0.8,
'is_coupon': 0.8,
'search_category': 0.8,
'weight_str': 0.8,
'date_info': 0.8,
'site': 0.8,
'account_name': 0.8,
'other_seller_name': 0.8,
'bsr_date_info': 0.8,
'account_id': 0.8,
}
\ No newline at end of file
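# Hedged sketch (illustrative only, not from the original source): one plausible use of
# the two dicts above -- inner_item_fidle as per-field max string lengths and
# fidle_count_monitor as minimum fill-rate thresholds over a scraped batch.
def truncate_item(item: dict) -> dict:
    # clip oversized string fields to the configured max length
    return {k: (v[:inner_item_fidle[k]] if k in inner_item_fidle and isinstance(v, str) else v)
            for k, v in item.items()}

def low_fill_fields(items: list) -> dict:
    # return {field: ratio} for monitored fields whose non-empty ratio drops below the threshold
    total = len(items) or 1
    ratios = {f: sum(1 for it in items if it.get(f) not in (None, "")) / total
              for f in fidle_count_monitor}
    return {f: round(r, 3) for f, r in ratios.items() if r < fidle_count_monitor[f]}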
import psycopg2
import logging
from sqlalchemy import create_engine
from sqlalchemy.pool import NullPool
# h6_pg_us = {
# 'user': 'postgres',
# 'password': 'fazAqRRVV9vDmwDNRNb593ht5TxYVrfTyHJSJ3BS',
# 'host': '113.100.143.162',
# 'port': 5432,
# 'database': 'selection',
# # 'encoding': 'utf-8',
# }
h6_pg_us = {
'user': 'postgres',
# 'password': urllib.parse.quote('T#4$4%qPbR7mJx'),
'password': "F9kL2sXe81rZq",
'host': "113.100.143.162",
'port': 5432,
'database': 'selection',
# 'encoding': 'utf-8',
}
def get_14pg_country_engine(site_name="us"):
h14_pg_us = {
"user": "postgres",
# "password": urllib.parse.quote("G8m!Q2p9D#f%5x"),
"password": "F9kL2sXe81rZq",
"host": "61.145.136.61",
"port": 54328,
"database": "selection",
}
if site_name not in ['uk', 'fr', 'es', 'it', 'de']:
h14_pg_us["database"] = f"selection"
db_ = 'postgresql+psycopg2://{}:{}@{}:{}/{}'.format(*h14_pg_us.values())
else:
h14_pg_us["database"] = f"selection_{site_name}"
db_ = 'postgresql+psycopg2://{}:{}@{}:{}/{}'.format(*h14_pg_us.values())
# engine = create_engine(db_, poolclass=NullPool, connect_args={'connect_timeout': 10}) # , pool_recycle=3600
engine = create_engine(db_, pool_timeout=300, connect_args={'connect_timeout': 10}) # , pool_recycle=3600
return engine
def get_pg_country_engine(site_name="us"):
if site_name not in ['uk', 'fr', 'es', 'it', 'de']:
h6_pg_us["database"] = f"selection"
db_ = 'postgresql+psycopg2://{}:{}@{}:{}/{}'.format(*h6_pg_us.values())
else:
h6_pg_us["database"] = f"selection_{site_name}"
db_ = 'postgresql+psycopg2://{}:{}@{}:{}/{}'.format(*h6_pg_us.values())
# engine = create_engine(db_, poolclass=NullPool, connect_args={'connect_timeout': 10}) # , pool_recycle=3600
engine = create_engine(db_, pool_timeout=300, connect_args={'connect_timeout': 10}) # , pool_recycle=3600
return engine
def get_pg_con(site_name="us"):
if site_name not in ['uk', 'fr', 'es', 'it', 'de']:
h6_pg_us["database"] = f"selection"
db_ = 'postgresql+psycopg2://{}:{}@{}:{}/{}'.format(*h6_pg_us.values())
else:
h6_pg_us["database"] = f"selection_{site_name}"
db_ = 'postgresql+psycopg2://{}:{}@{}:{}/{}'.format(*h6_pg_us.values())
engine = create_engine(db_, poolclass=NullPool, connect_args={'connect_timeout': 10})
conn = engine.connect()
return conn
def get_14pg_psy_country_engine(site_name="us"):
if site_name not in ['uk', 'fr', 'es', 'it', 'de']:
database = f"selection"
else:
database = f"selection_{site_name}"
engine = psycopg2.connect(
dbname=database,
user="postgres",
password="G8m!Q2p9D#f%5x",
host="61.145.136.61",
port="54328",
connect_timeout=300,
)
return engine
def get_6pg_psy_country_engine(site_name="us"):
if site_name not in ['uk', 'fr', 'es', 'it', 'de']:
database = f"selection"
else:
database = f"selection_{site_name}"
engine = psycopg2.connect(
dbname=database,
user="postgres",
password="G8m!Q2p9D#f%5x",
host="113.100.143.162",
port="5432",
connect_timeout=300,
)
return engine
# @func_set_timeout(300)
def updatas_pg_asin(sql, data=None, site="us", db='pg14'):
    engine = get_14pg_psy_country_engine(site) if db == 'pg14' else get_6pg_psy_country_engine(site)
try:
cur = engine.cursor()
        if data is not None:
cur.executemany(sql, data)
else:
cur.execute(sql)
# msg = execute_values(psycopg2.extensions.cursor(engine), sql, data, page_size=1000)
# print(msg, data[0][0:2])
engine.commit()
except psycopg2.OperationalError as e:
logging.info(f"失败sql为 {sql} {e}")
return False
except Exception as e:
logging.info(f"失败sql为 {sql} {e}")
return False
finally:
engine.close()
return True
# async def insert_data(sql, data=None, site="us", db='pg14'):
# # Establish a connection to the PostgreSQL database
# async with asyncpg.create_pool(**h6_pg_us) as pool:
# # Execute the insert statements concurrently
# async with pool.acquire() as conn:
# async with conn.transaction():
# await conn.executemany(sql, data)
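# Hedged usage sketch (call sites are assumptions; the SQL and table names below are
# illustrative, borrowed from table names that appear elsewhere in this project):
# import pandas as pd
# engine = get_14pg_country_engine("de")   # -> selection_de on the 61.145.136.61 instance
# df = pd.read_sql("SELECT asin FROM de_asin_detail LIMIT 10", con=engine)
# ok = updatas_pg_asin("UPDATE us_asin_detail SET state = 3 WHERE asin = %s",
#                      data=[("B000TEST01",)], site="us", db="pg14")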
import json
import os, sys
import redis as rd
from func_timeout import func_set_timeout
from scrapy.utils.project import get_project_settings
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))  # parent directory
from amazon_spider.utils.common import singleton, md5
REDIS = get_project_settings().get('REDIS')
@singleton
class Redis(object):
def __init__(self):
self.host = REDIS['host']
self.port = REDIS['port']
self.db = REDIS['db']
self.password = REDIS['password']
def get_instance(self, db=10):
self.pool = rd.ConnectionPool(
host=self.host,
port=self.port,
db=db,
password=self.password,
max_connections=3,
socket_timeout=5,
socket_connect_timeout=5,
retry_on_timeout=True,
)
return rd.Redis(connection_pool=self.pool)
@func_set_timeout(10)
def set_string(key, value):
"""set string
:param key: key
:param value: value
:return:
"""
if type(value) is dict:
value = json.dumps(value)
r = Redis().get_instance()
r.set(key, value)
@func_set_timeout(10)
def get(key):
"""get string
:param key: key
:return:
"""
r = Redis().get_instance()
value = r.get(key)
r.close()
if value:
return json.loads(value)
@func_set_timeout(10)
def delete(key):
"""del a key
:param key: key
:return: True for success.
"""
r = Redis().get_instance()
r.delete(key)
r.close()
@func_set_timeout(10)
def exists(key):
"""check if a key exists.
:param key: key
:return: True for exists, False for not.
"""
r = Redis().get_instance()
d = r.exists(key)
r.close()
return d
@func_set_timeout(10)
def expire(key, time):
"""set expire time for a key.
:param key: key
:param time: expire time
:return: True for success.
"""
r = Redis().get_instance()
d = r.expire(key, time)
r.close()
return d
@func_set_timeout(10)
def smembers(key):
"""smembers
:param key: key
:return:
"""
r = Redis().get_instance()
d = r.smembers(key)
r.close()
return d
@func_set_timeout(10)
def sadd(key, value, use_md5=True):
"""add key-value to the sorted set.
:param key: key
:param value: value
:param use_md5: weather use md5.
:return: True for done, False for not.
"""
r = Redis().get_instance()
if use_md5:
added = r.sadd(key, md5(value, digits=16))
else:
added = r.sadd(key, value)
r.close()
return added == 1
@func_set_timeout(10)
def spop(key, count=20) -> list:
"""spop
:param count:
:param key:
:return:
"""
r = Redis().get_instance()
d = r.spop(key, count)
r.close()
return d
@func_set_timeout(10)
def sismember(key, value, use_md5=True):
    """check whether a value exists in the set.
    :param key: key
    :param value: value
    :param use_md5: whether the stored member is the md5 digest of the value.
    :return: True for exists, False for not.
    """
    r = Redis().get_instance()
    member = md5(value, digits=16) if use_md5 else value
    res = r.sismember(key, member)
    r.close()
    return res
@func_set_timeout(10)
def srem(key, value, use_md5=True):
    """remove a value from the set.
    :param key: key
    :param value: value
    :param use_md5: whether the stored member is the md5 digest of the value.
    :return: boolean
    """
    r = Redis().get_instance()
    member = md5(value, digits=16) if use_md5 else value
    res = r.srem(key, member)
    r.close()
    return res
@func_set_timeout(10)
def hget(name, key):
"""hash get
:param name: hash name
:param key: key
:return: boolean
"""
r = Redis().get_instance()
res = r.hget(name, key)
r.close()
if res:
d = res.decode()
return d
@func_set_timeout(10)
def hdel(name, key):
"""hash del
:param name: hash name
:param key: key
:return: boolean
"""
r = Redis().get_instance()
d = r.hdel(name, key)
r.close()
return d
@func_set_timeout(10)
def hexists(name, key):
"""hash exist.
:param name: hash name
:param key: key
:return: boolean
"""
r = Redis().get_instance()
d = r.hexists(name, key)
r.close()
return d
@func_set_timeout(10)
def hset(name, key, value):
"""hash set
:param name: hash name
:param key: key
:param value: value
:return: boolean
"""
r = Redis().get_instance()
return r.hset(name, key, value)
@func_set_timeout(10)
def zadd(key, value):
"""add key-value to the sorted set.
:param key: key
:param value: value
:param use_md5: weather use md5.
:return: True for done, False for not.
"""
# value dict {value: score}
r = Redis().get_instance()
added = r.zadd(key, value)
r.close()
return added == 1
@func_set_timeout(10)
def zpop(key, count):
# value dict {value: score}
r = Redis().get_instance()
pipe = r.pipeline(transaction=False)
pipe.multi()
pipe.zrange(key, 0, count-1).zremrangebyrank(key, 0, count-1)
results, count = pipe.execute()
r.close()
return results
@func_set_timeout(10)
def hincrby(key, field, count):
"""
:param key: key
:param key: field
:param key: count
:return: count.
"""
r = Redis().get_instance()
added = r.hincrby(key, field, count)
r.close()
return added
@func_set_timeout(10)
def hgetall(key):
r = Redis().get_instance()
added = r.hgetall(key)
r.close()
return added
def srandmembers(key, count):
"""SRANDMEMBER
:param key: key
:return:
"""
r = Redis().get_instance()
return r.srandmember(key, count)
def lpop(key) -> list:
"""lpop
:param key:
:return:
"""
r = Redis().get_instance()
return r.lpop(key)
@func_set_timeout(10)
def lpush(key, value):
"""add key-value to the sorted set.
:param key: key
:param value: value
:return: True for done, False for not.
"""
# value dict {value: score}
r = Redis().get_instance()
added = r.lpush(key, value)
r.close()
return added == 1
@func_set_timeout(30)
def xadd(key, data):
r = Redis().get_instance()
added = r.xadd(key, data)
r.close()
return added == 1
@func_set_timeout(30)
def xadd_db0(key, data):
r = Redis().get_instance(0)
added = r.xadd(key, data)
r.close()
return added == 1
\ No newline at end of file
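# Hedged usage sketch for the module-level helpers above (key names and values are
# illustrative placeholders):
# set_string("spider:heartbeat", {"ts": 1700000000})   # dict values are json-dumped
# print(get("spider:heartbeat"))                       # -> {"ts": 1700000000}
# sadd("seen_asins", "B000TEST01")                     # stored as a 16-char md5 by default
# print(exists("seen_asins"), smembers("seen_asins"))
# delete("spider:heartbeat")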
import random
from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory
# ORIGIN_CIPHERS = 'TLS13-AES-256-GCM-SHA384:TLS13-CHACHA20-POLY1305-SHA256:TLS13-AES-128-GCM-SHA256:ECDH+AESGCM:ECDH+CHACHA20:DH+AESGCM:DH+CHACHA20:ECDH+AES256:DH+AES256:ECDH+AES128:DH+AES:RSA+AESGCM:RSA+AES'
requests_ciphers = 'ECDH+AESGCM:DH+AESGCM:ECDH+AES256:DH+AES256:ECDH+AES128:DH+AES:ECDH+HIGH:DH+HIGH:ECDH+3DES:DH+3DES:RSA+AESGCM:RSA+AES:RSA+HIGH:RSA+3DES'
scrapy_ciphers = 'TLS13-AES-256-GCM-SHA384:TLS13-CHACHA20-POLY1305-SHA256:TLS13-AES-128-GCM-SHA256:ECDH+AESGCM:ECDH+CHACHA20:DH+AESGCM:DH+CHACHA20:ECDH+AES256:DH+AES256:ECDH+AES128:DH+AES:RSA+AESGCM:RSA+AES'
origin_ciphers_lists = [requests_ciphers, scrapy_ciphers]
ORIGIN_CIPHERS = random.choice(origin_ciphers_lists)
def shuffle_ciphers():
CIPHERS = ORIGIN_CIPHERS.split(":")
C = []
for i in range(0, random.randint(3, len(CIPHERS))):
C.append(random.choice(CIPHERS))
# random.shuffle(C)
CIPHERS = ":".join(C)
return CIPHERS + ":!aNULL:!MD5:!DSS"
class MyHTTPDownloadHandler(HTTPDownloadHandler):
def download_request(self, request, spider):
        tls_ciphers = shuffle_ciphers()
        self._contextFactory = ScrapyClientContextFactory(tls_ciphers=tls_ciphers)
        # # configure TLS connection parameters
        # # request.meta['download_slot'] = spider.url_
        # request.meta['ssl'] = {'verify': False}
return super().download_request(request, spider)
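# Hedged wiring sketch: Scrapy swaps in this handler through the standard
# DOWNLOAD_HANDLERS setting; the dotted path is an assumption about where this
# module lives in the project.
# DOWNLOAD_HANDLERS = {
#     "http": "amazon_spider.handlers.MyHTTPDownloadHandler",
#     "https": "amazon_spider.handlers.MyHTTPDownloadHandler",
# }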
# -*- coding: utf-8 -*-
import logging
from scrapy import signals
import sys, os
from func_timeout import func_set_timeout
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))  # parent directory
class AddLastEventIdMiddleware:
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
@func_set_timeout(3)
def get_cookie(self, data):
import execjs
import platform
logging.info(f"请求参数{data}")
cwd = os.path.dirname(__file__)
# with open(cwd + '\\js\\17encrypt.js', 'r') as f:
if "Windows" == platform.system():
with open(cwd + '\\js\\17encrypt.js', 'r') as f:
ctx = execjs.compile(f.read())
anti_content = ctx.call('getCookie', data)
return anti_content
else:
with open(cwd + '/js/17encrypt.js', 'r') as f:
ctx = execjs.compile(f.read())
anti_content = ctx.call('getCookie', data)
return anti_content
def process_request(self, request, spider):
request.cookies = {"Last-Event-ID": self.get_cookie(request.meta.get("fdata"))}
logging.info(f"添加Last-Event-ID参数:{request.cookies.get('Last-Event-ID')[0:10]}")
def process_response(self, request, response, spider):
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
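# Hedged wiring sketch: the middleware above is enabled via Scrapy's standard
# DOWNLOADER_MIDDLEWARES setting; the dotted path and priority are assumptions.
# DOWNLOADER_MIDDLEWARES = {
#     "amazon_spider.middlewares.AddLastEventIdMiddleware": 543,
# }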
# -*- coding: utf-8 -*-
import time
import logging
import aiohttp
import scrapy
from scrapy import signals
from amazon_spider.utils.random_ssl import sslgen
logger = logging.getLogger(__name__)
class AiohttpMiddleware:
"""
scrapy timeout就用aiohttp试试
用于解决一些蜜汁bug
"""
def __init__(self, timeout=10):
self.timeout = timeout
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
timeout = crawler.settings.get('DOWNLOAD_TIMEOUT', 10)
s = cls(timeout)
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def spider_opened(self, spider):
spider.logger.info(
"Spider %s opened middleware: %s" % (spider.name, self.__class__.__name__)
)
async def process_request(self, request: scrapy.Request, spider):
from scrapy.http.headers import Headers
# logger.info("进来的meta是 {0}".format(request.meta))
if request.meta.get("use_aiohttp", False):
proxies = {
'http': request.meta.get('proxy'),
'https': request.meta.get('proxy')
}
logger.debug("使用aiohttp进行尝试")
# url = request.url
headers = Headers(request.headers or {}, encoding='utf-8').to_unicode_dict()
start_time = time.time()
async with aiohttp.ClientSession() as session:
async with session.request(
request.method,
request.url,
data=request.body,
headers=headers,
proxy=proxies.get("https"),
timeout=self.timeout,
cookies=request.cookies,
ssl=sslgen()) as resp:
html: bytes = await resp.read()
            end_time = time.time()  # record the end time
            response_time = end_time - start_time  # compute the elapsed response time
logger.info(f"aiohttp Response time: {response_time} seconds")
stats = spider.crawler.stats
stats.inc_value("downloader/request_count")
stats.inc_value(f"downloader/request_method_count/{request.method}")
return scrapy.http.HtmlResponse(
url=request.url,
status=resp.status,
headers=request.headers,
body=html,
request=request,
# encoding=resp.get_encoding(),
encoding="utf-8",
)
def process_response(self, request, response, spider):
return response
def process_exception(self, request, exception, spider):
logging.info(f"middleware error {spider.r_utils(request)} {exception}")
return None
\ No newline at end of file
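# Hedged usage sketch: a request opts into the aiohttp fallback via meta; the proxy
# URL is a placeholder.
# yield scrapy.Request(
#     url,
#     meta={"use_aiohttp": True, "proxy": "http://user:pass@proxy-host:port"},
#     callback=self.parse,
# )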
# -*- coding: utf-8 -*-
import time
import scrapy
import logging
import sys, os
from scrapy import signals
from func_timeout import func_set_timeout
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))  # parent directory
@func_set_timeout(3)
def get_sign(data, token, t):
import execjs
import platform
import js2py
cwd = os.path.dirname(__file__)
logging.info(f"{data}, {token}, {t}")
# with open(cwd + '\\js\\17encrypt.js', 'r') as f:
if "Windows" == platform.system():
with open(cwd + '\\js\\sign_1688.js', 'r', encoding="utf-8") as f:
# ctx = execjs.compile(f.read())
# anti_content = ctx.call('getCookie', data, token, t)
# return anti_content
m = js2py.EvalJs()
m.execute(f.read())
anti_content = m.get_cc(data, token, t)
return anti_content
else:
with open(cwd + '/js/sign_1688.js', 'r', encoding="utf-8") as f:
# ctx = execjs.compile(f.read())
# anti_content = ctx.call('getCookie', data, token, t)
# return anti_content
m = js2py.EvalJs()
m.execute(f.read())
anti_content = m.get_cc(data, token, t)
return anti_content
class AddSignMiddleware:
@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s
def get_cookies(self, request):
import requests
headers = {
"accept": "*/*",
"accept-language": "zh-CN,zh;q=0.9",
"cache-control": "no-cache",
"pragma": "no-cache",
"referer": "https://sale.1688.com/",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"
}
b = "undefined"
new_time = round(time.time() * 1000)
data = "^%^7B^%^22type^%^22^%^3A^%^22canView^%^22^%^7D^"
logging.info("获取参数")
sign = get_sign(data, b, new_time)
logging.info("获取参数成功")
params = {
"appKey": "12574478",
"t": f'{new_time}',
"sign": f'{sign}',
"api": "mtop.cbu.overseas.site.ip.business.check",
"v": "1.0",
"timeout": "3000",
"data": "^%^7B^%^22type^%^22^%^3A^%^22canView^%^22^%^7D^"
}
url = "https://h5api.m.1688.com/h5/mtop.cbu.overseas.site.ip.business.check/1.0/"
response = requests.get(url, headers=headers, params=params, verify=False, timeout=6)
cookies = dict(response.cookies)
headers = {
"Host": "dcms.1688.com",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
"accept": "*/*",
"referer": "https://shop3413081512086.1688.com/",
"accept-language": "zh-CN,zh;q=0.9"
}
url = "https://dcms.1688.com/open/jsonp/old-pm-qyzx.json"
params = {
}
response = requests.get(url, headers=headers, params=params, verify=False, timeout=6)
        cookies.update(dict(response.cookies))
return cookies
def get_cookies_news(self, request):
import requests
os.environ['NO_PROXY'] = 'stackoverflow.com'
requests = requests.session()
headers = {
"accept": "*/*",
"accept-language": "zh-CN,zh;q=0.9",
"cache-control": "no-cache",
"pragma": "no-cache",
"referer": "https://sale.1688.com/",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"
}
url = "https://h5api.m.1688.com/h5/mtop.cbu.overseas.site.ip.business.check/1.0/"
b = "undefined"
new_time = round(time.time() * 1000)
sign = get_sign("^%^7B^%^22type^%^22^%^3A^%^22canView^%^22^%^7D^", b, new_time)
params = {
"appKey": "12574478",
"t": f'{new_time}',
"sign": f'{sign}',
"api": "mtop.cbu.overseas.site.ip.business.check",
"v": "1.0",
"timeout": "3000",
"data": "^%^7B^%^22type^%^22^%^3A^%^22canView^%^22^%^7D^"
}
response = requests.get(url, headers=headers, params=params, verify=False, timeout=6)
cookies = dict(response.cookies)
_m_h5_tk = response.cookies['_m_h5_tk']
headers = {
"Host": "dcms.1688.com",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
"accept": "*/*",
"referer": "https://shop3413081512086.1688.com/",
"accept-language": "zh-CN,zh;q=0.9"
}
url = "https://dcms.1688.com/open/jsonp/old-pm-qyzx.json"
params = {
}
response = requests.get(url, headers=headers, params=params, verify=False, timeout=6)
cookies.update(dict(response.cookies))
b = _m_h5_tk.split("_")[0]
new_time = round(time.time() * 1000)
memberid = request.meta.get('memberId')
data = '{"componentKey":"wp_pc_contactsmall","params":"{\\"memberId\\":\\"%s\\"}"}' % memberid
sign = get_sign(data, b, new_time)
url = 'https://h5api.m.1688.com/h5/mtop.alibaba.alisite.cbu.server.pc.moduleasyncservice/1.0/'
params = (
('appKey', '12574478'),
('t', str(new_time)),
('sign', sign),
('api', 'mtop.alibaba.alisite.cbu.server.pc.ModuleAsyncService'),
('data', data),
)
headers = {
"Host": "h5api.m.1688.com",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
"accept": "*/*",
"referer": "https://shop3413081512086.1688.com/",
"accept-language": "zh-CN,zh;q=0.9"
}
response = requests.get(url, headers=headers, params=params, cookies=cookies, timeout=6, verify=False)
return response
def process_request(self, request, spider):
if request.meta.get('sign_1688'):
logging.info("开始获取response -->")
            resp = self.get_cookies_news(request)
            logging.info("response --> scrapy response")
            response = scrapy.http.HtmlResponse(
                url=request.url,
                status=resp.status_code,
                headers=request.headers,
                body=resp.text,
                request=request,
                # encoding=resp.get_encoding(),
                encoding="utf-8",
            )
# logging.info(f"获取cookie成功 {cookies}")
return response
# request.cookies = cookies
# new_time = round(time.time() * 1000)
# b = cookies["_m_h5_tk"].split("_")[0]
# data = '{"componentKey":"wp_pc_contactsmall","params":"{\\"memberId\\":\\"%s\\"}"}' % request.meta.get('memberId')
# logging.info("获取sign成功")
# sign = get_sign(data, b, new_time)
# logging.info("开始获取sign")
# url = f'https://h5api.m.1688.com/h5/mtop.alibaba.alisite.cbu.server.pc.moduleasyncservice/1.0/?appKey=12574478&t={new_time}&sign={sign}&api=mtop.alibaba.alisite.cbu.server.pc.ModuleAsyncService&data={data}'
# request._set_url(url)
# logging.info(f"装载 sign参数:{sign} 拼接 url 添加cookie: {cookies}")
# return None
def process_response(self, request, response, spider):
return response
def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.
# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass
def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
# -*- coding: utf-8 -*-
import logging
import time
import httpx
import scrapy
from scrapy.http import HtmlResponse
from amazon_spider.utils.random_ssl import ssl_context
logger = logging.getLogger(__name__)
class HttpxMiddleware:
"""
可以下载http2 处理socks代理
"""
def __init__(self, delay=0):
self.delay = delay
@classmethod
def from_crawler(cls, crawler):
s = crawler.settings
delay = s.get('DOWNLOAD_DELAY', 0)
return cls(delay)
async def process_request(self, request: scrapy.Request, spider):
if request.meta.get("use_httpx", False):
logging.getLogger().info("Start to set httpx")
proxies = {
"http://": request.meta.get('proxy'),
"https://": request.meta.get('proxy'),
}
start_time = time.time()
async with httpx.AsyncClient(http2=True, verify=ssl_context, proxies=proxies) as client:
httpx_request = client.build_request(request.method, request.url, headers=request.headers.to_unicode_dict(),
cookies=request.cookies,
data=request.body if request.method in ('POST', 'PUT', 'PATCH') else None)
response = await client.send(httpx_request)
            end_time = time.time()  # record the end time
            response_time = end_time - start_time  # compute the elapsed response time
logger.info(f"httpx Response time: {response_time} seconds")
response = HtmlResponse(
request.url,
encoding=response.encoding,
status=response.status_code,
# headers=response.headers,
body=response.content,
request=request
)
stats = spider.crawler.stats
stats.inc_value("downloader/request_count")
stats.inc_value(f"downloader/request_method_count/{request.method}")
return response
# def process_request(self, request, spider):
# if request.meta.get("use_httpx", False):
# proxies = {
# "http://": request.meta.get('proxy'),
# "https://": request.meta.get('proxy'),
# }
# with httpx.Client(http2=True, verify=ssl_context, proxies=proxies) as client:
# httpx_request = client.build_request(request.method, request.url, headers=request.headers.to_unicode_dict(),
# cookies=request.cookies,
# data=request.body if request.method in ('POST', 'PUT', 'PATCH') else None)
# response = client.send(httpx_request)
# response = HtmlResponse(
# request.url,
# encoding=response.encoding,
# status=response.status_code,
# # headers=response.headers,
# body=response.content,
# request=request
# )
# return response
def process_response(self, request, response, spider):
return response
def process_exception(self, request, exception, spider):
logging.info(f"middleware error {spider.r_utils(request)} {exception}")
return None
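# Hedged usage sketch: a request opts into the httpx (HTTP/2-capable) path via meta;
# the proxy URL is a placeholder.
# yield scrapy.Request(
#     url,
#     meta={"use_httpx": True, "proxy": "http://user:pass@proxy-host:port"},
#     callback=self.parse,
# )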
window = {
innerHeight: 478
}
function array_j(y, z){
var A = 1315423911 ^ (z << 16)
var B;
var C;
for (B = y.length - 1; B >= 0; B--) {
C = y.charCodeAt(B);
A ^= (((A << 5) + C) + (A >> 2));
}
// 192801281 191891864
return Math.abs(A & 2147483647);
}
function po(d){
var n = ''
for (var T = 0; T < d.length; T++) {
if (true) {
if (n == '')
n = d["charCodeAt"](T)["toString"](16);
else
n += d["charCodeAt"](T)["toString"](16);
} else {
console.log("----")
}
}
return n
}
function getCookie(d){
var n = d.length
var j = array_j(d, n)
// console.log(array_j(d, n))
var p = []
// console.log(j['toString'](16))
// 0b7dea01
p[3] = 4
p[5] = 0+j['toString'](16);
// 1854735704
// V += B[rt(0x6c2, '0x458', 0x68d, '0x7e7', '^lQm', 0x783, 0x879)](B[rF(-'0xcb', -0x3f1, -'0x126', '5Y6#', -'0x255', -'0x31f', -'0x47')](B[rw('ccAY', '0x75c', 0x725, 0x6c7, '0x560', 0x7d4, '0x85b')](':' + B[rX(-0x4b, 0x22, 'YwvS', -0x89, '0x45', -0x22d, -0x22c)](o), ':'), A) + ':' + T + ':', s);
// V += B[rt(0x6c2, '0x458', 0x68d, '0x7e7', '^lQm', 0x783, 0x879)](B[rF(-'0xcb', -0x3f1, -'0x126', '5Y6#', -'0x255', -'0x31f', -'0x47')](B[rw('ccAY', '0x75c', 0x725, 0x6c7, '0x560', 0x7d4, '0x85b')](':' + B[rX(-0x4b, 0x22, 'YwvS', -0x89, '0x45', -0x22d, -0x22c)](o), ':'), A) + ':' + T + ':', s);
T = Math.round(Math.random() * 43)
// console.log(T)
// p[1] = T[r7(-0x170, -'0x68', 0xc6, 0xc3, '[07)', -'0x1db', '0x18e') + r7(-'0xc4', '0x7e', -0x13f, -0x16e, 'Lon#', 0x209, -'0x1a6')](0x315 + -0x3e * 0x63 + 0x14f5),
p[1] = T['toString'](16),
// p[2] = T[r3(0x45c, '0x2b7', '0x502', 0x3d2, 0x37a, 0x570, '9^IW') + 'ing'](16)['length'],
p[2] = T['toString'](16)['length']
// 0b700998
// ":false:1854735704:13:416"
V = ":" + ['G-9F25EB6BA0A44B64', 'redefine', 1854735704, T, T*32].join(":")
// window['Date']['now']()
// console.log(Date.now()['toString'](16))
// "G-9F25EB6BA0A44B64:redefine:1854735704:13:416/18b37b14126/11/true/-480/1854735704/f1759ff"
v = '/' + [Date.now()['toString'](16), 11, true, -480, 1854735704, 'f1759ff'].join("/")
V += v
// "G-9F25EB6BA0A44B64:redefine:1854735704:13:416/18b37b14126/11/true/-480/1854735704/f1759ff"
// console.log(V)
V = V['split']('').reverse().join('')
// console.log(V)
p[0] = po(V)
p[4] = 0+ array_j(V, 43)['toString'](16);
// var j = Math.random()
// console.log(p.join(""))
return p.join("")
}
// getCookie('{"data":[{"num":"5004339145","fc":0,"sc":0}],"guid":"","timeZoneOffset":-480}')
// var d = '{"data":[{"num":"5004339145","fc":0,"sc":0}],"guid":"","timeZoneOffset":-480}'
// var j = window[r3(0x1d1, 0x2ca, '0xdb', '0x2e7', '0xde', 0x1bb, '9^IW')][r5('0x577', '0x375', '0x379', '0x3b2', '0x4ec', '0x348', 'QC#j') + 'm']()
// var j = window['Math']['random']()
// T = window[r6('0x2fd', 0x40d, 'ccAY', 0x21a, 0x486, '0x3f3', '0x228')][r5(0x5d6, '0x4b2', 0x42f, 0x603, '0x6c2', 0x3b0, 'n]Ty')](B[rB('0x4d9', 0x3b3, 0x154, '0x2b1', '9^IW', 0x355, '0x20a')](j, d)),
// T = window['Math']['round'](window['Math']['random']() * d),
// B[Pm('0x438', 'YwvS', '0x3c3', 0x247, 0x355, 0x1c0, '0x453')](S, j[Pu('f(an', '0x4a8', 0x449, '0x44e', 0x562, 0x3e5, '0x696') + Pu('g[!k', 0x5c8, '0x505', '0x627', 0x408, 0x5f3, 0x384)](-0x1c9 + -0x1 * 0x1a5d + -0x2 * -0xe1b));
// B['diZdD'](S, j['toString'](16));
// Math['abs'](B[Ph('Lon#', 0x1b3, -0xc7, '0x1e1', -'0x1bb', -'0x133', 0x15)](T, -0x598e1871 + 0xed6b9b57 * -0x1 + 0x1c6f9b3c7))
// W = YQ['configs']['md5'];
//
// W = 'f1759ff'
//
//
// // 'rjzeU': function(d, n) {
// // return d ^ n;
// // },
// // var T = B['rjzeU'](-0x68dfaa8d * 0x1 + 0x65c169ff + 0x51860735, n << -0x757 + 0x249d + -0x1d36), j, V;
// // 1311426215
// var T = 1311426215
//
// // for (j = B[PP('0x6ff', '0x768', '0x5ba', '$9co', 0x858, 0x89c, '0x910')](d[PP(0x58e, 0x75d, '0x852', '8Sai', '0x79d', '0x95c', 0x555) + 'h'], -0x1b9 * 0x15 + 0x88 + -0xbe2 * -0x3); j >= 0x9ea + -0x1f42 + 0x1558; j--) {
// // V = d['charC' + Pi(0x1cc, 0x410, 0x1fe, '0x7e', '0x121', '0x35', '[07)')](j),
// // T ^= B[Ph('&9)Y', '0x1b4', 0x194, -'0x44', '0x9e', 0x292, '0x156')](B[Pi(-'0x56', 0x3ec, '0x1ef', '0x23d', 0x304, 0x48, 'QC#j')](B[PP('0x7a6', 0x5b7, '0x5dd', 'n]Ty', 0x571, '0x682', '0x60c')](T, 0xc6d * 0x3 + -0x13af + 0x1193 * -0x1), V), T >> -0xaa7 + 0xd03 + -0x25a);
// // }
// //
// // for (j = d['length']- 1; j >= 0; j--) {
// // V = d['charCodeAt'](j),
// // T ^= B['MrTte'](B['Bkxgz'](B['whAbN'](T, 0xc6d * 0x3 + -0x13af + 0x1193 * -0x1), V), T >> -0xaa7 + 0xd03 + -0x25a);
// // }
//
//
//
// // var p = []
// // T = 35
// // p[2] = T['toString'](16)['length']
// //
// // console.log(p)
function get_cc(data, token, tttt) {
var o = {
"data": data
}
, n = {
"token": token
};
// console.log("js---------", o, n)
if (true) {
// var u = (new Date).getTime()
var u = tttt
// console.log(u)
var uuuuuu = u
// console.log("new_time", u, uuuuuu)
var i = "//h5api.m.1688.com/h5/mtop.alibaba.alisite.cbu.server.pc.moduleasyncservice/1.0/"
, s = "12574478"
, l = function(e) {
function t(e, t) {
return e << t | e >>> 32 - t
}
function o(e, t) {
var o, n, r, i, a;
return r = 2147483648 & e,
i = 2147483648 & t,
a = (1073741823 & e) + (1073741823 & t),
(o = 1073741824 & e) & (n = 1073741824 & t) ? 2147483648 ^ a ^ r ^ i : o | n ? 1073741824 & a ? 3221225472 ^ a ^ r ^ i : 1073741824 ^ a ^ r ^ i : a ^ r ^ i
}
function n(e, n, r, i, a, s, u) {
return o(t(e = o(e, o(o(function(e, t, o) {
return e & t | ~e & o
}(n, r, i), a), u)), s), n)
}
function r(e, n, r, i, a, s, u) {
return o(t(e = o(e, o(o(function(e, t, o) {
return e & o | t & ~o
}(n, r, i), a), u)), s), n)
}
function i(e, n, r, i, a, s, u) {
return o(t(e = o(e, o(o(function(e, t, o) {
return e ^ t ^ o
}(n, r, i), a), u)), s), n)
}
function a(e, n, r, i, a, s, u) {
return o(t(e = o(e, o(o(function(e, t, o) {
return t ^ (e | ~o)
}(n, r, i), a), u)), s), n)
}
function s(e) {
var t, o = "", n = "";
for (t = 0; 3 >= t; t++)
o += (n = "0" + (e >>> 8 * t & 255).toString(16)).substr(n.length - 2, 2);
return o
}
var u, l, d, c, p, f, h, m, y, g;
for (g = function(e) {
for (var t = e.length, o = t + 8, n = 16 * ((o - o % 64) / 64 + 1), r = Array(n - 1), i = 0, a = 0; t > a; )
i = a % 4 * 8,
r[(a - a % 4) / 4] |= e.charCodeAt(a) << i,
a++;
return i = a % 4 * 8,
r[(a - a % 4) / 4] |= 128 << i,
r[n - 2] = t << 3,
r[n - 1] = t >>> 29,
r
}(e = function(e) {
var t = String.fromCharCode;
e = e.replace(/\r\n/g, "\n");
for (var o, n = "", r = 0; r < e.length; r++)
128 > (o = e.charCodeAt(r)) ? n += t(o) : o > 127 && 2048 > o ? (n += t(o >> 6 | 192),
n += t(63 & o | 128)) : (n += t(o >> 12 | 224),
n += t(o >> 6 & 63 | 128),
n += t(63 & o | 128));
// console.log(e)
return n
}(e)),
f = 1732584193,
h = 4023233417,
m = 2562383102,
y = 271733878,
u = 0; u < g.length; u += 16)
l = f,
d = h,
c = m,
p = y,
h = a(h = a(h = a(h = a(h = i(h = i(h = i(h = i(h = r(h = r(h = r(h = r(h = n(h = n(h = n(h = n(h, m = n(m, y = n(y, f = n(f, h, m, y, g[u + 0], 7, 3614090360), h, m, g[u + 1], 12, 3905402710), f, h, g[u + 2], 17, 606105819), y, f, g[u + 3], 22, 3250441966), m = n(m, y = n(y, f = n(f, h, m, y, g[u + 4], 7, 4118548399), h, m, g[u + 5], 12, 1200080426), f, h, g[u + 6], 17, 2821735955), y, f, g[u + 7], 22, 4249261313), m = n(m, y = n(y, f = n(f, h, m, y, g[u + 8], 7, 1770035416), h, m, g[u + 9], 12, 2336552879), f, h, g[u + 10], 17, 4294925233), y, f, g[u + 11], 22, 2304563134), m = n(m, y = n(y, f = n(f, h, m, y, g[u + 12], 7, 1804603682), h, m, g[u + 13], 12, 4254626195), f, h, g[u + 14], 17, 2792965006), y, f, g[u + 15], 22, 1236535329), m = r(m, y = r(y, f = r(f, h, m, y, g[u + 1], 5, 4129170786), h, m, g[u + 6], 9, 3225465664), f, h, g[u + 11], 14, 643717713), y, f, g[u + 0], 20, 3921069994), m = r(m, y = r(y, f = r(f, h, m, y, g[u + 5], 5, 3593408605), h, m, g[u + 10], 9, 38016083), f, h, g[u + 15], 14, 3634488961), y, f, g[u + 4], 20, 3889429448), m = r(m, y = r(y, f = r(f, h, m, y, g[u + 9], 5, 568446438), h, m, g[u + 14], 9, 3275163606), f, h, g[u + 3], 14, 4107603335), y, f, g[u + 8], 20, 1163531501), m = r(m, y = r(y, f = r(f, h, m, y, g[u + 13], 5, 2850285829), h, m, g[u + 2], 9, 4243563512), f, h, g[u + 7], 14, 1735328473), y, f, g[u + 12], 20, 2368359562), m = i(m, y = i(y, f = i(f, h, m, y, g[u + 5], 4, 4294588738), h, m, g[u + 8], 11, 2272392833), f, h, g[u + 11], 16, 1839030562), y, f, g[u + 14], 23, 4259657740), m = i(m, y = i(y, f = i(f, h, m, y, g[u + 1], 4, 2763975236), h, m, g[u + 4], 11, 1272893353), f, h, g[u + 7], 16, 4139469664), y, f, g[u + 10], 23, 3200236656), m = i(m, y = i(y, f = i(f, h, m, y, g[u + 13], 4, 681279174), h, m, g[u + 0], 11, 3936430074), f, h, g[u + 3], 16, 3572445317), y, f, g[u + 6], 23, 76029189), m = i(m, y = i(y, f = i(f, h, m, y, g[u + 9], 4, 3654602809), h, m, g[u + 12], 11, 3873151461), f, h, g[u + 15], 16, 530742520), y, f, g[u + 2], 23, 3299628645), m = a(m, y = a(y, f = a(f, h, m, y, g[u + 0], 6, 4096336452), h, m, g[u + 7], 10, 1126891415), f, h, g[u + 14], 15, 2878612391), y, f, g[u + 5], 21, 4237533241), m = a(m, y = a(y, f = a(f, h, m, y, g[u + 12], 6, 1700485571), h, m, g[u + 3], 10, 2399980690), f, h, g[u + 10], 15, 4293915773), y, f, g[u + 1], 21, 2240044497), m = a(m, y = a(y, f = a(f, h, m, y, g[u + 8], 6, 1873313359), h, m, g[u + 15], 10, 4264355552), f, h, g[u + 6], 15, 2734768916), y, f, g[u + 13], 21, 1309151649), m = a(m, y = a(y, f = a(f, h, m, y, g[u + 4], 6, 4149444226), h, m, g[u + 11], 10, 3174756917), f, h, g[u + 2], 15, 718787259), y, f, g[u + 9], 21, 3951481745),
f = o(f, l),
h = o(h, d),
m = o(m, c),
y = o(y, p);
return (s(f) + s(h) + s(m) + s(y)).toLowerCase()
}(n.token + "&" + u + "&" + s + "&" + o.data)
return l
}
}
// console.log(get_cc('{"componentKey":"wp_pc_contactsmall","params":"{\\"memberId\\":\\"b2b-221508014933339588\\"}"}', undefined, 1694402662622))
// b60c3de40116fe884e6c675b8883e67d
// 5a38b5406b24efd85255a0bfc0c89d97
// index.web.cmd.js
\ No newline at end of file
import time
import sys, os
import logging
import requests
import pandas as pd
from urllib.parse import urlparse
from func_timeout import func_set_timeout
from sqlalchemy.exc import OperationalError
from func_timeout.exceptions import FunctionTimedOut
sys.path.append(os.path.dirname(os.path.dirname(sys.path[0])))  # parent directory
from amazon_spider.utils.utils import send_mg
from amazon_spider.db.redis_db import sadd, expire
from amazon_spider.db.mysql_db import get_country_engine
class AsinStateFind:
"""
清理_self_all_syn内非(每周和每月)asin
每月1次(广告) 、每周1次(竞品)、 BSR榜单每日1次、新品榜单每日1次、erp优惠券每日1次、erp前台异常每日1次
数据分组 聚合 data_type字段 去重后插入到 _self_all_syn表
"""
def __init__(self, site):
self.site = site
self.conn = None
self.sites = {
"us": "Amazon.com",
"uk": "Amazon.co.uk",
"de": "Amazon.de",
"es": "Amazon.es",
"it": "Amazon.it",
"fr": "Amazon.fr",
"mx": "Amazon.com.mx",
"ca": "Amazon.ca",
}
# logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s %(message)s',
# level=logging.INFO)
@func_set_timeout(10)
def get_bsr_day_asin(self):
        # BSR list once a day, new-release list once a day
# sql = f"SELECT distinct asin from {self.site}_self_all_syn WHERE state in (1, 2) and site='{self.site}' and data_type like '%%4%%' and date_info is not null;"
sql = f"SELECT distinct asin from {self.site}_self_all_syn WHERE state in (1, 2) and site='{self.site}';"
# 测试
# sql = f"-- SELECT distinct asin, date_info from bsr_day_asin WHERE created_at>='2023-05-15' and site_name='{self.site}';"
df_bsr_asin = pd.read_sql(sql, con=get_country_engine(self.site))
if df_bsr_asin.shape[0] > 200:
return False
else:
return True
def if_bsr_spider_state(self):
while True:
try:
bl = self.get_bsr_day_asin()
break
except OperationalError as e:
logging.info(f"查看每日bsr是否爬取完成失败 连接错误{e}")
continue
except FunctionTimedOut as e:
logging.info(f"查看每日bsr是否爬取完成超时 连接错误{e}")
continue
if bl:
sql = f"SELECT date_info from bsr_day_asin order by date_info desc limit 1;"
while True:
try:
df_bsr_asin = pd.read_sql(sql, con=get_country_engine(self.site))
break
except OperationalError as e:
logging.info(f"查看每日bsr是否爬取完成失败 连接错误{e}")
continue
str_time = time.strftime("%Y-%m-%d", time.localtime())
print(list(df_bsr_asin["date_info"])[0])
url = "http://selection.yswg.com.cn:8080/soundasia_selection/workflow/emit"
if not sadd("bsr_day_asin", url+f"#{str_time}"):
print("已经发送过请求")
else:
seconds = 82800
expire(f'bsr_day_asin', seconds)
data = {
"siteName": "us",
"tableName": "self_asin_day",
"dateType": "day",
"reportDate": list(df_bsr_asin["date_info"])[0],
"status": "asin爬取完成",
"statusVal": 10,
"remark": "",
"isEnd": "否"
}
headers = {
'Connection': 'close',
'authority': urlparse(url).hostname,
'accept': 'text/html,*/*',
'accept-language': '*',
"Content-Type": 'application/json',
'origin': url,
# 'referer': f'{url}/Bosch-ROS20VSK-Palm-Sander-Collector/product-reviews/B0018Z8D64/ref=cm_cr_arp_d_paging_btm_next_3?ie=UTF8&reviewerType=all_reviews&pageNumber=3',
}
responses = requests.post(url, json=data, headers=headers)
print(responses.text)
print(responses.status_code)
if "操作失败" in responses.json().get("message"):
send_mg("hezhe", "【bsr爬取提醒接口调用失败】", "请求bsr爬取提醒接口失败")
else:
print("爬取完成")
else:
print("state未爬取完")
# AsinStateFind("us").if_bsr_spider_state()
import time
import logging
import pandas as pd
from queue import Queue
from func_timeout import func_set_timeout
from sqlalchemy.exc import OperationalError
from amazon_spider.utils.utils import send_mg
from func_timeout.exceptions import FunctionTimedOut
from amazon_spider.db.mysql_db import get_country_engine, df_to_sql
from amazon_spider.utils.common import is_internet_available
from amazon_spider.db.pg_db import get_pg_country_engine, get_14pg_country_engine, updatas_pg_asin
# useful for handling different item types with a single interface
class AmazonVariatSpiderPipeline:
def __init__(self, site):
self.site = site
self.q_dict = {
"inner_item_queue": Queue(),
# "variat_item_queue": Queue(),
# "asin_img_queue": Queue(),
"error_queue": Queue(),
# "self_variat_queue": Queue(),
}
self.num = 50
self.save_num = 20
@classmethod
def from_crawler(cls, crawler):
return cls(
site=crawler.spider.site
)
@func_set_timeout(300)
def up_del_dis(self, sql, data=None, site="us", db="mysql"):
if db == "mysql":
e = get_country_engine(site)
elif db == "pg":
e = get_pg_country_engine(site)
elif db == "pg14":
e = get_14pg_country_engine(site)
try:
if not is_internet_available():
return False
with e.connect() as conn:
                if data is not None:
if data:
conn.execute(sql, data)
else:
conn.execute(sql)
e.dispose()
# conn.commit()
# conn.close()
return True
except OperationalError as e:
logging.info(f"error sql is {sql} {str(e)}")
return False
def up_del_db(self, sql, data=None, site="us", db="mysql"):
if 'delete' in sql.lower():
sql_msg = "delete"
else:
sql_msg = "update"
while True:
try:
if is_internet_available():
if self.up_del_dis(sql, data=data, site=site, db=db):
logging.info(f"{sql_msg} {db} asin state 3 ok ^_^ -----{len(data or []) or sql}---------{[][0:5] if data is None else data[0:5]}")
break
else:
time.sleep(3)
logging.info(
f"{sql_msg} {db} asin state 3 error T_T --> {len(data or []) or sql}---------{[][0:5] if data is None else data[0:5]}")
continue
else:
time.sleep(3)
logging.info(
f"{sql_msg} {db} asin state 3 network error T_T --> {len(data or []) or sql}---------{[][0:5] if data is None else data[0:5]}")
continue
except FunctionTimedOut as e:
if "pg" in db and 'asin_image' in sql:
sql_backend = f"SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE query={sql} AND query not like '%pg_terminate_backend%';"
logging.info(f"sql_backend --> {sql_backend}")
if is_internet_available():
if updatas_pg_asin(sql_backend, site=site, db=db):
logging.info(f"{sql_msg} {db} pg_terminate_backend ok ^_^ -----{data[0][0]}---------")
continue
else:
time.sleep(3)
logging.info(
f"{sql_msg} {db} pg_terminate_backend asin T_T --> {data[0][0]}---------")
continue
else:
time.sleep(3)
logging.info(
f"{sql_msg} {db} asin state 3 time out T_T --> {e}----{len(data or []) or sql}---------{[][0:5] if data is None else data[0:5]}")
continue
def asin_state_to_list(self, df):
df_9 = df.loc[df.volume.isna() & df.weight.isna() & df[
"rank"].isna() & df.launch_time.isna() & df.price.isna() & df.rating.isna() & df.total_comments.isna()]
df_ = df.loc[~(df.volume.isna() & df.weight.isna() & df[
"rank"].isna() & df.launch_time.isna() & df.price.isna() & df.rating.isna() & df.total_comments.isna())]
df_7 = df_.loc[df.volume.isna() & df.weight.isna() & df["rank"].isna() & df.launch_time.isna()]
df_3 = df_.loc[~(df.volume.isna() & df.weight.isna() & df["rank"].isna() & df.launch_time.isna())]
df_9["state"] = 9
df_7["state"] = 7
df_3["state"] = 3
df_9 = df_9.loc[:, ["state", "asin", "site"]]
df_7 = df_7.loc[:, ["state", "asin", "site"]]
df_3 = df_3.loc[:, ["state", "asin", "site"]]
asin_list = []
asin_list += [list(i) for i in df_9.values]
asin_list += [list(i) for i in df_7.values]
asin_list += [list(i) for i in df_3.values]
return asin_list
def save_db(self, table, df, site, db):
        # retry on DB write errors
while True:
try:
if df_to_sql(table, df, site=site, db=db):
logging.info(
f"更新 {db} 数据库 {table} -----{df.shape}---------{df.head()}")
break
else:
logging.info(f"更新 {db} 数据库 {table} -----失败")
continue
except OperationalError as e:
logging.info(f"更新 {db} 数据库 {table} 失败 连接错误{e}")
continue
except FunctionTimedOut as e:
logging.info(
f"更新 {db} 数据库 {table} -超时-{e}---{df.shape}---------{df.head()}")
continue
def queue_consumer(self, q_size):
for k, v in self.q_dict.items():
if q_size == "max":
if v.qsize():
dates = [v.get() for i in range(0, v.qsize())]
else:
dates = []
else:
if v.qsize() >= self.num:
dates = [v.get() for i in range(0, self.num)]
else:
dates = []
if dates:
df = pd.DataFrame(dates)
if k == "inner_item_queue":
if dates:
if df.shape[0]:
self.save_db(f"{self.site}_self_detail_returns", df, self.site, "mysql")
logging.info(f"{self.site}_self_detail_returns {df.shape}")
# if self.site == "us":
# self.save_db(f"{self.site}_self_asin_detail_{time.gmtime().tm_year}", df, self.site, "pg")
sql_up = f"UPDATE `{self.site}_self_asin_returns_new` set `state`=(%s) where asin=(%s) and site=(%s);"
asin_list = self.asin_state_to_list(df)
if len(asin_list) == 1:
d = asin_list[0]
else:
d = asin_list
self.up_del_db(sql_up, d, self.site, db="mysql")
elif k == "error_queue":
if dates:
                        # the table name still needs to be changed
sql_up = f"UPDATE `{self.site}_self_asin_returns_new` set `state`=(%s) where asin=(%s) and site=(%s);"
up_datas = [list(i) for i in df.values]
if len(up_datas) == 1:
d = up_datas[0]
else:
d = up_datas
self.up_del_db(sql_up, d, self.site, db="mysql")
logging.info(f"{self.site}_self_asin_returns_new {df.shape}")
def process_item(self, item, spider):
if item.get("finish_spider"):
print('等待时 将队列数据存储', {k: v.qsize() for k, v in self.q_dict.items()})
self.queue_consumer(q_size="max")
if item.get("inner_item"):
self.q_dict.get("inner_item_queue").put(item.get('inner_item'))
elif item.get("error_asin"):
self.q_dict.get("error_queue").put(item.get("asin"))
self.queue_consumer("min")
def close_spider(self, spider):
print('爬虫结束,存储最后 数据', {k: v.qsize() for k, v in self.q_dict.items()})
self.queue_consumer("max")
send_mg("hezhe", "【内部asin爬取进程退出】", "内部asin爬取进程退出")
\ No newline at end of file
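# Hedged wiring sketch: the pipeline above is enabled through Scrapy's standard
# ITEM_PIPELINES setting (here, or in the spider's custom_settings); the dotted path
# and priority are assumptions.
# ITEM_PIPELINES = {
#     "amazon_spider.pipelines.AmazonVariatSpiderPipeline": 300,
# }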
import logging
import pandas as pd
from queue import Queue
from sqlalchemy.exc import OperationalError
from func_timeout.exceptions import FunctionTimedOut
from amazon_spider.db.mysql_db import df_to_sql
from amazon_spider.db.mysql_db import del_mysql_asin
# useful for handling different item types with a single interface
class AmazonMxSpiderPipeline:
def __init__(self, site):
self.site = site
self.q_dict = {"asin_queue": Queue(), "error_queue": Queue()}
self.num = 1
self.save_num = 20
@classmethod
def from_crawler(cls, crawler):
return cls(
site=crawler.spider.site
)
def process_item(self, item, spider):
if item.get("error_asin"):
self.q_dict.get("error_queue").put((1, item.get("asin"), spider.short_site_to_dict(item.get("site"))))
else:
self.q_dict.get("asin_queue").put(list(item.values()))
sql_up = f"UPDATE `mx_self_asin` set state=(%s) where asin=(%s) and site=(%s);"
for k, v in self.q_dict.items():
if k == "asin_queue" and v.qsize() >= self.num:
state_num = 3
dates = [v.get() for i in range(0, self.num)]
df = pd.DataFrame(dates, columns=spider.col)
try:
df_to_sql("mx_self_asin_detail", df, site=self.site, db="mysql")
logging.info(f"入库成功-----{len(dates)}---------{dates}")
except OperationalError as e:
logging.info(f"入库失败 连接错误{e}")
state_num = 1
except FunctionTimedOut as e:
logging.info(f"入库-超时{e}----{len(dates)}---------{dates}")
state_num = 1
asin_list = [(state_num, i[0], i[-1]) for i in dates]
if len(asin_list) == 1:
d = asin_list[0]
else:
d = asin_list
while True:
try:
if del_mysql_asin(sql_up, data=d, site=self.site):
logging.info(f"修改asin状态3-----{len(d)}---------{d}")
break
else:
logging.info(f"修改asin状态3-失败----{len(d)}---------{d}")
continue
except FunctionTimedOut as e:
logging.info(f"修改asin状态3-超时{e}----{len(d)}---------{d}")
continue
elif k == "error_queue" and v.qsize() >= self.num:
dates = [v.get() for i in range(0, self.num)]
print(dates)
if len(dates) == 1:
d = dates[0]
else:
d = dates
while True:
try:
if del_mysql_asin(sql_up, data=d, site=self.site):
logging.info(f"修改asin状态3-----{len(d)}---------{d}")
break
else:
logging.info(f"修改asin状态3-失败----{len(d)}---------{d}")
continue
except FunctionTimedOut as e:
logging.info(f"修改asin状态3-超时{e}----{len(d)}---------{d}")
continue
def close_spider(self, spider):
print('爬虫结束,存储最后 数据', {k: v.qsize() for k, v in self.q_dict.items()})
sql_up = f"UPDATE `mx_self_asin` set state=(%s) where asin=(%s) and site=(%s);"
for k, v in self.q_dict.items():
if k == "asin_queue":
state_num = 3
dates = [v.get() for i in range(0, v.qsize())]
df = pd.DataFrame(dates, columns=spider.col)
try:
df_to_sql("mx_self_asin_detail", df, site=self.site, db="mysql")
logging.info(f"入库成功-----{len(dates)}---------{dates}")
except OperationalError as e:
logging.info(f"入库失败 连接错误{e}")
state_num = 1
except FunctionTimedOut as e:
logging.info(f"入库-超时{e}----{len(dates)}---------{dates}")
state_num = 1
                asin_list = [(state_num, i[0], i[-1]) for i in dates]
if len(asin_list) == 1:
d = asin_list[0]
else:
d = asin_list
while True:
try:
if del_mysql_asin(sql_up, data=d, site=self.site):
logging.info(f"修改asin状态3-----{len(d)}---------{d}")
break
else:
logging.info(f"修改asin状态3-失败----{len(d)}---------{d}")
continue
except FunctionTimedOut as e:
logging.info(f"修改asin状态3-超时{e}----{len(d)}---------{d}")
continue
elif k == "error_queue" and v.qsize() >= self.num:
dates = [v.get() for i in range(0, self.num)]
if len(dates) == 1:
d = dates[0]
else:
d = dates
while True:
try:
if del_mysql_asin(sql_up, data=d, site=self.site):
logging.info(f"修改asin状态3-----{len(d)}---------{d}")
break
else:
logging.info(f"修改asin状态3-失败----{len(d)}---------{d}")
continue
except FunctionTimedOut as e:
logging.info(f"修改asin状态3-超时{e}----{len(d)}---------{d}")
continue
\ No newline at end of file
import logging
import time
import pandas as pd
from queue import Queue
# useful for handling different item types with a single interface
from amazon_spider.conf.db import selection_table_name
from amazon_spider.utils.common import is_internet_available
from amazon_spider.db.mysql_db import sql_update, sql_update_many, sql_connect, sql_insert_many, sql_insert
class AmazonCommentNewsSpiderPipeline:
def __init__(self, site="us"):
self.site = site
self.comment_table_name = f"{self.site}_asin_comment"
# self.comment_table_name = selection_table_name.get(f"{self.site}_comment_table")
self.asin_table_name = selection_table_name.get(f"{self.site}_asin_variat")
self.comment_count_table = selection_table_name.get(f"{self.site}_comment_num_table")
self.q_dict = {
"comment_count_queue": Queue(),
"error_queue": Queue(),
"comment_queue": Queue(),
}
self.num = 50
self.cols_comment_num_list = [
'parent_asin', 'comment_num'
]
sql_connect(self.site)
@classmethod
def from_crawler(cls, crawler):
return cls(
site=crawler.spider.site
)
def queue_consumer(self, q_size):
for k, v in self.q_dict.items():
if q_size == "max":
if v.qsize():
if k == 'comment_queue':
dates = []
for i in range(0, v.qsize()):
dates += v.get()
else:
dates = [v.get() for i in range(0, v.qsize())]
else:
dates = []
else:
if v.qsize() >= self.num:
if k == 'comment_queue':
dates = []
for i in range(0, self.num):
dates += v.get()
else:
dates = [v.get() for i in range(0, self.num)]
else:
dates = []
if dates:
if k == 'comment_count_queue':
inset_sql = f"insert into `{self.comment_count_table}` (`parent_asin`, `comment_num`, `star`) values (%s, %s, %s) ON DUPLICATE KEY UPDATE `parent_asin` = values(`parent_asin`), `comment_num` = values(`comment_num`), `star` = values(`star`);"
while True:
if is_internet_available():
if len(dates) == 1:
sql_insert(inset_sql, dates[0])
logging.info(f"asin_comment_num {dates[0]}")
break
else:
sql_insert_many(inset_sql, dates)
logging.info(f"asin_comment_num {dates}")
break
else:
time.sleep(3)
logging.info(f"requests baidu error --> T_T")
continue
elif k == 'error_queue':
sql_up = f"UPDATE `{self.asin_table_name}` set `state`=(%s), `comment_new_time`=(%s) where `parent_asin`=(%s);"
while True:
if is_internet_available():
if len(dates) == 1:
sql_update(sql_up, dates[0])
logging.info(f"{self.asin_table_name} {dates[0]}")
break
else:
sql_update_many(sql_up, dates)
logging.info(f"{self.asin_table_name} {dates}")
break
else:
time.sleep(3)
logging.info(f"requests baidu error --> T_T")
elif k == 'comment_queue':
cols_list = ['asin', 'parent_asin', 'title', 'content', 'is_vp', 'model', 'rating', 'agree_num',
'img_num',
'img_url', 'is_video', 'video_url', 'comment_url', 'user_name', 'user_img', 'country',
'user_page', 'is_earns_commissions', 'comment_time', 'page', 'star', 'vine_review_flag', 'comment_id', 'page_state']
df = pd.DataFrame(dates, columns=cols_list)
logging.info(f"去重前长度 {df.shape} ")
df.drop_duplicates(subset=["comment_id"], inplace=True)
logging.info(f"去重后长度 {df.shape} ")
inset_sql = f"insert into `{self.comment_table_name}` (`asin`, `parent_asin`, `title`, `content`, `is_vp`, `model`, `rating`,`agree_num`, `img_num`, `img_url`, `is_video`, `video_url`,`comment_url`, `user_name`, `user_img`, `country`, `user_page`,`is_earns_commissions`, `comment_time`, `page`, `star`, `vine_review_flag`, `comment_id`,`page_state`) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE `asin` = values(`asin`), `parent_asin` = values(`parent_asin`), `title` = values(`title`), `content` = values(`content`), `is_vp` = values(`is_vp`), `model` = values(`model`), `rating` = values(`rating`), `agree_num` = values(`agree_num`), `img_num` = values(`img_num`), `img_url` = values(`img_url`), `is_video` = values(`is_video`), `video_url` = values(`video_url`), `comment_url` = values(`comment_url`), `user_name` = values(`user_name`), `user_img` = values(`user_img`), `country` = values(`country`), `user_page` = values(`user_page`), `is_earns_commissions` = values(`is_earns_commissions`), `comment_time` = values(`comment_time`), `page` = values(`page`), `star` = values(`star`), `vine_review_flag` = values(`vine_review_flag`), `comment_id` = values(`comment_id`), `page_state` = values(`page_state`);"
while True:
if is_internet_available():
if len(dates) == 1:
sql_insert(inset_sql, dates[0])
logging.info(
f"入库成功--> 长度: {len(list(set(df.asin)))} asin: {list(set(df.asin))[:5]} parent_asin: {list(set(df.parent_asin))[:5]}")
break
else:
sql_insert_many(inset_sql, dates)
logging.info(
f"入库成功--> 长度: {len(list(set(df.asin)))} asin: {list(set(df.asin))[:5]} parent_asin: {list(set(df.parent_asin))[:5]}")
break
else:
time.sleep(3)
logging.info(f"requests baidu error --> T_T")
continue
def process_item(self, item, spider):
if item.get("finish_spider"):
x = {k: v.qsize() for k, v in self.q_dict.items()}
logging.info(f'sleep to queue data save {x}')
self.queue_consumer(q_size="max")
if q := item.get('queues_'):
data = []
for i in range(0, q.qsize()):
data += q.get()
max_comment_time = max([d[18] for d in data])
self.q_dict.get("comment_queue").put(data)
self.q_dict.get("error_queue").put((3, max_comment_time, item.get("asin")))
self.q_dict.get("comment_count_queue").put((item.get("asin"), item.get("comment_count"), item.get("star")))
elif item.get("count_max") and (not item.get("error_asin")):
            # add the data to the queues
self.q_dict.get("comment_count_queue").put((item.get("asin"), item.get("comment_count"), item.get("star")))
self.q_dict.get("error_queue").put(item.get("sql_data"))
elif item.get("error_asin"):
self.q_dict.get("error_queue").put(item.get("sql_data"))
self.queue_consumer("min")
#
def close_spider(self, spider):
print(f'{self.site} 爬虫结束,存储最后 数据', {k: v.qsize() for k, v in self.q_dict.items()})
self.queue_consumer("max")
import logging
import pandas as pd
from queue import Queue
from sqlalchemy.exc import OperationalError
from func_timeout.exceptions import FunctionTimedOut
# useful for handling different item types with a single interface
from amazon_spider.db.mysql_db import df_to_sql
from amazon_spider.db.mysql_db import del_mysql_asin
class AmazonUpcSpiderPipeline:
def __init__(self, site):
self.site = site
self.q_dict = {"asin_queue": Queue(), "error_queue": Queue()}
self.num = 150
self.save_num = 150
@classmethod
def from_crawler(cls, crawler):
return cls(
site=crawler.spider.site
)
def process_item(self, item, spider):
if item.get("error_asin"):
self.q_dict.get("error_queue").put(item.get("asin"))
else:
self.q_dict.get("asin_queue").put(item)
sql_up = f"UPDATE `asin_video_spider` set state=(%s) where asin=(%s) and site=(%s);"
for k, v in self.q_dict.items():
if k == "asin_queue" and v.qsize() >= self.num:
state_num = 3
dates = [v.get() for i in range(0, self.num)]
df = pd.DataFrame(dates)
try:
df_to_sql("asin_video_detail", df, site="us", db="mysql")
logging.info(f"入库成功-----{len(dates)}---------{dates}")
except OperationalError as e:
logging.info(f"入库失败 连接错误{e}")
state_num = 1
except FunctionTimedOut as e:
logging.info(f"入库-超时{e}----{len(dates)}---------{dates}")
state_num = 1
asin_list = [(state_num, i["asin"], i['site']) for i in dates]
if len(asin_list) == 1:
d = asin_list[0]
else:
d = asin_list
while True:
try:
if del_mysql_asin(sql_up, data=d, site="us"):
logging.info(f"修改asin状态3-----{len(d)}---------{d}")
break
else:
logging.info(f"修改asin状态3-失败----{len(d)}---------{d}")
continue
except FunctionTimedOut as e:
logging.info(f"修改asin状态3-超时{e}----{len(d)}---------{d}")
continue
elif k == "error_queue" and v.qsize() >= self.num:
dates = [v.get() for i in range(0, self.num)]
if len(dates) == 1:
d = dates[0]
else:
d = dates
while True:
try:
if del_mysql_asin(sql_up, data=d, site="us"):
logging.info(f"修改asin状态3-----{len(d)}---------{d}")
break
else:
logging.info(f"修改asin状态3-失败----{len(d)}---------{d}")
continue
except FunctionTimedOut as e:
logging.info(f"修改asin状态3-超时{e}----{len(d)}---------{d}")
continue
def close_spider(self, spider):
print('spider finished, saving the remaining data', {k: v.qsize() for k, v in self.q_dict.items()})
sql_up = f"UPDATE `asin_video_spider` set state=(%s) where asin=(%s) and site=(%s);"
for k, v in self.q_dict.items():
if k == "asin_queue":
state_num = 3
dates = [v.get() for i in range(0, v.qsize())]
df = pd.DataFrame(dates)
try:
df_to_sql("asin_video_detail", df, site="us", db="mysql")
logging.info(f"入库成功-----{len(dates)}---------{dates}")
except OperationalError as e:
logging.info(f"入库失败 连接错误{e}")
state_num = 1
except FunctionTimedOut as e:
logging.info(f"入库-超时{e}----{len(dates)}---------{dates}")
state_num = 1
asin_list = [(state_num, i["asin"], i["site"]) for i in dates]
if len(asin_list) == 1:
d = asin_list[0]
else:
d = asin_list
while True:
try:
if del_mysql_asin(sql_up, data=d, site="us"):
logging.info(f"修改asin状态3-----{len(d)}---------{d}")
break
else:
logging.info(f"修改asin状态3-失败----{len(d)}---------{d}")
continue
except FunctionTimedOut as e:
logging.info(f"修改asin状态3-超时{e}----{len(d)}---------{d}")
continue
elif k == "error_queue" and v.qsize() >= self.num:
dates = [v.get() for i in range(0, self.num)]
if len(dates) == 1:
d = dates[0]
else:
d = dates
while True:
try:
if del_mysql_asin(sql_up, data=d, site="us"):
logging.info(f"修改asin状态3-----{len(d)}---------{d}")
break
else:
logging.info(f"修改asin状态3-失败----{len(d)}---------{d}")
continue
except FunctionTimedOut as e:
logging.info(f"修改asin状态3-超时{e}----{len(d)}---------{d}")
continue
\ No newline at end of file
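The two AmazonUpcSpiderPipeline variants repeat the same "persist the batch, then mark asin state, retrying on timeout" block in both process_item and close_spider; below is a hedged refactoring sketch of that block, where write_batch and update_state are hypothetical callables standing in for df_to_sql and del_mysql_asin, not the project's actual helpers.
import logging

import pandas as pd
from func_timeout.exceptions import FunctionTimedOut
from sqlalchemy.exc import OperationalError


def flush_batch(rows, write_batch, update_state):
    # try to persist the batch; fall back to state 1 (re-crawl later) on failure
    state_num = 3
    try:
        write_batch(pd.DataFrame(rows))
        logging.info("insert succeeded: %s rows", len(rows))
    except (OperationalError, FunctionTimedOut) as e:
        logging.info("insert failed, resetting state: %s", e)
        state_num = 1
    params = [(state_num, r["asin"], r["site"]) for r in rows]
    # keep retrying the state update until it goes through
    while True:
        try:
            if update_state(params):
                logging.info("updated asin state to %s for %s rows", state_num, len(params))
                return
            logging.info("state update returned falsy, retrying")
        except FunctionTimedOut as e:
            logging.info("state update timed out, retrying: %s", e)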
import logging
import pandas as pd
from queue import Queue
from sqlalchemy.exc import OperationalError
from func_timeout.exceptions import FunctionTimedOut
# useful for handling different item types with a single interface
from amazon_spider.db.mysql_db import df_to_sql
from amazon_spider.db.mysql_db import del_mysql_asin
class AmazonUpcSpiderPipeline:
def __init__(self, site):
self.site = site
self.q_dict = {"asin_queue": Queue(), "error_queue": Queue()}
self.num = 80
self.save_num = 80
@classmethod
def from_crawler(cls, crawler):
return cls(
site=crawler.spider.site
)
def process_item(self, item, spider):
if item.get("error_asin"):
self.q_dict.get("error_queue").put(item.get("asin"))
else:
self.q_dict.get("asin_queue").put(item)
sql_up = f"UPDATE `asin_video_spider` set state=(%s) where asin=(%s) and site=(%s);"
for k, v in self.q_dict.items():
if k == "asin_queue" and v.qsize() >= self.num:
state_num = 3
dates = [v.get() for i in range(0, self.num)]
df = pd.DataFrame(dates)
try:
df_to_sql("asin_bsr__video_url", df, site="us", db="mysql")
logging.info(f"入库成功-----{len(dates)}---------{dates}")
except OperationalError as e:
logging.info(f"入库失败 连接错误{e}")
state_num = 1
except FunctionTimedOut as e:
logging.info(f"入库-超时{e}----{len(dates)}---------{dates}")
state_num = 1
asin_list = [(state_num, i["asin"], i['site']) for i in dates]
if len(asin_list) == 1:
d = asin_list[0]
else:
d = asin_list
while True:
try:
if del_mysql_asin(sql_up, data=d, site="us"):
logging.info(f"修改asin状态3-----{len(d)}---------{d}")
break
else:
logging.info(f"修改asin状态3-失败----{len(d)}---------{d}")
continue
except FunctionTimedOut as e:
logging.info(f"修改asin状态3-超时{e}----{len(d)}---------{d}")
continue
elif k == "error_queue" and v.qsize() >= self.num:
dates = [v.get() for i in range(0, self.num)]
if len(dates) == 1:
d = dates[0]
else:
d = dates
while True:
try:
if del_mysql_asin(sql_up, data=d, site="us"):
logging.info(f"修改asin状态3-----{len(d)}---------{d}")
break
else:
logging.info(f"修改asin状态3-失败----{len(d)}---------{d}")
continue
except FunctionTimedOut as e:
logging.info(f"修改asin状态3-超时{e}----{len(d)}---------{d}")
continue
def close_spider(self, spider):
print('spider finished, saving the remaining data', {k: v.qsize() for k, v in self.q_dict.items()})
sql_up = f"UPDATE `asin_video_spider` set state=(%s) where asin=(%s) and site=(%s);"
for k, v in self.q_dict.items():
if k == "asin_queue":
state_num = 3
dates = [v.get() for i in range(0, v.qsize())]
df = pd.DataFrame(dates)
try:
df_to_sql("asin_bsr__video_url", df, site="us", db="mysql")
logging.info(f"入库成功-----{len(dates)}---------{dates}")
except OperationalError as e:
logging.info(f"入库失败 连接错误{e}")
state_num = 1
except FunctionTimedOut as e:
logging.info(f"入库-超时{e}----{len(dates)}---------{dates}")
state_num = 1
asin_list = [(state_num, i["asin"], i["site"]) for i in dates]
if len(asin_list) == 1:
d = asin_list[0]
else:
d = asin_list
while True:
try:
if del_mysql_asin(sql_up, data=d, site="us"):
logging.info(f"修改asin状态3-----{len(d)}---------{d}")
break
else:
logging.info(f"修改asin状态3-失败----{len(d)}---------{d}")
continue
except FunctionTimedOut as e:
logging.info(f"修改asin状态3-超时{e}----{len(d)}---------{d}")
continue
elif k == "error_queue" and v.qsize() >= self.num:
dates = [v.get() for i in range(0, self.num)]
if len(dates) == 1:
d = dates[0]
else:
d = dates
while True:
try:
if del_mysql_asin(sql_up, data=d, site="us"):
logging.info(f"修改asin状态3-----{len(d)}---------{d}")
break
else:
logging.info(f"修改asin状态3-失败----{len(d)}---------{d}")
continue
except FunctionTimedOut as e:
logging.info(f"修改asin状态3-超时{e}----{len(d)}---------{d}")
continue
\ No newline at end of file
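Both variants read the target site from the running spider via crawler.spider.site in from_crawler. A minimal sketch of how such a spider and pipeline are wired together in Scrapy follows; the spider name, module path, and priority number are assumptions for illustration, not taken from this repository's settings.
import scrapy


class VideoUrlDemoSpider(scrapy.Spider):
    name = "video_url_demo"
    site = "us"  # the pipelines above read this attribute via crawler.spider.site

    custom_settings = {
        # lower number = earlier in the item pipeline chain
        "ITEM_PIPELINES": {"amazon_spider.pipelines.AmazonUpcSpiderPipeline": 300},
    }

    def start_requests(self):
        # placeholder: a real spider would yield scrapy.Request objects here
        yield from ()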
import time
import logging
import pandas as pd
from queue import Queue
# useful for handling different item types with a single interface
from amazon_spider.utils.common import is_internet_available
from amazon_spider.db.mysql_db import sql_update, sql_update_many, sql_connect, sql_insert_many, sql_insert
class ContactInfoPipeline:
def __init__(self, site):
self.site = site
self.q_dict = {
"inner_item_queue": Queue(),
"error_queue": Queue(),
}
self.num = 1
sql_connect(self.site)
@classmethod
def from_crawler(cls, crawler):
return cls(
site=crawler.spider.site
)
def sql_error_retry(self, f, sql, d):
while True:
if is_internet_available():
return f(sql, d)
else:
time.sleep(3)
logging.info(f"requests baidu error --> T_T")
continue
def queue_consumer(self, q_size):
for k, v in self.q_dict.items():
if q_size == "max":
if v.qsize():
dates = [v.get() for i in range(0, v.qsize())]
else:
dates = []
else:
if v.qsize() >= self.num:
dates = [v.get() for i in range(0, self.num)]
else:
dates = []
if dates:
df = pd.DataFrame(dates)
if k == "inner_item_queue":
if dates:
inset_sql = f"insert into `company_info_1688` (`company_name`, `mobileNo`, `phoneNum`, `fax`, `contact_name`, `position`, `memberId`, `card_url`, `home_url`, `address`, `state`, `updated_at`) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE `company_name` = values(`company_name`), `mobileNo` = values(`mobileNo`), `phoneNum` = values(`phoneNum`), `fax` = values(`fax`), `contact_name` = values(`contact_name`), `position` = values(`position`), `card_url` = values(`card_url`), `home_url` = values(`home_url`), `address` = values(`address`), `state` = values(`state`), `updated_at` = values(`updated_at`);"
d = [list(i) for i in df.values]
if len(d) == 1:
sql_id = self.sql_error_retry(sql_insert, inset_sql, d[0])
else:
sql_id = self.sql_error_retry(sql_insert_many, inset_sql, d)
logging.info(f"sql_id {sql_id} save company_info_1688 succeed {d[0:3]}")
elif k == "error_queue":
if dates:
up_datas = [list(i) for i in df.values]
# NOTE: the table name still needs to be changed
sql_up = f"UPDATE `company_info_1688` set `state`=(%s) where memberId=(%s);"
if len(up_datas) == 1:
sql_id = self.sql_error_retry(sql_update, sql_up, up_datas[0])
else:
sql_id = self.sql_error_retry(sql_update_many, sql_up, up_datas)
logging.info(f"sql_id {sql_id} update company_info_1688 succeed {up_datas[0:3]}")
def process_item(self, item, spider):
if item.get("finish_spider"):
print('spider idle, flushing queued data', {k: v.qsize() for k, v in self.q_dict.items()})
self.queue_consumer(q_size="max")
if item.get("inner_item"):
self.q_dict.get("inner_item_queue").put(item.get('inner_item'))
elif item.get("error_asin"):
self.q_dict.get("error_queue").put(item.get("memberId"))
self.queue_consumer("min")
def close_spider(self, spider):
print('spider finished, saving the remaining data', {k: v.qsize() for k, v in self.q_dict.items()})
self.queue_consumer(q_size="max")
\ No newline at end of file
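ContactInfoPipeline's queue_consumer drains its queues under two policies: "min" flushes only once a full batch of self.num items is queued, while "max" (used on finish_spider and in close_spider) flushes whatever is left. A standalone sketch of that policy, with a print standing in for the real SQL writers:
from queue import Queue


def drain(q: Queue, num: int, mode: str):
    # "max": take everything currently queued; "min": take a full batch or nothing
    if mode == "max":
        batch_size = q.qsize()
    else:
        batch_size = num if q.qsize() >= num else 0
    batch = [q.get() for _ in range(batch_size)]
    if batch:
        print(f"would persist {len(batch)} rows: {batch[:3]}")
    return batch


if __name__ == "__main__":
    q = Queue()
    for i in range(5):
        q.put({"memberId": i, "state": 3})
    drain(q, num=10, mode="min")   # below threshold: nothing flushed
    drain(q, num=10, mode="max")   # shutdown path: flush all 5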