Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
S
spider
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
selection-new
spider
Commits
8dd9963f
Commit
8dd9963f
authored
Jul 18, 2025
by
hezhe
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
''
parent
bab731a0
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
85 additions
and
133 deletions
+85
-133
gen_code.py
amazon_spider/other_job/spider/gen_code.py
+85
-133
No files found.
amazon_spider/other_job/spider/gen_code.py
View file @
8dd9963f
...
@@ -6,13 +6,9 @@ from playwright.sync_api import Browser, BrowserContext, Page, sync_playwright,
...
@@ -6,13 +6,9 @@ from playwright.sync_api import Browser, BrowserContext, Page, sync_playwright,
import
redis
as
rd
import
redis
as
rd
import
ddddocr
import
ddddocr
import
re
import
re
import
cv2
import
base64
import
base64
from
io
import
BytesIO
from
io
import
BytesIO
from
PIL
import
Image
from
PIL
import
Image
import
numpy
as
np
from
func_timeout
import
func_set_timeout
from
func_timeout.exceptions
import
FunctionTimedOut
# REDIS = {
# REDIS = {
...
@@ -30,7 +26,7 @@ REDIS = {
...
@@ -30,7 +26,7 @@ REDIS = {
'password'
:
'yswg@2019'
,
'password'
:
'yswg@2019'
,
'db'
:
1
'db'
:
1
}
}
#
def
singleton
(
cls
,
*
args
,
**
kw
):
def
singleton
(
cls
,
*
args
,
**
kw
):
"""singleton mode.
"""singleton mode.
...
@@ -67,6 +63,7 @@ def md5(src: str, algorithm: str = "md5", digits: int = 32) -> str:
...
@@ -67,6 +63,7 @@ def md5(src: str, algorithm: str = "md5", digits: int = 32) -> str:
else
:
else
:
return
algorithm
.
hexdigest
()
return
algorithm
.
hexdigest
()
@singleton
@singleton
class
Redis
(
object
):
class
Redis
(
object
):
def
__init__
(
self
):
def
__init__
(
self
):
...
@@ -74,18 +71,21 @@ class Redis(object):
...
@@ -74,18 +71,21 @@ class Redis(object):
self
.
port
=
REDIS
[
'port'
]
self
.
port
=
REDIS
[
'port'
]
self
.
db
=
REDIS
[
'db'
]
self
.
db
=
REDIS
[
'db'
]
self
.
password
=
REDIS
[
'password'
]
self
.
password
=
REDIS
[
'password'
]
def
get_instance
(
self
):
self
.
pool
=
rd
.
ConnectionPool
(
self
.
pool
=
rd
.
ConnectionPool
(
host
=
self
.
host
,
host
=
self
.
host
,
port
=
self
.
port
,
port
=
self
.
port
,
db
=
self
.
db
,
db
=
self
.
db
,
password
=
self
.
password
,
password
=
self
.
password
,
max_connections
=
3
,
socket_timeout
=
5
,
socket_connect_timeout
=
5
,
retry_on_timeout
=
True
,
)
)
def
get_instance
(
self
):
return
rd
.
Redis
(
connection_pool
=
self
.
pool
)
return
rd
.
Redis
(
connection_pool
=
self
.
pool
)
@func_set_timeout
(
10
)
def
sadd
(
key
,
value
,
use_md5
=
True
):
def
sadd
(
key
,
value
,
use_md5
=
True
):
"""add key-value to the sorted set.
"""add key-value to the sorted set.
...
@@ -132,7 +132,6 @@ def ladd(key, value, use_md5=True):
...
@@ -132,7 +132,6 @@ def ladd(key, value, use_md5=True):
return
added
==
1
return
added
==
1
@func_set_timeout
(
30
)
def
listpop
(
key
)
->
list
:
def
listpop
(
key
)
->
list
:
"""lpop
"""lpop
...
@@ -161,7 +160,16 @@ class ChinataxSpider(object):
...
@@ -161,7 +160,16 @@ class ChinataxSpider(object):
self
.
context
:
BrowserContext
=
None
self
.
context
:
BrowserContext
=
None
self
.
page
:
Page
=
None
self
.
page
:
Page
=
None
self
.
padding_error
=
1
self
.
padding_error
=
1
self
.
seeds
=
[{
'@class'
:
'com.alibaba.fastjson.JSONObject'
,
'fpdm'
:
None
,
'kprq'
:
'20231229'
,
'u_key'
:
'1742416930985218050'
,
'kjje'
:
'3861.62'
,
'fphm'
:
'23952000000063440070'
,
'jym'
:
None
}]
self
.
seeds
=
[
{
"u_key"
:
"1934929493120057346"
,
"fpdm"
:
""
,
"fphm"
:
"25429165833000096487"
,
"kprq"
:
"20250519"
,
"kjje"
:
"264.50"
,
"jym"
:
""
}
]
def
base64_to_image
(
self
,
base64_str
):
def
base64_to_image
(
self
,
base64_str
):
"""
"""
...
@@ -178,45 +186,36 @@ class ChinataxSpider(object):
...
@@ -178,45 +186,36 @@ class ChinataxSpider(object):
return
img
return
img
def
get_img_base64
(
self
):
def
get_img_base64
(
self
):
#
yzminfo = ""
yzminfo
=
""
color
=
"black"
color
=
"black"
# img_base64 = ""
img_base64
=
""
# count = 1
count
=
1
while
len
(
img_base64
)
<=
len
(
"images/code.png"
)
and
count
<=
5
:
yzminfo
=
self
.
page
.
query_selector
(
"#yzminfo"
)
.
as_element
()
.
text_content
()
yzminfo
=
self
.
page
.
query_selector
(
"#yzminfo"
)
.
as_element
()
.
text_content
()
self
.
page
.
wait_for_timeout
(
1000
)
img_base64
=
self
.
page
.
query_selector
(
"#yzm_img"
)
.
get_attribute
(
"src"
)
img_base64
=
self
.
page
.
query_selector
(
"#yzm_img"
)
.
get_attribute
(
"src"
)
self
.
page
.
wait_for_timeout
(
1000
)
self
.
page
.
wait_for_timeout
(
1000
)
count
=
count
+
1
if
"蓝色"
in
yzminfo
:
if
"蓝色"
in
yzminfo
:
color
=
'blue'
color
=
'blue'
if
"红色"
in
yzminfo
:
if
"红色"
in
yzminfo
:
color
=
'red'
color
=
'red'
if
"黄色"
in
yzminfo
:
color
=
'yellow'
return
color
,
img_base64
return
color
,
img_base64
def
get_img
(
self
,
count
):
def
get_img
(
self
):
# count = 1
count
=
1
if
self
.
page
.
query_selector
(
"#yzm_img"
)
.
get_attribute
(
"src"
)
==
"images/code.png"
:
color
,
img_base64
=
self
.
get_img_base64
()
self
.
page
.
query_selector
(
"#yzm_img"
)
.
click
()
print
(
f
"第{count}次是{color}色"
)
self
.
page
.
wait_for_timeout
(
3000
)
while
color
!=
'black'
:
self
.
page
.
wait_for_load_state
()
elif
count
!=
0
:
self
.
page
.
query_selector
(
"#yzm_img"
)
.
click
()
self
.
page
.
query_selector
(
"#yzm_img"
)
.
click
()
self
.
page
.
wait_for_timeout
(
3000
)
self
.
page
.
wait_for_timeout
(
3000
)
self
.
page
.
wait_for_load_state
()
self
.
page
.
wait_for_load_state
()
count
=
count
+
1
color
,
img_base64
=
self
.
get_img_base64
()
color
,
img_base64
=
self
.
get_img_base64
()
# print(f"第{count}次是{color}色")
color
=
color
# while color != 'black':
img_base64
=
img_base64
# self.page.query_selector("#yzm_img").click()
print
(
f
"第{count}次是{color}色"
)
# self.page.wait_for_timeout(3000)
# self.page.wait_for_load_state()
# count = count + 1
# color, img_base64 = self.get_img_base64()
# color = color
# img_base64 = img_base64
# print(f"第{count}次是{color}色")
# self.base64_to_image(img_base64).save(f"test第{count}次.png")
# self.base64_to_image(img_base64).save(f"test第{count}次.png")
# image = self.base64_to_image(img_base64)
# image = self.base64_to_image(img_base64)
# image.save("test.png")
# image.save("test.png")
...
@@ -227,59 +226,20 @@ class ChinataxSpider(object):
...
@@ -227,59 +226,20 @@ class ChinataxSpider(object):
img
=
self
.
base64_to_image
(
img_base64
)
img
=
self
.
base64_to_image
(
img_base64
)
if
color
==
'black'
:
if
color
==
'black'
:
return
ocr
.
classification
(
img
)
return
ocr
.
classification
(
img
)
# 转换为cv2可以使用的对象
image
=
np
.
asarray
(
img
)
one_color_img
=
self
.
save_appointed_color
(
image
,
color
)
img
=
Image
.
fromarray
(
cv2
.
cvtColor
(
one_color_img
,
cv2
.
COLOR_BGR2RGB
))
# todo 预处理
# todo 预处理
return
ocr
.
classification
(
img
)
return
ocr
.
classification
(
img
)
def
save_appointed_color
(
self
,
image
,
color
):
hsv_image
=
cv2
.
cvtColor
(
image
,
cv2
.
COLOR_BGR2HSV
)
if
color
==
"red"
:
lower_red
=
np
.
array
([
0
,
100
,
100
])
upper_red
=
np
.
array
([
10
,
255
,
255
])
mask
=
cv2
.
inRange
(
hsv_image
,
lower_red
,
upper_red
)
print
(
"保留 red"
)
elif
color
==
"blue"
:
# 定义要保留的颜色的范围
lower_blue
=
np
.
array
([
100
,
50
,
50
])
upper_blue
=
np
.
array
([
140
,
255
,
255
])
mask
=
cv2
.
inRange
(
hsv_image
,
lower_blue
,
upper_blue
)
print
(
"保留 blue"
)
elif
color
==
"yellow"
:
# 定义要保留的颜色的范围
lower_yellow
=
np
.
array
([
26
,
43
,
46
])
upper_yellow
=
np
.
array
([
34
,
255
,
255
])
mask
=
cv2
.
inRange
(
hsv_image
,
lower_yellow
,
upper_yellow
)
print
(
"保留 yellow"
)
else
:
# 定义要保留的颜色的范围(这里以红色为例)
lower_red
=
np
.
array
([
0
,
50
,
50
])
upper_red
=
np
.
array
([
10
,
255
,
255
])
# HSV颜色空间中红色的另一个范围
lower_red2
=
np
.
array
([
170
,
50
,
50
])
upper_red2
=
np
.
array
([
180
,
255
,
255
])
mask1
=
cv2
.
inRange
(
hsv_image
,
lower_red
,
upper_red
)
mask2
=
cv2
.
inRange
(
hsv_image
,
lower_red2
,
upper_red2
)
mask
=
mask1
+
mask2
white
=
np
.
full_like
(
image
,
(
255
,
255
,
255
),
dtype
=
np
.
uint8
)
result
=
cv2
.
bitwise_and
(
white
,
white
,
mask
=
mask
)
return
result
def
get_seed
(
self
):
def
get_seed
(
self
):
while
True
:
seeds
=
spop
(
self
.
seed_key
,
1
)
try
:
# if self.seeds:
seeds
=
listpop
(
self
.
seed_key
)
break
except
FunctionTimedOut
as
e
:
logging
.
info
(
f
"get seed timeout {e}"
)
continue
# seeds = self.seeds.pop()
# seeds = self.seeds.pop()
# else:
# return None
if
seeds
:
if
seeds
:
seed
=
json
.
loads
(
seeds
)
seed
=
json
.
loads
(
seeds
[
0
]
)
# seed = seeds
# seed = seeds
if
seed
.
get
(
"fphm"
)
and
seed
.
get
(
"kprq"
)
and
(
seed
.
get
(
"kjje"
)
or
seed
.
get
(
"jym"
)):
seed
=
{
k
:
v
.
strip
()
if
isinstance
(
v
,
str
)
else
""
for
k
,
v
in
seed
.
items
()}
if
seed
.
get
(
"fphm"
)
and
seed
.
get
(
"kprq"
):
return
seed
return
seed
else
:
else
:
error_msg
=
"seed error"
error_msg
=
"seed error"
...
@@ -293,46 +253,46 @@ class ChinataxSpider(object):
...
@@ -293,46 +253,46 @@ class ChinataxSpider(object):
else
:
else
:
return
None
return
None
def
recaptcha
(
self
,
count
):
def
recaptcha
(
self
):
color
,
img_base64
=
self
.
get_img
(
count
)
color
,
img_base64
=
self
.
get_img
()
code
=
self
.
ddddocr_imge_get_code
(
color
,
img_base64
)
code
=
self
.
ddddocr_imge_get_code
(
color
,
img_base64
)
if
not
code
:
logging
.
info
(
"验证码识别失败"
)
if
color
==
'black'
:
return
False
logging
.
info
(
f
"验证码识别 {code}"
)
self
.
page
.
locator
(
"#yzm"
)
.
fill
(
code
)
self
.
page
.
locator
(
"#yzm"
)
.
fill
(
code
)
self
.
page
.
wait_for_timeout
(
1000
)
# 点击空白页失去焦点
# 点击空白页失去焦点
self
.
page
.
locator
(
"#pageshow"
)
.
click
()
self
.
page
.
locator
(
"#pageshow"
)
.
click
()
self
.
page
.
wait_for_timeout
(
1000
)
if
"display: none"
not
in
self
.
page
.
query_selector
(
"#checkfp"
)
.
get_attribute
(
"style"
):
if
"display: none"
not
in
self
.
page
.
query_selector
(
"#checkfp"
)
.
get_attribute
(
"style"
):
self
.
page
.
locator
(
"#checkfp"
)
.
click
()
self
.
page
.
locator
(
"#checkfp"
)
.
click
()
self
.
page
.
wait_for_timeout
(
1
000
)
self
.
page
.
wait_for_timeout
(
2
000
)
# 判断验证码是否通过
# 判断验证码是否通过
if
not
self
.
page
.
query_selector
(
"#dialog-body"
):
if
not
self
.
page
.
query_selector
(
"#dialog-body"
):
if
error_msg
:
=
self
.
page
.
query_selector
(
"#popup_message"
):
if
error_msg
:
=
self
.
page
.
query_selector
(
"#popup_message"
):
if
"超过该张发票当日查验次数"
in
error_msg
.
text_content
():
if
"超过该张发票当日查验次数"
in
error_msg
.
text_content
():
self
.
page
.
locator
(
"#popup_ok"
)
.
click
()
self
.
page
.
locator
(
"#popup_ok"
)
.
click
()
self
.
page
.
wait_for_timeout
(
1
000
)
self
.
page
.
wait_for_timeout
(
2
000
)
return
"count error"
return
"count error"
if
"验证码请求次数过于频繁"
in
error_msg
.
text_content
():
if
"验证码请求次数过于频繁"
in
error_msg
.
text_content
():
self
.
page
.
locator
(
"#popup_ok"
)
.
click
()
self
.
page
.
locator
(
"#popup_ok"
)
.
click
()
self
.
page
.
wait_for_timeout
(
1
000
)
self
.
page
.
wait_for_timeout
(
2
000
)
return
"recaptcha count error"
return
"recaptcha count error"
self
.
page
.
locator
(
"#popup_ok"
)
.
click
()
self
.
page
.
locator
(
"#popup_ok"
)
.
click
()
self
.
page
.
wait_for_timeout
(
1000
)
# 刷新验证码
# 刷新验证码
#
self.page.locator("#yzm_img").click()
self
.
page
.
locator
(
"#yzm_img"
)
.
click
()
# self.page.wait_for_timeout(1
000)
self
.
page
.
wait_for_timeout
(
2
000
)
logging
.
info
(
"验证码处理错误"
)
logging
.
info
(
"验证码处理错误"
)
return
False
return
False
else
:
else
:
return
True
return
True
else
:
else
:
return
"seed error"
return
"seed error"
else
:
return
False
def
get_item
(
self
,
seed
):
def
crawl
(
self
,
url
,
seed
):
# 需要打开的网站
self
.
page
.
goto
(
url
)
self
.
page
.
wait_for_timeout
(
1000
)
logging
.
info
(
f
"fpdm --> {seed.get('fpdm', '')}"
)
logging
.
info
(
f
"fpdm --> {seed.get('fpdm', '')}"
)
logging
.
info
(
f
"seed --> {seed}"
)
logging
.
info
(
f
"seed --> {seed}"
)
# fpdm --> None
# fpdm --> None
...
@@ -345,10 +305,10 @@ class ChinataxSpider(object):
...
@@ -345,10 +305,10 @@ class ChinataxSpider(object):
self
.
page
.
wait_for_timeout
(
3000
)
self
.
page
.
wait_for_timeout
(
3000
)
if
"开具金额"
in
self
.
page
.
query_selector
(
if
"开具金额"
in
self
.
page
.
query_selector
(
"span[id='context']"
)
.
text_content
()
or
"价税合计"
in
self
.
page
.
query_selector
(
"span[id='context']"
)
.
text_content
()
or
"价税合计"
in
self
.
page
.
query_selector
(
"span[id='context']"
)
.
text_content
()
or
'票价'
in
self
.
page
.
query_selector
(
"span[id='context']"
)
.
text_content
():
"span[id='context']"
)
.
text_content
():
kjje
=
seed
.
get
(
"kjje"
)
kjje
=
seed
.
get
(
"kjje"
)
else
:
else
:
kjje
=
seed
.
get
(
"jym"
,
""
)[
-
6
::]
kjje
=
seed
.
get
(
"jym"
,
""
)[
-
6
::]
self
.
page
.
locator
(
"#kjje"
)
.
fill
(
kjje
)
self
.
page
.
locator
(
"#kjje"
)
.
fill
(
kjje
)
...
@@ -360,7 +320,7 @@ class ChinataxSpider(object):
...
@@ -360,7 +320,7 @@ class ChinataxSpider(object):
"dom"
:
error_msg
,
"dom"
:
error_msg
,
}
}
xadd
(
self
.
save_key
,
data
)
xadd
(
self
.
save_key
,
data
)
logging
.
info
(
f
"{error_msg}"
)
logging
.
info
(
f
"
fpdmjy - >>
{error_msg}"
)
self
.
page
.
close
()
self
.
page
.
close
()
self
.
context
.
close
()
self
.
context
.
close
()
elif
error_msg
:
=
self
.
page
.
query_selector
(
"#fphmjy"
)
.
text_content
()
.
strip
():
elif
error_msg
:
=
self
.
page
.
query_selector
(
"#fphmjy"
)
.
text_content
()
.
strip
():
...
@@ -369,6 +329,15 @@ class ChinataxSpider(object):
...
@@ -369,6 +329,15 @@ class ChinataxSpider(object):
"dom"
:
error_msg
,
"dom"
:
error_msg
,
}
}
xadd
(
self
.
save_key
,
data
)
xadd
(
self
.
save_key
,
data
)
logging
.
info
(
f
"fphmjy - >>{error_msg}"
)
self
.
page
.
close
()
self
.
context
.
close
()
elif
self
.
page
.
query_selector
(
"xpath=.//div[@class='tip_common_wrong font_red tip_common_right']"
):
data
=
{
"u_key"
:
seed
.
get
(
"u_key"
),
"dom"
:
"发票号码有误!"
,
}
xadd
(
self
.
save_key
,
data
)
logging
.
info
(
f
"{error_msg}"
)
logging
.
info
(
f
"{error_msg}"
)
self
.
page
.
close
()
self
.
page
.
close
()
self
.
context
.
close
()
self
.
context
.
close
()
...
@@ -378,7 +347,7 @@ class ChinataxSpider(object):
...
@@ -378,7 +347,7 @@ class ChinataxSpider(object):
"dom"
:
error_msg
,
"dom"
:
error_msg
,
}
}
xadd
(
self
.
save_key
,
data
)
xadd
(
self
.
save_key
,
data
)
logging
.
info
(
f
"{error_msg}"
)
logging
.
info
(
f
"
kprqjy - >>
{error_msg}"
)
self
.
page
.
close
()
self
.
page
.
close
()
self
.
context
.
close
()
self
.
context
.
close
()
elif
error_msg
:
=
self
.
page
.
query_selector
(
"#kjjejy"
)
.
text_content
()
.
strip
():
elif
error_msg
:
=
self
.
page
.
query_selector
(
"#kjjejy"
)
.
text_content
()
.
strip
():
...
@@ -387,14 +356,14 @@ class ChinataxSpider(object):
...
@@ -387,14 +356,14 @@ class ChinataxSpider(object):
"dom"
:
error_msg
,
"dom"
:
error_msg
,
}
}
xadd
(
self
.
save_key
,
data
)
xadd
(
self
.
save_key
,
data
)
logging
.
info
(
f
"{error_msg}"
)
logging
.
info
(
f
"
kjjejy - >>
{error_msg}"
)
self
.
page
.
close
()
self
.
page
.
close
()
self
.
context
.
close
()
self
.
context
.
close
()
else
:
else
:
error
=
0
error
=
0
for
i
in
range
(
8
):
for
i
in
range
(
8
):
# 判断验证码是否通过
# 判断验证码是否通过
if
error_msg
:
=
self
.
recaptcha
(
i
):
if
error_msg
:
=
self
.
recaptcha
():
if
error_msg
in
[
"count error"
,
"seed error"
]:
if
error_msg
in
[
"count error"
,
"seed error"
]:
data
=
{
data
=
{
"u_key"
:
seed
.
get
(
"u_key"
),
"u_key"
:
seed
.
get
(
"u_key"
),
...
@@ -427,16 +396,10 @@ class ChinataxSpider(object):
...
@@ -427,16 +396,10 @@ class ChinataxSpider(object):
if
error
==
0
:
if
error
==
0
:
logging
.
info
(
"失败8次处理,将任务重新推送到redis"
)
logging
.
info
(
"失败8次处理,将任务重新推送到redis"
)
l
add
(
self
.
seed_key
,
json
.
dumps
(
seed
),
use_md5
=
False
)
s
add
(
self
.
seed_key
,
json
.
dumps
(
seed
),
use_md5
=
False
)
self
.
page
.
close
()
self
.
page
.
close
()
self
.
context
.
close
()
self
.
context
.
close
()
def
crawl
(
self
,
url
,
seed
):
# 需要打开的网站
self
.
page
.
goto
(
url
)
self
.
page
.
wait_for_timeout
(
1000
)
self
.
get_item
(
seed
)
def
change_user
(
self
):
def
change_user
(
self
):
user_agent
=
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
user_agent
=
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
self
.
context
=
self
.
browser
.
new_context
(
self
.
context
=
self
.
browser
.
new_context
(
...
@@ -455,15 +418,20 @@ class ChinataxSpider(object):
...
@@ -455,15 +418,20 @@ class ChinataxSpider(object):
Object.defineProperties(navigator, {webdriver:{get:()=>undefined}});
Object.defineProperties(navigator, {webdriver:{get:()=>undefined}});
"""
"""
self
.
page
.
add_init_script
(
js
)
self
.
page
.
add_init_script
(
js
)
self
.
page
.
set_default_timeout
(
300000
)
# self.page.add_init_script(
# self.page.add_init_script(
# "const newProto = navigator.__proto__; delete newProto.webdriver; navigator.__proto__ = newProto;")
# "const newProto = navigator.__proto__; delete newProto.webdriver; navigator.__proto__ = newProto;")
def
run
(
self
):
def
run
(
self
):
while
True
:
while
True
:
try
:
seed
=
self
.
get_seed
()
seed
=
self
.
get_seed
()
except
rd
.
exceptions
.
ConnectionError
as
e
:
logging
.
info
(
f
"ConnectionError error {e}"
)
continue
try
:
try
:
if
seed
:
if
seed
:
logging
.
info
(
"获取任务成功"
)
self
.
change_user
()
self
.
change_user
()
url
=
"https://inv-veri.chinatax.gov.cn/index.html"
url
=
"https://inv-veri.chinatax.gov.cn/index.html"
self
.
crawl
(
url
,
seed
)
self
.
crawl
(
url
,
seed
)
...
@@ -471,23 +439,23 @@ class ChinataxSpider(object):
...
@@ -471,23 +439,23 @@ class ChinataxSpider(object):
time
.
sleep
(
30
)
time
.
sleep
(
30
)
logging
.
info
(
'no task sleep 30s'
)
logging
.
info
(
'no task sleep 30s'
)
except
Error
as
e
:
except
Error
as
e
:
logging
.
info
(
f
"
--> playwright error
"
)
logging
.
info
(
f
"
playwright error {e}
"
)
self
.
page
.
close
()
self
.
page
.
close
()
self
.
context
.
close
()
self
.
context
.
close
()
l
add
(
self
.
seed_key
,
json
.
dumps
(
seed
),
use_md5
=
False
)
s
add
(
self
.
seed_key
,
json
.
dumps
(
seed
),
use_md5
=
False
)
continue
continue
except
FunctionTimedOut
as
e
:
except
rd
.
exceptions
.
ConnectionError
as
e
:
logging
.
info
(
f
"
--> FunctionTimedOut
"
)
logging
.
info
(
f
"
ConnectionError error {e}
"
)
self
.
page
.
close
()
self
.
page
.
close
()
self
.
context
.
close
()
self
.
context
.
close
()
l
add
(
self
.
seed_key
,
json
.
dumps
(
seed
),
use_md5
=
False
)
s
add
(
self
.
seed_key
,
json
.
dumps
(
seed
),
use_md5
=
False
)
continue
continue
except
Exception
as
e
:
except
Exception
as
e
:
self
.
page
.
close
()
self
.
page
.
close
()
self
.
context
.
close
()
self
.
context
.
close
()
if
f
"{e}"
==
"Incorrect padding"
:
if
f
"{e}"
==
"Incorrect padding"
:
l
add
(
self
.
seed_key
,
json
.
dumps
(
seed
),
use_md5
=
False
)
s
add
(
self
.
seed_key
,
json
.
dumps
(
seed
),
use_md5
=
False
)
logging
.
info
(
f
"
-->
Incorrect padding error {e}"
)
logging
.
info
(
f
"Incorrect padding error {e}"
)
self
.
padding_error
+=
1
self
.
padding_error
+=
1
if
self
.
padding_error
>=
5
:
if
self
.
padding_error
>=
5
:
time
.
sleep
(
1200
)
time
.
sleep
(
1200
)
...
@@ -503,17 +471,6 @@ class ChinataxSpider(object):
...
@@ -503,17 +471,6 @@ class ChinataxSpider(object):
xadd
(
self
.
save_key
,
data
)
xadd
(
self
.
save_key
,
data
)
continue
continue
def
send_msg
(
self
,
account
,
title
,
content
):
import
requests
url
=
'http://47.112.96.71:8082/selection/sendMessage'
data
=
{
'account'
:
account
,
'title'
:
title
,
'content'
:
content
}
print
(
data
)
requests
.
post
(
url
=
url
,
data
=
data
,
timeout
=
15
)
def
main
(
self
):
def
main
(
self
):
headless
=
False
headless
=
False
# headless = True
# headless = True
...
@@ -529,16 +486,10 @@ class ChinataxSpider(object):
...
@@ -529,16 +486,10 @@ class ChinataxSpider(object):
# )
# )
self
.
browser
=
_playwright
.
chromium
.
launch
(
self
.
browser
=
_playwright
.
chromium
.
launch
(
headless
=
False
,
headless
=
False
,
executable_path
=
"C:
\
Program Files
\
Google
\
Chrome
\
Application
\
chrome.exe"
,
# executable_path="C:\Program Files\Google\Chrome\Application\chrome.exe",
executable_path
=
r"C:\Program Files (x86)\ChatAI Chrome\ChatAI_Chrome.exe"
,
)
)
try
:
self
.
run
()
self
.
run
()
except
Exception
as
e
:
print
(
e
)
account
=
'hezhe'
title
=
f
'税务爬取脚本 error'
content
=
f
"税务爬取脚本 error 时间:{time.strftime('
%
Y-
%
m-
%
d
%
H:
%
M:
%
S', time.localtime())}"
self
.
send_msg
(
account
,
title
,
content
)
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
...
@@ -553,3 +504,4 @@ if __name__ == '__main__':
...
@@ -553,3 +504,4 @@ if __name__ == '__main__':
# }
# }
# ladd('finance:sp_invoice_queue', json.dumps(seed), use_md5=False)
# ladd('finance:sp_invoice_queue', json.dumps(seed), use_md5=False)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment