注意:后端服务器监听 80 端口(HTTP),而前端使用 HTTPS;HAProxy 负责终止 SSL,并以 HTTP 协议与后端通信。
新建爬虫黑名单文件
nano /etc/haproxy/blacklist-agent.txt
文件内容(每行一个 User-Agent 关键字,按子串、不区分大小写匹配):
YandexBot
DotBot
SemrushBot
AhrefsBot
BLEXBot
YaK
MJ12bot
MauiBot
MegaIndex.ru
GPTBot
meta-externalagent/1.1
Qwantbot
ClaudeBot
Amazonbot
SearchBot
DataForSeoBot
Barkrowler
GoogleOther
Googlebot
更多爬虫可按相同格式逐行追加。注意:列表中的 Googlebot 会拦截 Google 搜索的收录爬虫,如需被 Google 收录请删除该行。
配置文件
配置参考: https://github.com/woniu336/open_shell/blob/main/u-haproxy.cfg
global
    # Process identity and isolation: run as a daemon under the haproxy
    # user/group, chrooted into /var/lib/haproxy.
    daemon
    user haproxy
    group haproxy
    chroot /var/lib/haproxy
    # Process-wide cap on concurrent connections.
    maxconn 10000
    # Two syslog targets on the local socket: facility local0 limited to
    # "warning" and above, facility local1 limited to "notice" and above.
    log /dev/log local0 warning
    log /dev/log local1 notice
    # Admin-level runtime API socket; expose-fd listeners lets a new process
    # take over the listening sockets on reload.
    stats socket /run/haproxy/admin.sock mode 660 level admin expose-fd listeners
    stats timeout 30s
    # SSL session cache tuning
    tune.ssl.cachesize 50000
    tune.ssl.lifetime 300
    tune.ssl.ssl-ctx-cache-size 1000
    # Global TLS security defaults applied to every "bind ... ssl" line:
    # - ciphers      -> cipher list for TLS 1.2 and below
    # - ciphersuites -> cipher suites for TLS 1.3
    # - options      -> SSLv3 / TLS 1.0 / TLS 1.1 disabled, client cipher order preferred
    ssl-default-bind-ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256
    ssl-default-bind-ciphersuites TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256
    ssl-default-bind-options no-sslv3 no-tlsv10 no-tlsv11 prefer-client-ciphers
defaults
    # Every proxy below works at the HTTP layer and balances round-robin
    # unless it overrides these settings.
    mode http
    balance roundrobin
    # Logging: reuse the log targets from the global section, use the HTTP
    # log format, and do not log connections that carried no data.
    log global
    option httplog
    option dontlognull
    # Keep client connections alive between requests and add the client
    # address to the X-Forwarded-For request header sent to servers.
    option http-keep-alive
    option forwardfor
    # Timeouts
    timeout connect 5s
    timeout client 30s
    timeout server 30s
    timeout http-keep-alive 15s
    # Static response files returned for HAProxy-generated errors
    errorfile 400 /etc/haproxy/errors/400.http
    errorfile 403 /etc/haproxy/errors/403.http
    errorfile 408 /etc/haproxy/errors/408.http
    errorfile 500 /etc/haproxy/errors/500.http
    errorfile 502 /etc/haproxy/errors/502.http
    errorfile 503 /etc/haproxy/errors/503.http
    errorfile 504 /etc/haproxy/errors/504.http
frontend http-in
    # Plain-HTTP entry point: rejects unwanted crawlers, then sends everyone
    # else to HTTPS with a permanent redirect.
    bind *:80
    # ============== Crawler blocking ==============
    # empty_ua      : request carries no User-Agent header at all
    # suspicious_ua : User-Agent contains a common scripting-client keyword
    # bad_bots      : User-Agent contains any keyword listed in the blacklist file
    # All matches are case-insensitive substring matches.
    acl empty_ua req.hdr_cnt(user-agent) eq 0
    acl suspicious_ua req.hdr(user-agent) -m sub -i "python" "curl" "wget"
    acl bad_bots req.hdr(user-agent) -m sub -i -f /etc/haproxy/blacklist-agent.txt
    # A single rule answers 403 when any of the three ACLs matches.
    http-request deny if empty_ua || suspicious_ua || bad_bots
    # ============== End of crawler blocking ==============
    # Evaluated after the deny rule above, so blocked clients get 403 rather
    # than a redirect.
    http-request redirect scheme https code 301 unless { ssl_fc }
frontend https-in
    # HTTPS entry point: TLS is terminated here using the certificates found
    # in /etc/haproxy/certs/; HTTP/2 and HTTP/1.1 are offered via ALPN and
    # TLS 1.3 early data (0-RTT) is accepted.
    bind *:443 ssl crt /etc/haproxy/certs/ alpn h2,http/1.1 allow-0rtt
    # 0-RTT early data can be replayed by an attacker. Only GET/HEAD requests
    # may be processed before the TLS handshake completes; every other method
    # waits for the full handshake.
    http-request wait-for-handshake unless METH_GET
    # ============== HTTPS crawler blocking ==============
    # empty_ua      : request carries no User-Agent header at all
    # suspicious_ua : User-Agent contains a common scripting-client keyword
    # bad_bots      : User-Agent contains any keyword listed in the blacklist file
    # All matches are case-insensitive substring matches.
    acl empty_ua req.hdr_cnt(user-agent) eq 0
    acl suspicious_ua req.hdr(user-agent) -m sub -i "python" "curl" "wget"
    acl bad_bots req.hdr(user-agent) -m sub -i -f /etc/haproxy/blacklist-agent.txt
    # A single rule answers 403 when any of the three ACLs matches.
    http-request deny if empty_ua || suspicious_ua || bad_bots
    # ============== End of crawler blocking ==============
    # The backend only speaks HTTP, so tell it the client connection was
    # HTTPS. set-header also overwrites any value supplied by the client.
    http-request set-header X-Forwarded-Proto https
    # Security response headers
    http-response set-header Strict-Transport-Security "max-age=31536000; includeSubDomains; preload"
    # http-response set-header X-Frame-Options "SAMEORIGIN"
    # http-response set-header Content-Security-Policy "frame-ancestors 'self'"
    http-response set-header X-Content-Type-Options "nosniff"
    http-response set-header X-XSS-Protection "1; mode=block"
    http-response set-header Referrer-Policy "strict-origin-when-cross-origin"
    # Routing rules: pick a backend by Host header (case-insensitive)
    acl domain1_https hdr(host) -i 123.com
    use_backend backend1 if domain1_https
    # acl domain2_https hdr(host) -i 789.com
    # use_backend backend2 if domain2_https
    # Fallback backend for any Host that matched no rule above
    default_backend backend1
backend backend1
    # Origin server reached over plain HTTP on port 80. Health check runs
    # every 10s; 3 consecutive failures mark the server down and 3
    # consecutive successes bring it back up.
    # NOTE(review): 8.8.8.8 looks like a placeholder address - replace it
    # with the real origin IP.
    server server1 8.8.8.8:80 check inter 10s fall 3 rise 3
# Template for a second origin; uncomment together with the matching
# routing rules in the https-in frontend.
# backend backend2
#     server server2 3.3.3.3:80 check inter 10s rise 3 fall 3
验证
haproxy -c -f /etc/haproxy/haproxy.cfg
重启服务(使新配置生效;如需平滑重载且不中断现有连接,可改用 systemctl reload haproxy)
systemctl restart haproxy
检查状态
systemctl status haproxy
测试:以下请求应返回 403 Forbidden
curl -A "GPTBot" http://your-server/
获取真实客户端 IP:在源站(Nginx)的配置文件中添加如下内容
set_real_ip_from 你的反代ip;
real_ip_header X-Forwarded-For;
real_ip_recursive on;