gfwlist2acl.py
#!/usr/bin/env python3
"""Convert gfwlist format to ssr compatible acl file"""
import fileinput
import re
from datetime import datetime, timedelta, tzinfo
from itertools import chain
# Skeleton of the generated acl file. `main` renders it with `str.format`,
# supplying the `date`, `filename`, `default_action`, `blacklist` and
# `whitelist` placeholders; the two lists are newline-joined rules.
ACL_TEMPLATE = """\
#
# Home: https://github.com/NateScarlet/gfwlist.acl
# Date: {date}
# URL: https://raw.githubusercontent.com/NateScarlet/gfwlist.acl/master/{filename}
#
[{default_action}]
[proxy_list]
{blacklist}
[bypass_list]
{whitelist}
"""
class ChinaTimezone(tzinfo):
    """Fixed UTC+8 timezone without daylight saving time."""

    # Constant offsets shared by every instance; timedelta is immutable.
    _UTC_OFFSET = timedelta(hours=8)
    _DST_OFFSET = timedelta(0)

    def utcoffset(self, dt):
        return self._UTC_OFFSET

    def dst(self, dt):
        return self._DST_OFFSET

    def tzname(self, dt):
        return "UTC+8"
def get_regexp(line):
    """Get regular expression from a line.

    The line is first escaped so that it matches literally, then the
    Adblock Plus wildcard (``*``), separator (``^``) and anchor
    (``|`` / ``||``) markers are translated to regular expression syntax.

    Args:
        line (str): filter line in Adblock Plus basic syntax.

    Returns:
        str
    """
    # Escape by hand instead of using `re.escape`, since its behavior
    # changes between Python versions.
    ret = re.sub(r"[.*+?^${}()|[\]\\]", r"\\\g<0>", line)

    # https://adblockplus.org/filters#basic
    ret = ret.replace(r"\*", ".+")
    # https://adblockplus.org/filters#separators
    # The hyphen is placed last in the class so it is a literal `-`;
    # written as `_-.` it would form an invalid (reversed) character range.
    ret = ret.replace(r"\^", r"([^a-zA-Z0-9_.%-]|$)")
    # https://adblockplus.org/filters#anchors
    # `||` anchors at a domain boundary, a single leading `|` at the start
    # and a trailing `|` at the end; a scheme right after the anchor is dropped.
    ret = re.sub(r"^\\\|\\\|(https?\??://)?", r"(^|\.)", ret)
    ret = re.sub(r"^\\\|(https?\??://)?", "^", ret)
    ret = re.sub(r"\\\|$", "$", ret)

    return ret
def _split_long_regexp(regexp):
match = len(regexp) > 80 and re.match(r"(.*)\((.*)\)(.*)", regexp)
if not match:
return [regexp]
ret = []
prefix = match.group(1)
items = match.group(2).split("|")
suffix = match.group(3)
size = 10
for i in range(0, len(items), size):
chunk = items[i : i + size]
ret.append("{}({}){}".format(prefix, "|".join(chunk), suffix))
return ret
def get_rules(regexp):
    """Get acl rules from regular expression.

    The expression is normalized (scheme replaced by a start anchor,
    trailing ``.*`` removed, trailing slash turned into an end anchor),
    dropped entirely when it still matches on a pathname, and otherwise
    split into rules short enough for SSR.

    Returns:
        List[str]
    """
    normalized = regexp
    for pattern, replacement in (
        (r"\^?https?\??://", "^"),
        (r"(\.\*)+$", ""),
        (r"/$", "$"),
    ):
        normalized = re.sub(pattern, replacement, normalized)

    # Exclude pathname rule, since ssr only accept domain match.
    # A slash inside a negated character class does not count as a pathname.
    outside_class = re.sub(r"(\[\^.*)/(.*\])", r"\g<1>\g<2>", normalized)
    if "/" in outside_class:
        return []

    # SSR can not deal with too long rule in one line
    return [rule for rule in _split_long_regexp(normalized) if len(rule) < 500]
def convert_line(line):
    """Convert a input line to acl rules

    An IPv4 address (optionally with anchors, scheme, port and trailing
    slashes) is emitted as-is, a ``/regexp/`` line is used as a regular
    expression directly, and any other line is treated as Adblock Plus
    basic filter syntax.

    Args:
        line (str): stripped gfwlist line without the ``@@`` prefix.

    Returns:
        List[str]
    """
    if not line:
        return []

    line = line.replace(r"\/", "/")

    # IP
    # Every octet needs at least one digit; `\d{,3}` would also accept
    # empty octets, turning input such as "..." into an "IP" rule.
    match = re.match(
        r"^\|*(?:https?://)?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(?::\d{1,5})?)/*$",
        line,
    )
    if match:
        return [match.group(1)]

    # https://adblockplus.org/filters#regexps
    if line.startswith("/") and line.endswith("/"):
        return get_rules(line[1:-1])

    return get_rules(get_regexp(line))
def get_acl_rules(_content):
    """Get acl rules from gfwlist

    Lines are stripped, comment lines (``!``) and the ``[AutoProxy`` header
    are skipped, and lines prefixed with ``@@`` are converted (without the
    prefix) into the whitelist; all other lines go to the blacklist.

    Args:
        _content (Iterable[str]): gfwlist data

    Returns:
        (List[str], List[str]): (blacklist, whitelist)
    """
    # https://adblockplus.org/filters#comments
    lines = []
    for raw in _content:
        stripped = raw.strip()
        if stripped.startswith(("!", "[AutoProxy")):
            continue
        lines.append(stripped)

    # https://adblockplus.org/filters#whitelist
    blacklist = [
        rule
        for entry in lines
        if not entry.startswith("@@")
        for rule in convert_line(entry)
    ]
    whitelist = [
        rule
        for entry in lines
        if entry.startswith("@@")
        for rule in convert_line(entry[2:])
    ]
    return blacklist, whitelist
def main():
    """Read gfwlist data via `fileinput` and print the generated acl file."""
    proxied, bypassed = get_acl_rules(fileinput.input())
    generated_at = datetime.now(ChinaTimezone()).isoformat()
    acl_text = ACL_TEMPLATE.format(
        date=generated_at,
        filename="gfwlist.acl",
        default_action="bypass_all",
        blacklist="\n".join(proxied),
        whitelist="\n".join(bypassed),
    )
    print(acl_text)


if __name__ == "__main__":
    main()