mirror of https://github.com/boostorg/mysql.git, synced 2025-05-12 14:11:41 +00:00
#!/usr/bin/python3
#
# Copyright (c) 2019-2025 Ruben Perez Hidalgo (rubenperez038 at gmail dot com)
#
# Distributed under the Boost Software License, Version 1.0. (See accompanying
# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#

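"""Checks the links in the generated HTML documentation.

Walks doc/html, collects the target of every <a> element, then verifies that
external (http/https) URLs answer with a 200 status and that relative links
resolve to files that actually exist on disk. Broken links are reported on
stdout.
"""
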
import requests
from bs4 import BeautifulSoup
import os
from os import path

REPO_BASE = path.abspath(path.join(path.dirname(__file__), '..', '..'))
DOC_PATH = path.join(REPO_BASE, 'doc', 'html')

def list_doc_files():
    # Recursively collect every HTML file under the generated docs directory
    all_files = []
    for base_dir, _, files in os.walk(DOC_PATH):
        all_files += [path.join(base_dir, f) for f in files if f.endswith('.html')]
    return all_files

def get_href(elm, current_file):
    # Returns the target of an <a> element, or None if it has no href.
    # External URLs are returned as-is, except that '#error_er_' anchors are
    # stripped down to the page URL. Relative links are resolved against the
    # directory of the file that contains them and returned as absolute paths.
    try:
        res = elm['href']
    except KeyError:
        return None
    if res.startswith('http://') or res.startswith('https://'):
        if '#error_er_' in res:
            return res.split('#error_er_')[0]
        else:
            return res
    else:
        curdir = path.dirname(current_file)
        return path.realpath(path.join(curdir, res.split('#')[0]))

def extract_links():
    # Parses every doc page and classifies its links as external (http/https)
    # or internal (absolute paths into the doc tree). Each dict maps a link
    # target to one of the files that references it.
    external_links = {}
    internal_links = {}

    for fname in list_doc_files():
        with open(fname, 'rt') as f:
            html_doc = f.read()
        soup = BeautifulSoup(html_doc, 'html.parser')
        links = [get_href(elm, fname) for elm in soup.find_all('a')]
        internal_links.update({elm: fname for elm in links
                               if elm is not None and elm.startswith('/')})
        external_links.update({elm: fname for elm in links if elm is not None and
                               (elm.startswith('http://') or elm.startswith('https://'))})

    return (external_links, internal_links)

def check_external_links(links):
    # Issues a HEAD request for every external URL and reports any that does
    # not answer with a 200 status after following redirects
    s = requests.Session()
    for url in sorted(links.keys()):
        print('Checking ', url)
        response = s.head(url, allow_redirects=True)
        if response.status_code != 200:
            print(' ++++ {} response code: {}'.format(url, response.status_code))

def check_internal_links(links):
    # Internal links were resolved to absolute paths; report any that does not
    # point to an existing file
    for target, link_file in links.items():
        if not path.exists(target):
            print(' ++++ Link {} in file {} does not exist'.format(target, link_file))

def main():
    external, internal = extract_links()
    check_external_links(external)
    check_internal_links(internal)


if __name__ == '__main__':
    main()
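# Usage sketch (an assumption; the invocation path is not documented in the
# script itself): build the documentation first so that doc/html is populated,
# then run this file directly, e.g. `python3 <path-to-this-script>`. The
# third-party `requests` and `beautifulsoup4` packages must be installed.
# Broken links show up in the output prefixed with ' ++++'.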