#! /usr/bin/env python3

"""
Copyright (C) 2007-2009 Vladimir Toncar
Copyright (C) 2018-2020 Bernhard Ehlers

Contributors:
    Redirect handling by Pavel "ShadoW" Dvorak

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
"""

import sys
import getopt
import re
import string
import time
import urllib.parse
import xml.sax.saxutils
from datetime import datetime
from html.parser import HTMLParser

import requests
from reppy.robots import Robots

helpText = """sitemap_gen.py version 1.2.5 (2020-03-14)

This script crawls a web site from a given starting URL and generates
a Sitemap file in the format that is accepted by Google. The crawler
does not follow links to other web sites. It also respects the 'nofollow'
tags and will not crawl into directories disallowed in the robots.txt file.

Command line syntax:

python3 sitemap_gen.py <options> <starting URL>

Available options:
-h         --help                  Print this text and exit

-b <ext>   --block <ext>           Exclude URLs with the given extension;
                                   <ext> must be without the leading dot.
                                   The comparison is case insensitive, so
                                   for example DOC and doc are treated
                                   the same. You can use this option several
                                   times to block several extensions.

-c <value> --changefreq <value>    Set the change frequency. The given value
                                   is used in all sitemap entries (maybe a
                                   future version of this script will change
                                   that). The allowed values are: always,
                                   hourly, daily, weekly, monthly, yearly,
                                   never.

-p <prio>  --priority <prio>       Set the priority. The value must be from
                                   the interval between 0.0 and 1.0. The value
                                   will be used in all sitemap entries.

-m <value> --max-urls <value>      Set the maximum number of URLs to be crawled.
                                   The default value is 1000 and the largest
                                   value that you can set is 50000 (the script
                                   generates only a single sitemap file).

-r <value> --ratelimit <value>     Set a crawl rate limit [requests / second],
                                   zero (the default) results in no crawl rate
                                   limitation.

-o <file>  --output-file <file>    Set the name of the generated sitemap file.
                                   The default file name is sitemap.xml.

Usage example:
python3 sitemap_gen.py -b doc -b bmp -o test_sitemap.xml http://www.your-site-name.com/index.html

For more information, visit http://toncar.cz/opensource/sitemap_gen.html
"""

allowedChangefreq = ["always", "hourly", "daily", "weekly",
                     "monthly", "yearly", "never"]


class RateLimit:
    """ rate limit requests """

    def __init__(self, rate):
        if rate <= 0.0:
            self.interval_ns = None
        else:
            self.interval_ns = round(1e9 / rate)
            self.req_time_ns = time.monotonic_ns() - self.interval_ns

    def sleep(self):
        if self.interval_ns:
            self.req_time_ns += self.interval_ns
            cur_time_ns = time.monotonic_ns()
            sleep_time = (self.req_time_ns - cur_time_ns) / 1e9
            if sleep_time > 0.0:
                time.sleep(sleep_time)
            else:
                self.req_time_ns = cur_time_ns
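
# Usage sketch (not part of the original script; names and values are
# illustrative only): RateLimit(2.0) spaces successive sleep() calls roughly
# 0.5 s apart, while RateLimit(0) disables throttling entirely.
#
#   limiter = RateLimit(2.0)   # at most ~2 requests per second
#   for _ in range(3):
#       limiter.sleep()        # blocks just long enough to keep the pace
#       ...                    # issue the next HTTP request here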


class HTMLLoad:
    """ load http(s) page """

    def __init__(self, ratelimit=None):
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': 'sitemap_gen/1.0'})
        if not ratelimit:
            ratelimit = 0.0
        self.ratelimit = RateLimit(ratelimit)
        self.page = None
        self.status = 0
        self.date = None
        self.redirect = None

    def _handle_redirect(self, resp):
        # taken from urllib.request source code
        newurl = resp.headers.get("location")
        if not newurl:
            raise requests.exceptions.HTTPError(
                "%s No new location in redirection for url: %s" %
                (resp.status_code, resp.url))
        urlparts = urllib.parse.urlparse(newurl)
        if urlparts.scheme not in ('http', 'https', 'ftp', ''):
            raise requests.exceptions.HTTPError(
                "%s Redirection to '%s' not allowed for url: %s" %
                (resp.status_code, newurl, resp.url))
        if not urlparts.path and urlparts.netloc:
            urlparts = urlparts._replace(path='/')
        newurl = urllib.parse.urlunparse(urlparts)
        # http.client.parse_headers() decodes as ISO-8859-1. Recover the
        # original bytes and percent-encode non-ASCII bytes, and any special
        # characters such as the space.
        newurl = urllib.parse.quote(newurl, encoding="iso-8859-1",
                                    safe=string.punctuation)
        self.redirect = urllib.parse.urljoin(resp.url, newurl)

    def get(self, url, allow_redirects=False):
        self.page = None
        self.status = 0
        self.date = datetime.now()
        self.redirect = None
        self.ratelimit.sleep()
        try:
            resp = self.session.get(url, timeout=10,
                                    allow_redirects=allow_redirects)
            self.status = resp.status_code
            if resp.status_code in (301, 302, 303, 307):
                self._handle_redirect(resp)
            else:
                resp.raise_for_status()
            self.page = resp.content
            date = resp.headers.get('last-modified') or resp.headers.get('date')
            try:
                if date:
                    self.date = datetime.strptime(date,
                                                  '%a, %d %b %Y %H:%M:%S %Z')
            except ValueError:
                pass
        except requests.exceptions.RequestException as detail:
            msg = str(detail)
            match = re.search(r"\(Caused by ([a-zA-Z0-9_]+)\('[^:]*: (.*)'\)",
                              msg)
            if match:
                msg = "{}: {} for url: {}".format(*match.groups(), url)
            print("%s. Skipping..." % (msg))
        return self.page
#end class
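
# Usage sketch (illustrative, not part of the script; the URL is hypothetical):
# get() returns the response body (or None on error) and leaves the status
# code, a redirect target (if any) and the page date on the loader object.
#
#   loader = HTMLLoad(ratelimit=1.0)
#   body = loader.get("http://www.example.com/", allow_redirects=False)
#   if body is None:
#       print("fetch failed, status", loader.status)
#   elif loader.redirect:
#       print("redirected to", loader.redirect)
#   else:
#       print(len(body), "bytes, last modified", loader.date)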


def joinUrls(baseUrl, newUrl):
    helpUrl, _ = urllib.parse.urldefrag(newUrl)
    return urllib.parse.urljoin(baseUrl, helpUrl)
#end def


def getRobotParser(loader, startUrl):
    robotUrl = urllib.parse.urljoin(startUrl, "/robots.txt")
    page = loader.get(robotUrl, allow_redirects=True)

    if page is None:
        print("Could not read ROBOTS.TXT at: " + robotUrl)
        return None
    #end if

    rp = Robots.parse(robotUrl, page)
    print("Found ROBOTS.TXT at: " + robotUrl)
    return rp
#end def
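
# Example (hypothetical URLs): the returned reppy Robots object answers
# per-URL queries with the same allowed() call used further below.
#
#   rp = getRobotParser(HTMLLoad(), "http://www.example.com/index.html")
#   if rp is not None:
#       rp.allowed("http://www.example.com/private/page.html", "sitemap_gen")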


EMPTY = []

class MyHTMLParser(HTMLParser):

    def __init__(self, pageMap, redirects, baseUrl, maxUrls, blockExtensions, robotParser):
        HTMLParser.__init__(self)
        self.pageMap = pageMap
        self.redirects = redirects
        self.baseUrl = baseUrl
        self.server = urllib.parse.urlsplit(baseUrl)[1] # netloc in python 2.5
        self.maxUrls = maxUrls
        self.blockExtensions = tuple(blockExtensions)
        self.robotParser = robotParser
    #end def

    def hasBlockedExtension(self, url):
        p = urllib.parse.urlparse(url)
        path = p[2].upper() # path attribute
        return path.endswith(self.blockExtensions)
    #end def

    def handle_starttag(self, tag, attrs):
        if len(self.pageMap) >= self.maxUrls:
            return

        if tag.upper() == "BASE":
            if attrs[0][0].upper() == "HREF":
                self.baseUrl = joinUrls(self.baseUrl, attrs[0][1])
                print("BASE URL set to " + self.baseUrl)

        if tag.upper() == "A":
            #print("Attrs: " + str(attrs))
            url = ""
            # Let's scan the list of tag's attributes
            for attr in attrs:
                #print("  attr: " + str(attr))
                if (attr[0].upper() == "REL") and (attr[1].upper().find('NOFOLLOW') != -1):
                    # We have discovered a nofollow, so we won't continue
                    return
                elif (attr[0].upper() == "HREF") and (attr[1].upper().find('MAILTO:') == -1):
                    # We have discovered a link that is not a Mailto:
                    url = joinUrls(self.baseUrl, attr[1])
            #end for

            # If the url is empty, there was none in the list of attributes
            if url == "":
                return

            # Check if we want to follow the link
            if urllib.parse.urlsplit(url)[1] != self.server:
                return
            if self.hasBlockedExtension(url) or self.redirects.count(url) > 0:
                return
            if self.robotParser is not None and not self.robotParser.allowed(url, "sitemap_gen"):
                print("URL restricted by ROBOTS.TXT: " + url)
                return
            # It's OK to add url to the map and fetch it later
            if not url in self.pageMap:
                self.pageMap[url] = EMPTY
        #end if
    #end def
#end class
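
# Parser usage sketch (hypothetical HTML and base URL): after feed(), every
# same-site, non-blocked, non-nofollow link has been added to pageMap with
# the EMPTY marker so it will be crawled later.
#
#   pageMap = {"http://www.example.com/": EMPTY}
#   parser = MyHTMLParser(pageMap, [], "http://www.example.com/", 1000, [], None)
#   parser.feed('<a href="/about.html">About</a> <a rel="nofollow" href="/x">x</a>')
#   parser.close()
#   # pageMap now also contains "http://www.example.com/about.html"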


def getUrlToProcess(pageMap):
    for i in pageMap.keys():
        if pageMap[i] is EMPTY:
            return i
    return None


def parsePages(loader, startUrl, maxUrls, blockExtensions):
    pageMap = {}
    pageMap[startUrl] = EMPTY
    redirects = []

    robotParser = getRobotParser(loader, startUrl)
    server = urllib.parse.urlsplit(startUrl)[1]

    while True:
        url = getUrlToProcess(pageMap)
        if url is None:
            break
        print(" " + url)
        page = loader.get(url)
        if page is None:
            del pageMap[url]
        elif loader.redirect:
            newUrl, _ = urllib.parse.urldefrag(loader.redirect)
            print("Redirect -> " + newUrl)
            del pageMap[url]
            redirects.append(url)
            if urllib.parse.urlsplit(newUrl)[1] == server and \
               newUrl not in pageMap and newUrl not in redirects and \
               (robotParser is None or
                robotParser.allowed(newUrl, "sitemap_gen")):
                pageMap[newUrl] = EMPTY
        else:
            pageMap[url] = loader.date
            parser = MyHTMLParser(pageMap, redirects, url, maxUrls, blockExtensions, robotParser)
            try:
                parser.feed(page.decode("utf-8", errors='strict'))
                parser.close()
            except UnicodeError:
                print("Failed decoding %s . Try to check if the page is valid." % (url))
    #end while

    return pageMap
#end def
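
# Resulting data shape (illustrative): parsePages() maps each successfully
# crawled URL to the datetime taken from its Last-Modified/Date header, or
# the fetch time, e.g. {"http://www.example.com/": datetime(2020, 3, 14)}.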


def generateSitemapFile(pageMap, fileName, changefreq="", priority=0.0):
    fw = open(fileName, "wt")
    fw.write('''<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n''')
    for i in sorted(pageMap.keys()):
        fw.write('<url>\n <loc>%s</loc>\n' % (xml.sax.saxutils.escape(i)))
        if isinstance(pageMap[i], datetime):
            fw.write(' <lastmod>%4d-%02d-%02d</lastmod>\n' %
                     (pageMap[i].year, pageMap[i].month, pageMap[i].day))
        if changefreq != "":
            fw.write(' <changefreq>%s</changefreq>\n' % (changefreq))
        if priority > 0.0:
            fw.write(' <priority>%1.1f</priority>\n' % (priority))
        fw.write('</url>\n')
    #end for
    fw.write('</urlset>\n')
    fw.close()
#end def
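
# Each entry written above has this shape (URL and date are placeholders;
# <changefreq>/<priority> appear only when the corresponding option is set):
#
#   <url>
#    <loc>http://www.example.com/page.html</loc>
#    <lastmod>2020-03-14</lastmod>
#    <changefreq>weekly</changefreq>
#    <priority>0.5</priority>
#   </url>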


def main():
    try:
        opts, args = getopt.getopt(sys.argv[1:],
                                   "hb:c:m:p:r:o:",
                                   ["help", "block=", "changefreq=", "max-urls=",
                                    "priority=", "ratelimit=", "output-file="])
    except getopt.GetoptError:
        sys.stderr.write(helpText)
        return 1

    blockExtensions = []
    changefreq = ""
    priority = 0.0
    fileName = "sitemap.xml"
    maxUrls = 1000
    pageMap = {}
    ratelimit = None

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            sys.stderr.write(helpText)
            return 1
        elif opt in ("-b", "--block"):
            blockExtensions.append("." + arg.upper())
        elif opt in ("-c", "--changefreq"):
            if arg in allowedChangefreq:
                changefreq = arg
            else:
                sys.stderr.write("Allowed changefreq values are:\n")
                for i in allowedChangefreq:
                    sys.stderr.write(" {}\n".format(i))
                return 1
        elif opt in ("-m", "--max-urls"):
            maxUrls = int(arg)
            if (maxUrls < 1) or (maxUrls > 50000):
                sys.stderr.write("The maximum number of URLs must be between 1 and 50000\n")
                return 1
        elif opt in ("-p", "--priority"):
            priority = float(arg)
            if (priority < 0.0) or (priority > 1.0):
                sys.stderr.write("Priority must be between 0.0 and 1.0\n")
                return 1
        elif opt in ("-r", "--ratelimit"):
            ratelimit = float(arg)
        elif opt in ("-o", "--output-file"):
            fileName = arg
            if fileName in ("", ".", ".."):
                sys.stderr.write("Please provide a sensible file name\n")
                return 1
        #end if

    if not args:
        sys.stderr.write("You must provide the starting URL.\nTry the -h option for help.\n")
        return 1

    # Start processing
    print("Crawling the site...")
    loader = HTMLLoad(ratelimit)
    pageMap = parsePages(loader, args[0], maxUrls, blockExtensions)
    print("Generating sitemap: %d URLs" % (len(pageMap)))
    generateSitemapFile(pageMap, fileName, changefreq, priority)
    print("Finished.")
    return 0
#end def


if __name__ == '__main__':
    try:
        status_code = main()
    except KeyboardInterrupt:
        status_code = 130
    sys.exit(status_code)