
sitemap_gen.py 11KB

#! /usr/bin/env python3

"""
Copyright (C) 2007-2009 Vladimir Toncar
Copyright (C) 2018-2019 Bernhard Ehlers

Contributors:
    Redirect handling by Pavel "ShadoW" Dvorak

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
"""

import sys
import getopt
import gzip
import urllib.error
import urllib.parse
import urllib.request
from datetime import datetime
from html.parser import HTMLParser
import xml.sax.saxutils

from reppy.robots import Robots

helpText = """sitemap_gen.py version 1.2.3 (2019-02-12)

This script crawls a web site from a given starting URL and generates
a Sitemap file in the format that is accepted by Google. The crawler
does not follow links to other web sites. It also respects the 'nofollow'
tags and will not crawl into directories disallowed in the robots.txt file.

Command line syntax:

python3 sitemap_gen.py <options> <starting URL>

Available options:
-h         --help                Print this text and exit

-b <ext>   --block <ext>         Exclude URLs with the given extension;
                                 <ext> must be without the leading dot.
                                 The comparison is case insensitive, so
                                 for example DOC and doc are treated
                                 the same. You can use this option several
                                 times to block several extensions.

-c <value> --changefreq <value>  Set the change frequency. The given value
                                 is used in all sitemap entries (maybe a
                                 future version of this script will change
                                 that). The allowed values are: always,
                                 hourly, daily, weekly, monthly, yearly,
                                 never.

-p <prio>  --priority <prio>     Set the priority. The value must be from
                                 the interval between 0.0 and 1.0. The value
                                 will be used in all sitemap entries.

-m <value> --max-urls <value>    Set the maximum number of URLs to be crawled.
                                 The default value is 1000 and the largest
                                 value that you can set is 50000 (the script
                                 generates only a single sitemap file).

-o <file>  --output-file <file>  Set the name of the generated sitemap file.
                                 The default file name is sitemap.xml.

Usage example:
python3 sitemap_gen.py -b doc -b bmp -o test_sitemap.xml http://www.your-site-name.com/index.html

For more information, visit http://toncar.cz/opensource/sitemap_gen.html
"""
allowedChangefreq = ["always", "hourly", "daily", "weekly", \
                     "monthly", "yearly", "never"]
def getPage(url):
    try:
        f = urllib.request.urlopen(url)
        page = f.read()
        if 'Content-Encoding' in f.headers and \
           f.headers['Content-Encoding'] == 'gzip':
            page = gzip.decompress(page)

        # Get the last modify date
        try:
            if 'Last-Modified' in f.headers:
                date = f.headers['Last-Modified']
            else:
                date = f.headers['Date']
            date = datetime.strptime(date, '%a, %d %b %Y %H:%M:%S %Z')
            date = (date.year, date.month, date.day)
        except (KeyError, ValueError):
            date = (0, 0, 0)

        f.close()
        return (page, date, f.url)
    except urllib.error.URLError as detail:
        print("%s. Skipping..." % (detail))
        return (None, (0, 0, 0), "")
#end def
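

# Resolve newUrl against baseUrl and strip any #fragment part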
def joinUrls(baseUrl, newUrl):
    helpUrl, _ = urllib.parse.urldefrag(newUrl)
    return urllib.parse.urljoin(baseUrl, helpUrl)
#end def
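

# Download the site's /robots.txt and build a reppy parser from it;
# returns None when the file cannot be fetched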
def getRobotParser(startUrl):
    robotUrl = urllib.parse.urljoin(startUrl, "/robots.txt")
    page, _, _ = getPage(robotUrl)

    if page is None:
        print("Could not read ROBOTS.TXT at: " + robotUrl)
        return None
    #end if

    rp = Robots.parse(robotUrl, page)
    print("Found ROBOTS.TXT at: " + robotUrl)
    return rp
#end def
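

# HTML parser that collects crawlable links: it honours <base href>,
# skips rel="nofollow" and mailto: links, stays on the start server and
# records newly discovered URLs in pageMap until maxUrls is reached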
class MyHTMLParser(HTMLParser):

    def __init__(self, pageMap, redirects, baseUrl, maxUrls, blockExtensions, robotParser):
        HTMLParser.__init__(self)
        self.pageMap = pageMap
        self.redirects = redirects
        self.baseUrl = baseUrl
        self.server = urllib.parse.urlsplit(baseUrl)[1] # netloc in python 2.5
        self.maxUrls = maxUrls
        self.blockExtensions = tuple(blockExtensions)
        self.robotParser = robotParser
    #end def

    def hasBlockedExtension(self, url):
        p = urllib.parse.urlparse(url)
        path = p[2].upper() # path attribute
        return path.endswith(self.blockExtensions)
    #end def

    def handle_starttag(self, tag, attrs):
        if len(self.pageMap) >= self.maxUrls:
            return

        if tag.upper() == "BASE":
            if attrs[0][0].upper() == "HREF":
                self.baseUrl = joinUrls(self.baseUrl, attrs[0][1])
                print("BASE URL set to " + self.baseUrl)

        if tag.upper() == "A":
            #print("Attrs: " + str(attrs))
            url = ""
            # Let's scan the list of tag's attributes
            for attr in attrs:
                #print(" attr: " + str(attr))
                if (attr[0].upper() == "REL") and (attr[1].upper().find('NOFOLLOW') != -1):
                    # We have discovered a nofollow, so we won't continue
                    return
                elif (attr[0].upper() == "HREF") and (attr[1].upper().find('MAILTO:') == -1):
                    # We have discovered a link that is not a Mailto:
                    url = joinUrls(self.baseUrl, attr[1])
            #end for

            # if the url is empty, there was none in the list of attributes
            if url == "":
                return

            # Check if we want to follow the link
            if urllib.parse.urlsplit(url)[1] != self.server:
                return
            if self.hasBlockedExtension(url) or self.redirects.count(url) > 0:
                return
            if self.robotParser is not None and not self.robotParser.allowed(url, "sitemap_gen"):
                print("URL restricted by ROBOTS.TXT: " + url)
                return

            # It's OK to add url to the map and fetch it later
            if not url in self.pageMap:
                self.pageMap[url] = ()
        #end if
    #end def
#end class
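

# Return the next URL that has been discovered but not yet fetched
# (its entry in pageMap is still the empty tuple), or None when done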
def getUrlToProcess(pageMap):
    for i in pageMap.keys():
        if pageMap[i] == ():
            return i
    return None
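

# Crawl the site starting at startUrl: fetch pending URLs, follow redirects,
# parse each page for further links and return a map of
# URL -> (year, month, day) last-modification tuples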
def parsePages(startUrl, maxUrls, blockExtensions):
    pageMap = {}
    pageMap[startUrl] = ()
    redirects = []

    robotParser = getRobotParser(startUrl)

    while True:
        url = getUrlToProcess(pageMap)
        if url is None:
            break
        print(" " + url)
        page, date, newUrl = getPage(url)
        if page is None:
            del pageMap[url]
        elif url != newUrl:
            print("Redirect -> " + newUrl)
            del pageMap[url]
            pageMap[newUrl] = ()
            redirects.append(url)
        else:
            pageMap[url] = date
            parser = MyHTMLParser(pageMap, redirects, url, maxUrls, blockExtensions, robotParser)
            try:
                parser.feed(page.decode("utf-8", errors='strict'))
                parser.close()
            except UnicodeError:
                print("Failed decoding %s . Try to check if the page is valid." % (url))
    #end while

    return pageMap
#end def
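

# Write the collected URLs as a sitemaps.org <urlset> XML file, adding
# <lastmod>, <changefreq> and <priority> elements where data is available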
def generateSitemapFile(pageMap, fileName, changefreq="", priority=0.0):
    fw = open(fileName, "wt")
    fw.write('''<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n''')
    for i in sorted(pageMap.keys()):
        fw.write('<url>\n <loc>%s</loc>\n' % (xml.sax.saxutils.escape(i)))
        if pageMap[i] not in [(), (0, 0, 0)]:
            fw.write(' <lastmod>%4d-%02d-%02d</lastmod>\n' % pageMap[i])
        if changefreq != "":
            fw.write(' <changefreq>%s</changefreq>\n' % (changefreq))
        if priority > 0.0:
            fw.write(' <priority>%1.1f</priority>\n' % (priority))
        fw.write('</url>\n')
    #end for
    fw.write('</urlset>\n')
    fw.close()
#end def
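

# Parse the command line, configure the crawler and run it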
def main():
    try:
        opts, args = getopt.getopt(sys.argv[1:],\
                     "hb:c:m:p:o:", \
                     ["help", "block=", "changefreq=", \
                      "max-urls=", "priority=", "output-file="])
    except getopt.GetoptError:
        sys.stderr.write(helpText)
        return 1

    blockExtensions = []
    changefreq = ""
    priority = 0.0
    fileName = "sitemap.xml"
    maxUrls = 1000
    pageMap = {}

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            sys.stderr.write(helpText)
            return 1
        elif opt in ("-b", "--block"):
            blockExtensions.append("." + arg.upper())
        elif opt in ("-c", "--changefreq"):
            if arg in allowedChangefreq:
                changefreq = arg
            else:
                sys.stderr.write("Allowed changefreq values are:\n")
                for i in allowedChangefreq:
                    sys.stderr.write(" {}\n".format(i))
                return 1
        elif opt in ("-m", "--max-urls"):
            maxUrls = int(arg)
            if (maxUrls < 1) or (maxUrls > 50000):
                sys.stderr.write("The maximum number of URLs must be between 1 and 50000\n")
                return 1
        elif opt in ("-p", "--priority"):
            priority = float(arg)
            if (priority < 0.0) or (priority > 1.0):
                sys.stderr.write("Priority must be between 0.0 and 1.0\n")
                return 1
        elif opt in ("-o", "--output-file"):
            fileName = arg
            if fileName in ("", ".", ".."):
                sys.stderr.write("Please provide a sensible file name\n")
                return 1
        #end if

    if not args:
        sys.stderr.write("You must provide the starting URL.\nTry the -h option for help.\n")
        return 1

    # Set user agent string
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-agent', 'sitemap_gen/1.0'),
                         ('Accept', '*/*'), ('Accept-Encoding', 'gzip')]
    urllib.request.install_opener(opener)

    # Start processing
    print("Crawling the site...")
    pageMap = parsePages(args[0], maxUrls, blockExtensions)
    print("Generating sitemap: %d URLs" % (len(pageMap)))
    generateSitemapFile(pageMap, fileName, changefreq, priority)
    print("Finished.")
    return 0
#end def


if __name__ == '__main__':
    try:
        status_code = main()
    except KeyboardInterrupt:
        status_code = 130
    sys.exit(status_code)
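
# Example invocation (hypothetical site and file names): crawl at most 500
# pages, skip links ending in .pdf or .zip, and write the result to
# my_sitemap.xml:
#
#   python3 sitemap_gen.py -m 500 -b pdf -b zip -o my_sitemap.xml https://www.example.com/
#
# The script needs the third-party 'reppy' package for robots.txt parsing,
# installed e.g. with:  pip install reppy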