Sitemap Generator

sitemap_gen.py 11KB

#! /usr/bin/env python3
"""
Copyright (C) 2007-2009 Vladimir Toncar
Copyright (C) 2018 Bernhard Ehlers

Contributors:
    Redirect handling by Pavel "ShadoW" Dvorak

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
"""

import sys
import getopt
import urllib.error
import urllib.parse
import urllib.request
from datetime import datetime
from html.parser import HTMLParser
import xml.sax.saxutils
from reppy.robots import Robots

helpText = """sitemap_gen.py version 1.2.1 (2018-07-10)

This script crawls a web site from a given starting URL and generates
a Sitemap file in the format that is accepted by Google. The crawler
does not follow links to other web sites. It also respects the 'nofollow'
tags and will not crawl into directories disallowed in the robots.txt file.

Command line syntax:

python3 sitemap_gen.py <options> <starting URL>

Available options:
-h         --help                  Print this text and exit

-b <ext>   --block <ext>           Exclude URLs with the given extension;
                                   <ext> must be without the leading dot.
                                   The comparison is case insensitive, so
                                   for example DOC and doc are treated
                                   the same. You can use this option several
                                   times to block several extensions.

-c <value> --changefreq <value>    Set the change frequency. The given value
                                   is used in all sitemap entries (maybe a
                                   future version of this script will change
                                   that). The allowed values are: always,
                                   hourly, daily, weekly, monthly, yearly,
                                   never.

-p <prio>  --priority <prio>       Set the priority. The value must be from
                                   the interval between 0.0 and 1.0. The value
                                   will be used in all sitemap entries.

-m <value> --max-urls <value>      Set the maximum number of URLs to be crawled.
                                   The default value is 1000 and the largest
                                   value that you can set is 50000 (the script
                                   generates only a single sitemap file).

-o <file>  --output-file <file>    Set the name of the generated sitemap file.
                                   The default file name is sitemap.xml.

Usage example:
python3 sitemap_gen.py -b doc -b bmp -o test_sitemap.xml http://www.your-site-name.com/index.html

For more information, visit http://toncar.cz/opensource/sitemap_gen.html
"""

allowedChangefreq = ["always", "hourly", "daily", "weekly", \
                     "monthly", "yearly", "never"]


def getPage(url):
    # Fetch a URL; return (content, (year, month, day), final URL after redirects).
    try:
        f = urllib.request.urlopen(url)
        page = f.read()

        # Get the last modified date
        try:
            if 'Last-Modified' in f.headers:
                date = f.headers['Last-Modified']
            else:
                date = f.headers['Date']
            date = datetime.strptime(date, '%a, %d %b %Y %H:%M:%S %Z')
            date = (date.year, date.month, date.day)
        except (KeyError, ValueError):
            date = (0, 0, 0)

        f.close()
        return (page, date, f.url)
    except urllib.error.URLError as detail:
        print("%s. Skipping..." % (detail))
        return (None, (0, 0, 0), "")
#end def


def joinUrls(baseUrl, newUrl):
    # Resolve newUrl against baseUrl and drop any fragment.
    helpUrl, _ = urllib.parse.urldefrag(newUrl)
    return urllib.parse.urljoin(baseUrl, helpUrl)
#end def


def getRobotParser(startUrl):
    robotUrl = urllib.parse.urljoin(startUrl, "/robots.txt")
    page, _, _ = getPage(robotUrl)

    if page is None:
        print("Could not read ROBOTS.TXT at: " + robotUrl)
        return None
    #end if

    rp = Robots.parse(robotUrl, page)
    print("Found ROBOTS.TXT at: " + robotUrl)
    return rp
#end def

class MyHTMLParser(HTMLParser):

    def __init__(self, pageMap, redirects, baseUrl, maxUrls, blockExtensions, robotParser):
        HTMLParser.__init__(self)
        self.pageMap = pageMap
        self.redirects = redirects
        self.baseUrl = baseUrl
        self.server = urllib.parse.urlsplit(baseUrl)[1] # netloc in python 2.5
        self.maxUrls = maxUrls
        self.blockExtensions = tuple(blockExtensions)
        self.robotParser = robotParser
    #end def

    def hasBlockedExtension(self, url):
        p = urllib.parse.urlparse(url)
        path = p[2].upper() # path attribute
        return path.endswith(self.blockExtensions)
    #end def

    def handle_starttag(self, tag, attrs):
        if len(self.pageMap) >= self.maxUrls:
            return

        if tag.upper() == "BASE":
            if attrs[0][0].upper() == "HREF":
                self.baseUrl = joinUrls(self.baseUrl, attrs[0][1])
                print("BASE URL set to " + self.baseUrl)

        if tag.upper() == "A":
            #print("Attrs: " + str(attrs))
            url = ""
            # Let's scan the list of tag's attributes
            for attr in attrs:
                #print(" attr: " + str(attr))
                if (attr[0].upper() == "REL") and (attr[1].upper().find('NOFOLLOW') != -1):
                    # We have discovered a nofollow, so we won't continue
                    return
                elif (attr[0].upper() == "HREF") and (attr[1].upper().find('MAILTO:') == -1):
                    # We have discovered a link that is not a Mailto:
                    url = joinUrls(self.baseUrl, attr[1])
            #end for

            # if the url is empty, there was none in the list of attributes
            if url == "":
                return

            # Check if we want to follow the link
            if urllib.parse.urlsplit(url)[1] != self.server:
                return

            if self.hasBlockedExtension(url) or self.redirects.count(url) > 0:
                return

            if self.robotParser is not None and not self.robotParser.allowed(url, "*"):
                print("URL restricted by ROBOTS.TXT: " + url)
                return

            # It's OK to add url to the map and fetch it later
            if not url in self.pageMap:
                self.pageMap[url] = ()
        #end if
    #end def
#end class

def getUrlToProcess(pageMap):
    # Return a URL that has not been fetched yet, or None when the crawl is done.
    for i in pageMap.keys():
        if pageMap[i] == ():
            return i
    return None


def parsePages(startUrl, maxUrls, blockExtensions):
    # Crawl the site starting at startUrl; return a map of URL -> last-modified date.
    pageMap = {}
    pageMap[startUrl] = ()
    redirects = []

    robotParser = getRobotParser(startUrl)

    while True:
        url = getUrlToProcess(pageMap)
        if url is None:
            break
        print(" " + url)
        page, date, newUrl = getPage(url)
        if page is None:
            del pageMap[url]
        elif url != newUrl:
            print("Redirect -> " + newUrl)
            del pageMap[url]
            pageMap[newUrl] = ()
            redirects.append(url)
        else:
            pageMap[url] = date
            parser = MyHTMLParser(pageMap, redirects, url, maxUrls, blockExtensions, robotParser)
            try:
                parser.feed(page.decode("utf-8", errors='strict'))
                parser.close()
            except UnicodeError:
                print("Failed decoding %s. Try to check if the page is valid." % (url))
    #end while

    return pageMap
#end def

def generateSitemapFile(pageMap, fileName, changefreq="", priority=0.0):
    fw = open(fileName, "wt")
    fw.write('''<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n''')
    for i in sorted(pageMap.keys()):
        fw.write('<url>\n <loc>%s</loc>\n' % (xml.sax.saxutils.escape(i)))
        if pageMap[i] not in [(), (0, 0, 0)]:
            fw.write(' <lastmod>%4d-%02d-%02d</lastmod>\n' % pageMap[i])
        if changefreq != "":
            fw.write(' <changefreq>%s</changefreq>\n' % (changefreq))
        if priority > 0.0:
            fw.write(' <priority>%1.1f</priority>\n' % (priority))
        fw.write('</url>\n')
    #end for
    fw.write('</urlset>\n')
    fw.close()
#end def

def main():
    try:
        opts, args = getopt.getopt(sys.argv[1:], \
                                   "hb:c:m:p:o:", \
                                   ["help", "block=", "changefreq=", \
                                    "max-urls=", "priority=", "output-file="])
    except getopt.GetoptError:
        sys.stderr.write(helpText)
        return 1

    blockExtensions = []
    changefreq = ""
    priority = 0.0
    fileName = "sitemap.xml"
    maxUrls = 1000
    pageMap = {}

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            sys.stderr.write(helpText)
            return 1
        elif opt in ("-b", "--block"):
            blockExtensions.append("." + arg.upper())
        elif opt in ("-c", "--changefreq"):
            if arg in allowedChangefreq:
                changefreq = arg
            else:
                sys.stderr.write("Allowed changefreq values are:\n")
                for i in allowedChangefreq:
                    sys.stderr.write(" {}\n".format(i))
                return 1
        elif opt in ("-m", "--max-urls"):
            maxUrls = int(arg)
            if (maxUrls < 1) or (maxUrls > 50000):
                sys.stderr.write("The maximum number of URLs must be between 1 and 50000\n")
                return 1
        elif opt in ("-p", "--priority"):
            priority = float(arg)
            if (priority < 0.0) or (priority > 1.0):
                sys.stderr.write("Priority must be between 0.0 and 1.0\n")
                return 1
        elif opt in ("-o", "--output-file"):
            fileName = arg
            if fileName in ("", ".", ".."):
                sys.stderr.write("Please provide a sensible file name\n")
                return 1
    #end for

    if not args:
        sys.stderr.write("You must provide the starting URL.\nTry the -h option for help.\n")
        return 1

    # Set user agent string
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-agent', 'sitemap_gen/1.0')]
    urllib.request.install_opener(opener)

    # Start processing
    print("Crawling the site...")
    pageMap = parsePages(args[0], maxUrls, blockExtensions)

    print("Generating sitemap: %d URLs" % (len(pageMap)))
    generateSitemapFile(pageMap, fileName, changefreq, priority)

    print("Finished.")
    return 0
#end def

if __name__ == '__main__':
    try:
        status_code = main()
    except KeyboardInterrupt:
        status_code = 130
    sys.exit(status_code)
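
Programmatic usage (sketch)

Besides the command line, the functions above can be driven directly from Python. The snippet below is a minimal, untested sketch: it assumes the file is saved as sitemap_gen.py somewhere on the import path, that the reppy package is installed, and it uses http://www.example.com/ only as a placeholder start URL.

import urllib.request

import sitemap_gen

# Install the same User-agent header that main() sets, so the crawler
# identifies itself consistently to the target server.
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'sitemap_gen/1.0')]
urllib.request.install_opener(opener)

# Crawl at most 100 pages, skipping links that end in .pdf. Blocked
# extensions are passed uppercase with a leading dot, matching how
# main() builds the list for the -b option.
pages = sitemap_gen.parsePages("http://www.example.com/", 100, [".PDF"])

# Write the sitemap with a weekly change frequency and no priority tag.
sitemap_gen.generateSitemapFile(pages, "sitemap.xml", changefreq="weekly")

The resulting file is a single <urlset> whose <url> entries each carry a <loc> element, plus <lastmod>, <changefreq> and <priority> elements when the corresponding data or options are present.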