# $Id: robots.txt,v 1.9.2.1 2008/12/10 20:12:19 goba Exp $
#
# robots.txt
#
# This file is to prevent the crawling and indexing of certain parts
# of your site by web crawlers and spiders run by sites like Yahoo!
# and Google. By telling these "robots" where not to go on your site,
# you save bandwidth and server resources.
#
# This file will be ignored unless it is at the root of your host:
# Used:    http://example.com/robots.txt
# Ignored: http://example.com/site/robots.txt
#
# For more information about the robots.txt standard, see:
# http://www.robotstxt.org/wc/robots.html
#
# For syntax checking, see:
# http://www.sxw.org.uk/computing/robots/check.html
#
# Formatting note: every directive must sit on its own line, and each
# User-agent group below is separated from the next by a blank line.
# A directive that shares a line with a leading "#" is only a comment.

# Thank you Wikipedia for, as usual, being a tremendous resource for,
# y'know, everything, and in this case, for which robots to try to block
# The following was taken from http://en.wikipedia.org/robots.txt

# Some bots are known to be trouble, particularly those designed to copy
# entire sites. Please obey robots.txt.
# Each group below bans one named crawler from the whole site.
User-agent: sitecheck.internetseer.com
Disallow: /

User-agent: Zealbot
Disallow: /

User-agent: MSIECrawler
Disallow: /

User-agent: SiteSnagger
Disallow: /

User-agent: WebStripper
Disallow: /

User-agent: WebCopier
Disallow: /

User-agent: Fetch
Disallow: /

User-agent: Offline Explorer
Disallow: /

User-agent: Teleport
Disallow: /

User-agent: TeleportPro
Disallow: /

User-agent: WebZIP
Disallow: /

User-agent: linko
Disallow: /

User-agent: HTTrack
Disallow: /

User-agent: Microsoft.URL.Control
Disallow: /

User-agent: Xenu
Disallow: /

User-agent: larbin
Disallow: /

User-agent: libwww
Disallow: /

User-agent: ZyBORG
Disallow: /

User-agent: Download Ninja
Disallow: /

# Hits many times per second, not acceptable
# http://www.nameprotect.com/botinfo.html
User-agent: NPBot
Disallow: /

# A capture bot, downloads gazillions of pages with no public benefit
# http://www.webreaper.net/
User-agent: WebReaper
Disallow: /

# User-agent: Googlebot
# User-agent: Slurp
# User-agent: MSNbot
# User-agent: Teoma # Ask.com
# Default group: applies to every crawler that is not matched by a more
# specific User-agent group elsewhere in this file.
# NOTE: keep this group free of blank lines; some older parsers treat a
# blank line as the end of the record. Use "#" lines as separators.
User-agent: *
# Crawl-delay is a non-standard extension; crawlers that support it are
# asked to pause this many seconds between requests.
Crawl-delay: 10
# Directories
Disallow: /includes/
Disallow: /misc/
Disallow: /modules/
Disallow: /profiles/
Disallow: /scripts/
Disallow: /sites/all/
Disallow: /sites/default/
Allow: /sites/default/files/
Disallow: /sites/manomet.org/
Allow: /sites/manomet.org/files/partnerships-pdfs/
Allow: /sites/manomet.org/files/scidocs-pdfs/
Allow: /sites/manomet.org/files/today-manomet-pdfs/
Disallow: /sites/manomet.org/files/textimage/
Disallow: /sites/ccwa/
Allow: /sites/ccwa/files/
Disallow: /sites/climateandwildlife.org/
Disallow: /sites/arcticblog.manomet.org/
Disallow: /sites/70.32.115.167.arcticblog/
Disallow: /themes/
# Files
Disallow: /CHANGELOG.txt
Disallow: /cron.php
Disallow: /INSTALL.mysql.txt
Disallow: /INSTALL.pgsql.txt
Disallow: /install.php
Disallow: /INSTALL.txt
Disallow: /LICENSE.txt
Disallow: /MAINTAINERS.txt
Disallow: /update.php
Disallow: /UPGRADE.txt
Disallow: /xmlrpc.php
# added April 2010 by Alison
Disallow: /favicon.ico
Disallow: /favicon.gif
Disallow: /sites/default/files/index.php
Disallow: /news.html
Disallow: /index.html
Disallow: /index.htm
Disallow: /google_map.php
# Wildcard rules: "*" matches any run of characters and a trailing "$"
# anchors the match to the end of the URL (supported by crawlers that
# implement pattern matching).
Disallow: /*.htm$
Disallow: /*.html$
Disallow: /*.php$
Allow: /sites/*/files/*.pdf$
Allow: /sites/*/files/*.xls$
Allow: /sites/*/files/*.mp3$
# Paths (clean URLs)
Disallow: /admin/
Disallow: /comment/reply/
Disallow: /contact/
Disallow: /logout/
Disallow: /node/add/
Disallow: /search/
Disallow: /user/register/
Disallow: /user/password/
Disallow: /user/login/
# added April 2010 by Alison
Disallow: /data/Unsorted/
Disallow: /data/unsorted/
Disallow: /map/
Disallow: /esp/
Disallow: /textsize/
Disallow: /admin/build/views/
Disallow: /admin/build/views/edit/
Disallow: /admin/build/views/clone/
Disallow: /admin/build/views/export/
Disallow: /views_ui_basic/
Disallow: /rightsideitem/
Disallow: /contentblockbutton/
Disallow: /contentblock/
Disallow: /contentblocksecthead/
# Exceptions to the rules above: these specific pages stay crawlable.
Allow: /contact/executive-office-team
Allow: /contact/contact-whsrn
Allow: /sites/list-sites/whsrn-list-sites
# Pattern-matching (Googlebot and Slurp obey)
Disallow: /*/main.php$
Disallow: /*/news.html
Disallow: /*/index.html
Disallow: /*/index.htm
# allow any URL that begins with your domain name, followed by a string,
# followed by a ?, with no characters after the ?; via
# http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=156449
Allow: /*?$
# block any URL that begins with your domain name, followed by any string,
# followed by a question mark, followed by any string; via
# http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=156449
Disallow: /*?
# Paths (no clean URLs)
Disallow: /?q=admin/
Disallow: /?q=comment/reply/
Disallow: /?q=contact/
Disallow: /?q=logout/
Disallow: /?q=node/add/
Disallow: /?q=search/
Disallow: /?q=user/password/
Disallow: /?q=user/register/
Disallow: /?q=user/login/