# robots.txt for http://pennine.ddns.me.uk/
# this is the public server on spey
#
# Layout notes: one directive per line; a "#" starts a comment that runs to
# the end of the line. Blank lines are used only BETWEEN user-agent groups,
# because some older parsers treat a blank line as the end of a group.

# SemrushBot seems specifically to provide search results which
# facilitate blog spam, so we would choose to block it from the
# entire site. Its web pages suggest that it does respect your
# robots.txt, but the IP address from which we found it
# crawling is also blocked in the firewall blanketban script
User-agent: SemrushBot
Disallow: /

User-agent: SemrushBot-SA
Disallow: /

User-agent: *
# don't want ANY pages which duplicate www.pennine.demon.co.uk or www.waddingtons.info reachable by search engines
# Partly irrelevant now that www.pennine.demon.co.uk has gone
Disallow: /Mary/
Disallow: /Andy/
Disallow: /Sarah/
Disallow: /Michael/
Disallow: /5434/
Disallow: /ajaxterm/
Disallow: /Arboretum/
Disallow: /cgi-bin/
Disallow: /CUCC/
Disallow: /Design/
Disallow: /dynamicdrive/
Disallow: /ebay/
Disallow: /family/
Disallow: /galleria/
# downloads will come from here, but not be searchable
Disallow: /Gallery
# do we actually need this or should we just delete the directory ?
Disallow: /GMFoto/
# we allow graphics
Disallow: /Icons/
# we allow img
Disallow: /Java/
Disallow: /js/
Disallow: /kayak/
Disallow: /lofoten/
# we do allow /mail/[index.htm]
Disallow: /NPC/
Disallow: /Pennine/
# we do allow /phone/
Disallow: /photos/
# not actually sure how this is mounted and appearing here, but block it
Disallow: /blog/photos/mainarchive/
# we do allow /publickeys/
# we do allow riverlevels
# gone
# NOTE(review): unclear whether the /ROCK/ rule was meant to be commented out
# now that it is "gone"; it is kept active, which is harmless if the
# directory no longer exists.
Disallow: /ROCK/
# we don't want routes searchable as we don't want traffic to wtp2
Disallow: /routes/
Disallow: /SkiGuide/
Disallow: /SOC/
# svn is not in /home/pub/www - but see the apache2 config for why it needs to be here
Disallow: /svn/
# we are allowing uploaded - but stuff here should be temporary
Disallow: /uploads/
# /uploads is also protected by .htaccess
Disallow: /video/
# if we are using websupport things on the SOC site, then they need to
# support https, so we can't use the www.waddingtons.info versions. OTOH
# this stuff doesn't want to be *searchable*, so disallow robots
Disallow: /websupport/
Disallow: /xfamily/
# xss only exists to support our own experiments / penetration testing using cross-site scripting
Disallow: /xss/
# the root page ought not to duplicate www.pennine... but is not designed for human use anyway (use Home.htm)
Disallow: /index.htm
# don't want ephemeral things intended for specific recipients. Most of the time this doesn't exist anyway
# (rule currently disabled; remove the leading "# " to enable it)
# Disallow: /temp/