Subdomain Posts
None | 4 days ago
Python | 21 days ago
C# | 25 days ago
C# | 26 days ago
C# | 28 days ago
C# | 28 days ago
C | 172 days ago
Python | 175 days ago
C++ | 196 days ago
PHP | 260 days ago
Recent Posts
None | 8 sec ago
HTML | 10 sec ago
None | 11 sec ago
None | 14 sec ago
None | 19 sec ago
None | 23 sec ago
None | 26 sec ago
Bash | 1 min ago
None | 1 min ago
None | 1 min ago
Sitereport
Find cool info about any domain on the internet?
visit sitereport
Free Subdomains
Want a pastebin.com sub-domain for your community?
learn more...
What is pastebin?
Pastebin is a website that hosts all your text & code on dedicated servers for easy sharing.
learn more...
Learn a little bit about the new Pastebin.com on our help page. hide message
By IceDragon on the 4th of Jul 2009 02:40:33 PM Download | Raw | Embed | Report
  1. <?php
  2. /*
  3.  *** robots.txt processing class
  4.  * Author: IceDragon of QuickFox.org
  5.  *         http://www.icerealm.org/
  6.  *
  7.  * Feel free to modify/use for any purpose.
  8.  *
  9.  * Change Log:
  10.  *   1.0.0 [20090704] - Initial release.
  11.  */
  12.  
  /**
   * Minimal robots.txt exclusion-rule processor.
   *
   * Load() parses a robots.txt file into $gRules; IsAllowed() then answers
   * whether the configured UserAgent may fetch a given path.
   *
   * Limitations (unchanged from the initial release):
   *   - Only "User-agent" and "Disallow" records are interpreted. "Allow"
   *     records and wildcard patterns are not supported.
   *   - Rules of the "*" group are always applied in addition to the rules of
   *     any group whose agent token matches the configured UserAgent.
   */
  class Robots {
      /** Members **/
      // UserAgent string that IsAllowed() matches rule groups against.
      public $gUserAgent = NULL;
      // Map of lower-cased user-agent token => list of disallowed path prefixes.
      public $gRules     = array();

      /** Constructor **/
      // Class constructor - optionally accepts a path to preload robots.txt
      // exclusion rules from, and a UserAgent string to match rules against.
      // Throws Exception (from Load) when $path is given but cannot be opened.
      public function __construct( $path = NULL, $userAgent = FALSE )
      {
          if( $userAgent )
              $this->SetUserAgent( $userAgent );
          // Loose comparison kept on purpose: NULL, "" and FALSE all mean
          // "do not preload", exactly as before.
          if( $path != NULL )
              $this->Load( $path );
      }


      /** Methods **/
      // This function loads a robots.txt file from a specific path and stores
      // the exclusion rules in the $gRules member, replacing any rules loaded
      // earlier. Use this to prime the class with data.
      //
      // Parsing notes:
      //   - Everything after a '#' is a comment and is dropped.
      //   - Field names are case-insensitive; the value is whatever follows the
      //     first ':' (whitespace after the colon is optional). Only the first
      //     whitespace-delimited token of the value is used.
      //   - Consecutive "User-agent" lines form one group sharing the
      //     "Disallow" rules that follow them.
      //   - "Disallow" lines before any "User-agent" line go to the "*" group.
      //   - An empty "Disallow:" means "nothing is disallowed" and adds no rule.
      //
      // Throws Exception if $path cannot be opened.
      public function Load( $path )
      {
          $fd = fopen( $path, 'r' );
          if( !$fd )
              throw new Exception("Unable to open path `$path`");

          // The "*" group always exists, even when the file never names it.
          $this->gRules = array( "*" => array() );
          $agents       = array( "*" );  // Agents of the group being read.
          $in_agents    = false;         // TRUE while inside a run of User-agent lines.

          // Loop on the fgets() result instead of feof() so that a last line
          // without a trailing newline is still processed.
          while( ( $line = fgets($fd) ) !== false )
          {
              // Strip comments (whole-line or trailing) and surrounding space.
              $hash = strpos( $line, '#' );
              if( $hash !== false )
                  $line = substr( $line, 0, $hash );
              $line = trim( $line );
              if( $line === '' )
                  continue;

              // Split "field: value" on the first colon.
              $colon = strpos( $line, ':' );
              if( $colon === false )
                  continue;
              $field = strtolower( trim( substr( $line, 0, $colon ) ) );
              $value = $this->_FirstToken( trim( substr( $line, $colon + 1 ) ) );

              // UserAgent change.
              if( $field === 'user-agent' )
              {
                  if( $value === '' )
                      continue;

                  // The first User-agent line after a rule starts a new group.
                  if( !$in_agents )
                  {
                      $agents    = array();
                      $in_agents = true;
                  }

                  $agent    = strtolower( $value );
                  $agents[] = $agent;
                  if( !array_key_exists( $agent, $this->gRules ) )
                      $this->gRules[$agent] = array();
                  continue;
              }

              // Rule lines close the current run of User-agent lines. "Allow"
              // is recognised only for that purpose; its value is ignored.
              if( $field === 'disallow' || $field === 'allow' )
                  $in_agents = false;

              // Disallow rule.
              if( $field === 'disallow' && $value !== '' )
                  foreach( $agents as $agent )
                      $this->gRules[$agent][] = $value;
          }

          fclose($fd);
      } // Load()


      // This function sets the UserAgent to match URLs against.
      public function SetUserAgent( $userAgent )
      {
          $this->gUserAgent = $userAgent;
      } // SetUserAgent()


      // This function verifies if the current UserAgent is allowed to access the
      // specific URL. Returns TRUE if allowed, FALSE otherwise.
      // A rule group applies when its agent token is "*" or is contained
      // (case-insensitively) in the current UserAgent string.
      // NOTE: Do not specify the full URL (http://...), only the part after the
      //       domain! i.e.: IsAllowed("/robots.txt");
      public function IsAllowed( $url )
      {
          $current_agent = strtolower( (string)$this->gUserAgent );

          // Locate a matching UserAgent string from the list.
          foreach( $this->gRules as $user_agent => $rules )
          {
              // PHP turns numeric-looking array keys into integers; compare as
              // strings so such a key can never be mistaken for "*".
              $user_agent = (string)$user_agent;

              if( $user_agent !== "*" )
              {
                  // strpos() !== false is the reliable "contains" test; the
                  // strstr() result can itself be a falsy string such as "0".
                  if( $user_agent === '' || strpos( $current_agent, $user_agent ) === false )
                      continue;
              }

              foreach( $rules as $str )
              {
                  // An empty prefix would match every URL; treat it as no rule.
                  if( (string)$str === '' )
                      continue;
                  if( $this->_StartsWith( $url, $str ) )
                      return FALSE;
              }
          }

          return TRUE;
      } // IsAllowed()


      // Check if a string starts with a substring (byte-wise, case-sensitive).
      private function _StartsWith( $str, $subStr )
      {
          $subStr = (string)$subStr;
          // strncmp avoids the numeric-string juggling of a loose == compare.
          return ( strncmp( (string)$str, $subStr, strlen($subStr) ) === 0 );
      } // _StartsWith()


      // Return the leading run of non-blank characters of an already trimmed
      // value, or "" when the value is empty.
      private function _FirstToken( $value )
      {
          return (string)substr( $value, 0, strcspn( $value, " \t" ) );
      } // _FirstToken()
  } // class Robots
  124. ?>
Submit a correction or amendment below. Make A New Post
To highlight particular lines, prefix each line with @h@
Syntax highlighting:
Post expiration:
Post exposure:
Name / Title:
Email: