Helper Example

   1 #!/usr/bin/ruby
   2 # encoding: utf-8
   3 
   4 require "rubygems"
   5 require "net/http"
   6 require "open-uri"
   7 require 'timeout'
   8 require 'libxml'
   9 
  10 require 'syslog'
  11 
  12 
  13 module Crawler
  14  class NetHttp
  15    def initialize(proxy_host, proxy_port=80, proxy_user = nil, proxy_pass = nil)
  16      @proxy_host =  proxy_host;
  17      @proxy_port =  proxy_port;
  18      @proxy_user =  proxy_user;
  19      @proxy_pass =  proxy_pass;
  20    end
  21 
  22    def request_response(uri_str, limit = 10)
  23      begin
  24        http = Net::HTTP::Proxy(@proxy_host, @proxy_port, @proxy_user, @proxy_pass)
  25        result = http.get_response(URI.parse(uri_str))
  26        case result
  27        when Net::HTTPSuccess     then result
  28        when Net::HTTPRedirection then request_response(result['location'], limit - 1)
  29        else
  30          result.error!
  31        end
  32      rescue Exception => e
  33          puts e.message
  34          return false
  35      end
  36    end
  37 
  38   def self.head(url)
  39     url = URI.parse(url)
  40 
  41     begin
  42       res = Net::HTTP.start(url.host, url.port) {|http|
  43         http.head(url.path)
  44       }
  45     rescue =>e
  46       return nil
  47     end
  48     return res
  49   end
  50 
  51   def self.get(url)
  52     url = URI.parse(url)
  53 
  54     begin
  55       res = Net::HTTP.start(url.host, url.port) {|http|
  56         http.get(url.path)
  57       }
  58     rescue =>e
  59       return nil
  60     end
  61     return res
  62   end
  63 
  64   def getmeta4(url)
  65    return self.get(url + ".meta4") if !url.end_with?(".meta4")
  66   end
  67 
  68 
  69 
  70   def self.redirect?(url)
  71    res = nil
  72    begin
  73    status = Timeout::timeout(0.5) {
  74    res = self.head(url)
  75    }
  76    rescue => e
  77      $stderr.puts "Redirect Check Timedout"
  78      res = false
  79    end
  80    if res && res.code == "301" && res.code == "302"
  81       return true
  82     elsif res && res.code == "200"
  83       return false
  84     else
  85       return nil
  86     end
  87   end
  88 
  89   def self.digest?(url)
  90    res = nil
  91    begin
  92    status = Timeout::timeout(2) {
  93    res = self.head(url)
  94    }
  95    rescue => e
  96      $stderr.puts "Redirect Check Timedout"
  97      res = false
  98    end
  99    if res["Digest"]
 100       return true
 101     else
 102       return false
 103     end
 104   end
 105 
 106  end
 107 end
 108 
 109 #c = Crawler::NetHttp.new("<http proxy URL>", "<port >", "Proxy user name", "Proxy Password")
 110 
 111 
 112 class Cache
 113         def initialize
 114         @host = "localhost"
 115         @db = "0"
 116         @port = 6379
 117         #@redis = Redis.new(:host => @host, :port => @port)
 118         #@redis.select @db
 119         end
 120 
 121         def setvid(url,vid)
 122            #return @redis.setex  "md5(" + vid+ ")",1200 ,url
 123            return true;
 124         end
 125 
 126         def geturl(vid)
 127            return @redis.get "md5(" + vid + ")"
 128         end
 129 
 130 
 131         def sfdlid(url)
 132                         m = url.match(/^http:\/\/.*\.dl\.sourceforge\.net\/(.*)/)
 133                         if m[1]
 134                                 return m[1]
 135                         else
 136                                 return nil
 137                         end
 138         end
 139 
 140         def vimid(url)
 141             m = url.match(/.*\.com\/(.*)\?(.*)/)
 142             offset =  m[2].match( /(aktimeoffset\=([\d\.]+))/ ) if m != nil
 143             return m[1] + "?offset=" + offset[2] if  offset != nil
 144             return m[1] if m != nil
 145             return nil
 146         end
 147 
 148         def imdbid(url)
 149             m = url.match(/.*\.com\/(.*)\?(.*)/)
 150             return m[1] if m != nil
 151             return nil
 152         end
 153 
 154         def dmvid(url)
 155             m = url.match(/.*(\.net|\.com)\/(.*)\?.*/)
 156             ec_seek = url.match(/.*(\&ec_seek\=[\d\.]+|\&start\=[\d\.]+).*/)
 157             return m[2] + ec_seek[1] if m != nil && ec_seek != nil
 158             return m[2] if m != nil
 159             return nil
 160         end
 161 
 162         def vsvid(url)
 163             m = url.match(/http:\/\/(proxy[\d]+\.videoslasher\.com)\/(.*)\?.*/)
 164             ec_seek = url.match(/.*(\&ec_seek\=[\d\.]+|\&start\=[\d\.]+).*/)
 165             return m[2] + ec_seek[1] if m != nil && ec_seek != nil
 166             return m[2] if m != nil
 167             return nil
 168         end
 169 
 170 
 171         def ytimg(url)
 172                 m = url.match(/.*\.ytimg.com\/(.*\.jpg|.*\.gif|.*\.js)/)
 173                 if m[1]
 174                         return m[1]
 175                 else
 176                         return nil
 177                 end
 178         end
 179 
 180         def ytvid(url)
 181 
 182                 id = getytid(url)
 183                 itag = getytitag(url)
 184                 range = getytrange(url)
 185                 redirect = getytredirect(url)
 186                 if id == nil
 187                         return nil
 188                 else
 189                         vid = id
 190                 end
 191                 if itag != nil
 192                         vid = vid + "&" + itag
 193                 end
 194                 if range != nil
 195                         vid = vid + "&" + range
 196                 end
 197                 if redirect != nil
 198                         vid = vid + "&" + redirect
 199                 end
 200                 if Crawler::NetHttp.redirect?(url)
 201                         vid = vid + "&non_cache=1"
 202                 end
 203                 return vid
 204         end
 205 
 206         private
 207                 def getytid(url)
 208                         m = url.match(/(id\=[a-zA-Z0-9\-\_\%]+)/)
 209                         return m.to_s if m != nil
 210                 end
 211 
 212                 def getytitag(url)
 213                         m = url.match(/(itag\=[0-9\-\_]+)/)
 214                         return m.to_s if m != nil
 215                 end
 216 
 217                 def getytrange(url)
 218                         m = url.match(/(range\=[0-9\-]+)/)
 219                         return m.to_s if m != nil
 220                 end
 221 
 222                 def getytredirect(url)
 223                         m = url.match(/(redirect\=)([a-zA-Z0-9\-\_]+)/)
 224                         return (m.to_s + Time.now.to_i.to_s) if m != nil
 225                 end
 226 
 227 
 228 end
 229 
 230 def rewriter(request)
 231                 case request
 232 
 233                 when /^http:\/\/[a-zA-Z0-9\-\_\.]+\.squid\.internal\/.*/
 234                    url = $cache.geturl(request)
 235                    if url != nil
 236                       return url
 237                     else
 238                       return ""
 239                   return ""
 240                     end
 241                 when /^http:\/\/[a-zA-Z0-9\-\_\.]+\.dl\.sourceforge\.net\/.*/
 242                   vid = $cache.sfdlid(request)
 243                   $cache.setvid(request, "http://dl.sourceforge.net.squid.internal/" + vid) if vid != nil
 244                   url = "http://dl.sourceforge.net.squid.internal/" + vid if vid != nil
 245                   return url
 246                 when /^http:\/\/av\.vimeo\.com\/.*/
 247                   vid = $cache.vimid(request)
 248                   $cache.setvid(request, "http://vimeo.squid.internal/" + vid) if vid != nil
 249                   url = "http://vimeo.squid.internal/" + vid if vid != nil
 250                   return url
 251                 when /^http:\/\/[a-zA-Z0-9\-\_\.]+\.c\.youtube\.com\/videoplayback\?.*id\=.*/
 252                   vid = $cache.ytvid(request)
 253                   $cache.setvid(request, "http://youtube.squid.internal/" + vid) if vid != nil
 254                   url = "http://youtube.squid.internal/" + vid if vid != nil
 255                   return url
 256                 when /^http:\/\/[a-zA-Z0-9\-\_\.]+\.ytimg\.com\/(.*\.jpg|.*\.gif|.*\.js)/
 257                   vid = $cache.ytimg(request)
 258                   $cache.setvid(request, "http://ytimg.squid.internal/" + vid) if vid != nil
 259                   url = "http://ytimg.squid.internal/" + vid if vid != nil
 260                   return url
 261                 when /^http:\/\/video\-http\.media\-imdb\.com\/.*\.mp4\?.*/
 262                   vid = $cache.imdbid(request)
 263                   $cache.setvid(request, "http://imdbv.squid.internal/" + vid) if vid != nil
 264                   url = "http://imdbv.squid.internal/" + vid if vid != nil
 265                   return url
 266                 when /^http:\/\/(vid\.ec\.dmcdn\.net|proxy\-[\d]+\.dailymotion\.com)\/.*(mp4|flv).*/
 267                   vid = $cache.dmvid(request)
 268                   $cache.setvid(request, "http://dmv.squid.internal/" + vid) if vid != nil
 269                   url = "http://dmv.squid.internal/" + vid if vid != nil
 270                   return url
 271                 when /http:\/\/proxy[\d]+\.videoslasher\.com\/free\/.*\.flv?.*/
 272                   vid = $cache.vsvid(request)
 273                   $cache.setvid(request, "http://videoslasher.squid.internal/" + vid) if vid != nil
 274                   url = "http://videoslasher.squid.internal/" + vid if vid != nil
 275                   return url  
 276                 when /http:\/\/(pd-vdp-cdn[\d]+-nap.terra.com)\/(terratv\/[0-9]+\.mp4)?.*/
 277                   url = "http://terratv.squid.internal/" + $2 if $2
 278                   return url
 279                 when /http:\/\/(i|vid)[\d]+\.photobucket\.com\/(.*)\.(mp4|jpg)/  
 280                   url = "http://photobucket.squid.internal/" + $2 + ".jpg" if $3 == "jpg"
 281                   url = "http://photobucket.squid.internal/" + $2 + ".mp4" if $3 == "mp4"
 282                   return url
 283                 when /http:\/\/(khm|mt)[\d]+\.google\.[a-z\.]+\/(.*)\&s\=[a-zA-Z]+/
 284                   url = "http://googlemapskhm.squid.internal/" + $2 if $1 == "khm"
 285                   url = "http://googlemapsmt.squid.internal/" + $2 if $1 == "mt"
 286                   return url
 287                 when /http:\/\/([\-a-z0-9\.]+)\.c\.android\.clients\.google\.com\/(market\/GetBinary\/[\/0-9a-z\.\-]+)\?.*/
 288                   url = "http://androidmarket.squid.internal/" + $2 if $2
 289                   return url
 290                 when /http:\/\/download\.oracle\.com\/(otn\-pub[a-zA-Z0-9\-\/\.]+)\?.*/
 291                   url = "http://oracleotn.squid.internal/" + $1 if $1
 292                   return url
 293                 when /http:\/\/image\.slidesharecdn\.com\/(.*\.jpg)\?[0-9]+/
 294                   url = "http://slidesharecdn.squid.internal/" + $1 if $1 
 295                   return url
 296                 when /http:\/\/cdn\.slidesharecdn\.com\/(.*jpg)\?[0-9]+/
 297                    url = "http://slidesharecdn.squid.internal/" + $1 if $1
 298                    return url
 299                 when /^quit.*/
 300                   exit 0
 301                 else
 302                  return ""
 303                 end
 304 end
 305 
 306 def log(msg)
 307  Syslog.log(Syslog::LOG_ERR, "%s", msg)
 308 end
 309 
 310 def eval
 311         request = gets
 312         if (request && (request.match /^[0-9]+\ /))
 313          conc(request)
 314          return true
 315         else
 316          noconc(request)
 317          return false
 318         end
 319 
 320 end
 321 
 322 
 323 def conc(request)
 324                 return if !request
 325                 request = request.split
 326                 if request[0] && request[1]
 327                         log("original request [#{request.join(" ")}].") if $debug
 328                         result = rewriter(request[1])
 329                         if result
 330                           url = request[0] +" OK store-id=" + result
 331                         else
 332                           url = request[0] +" ERR"
 333                         end
 334                         log("modified response [#{url}].") if $debug
 335                         puts url
 336                 else
 337                         log("original request [had a problem].") if $debug
 338                         url = request[0] + "ERR"
 339                         log("modified response [#{url}].") if $debug
 340                         puts url
 341                 end
 342 
 343 end
 344 
 345 def noconc(request)
 346                 return if !request
 347                 request = request.split
 348                 if request[0]
 349                         log("Original request [#{request.join(" ")}].") if $debug
 350                         result = rewriter(request[0])
 351                         if result && (result.size > 10)
 352                                 url = "OK store-id=" + rewriter(request[0])
 353                                 #url = "OK store-id=" + request[0] if ( ($empty % 2) == 0 )
 354                         else
 355                                 url = "ERR"
 356                         end
 357                         log("modified response [#{url}].") if $debug
 358                         puts url
 359                 else
 360                         log("Original request [had a problem].") if $debug
 361                         url = "ERR"
 362                         log("modified response [#{url}].") if $debug
 363                         puts url
 364                 end
 365 end
 366 
 367 def validr?(request)
 368   if (request.ascii_only? && request.valid_encoding?)
 369     return true
 370   else
 371     STDERR.puts("errorness line#{request}")
 372     #sleep 2
 373     return false
 374   end
 375 
 376 
 377 end
 378 
 379 def main
 380 
 381         Syslog.open('cordinator.rb', Syslog::LOG_PID)
 382         log("Started")
 383 
 384         c = eval
 385 
 386          if c
 387           while request = gets
 388              conc(request) if validr?(request)
 389           end
 390          else
 391           while request = gets
 392 #            $empty += 1
 393              noconc(request) if validr?(request)
 394           end
 395          end
 396 end
 397 
 398 $debug = true
 399 $cache = Cache.new
 400 STDOUT.sync = true
 401 #$empty = 1
 402 main

Features/StoreID/Helper (last edited 2013-07-08 20:47:21 by Eliezer Croitoru)