crawler: modify API to support upcoming bucket-counting crawler
parent 9bc08158c6
commit ef4ff21ae7
@@ -1,5 +1,6 @@
-import os, time, struct, pickle
+import os, time, struct
+import cPickle as pickle
 from twisted.internet import reactor
 from twisted.application import service
 from allmydata.storage.server import si_b2a
@@ -25,7 +26,10 @@ class ShareCrawler(service.MultiService):
 
     To use this, create a subclass which implements the process_bucket()
     method. It will be called with a prefixdir and a base32 storage index
-    string. process_bucket() should run synchronously.
+    string. process_bucket() should run synchronously. Any keys added to
+    self.state will be preserved. Override add_initial_state() to set up
+    initial state keys. Override finished_cycle() to perform additional
+    processing when the cycle is complete.
 
     Then create an instance, with a reference to a StorageServer and a
     filename where it can store persistent state. The statefile is used to
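
The expanded docstring above is the whole subclass contract under the new API: implement process_bucket(), optionally add_initial_state() and finished_cycle(), and let the base class handle scheduling and persistence. A minimal sketch (the class name is hypothetical, not part of this commit):

    class MyCrawler(ShareCrawler):
        def process_bucket(self, cycle, prefix, prefixdir, storage_index_b32):
            # examine the shares in this bucket; must run synchronously
            pass
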
@@ -39,15 +43,15 @@ class ShareCrawler(service.MultiService):
     the Deferred that it returns.
     """
 
-    # use up to 10% of the CPU, on average. This can be changed at any time.
-    allowed_cpu_percentage = .10
-    # use up to 1.0 seconds before yielding. This can be changed at any time.
-    cpu_slice = 1.0
-    # don't run a cycle faster than this
-    minimum_cycle_time = 300
+    # all three of these can be changed at any time
+    allowed_cpu_percentage = .10 # use up to 10% of the CPU, on average
+    cpu_slice = 1.0 # use up to 1.0 seconds before yielding
+    minimum_cycle_time = 300 # don't run a cycle faster than this
 
-    def __init__(self, server, statefile):
+    def __init__(self, server, statefile, allowed_cpu_percentage=None):
         service.MultiService.__init__(self)
+        if allowed_cpu_percentage is not None:
+            self.allowed_cpu_percentage = allowed_cpu_percentage
         self.server = server
         self.sharedir = server.sharedir
         self.statefile = statefile
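
The new optional allowed_cpu_percentage argument lets callers tune the duty cycle per instance instead of subclassing just to override a class attribute. A usage sketch (assuming a StorageServer ss and a statefile path; MyCrawler is the hypothetical subclass above):

    c = MyCrawler(ss, statefile)                              # class default, 10%
    c = MyCrawler(ss, statefile, allowed_cpu_percentage=.50)  # busier instance
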
@@ -56,24 +60,73 @@ class ShareCrawler(service.MultiService):
         self.prefixes.sort()
         self.timer = None
         self.bucket_cache = (None, [])
-        self.first_cycle_finished = False
+        self.current_sleep_time = None
+        self.next_wake_time = None
+
+    def get_state(self):
+        """I return the current state of the crawler. This is a copy of my
+        state dictionary, plus the following keys::
+
+         current-sleep-time: float, duration of our current sleep
+         next-wake-time: float, seconds-since-epoch of when we will next wake
+
+        If we are not currently sleeping (i.e. get_state() was called from
+        inside the process_prefixdir, process_bucket, or finished_cycle()
+        methods, or if startService has not yet been called on this crawler),
+        these two keys will be None.
+        """
+        state = self.state.copy() # it isn't a deepcopy, so don't go crazy
+        state["current-sleep-time"] = self.current_sleep_time
+        state["next-wake-time"] = self.next_wake_time
+        return state
 
     def load_state(self):
+        # we use this to store state for both the crawler's internals and
+        # anything the subclass-specific code needs. The state is stored
+        # after each bucket is processed, after each prefixdir is processed,
+        # and after a cycle is complete. The internal keys we use are:
+        #  ["version"]: int, always 1
+        #  ["last-cycle-finished"]: int, or None if we have not yet finished
+        #                           any cycle
+        #  ["current-cycle"]: int, or None if we are sleeping between cycles
+        #  ["last-complete-prefix"]: str, two-letter name of the last prefixdir
+        #                            that was fully processed, or None if we
+        #                            are sleeping between cycles, or if we
+        #                            have not yet finished any prefixdir since
+        #                            a cycle was started
+        #  ["last-complete-bucket"]: str, base32 storage index bucket name
+        #                            of the last bucket to be processed, or
+        #                            None if we are sleeping between cycles
         try:
             f = open(self.statefile, "rb")
             state = pickle.load(f)
+            f.close()
+        except EnvironmentError:
+            state = {"version": 1,
+                     "last-cycle-finished": None,
+                     "current-cycle": 0,
+                     "last-complete-prefix": None,
+                     "last-complete-bucket": None,
+                     }
+        self.state = state
         lcp = state["last-complete-prefix"]
         if lcp == None:
             self.last_complete_prefix_index = -1
         else:
             self.last_complete_prefix_index = self.prefixes.index(lcp)
-            self.last_complete_bucket = state["last-complete-bucket"]
-            self.first_cycle_finished = state["first-cycle-finished"]
-            f.close()
-        except EnvironmentError:
-            self.last_complete_prefix_index = -1
-            self.last_complete_bucket = None
-            self.first_cycle_finished = False
+        self.add_initial_state()
+
+    def add_initial_state(self):
+        """Hook method to add extra keys to self.state when first loaded.
+
+        The first time this Crawler is used, or when the code has been
+        upgraded, the saved state file may not contain all the keys you
+        expect. Use this method to add any missing keys. Simply modify
+        self.state as needed.
+
+        This method is for subclasses to override. No upcall is necessary.
+        """
+        pass
 
     def save_state(self):
         lcpi = self.last_complete_prefix_index
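
Note that get_state() hands back a shallow copy: the top-level dict is safe to drop keys into, but any mutable values inside it still alias self.state, so callers should treat the result as read-only. A sketch of a status-page consumer (hypothetical; c is a running crawler):

    s = c.get_state()
    if s["current-sleep-time"] is None:
        print "crawler is mid-slice or not yet started"
    else:
        print "sleeping %.1fs, waking at %.0f" % (s["current-sleep-time"],
                                                  s["next-wake-time"])
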
@@ -81,14 +134,10 @@ class ShareCrawler(service.MultiService):
             last_complete_prefix = None
         else:
             last_complete_prefix = self.prefixes[lcpi]
-        state = {"version": 1,
-                 "last-complete-prefix": last_complete_prefix,
-                 "last-complete-bucket": self.last_complete_bucket,
-                 "first-cycle-finished": self.first_cycle_finished,
-                 }
+        self.state["last-complete-prefix"] = last_complete_prefix
         tmpfile = self.statefile + ".tmp"
         f = open(tmpfile, "wb")
-        pickle.dump(state, f)
+        pickle.dump(self.state, f)
         f.close()
         fileutil.move_into_place(tmpfile, self.statefile)
 
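
save_state() keeps the write-to-tmpfile-then-rename idiom, so a crash mid-pickle leaves the previous statefile intact; the change is that the whole self.state dict is now dumped rather than a hand-built one. The same idiom in isolation (a generic sketch; Tahoe's fileutil.move_into_place additionally copes with Windows, where a plain rename will not overwrite an existing file):

    import os, cPickle as pickle

    def atomic_pickle(obj, path):
        tmp = path + ".tmp"
        f = open(tmp, "wb")
        pickle.dump(obj, f)
        f.close()
        os.rename(tmp, path)  # atomic replacement on POSIX
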
@@ -105,6 +154,8 @@ class ShareCrawler(service.MultiService):
 
     def start_slice(self):
         self.timer = None
+        self.current_sleep_time = None
+        self.next_wake_time = None
         start_slice = time.time()
         try:
             self.start_current_prefix(start_slice)
@@ -112,7 +163,8 @@ class ShareCrawler(service.MultiService):
         except TimeSliceExceeded:
             finished_cycle = False
         # either we finished a whole cycle, or we ran out of time
-        this_slice = time.time() - start_slice
+        now = time.time()
+        this_slice = now - start_slice
         # this_slice/(this_slice+sleep_time) = percentage
         # this_slice/percentage = this_slice+sleep_time
         # sleep_time = (this_slice/percentage) - this_slice
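
Plugging numbers into the algebra in those comments: with allowed_cpu_percentage = .10, a slice that consumed this_slice = 1.0 seconds is followed by sleep_time = (1.0 / 0.10) - 1.0 = 9.0 seconds, i.e. one second of crawling per ten seconds of wall clock. As a quick check:

    this_slice, percentage = 1.0, .10
    sleep_time = (this_slice / percentage) - this_slice
    assert abs(sleep_time - 9.0) < 1e-12  # 10% duty cycle
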
@@ -128,10 +180,16 @@ class ShareCrawler(service.MultiService):
         else:
             self.sleeping_between_cycles = False
         self.current_sleep_time = sleep_time # for status page
+        self.next_wake_time = now + sleep_time
         self.yielding(sleep_time)
         self.timer = reactor.callLater(sleep_time, self.start_slice)
 
     def start_current_prefix(self, start_slice):
+        if self.state["current-cycle"] is None:
+            assert self.state["last-cycle-finished"] is not None
+            self.state["current-cycle"] = self.state["last-cycle-finished"] + 1
+        cycle = self.state["current-cycle"]
+
         for i in range(self.last_complete_prefix_index+1, len(self.prefixes)):
             if time.time() > start_slice + self.cpu_slice:
                 raise TimeSliceExceeded()
@@ -147,17 +205,19 @@ class ShareCrawler(service.MultiService):
             except EnvironmentError:
                 buckets = []
             self.bucket_cache = (i, buckets)
-            self.process_prefixdir(prefixdir, buckets, start_slice)
+            self.process_prefixdir(cycle, prefix, prefixdir,
+                                   buckets, start_slice)
             self.last_complete_prefix_index = i
             self.save_state()
         # yay! we finished the whole cycle
         self.last_complete_prefix_index = -1
-        self.last_complete_bucket = None
-        self.first_cycle_finished = True
+        self.state["last-complete-bucket"] = None
+        self.state["last-cycle-finished"] = cycle
+        self.state["current-cycle"] = None
+        self.finished_cycle(cycle)
         self.save_state()
-        self.finished_cycle()
 
-    def process_prefixdir(self, prefixdir, buckets, start_slice):
+    def process_prefixdir(self, cycle, prefix, prefixdir, buckets, start_slice):
         """This gets a list of bucket names (i.e. storage index strings,
         base32-encoded) in sorted order.
 
@@ -166,20 +226,43 @@ class ShareCrawler(service.MultiService):
         are being managed by this server.
         """
         for bucket in buckets:
-            if bucket <= self.last_complete_bucket:
+            if bucket <= self.state["last-complete-bucket"]:
                 continue
             if time.time() > start_slice + self.cpu_slice:
                 raise TimeSliceExceeded()
-            self.process_bucket(prefixdir, bucket)
-            self.last_complete_bucket = bucket
+            self.process_bucket(cycle, prefix, prefixdir, bucket)
+            self.state["last-complete-bucket"] = bucket
             self.save_state()
 
-    def process_bucket(self, prefixdir, storage_index_b32):
+    def process_bucket(self, cycle, prefix, prefixdir, storage_index_b32):
+        """Examine a single bucket. Subclasses should do whatever they want
+        to do to the shares therein, then update self.state as necessary.
+
+        This method will be called exactly once per share (per cycle), unless
+        the crawler was interrupted (by node restart, for example), in which
+        case it might be called a second time on a bucket which was processed
+        during the previous node's incarnation. However, in that case, no
+        changes to self.state will have been recorded.
+
+        This method is for subclasses to override. No upcall is necessary.
+        """
         pass
 
-    def finished_cycle(self):
+    def finished_cycle(self, cycle):
+        """Notify subclass that a cycle (one complete traversal of all
+        prefixdirs) has just finished. 'cycle' is the number of the cycle
+        that just finished. This method should perform summary work and
+        update self.state to publish information to status displays.
+
+        This method is for subclasses to override. No upcall is necessary.
+        """
         pass
 
     def yielding(self, sleep_time):
+        """The crawler is about to sleep for 'sleep_time' seconds. This
+        method is mostly for the convenience of unit tests.
+
+        This method is for subclasses to override. No upcall is necessary.
+        """
         pass
 
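
The commit message points at an upcoming bucket-counting crawler built on these hooks. A hedged sketch of what such a subclass could look like (illustrative only; the actual follow-up class may differ):

    class BucketCountingCrawler(ShareCrawler):
        def add_initial_state(self):
            # tolerate statefiles written before this key existed
            self.state.setdefault("bucket-counts", {})  # maps cycle -> count
        def process_bucket(self, cycle, prefix, prefixdir, storage_index_b32):
            counts = self.state["bucket-counts"]
            counts[cycle] = counts.get(cycle, 0) + 1
        def finished_cycle(self, cycle):
            # self.state is pickled for us after every bucket and cycle
            self.state["last-count"] = self.state["bucket-counts"].get(cycle, 0)

The remaining hunks bring the unit-test crawlers in line: each one adopts the new process_bucket()/finished_cycle() signatures and forwards its constructor arguments so the allowed_cpu_percentage keyword passes through:
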
@@ -15,23 +15,23 @@ from common_util import StallMixin
 
 class BucketEnumeratingCrawler(ShareCrawler):
     cpu_slice = 500 # make sure it can complete in a single slice
-    def __init__(self, server, statefile):
-        ShareCrawler.__init__(self, server, statefile)
+    def __init__(self, *args, **kwargs):
+        ShareCrawler.__init__(self, *args, **kwargs)
         self.all_buckets = []
         self.finished_d = defer.Deferred()
-    def process_bucket(self, prefixdir, storage_index_b32):
+    def process_bucket(self, cycle, prefix, prefixdir, storage_index_b32):
         self.all_buckets.append(storage_index_b32)
-    def finished_cycle(self):
+    def finished_cycle(self, cycle):
         eventually(self.finished_d.callback, None)
 
 class PacedCrawler(ShareCrawler):
     cpu_slice = 500 # make sure it can complete in a single slice
-    def __init__(self, server, statefile):
-        ShareCrawler.__init__(self, server, statefile)
+    def __init__(self, *args, **kwargs):
+        ShareCrawler.__init__(self, *args, **kwargs)
         self.countdown = 6
         self.all_buckets = []
         self.finished_d = defer.Deferred()
-    def process_bucket(self, prefixdir, storage_index_b32):
+    def process_bucket(self, cycle, prefix, prefixdir, storage_index_b32):
         self.all_buckets.append(storage_index_b32)
         self.countdown -= 1
         if self.countdown == 0:
@@ -39,7 +39,7 @@ class PacedCrawler(ShareCrawler):
             self.cpu_slice = -1.0
     def yielding(self, sleep_time):
         self.cpu_slice = 500
-    def finished_cycle(self):
+    def finished_cycle(self, cycle):
         eventually(self.finished_d.callback, None)
 
 class ConsumingCrawler(ShareCrawler):
@@ -47,18 +47,18 @@ class ConsumingCrawler(ShareCrawler):
     allowed_cpu_percentage = 0.5
     minimum_cycle_time = 0
 
-    def __init__(self, server, statefile):
-        ShareCrawler.__init__(self, server, statefile)
+    def __init__(self, *args, **kwargs):
+        ShareCrawler.__init__(self, *args, **kwargs)
         self.accumulated = 0.0
         self.cycles = 0
         self.last_yield = 0.0
-    def process_bucket(self, prefixdir, storage_index_b32):
+    def process_bucket(self, cycle, prefix, prefixdir, storage_index_b32):
         start = time.time()
         time.sleep(0.05)
         elapsed = time.time() - start
         self.accumulated += elapsed
         self.last_yield += elapsed
-    def finished_cycle(self):
+    def finished_cycle(self, cycle):
         self.cycles += 1
     def yielding(self, sleep_time):
         self.last_yield = 0.0
@@ -99,7 +99,7 @@ class Basic(unittest.TestCase, StallMixin, pollmixin.PollMixin):
         sis = [self.write(i, ss, serverid) for i in range(10)]
         statefile = os.path.join(self.basedir, "statefile")
 
-        c = BucketEnumeratingCrawler(ss, statefile)
+        c = BucketEnumeratingCrawler(ss, statefile, allowed_cpu_percentage=.1)
         c.load_state()
 
         c.start_current_prefix(time.time())
@@ -322,7 +322,11 @@ class Basic(unittest.TestCase, StallMixin, pollmixin.PollMixin):
         # empty methods in the base class
 
         def _check():
-            return c.first_cycle_finished
+            return bool(c.state["last-cycle-finished"] is not None)
         d = self.poll(_check)
+        def _done(ignored):
+            state = c.get_state()
+            self.failUnless(state["last-cycle-finished"] is not None)
+        d.addCallback(_done)
         return d