The following IPython notebook summarizes the main functions used for clustering the auction graph with the restreaming linear deterministic greedy (R-LDG) algorithm, running the auction simulator, and implementing the experiment-of-experiments framework.
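Throughout, the notebook assumes that the bid data and a few graph constants were defined in earlier cells. A minimal, hypothetical stand-in for that setup (the values below are illustrative only; NUM_CUST_NODES matches the bidder count used throughout):
import numpy as np

NUM_CUST_NODES = 16268   # number of bidders (customer-side nodes)
NUM_NODES = 50000        # hypothetical: bidders plus auction nodes
NUM_EDGES = 200000       # hypothetical: number of (bidder, auction) edges
NUM_PARTITIONS = 100     # hypothetical: number of clusters

# bids / bidders: lists of lists, one entry per auction
# (see the Auction class docstring below). Toy stand-ins:
bids = [[10.4, 3.5, 2.1], [9.2, 1.0]]
bidders = [[1, 7, 2], [90, 2]]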
import ast
import collections
import numpy as np
import os
import pandas
import tqdm # a progress-bar package.
def get_global_reserves(reserves, assignments):
    # reserves.shape = (16268, 2): column 0 holds the control reserve,
    # column 1 the treatment reserve; assignments selects one per bidder.
return reserves[range(len(reserves)), assignments]
def get_shifts(ctr_func):
    # shifts[j] = ctr_func[j] - ctr_func[j+1], the CTR drop between
    # positions j and j+1.
shifts = np.zeros(len(ctr_func))
shifts[:-1] = ctr_func[:-1] - ctr_func[1:]
return shifts
def get_parts(bids, reserves):
    bids = np.asarray(bids)  # bids may arrive as a plain Python list
    meets_reserve_idx = (bids > reserves).nonzero()
part_bids = bids[meets_reserve_idx]
part_reserves = reserves[meets_reserve_idx]
return meets_reserve_idx, part_bids, part_reserves
def get_payments(bids, shifts, reserves):
    # Assumes bids are sorted in decreasing order!
    # pre_payments[j] = sum_{k > j} bids[k] * (ctr[k-1] - ctr[k]),
    # computed below as a reversed cumulative sum.
    n_bidders = len(bids)
    pre_payments = np.zeros(n_bidders)
    sum_items = shifts[:n_bidders-1] * bids[1:]
    pre_payments[:n_bidders-1] = np.cumsum(sum_items[::-1])[::-1]
payments = np.maximum(pre_payments, reserves)
return payments
def get_utilities(bids, payments, ctr_func):
return (bids - payments) * ctr_func[:len(bids)]
def run_auction(bids, reserves, ctr_func, shifts):
utilities = np.zeros(len(bids))
indices, pbids, preserves = get_parts(bids, reserves)
if len(pbids) != 0:
payments = get_payments(pbids, shifts, preserves)
putils = get_utilities(pbids, payments, ctr_func)
utilities[indices] = putils
    return utilities
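As a quick sanity check, a small illustrative call to run_auction (the bids and reserves below are made up; the CTR curve matches the one defined in the Auction class later on):
ctr_func = np.zeros(11)
ctr_func[:5] = np.array([.022, .011, .0075, .005, .003])
shifts = get_shifts(ctr_func)
example_bids = np.array([400., 350., 120.])      # sorted in decreasing order
example_reserves = np.array([100., 100., 200.])  # third bid misses its reserve
print(run_auction(example_bids, example_reserves, ctr_func, shifts))
# The third bidder gets zero utility; the first two pay their reserves,
# since the pre-payments from the CTR shifts are far below 100.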
def get_global_assignment(n_clusters,
clustering,
prop_treated):
A = np.zeros(n_clusters, dtype=int)
A[:int(n_clusters * prop_treated)] = 1
np.random.shuffle(A)
return A[clustering]
def get_EofE_assignment(clustering_1,
nbr_clusters_1,
clustering_2,
nbr_clusters_2,
prop_treated):
W = np.zeros(len(clustering_1), dtype=np.int32)
W[:int(len(clustering_1)/2)] = 1
np.random.shuffle(W)
cl_1_indexes = W.nonzero()
cl_2_indexes = (1 - W).nonzero()
Z1 = get_global_assignment(nbr_clusters_1,
clustering_1[cl_1_indexes],
prop_treated)
Z2 = get_global_assignment(nbr_clusters_2,
clustering_2[cl_2_indexes],
prop_treated)
Z = np.zeros(len(clustering_1), dtype=np.int32)
Z[cl_1_indexes] = Z1
Z[cl_2_indexes] = Z2
return Z, W
def get_util_estimate(utilities, assignments):
t_mean = np.mean(utilities[assignments == 1])
c_mean = np.mean(utilities[assignments == 0])
return t_mean - c_mean
def analyse_EofE(utilities, Z, W):
u_t1 = utilities[np.logical_and(Z == 1, W == 1)]
u_c1 = utilities[np.logical_and(Z == 0, W == 1)]
u_t2 = utilities[np.logical_and(Z == 1, W == 0)]
u_c2 = utilities[np.logical_and(Z == 0, W == 0)]
    # Multiplied by 4 because M/m = 2 and len(Z) = 2 * N: each of the
    # four (Z, W) cells contains len(Z)/4 bidders in expectation, so the
    # difference of cell means is 4 * (sum_t - sum_c) / len(Z).
est1 = 4 * (np.sum(u_t1) - np.sum(u_c1)) / (len(Z))
est2 = 4 * (np.sum(u_t2) - np.sum(u_c2)) / (len(Z))
return est1, est2
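A toy end-to-end check of the assignment and analysis helpers (the cluster labels and utilities below are made up):
clustering_a = np.array([0, 0, 1, 1, 0, 1, 0, 1])  # hypothetical clustering 1
clustering_b = np.array([0, 1, 0, 1, 0, 1, 0, 1])  # hypothetical clustering 2
Z, W = get_EofE_assignment(clustering_a, 2, clustering_b, 2, prop_treated=.5)
toy_utilities = np.random.rand(len(Z))
est1, est2 = analyse_EofE(toy_utilities, Z, W)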
Note: to achieve a significant speedup, we assume that bids (and the corresponding bidders) are sorted in decreasing order of bid!
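One way to enforce this is to sort every auction once, up front, keeping the bidder IDs aligned with their bids (a minimal sketch):
def sort_auctions(bids, bidders):
    # Sort each auction's bids in decreasing order, reordering the
    # matching bidder IDs in lockstep.
    sorted_bids, sorted_bidders = [], []
    for bid_list, bidder_list in zip(bids, bidders):
        order = np.argsort(bid_list)[::-1]
        sorted_bids.append([bid_list[j] for j in order])
        sorted_bidders.append([bidder_list[j] for j in order])
    return sorted_bids, sorted_bidders

bids, bidders = sort_auctions(bids, bidders)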
class Auction(object):
""""
Main auction class
bids: list of lists. First level represents each individual auction.
The second level represents the bids in decreasing order.
Ex: [[10.4, 3.5, 2.1], [9.2, 1.0]] represents two auctions,
with 3 and 2 bids respectively.
bidders: list of lists. IDs of bidders making the bids as listed
above. Ex: [[1, 7, 2], [90, 2]] represents two auctions,
with 3 and 2 bidders respectively. Bidder 2 bids each time.
    clustering_{}_f: clustering file as produced by the clustering
        code in this notebook, containing an array of length
        number_of_bidders whose entries give each bidder's cluster ID.
n_auctions: maximum number of auctions to run.
min/max_reserve: reserve is drawn uniformly from this segment for
each bidder.
n_bidders: number of bidders
prop_treated: proportion of bidder clusters to assign to the
intervention.
"""
def __init__(self,
bids,
bidders,
clustering_1_f=None,
clustering_2_f=None,
n_auctions=None,
min_reserve=100,
max_reserve=500,
n_bidders=16268,
prop_treated=.5):
        self.bidders = bidders
self.bids = bids
if n_auctions is None:
self.n_auctions = len(bidders)
else:
self.n_auctions = n_auctions
self.n_bidders = n_bidders
assert len(bidders) == len(bids)
if clustering_1_f is not None:
self.clustering_1 = np.load(file=clustering_1_f)
self.n_cluster_1 = len(np.bincount(self.clustering_1))
if clustering_2_f is not None:
self.clustering_2 = np.load(file=clustering_2_f)
self.n_cluster_2 = len(np.bincount(self.clustering_2))
self.ctr_func = np.zeros(11)
self.ctr_func[:5] = np.array([.022, .011, .0075, .005, .003])
self.shifts = get_shifts(self.ctr_func)
self.reserve_arr = np.zeros((n_bidders, 2))
self.reserve_arr[:, 1] = np.random.randint(
min_reserve, max_reserve, size=n_bidders)
        self.prop_treated = prop_treated
def get_ground_truth(self):
tutils = np.zeros(self.n_bidders)
cutils = np.zeros(self.n_bidders)
for _, bid, bidder in zip(range(self.n_auctions),
self.bids, self.bidders):
if len(bid) > 1 and len(bid) < len(self.ctr_func):
t_reserve_arr = self.reserve_arr[bidder, 1]
tutils[bidder] += run_auction(bid,
t_reserve_arr,
self.ctr_func,
self.shifts)
c_reserve_arr = self.reserve_arr[bidder, 0]
cutils[bidder] += run_auction(bid,
c_reserve_arr,
self.ctr_func,
self.shifts)
return np.mean(tutils - cutils)
def run_experiment(self, cluster_idx, n_samples):
if cluster_idx == 1:
n_clusters = self.n_cluster_1
clustering = self.clustering_1
else:
n_clusters = self.n_cluster_2
clustering = self.clustering_2
assert len(clustering) == self.n_bidders
estimates = np.zeros(n_samples)
for i in tqdm.tqdm(range(n_samples)):
Z = get_global_assignment(n_clusters,
clustering,
self.prop_treated)
global_reserves = get_global_reserves(self.reserve_arr, Z)
utilities = np.zeros(self.n_bidders)
for _, bid, bidder in zip(range(self.n_auctions),
self.bids,
self.bidders):
if len(bid) > 1 and len(bid) < len(self.ctr_func):
reserves = global_reserves[bidder]
utilities[bidder] += run_auction(bid,
reserves,
self.ctr_func,
self.shifts)
estimates[i] = get_util_estimate(utilities, Z)
return estimates
def run_EofE(self, n_samples):
estimates = np.zeros([n_samples, 2])
for i in tqdm.tqdm(range(n_samples)):
Z, W = get_EofE_assignment(self.clustering_1,
self.n_cluster_1,
self.clustering_2,
self.n_cluster_2,
self.prop_treated)
global_reserves = get_global_reserves(self.reserve_arr, Z)
utilities = np.zeros(self.n_bidders)
for _, bid, bidder in zip(range(self.n_auctions),
self.bids,
self.bidders):
if len(bid) > 1 and len(bid) < len(self.ctr_func):
reserves = global_reserves[bidder]
utilities[bidder] += run_auction(bid,
reserves,
self.ctr_func,
self.shifts)
estimates[i] = analyse_EofE(utilities, Z, W)
return estimates
def plot_treatment_dist(self, cluster_idx, n_samples):
if cluster_idx == 1:
n_clusters = self.n_cluster_1
clustering = self.clustering_1
else:
n_clusters = self.n_cluster_2
clustering = self.clustering_2
S = []
for i in tqdm.tqdm(range(n_samples)):
Z = get_global_assignment(n_clusters,
clustering,
.5)
            for _, bid, bidder in zip(range(100000), self.bids, self.bidders):
S.append(np.mean(Z[bidder]))
return S
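Putting the pieces together, a hypothetical driver (the clustering file names are placeholders; they correspond to the arrays saved at the end of this notebook):
auction = Auction(bids,
                  bidders,
                  clustering_1_f='clustering_rldg.npy',    # placeholder path
                  clustering_2_f='clustering_random.npy')  # placeholder path
ground_truth = auction.get_ground_truth()
estimates_rldg = auction.run_experiment(cluster_idx=1, n_samples=100)
eofe_estimates = auction.run_EofE(n_samples=100)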
Shuffling the node labels helps with clustering, since the streaming greedy algorithm is sensitive to the order in which nodes arrive:
the_shuffle = np.arange(NUM_NODES)
np.random.shuffle(the_shuffle)
Note: the following code is a modification of Justin Vincent's code: http://algorithmshop.com/20131213-graph-partitioning.html
def to_undirected(edge_iterable, num_edges, num_nodes, shuffle=None):
"""Takes an iterable of edges and produces the list of
edges for the undirected graph.
    > to_undirected([[0, 1, 1.], [1, 2, 1.], [2, 10, 1.]], 3, 11)
array([[ 0, 1],
[ 1, 0],
[ 1, 2],
[ 2, 1],
[ 2, 10],
[10, 2]])
"""
# need int64 to do gross bithacks
as_array = np.zeros((num_edges, 2), dtype=np.int64)
for (i, (n_0, n_1, weight)) in enumerate(edge_iterable):
as_array[i,0] = n_0
as_array[i,1] = n_1
# The graph is directed, but we want to make it undirected,
# which means we will duplicate some rows.
left_nodes = as_array[:,0]
right_nodes = as_array[:,1]
if shuffle is not None:
left_nodes = shuffle.take(left_nodes)
right_nodes = shuffle.take(right_nodes)
# numpy.unique will not unique whole rows, so this little bit-hacking
# is a quick way to get unique rows after making a flipped copy of
# each edge.
max_bits = int(np.ceil(np.log2(num_nodes + 1)))
encoded_edges_forward = np.left_shift(left_nodes, max_bits) \
| right_nodes
# Flip the columns and do it again:
encoded_edges_reverse = np.left_shift(right_nodes, max_bits) \
| left_nodes
unique_encoded_edges = np.unique(np.hstack((encoded_edges_forward,
encoded_edges_reverse)))
left_node_decoded = np.right_shift(unique_encoded_edges, max_bits)
# Mask out the high order bits
right_node_decoded = (2 ** (max_bits) - 1) & unique_encoded_edges
undirected_edges = np.vstack((left_node_decoded,
right_node_decoded)).T.astype(np.int32)
# ascontiguousarray so that it's c-contiguous for cython code below
return np.ascontiguousarray(undirected_edges)
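A quick round-trip check of the encode/decode bit-hack, using the toy values from the docstring:
# With num_nodes = 11 we get max_bits = 4, so the edge (2, 10)
# encodes to (2 << 4) | 10 = 42 and decodes back via shift and mask.
max_bits = int(np.ceil(np.log2(11 + 1)))             # = 4
encoded = np.left_shift(np.int64(2), max_bits) | 10  # = 42
left = np.right_shift(encoded, max_bits)             # = 2
right = (2 ** max_bits - 1) & encoded                # = 10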
def get_weights(edges, edge_iterable, shuffle=None):
graph_dict = {}
weights = np.zeros(len(edges), dtype=np.float32)
for (node_1, node_2, weight) in edge_iterable:
if shuffle is not None:
node_1, node_2 = shuffle.take([node_1, node_2])
graph_dict[tuple([node_1, node_2])] = weight
graph_dict[tuple([node_2, node_1])] = weight
for (node_1, node_2), idx in zip(edges, range(len(weights))):
if node_1 != node_2:
weights[idx] = graph_dict[tuple([node_1, node_2])]
return weights
def get_clean_data(shuffle=None):
edges = to_undirected(row_generator(),
NUM_EDGES,
NUM_NODES,
shuffle)
weights = get_weights(edges,
row_generator(),
shuffle)
return edges, weights
def row_generator():
return iterate_through_edges(bids, bidders)
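The iterate_through_edges helper is not shown in this summary; below is a hypothetical sketch consistent with how row_generator is consumed above, connecting each bidder to a node representing the auction with unit weight per bid. The actual graph construction may differ.
def iterate_through_edges(bids, bidders):
    # Yields (node_1, node_2, weight) triples; auction nodes are labeled
    # after the bidder nodes. The bids argument is unused in this sketch
    # but kept for signature compatibility.
    for auction_idx, bidder_list in enumerate(bidders):
        auction_node = NUM_CUST_NODES + auction_idx
        for bidder in bidder_list:
            yield (bidder, auction_node, 1.0)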
print("Normalizing data")
%time edges, weights = get_clean_data(the_shuffle)
print('\nEDGES SHAPE: {}'.format(edges.shape))
print('\nWEIGHTS SHAPE: {}'.format(weights.shape))
def score(assignment, edges, weights):
"""Compute the score given an assignment of vertices.
N nodes are assigned to clusters 0 to K-1.
    assignment: Vector where assignment[i] is the cluster node i is assigned to.
    edges: The edges in the graph, assumed to appear once in each direction.
    weights: weights[j] is the weight of edge edges[j].
    Returns: (total wasted bin space, fraction of total edge weight cut)
"""
balance = np.bincount(assignment) / len(assignment)
waste = (np.max(balance) - balance).sum()
mismatch = 0
for (node_1, node_2), weight in zip(edges, weights):
if assignment[node_1] != assignment[node_2]:
mismatch += weight
total_weight = weights.sum()
cut_ratio = mismatch / total_weight
return (waste, cut_ratio)
%load_ext cython
%pylab inline
%%cython
import numpy as np
cimport cython
cdef int UNMAPPED = -1
def linear_deterministic_greedy(int[:, ::1] edges,
                                float[::1] weights,
                                int num_nodes,
                                int num_cust_nodes,
                                int num_partitions,
                                int[::1] partition,
                                int[::1] is_customer_node):
"""
This algorithm favors a cluster if it has many neighbors of a node,
but penalizes the cluster if it is close to capacity.
    edges: An [:, 2] array of edges, grouped by left node.
    weights: weights[i] is the weight of edges[i].
    num_nodes: The number of nodes in the graph.
    num_cust_nodes: The number of customer (bidder) nodes, which are the
        only nodes counted against partition capacity.
    num_partitions: How many partitions we are breaking the graph into.
    partition: The partition from a previous run. Used for restreaming.
    is_customer_node: 1 for customer nodes, 0 otherwise.
    Returns: A new partition.
"""
assert len(weights) == len(edges)
# The output partition
if partition is None:
partition = np.repeat(np.int32(UNMAPPED), num_nodes)
    cdef int[::1] partition_sizes = np.zeros(num_partitions,
                                             dtype=np.int32)
    cdef float[::1] partition_votes = np.zeros(num_partitions,
                                               dtype=np.float32)
# Fine to be a little off, to stay integers
cdef int partition_capacity = num_cust_nodes / num_partitions
cdef int last_left = edges[0,0]
cdef int i = 0
cdef int left = 0
cdef int right = 0
cdef int arg = 0
cdef int max_arg = 0
cdef float max_val = 0
cdef float val = 0
cdef int len_edges = len(edges)
cdef float weight = 0
for i in range(len_edges):
left = edges[i,0]
right = edges[i,1]
weight = weights[i]
if last_left != left:
# We have found a new node so assign last_left to a partition
max_arg = 0
max_val = (partition_votes[0]) * (
partition_capacity - partition_sizes[0])
for arg in range(1, num_partitions):
val = (partition_votes[arg]) * (
partition_capacity - partition_sizes[arg])
if val > max_val:
max_arg = arg
max_val = val
if max_val == 0:
max_arg = arg
                # No neighbors (or all candidates full) so pick the next
                # partition with spare capacity, starting from a
                # pseudo-random offset
for arg in range(i % num_partitions, num_partitions):
if partition_sizes[arg] < partition_capacity:
max_arg = arg
max_val = 1
break
if max_val == 0:
for arg in range(0, i % num_partitions):
if partition_sizes[arg] < partition_capacity:
max_arg = arg
break
if is_customer_node[last_left]:
partition_sizes[max_arg] += 1
partition[last_left] = max_arg
partition_votes[:] = 0
last_left = left
if partition[right] != UNMAPPED:
partition_votes[partition[right]] += weight
# Clean up the last assignment
max_arg = 0
max_val = 0
for arg in range(0, num_partitions):
if partition_sizes[arg] < partition_capacity:
            val = (partition_votes[arg]) * (
                1.0 - (<float>partition_sizes[arg]) / partition_capacity)
if val > max_val:
max_arg = arg
max_val = val
partition[left] = max_arg
return np.asarray(partition)
def run_restreaming_greedy(edges,
weights,
num_nodes,
num_cust_nodes,
num_partitions,
num_iterations,
is_customer_node):
print('\n{} PARTITIONS'.format(num_partitions))
assignments = None
print('ROUND\tWASTE\tSCORE')
waste_values = []
edge_score_values = []
for i in range(num_iterations):
assignments = linear_deterministic_greedy(edges,
weights,
num_nodes,
num_cust_nodes,
num_partitions,
assignments,
is_customer_node)
(waste, edge_score) = score(assignments, edges, weights)
waste_values.append(waste)
edge_score_values.append(edge_score)
print('{}\t{:0.3f}\t{:0.3f}'.format(i, waste, edge_score))
return assignments, waste_values, edge_score_values
We have to keep track of which nodes are customer nodes to ensure balance on the customer-side of the graph!
NUM_CUST_NODES = 16268
is_customer_node = np.zeros(NUM_NODES, dtype=np.int32)
is_customer_node[:NUM_CUST_NODES] = 1
is_customer_node = is_customer_node[the_shuffle]
Run the algorithm:
rval = run_restreaming_greedy(edges,
weights,
NUM_NODES,
NUM_CUST_NODES,
NUM_PARTITIONS,
num_iterations=10,
is_customer_node=is_customer_node)
assignments, waste_values, edge_score_values = rval
Unshuffle nodes:
idx = np.argsort(np.argsort(the_shuffle))
rval = assignments[idx]
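Since the_shuffle is a permutation, np.argsort(np.argsort(the_shuffle)) is the_shuffle itself (argsort of a permutation is its inverse, and the inverse of the inverse is the original), so this is equivalent to:
rval = assignments[the_shuffle]  # original node i was relabeled the_shuffle[i]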
Get a random clustering:
def get_random_partition(n_partitions,
n_nodes,
n_cust_nodes):
partition_capacity = int(n_cust_nodes / n_partitions)
cust_partition = np.concatenate([[i] * partition_capacity for
i in range(n_partitions)])
np.random.shuffle(cust_partition)
partition = np.zeros(n_nodes, dtype=np.int32)
partition[:len(cust_partition)] = cust_partition
partition[len(cust_partition):] = np.random.randint(
0, n_partitions, size=n_nodes - len(cust_partition))
return partition
random_partition = get_random_partition(NUM_PARTITIONS,
NUM_NODES, NUM_CUST_NODES)
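Finally, the bidder-side clusterings can be written out in the format the Auction class expects (the file names are placeholders matching the hypothetical driver above; bidders occupy the first NUM_CUST_NODES slots in the unshuffled node order):
np.save('clustering_rldg.npy', rval[:NUM_CUST_NODES].astype(np.int32))
np.save('clustering_random.npy', random_partition[:NUM_CUST_NODES])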