o
    Ci;                     @   s   d dl Zd dlT d dlZdd Zdd Zed fddZd&d
dZd'ddZ	e	Z
d&ddZdd ZeZd(ddZdd Zdd ZeZd)ddZeZd)ddZG dd dZd*d d!Zefd"d#ZG d$d% d%ZdS )+    N)*c                 C      t j| dd} | j\}}t j||fdd}t j||fdd}t }t||_t||_||_	||_
|  ||t|  |  ||fS )zPreturn k smallest values (and their indices) of the lines of a
    float32 arrayfloat32dtypeint64)npascontiguousarrayshapezerosfaissfloat_maxheap_array_tswig_ptridsvalnhkheapifyaddnreorderarrayr   mnIDha r   l/var/www/html/fyndo/python/python_agents/rag_suite/venv/lib/python3.10/site-packages/faiss/extra_wrappers.pykmin      


r   c                 C   r   )zOreturn k largest values (and their indices) of the lines of a
    float32 arrayr   r   r   )r   r	   r
   r   r   float_minheap_array_tr   r   r   r   r   r   r   r   r   r   r   r   kmax(   r    r"   c           	   
   C   s   t j| dd} t j|dd}| j\}}|j\}}||ksJ t j||fdd}|tkr<t||t| |t|t| |S |tkrK| |j |dd< |S t	||t| |t|||t| |S )zJcompute the whole pairwise distance matrix between two sets of
    vectorsr   r   N)
r   r	   r
   empty	METRIC_L2pairwise_L2sqrr   METRIC_INNER_PRODUCTTpairwise_extra_distances)	xqxbmetric
metric_argnqdnbd2disr   r   r   pairwise_distances:   s.   



r2   90  c                 C   $   t j| dd}tt||j| |S Nr   r   )r   r#   
float_randr   sizer   seedresr   r   r   randS      r;   c                 C   sD   t j| dd}|d u rtt||j| |S tt||j|| |S )Nr   r   )r   r#   
int64_randr   r7   int64_rand_max)r   r9   vmaxr:   r   r   r   randintY   s   r@   c                 C   r4   r5   )r   r#   float_randnr   r7   r8   r   r   r   randne   r<   rB   c                 C   s`   |  d} | j}|d@ }tt|d t| d|  d}t||D ]
}|t| d 7 }q#|S )z> compute a checksum for quick-and-dirty comparisons of arrays uint8   Nint32iy  )viewr7   ivec_checksumintr   rangex)ar   n4csir   r   r   checksumk   s   
$rP     c                 C   s(   t j| |fdd}t| |t|| |S r5   )r   r#   rand_smooth_vectors_cr   )r   r.   r9   r:   r   r   r   rand_smooth_vectorsy   s   rS   c              	   C   s   t j| dd} t j|dd}| jd }|jd |ksJ | jd |jd }}d}t|D ]}|t|t| | |t|| 7 }q-|S )z< size of intersection between each line of two result tablesr   r   r      )r   r	   r
   rJ   ranklist_intersection_sizer   )I1I2r   k1k2ninterrO   r   r   r   eval_intersection   s   
r[   c                 C   s    t | jd | jd t|  d S )NrT   r   )fvec_renorm_L2r
   r   )rK   r   r   r   normalize_L2   s    r]   c                 C   s|   t j| dd} |du rt|  d }t j|d dd}t j| jdd}t| jt| 	d|t|t|| ||fS )a  Perform a bucket sort on a table of integers.

    Parameters
    ----------
    tab : array_like
        elements to sort, max value nbucket - 1
    nbucket : integer
        number of buckets, None if unknown
    nt : integer
        number of threads to use (0 = use unthreaded codepath)

    Returns
    -------
    lims : array_like
        cumulative sum of bucket sizes (size vmax + 1)
    perm : array_like
        perm[lims[i] : lims[i + 1]] contains the indices of bucket #i (size tab.size)
    r   r   NrT   uint64)
r   r	   rI   maxr#   r7   bucket_sort_cr   r   rG   )tabnbucketntlimspermr   r   r   bucket_sort   s   rf   c                 C   sn   | j dks| j dksJ | j\}}|du rt|  d }tj|d dd}t||t| |t|| |S )a  Perform a bucket sort on a matrix, recording the original
    row of each element.

    Parameters
    ----------
    tab : array_like
        array of size (N, ncol) that contains the bucket ids, maximum
        value nbucket - 1.
        On output, it the elements are shuffled such that the flat array
        tab.ravel()[lims[i] : lims[i + 1]] contains the row numbers
        of each bucket entry.
    nbucket : integer
        number of buckets (the maximum value in tab should be nbucket - 1)
    nt : integer
        number of threads to use (0 = use unthreaded codepath)

    Returns
    -------
    lims : array_like
        cumulative sum of bucket sizes (size vmax + 1)
    rF   r   NrT   r   )	r   r
   rI   r_   r   r#   matrix_bucket_sort_inplace_cr   r   )ra   rb   rc   nrowncolrd   r   r   r   matrix_bucket_sort_inplace   s   

rj   c                   @   s2   e Zd ZdZdddZdd Zdd Zd	d
 ZdS )
ResultHeapz_Accumulate query results from a sliced dataset. The final result will
    be in self.D, self.I.Fc                 C   s~   t j||fdd| _t j||fdd| _||| _| _|r!t }nt }||_||_t	| j|_
t	| j|_|  || _dS )z
        nq: number of query vectors,
        k: number of results per query
        keep_max: keep the top-k maximum values instead of the minima
        r   r   r   N)r   r   r   r   r-   r   r!   r   r   r   r   r   r   heaps)selfr-   r   keep_maxrl   r   r   r   __init__   s   
zResultHeap.__init__c                 C   sd   |j \}}tj|dd}tj|dd}|j ||fksJ || jks#J | j|t|t|| dS )z
        Add results for all heaps
        D, I should be of size (nh, nres)
        D, I do not need to be in a particular order (heap or sorted)
        r   r   r   N)r
   r   r	   r-   rl   addn_with_idsr   )rm   r   r   r-   kdr   r   r   
add_result   s   
zResultHeap.add_resultc                 C   s   |j \}}|t|ksJ |jdkr|j |j ks%|jdkr#|j |fks%J tj|dd}tj|dd}tj|dd}|jdkrAdn|}| j|t||t|t|| dS )z
        Add results for a subset of heaps.
        D, I should hold resutls for all the subset
        as a special case, if I is 1D, then all ids are assumed to be the same
           rT   r   r   r   r   N)r
   lenndimr   r	   rl   addn_query_subset_with_idsr   )rm   subsetr   r   nsubsetrq   	id_strider   r   r   add_result_subset  s   
zResultHeap.add_result_subsetc                 C   s   | j   d S )N)rl   r   )rm   r   r   r   finalize  s   zResultHeap.finalizeNF)__name__
__module____qualname____doc__ro   rr   rz   r{   r   r   r   r   rk      s    
rk   Fc           	   	   C   s|   |j | j ksJ | j \}}}tj||f| jd}tj||f|jd}|r&tnt}||||t| t|t|t| ||fS )z
    Merge a set of sorted knn-results obtained from different shards in a dataset
    Dall and Iall are of size (nshard, nq, k) each D[i, j] should be sorted
    returns D, I of size (nq, k) as the merged result set
    r   )r
   r   r#   r   merge_knn_results_CMaxmerge_knn_results_CMinr   )	DallIallrn   nshardr   r   DnewInewfuncr   r   r   merge_knn_results  s   r   c           
   
   C   s   t j| dd} t j|dd}| j\}}|j\}}||ksJ t j||fdd}t j||fdd}	|tkrKtt| t|||||t|	t| |	|fS |tkrftt| t|||||t|	t| |	|fS t	d)a  
    Compute the k nearest neighbors of a vector without constructing an index


    Parameters
    ----------
    xq : array_like
        Query vectors, shape (nq, d) where d is appropriate for the index.
        `dtype` must be float32.
    xb : array_like
        Database vectors, shape (nb, d) where d is appropriate for the index.
        `dtype` must be float32.
    k : int
        Number of nearest neighbors.
    distance_type : MetricType, optional
        distance measure to use (either METRIC_L2 or METRIC_INNER_PRODUCT)

    Returns
    -------
    D : array_like
        Distances of the nearest neighbors, shape (nq, k)
    I : array_like
        Labels of the nearest neighbors, shape (nq, k)
    r   r   r   z'only L2 and INNER_PRODUCT are supported)
r   r	   r
   r#   r$   	knn_L2sqrr   r&   knn_inner_productNotImplementedError)
r)   r*   r   r+   r-   r.   r/   r0   r   r   r   r   r   knn0  s(   

r   c                   @   s*   e Zd ZdZdd Zd	ddZdd ZdS )
Kmeansa  Object that performs k-means clustering and manages the centroids.
    The `Kmeans` class is essentially a wrapper around the C++ `Clustering` object.

    Parameters
    ----------
    d : int
       dimension of the vectors to cluster
    k : int
       number of clusters
    gpu: bool or int, optional
       False: don't use GPU
       True: use all GPUs
       number: use this many GPUs
    progressive_dim_steps:
        use a progressive dimension clustering (with that number of steps)

    Subsequent parameters are fields of the Clustring object. The most important are:

    niter: int, optional
       clustering iterations
    nredo: int, optional
       redo clustering this many times and keep best
    verbose: bool, optional
    spherical: bool, optional
       do we want normalized centroids?
    int_centroids: bool, optional
       round centroids coordinates to integer
    seed: int, optional
       seed for the random number generator

    c                 K   s   || _ || _d| _d|v rt | _nt | _| D ]$\}}|dkr1|dks*|dkr-t }|| _qt| j| t	| j|| qd| _
dS )zd: input dimension, k: nb of centroids. Additional
         parameters are passed on the ClusteringParameters object,
         including niter=25, verbose=False, spherical = False
        Fprogressive_dim_stepsgpuTN)r.   r   r   "ProgressiveDimClusteringParameterscpClusteringParametersitemsget_num_gpusgetattrsetattr	centroids)rm   r.   r   kwargsvr   r   r   ro     s   

zKmeans.__init__Nc                    s  t j|dd}|j\}}|| jksJ | jjtkr^t|| j| j}|dur9|j\}}||ks0J t	
| |j | jjrCt|| _nt|| _| jrUt	j| j| jd| _||| j| n0|du sdJ |du sjJ | jjrpJ t|| j| j}| jrt| jd}	nt }	||t||	 t	|j}
|
| j|| _|jfddt D t dd D | _d   fd	dD | _| jjd
kr| jd S dS )a   Perform k-means clustering.
        On output of the function call:

        - the centroids are in the centroids field of size (`k`, `d`).

        - the objective value at each iteration is in the array obj (size `niter`)

        - detailed optimization statistics are in the array iteration_stats.

        Parameters
        ----------
        x : array_like
            Training vectors, shape (n, d), `dtype` must be float32 and n should
            be larger than the number of clusters `k`.
        weights : array_like
            weight associated to each vector, shape `n`
        init_centroids : array_like
            initial set of centroids, shape (n, d)

        Returns
        -------
        final_obj: float
            final optimization objective

        r   r   N)ngpuc                    s   g | ]}  |qS r   )at).0rO   )statsr   r   
<listcomp>  s    z Kmeans.train.<locals>.<listcomp>c                 S   s   g | ]}|j qS r   )obj)r   str   r   r   r     s    z,obj time time_search imbalance_factor nsplitc                    s   g | ]  fd dD qS )c                    s   i | ]}|t  |qS r   )r   )r   fieldr   r   r   
<dictcomp>  s    z+Kmeans.train.<locals>.<listcomp>.<dictcomp>r   )r   )stat_fieldsr   r   r     s    r   r   g        ) r   r	   r
   r.   r   	__class__r   
Clusteringr   r   copy_array_to_vectorravelr   	sphericalIndexFlatIPindexIndexFlatL2r   index_cpu_to_all_gpustrainProgressiveDimClusteringGpuProgressiveDimIndexFactoryProgressiveDimIndexFactoryr   vector_float_to_arrayreshapeiteration_statsrJ   r7   r   r   split)rm   rK   weightsinit_centroidsr   r.   clusncr0   facr   r   )r   r   r   r     sB   



zKmeans.trainc                 C   sZ   t j|dd}| jd usJ d| j  | j| j | j|d\}}| | fS )Nr   r   zshould train before assigningrT   )r   r	   r   r   resetaddsearchr   )rm   rK   r   r   r   r   r   assign  s   
zKmeans.assign)NN)r}   r~   r   r   ro   r   r   r   r   r   r   r   f  s
     
Fr   )r3   )r3   N)rQ   )Nr   r|   )numpyr   faiss.loaderr   r   r"   r$   r2   r;   r@   lrandrB   rP   rS   rR   r[   r]   rf   r`   rj   rg   rk   r   r   r   r   r   r   r   <module>   s.   


	



'
@6