o
    Ci+                     @   s   d dl Z d dlZd dlZddlmZmZmZmZ ddl	m
Z
 G dd dZG dd deZd	D ]
Ze jer9 nq/d
ZG dd deZdd ZG dd deZG dd deZG dd deZG dd deZdddZdS )    N   )
fvecs_read
ivecs_read
bvecs_mmap
fvecs_mmap)knnc                   @   s`   e Zd ZdZdd Zdd ZdddZd	d
 ZdddZdddZ	dddZ
dd Zdd ZdS )Datasetz+ Generic abstract class for a test dataset c                 C   s"   d| _ d| _d| _d| _d| _dS )z2 the constructor should set the following fields: L2Ndmetricnqnbntself r   n/var/www/html/fyndo/python/python_agents/rag_suite/venv/lib/python3.10/site-packages/faiss/contrib/datasets.py__init__   s
   
zDataset.__init__c                 C      t  )z' return the queries as a (nq, d) array NotImplementedErrorr   r   r   r   get_queries      zDataset.get_queriesNc                 C   r   )z' return the queries as a (nt, d) array r   r   maxtrainr   r   r   	get_train   r   zDataset.get_trainc                 C   r   )z' return the queries as a (nb, d) array r   r   r   r   r   get_database    r   zDataset.get_database   r   r   c           	      c   sb    |   }|\}}| j| | | j|d  | }}t|||D ]}||t|| | V  q dS )a7  returns an iterator on database vectors.
        bs is the number of vectors per batch
        split = (nsplit, rank) means the dataset is split in nsplit
        shards and we want shard number rank
        The default implementation just iterates over the full matrix
        returned by get_dataset.
        r   N)r   r   rangemin	r   bssplitxbnsplitranki0i1j0r   r   r   database_iterator$   s   "zDataset.database_iteratorc                 C   r   )z7 return the ground truth for k-nearest neighbor search r   r   kr   r   r   get_groundtruth2   r   zDataset.get_groundtruthc                 C   r   )z* return the ground truth for range search r   )r   threshr   r   r   get_groundtruth_range6   r   zDataset.get_groundtruth_rangec              
   C   s,   d| j  d| j d| j d| j d| j 
S )Nzdataset in dimension z, with metric z
, size: Q z B z T r   r   r   r   r   __str__:   s   zDataset.__str__c                 C   s   |   j| j| jfksJ | jdkr(| jdd}|jd| jfks(J d|jf |  j| j| jfks5J | jddj| jdfksCJ dS )z8 runs the previous and checks the sizes of the matrices r   {   )r   zshape=%s   )r.   N)	r   shaper   r   r   r   r   r   r/   )r   xtr   r   r   check_sizes>   s   
  zDataset.check_sizesNr   r    )__name__
__module____qualname____doc__r   r   r   r   r,   r/   r1   r2   r7   r   r   r   r   r      s    



r   c                   @   s>   e Zd ZdZdddZdd Zdd	d
Zdd ZdddZdS )SyntheticDatasetzOA dataset that is not completely random but still challenging to
    index
    r
   :  c                 C   s   t |  ||||f\| _| _| _| _d}|| | }tj|}	|	j	||fd}
t
|
|	||}
|
|	|d d  }
t|
}
|
d}
|| _|
d | | _|
|||  | _|
|| d  | _d S )N
   )size   g?float32)r   r   r   r   r   r   nprandomRandomStatenormaldotrandsinastyper   r6   r&   xq)r   r   r   r   r   r   seedd1nrsxr   r   r   r   M   s   


zSyntheticDataset.__init__c                 C      | j S r8   )rL   r   r   r   r   r   _      zSyntheticDataset.get_queriesNc                 C   s    |d ur|n| j }| jd | S r8   )r   r6   r   r   r   r   r   b   s   zSyntheticDataset.get_trainc                 C   rR   r8   )r&   r   r   r   r   r   f   rS   zSyntheticDataset.get_databased   c                 C   s.   t | j| j|| jdkrtjd S tjd S )Nr
   r   )r   rL   r&   r   faiss	METRIC_L2METRIC_INNER_PRODUCTr-   r   r   r   r/   i   s   
z SyntheticDataset.get_groundtruth)r
   r?   r8   )rT   	r:   r;   r<   r=   r   r   r   r   r/   r   r   r   r   r>   H   s    

r>   )z/datasets01/simsearch/041218/z7/mnt/vol/gfsai-flash3-east/ai-group/datasets/simsearch/zdata/c                   @   s<   e Zd ZdZdd Zdd ZdddZd	d
 ZdddZdS )DatasetSIFT1Mz_
    The original dataset is available at: http://corpus-texmex.irisa.fr/
    (ANN_SIFT1M)
    c                 C   s,   t |  d\| _| _| _| _td | _d S )N)r   順 @B '  zsift1M/)r   r   r   r   r   r   dataset_basedirbasedirr   r   r   r   r      s   
zDatasetSIFT1M.__init__c                 C      t | jd S )Nzsift_query.fvecsr   r^   r   r   r   r   r         zDatasetSIFT1M.get_queriesNc                 C   s(   |d ur|n| j }t| jd d | S )Nzsift_learn.fvecs)r   r   r^   r   r   r   r   r      s   zDatasetSIFT1M.get_trainc                 C   r_   )Nzsift_base.fvecsr`   r   r   r   r   r      ra   zDatasetSIFT1M.get_databasec                 C   s:   t | jd }|d ur|dksJ |d d d |f }|S )Nzsift_groundtruth.ivecsrT   )r   r^   r   r.   gtr   r   r   r/      s
   zDatasetSIFT1M.get_groundtruthr8   rX   r   r   r   r   rY      s    
rY   c                 C   s   t j| ddS )NrC   dtype)rD   ascontiguousarray)rQ   r   r   r   sanitize   ra   rg   c                   @   H   e Zd ZdZdddZdd Zddd	Zdd
dZdd ZdddZ	dS )DatasetBigANNz_
    The original dataset is available at: http://corpus-texmex.irisa.fr/
    (ANN_SIFT1B)
      c                 C   sN   t |  |dv sJ || _|d }dd|df\| _| _| _| _td | _d S )N)
r         r@      2   rT      i  rj   r[   r    r\   zbigann/)	r   r   nb_Mr   r   r   r   r]   r^   )r   rq   r   r   r   r   r      s   
zDatasetBigANN.__init__c                 C   s   t t| jd d d  S )Nzbigann_query.bvecs)rg   r   r^   r   r   r   r   r      s   zDatasetBigANN.get_queriesNc                 C   ,   |d ur|n| j }tt| jd d | S )Nzbigann_learn.bvecs)r   rg   r   r^   r   r   r   r   r         zDatasetBigANN.get_trainc                 C   s@   t | jd| j  }|d ur|dksJ |d d d |f }|S )Nzgnd/idx_%dM.ivecsrT   )r   r^   rq   rb   r   r   r   r/      s
   zDatasetBigANN.get_groundtruthc                 C   s.   | j dk s	J dtt| jd d | j S )NrT   dataset too large, use iteratorbigann_base.bvecs)rq   rg   r   r^   r   r   r   r   r   r         zDatasetBigANN.get_databaser   r    c           	      c   l    t | jd }|\}}| j| | | j|d  | }}t|||D ]}t||t|| | V  q#d S )Nru   r   )r   r^   r   r!   rg   r"   r#   r   r   r   r,         "zDatasetBigANN.database_iterator)rj   r8   r9   
r:   r;   r<   r=   r   r   r   r/   r   r,   r   r   r   r   ri      s    


ri   c                   @   rh   )DatasetDeep1Bzv
    See
    https://github.com/facebookresearch/faiss/tree/main/benchs#getting-deep1b
    on how to get the data
     ʚ;c                 C   sf   t |  dddddd}||v sJ dd|d	f\| _| _| _| _td
 | _d| j|| j f | _d S )N100k1M10M100M1B)rZ   r[   i rp   r{   `   i]r\   zdeep1b/z%sdeep%s_groundtruth.ivecs)	r   r   r   r   r   r   r]   r^   gt_fname)r   r   
nb_to_namer   r   r   r      s   

zDatasetDeep1B.__init__c                 C   s   t t| jd S )Nzdeep1B_queries.fvecs)rg   r   r^   r   r   r   r   r      s   zDatasetDeep1B.get_queriesNc                 C   rr   )Nzlearn.fvecs)r   rg   r   r^   r   r   r   r   r      rs   zDatasetDeep1B.get_trainc                 C   s6   t | j}|d ur|dksJ |d d d |f }|S )NrT   )r   r   rb   r   r   r   r/      
   
zDatasetDeep1B.get_groundtruthc                 C   s.   | j dks	J dtt| jd d | j  S )Nrp   rt   
base.fvecs)r   rg   r   r^   r   r   r   r   r      rv   zDatasetDeep1B.get_databaser   r    c           	      c   rw   )Nr   r   )r   r^   r   r!   rg   r"   r#   r   r   r   r,      rx   zDatasetDeep1B.database_iterator)r{   r8   r9   ry   r   r   r   r   rz      s    


rz   c                   @   s4   e Zd ZdZdddZdd Zdd	 Zdd
dZdS )DatasetGlovezD
    Data from http://ann-benchmarks.com/glove-100-angular.hdf5
    NFc                 C   sh   dd l }|r
J d|std }||d| _d| _d\| _| _| jd jd | _| jd jd | _	d S )	Nr   znot implementedzglove/glove-100-angular.hdf5rIP)rT   r   traintest)
h5pyr]   File
glove_h5pyr   r   r   r5   r   r   )r   locdownloadr   r   r   r   r      s   zDatasetGlove.__init__c                 C      t | jd }t| |S )Nr   rD   arrayr   rU   normalize_L2r   rL   r   r   r   r        
zDatasetGlove.get_queriesc                 C   r   )Nr   r   r   r&   r   r   r   r     r   zDatasetGlove.get_databasec                 C   s6   | j d }|d ur|dksJ |d d d |f }|S )N	neighborsrT   )r   rb   r   r   r   r/     r   zDatasetGlove.get_groundtruth)NFr8   r:   r;   r<   r=   r   r   r   r/   r   r   r   r   r      s    
r   c                   @   s2   e Zd ZdZdd Zdd Zdd Zdd	d
ZdS )DatasetMusic100zO
    get dataset from
    https://github.com/stanis-morozov/ip-nsw#dataset
    c                 C   s2   t |  d\| _| _| _| _d| _td | _d S )N)rT   r   r[   r\   r   z
music-100/)	r   r   r   r   r   r   r   r]   r^   r   r   r   r   r   #  s   
zDatasetMusic100.__init__c                 C   $   t j| jd dd}|dd}|S )Nzquery_music100.binrC   rd   r	   rT   rD   fromfiler^   reshaper   r   r   r   r   )     zDatasetMusic100.get_queriesc                 C   r   )Nzdatabase_music100.binrC   rd   r	   rT   r   r   r   r   r   r   .  r   zDatasetMusic100.get_databaseNc                 C   s<   t | jd }|d ur|dksJ |d d d |f }|S )Nzgt.npyrT   )rD   loadr^   rb   r   r   r   r/   3  s
   zDatasetMusic100.get_groundtruthr8   r   r   r   r   r   r     s    r   deep1MFc                 C   s   | dkrt  S | dr| dkrdnt| dd }t|dS | dr^| d	d
 }|d dkr;dt|d
d  }n|dkrBd}n|d dkrSdt|d
d  }nJ d| t|dS | dkret S | dkrnt|dS td|  )z converts a string describing a dataset to a Dataset object
    Supports sift1M, bigann1M..bigann1B, deep1M..deep1B, music-100 and glove
    sift1Mbigannbigann1Brj      r	   )rq   deeprB   NMr[   r   r{   r.   Fzdid not recognize suffix )r   z	music-100glove)r   zunknown dataset )rY   
startswithintri   rz   r   r   RuntimeError)datasetr   dbsizeszsufr   r   r   dataset_from_name<  s(   




r   )r   F)osnumpyrD   rU   vecs_ior   r   r   r   exhaustive_searchr   r   r>   r]   pathexistsrY   rg   ri   rz   r   r   r   r   r   r   r   <module>   s&   ;/(0#