
    Bi9              
          d Z ddlZddlZddlZddlZddlZddlmZ ddlm	Z	 ddl
mZ ddlmZ ddlmZmZmZmZmZmZ ddlmZmZ dd	lmZ  e	e
          Zej                            ej                            e          d          ZdedefdZde fdZ!de fdZ"dede#de#dedef
dZ$dede#de#fdZ%de&e         dej'        fdZ(dede)e&e         e&e         e f         fdZ*dedefdZ+d Z,e-d k    r e,             dS dS )!zz
Smart PDF ingestion system with incremental processing.
Only processes new or modified files, preserving existing index.
    N)tqdm)OpenAI)Path)datetime)OPENAI_API_KEYEMBED_MODEL
CHUNK_SIZECHUNK_OVERLAP	INDEX_DIR	TENANT_ID)load_pdf_pagesextract_section_heading)
FaissStore)api_keyzprocessed_files.jsonfilepathreturnc                 V   t          j                    }	 t          | d          5 t          fdd          D ]}|                    |           	 ddd           n# 1 swxY w Y   |                                S # t          $ r }t          d|  d|            Y d}~dS d}~ww xY w)z
    Calculate MD5 hash of file to detect changes.
    
    Args:
        filepath: Path to file
        
    Returns:
        MD5 hash as hex string
    rbc                  .                          d          S )Ni    )read)fs   HC:\Users\Terasoftware\OneDrive\Desktop\Graph Rag\rag_suite\rag\ingest.py<lambda>zget_file_hash.<locals>.<lambda>/   s    affTll     r   Nu    ⚠️  Warning: Could not hash :  )hashlibmd5openiterupdate	hexdigest	Exceptionprint)r   hash_md5chunker   s       @r   get_file_hashr(   "   s    {}}H(D!! 	'Q2222C88 ' '&&&&'	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' 	' !!###   @@@Q@@AAArrrrrs:   A> ,AA> A##A> &A#'A> >
B(B##B(c                  .   t           j                            t                    rp	 t	          t          dd          5 } t          j        |           cddd           S # 1 swxY w Y   n+# t          $ r}t          d|            i cY d}~S d}~ww xY wi S )z
    Load record of already processed files from disk.
    
    Returns:
        Dict mapping filename -> {hash, processed_date, chunks, pages}
    rutf-8encodingNu/   ⚠️  Warning: Could not load tracking file: )	ospathexistsTRACKING_FILEr   jsonloadr#   r$   )r   r'   s     r   load_processed_filesr4   7   s     
w~~m$$ 	mS7;;; $qy||$ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ 	 	 	GAGGHHHIIIIII	 Is@   A* AA* A""A* %A"&A* *
B4BBB	processedc                 \   	 t          j        t           j                            t                    d           t          t          dd          5 }t          j        | |dd           d	d	d	           d	S # 1 swxY w Y   d	S # t          $ r}t          d
|            Y d	}~d	S d	}~ww xY w)zu
    Save record of processed files to disk.
    
    Args:
        processed: Dict mapping filename -> metadata
    T)exist_okwr+   r,      Findentensure_asciiNu/   ⚠️  Warning: Could not save tracking file: )
r.   makedirsr/   dirnamer1   r   r2   dumpr#   r$   )r5   r   r'   s      r   save_processed_filesr@   H   s   E
BGOOM22TBBBB-w777 	B1Ii15AAAA	B 	B 	B 	B 	B 	B 	B 	B 	B 	B 	B 	B 	B 	B 	B 	B 	B 	B E E ECCCDDDDDDDDDEs<   AB A7*B 7A;;B >A;?B 
B+B&&B+doc_idpagechunk_ordertextc           	          |  d| d| d|dd          }t          j        |                    d                                                    S )a  
    Generate stable, deterministic chunk ID.
    
    Args:
        doc_id: Document identifier
        page: Page number
        chunk_order: Chunk sequence number
        text: Chunk text (first 50 chars used)
        
    Returns:
        SHA1 hash as hex string
    |N2   r+   )r   sha1encoder"   )rA   rB   rC   rD   bases        r   stable_chunk_idrK   [   sW     77t77k77D"I77D<G,,--77999r   
chunk_sizeoverlapc              #   P  K   d                     |                                           } | sdS d}|t          |           k     rct          t          |           ||z             }| ||         V  |t          |           k    rdS ||z
  }|dk     r|}|t          |           k     adS dS )a  
    Generate text chunks with overlap using generator (memory efficient).
    
    Args:
        text: Input text to chunk
        chunk_size: Size of each chunk in characters
        overlap: Overlap between consecutive chunks
        
    Yields:
        Text chunks as strings
     Nr   )joinsplitlenmin)rD   rL   rM   startends        r   chunk_text_generatorrV   l   s       88DJJLL!!D E
#d))
 #d))UZ/0059o #d)) 	E g 19 	E #d))
     r   textsc                    | st          j        dd          S 	 t          j                            t
          |           }d |j        D             }t          j        |d          S # t          $ r}t          d|             d}~ww xY w)z
    Generate embeddings for text chunks using OpenAI API.
    
    Args:
        texts: List of text strings to embed
        
    Returns:
        Numpy array of shape (len(texts), embedding_dim)
    )r   i   float32)dtype)modelinputc                     g | ]	}|j         
S  )	embedding).0ds     r   
<listcomp>zembed_texts.<locals>.<listcomp>   s    999aak999r   u!   ❌ Error generating embeddings: N)
npzerosclient
embeddingscreater   dataarrayr#   r$   )rW   responserf   r'   s       r   embed_textsrk      s      4x	3333	$++ , 
 
 :98=999
x
)4444   5!55666s   AA' '
B	1BB	folderc                    t          t          j        t          j                            | d                              }|st          d|            t                      }g }g }|D ]}t          j                            |          }t          |          }|s|	                    |           H||v r`||         
                    dd          }||k    r|	                    |           t          d|            |	                    |           t          d|            |	                    |           |||fS )z
    Analyze which files need processing.
    
    Args:
        folder: Path to folder containing PDFs
        
    Returns:
        Tuple of (files_to_process, skipped_files, processed_files_dict)
    *.pdfzNo PDF files found in hashr   u      🔄 Modified: u      ✨ New: )sortedglobr.   r/   rP   
ValueErrorr4   basenamer(   appendgetr$   )	rl   pdfsprocessed_filesfiles_to_processskipped_filespdf_path	file_namecurrent_hashstored_hashs	            r   analyze_files_to_processr~      sn    $)BGLL99::;;D <:&::;;; +,,O M . .G$$X..	$X.. 	##H--- ' 	.))488DDK{* 2$$Y//// 6966777 ''1111 ,,,---##H----]O;;r   c                 v   t          d|  d           t          |           \  }}}t          |          t          |          z   }t          d           t          d|            t          dt          |                      t          dt          |                      |rdt          d           |dd	         D ]}t          d
|            t          |          d	k    r#t          dt          |          d	z
   d           |s-t          d           t          dt                      t          S t          dt          |           d           t
          j                            t                    rt          |          dk    rt          d           	 t          j	        t                    }t          dt          |j
                   d           n# t          $ rV}t          d|            t          d           t          dg          j        d         }t          |          }Y d}~nBd}~ww xY wt          d           t          dg          j        d         }t          |          }g }	d}
t          |d          D ]}	 t
          j                            |          }t!          j        |                                                                          }t)          |          }|st          d|            d}|D ]}|d         }|d         }|                                s't-          |          }t/          |t0          t2                    D ]f}|                                st5          ||||          }t          |g          }t6          ||||||||d 	}|                    ||g           |dz  }g|
|z  }
t;          |          t=          j                                                     |t          |          d!||<   |	!                    ||t          |          |d"           # t          $ r=}t          d#t
          j                            |           d$|            Y d}~d}~ww xY wt          d%           |"                    t                     t          d&t                      tG          |           t          d't                      t
          j        $                    t
          j        %                    t                    d(          }	 tM          |d)d*+          5 }tO          j(        |	|d,d-.           ddd           n# 1 swxY w Y   t          d/|            n)# t          $ r}t          d0|            Y d}~nd}~ww xY wt          S )1z
    Ingest PDFs from folder with incremental processing.
    Only processes new or modified files.
    
    Args:
        folder: Path to folder containing PDFs
        
    Returns:
        Path to saved index directory
    u   
📂 Scanning folder: 
u   📊 Ingestion Plan:z   - Total PDFs found: z$   - Already processed (unchanged): z   - New or modified: u"   
⏭️  Skipping unchanged files:N   u
         ✓ z      ... and z moreu1   
✅ All files already up-to-date! Nothing to do.u&   
💡 To reprocess all files, delete: u   
🔄 Processing z file(s)...
r   u   📂 Loading existing index...u"      ✅ Loaded existing index with z chunks
u*      ⚠️  Could not load existing index: z!   Creating new index instead...
zdimension probe   )dimu   📂 Creating new index...
zProcessing PDFs)descu*   
⚠️  Warning: No pages extracted from rB   rD   )		tenant_idrA   doc_name
source_urlpage_numbersection_headingchunk_idrC   rD   )ro   processed_datechunkspages)rA   r   r   r   u   
❌ Error processing r   u   
💾 Saving updated index...u      ✅ Index saved to: u      ✅ Tracking file updated: zingestion_manifest.jsonr8   r+   r,   r9   Fr:   u      ✅ Manifest saved: u$      ⚠️  Could not save manifest: ))r$   r~   rR   r1   r   r.   r/   r0   r   r3   metar#   rk   shaper   rs   r   r   rI   r"   r   stripr   rV   r	   r
   rK   r   addr(   r   now	isoformatrt   saver@   rP   r>   r   r2   r?   )rl   rx   ry   rw   
total_pdfsnamestorer'   r   ingestion_manifesttotal_chunks_addedrz   r   rA   r   chunk_counterrB   page_numrD   r   r&   r   vecr   manifest_pathr   s                             r   ingest_folder_incrementalr      s    

/V
/
/
/000 8PPV7W7W4m_%&&]););;J 

!"""	
0J
0
0111	
E]1C1C
E
EFFF	
:3'7#8#8
:
:;;; B4555!"1"% 	' 	'D%t%%&&&&}! 	B@3}#5#5#9@@@AAA  CDDDGGGHHH	
Cs#344
C
C
CDDD 
w~~i   $S%7%7!%; $.///	(OI..EQs5:QQQRRRR 	( 	( 	(BqBBCCC677701228;C3'''EEEEEE		( 	,---,-..4Q7s### )0ABBB G GF	w''11H[!2!233==??F #8,,E NHNNOOOM  "' "'<F|zz|| "9$"?"? 2$
MRR ' 'E ;;== ! .vxPUVVH &ug..C &/"($,&.'/+:$,'4 %
 
D IIcD6***!Q&MM1'4 -/ &h//"*,..":":"<"<'U	) )OH% %% $U'	' '      	 	 	MBG,<,<X,F,FMM!MMNNNHHHH	
 

+,,,	JJy	
/I
/
/000 )))	
:=
:
:;;; GLL!;!;=VWWM:-w777 	K1I(!AEJJJJ	K 	K 	K 	K 	K 	K 	K 	K 	K 	K 	K 	K 	K 	K 	K7778888 : : :8Q8899999999: su   !>G   
I *AH;;I A:QD0Q
R
2RR
6V U-!V -U11V 4U15V 
V1V,,V1c                     t          d           t          d           t          d           d} t          j                            |           s3t          d|  d           t          d           t          d           d	S t	          j        t          j                            | d
                    }|s3t          d|  d           t          d           t          d           d	S 	 t          |           }t          d           t          d           t          d           t          j                            |          rQ	 t          j        |          }t          d           t          dt          |j
                  d           n#  Y nxY wt                      }t          dt          |                      t          d           t          d|            t          dt                      t          d           t          d           t          d           t          d|  d           t          d           d	S # t          $ r}t          d           t          d           t          d           t          d| d           dd	l}t          d            |                                 t          d           Y d	}~d	S d	}~ww xY w)!z&Main entry point for ingestion script.z=
============================================================u%   🚀 RAG INCREMENTAL INGESTION SYSTEMz<============================================================rh   u   
❌ Error: Folder 'z' not found!z&   Please create it and add PDF files.z=============================================================
Nrn   u   
❌ No PDF files found in 'z'!z&   Please add PDF files to the folder.u   ✅ INGESTION COMPLETE!u   
📊 Final Index Statistics:z   - Total chunks in index: ,z   - Total tracked files: u   
📁 Output Locations:z   - Index: z   - Tracking: u   
💡 Next Steps:z    1. Test Q&A: python -m rag.qaz)   2. Start API: uvicorn api:app --reloadz   3. Add more PDFs to 'z/' and run againz>
============================================================
u   ❌ INGESTION FAILED!z
Error: r   r   zFull traceback:)r$   r.   r/   r0   rq   rP   r   r   r3   rR   r   r4   r1   r#   	traceback	print_exc)rl   	pdf_files
index_pathr   r5   r'   r   s          r   mainr     s   	/	
1222	(OOOF 7>>&!! :f:::;;;7888o 	"',,vw7788I 8f8889997888o*&.v66
 	o'(((h 7>>*%% 	"
337888HS__HHHIIII )**	;3y>>;;<<<)***)Z))***///000#$$$1222:;;;AAAABBB$%%%%% 	& 	& 	&o%&&&h!      $%%%%%%%%%	&s4   *AI AF I FB9I 
KA6KK__main__).__doc__r.   rq   r   r2   numpyrc   r   openair   pathlibr   r   
rag.configr   r   r	   r
   r   r   rag.pdf_utilsr   r   rag.vectorstorer   re   r/   rP   r>   r1   strr(   dictr4   r@   intrK   rV   listndarrayrk   tupler~   r   r   __name__r^   r   r   <module>r      s   
 
			                                c c c c c c c c c c c c c c c c A A A A A A A A & & & & & & 
	'	'	' RW__Y779OPPC C    *d    "ED E E E E&:C :s : :C :C : : : :" s    c        FtCy RZ    :0<S 0<U49d3i3M-N 0< 0< 0< 0<nVc Vc V V V VzB& B& B&J z DFFFFF r   