
    @i                     P    d dl mZ d dlZdefdZdededz  fdZdeded	efd
ZdS )    )	PdfReaderNpdf_pathc                 
   t          |           }g }t          |j                  D ]\\  }}	 |                                pd}n# t          $ r d}Y nw xY w|                    |dz   |                                d           ]|S )z>
    Load PDF pages with page numbers and extracted text.
        )pagetext)r   	enumeratepagesextract_text	Exceptionappendstrip)r   readerr   ir   r	   s         KC:\Users\Terasoftware\OneDrive\Desktop\Graph Rag\rag_suite\rag\pdf_utils.pyload_pdf_pagesr      s     x  FEV\** 	 	4	$$&&,"DD 	 	 	DDD	 	EJJLL
 
 	 	 	 	
 Ls   AAAr	   returnc                     | sdS d |                      d          D             }|sdS |d         }|                                st          j        d|          r
|dd         S dS )z
    Best-effort heuristic to extract a section heading.
    Looks for uppercase or numbered headings at the top of the page.
    Nc                 ^    g | ]*}|                                 |                                 +S  )r   ).0ls     r   
<listcomp>z+extract_section_heading.<locals>.<listcomp>#   s-    >>>1AGGII>QWWYY>>>    
r   z^\d+(\.\d+)*\s+[A-Z]x   )splitisupperrematch)r	   lines
first_lines      r   extract_section_headingr$      s    
  t>>

4 0 0>>>E tqJ 	 8+Z88  $3$4r   
chunk_sizeoverlapc                    d                     |                                           } | sg S g }d}t          |           }||k     rt          |||z             }||k     r+|                     d||          }|dk    r||dz   k    r|dz   }| ||                                         }|r|                    |           ||k    rnt          d||z
            }||k     |S )zN
    Deterministic sliding-window chunking with whitespace normalization.
     r   .d   r   )joinr   lenminrfindr   r   max)	r	   r%   r&   chunksstarttext_lenendlast_periodchunks	            r   
chunk_textr7   6   s    88DJJLL!!D 	FE4yyH
(
 &(EJ.// > 	&**S%55Kb  &[53;%> &!AoU3Y%%'' 	!MM%   (? 	AsW}%%! (
 &$ Mr   )pypdfr   r    strr   r$   intr7   r   r   r   <module>r;      s          				S    *# #*    6S c C      r   