o
    Ai                     @   s  d dl Z d dlZd dlmZmZ d dlZd dlmZ d dl	m
Z
mZmZmZmZmZmZmZmZmZ d dlmZ ee
dZdedejfd	d
Zdedee defddZdedeeef defddZdededee fddZdefddZedkre Z e!de   dS dS )    N)ListDict)OpenAI)
OPENAI_API_KEYEMBED_MODEL	LLM_MODEL	INDEX_DIRARTIFACTS_DIR
CATEGORIESTOP_K_PER_CATEGORYMAX_FINDINGS_PER_CATEGORYRUN_ID	INPUT_DIR)
FaissStore)api_keyqueryreturnc                 C   s*   t jjt| gd}tj|jd jgddS )z.Convert a text query into an embedding vector.)modelinputr   float32)dtype)client
embeddingscreater   nparraydata	embedding)r   resp r   </var/www/html/fyndo/python/python_agents/rag_suite/rag/qa.pyembed_query   s
   r!   questionsearch_resultsc              
   C   s   |sdS g }|D ](}| di }| dd}| dd}| dd}|d	| d
| d|  qd|dd }d|  d|dd  d}	ztjjjtd|	dgddd}
|
jd j	j
 W S  tyy } zdt| W  Y d}~S d}~ww )z\
    Generate an LLM answer based on retrieved chunks.
    Used by main.py ask command.
    z/No relevant information found in the documents.metadatadoc_nameUnknownpage_number?text [z, Page z]
z

N   zUYou are an AI assistant analyzing annual reports and financial documents.

Question: z

Context from documents:
i.  a  

Instructions:
- Provide a clear, factual answer based ONLY on the context above
- Cite sources using [Document Name, Page X] format after each claim
- If the information is not in the context, say "Information not available in the provided documents"
- Be concise but complete

Answer:userrolecontentg?iX  )r   messagestemperature
max_tokensr   zError generating answer: )getappendjoinr   chatcompletionsr   r   choicesmessager0   strip	Exceptionstr)r"   r#   context_partsrmetar%   pager)   contextpromptresponseer   r   r    answer_question_with_llm(   s6   

rF   evidence	chunk_mapc                 C   st   | |  d}|sdS |  dd }|sdS ||d  vr"dS |  d|d kr-dS |  d|d kr8dS d	S )
z
    Validate that evidence actually exists in the retrieved chunks.
    Checks: chunk_id exists, quote is in text, doc_name matches, page matches.
    chunk_idFquoter*   r)   r%   rA   r'   T)r4   lower)rG   rH   chunkrJ   r   r   r    validate_evidenceY   s   rM   categoryc              
      s  t t}t|}|j|td}g }i  |D ]#}|d }|d |d |d |d |d d}|| | |d < qtj	t
d	|  d
ddd d}	tjt
dd t|	ddd}
tj||
dd W d   n1 spw   Y  d|  dtj|dd d|  d}ztjjjtd|dgdd}t|jd jj}W n! tjtfy } ztd|  d|  g W  Y d}~S d}~ww g }|d g dt D ]}
 fd!d"|
d#g D }|r||
d#< ||
 q|S )$z
    For a given category (e.g., 'Risks', 'Management'), retrieve relevant chunks
    and use LLM to extract structured findings with evidence.
    )kr$   rI   r%   r'   
source_urlr)   )rI   r%   r'   rP   r)   retrieval_bundle_ _/z.jsonTexist_okwutf-8encoding   )indentNzD
You are a pharma-grade annual report extraction engine.

Category: u   

RULES:
- Return VALID JSON ONLY
- Use ONLY the provided chunks
- If unsupported, OMIT the finding
- Each finding must include evidence with doc_name, page, chunk_id, quote (≤25 words)

CHUNKS:
z"

OUTPUT SCHEMA:
{
  "category": "aC  ",
  "findings": [
    {
      "finding": "Short description",
      "severity": "High|Medium|Low",
      "confidence": "High|Medium|Low",
      "evidence": [
        {
          "doc_name": "...",
          "page": 123,
          "chunk_id": "...",
          "quote": "exact quote from text"
        }
      ]
    }
  ]
}
r-   r.   r   )r   r1   r2   u%   ⚠️ Error extracting findings for z: findingsc                    s   g | ]	}t | r|qS r   )rM   .0rE   rH   r   r    
<listcomp>   s    z1extract_findings_per_category.<locals>.<listcomp>rG   ) r   loadr   r!   searchr   r5   ospathr6   r	   rK   replacemakedirsopenjsondumpdumpsr   r7   r8   r   r   loadsr9   r:   r0   JSONDecodeErrorr<   printr4   r   )rN   r   storeqvechitsbundlehr@   entrybundle_pathfrC   r   resultrE   r]   valid_evidencer   r`   r    extract_findings_per_categoryu   sh   

#



ry   c                  C   s  g } d}t d t D ]5\}}t d| d t||}|D ]}d|d|d< ||d< |d7 }| | qt d	t| d
 q| D ])}dd |d D }t||d< dddd|ddd}|d|d   |d< qD| jdd dd dd | dd D }tt	t| | |d}	t
jtdd t
jtd}
t|
d d!d"}tj|	|dd#d$ W d   n1 sw   Y  t d%t|   t d&d'|  t d(|
 d) |
S )*z
    Generate the master findings.json file by:
    1. Extracting findings for each category
    2. Scoring and ranking findings
    3. Identifying top 10 critical findings
       u*   
🔍 Extracting findings per category...
z  Processing: z...zF-03didrN   u       → Found z	 findingsc                 S   s   h | ]}|d  qS )r%   r   r^   r   r   r    	<setcomp>       z)generate_findings_json.<locals>.<setcomp>rG   recurrence_count   r[   )HighMediumLowseverityr   scorec                 S   s   | d S )Nr   r   )xr   r   r    <lambda>   s    z(generate_findings_json.<locals>.<lambda>T)keyreversec                 S   s   g | ]}|d  qS )r|   r   )r_   rv   r   r   r    ra      r~   z*generate_findings_json.<locals>.<listcomp>N
   )run_idinput_foldertotal_findingsr]   
top_10_idsrU   zfindings.jsonrW   rX   rY   F)r\   ensure_asciiu   
✅ Total findings: u   ✅ Top 10 critical findings: z, u   ✅ findings.json saved at: 
)rn   r
   itemsry   r5   lenr4   sortr   r   rd   rg   r	   re   r6   rh   ri   rj   )all_findingsfidrN   r   r]   rv   doc_set	sev_scorer   findings_dataout_pathr   r   r    generate_findings_json   sD   
r   __main__u   ✅ findings.json generated at )"ri   rd   typingr   r   numpyr   openair   
rag.configr   r   r   r   r	   r
   r   r   r   r   rag.vectorstorer   r   r=   ndarrayr!   rF   boolrM   ry   r   __name__re   rn   r   r   r   r    <module>   s"   0
1`5