In [1]:
!pip3 install transformers -q
!pip3 install sentencepiece -q

[K     |████████████████████████████████| 4.9 MB 20.5 MB/s 
[K     |████████████████████████████████| 6.6 MB 71.9 MB/s 
[K     |████████████████████████████████| 120 kB 62.8 MB/s 
[K     |████████████████████████████████| 1.3 MB 35.3 MB/s 
[?25h

In [2]:
import json
import os
import time
from pathlib import Path

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

In [3]:
!wget https://raw.githubusercontent.com/realsarm/ReportQL/main/data/trialReport/ReportQL/Schemas/organs/simpleSchema.json

--2022-09-15 19:47:07--  https://raw.githubusercontent.com/realsarm/ReportQL/main/data/trialReport/ReportQL/Schemas/organs/simpleSchema.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 20200 (20K) [text/plain]
Saving to: ‘simpleSchema.json’


2022-09-15 19:47:08 (94.2 MB/s) - ‘simpleSchema.json’ saved [20200/20200]



In [4]:
# Settings shared by the schema-loading, model-loading and inference cells.
configs = {
    # Path of the downloaded schema JSON file.
    "schema_path": "simpleSchema.json",
    # Used both as the tokenizer max_length and as the generation max_length.
    "max_seq_length": 350,
    # Lower bound on the generated sequence length.
    "min_length": 250,
    # Beam-search width.
    "num_beams": 5,
    "repetition_penalty": 1.0,
}

In [5]:
# Example report fed to the model; written one sentence per literal for
# readability and lower-cased as a whole.
input_ = (
    'Liver is normal in size and with normal parenchymal echogenicity with few hypoechoic cystic structures largest one measuring about 42*12 mm mainly in left lobe of liver possibility of hydatid cysts. '
    'CBD is dilated with 12mm in diameter. '
    'GB is distended containing few stones up to 10 mm without wall thickening or pericholecystic fluid. '
    'Spleen is normal in size and parenchymal echo with no sol. '
    'Both kidneys are small in size with increased cortical parenchymal echogenicity with no sign of stone, stasis or perinephric collection. '
    'Ureters are not dilated. '
    'Urinary bladder is normal, with no stone or wall thickening. '
    'Moderate free fluid is seen in abdominopelvic cavity.'
).lower()

# Names of the schema types that should be included in the prompt.
organs = [
    'liver',
    'cbd',
    'gb',
    'pancreas',
    'bladder',
    'prostate',
    'spleen',
    'left kidney',
    'right kidney',
]
# Load the schema JSON. The encoding is given explicitly so the file is decoded
# the same way on every platform instead of depending on the locale default.
with Path(configs["schema_path"]).open(mode="r", encoding="utf-8") as sch:
    schema = json.load(sch)

# Build one prompt fragment per selected organ, in the order the types appear
# in the schema file. Each fragment has the form
#   " <t> NAME { field1 field2 ... } </t>"
# and only types whose name is listed in `organs` are kept.
report_schema = [
    f' <t> {organ["name"]} ' + "{ " + " ".join(field["name"] for field in organ["fields"]) + " } </t>"
    for organ in schema["types"]
    if organ["name"] in organs
]

In [6]:
# Use the GPU when one is available, otherwise fall back to the CPU so that
# this cell does not crash on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer and model both come from the same pretrained checkpoint.
tokenizer = T5Tokenizer.from_pretrained("sarme/ReportQL-base")

# The generation settings are handed to from_pretrained here and are passed
# again explicitly to model.generate() in the inference cell.
# NOTE(review): presumably these only set defaults on the loaded config -
# confirm against the transformers documentation.
model = T5ForConditionalGeneration.from_pretrained(
    "sarme/ReportQL-base",
    num_beams=configs['num_beams'],
    max_length=configs['max_seq_length'],
    min_length=configs['min_length'],
    repetition_penalty=configs['repetition_penalty']
).to(device)

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/145 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.94k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/800 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [7]:
# Run the inputs on whichever device the model lives on rather than assuming
# that CUDA is present.
model_device = model.device
timing_on_gpu = model_device.type == "cuda"

# CUDA work is queued asynchronously; synchronising before reading the clock
# keeps earlier GPU work out of the measurement and makes sure the generation
# has actually finished when the timer stops.
if timing_on_gpu:
    torch.cuda.synchronize(model_device)
t1 = time.perf_counter()

# Prompt layout: "r: <report> s:{ <schema fragments> }".
schema_text = "{ " + "".join(report_schema) + " }"
prompt = f"r: {input_.strip().lower()} s:{schema_text}"

# NOTE(review): only a single sequence is passed, so there is no second segment
# for truncation="only_second" to shorten - confirm whether prompts longer than
# max_length are really truncated; truncation=True may be what was intended.
source_encoding = tokenizer(
    [prompt],
    max_length=configs['max_seq_length'],
    padding="max_length",
    truncation="only_second",
    return_attention_mask=True,
    add_special_tokens=True,
    return_tensors="pt",
)

# source_encoding itself is left untouched so later cells can still decode it;
# only the tensors handed to generate() are moved to the model's device.
generated_ids = model.generate(
    input_ids=source_encoding["input_ids"].to(model_device),
    attention_mask=source_encoding["attention_mask"].to(model_device),
    num_beams=configs['num_beams'],
    max_length=configs['max_seq_length'],
    min_length=configs['min_length'],
    repetition_penalty=configs['repetition_penalty'],
    early_stopping=True,
)

if timing_on_gpu:
    torch.cuda.synchronize(model_device)
t2 = time.perf_counter()
print('time taken to run:',t2-t1)

time taken to run: 12.452272198000003


In [8]:
# Decode the encoded prompt back to text to inspect what was fed to the model.
decoded_inputs = tokenizer.batch_decode(source_encoding["input_ids"])
print(decoded_inputs)

['r: liver is normal in size and with normal parenchymal echogenicity with few hypoechoic cystic structures largest one measuring about 42*12 mm mainly in left lobe of liver possibility of hydatid cysts. cbd is dilated with 12mm in diameter. gb is distended containing few stones up to 10 mm without wall thickening or pericholecystic fluid. spleen is normal in size and parenchymal echo with no sol. both kidneys are small in size with increased cortical parenchymal echogenicity with no sign of stone, stasis or perinephric collection. ureters are not dilated. urinary bladder is normal, with no stone or wall thickening. moderate free fluid is seen in abdominopelvic cavity. s: { <t> liver { size small size large size echogenicity increased echogenicity fatty echogenicity lesion lesion count lesion echogenicity lesion component lesion size lesion border lesion pos lesion suggestive bile duct } </t> <t> left kidney { position transplanted size small size large size shape cortical parenchymal 

In [9]:
# Turn the generated token ids into text, leaving out special tokens and
# cleaning up tokenization spaces.
preds = tokenizer.batch_decode(
    generated_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True,
)

print(preds)

['{ abdomino pelvic cavity { free fluid { yes } free fluid severity { moderate } } left ureter { dilation { no } } right ureter { dilation { no } } bladder { distension { normal } stone { no } wall thickening { no } } spleen { size { normal } echogenicity { normal } } cbd { dilation { yes } largest diameter { <size> } } gb { seen { yes } distention { yes } stone { yes } stone quantity { few } stone size { <size> } wall thickening { no } pericholecystic fluid { no } } left kidney { size { small } cortical parenchymal { increased } stone { no } stasis severity { no } perinehric collection { no } } right kidney { size { small } cortical parenchymal { increased } stone { no } stasis severity { no } perinehric collection { no } } liver { size { normal } echogenicity { normal } lesion { yes } lesion count { a few } lesion echogenicity { hypoechoic } } }']
