def insert_table_to_ppt(table_file,slide_n,table_name,left_h,top_h,width_h,left_t,top_t,width_t,height_t,font_size,table_header,output_ppt_file,print_row_num,table_column_width,table_max_rows_per_slide):
    """Render a tab-separated table file as one or more tables in a pptx file.

    The table is read with pandas, normalized to ``table_header`` and split
    into chunks of at most ``table_max_rows_per_slide`` data rows. If all rows
    fit into one chunk the table is drawn on the existing slide ``slide_n``
    (1-based), otherwise every chunk is drawn on a newly appended slide.

    Parameters:
        table_file: path of the tab-separated input file (first line = header).
        slide_n: 1-based index of the existing slide used for a single table.
        table_name: title written above each table.
        left_h, top_h, width_h: position/width (inches) of the title textbox.
        left_t, top_t, width_t, height_t: position/size (inches) of the table.
        font_size: font size (pt) of every table cell.
        table_header: expected column names; defines the number of table columns.
        output_ppt_file: presentation that is opened, modified and saved.
        print_row_num: forwarded to ``pronto.add_table_name``.
        table_column_width: optional list of column widths (inches); only
            applied when it has exactly one entry per table column.
        table_max_rows_per_slide: maximum data rows per slide; a falsy or
            non-positive value means "no limit".

    Returns:
        The number of data rows in the table, or None if the file is empty.
    """

    # load table data
    try:
        table_data = pandas.read_csv(table_file, sep='\t')
    except pandas.errors.EmptyDataError:
        logging.warning("{} is empty".format(table_file))
        return

    # add empty columns for missing header columns and move additional columns to the right
    table_data = pronto.normalize_column_index(table_data, table_header)

    # round floats to 2 decimal places
    table_data = pronto.set_column_to_2_decimals(table_data, "AF_tumor_DNA")

    # determine column and row number
    cols = len(table_header)
    rows = len(table_data)

    # how many slides are required; a header-only file still gets one
    # (header-only) table and must not end up in a division by zero
    if table_max_rows_per_slide and table_max_rows_per_slide > 0:
        rows_per_slide = table_max_rows_per_slide
    else:
        rows_per_slide = max(rows, 1)
    total_slides_needed = max(1, math.ceil(rows / rows_per_slide))

    # table_column_width is optional (None/empty) and only usable if complete
    use_column_width = bool(table_column_width) and len(table_column_width) == cols

    # Add data to ppt
    ppt = Presentation(output_ppt_file)
    for slide_idx in range(total_slides_needed):
        current_slide_data = pronto.get_slide_table_data(table_data, slide_idx, rows_per_slide)
        if not current_slide_data:
            # no data rows at all: fall back to a table that only shows the header
            current_slide_data = [table_data.columns.tolist()]

        if total_slides_needed == 1:
            shapes = ppt.slides[slide_n - 1].shapes
        else:
            shapes = ppt.slides.add_slide(ppt.slide_layouts[6]).shapes

        # create new table on slide (header row is part of current_slide_data)
        table_rows = len(current_slide_data)
        table = shapes.add_table(table_rows, cols, Inches(left_t), Inches(top_t), Inches(width_t), Inches(height_t)).table

        # if table_column_width is provided, set the column width
        if use_column_width:
            for col_idx, col_width in enumerate(table_column_width):
                table.columns[col_idx].width = Inches(col_width)

        # fill in the table data and set font size
        for row_idx, row in enumerate(table.rows):
            for col_idx, cell in enumerate(row.cells):
                value = current_slide_data[row_idx][col_idx]
                # python-pptx only accepts text; empty cells arrive as NaN from pandas
                cell.text = "" if pandas.isna(value) else str(value)
                cell.text_frame.paragraphs[0].font.size = Pt(font_size)

        # add table title
        pronto.add_table_name(shapes, table_name, left_h, top_h, width_h, 0.25, 8, print_row_num, slide_idx, total_slides_needed, rows)

    ppt.save(output_ppt_file)
    return rows
# normalize dataframe to expected column indices
def normalize_column_index(df: pandas.DataFrame, exp_col_idx: list):
    """Return a copy of ``df`` whose columns start with ``exp_col_idx``.

    Expected columns missing from ``df`` are added and filled with ``' '``.
    Columns of ``df`` that are not expected are kept and moved to the right,
    preserving their original relative order. ``df`` itself is not modified.
    """
    exp_col_idx = list(exp_col_idx)
    expected = set(exp_col_idx)
    # keep the original order of additional columns (a set difference would
    # make the column order depend on string hashing)
    add_col_idx = [col for col in df.columns if col not in expected]
    # reindex builds a new frame, so the caller's dataframe stays untouched
    return df.reindex(columns=exp_col_idx + add_col_idx, fill_value=' ')


# set dataframe column format to 2 decimal points if float type
def set_column_to_2_decimals(df: pandas.DataFrame, col_name: str):
    """Format float column ``col_name`` as strings with 2 decimal places.

    Non-float columns are returned unchanged; a missing column is only logged.
    Missing values (NaN) are left as they are instead of becoming ``'nan'``.
    The formatting is applied to a copy, ``df`` itself is not modified.
    """
    if col_name not in df.columns:
        logging.info("Column {} not found in dataframe".format(col_name))
        return df
    if pandas.api.types.is_float_dtype(df[col_name]):
        df = df.copy()
        df[col_name] = df[col_name].map('{:.2f}'.format, na_action='ignore')
    return df


# get data fitting on one slide based on slide index and max rows per slide
def get_slide_table_data(df: pandas.DataFrame, slide_idx: int, max_rows: int):
    """Return header plus the data rows of slide ``slide_idx`` as list of lists.

    The first inner list holds the column names, followed by at most
    ``max_rows`` data rows. An empty list is returned if the slide index
    lies beyond the available rows.
    """
    start = slide_idx * max_rows
    if start >= len(df):
        return []
    # slice first so only the rows of this slide are converted; the object
    # cast keeps integer columns from being upcast to float in all-numeric frames
    body = df.iloc[start:start + max_rows].astype(object).values.tolist()
    return [df.columns.tolist()] + body


# add constructed table name to slide and format the textbox
def add_table_name(shapes: "pptx.shapes.shapetree.SlideShapes", table_name: str, left: float, top: float, width: float, height: float, font_size: float, print_row_num: bool, slide_idx: int, total_slides: int, rows: int):
    """Add a bold, centered textbox containing the table title to ``shapes``.

    The title is ``table_name``; if ``print_row_num`` is set, `` (N=<rows>)``
    is appended, extended by ``, Page <slide_idx + 1>/<total_slides>`` when the
    table spans more than one slide. Position and size are given in inches,
    ``font_size`` in points.
    """

    # add textbox to slide
    textbox = shapes.add_textbox(pptx.util.Inches(left), pptx.util.Inches(top), pptx.util.Inches(width), pptx.util.Inches(height))
    paragraph = textbox.text_frame.paragraphs[0]

    # construct table name with optional row number and slide count
    page_info = ", Page {}/{}".format(slide_idx + 1, total_slides) if total_slides > 1 else ''
    row_info = " (N={}{})".format(rows, page_info) if print_row_num else ''
    paragraph.text = "{}{}".format(table_name, row_info)

    # font formatting and placement
    paragraph.font.size = pptx.util.Pt(font_size)
    paragraph.font.bold = True
    paragraph.alignment = pptx.enum.text.PP_ALIGN.CENTER
# normalize_column_index: missing expected columns are added (filled with ' '),
# unexpected columns are moved to the right
@pytest.mark.parametrize(
    "inputs, exception, want",
    [
        (
            (
                pandas.DataFrame({"one": [1, 2], "two": [3, 4], "three": [5, 6], "four": [7, 8]}),
                ["one", "two", "three", "four"],
            ),
            does_not_raise(),
            pandas.DataFrame({"one": [1, 2], "two": [3, 4], "three": [5, 6], "four": [7, 8]}),
        ),
        (
            (
                pandas.DataFrame({"one": [1, 2], "two": [3, 4], "four": [7, 8]}),
                ["one", "two", "three", "four"],
            ),
            does_not_raise(),
            pandas.DataFrame({"one": [1, 2], "two": [3, 4], "three": [' ', ' '], "four": [7, 8]}),
        ),
        (
            (
                pandas.DataFrame({"one": [1, 2], "two": [3, 4], "three": [5, 6], "four": [7, 8]}),
                ["two", "three", "four"],
            ),
            does_not_raise(),
            pandas.DataFrame({"two": [3, 4], "three": [5, 6], "four": [7, 8], "one": [1, 2]}),
        ),
        (
            (
                pandas.DataFrame({"one": [1, 2], "two": [3, 4], "four": [7, 8], "five": [9, 10]}),
                ["one", "two", "three", "four"],
            ),
            does_not_raise(),
            pandas.DataFrame({"one": [1, 2], "two": [3, 4], "three": [' ', ' '], "four": [7, 8], "five": [9, 10]}),
        ),
    ]
)
def test_normalize_column_index(inputs, exception, want):
    with exception:
        get = pronto.pronto.normalize_column_index(*inputs)
        assert want.equals(get)


# set_column_to_2_decimals: only float columns are reformatted
@pytest.mark.parametrize(
    "inputs, exception, want",
    [
        (
            (
                pandas.DataFrame({"one": [1, 2], "two": [3.333, 4.444]}),
                "two",
            ),
            does_not_raise(),
            pandas.DataFrame({"one": [1, 2], "two": ["3.33", "4.44"]}),
        ),
        (
            (
                pandas.DataFrame({"one": [1, 2], "two": ['21.0%', '0.5%']}),
                "two",
            ),
            does_not_raise(),
            pandas.DataFrame({"one": [1, 2], "two": ["21.0%", "0.5%"]}),
        ),
        (
            (
                pandas.DataFrame({"one": [1, 2]}),
                "two",
            ),
            does_not_raise(),
            pandas.DataFrame({"one": [1, 2]}),
        ),
    ]
)
def test_set_column_to_2_decimals(inputs, exception, want):
    with exception:
        get = pronto.pronto.set_column_to_2_decimals(*inputs)
        assert want.equals(get)


def list_of_lists_equal(list1, list2):
    """Return True if both lists of lists have equal length and equal rows."""
    # list equality already compares length and every row element-wise
    return list1 == list2


# get_slide_table_data: header row plus the rows belonging to the slide index
@pytest.mark.parametrize(
    "inputs, exception, want",
    [
        (
            (
                pandas.DataFrame({"one": [1, 2], "two": [3, 4]}),
                0,
                3,
            ),
            does_not_raise(),
            [
                ["one", "two"],
                [1, 3],
                [2, 4],
            ],
        ),
        (
            (
                pandas.DataFrame({"one": [1, 2, 3, 4], "two": [5, 6, 7, 8]}),
                1,
                2,
            ),
            does_not_raise(),
            [
                ["one", "two"],
                [3, 7],
                [4, 8],
            ],
        ),
        (
            (
                pandas.DataFrame({"one": [1, 2, 3], "two": [5, 6, 7]}),
                1,
                2,
            ),
            does_not_raise(),
            [
                ["one", "two"],
                [3, 7],
            ],
        ),
        (
            (
                pandas.DataFrame({"one": [1, 2, 3], "two": [5, 6, 7]}),
                2,
                2,
            ),
            does_not_raise(),
            [],
        ),
    ]
)
def test_get_slide_table_data(inputs, exception, want):
    with exception:
        get = pronto.pronto.get_slide_table_data(*inputs)
        assert list_of_lists_equal(get, want)


def check_shape(shape, want_left, want_top, want_width, want_height):
    """Assert position and size of a shape; expected values are in inches."""
    assert shape.left == pptx.util.Inches(want_left)
    assert shape.top == pptx.util.Inches(want_top)
    assert shape.width == pptx.util.Inches(want_width)
    assert shape.height == pptx.util.Inches(want_height)


def check_paragraph(paragraph, want_text, want_font_size, want_bold, want_alignment):
    """Assert text, font size (pt), bold flag and alignment of a paragraph."""
    assert paragraph.text == want_text
    assert paragraph.font.size.pt == want_font_size
    assert paragraph.font.bold == want_bold
    assert paragraph.alignment == want_alignment


# add_table_name: title text depends on print_row_num and the slide count
@pytest.mark.parametrize(
    "inputs, exception, want_shape, want_paragraph",
    [
        (
            ('Test', 0.5, 0.5, 4, 1, 12, True, 0, 3, 4),
            does_not_raise(),
            (0.5, 0.5, 4, 1),
            ('Test (N=4, Page 1/3)', 12.0, True, pptx.enum.text.PP_ALIGN.CENTER),
        ),
        (
            ('Test', 0.5, 0.5, 4, 1, 12, True, 0, 1, 4),
            does_not_raise(),
            (0.5, 0.5, 4, 1),
            ('Test (N=4)', 12.0, True, pptx.enum.text.PP_ALIGN.CENTER),
        ),
        (
            ('Test', 0.5, 0.5, 4, 1, 12, False, 0, 3, 4),
            does_not_raise(),
            (0.5, 0.5, 4, 1),
            ('Test', 12.0, True, pptx.enum.text.PP_ALIGN.CENTER),
        ),
    ]
)
def test_add_table_name(inputs, exception, want_shape, want_paragraph):
    with exception:
        # slide and layout must come from the same presentation
        presentation = pptx.Presentation()
        shapes = presentation.slides.add_slide(presentation.slide_layouts[6]).shapes
        pronto.pronto.add_table_name(shapes, *inputs)
        check_shape(shapes[0], *want_shape)
        check_paragraph(shapes[0].text_frame.paragraphs[0], *want_paragraph)
python-docx==1.1.0 python-pptx==0.6.23 xlrd==2.0.1 xlutils==2.0.0 -pdf2image==1.17.0