PDB Parser Code

#!/usr/bin/perl ############################################################################# # File Name: pdbparser.pl # Contents: PDBParser will convert a Bioinformatics's PDB file # format to xml format, based on a supplied DTD. # # Author: Catalina Price # Creation Date: 20040308 # Code Origination: Catalina Price # Origination Date: 20040308 # Last Modified by: Catalina Price # Modification Date: 20040315 # Project: URI(tm) Universal Research Interchange Format # Legal: Copyright (C) 2004, URI, Bioinformatics, CSC592 ############################################################################# # Current draw backs in reading a DTD: # 1. The pdbparser does not handle NameSpaces # 2. The pdbparser does not handle the shorthand of the Element Tag, Ex. <GREETING/> # 3. The pdbparser does not handle multiple attributes # placed in a single !ATTLIST tag. ########################################################## # Setup local system syntax variables. ########################################################## use strict; use Cwd; #use File::Copy; #use File::Path; use IO::Handle; # NT appears to not be autoflushing STDERR->autoflush(1); # STDERR, so we do it explicitly. ########################################################## # Prototypes. ########################################################## sub PrintVersion; sub GetFileName; sub ReadDTDFile; sub BuildDTDTree; sub ReviewDTDTreeForErrors; sub BuildNode; sub PrintElementReport; sub PDB1LetterSeqs; ######################################### # Tools Section ######################################### sub GetUserResponse; sub PrintFormatList; ########################################################## # Internal Global variables and flags. READ ONLY ########################################################## # File and Directory Names ########################## #my $FileName = "DTD_URI.dtd"; #my $FileName = "test-attr.dtd"; my $FileNamePDB = ""; my $FileNameDTD = ""; my $g_PDBParserVersion = "2.0-1";#PDBParser Version control number. my %g_DeclarationTypes = ( 'Element' => '<!ELEMENT', #Syntax: <!ELEMENT element-name category> or <!ELEMENT element-name (element-content)> 'Attribute' => '<!ATTLIST', #Syntax: <!ATTLIST element-name attribute-name attribute-type default-value> 'Entity' => '<!ENTITY', #Syntax: <!ENTITY entity-name "entity-value"> ); ######################### # Hash keys and Values. ######################### my $PDB_EXT = "ent"; my $DTD_EXT = "dtd"; my $DTD_LINENUM = "DTDLine:"; ## Hash Entry Types - %ElementTreeNode my $ELEMENT_NAME = 'ElementName'; my $ELEMENT_CATAGORY = 'ElementCatagory'; my $ELEMENT_LINE_NUM = 'ElementLineNum'; my $ref_PARENTNODE = 'refParentNode'; my $ref_CHILDNODE_LIST = 'refChildNodeList'; my $ref_ATTRIBUTE_LIST = 'refAttributeList'; ## Element Catagory (s) my $ELEM_CAT_LIST = 'LIST'; my $ELEM_CAT_PND_PCDATA = '#PCDATA'; my $ELEM_CAT_ANY = 'ANY'; my $ELEM_CAT_EMPTY = 'EMPTY'; ## ATTRIB Value types Explanation my $ATTRIB_TYPE_CDATA = 'CDATA'; #The value is character data. my $ATTRIB_TYPE_ENUM = 'ENUM'; #The value must be one from an enumerated list. my $ATTRIB_TYPE_ID = 'ID'; #The value is a unique id. my $ATTRIB_TYPE_IDREF = 'IDREF'; #The value is the id of another element. my $ATTRIB_TYPE_IDREFS = 'IDREFS'; #The value is a list of other ids. my $ATTRIB_TYPE_NMTOKEN = 'NMTOKEN'; #The value is a valid XML name. my $ATTRIB_TYPE_NMTOKENS = 'NMTOKENS'; #The value is a list of valid XML names. my $ATTRIB_TYPE_ENTITY = 'ENTITY'; #The value is an entity. my $ATTRIB_TYPE_ENTITIES = 'ENTITIES'; #The value is a list of entities. my $ATTRIB_TYPE_NOTATION = 'NOTATION'; #The value is a name of a notation. my $ATTRIB_TYPE_XML = 'xml:'; #The value is a predefined xml value ## ATTRIB Default Value types my $ATTRIB_DEFVALUE_REQUIRED = '#REQUIRED';#The attribute value must be included in the element my $ATTRIB_DEFVALUE_IMPLIED = '#IMPLIED'; #The attribute does not have to be included my $ATTRIB_DEFVALUE_FIXED = '#FIXED'; #value ??????????? The attribute value is fixed ## ERROR Types my $g_ERROR_DTD_NOERROR = 0; my $g_ERROR_DTD_READ = 1; my $g_ERROR_DTD_BUILD = 2; my $g_DTDTreeRoot = "ROOT"; my $g_RootElement = ""; my $g_RootLineNum = 0; my @g_ElementDeclarationList = (); my @g_ElementNameDeclarationList = (); my @g_RealizedAssociatedElementNameList = (); my $g_RealizedAssociatedElementNameCount = 0; my $g_ProclaimedAssociatedElementCount = 0; my @g_AttributeDeclarationList = (); my @g_AttributeNameDeclarationList = (); #Not really used.!!!!!!! my $g_AttributesAssociatedWithElementCount = 0; my @g_AttributesAssociatedWithElements = (); my @g_EntityNameList = (); my @g_EntityDeclarationList = (); ########################################################## # Debugging variable. # 1 = TRUE Sends all output to sreen. # 0 = FALSE Sends all output to logfile ########################################################## my $bRunTimeDebug = 0; ########################################################## exit(!MainFunction()); ######################################################### # Main() ######################################################### sub MainFunction{ my %DTDTree = ($ELEMENT_NAME => '', $ELEMENT_CATAGORY => '', $ELEMENT_LINE_NUM => '', $ref_PARENTNODE => '', $ref_CHILDNODE_LIST => '', $ref_ATTRIBUTE_LIST => ''); my @EmptyAttributeList = (); my $StatusResult = ""; my @StatusResultMsg = (); my $ReturnStatus = ""; PrintVersion(); $FileNamePDB = GetFileName($PDB_EXT); $FileNameDTD = GetFileName($DTD_EXT); $DTDTree{$ref_ATTRIBUTE_LIST} = \@EmptyAttributeList; ($StatusResult, @StatusResultMsg) = ReadDTDFile($FileNameDTD); if ( 0 == $StatusResult){ $ReturnStatus = $StatusResult; ReviewDTDTreeForErrors($g_ERROR_DTD_READ, @StatusResultMsg); STDERR->print("PDB Parser has exited due to DTD errors.\n"); return($ReturnStatus); }#if $DTDTree{$ELEMENT_NAME} = $g_RootElement; $DTDTree{$ELEMENT_LINE_NUM} = $g_RootLineNum; $DTDTree{$ref_PARENTNODE} = \$g_DTDTreeRoot; $g_ProclaimedAssociatedElementCount++; ($StatusResult, @StatusResultMsg) = BuildDTDTree(\%DTDTree); if ( 0 == $StatusResult){ $ReturnStatus = $StatusResult; ReviewDTDTreeForErrors($g_ERROR_DTD_BUILD, @StatusResultMsg); STDERR->print("PDB Parser has exited due to DTD errors.\n"); return($ReturnStatus); }#if if (0 == ($StatusResult = ReviewDTDTreeForErrors($g_ERROR_DTD_NOERROR, @StatusResultMsg))){ STDERR->print("PDB Parser has exited due to DTD errors.\n"); return($ReturnStatus = $StatusResult); }#if PDB1LetterSeqs($FileNamePDB); #..... # print("\nList of Elements.\n"); # PrintFormatList(80, "Elem: ", @g_ElementNameDeclarationList); # print("\nList of Attributes.\n"); # PrintFormatList(80, "Attr: ", @g_AttributeNameDeclarationList); # print("\nList of Entities.\n"); # PrintFormatList(80, "Attr: ", @g_EntityNameList); # PrintFormatList(80, "Raw Element List: ", @g_ElementDeclarationList); # my $tmpOutput = ""; # foreach $tmpOutput (@g_ElementDeclarationList){ # print("Raw Element List: $tmpOutput\n"); # }#foreach # print("\nList of Attributes. @g_AttributeDeclarationList\n"); # print("\nList of Attributes. @g_EntityDeclarationList\n"); STDERR->print("\nPDB Parser has completed with Success.\n"); $ReturnStatus = !$StatusResult; return($ReturnStatus); }# MainFunction() ############################################################################ # FUNCTION: PrintVersion() # PURPOSE: Prints Version information about PDBParser. # PARAMETERS: None. # RETURN: None. ############################################################################ sub PrintVersion{ my $FormatStr = ""; if ($bRunTimeDebug){STDERR->print("\nDEBUG INFO: Entering function PrintVersion()\n");} $FormatStr = sprintf ("%-10s", $g_PDBParserVersion); STDERR->print("****************************************************************************************\n"); STDERR->print("*** PDBParser ***\n"); STDERR->print("*** Version: $FormatStr ***\n"); STDERR->print("*** ***\n"); STDERR->print("*** PDBParser will convert a Bioinformatics's PDB file format to xml format, based ***\n"); STDERR->print("*** on a supplied DTD. ***\n"); STDERR->print("*** ***\n"); STDERR->print("*** Project: URI(tm) Universal Research Interchange Format ***\n"); STDERR->print("*** ***\n"); STDERR->print("*** Legal: Copyright (C) 2004, URI, Bioinformatics, CSC592 ***\n"); STDERR->print("*** ***\n"); STDERR->print("****************************************************************************************\n"); if ($bRunTimeDebug){STDERR->print("\nDEBUG INFO: Exiting function PrintVersion()\n");} }#PrintVersion() ################################################################################################ ########### ########### ########### DTD Section ########### ########### ########### ################################################################################################ ############################################################################ # FUNCTION: ReadDTDFile() # PURPOSE: Reads the DTD data structure. # PARAMETERS: None. # # RETURN: TRUE for success, otherwise FALSE. ############################################################################ sub ReadDTDFile{ my $FileName = ""; my $Line = ""; my $LineCopy = ""; my $TagType = ""; my $ElementName = ""; my $ElementDeclaration = ""; my $TmpElementName = ""; my $AttributeName = ""; my $AttributeDeclaration = ""; my $TmpAttributeName = ""; my @TmpAttributeList = (); my $CurrentAttributeLine = ""; my $EntityName = ""; my $EntityDeclaration = ""; my $EntityName = ""; my $EntityValue = ""; my $bLookingForFirstElement = 1; #Must be True. my $bFoundDuplicate = 0; #Must be False. my $ReturnStatus = 1; #Must be True. my @returnStatusMsg = (); my $LineCount = 0; #Must be zero. my $LineNum = 0; my $TmpLine = ""; ($FileName) = @_; if ($bRunTimeDebug){STDERR->print("\nDEBUG INFO: Entering function ReadDTDFile()\n");} if (open(FILE,"$FileName")){ while ($Line = <FILE>){ $Line =~ s/\s+$//; $LineCount++; SWITCH: { if ( $Line =~ /^$g_DeclarationTypes{Element}/ ){ ## Remove Tag Information $ElementDeclaration = $Line; $ElementDeclaration =~ s/^$g_DeclarationTypes{Element}//; $ElementDeclaration =~ s/^\s+//; ($ElementDeclaration)=split(/\>/,$ElementDeclaration); $ElementDeclaration =~ s/\s+$//; $ElementDeclaration =~ s/\s+/ /g; ## Get Element Name $ElementName = $ElementDeclaration; $ElementDeclaration = "$ElementDeclaration $DTD_LINENUM $LineCount"; ($ElementName) = split(/\(/,$ElementName); ($ElementName) = split(/\s+/,$ElementName); $ElementName =~ s/\s+$//; ## Ensure no duplicates if( !(grep /^$ElementName$/, @g_ElementNameDeclarationList) ){ if ($bRunTimeDebug){print("FOUND NEW ELEMENT: $ElementDeclaration\n");} push @g_ElementNameDeclarationList, $ElementName; push @g_ElementDeclarationList, $ElementDeclaration; if ($bLookingForFirstElement){ $g_RootElement = $ElementName; $g_RootLineNum = $LineCount; $bLookingForFirstElement = 0; }#if }else{ ($TmpLine, $LineNum) = split(/$DTD_LINENUM\s+/,$ElementDeclaration); push @returnStatusMsg, "Duplicate Element ($LineNum): $TmpLine\n"; $ReturnStatus = 0; }#else last SWITCH; }#if if ( $Line =~ /^$g_DeclarationTypes{Attribute}/ ){ ## Remove Tag Information $AttributeDeclaration = $Line; $AttributeDeclaration =~ s/^$g_DeclarationTypes{Attribute}//; $AttributeDeclaration =~ s/^\s+//; ($AttributeDeclaration)=split(/\>/,$AttributeDeclaration); $AttributeDeclaration =~ s/\s+$//; $AttributeDeclaration =~ s/\s+/ /g; ## Get Element and Attribute Name ($ElementName, $AttributeName)=split(/\s+/,$AttributeDeclaration); $AttributeDeclaration = "$AttributeDeclaration $DTD_LINENUM $LineCount"; ## Ensure no duplicates $bFoundDuplicate = 0; @TmpAttributeList = grep /^$ElementName/, @g_AttributeDeclarationList; foreach $CurrentAttributeLine (@TmpAttributeList) { ($TmpElementName, $TmpAttributeName) = split(/\s+/,$CurrentAttributeLine); $TmpElementName =~ s/^\s+//; $TmpElementName =~ s/\s+$//; if (($ElementName eq $TmpElementName) && ($AttributeName eq $TmpAttributeName)){ ($TmpLine, $LineNum) = split(/$DTD_LINENUM\s+/,$CurrentAttributeLine); push @returnStatusMsg, "Duplicate Attribute ($LineNum): $TmpLine\n"; $ReturnStatus = 0; $bFoundDuplicate = 1; }#if }#foreach if(!($bFoundDuplicate)){ push @g_AttributeNameDeclarationList, $AttributeName; if ($bRunTimeDebug){print("FOUND NEW ATTRIBUTE: \"$AttributeDeclaration\"\n");} push @g_AttributeDeclarationList, $AttributeDeclaration; }#if last SWITCH; }#if if ( $Line =~ /^$g_DeclarationTypes{Entity}/ ){ ## Remove Tag Information $EntityDeclaration = $Line; $EntityDeclaration =~ s/^$g_DeclarationTypes{Entity}//; $EntityDeclaration =~ s/^\s+//; ($EntityDeclaration)=split(/\>/,$EntityDeclaration); $EntityDeclaration =~ s/\s+$//; $EntityDeclaration =~ s/\s+/ /g; ## Get Entity Name ($EntityName)=split(/\s+/,$EntityDeclaration); $EntityDeclaration = "$EntityDeclaration $DTD_LINENUM $LineCount"; ## Ensure no duplicates if( !(grep /^$EntityName$/, @g_EntityNameList)){ push @g_EntityNameList, $EntityName; if ($bRunTimeDebug){print("FOUND NEW ENTITY: \"$EntityDeclaration\"\n");} push @g_EntityDeclarationList, $EntityDeclaration; }else{ ($TmpLine, $LineNum) = split(/$DTD_LINENUM\s+/,$EntityDeclaration); push @returnStatusMsg, "Duplicate Entity ($LineNum): $TmpLine\n"; $ReturnStatus = 0; if ($bRunTimeDebug){print("ERROR INFO: Duplicate Entity: \"$EntityName\"\n");} }#else last SWITCH; }#if }#SWITCH }#while close FILE; }#if if ($bRunTimeDebug){STDERR->print("\nDEBUG INFO: Exiting function ReadDTDFile()\n");} return ($ReturnStatus, @returnStatusMsg); }#ReadDTDFile() ############################################################################ # FUNCTION: BuildDTDTree() # PURPOSE: Build a tree of the DTD. # PARAMETERS: $refCurrentElementNode - [IN] A hash reference containing a CurrentElementNode hash. # # RETURN: TRUE for success, otherwise FALSE. ############################################################################ sub BuildDTDTree{ my $refCurrentElementNode = 0; my %CurrentElementNode = []; my @SubElementDeclarationList = (); my $TmpElementLine = ""; my $TmpLine = ""; my $TmpElementName = ""; my $Junk = ""; my $ParentElementName = ""; my $ParentLineNum = ""; my $CurrentElementName = ""; my $CurrentCategory = ""; my $LineNum = ""; my @CurrentChildNodeList = (); my @TmpCurrentChildNodeList = (); my $TmpChildNode = ""; my @TmpAttributeList = (); my $TmpAttributeLine = ""; my @CurrentAttributeList = (); my $bStopSearch = 0; # Must be FALSE. my $StatusResult = 0; my @StatusResultMsg = (); my @returnStatusMsg = (); my $FormatMsg = ""; ($refCurrentElementNode) = @_; if ($bRunTimeDebug){STDERR->print("\nDEBUG INFO: Entering function BuildDTDTree()\n");} %CurrentElementNode = %{$refCurrentElementNode}; $CurrentElementName = $CurrentElementNode{$ELEMENT_NAME}; #Get the list of attributes for the current element, if any. @TmpAttributeList = grep /^$CurrentElementName/, @g_AttributeDeclarationList; foreach $TmpAttributeLine (@TmpAttributeList){ ($TmpAttributeLine, $LineNum) = split(/$DTD_LINENUM\s+/,$TmpAttributeLine); ($TmpElementName, $Junk) = split(/\s+/,$TmpAttributeLine); $TmpElementName =~ s/^\s+//; $TmpElementName =~ s/\s+$//; if ($TmpElementName eq $CurrentElementName){ $g_AttributesAssociatedWithElementCount++; if ($bRunTimeDebug){print("FOUND ATTRIBUTE: Current Attribute line: $TmpAttributeLine\n");} push @g_AttributesAssociatedWithElements, $TmpAttributeLine; $TmpAttributeLine =~ s/^$CurrentElementName//; $TmpAttributeLine =~ s/^\s+//; push @CurrentAttributeList, $TmpAttributeLine; }#if }#foreach $CurrentElementNode{$ref_ATTRIBUTE_LIST} = \@CurrentAttributeList; #Should always be only one since an integrity check has been done. @SubElementDeclarationList = grep /^$CurrentElementName/, @g_ElementDeclarationList; if ( 0 == scalar(@SubElementDeclarationList)) { $ParentElementName = ${%{$CurrentElementNode{$ref_PARENTNODE}}}{$ELEMENT_NAME}; $ParentLineNum = ${%{$CurrentElementNode{$ref_PARENTNODE}}}{$ELEMENT_LINE_NUM}; $FormatMsg = "Proclaimed Child ($ParentLineNum): Parent Element \"$ParentElementName\" " . "proclaimed a child Element \"$CurrentElementName\"\n"; push @returnStatusMsg, $FormatMsg; $ParentLineNum =~ s/\S/ /g; $FormatMsg = " $ParentLineNum but none exist in the Element Declaration list.\n"; push @returnStatusMsg, $FormatMsg; return(0, @returnStatusMsg); }else{ if ($bRunTimeDebug){print("New List of Elements lines: @SubElementDeclarationList\n");} while (($TmpElementLine = shift(@SubElementDeclarationList)) && (!$bStopSearch)){ ($TmpElementLine, $LineNum) = split(/$DTD_LINENUM\s+/,$TmpElementLine); $TmpLine = $TmpElementLine; ($TmpLine, $Junk) = split(/\(/,$TmpLine); if ($bRunTimeDebug){print("Split Line: \"$TmpLine\", \"$Junk\" \n");} ($TmpElementName, $Junk) = split(/\s+/,$TmpLine); $TmpElementName =~ s/\s+$//; if ($bRunTimeDebug){print("Split Line: \"$TmpElementName\", \"$Junk\" \n");} if ($TmpElementName eq $CurrentElementName){ $g_RealizedAssociatedElementNameCount++; $CurrentElementNode{$ELEMENT_LINE_NUM} = $LineNum; my $TmpBuffer = $TmpElementName; #???????????????? is my needed?????????? push @g_RealizedAssociatedElementNameList, $TmpBuffer; if ($bRunTimeDebug){print("FOUND ELEMENT: Current Element line: $TmpElementLine\n");} $TmpElementLine =~ s/^$CurrentElementName//; $TmpElementLine =~ s/^\s+//; $TmpElementLine =~ s/^\(//; $TmpElementLine =~ s/^\s+//; $TmpElementLine =~ s/\s+$//; $TmpElementLine =~ s/\)$//; $TmpElementLine =~ s/\s+$//; $CurrentCategory = $TmpElementLine; $bStopSearch = 1; }#if }#while }#else if ($bRunTimeDebug){STDERR->print("\nDEBUG INFO: CurrentCategory: $CurrentCategory\n");} SWITCH: { if ($CurrentCategory eq $ELEM_CAT_PND_PCDATA){ if ($bRunTimeDebug){STDERR->print("\nDEBUG INFO: Element: $CurrentElementName Catagory: $CurrentCategory\n");} $CurrentElementNode{$ELEMENT_CATAGORY} = $ELEM_CAT_PND_PCDATA; PrintElementReport(\%CurrentElementNode); last SWITCH; }#if if ($CurrentCategory eq $ELEM_CAT_ANY){ if ($bRunTimeDebug){STDERR->print("\nDEBUG INFO: Element: $CurrentElementName Catagory: $CurrentCategory\n");} $CurrentElementNode{$ELEMENT_CATAGORY} = $ELEM_CAT_ANY; PrintElementReport(\%CurrentElementNode); last SWITCH; }#if if ($CurrentCategory eq $ELEM_CAT_EMPTY){ if ($bRunTimeDebug){STDERR->print("\nDEBUG INFO: Element: $CurrentElementName Catagory: $CurrentCategory\n");} $CurrentElementNode{$ELEMENT_CATAGORY} = $ELEM_CAT_EMPTY; PrintElementReport(\%CurrentElementNode); last SWITCH; }#if default:{ if ($bRunTimeDebug){ STDERR->print("\nDEBUG INFO: Element: $CurrentElementName Catagory: $CurrentCategory\n");} $CurrentElementNode{$ELEMENT_CATAGORY} = $ELEM_CAT_LIST; (@CurrentChildNodeList) = split(/,/,$CurrentCategory); @TmpCurrentChildNodeList = @CurrentChildNodeList; @CurrentChildNodeList = (); foreach $TmpChildNode (@TmpCurrentChildNodeList) { $TmpChildNode =~ s/^\s+//; $TmpChildNode =~ s/\s+$//; push @CurrentChildNodeList, $TmpChildNode; }#foreach if ($bRunTimeDebug){STDERR->print("\nDEBUG INFO: ChildNodeList: @CurrentChildNodeList\n");} $CurrentElementNode{$ref_CHILDNODE_LIST} = \@CurrentChildNodeList; PrintElementReport(\%CurrentElementNode); foreach $TmpChildNode (@CurrentChildNodeList) { $TmpChildNode =~ s/^\s+//; $TmpChildNode =~ s/\s+$//; $TmpChildNode =~ s/\+$//; $TmpChildNode =~ s/\*$//; $TmpChildNode =~ s/\?$//; $g_ProclaimedAssociatedElementCount++; my %NewElementNode = ($ELEMENT_NAME => '', $ELEMENT_CATAGORY => '', $ELEMENT_LINE_NUM => '', $ref_PARENTNODE => '', $ref_CHILDNODE_LIST => '', $ref_ATTRIBUTE_LIST => ''); my @EmptyAttributeList = (); $NewElementNode{$ELEMENT_NAME} = $TmpChildNode; $NewElementNode{$ref_PARENTNODE} = \%CurrentElementNode; $NewElementNode{$ref_ATTRIBUTE_LIST} = \@EmptyAttributeList; ($StatusResult, @StatusResultMsg) = BuildDTDTree(\%NewElementNode); if ( 0 == $StatusResult){ push @returnStatusMsg, @StatusResultMsg; return(0, @returnStatusMsg); }#if $CurrentElementNode{$TmpChildNode} = \%NewElementNode; }#foreach last SWITCH; }#default }#SWITCH %{$refCurrentElementNode} = %CurrentElementNode; if ($bRunTimeDebug){STDERR->print("\nDEBUG INFO: Exiting function BuildDTDTree()\n");} return(1, ()); }#BuildDTDTree() ############################################################################ # FUNCTION: ReviewDTDTreeForErrors() # PURPOSE: Ensure all declared Elements, Attribute, and Entities have # been incorporated into the DTD tree. # PARAMETERS: $ - [IN] # # RETURN: None. ############################################################################ sub ReviewDTDTreeForErrors{ my $ref_ErrorMsg = 0; my $ElementDeclarationCount = 0; my $TmpElementLine = ""; my $TmpAssociatedAttributeLine = ""; my $TmpElementName = 0; my $AttributeDeclarationCount= 0; my $TmpAttributeName = ""; my $AttributeAssociatedCount = 0; my $TmpAttributeLine = 0; my $OrphanElementCount = 0; my $OrphanAttributeCount = 0; my $EntityCount = 0; my $TmpElementName = ""; my $TmpAssocElementName = ""; my $TmpAssocAttributeName = ""; my $Junk = 0; my @AttributeWithUndeclaredElementList = (); my @AttributeWithUnAssociatedAncestorList = (); my @SubAssociatedAttributeList = (); my @TmpElementList = (); my @TmpElementNameList = (); my @TmpAttributeList = (); my $TmpLine = ""; my $bStopPrinting = 0; #MUST BE FALSE my $bFoundInList = 0; my $PrintCount = 0; my $ErrorStatus = 0; my @Msg = (); my $MsgLine = ""; my $LineNum = 0; my $ReturnStatus = 1; #MUST BE TRUE ($ErrorStatus, @Msg) = @_; $ElementDeclarationCount = scalar (@g_ElementDeclarationList); $AttributeDeclarationCount = scalar (@g_AttributeDeclarationList); $EntityCount = scalar (@g_EntityDeclarationList); $OrphanElementCount = $ElementDeclarationCount - $g_RealizedAssociatedElementNameCount; $OrphanAttributeCount = $AttributeDeclarationCount - $g_AttributesAssociatedWithElementCount; print("\n"); print("******************************************\n"); print("**** General DTD Information ****\n"); print("******************************************\n\n"); print("************************************\n"); print("**** DTD File Declared Definitions:\n"); print("************************************\n"); print(" Element Count: $ElementDeclarationCount\n"); print(" Attributes Count: $AttributeDeclarationCount\n"); print(" Entity Count: $EntityCount\n\n"); if ($ErrorStatus != $g_ERROR_DTD_READ){ print("***************************\n"); print("**** DTD Tree Definitions:\n"); print("***************************\n"); print(" Root Element: $g_RootElement \n"); print(" Proclaimed Elements: $g_ProclaimedAssociatedElementCount\n"); print(" Associated Elements: $g_RealizedAssociatedElementNameCount\n"); print(" Orphaned Elements: $OrphanElementCount\n\n"); print(" Associated Attributes: $g_AttributesAssociatedWithElementCount\n"); print(" Orphaned Attributes: $OrphanAttributeCount\n\n"); }#if SWITCH: { if ($ErrorStatus eq $g_ERROR_DTD_READ){ print("******************************************************************\n"); print("**** Error: Errors were detected while reading the DTD file ****\n"); print("******************************************************************\n"); foreach $MsgLine (@Msg){ print($MsgLine); }#foreach $ReturnStatus = 0; last SWITCH; }#if if ($ErrorStatus eq $g_ERROR_DTD_BUILD){ print("********************************************************************\n"); print("**** Error: Errors were detected while building the DTD tree. ****\n"); print("********************************************************************\n"); foreach $MsgLine (@Msg){ print($MsgLine); }#foreach $ReturnStatus = 0; last SWITCH; }#if default:{ if (($OrphanElementCount > 0) || ($OrphanAttributeCount > 0)){ print("****************************************************************************\n"); print("**** Error: The DTD tree was built without all declared definitions. ****\n"); print("**** ****\n"); print("**** Element orphans exist when it or one of its ancestors are ****\n"); print("**** not proclaimed by a parent Element. Note: The number of ****\n"); print("**** Proclaimed and Associated Elements to determine the number ****\n"); print("**** of disassociated trees branches possibly causing multiple ****\n"); print("**** orphans. Attribute orphans exist when its Element is not ****\n"); print("**** declared or a parent Element does not proclaim the ****\n"); print("**** Attribute's Element as its child Element. ****\n"); print("****************************************************************************\n"); if ($OrphanElementCount > 0){ #Orphan $ReturnStatus = 0; print("\n"); print("****************************\n"); print("**** Unassociated Elements:\n"); print("****************************\n"); @TmpElementList = @g_ElementDeclarationList; while (($TmpElementLine = shift(@TmpElementList)) && (!$bStopPrinting)){ ($TmpElementLine, $LineNum) = split(/$DTD_LINENUM\s+/,$TmpElementLine); ($TmpElementName, $TmpAttributeName) = split(/\s+/,$TmpElementLine); $TmpElementName =~ s/^\s+//; $TmpElementName =~ s/\s+$//; if (0 == (grep /^$TmpElementName$/, @g_RealizedAssociatedElementNameList)){ $TmpElementLine =~ s/^\s+//; $TmpElementLine =~ s/\s+$//; print("Orphan Element ($LineNum): \"$TmpElementLine\"\n"); $PrintCount++; }#if if ($PrintCount >= 25 ){ print("\n"); print(" ...\n"); print(" ...\n"); print(" ...\n"); $bStopPrinting = 1; }#if }#while $bStopPrinting = 0; $PrintCount = 0; }#if if ($OrphanAttributeCount > 0){ #Orphan $ReturnStatus = 0; @TmpAttributeList = @g_AttributeDeclarationList; while (($TmpAttributeLine = shift(@TmpAttributeList)) && (!$bStopPrinting)){ ($TmpAttributeLine, $LineNum) = split(/$DTD_LINENUM\s+/,$TmpAttributeLine); ($TmpElementName, $TmpAttributeName) = split(/\s+/,$TmpAttributeLine); $TmpElementName =~ s/^\s+//; $TmpElementName =~ s/\s+$//; $TmpAttributeName =~ s/^\s+//; $TmpAttributeName =~ s/\s+$//; ## Orphan due to its Element was not declaired. if (0 == (grep /^$TmpElementName$/, @g_ElementNameDeclarationList)){ $TmpAttributeLine =~ s/^$TmpElementName//; $TmpAttributeLine =~ s/^\s+//; $TmpAttributeLine =~ s/\s+$//; $TmpLine = "Orphan Attribute ($LineNum): ($TmpElementName) \"$TmpAttributeLine\"\n"; push @AttributeWithUndeclaredElementList, $TmpLine; $PrintCount++; }else{ $bFoundInList = 0; @SubAssociatedAttributeList = grep /$TmpAttributeName/, @g_AttributesAssociatedWithElements; foreach $TmpAssociatedAttributeLine (@SubAssociatedAttributeList) { ($TmpAssocElementName,$TmpAssocAttributeName) = split(/\s+/,$TmpAssociatedAttributeLine); $TmpAssocElementName =~ s/^\s+//; $TmpAssocElementName =~ s/\s+$//; $TmpAssocAttributeName =~ s/^\s+//; $TmpAssocAttributeName =~ s/\s+$//; if (($TmpElementName eq $TmpAssocElementName) && ($TmpAttributeName eq $TmpAssocAttributeName)){ $bFoundInList = 1; }#if }#foreach # Orphan due to an ancestor is an orphan. if ( $bFoundInList == 0){ $TmpAttributeLine =~ s/^$TmpElementName//; $TmpAttributeLine =~ s/^\s+//; $TmpAttributeLine =~ s/\s+$//; $TmpLine = "Orphan Attribute ($LineNum): ($TmpElementName) \"$TmpAttributeLine\"\n"; push @AttributeWithUnAssociatedAncestorList, $TmpLine; $PrintCount++; }#if }#else if ($PrintCount >= 25 ){$bStopPrinting = 1; } }#while if (0 != scalar(@AttributeWithUndeclaredElementList)){ print("\n"); print("***************************************\n"); print("**** Attribute's Element not declared:\n"); print("***************************************\n"); foreach $TmpLine (@AttributeWithUndeclaredElementList){ print($TmpLine); }#foreach }#if if (0 != scalar(@AttributeWithUnAssociatedAncestorList)){ print("\n"); print("*****************************************************\n"); print("**** Attribute's Element or ancestor not proclaimed:\n"); print("*****************************************************\n"); foreach $TmpLine (@AttributeWithUnAssociatedAncestorList){ print($TmpLine); }#foreach }#if if ($bStopPrinting == 1 ){ print("\n"); print(" ...\n"); print(" ...\n"); print(" ...\n"); }#if $bStopPrinting = 0; $PrintCount = 0; }#if }#if (($OrphanElementCount > 0) || ($OrphanAttributeCount > 0)) last SWITCH; }#default }#SWITCH print("\n"); print("******************************************\n"); print("**** End of General DTD Information ****\n"); print("******************************************\n\n"); if ($bRunTimeDebug){STDERR->print("\nDEBUG: INFO: Exiting function PrintTotalsReport()\n");} return ($ReturnStatus); }#ReviewDTDTreeForErrors ############################################################################ # FUNCTION: PrintElementReport() # PURPOSE: To print an element. # PARAMETERS: $refCurrentElementNode - [IN] A hash reference containing a CurrentElementNode hash. # # RETURN: None. ############################################################################ sub PrintElementReport{ my $refCurrentElementNode = 0; my %CurrentElementNode = []; my @CurrentChildNodeList = (); my @CurrentAttributeList = (); my $TmpAttribute = ""; ($refCurrentElementNode) = @_; %CurrentElementNode = %{$refCurrentElementNode}; if ($bRunTimeDebug){STDERR->print("\nDEBUG INFO: Entering function PrintElementReport()\n");} STDERR->print("\n*********************\n"); STDERR->print("Element: $CurrentElementNode{$ELEMENT_NAME}\n"); if (0 != @{$CurrentElementNode{$ref_ATTRIBUTE_LIST}}){ @CurrentAttributeList = @{$CurrentElementNode{$ref_ATTRIBUTE_LIST}}; foreach $TmpAttribute (@CurrentAttributeList) { STDERR->print("Attributes: $TmpAttribute\n"); }#foreach }else{ STDERR->print("Attributes: None\n"); }#if STDERR->print("Category: $CurrentElementNode{$ELEMENT_CATAGORY}\n"); if ($CurrentElementNode{$ELEMENT_CATAGORY} eq $ELEM_CAT_LIST){ @CurrentChildNodeList = @{$CurrentElementNode{$ref_CHILDNODE_LIST}}; PrintFormatList(80, "Child Elements: ", @CurrentChildNodeList); }#if STDERR->print("*********************\n"); if ($bRunTimeDebug){STDERR->print("\nDEBUG INFO: Exiting function PrintElementReport()\n");} }#PrintElementReport() ################################################################################################ ########### ########### ########### PDB Section ########### ########### ########### ################################################################################################ ############################################################################ # FUNCTION: PDB1LetterSeqs() # PURPOSE: Reads the PDB file, finds the SEQRES section and returns all the found sequences as 1 letter chains. # PARAMETERS: None. # # RETURN: TRUE for success, otherwise FALSE. ############################################################################ sub PDB1LetterSeqs{ my @Lines = ""; my $ReturnStatus = 1; #Must be True. my $FileName = ""; my @TmpList = (); my $Junk = ""; my $Chain = ""; my $TmpChain = ""; my $CurrentLine = ""; my @ChainLine = (); my @New_ChainLine = (); my $ChainCount = 0; my $Converted_Chain = ""; my $TmpElement = ""; my $NewElement = ""; my %Seq3To1 = ('ALA' => 'A', 'ASX' => 'B', 'CYS' => 'C', 'ASP' => 'D', 'GLU' => 'E', 'PHE' => 'F', 'GLY' => 'G', 'HIS' => 'H', 'ILE' => 'I', 'LYS' => 'K', 'LEU' => 'L', 'MET' => 'M', 'ASN' => 'N', 'PRO' => 'P', 'GLN' => 'Q', 'ARG' => 'R', 'SER' => 'S', 'THR' => 'T', 'VAL' => 'V', 'TRP' => 'W', 'XAA' => 'X', 'TYR' => 'Y', 'GLX' => 'Z', 'TER' => '*', 'SEL' => 'U' ); ($FileName) = @_; if ($bRunTimeDebug){STDERR->print("\nDEBUG INFO: Entering function PDB1LetterSeqs()\n");} if (open(FILE,"$FileName")){ @Lines=<FILE>; close FILE; @TmpList = grep /^SEQRES/, @Lines; $CurrentLine = $TmpList[0]; ($Junk, $Junk, $TmpChain) = split(/\s+/,$CurrentLine); foreach $CurrentLine (@TmpList) { ($Junk, $Junk, $Chain) = split(/\s+/,$CurrentLine); if($TmpChain eq "$Chain"){ ($Junk, $Junk, $Junk, $Junk, @ChainLine) = split(/\s+/,$CurrentLine); $ChainCount = scalar(@ChainLine); #splice(@ChainLine, ($ChainCount-2)); foreach $TmpElement (@ChainLine){ $NewElement = $Seq3To1{$TmpElement}; push @New_ChainLine, $NewElement; }#foreach }#if($TmpChain eq "$Chain") else{ $Converted_Chain = join '', @New_ChainLine; STDERR->print("\n\nNew converted $TmpChain Sequence:\n"); STDERR->print("$Converted_Chain\n\n"); $TmpChain = $Chain; $Converted_Chain = ""; @New_ChainLine = (); ($Junk, $Junk, $Junk, $Junk, @ChainLine) = split(/\s+/,$CurrentLine); $ChainCount = scalar(@ChainLine); #splice(@ChainLine, ($ChainCount-2)); foreach $TmpElement (@ChainLine){ $NewElement = $Seq3To1{$TmpElement}; push @New_ChainLine, $NewElement; }#foreach }#else }#foreach $Converted_Chain = join '', @New_ChainLine; STDERR->print("\n\nNew converted $TmpChain Sequence:\n"); STDERR->print("$Converted_Chain\n\n"); }#if (open(FILE,"$FileName")) if ($bRunTimeDebug){STDERR->print("\nDEBUG INFO: Exiting function PDB1LetterSeqs()\n");} return ($ReturnStatus); }#PDB1LetterSeqs() ################################################################################################ ########### ########### ########### Code Section ########### ########### Tools ########### ################################################################################################ ############################################################################ # FUNCTION: GetFileName() # PURPOSE: To get an input file from the user. # PARAMETERS: $FileName - [IN] Variable with the file extention type. # RETURN: Input file to be used. ############################################################################ sub GetFileName{ my @TempFileList = (); my @ValidResponses = (); my $FileName = ""; my $FileType = ""; my $FriendlyFileName = ""; ($FileType) = @_; if ($bRunTimeDebug){STDERR->print("\nDEBUG: INFO: Entering function GetFileName()\n");} SWITCH: { if ($FileType eq $PDB_EXT){ STDERR->print("\nThe name of a Bioinformatics's PDB input file will be needed.\n"); $FriendlyFileName = "Bioinformatics PDB"; last SWITCH; }#if if ($FileType eq $DTD_EXT){ STDERR->print("\nThe name of a Bioinformatics's DTD input file will be needed.\n"); $FriendlyFileName = "Bioinformatics DTD"; last SWITCH; }#if default:{ STDERR->print("\nAn internal error has occurred due to processing an unknown\n"); STDERR->print("file extention type: $FileType\n"); STDERR->print("\nPDBParser will exit.\n"); exit(1); last SWITCH; }#default: }#SWITCH @ValidResponses = (); STDERR->print("Here is a list of input files in the current directory.\n"); opendir(DIR, cwd()); @TempFileList = grep /\.$FileType$/, readdir DIR; @TempFileList = sort @TempFileList; closedir(DIR); if ( 0 < (scalar(@TempFileList))){ PrintFormatList(80, "$FriendlyFileName ($FileType): ", @TempFileList); push @ValidResponses, @TempFileList; }else{ PrintFormatList(80, "$FriendlyFileName ($FileType): ", ("No Files")); }#else @TempFileList = (); STDERR->print("\n"); SWITCH: { if (3 > (scalar(@ValidResponses))){ $FileName = GetUserResponse("Your response for the input file is invalid.\n", 0, 1, @ValidResponses); last SWITCH; }#if if (3 <= (scalar(@ValidResponses))){ $FileName = GetUserResponse("Your response for the input file is invalid.\n", 0, 0, @ValidResponses); last SWITCH; }#if }#SWITCH if ($bRunTimeDebug){STDERR->print("\nDEBUG: INFO: Exiting function GetFileName()\n");} return($FileName); }#GetFileName() ############################################################################ # FUNCTION: GetUserResponse() # PURPOSE: To get an interactive response from the user. # PARAMETERS: $InvalidResponseStatement - [IN] A response statement for an # invalid user selection. # $DefaultResponseIndex - [IN] An index into @ValidResponses # displayed as a default selection. # $bShowValidResponses - [IN] Displays @ValidResponses on the prompt. # @ValidResponses - [IN] List of valid user responses. # # REMARK: If no valid response is provided, no integrity check will be # performed and the User response will be returned. If a valid # response list is provided an integrity check will be performed. # If a valid response list is provided, the caller may choose to # have the list displayed on the response prompt. If a valid response # list is provided and the $DefaultResponseIndex is valid, a # default Response will be display and the User may press return # to accept this response. Note: If a $DefaultResponseIndex is # valid, it will always be displayed, otherwise it is never displayed. # # RETURN: The User's response. ############################################################################ sub GetUserResponse{ my $UserResponse = ""; my $InvalidResponseStatement = ""; my @ValidResponses = (); my $FormattedResponses = ""; my $nValidResponses = 0; my $DefaultResponseIndex = 0; my $DefaultResponse = ""; my $bShowValidResponses = 0; # Must be FALSE. my $bShowResponsesOnPrompt = 0; # Must be FALSE. my $bHaveValidDefaultResponse = 0; # Must be FALSE. my $bHaveValidResponses = 0; # Must be FALSE. if ($bRunTimeDebug){STDERR->print("\nDEBUG INFO: Entering function GetUserResponse()\n");} ($InvalidResponseStatement, $DefaultResponseIndex, $bShowValidResponses, @ValidResponses ) = @_; $nValidResponses = scalar(@ValidResponses); if ( 1 <= $nValidResponses){ $bHaveValidResponses = 1; }#if if ((0 <= $DefaultResponseIndex) && ($bHaveValidResponses) && ($DefaultResponseIndex <= ($nValidResponses - 1))){ $bHaveValidDefaultResponse = 1; $DefaultResponse = @ValidResponses[$DefaultResponseIndex]; }#if if ($bShowValidResponses == 1){ $bShowResponsesOnPrompt = 1; }#if if ($bRunTimeDebug){ PrintFormatList(80, "DEBUG INFO: ValidResponses ", @ValidResponses); STDERR->print("DEBUG INFO:nValidResponses: $nValidResponses\n"); STDERR->print("DEBUG INFO:DefaultResponseIndex: $DefaultResponseIndex\n"); STDERR->print("DEBUG INFO:bShowValidResponses: $bShowValidResponses\n"); STDERR->print("DEBUG INFO:DefaultResponse: $DefaultResponse\n"); }#if $FormattedResponses = join '|', @ValidResponses; SWITCH: { if ($bShowResponsesOnPrompt == 1 && $bHaveValidDefaultResponse == 1){ STDERR->print("Please specify [$FormattedResponses]($DefaultResponse):"); last SWITCH; }#if if ($bShowResponsesOnPrompt == 1 && $bHaveValidDefaultResponse == 0){ STDERR->print("Please specify [$FormattedResponses]:"); last SWITCH; }#if if ($bShowResponsesOnPrompt == 0 && $bHaveValidDefaultResponse == 1){ STDERR->print("Please specify []($DefaultResponse):"); last SWITCH; }#if if ($bShowResponsesOnPrompt == 0 && $bHaveValidDefaultResponse == 0){ STDERR->print("Please specify []():"); last SWITCH; }#if }#SWITCH if (STDIN->eof){ STDERR->print("\nReceived EOF from STDIN, suspecting it is due to CTRL-C/CTRL-D, exiting PDB Parser.\n"); exit; }#if $UserResponse = STDIN->getline; $UserResponse =~ s/\s+$//; if ($bHaveValidResponses == 1){ while((!(grep /^$UserResponse$/, @ValidResponses)) && (!($UserResponse eq "" && $bHaveValidDefaultResponse))){ PrintFormatList(100, "***ERROR - ", $InvalidResponseStatement); SWITCH: { if ($bShowResponsesOnPrompt == 1 && $bHaveValidDefaultResponse == 1){ STDERR->print("Please specify [$FormattedResponses]($DefaultResponse):"); last SWITCH; }#if if ($bShowResponsesOnPrompt == 1 && $bHaveValidDefaultResponse == 0){ STDERR->print("Please specify [$FormattedResponses]:"); last SWITCH; }#if if ($bShowResponsesOnPrompt == 0 && $bHaveValidDefaultResponse == 1){ STDERR->print("Please specify []($DefaultResponse):"); last SWITCH; }#if if ($bShowResponsesOnPrompt == 0 && $bHaveValidDefaultResponse == 0){ STDERR->print("Please specify []():"); last SWITCH; }#if }#SWITCH if (STDIN->eof){ STDERR->print("\nReceived EOF for STDIN, suspecting it is due to CTRL-C/CTRL-D, exiting PDB Parser.\n"); exit; }#if $UserResponse = STDIN->getline; $UserResponse =~ s/\s+$//; }#while if ($UserResponse eq "" && $bHaveValidDefaultResponse){ $UserResponse = $DefaultResponse; }#if }# ($bHaveValidResponses == 1) if ($bRunTimeDebug){STDERR->print("\nDEBUG INFO: Exiting function GetUserResponse()\n");} return($UserResponse); }#GetUserResponse() ############################################################################ # FUNCTION: PrintFormatList() # PURPOSE: To print out a list of string tokens (SPACE separated) onto # multiple lines with a given line length. # PARAMETERS: $UserMaxLineLenght - [IN] Variable to receive the suggested # line lenght. This will be reset if # if greater than $MAX_LINE_LENGTH. # $UserHeader - [IN] User header for each line printed # or Empty String for no header. # @List - [IN] List of tokens to be printed. # RETURN: None. ############################################################################ sub PrintFormatList{ my $MAX_LINE_LENGTH = 100; my $UserMaxLineLength = 0; my $LineHeader = ""; my @List = (); my $LineHeaderLength = 0; my $bNewLine = 1; my $LineLength = 0; my $Token = ""; my $TokenLength = 0; ($UserMaxLineLength, $LineHeader, @List) = @_; if ($bRunTimeDebug){STDERR->print("\nDEBUG INFO: Entering function PrintFormatList()\n");} if ($bRunTimeDebug){STDERR->print("DEBUG INFO: Arg List:($UserMaxLineLength, $LineHeader, @List)\n");} $LineHeaderLength = length($LineHeader); if($UserMaxLineLength > $MAX_LINE_LENGTH){$UserMaxLineLength = $MAX_LINE_LENGTH;} if($LineHeaderLength <= $UserMaxLineLength){ foreach $Token (@List){ $TokenLength = length($Token); if ($bNewLine == 1){ $LineLength = $LineHeaderLength + $TokenLength; if($LineLength <= $UserMaxLineLength){ STDERR->print("\n$LineHeader$Token "); $LineLength = $LineLength + 1;#Add Space $bNewLine = 0; }else{ STDERR->print("\n$LineHeader"); $LineLength = $LineHeaderLength; $bNewLine = 1; }#else }else{ $LineLength = $LineLength + $TokenLength; if( $LineLength <= $UserMaxLineLength){ STDERR->print("$Token "); $LineLength = $LineLength + 1;#Add Space }else{ $LineLength = $LineHeaderLength + $TokenLength; if($LineLength <= $UserMaxLineLength){ STDERR->print("\n$LineHeader$Token "); $LineLength = $LineLength + 1;#Add Space $bNewLine = 0; }else{ STDERR->print("\n$LineHeader"); $LineLength = $LineHeaderLength; $bNewLine = 1; }#else }#else }#else }#foreach STDERR->print("\n"); }#if if ($bRunTimeDebug){STDERR->print("\nDEBUG INFO: Exiting PrintFormatList()\n");} }#PrintFormatList()