#! /usr/bin/perl # Victor Amin 2009

use strict; use warnings;

use Getopt::Std; $Getopt::Std::STANDARD_HELP_VERSION = 1;

my %options; getopts(‘1:2:h’, %options);

if ($options{h} || !$options{1} || !$options{2}) {Getopt::Std->version_mess(); HELP_MESSAGE(*STDERR)} sub HELP_MESSAGE {

my $fh = shift;
print $fh "\nSplit ophaned reads out of a pair of FASTQ files. Counts to STDOUT.\n";
print $fh "\tOPTIONS:\n";
print $fh "\t-1 [FASTQ1] [required]\n";
print $fh "\t-2 [FASTQ2] [required]\n";
print $fh "\nProperly paired FASTQs are outputted to paired_*, orphans to orphaned_*\n\n";
exit;

}

open FASTQ1, “<$options{1}” or die “nThere was a problem opening the FASTQ file: $!n”; open FASTQ2, “<$options{2}” or die “nThere was a problem opening the FASTQ file: $!n”;

open PAIRED1, “>paired_$options{1}” or die “nThere was a problem opening the output file: $!n”; open PAIRED2, “>paired_$options{2}” or die “nThere was a problem opening the output file: $!n”;

open ORPHANED1, “>orphaned_$options{1}” or die “nThere was a problem opening the output file: $!n”; open ORPHANED2, “>orphaned_$options{2}” or die “nThere was a problem opening the output file: $!n”;

my $SEQ_MODE = 1; my $QUAL_MODE = 2; my $mode = 1;

my $reads_1 = 0; my $lines = 0; my $ident; my $sequence; my $quality;

my %sequences_1; my %qualities_1; print STDERR “nLoading first FASTQ…n”; while (<FASTQ1>) {

chomp;
if (/^\@/ && $mode == $SEQ_MODE) {
/\@([^ \/]+?)( |\/)/; # rather than substitute capture AU 16/08/2012
# chop; this is to remove the 1 or 2 from /1 or /2 if the reads are in that format. These reads are in the format @ident 1:N:0:x or @ident 2:N:0:x AU 16/08/2012
$ident = $1; # ident = capture AU 16/08/2012
$reads_1++;
} elsif (/^\+/) {
$mode = $QUAL_MODE;
} elsif ($mode == $SEQ_MODE) {
$sequence .= $_;
$lines++;
} elsif ($mode == $QUAL_MODE) {
$quality .= $_;
$lines--;
if ($lines == 0) {
  $mode = $SEQ_MODE;
  $sequences_1{$ident} = $sequence;
  $qualities_1{$ident} = $quality;
  $sequence = '';
  $quality = '';
}
} else {
die "\nError reading file.\n";
}

}

my $reads_2 = 0;

my %sequences_2; my %qualities_2; print STDERR “nLoading second FASTQ…n”; while (<FASTQ2>) {

chomp;
if (/^\@/ && $mode == $SEQ_MODE) {
/\@([^ \/]+?)( |\/)/; # rather than substitute capture AU 16/08/2012
# chop; this is to remove the 1 or 2 from /1 or /2 if the reads are in that format. These reads are in the format @ident 1:N:0:x or @ident 2:N:0:x AU 16/08/2012
$ident = $1; # ident = capture AU 16/08/2012
$reads_2++;
} elsif (/^\+/) {
$mode = $QUAL_MODE;
} elsif ($mode == $SEQ_MODE) {
$sequence .= $_;
$lines++;
} elsif ($mode == $QUAL_MODE) {
$quality .= $_;
$lines--;
if ($lines == 0) {
  $mode = $SEQ_MODE;
  $sequences_2{$ident} = $sequence;
  $qualities_2{$ident} = $quality;
  $sequence = '';
  $quality = '';
}
} else {
die "\nError reading file.\n";
}

}

my $paired; print STDERR “nPrinting paired reads…n”; for $ident (keys %sequences_1) {

if (exists $sequences_2{$ident}) {
  print PAIRED1 "\@${ident} 1\n$sequences_1{$ident}\n\+${ident} 1\n$qualities_1{$ident}\n";
  print PAIRED2 "\@${ident} 2\n$sequences_2{$ident}\n\+${ident} 2\n$qualities_2{$ident}\n";
  delete $sequences_1{$ident};
  delete $sequences_2{$ident};
  $paired++;
}

}

print STDERR “nPrinting orphaned reads…n”; my $orphaned_1 = 0; for $ident (keys %sequences_1) {

print ORPHANED1  "\@${ident} 1\n$sequences_1{$ident}\n\+${ident} 1\n$qualities_1{$ident}\n";
$orphaned_1++;

}

my $orphaned_2 = 0; for $ident (keys %sequences_2) {

print ORPHANED2  "\@${ident} 2\n$sequences_2{$ident}\n\+${ident} 2\n$qualities_2{$ident}\n";
$orphaned_2++

}

print “nReads 1: $reads_1nOrphans 1: $orphaned_1nReads 2: $reads_2nOrphaned 2: $orphaned_2nPaired: $pairedn”;