FULL CODE FOR THE FEATURE MODEL DESCRIBED IN CHAPTER 2

#PRELIMINARIES
setwd("/users/aleksei/documents/documents/umass/gp2")
library(combinat) #for combinatorics functions
library(mclust) #for mixture-of-Gaussians cluster analysis
options(stringsAsFactors = F)

#USER-DEFINED VARS
filename <- "tromso_c_sc_1"
which.language <- "toy" #Choose the toy language
#which.language <- "eng" #Choose English
candidate.set <- "full" #For English: all C(C)(C) sequences are allowed in the candidate set
#candidate.set <- "reduced" #For English: only (fricative) - consonant - (liquid) sequences are allowed in the candidate set
cycles <- 15 #the number of cycles the simulation will go through
accuracy_threshold <- 0.01 #the minimum gain value a constraint must have to be considered "relevant"
sample.size <- 1 #the proportion of omega that's used as a sample at every cycle of the simulation (to make max gain computation faster)
if (which.language == "eng") {polarity <- c(1,-1)} #If English: allow both negative and positive constraints
if (which.language == "toy") {polarity <- c(-1)} #If toy language: allow only negative constraints

#REPRESENTATIONAL SPACE
#For toy language
if (which.language == "toy") {
  #define sigma
  sigma = c("a","i","u","p","t","k","b","d","g","m","n","q") # q = engma
  #The segments allowed in the toy language.
  supersigma = sigma #these are all elements that may occur in constraints - segments and features
  #Define the space of representations (omega) that the grammar will consider
  #This will be all possible CVCVC strings with segments drawn from sigma
  all.repr <- expand.grid( #a data frame of all possible CVCVCs
    c("p","t","k","b","d","g","m","n","q"),
    c("a","i","u"),
    c("p","t","k","b","d","g","m","n","q"),
    c("a","i","u"),
    c("p","t","k","b","d","g","m","n","q")
  )
  #Now make the rows of this data frame into strings, and put them all into a vector
  omega <- unlist( apply( all.repr, 1, paste, sep="", collapse="" ) ) #contains 6561 representations total (9 x 3 x 9 x 3 x 9)
  #Now compute the set of all observed forms given the setup of the toy language
  all.obs <- expand.grid(
    c("p","t","k","b","d","g"), #first position contains non-nasal consonants
    c("a","i","u"), #second position contains only vowels
    c("p","t","k","b","d","g","m","n","q"), #third position may contain any consonant
    c("a","i","u"), #fourth position contains only vowels
    c("p","t","k","b","d","g","n","q") #fifth position contains all consonants except "m"
  )
  all.obs <- subset(all.obs, !(all.obs[,2] %in% c("u","i") &
                               all.obs[,3] %in% c("p","b","m") &
                               all.obs[,4] %in% c("u","i") ) )
  #Remove all "u/i"-labial-"u/i" sequences from this frame
  #Now, put everything in a vector of strings
  observed <- unlist( apply( all.obs, 1, paste, sep="", collapse="" ) )
}

#PHONETIC DISTANCE
#define phonetic distance between members of sigma (in the form of a data frame, with both row and column names equal to the set of segments in sigma)
#Toy language
if (which.language == "toy") {
  phonetic_distance = data.frame(
    "a"=c(0,1,1,rep(5,9)),
    "i"=c(1,0,1,rep(5,9)),
    "u"=c(1,1,0,rep(5,9)),
    "p"=c(rep(5,3), 0,1,2, 1,2,3, 2,3,4),
    "t"=c(rep(5,3), 1,0,1, 2,1,2, 3,2,3),
    "k"=c(rep(5,3), 2,1,0, 3,2,1, 4,3,2),
    "b"=c(rep(5,3), 1,2,3, 0,1,2, 1,2,3),
    "d"=c(rep(5,3), 2,1,2, 1,0,1, 2,1,2),
    "g"=c(rep(5,3), 3,2,1, 2,1,0, 3,2,1),
    "m"=c(rep(5,3), 2,3,4, 1,2,3, 0,1,2),
    "n"=c(rep(5,3), 3,2,3, 2,1,2, 1,0,1),
    "q"=c(rep(5,3), 4,3,2, 3,2,1, 2,1,0),
    row.names=sigma
  )
}
max_phon_dist <- max(phonetic_distance) #This is the highest value present in the phonetic distance matrix
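#A minimal illustrative note on reading the distance matrix (not part of the simulation itself):
#rows index the source segment and columns the comparison segment, e.g.
#   phonetic_distance["p","b"] #1: near neighbours (voicing)
#   phonetic_distance["p","q"] #4: maximally distant consonants
#   phonetic_distance["a","p"] #5: vowels and consonants never count as neighbours
#These values feed the phon_dist_threshold check in the mutate() function defined further below.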
#CLUSTER SETUP
#Initialize an empty matrix of clusters; this matrix will later change, which will lead to expansion of the constraint set
clusters_matrix <- matrix( nrow = length(sigma), dimnames = list(sigma) )
#"clusters_matrix" is a matrix which has rows named after foci, and every column in the matrix will be a cluster, which is represented as a probability distribution over those foci
#Set up a vector of possible cluster labels, which are combinations of two capital letters
capital_letters <- c("A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z")
cluster_labels <- do.call( paste, c(expand.grid(capital_letters, capital_letters), sep=""))
cluster_translation <- data.frame( Cluster.name = c(), Reg.expression = c() ) #Data frame which translates cluster labels to regular expressions
#initialize empty vector of clusters and intersections, to be encoded as regular expressions
total_regexp <- c()

#MAXIMUM GAIN COMPUTATION
#From Staubs (2011)
grad <- function(x) attr(x, "gradient")
"grad<-" <- function(x,value) {
  attr(x, "gradient") <- value
  x
}

# Puts an L2 (log-space Gaussian) prior on an optimization
#Taken from Staubs (2011)
l2.prior <- function(fun, var=1, mean=0, ...) {
  fun <- match.fun(fun)
  function(pv) {
    res <- fun(pv) + sum((pv - mean)^2) / (2 * var)
    if ( !is.null(grad(res)) ) grad(res) <- grad(res) + ((pv - mean) / var)
    res
  }
}

#The following function returns the maximum gain value
max.gain <- function(obs.prob, pred.prob, rep_space, new_constr) {
  #"obs.prob" is the vector of observed probabilities, "pred.prob" is the vector of probabilities predicted by the current grammar, and "new_constr" is the new constraint whose maximal gain value will be computed
  c.star <- violate(new_constr, rep_space) #compute the violation/reward vector of "new_constr" (the new constraint whose gain value is to be computed)
  gain <- function(w.star) { #a function of the weight of the new constraint
    gain.value <- w.star * sum(obs.prob*c.star) - log( sum( exp(w.star*c.star) * pred.prob) ) #minus signs are removed from Wilson's (2010) definition, and Della Pietra's definition is followed instead
    gain.value <- -gain.value #output the negative of the function, because optim minimizes rather than maximizes functions
    grad(gain.value) <- - ( sum(obs.prob * c.star) - (sum( c.star*exp(w.star*c.star)*pred.prob ) / sum( exp(w.star*c.star)*pred.prob ) )) #Della Pietra's first-order derivative
    gain.value #Output the function value with the gradient attached as an attribute
  }
  gain <- l2.prior(gain, var=100000, mean=0) #Put an L2 prior on maximum gain computation - with variance = 100,000 and mean = 0
  opt.gain <- optim( par = 0, fn = gain, lower = 0, method="L-BFGS-B" ) #minimize the "gain" function with the initial value of w.star being 0
  -opt.gain$value #Output the negative of the value of "gain" associated with its optimal weight (because "gain" was made negative to allow it to be minimized instead of maximized).
}
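#For orientation (a descriptive note, not executed): the quantity maximized over w.star above is
#   G(w.star) = w.star * sum( p * c.star ) - log( sum( q * exp(w.star * c.star) ) ),
#i.e. the Della Pietra-style gain of adding a single constraint to the maximum entropy model.
#A hypothetical call, once p, q and violate() below are in place, might look like
#   max.gain(p, q, omega, data.frame(Polarity=-1, First="m", Second="$", Third=""))
#which asks how much the fit would improve if a penalty on word-final "m" were added.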
#Now, a function that determines the violations for a constraint
#Constraints are a series of cells in a data frame!
violate <- function(constr, rep.space) {
  constr.rgxp <- paste(constr[2:4], collapse="") #Bring the characters of the constraint together as a regular expression
  matches.vector <- sapply( rep.space, function(cand) { #rep.space is the set of candidates over which violations are computed (usually omega, or a sample of it)
    match <- gregexpr(constr.rgxp, cand)[[1]]
    length( match[match!=-1] ) #Find the number of matches of "constr.rgxp" in the candidate; the "match != -1" condition is there because gregexpr returns -1 when there is no match
  }) #This generates a vector of the number of matches for every candidate
  matches.vector*as.numeric(constr[1]) #multiply the vector of matches by 1 or -1, depending on whether the constraint is positive or negative
}

#MAXIMUM ENTROPY CALCULATION
#Compute observed probabilities
p <- sapply( omega, function(cand){ length(which(observed==cand))/length(observed) } )
#this is the observed probability (the number of times an element of omega shows up in the observed data divided by the total number of observations)
#Compute predicted probabilities
q <- rep(1/length(omega), length(omega)) #this will be the predicted probability of every form; initially, every form is assigned equal probability
prob.frame <- data.frame(omega=omega, p=p, q=q, stringsAsFactors=F) #this is the data frame which will hold p and q for every candidate in omega
cons.matrix <- NULL #This will be filled in with constraints as they are induced
weights <- c() #This will contain the weights of these constraints
#when new constraints are added, zeroes are appended to "weights" for the new constraints,
#and the result is fed to "optim" as initial weights for optimization

#Cover function for optimizing the objective function; biases can be turned on or off.
solve <- function(violation.matrix, wts, lower, var=10000) {
  w <- wts
  obj <- objective.function(violation.matrix) #objective function
  obj <- l2.prior(obj,var,mean=0) #Add L2 prior
  opt <- optim(w, obj, lower=lower, method="L-BFGS-B") #optimize objective function
  opt #output result of optimization
}

#Objective function based on K-L divergence - based on Robert's "solver.R" script
objective.function <- function(violations) { #plug the cons.matrix matrix into this
  objective <- function(w) {
    scores <- exp(c(violations %*% w))
    Z <- sum(scores)
    #actual K-L divergence function
    res.df <- data.frame(prob.frame, scores=scores)
    res.df$predicted <- log(scores) - log(Z) #predicted log probabilities
    res.df$kl <- rep(0,nrow(res.df))
    nonzeroes <- which(res.df$p != 0)
    res.df$kl[nonzeroes] <- res.df$p[nonzeroes] * (-log(res.df$p[nonzeroes]) + res.df$predicted[nonzeroes]) #KL-divergence computation per candidate
    res <- -sum(res.df$kl) #Final KL-divergence calculation
    den.p <- scores / sum(scores)
    den <- colSums(den.p * violations)
    num <- violations
    grad.matrix <- prob.frame$p * (num - den)
    grad(res) <- -colSums(grad.matrix) #Add the gradient for KL-divergence
    res #output the result
  }#endfunction
  objective #output the function
}#endfunction
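#An illustrative example of violate() (not executed as part of the simulation):
#   violate( data.frame(Polarity=-1, First="m", Second="a", Third=""), c("mamam","patak") )
#builds the regular expression "ma" and returns c(-2, 0): "mamam" contains two occurrences of "ma",
#each scored -1 because the constraint is negative, while "patak" contains none.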
#CONSTRAINT SPACE SEARCH
#mutate: a function which introduces a mutation into a prototype constraint and returns a single neighboring constraint
mutate <- function(input.cons) {
  output.cons <- data.frame(Polarity=c(), First=c(), Second=c(), Third=c()) #set up an empty data frame for the new constraint
  method = sample(c(rep("sf.change",3), "ins.del"),1) #Choose the method of mutation: segment/feature change (3x as probable) or insertion/deletion
  if (method == "sf.change") { #Segment/feature change
    new.cons <- input.cons
    available_pos <- which( (new.cons[1,] %in% supersigma )) #See which positions in the constraint may be affected by segment/feature change (= those that have a segment or feature (combination) in them).
    pos <- sample(c(available_pos,available_pos),1) #select one of these positions to change its content
    #"available_pos" is repeated twice because sample(2,1) means the same as sample(1:2,1) - this way sample won't treat a one-member vector as an upper limit
    if (new.cons[1,pos] %in% sigma) { #If the thing in the position selected is a segment
      phon_dist_threshold <- sample(1:max_phon_dist,1) #This is the maximal phonetic distance a replacement segment may have from the original segment
      distances <- subset(phonetic_distance, select = new.cons[1,pos]) #find all the distances from a certain segment by selecting the column that is named after that segment
      selection <- rownames(subset(distances,(distances <= phon_dist_threshold) & distances > 0)) #find the names of the rows in which the value in the source-segment column is at or below the distance threshold (and above 0, so that a segment never replaces itself)
      selection <- c(selection, selection, supersigma[-(1:length(sigma))]) #The ultimate selection is twice the neighboring segments and once the set of features (so that you're twice as likely to pick from the neighboring segments as you are from the features).
    } else { #If the thing in the position selected is a feature
      selection <- supersigma[-(1:length(sigma))] #You can choose any feature to replace the element in the position chosen
      selection <- selection[-which(selection==new.cons[1,pos])] #Remove the original cluster from the selection
    }#endif
    if (length(selection)==0) { selection <- sigma } #If this leaves nothing - because there are no other clusters induced yet - then select a segment instead
    replacement <- sample(selection,1) #pick one random element of "selection" which will replace the original element in the original constraint
    new.cons[1,pos] <- replacement
  }#endif
  if (method == "ins.del") { #insertion/deletion
    new.cons <- input.cons
    if (new.cons$Third == "") {
      insert.pos <- sample(1:3,1) #Standard case: insertion into any position
      if (new.cons$Second == "$") {insert.pos <- sample(1:2,1)} #If the constraint ends in a word boundary, don't insert anything after the word boundary
      if (new.cons$First == "^") {insert.pos <- sample(2:3,1)} #If the constraint starts with a word boundary, don't insert anything in the first position
      if (insert.pos == 1) {
        new.cons$Third <- new.cons$Second
        new.cons$Second <- new.cons$First
        new.cons$First <- sample(c(supersigma,"^"),1) #Insert a segment, feature, or boundary sign into the first position of the constraint.
      }
      if (insert.pos == 2) {
        new.cons$Third <- new.cons$Second
        new.cons$Second <- sample(supersigma,1) #Insert a feature or a segment in the middle of the constraint.
      }
      if (insert.pos == 3) {new.cons$Third <- sample(c(supersigma,"$"),1)} #Insert a segment, feature or boundary sign at the end of the constraint
    } else {
      new.cons$Third <- ""
    }#endif
  }#endif
  new.cons #This is the output: one single constraint that was changed
}#endfunction
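#A hypothetical illustration of mutate() (not executed): given the prototype
#   data.frame(Polarity=-1, First="i", Second="p", Third="u")
#one call might return (-1, "i", "t", "u") - a segment change in the second position, since "t" lies within
#phonetic distance 1 of "p" - while another call might return (-1, "i", "p", "") - deletion of the third position.
#Which neighbor is returned is random; repeated calls explore different neighbors of the same prototype.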
#initialize a "trash" data frame (which will contain all constraints that have been considered for addition to the grammar, so that no constraint is considered twice) trash <- data.frame(Polarity = "", First = "", Second = "", Third = "") ################################ START CYCLE ################################### #From this point on, the code will be repeated a given number of times model.fit <- sum(subset(prob.frame, prob.frame$p > 0)$q ) #This is the approximation of how well the model fits the data: the total predicted probability on the observed data - this value will be very low at first iteration <- 1 while (model.fit < 0.95) { #Repeat all of the following as long as model fit is less than 95% writeLines(paste("\nCycle number ",iteration,".","\n", sep="")) #Create a sample of the representational space to compute gain values over, to save time #Currently, the sample is 1 of the rep. space #size of the sample as a proportion of the total size of omega is given by the variable sample.size, as defined at the beginning of the script sample.indices <- sample( 1:length(omega), round( sample.size * length(omega)), replace=F ) #take a sample of the positions of elements in omega sample.p <- p[sample.indices]/sum(p[sample.indices]) sample.q <- q[sample.indices]/sum(q[sample.indices]) sample.omega <- omega[sample.indices] #RANDOM CONSTRAINT SELECTION #This repeat loop will ensure that the algorithm keeps going until it finds a random constraint that's not been tried before repeat { #First, choose a constraint length between 2 and 4. constraint_length <- sample(2:3,1) #Now, create a random constraint of that length if (constraint_length == 2) {random_constraint <- data.frame( Polarity= sample(polarity,1), First= sample(c(supersigma,"^"),1), Second= sample(c(supersigma,"$"),1), Third="") } if (constraint_length == 3){random_constraint <- data.frame(Polarity= sample(polarity,1), First=sample(c(supersigma,"^"),1), Second=sample(supersigma,1), Third=sample(c(supersigma,"$"),1)) } if ( length(trash$Polarity) > 0 & #If the trash data frame has anything in it at all length( subset(trash, trash$Polarity==random_constraint$Polarity & trash$First==random_constraint$First & trash$Second==random_constraint$Second & trash$Third==random_constraint$Third )$Polarity) == 0 ) { #If the randomly generated constraints cannot be found in the data frame containing constraints already considered (= trash), random.gain <- max.gain(sample.p, sample.q, sample.omega, random_constraint) #compute the gain value of the randomly selected constraint if ( random.gain >= accuracy_threshold) {#if the gain value isn't good enough, then keep trying until you hit a constraint with a sufficiently high gain value prototype <- random_constraint #if the constraint is accurate enough, declare its original role number to be the prototype for neighbor search break #the while statement is in principle an infinite loop; it is halted whenever a fitting prototype is found } else { trash <- rbind(trash, random_constraint) #if the constraint isn't accurate enough, add its row number in the original "constraints" frame to "trash" }#endifelse }#endif }#endrepeat writeLines("Random constraint to intialize search at the current cycle:\n") print(prototype) writeLines("\n") #NEIGHBOR SEARCH ## Now, submit this prototype to the neighbor search procedure. 
  #NEIGHBOR SEARCH
  ## Now, submit this prototype to the neighbor search procedure.
  #Initialize a stack of constraints whose neighbors should be investigated and a frame of outputs
  input <- prototype #"input" holds the constraint whose neighbors are currently being explored
  output_frame <- prototype #"output_frame" is the data frame of all constraints found at this iteration of the search algorithm
  current.gain <- random.gain #current.gain is a variable that signifies the highest gain value that has been found up until now
  dipping.count <- 0 #dipping.count is a variable that counts how many times one input yields constraints with lower gain values (it's a variable that prevents an infinite loop)
  doubling.count <- 0 #doubling.count is the number of times in a row that the loop finds a constraint that is in local trash
  local.trash <- trash #Define a local trash data frame - one that is internal to the neighbor search procedure and which will not be carried over into future cycles of the simulation
  #This data frame also contains the regular trash data frame, so that neighbor search never ends up with a constraint which has previously been added to the grammar.
  while (nrow(input) > 0 & dipping.count <= 10 & doubling.count <= 10) { #as long as there is an input constraint and the algorithm hasn't found lower-gain or already-tried constraints more than 10 times in a row for the same input
    current_constraint <- input[1,] #consider the input constraint as the current constraint
    neighbor_constraint <- mutate(current_constraint) #Introduce a single mutation
    if ( nrow( subset( local.trash, #If the neighboring constraint is not in local trash
                       local.trash$Polarity==neighbor_constraint$Polarity &
                       local.trash$First==neighbor_constraint$First &
                       local.trash$Second==neighbor_constraint$Second &
                       local.trash$Third==neighbor_constraint$Third ) ) == 0 ) {
      doubling.count <- 0
      local.trash <- rbind(local.trash,neighbor_constraint) #put the neighbor constraint in the local trash frame so it isn't tried again
      input <- input[-1,] #erase the current input, so that a new constraint can be put in its place
      neighbor.gain <- max.gain(prob.frame$p, prob.frame$q, omega, neighbor_constraint) #Compute the information gain value of the neighboring constraint.
      if (neighbor.gain < current.gain) { #If this neighbor has a lower gain value than the current reference value
        input <- current_constraint #Try the current constraint as an input once again
        dipping.count <- dipping.count + 1
      }#endif
      if (neighbor.gain == current.gain) { #If this neighbor has the same gain value as the input constraint
        output_frame <- rbind(output_frame, neighbor_constraint) #Add the neighboring constraint to the output frame alongside the input
        input <- neighbor_constraint #Also make the neighboring constraint the current input
        dipping.count <- 0
      }#endif
      if (neighbor.gain > current.gain) { #If this neighbor has a higher gain value than the input constraint
        output_frame <- neighbor_constraint #Overwrite the output frame with the neighbor constraint
        input <- neighbor_constraint #Also make the neighboring constraint the current input
        current.gain <- neighbor.gain #Make this new gain value the standard for future comparison
        dipping.count <- 0
      }#endif
    } else {
      doubling.count <- doubling.count + 1
    } #endifelse
  }#endwhile
  trash <- rbind(trash, output_frame) #Add the selected constraints to trash
  writeLines("Constraint(s) selected at this cycle:\n")
  print(output_frame)
  writeLines("\n")
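  #An illustrative note on the next step (hypothetical values, not executed): a selected constraint such as
  #(-1, "i", "p", "u") gives rise to the contexts (-1, "_", "p", "u"), (-1, "i", "_", "u") and (-1, "i", "p", "_"),
  #and each context is then combined with every segment in sigma to fill out a focus-by-context table of gain values.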
  # put everything in a focus-by-context table
  #First, determine all the contexts, and put these in as the first 4 columns of a data frame.
  to_be_tabulated <- output_frame[,1:4] #this is the frame of all constraints that were selected by the neighbor search algorithm
  # make all possible replacements of non-boundary elements with a placeholder "_"
  contexts_frame <- data.frame("Polarity" = c(), "First" = c(), "Second" = c(), "Third" = c())
  longest <- constraint_length + 1 #What is the length of the constraints in the output frame? That length plus one is the furthest position you have to search to make the context-by-focus table
  for (cons in 1:length(to_be_tabulated[,1])) { #for every constraint in the "to be tabulated" data frame
    for (j in 2:longest) { #for every position in the constraint
      context <- to_be_tabulated[cons,] #take out the row of the to_be_tabulated frame that's to be changed now
      if ((context[ , j] != "^") & (context[ , j] != "$") ) { #if the current symbol of the constraint is not a word boundary
        context[ , j] <- "_" #replace the symbol in the appropriate position with "_"
        contexts_frame <- rbind(contexts_frame, context) #add it to the "contexts_frame" data frame
      }
    }
  }
  #Get rid of duplicates, so that the list of contexts is as short as possible
  contexts_frame <- subset(contexts_frame, !duplicated(contexts_frame)) #this gets rid of all the rows which are exactly the same as some row above them
  #Now, fill out the focus-by-context data frame by substituting every possible segment into the blanks and computing the corresponding gain value
  for (segment in sigma) { #for every possible segment, put it in the gap left in the context and look up the corresponding gain value
    column_number <- length(contexts_frame) + 1 #this is the number of the column that needs to be added to the contexts frame right now
    focus.substitution.frame <- contexts_frame[,1:4]
    focus.substitution.frame[,2:4] <- apply(focus.substitution.frame[,2:4], 2, function(x){sub("_",segment,x)}) #substitute the current segment for "_" in each context of the contexts_frame
    new_column <- apply( focus.substitution.frame, 1, function(x){ max.gain(sample.p, sample.q, sample.omega, x) } ) #compute the gain value for each constraint made by filling every context with the focus under scrutiny now
    contexts_frame[,column_number] <- new_column
    names(contexts_frame)[column_number] <- segment
  }

  #CLUSTERING
  #The "find_cluster" function applies to a row of "contexts_frame" to find the content of the cluster with the higher mean
  find_cluster <- function(rownumber) {
    to_be_clustered <- contexts_frame[rownumber,-(1:4)] #this is the row of the contexts frame over which the one-dimensional clustering analysis will be performed - the non-numerical values at the left-hand side of the frame are excluded
    noise_vector <- sample(c(0 , 0.0000001), length(to_be_clustered), replace = T) #make a random vector of the same length as "to_be_clustered", with either 0 or 0.0000001 as values
    #This is done so that the clustering algorithm will actually find two clusters in a case like "0 0 0 0 0 0 0 0 0 1 1 1".
    to_be_clustered <- to_be_clustered + noise_vector #the noise vector is added to the "to_be_clustered" values (added and not subtracted, so that negative values are not created)
    #obtain the parameters for the two Gaussians that were fit, so that these can be submitted to the expectation maximization algorithm
    if ( !is.na(mclustBIC(to_be_clustered,G=1:2,modelNames="E")[2,]) ) { #If it is at all possible to fit a 2-Gaussian model to the data from a given row, then go ahead and find the following values:
      row_Mclust <- Mclust( data = to_be_clustered, #take all the columns except the first four
                            G = 2, #look for 2 components
                            modelNames = "E" ) #use a model in which the variance of both components has to be the same
      if ( length(which(row_Mclust$classification==2)) >= 2 ) { #if the cluster with the higher mean contains at least two members
        cluster_content <- row_Mclust$z[,2] #Take the membership probabilities for the cluster with the higher mean
        return(cluster_content)
      } #If the "cluster" contained just one element, then don't return anything
    } #Endif with the condition that it's possible to fit a two-component model
  } #endfunction

  #Now, call this function for every row number in the "contexts_frame" frame
  all_clusters <- lapply(1:length(contexts_frame$First), find_cluster)
  #And after this, get rid of all the members of this list which are NULL.
  to_delete <- c()
  for (member in 1:length(all_clusters) ) {
    if (is.null(all_clusters[[member]])) {to_delete <- c(to_delete, member)}
  } #Find all the members of all_clusters that are NULL
  if (length(to_delete) > 0) { all_clusters <- all_clusters[-to_delete] }

  #Procedure for integrating these clusters with previously established clusters (unfinished)
  #Then take these clusters, see if they've been established as clusters before, and if not, and if they're "significantly" different from other established clusters, then give them a unique label and add them to the list of clusters
  if (length(all_clusters) > 0 & all( is.na(clusters_matrix[,1])) ) { #If no clusters have been added to the clusters matrix yet, and there are clusters to be added,
    clusters_matrix[,1] <- all_clusters[[1]]
    label <- sample(cluster_labels, 1)
    colnames(clusters_matrix)[1] <- label
    all_clusters <- all_clusters[-1]
    cluster_labels <- cluster_labels[-which(cluster_labels==label)]
  }
  #now start comparing every cluster in "all_clusters" to existing clusters in "clusters_matrix" and add them to the matrix whenever they are sufficiently different; if they are not sufficiently different, merge the current cluster with the cluster in "clusters_matrix" that it is most similar to
  cluster_similarity <- function(clust1, clust2) { #The similarity metric between two clusters (vectors of probabilities assigned to members of sigma): the cosine of the angle between them
    sum( clust1/sqrt(sum(clust1^2)) * clust2/sqrt(sum(clust2^2)) )
  }
  similarity_threshold <- .9
  #Now, integrate all clusters into the clusters_matrix
  while (length(all_clusters) > 0) {
    similarities <- apply( clusters_matrix, 2, cluster_similarity, all_clusters[[1]]) #compute the similarity of each cluster already present in the cluster matrix to the current cluster you're trying to add
    maximal_similarity <- max(similarities) #Find the greatest similarity value between the cluster to be added and any cluster already in the clusters matrix
    clusters_matrix <- cbind(clusters_matrix, all_clusters[[1]]) #add the cluster in question as a new column to the clusters matrix
    if (maximal_similarity < similarity_threshold) { #if the maximal similarity of the new cluster to any other cluster is not large enough
      label <- sample(cluster_labels, 1) #choose a random, unique label for it
      colnames(clusters_matrix)[ncol(clusters_matrix)] <- label #rename the new column to the label just found
      all_clusters <- all_clusters[-1] #remove the first cluster on the "all_clusters" list
      cluster_labels <- cluster_labels[-which(cluster_labels==label)] #remove the label just assigned from the range of cluster labels which may be assigned
    } else { #if the maximal similarity is at or above the threshold
      cluster_to_be_modified <- which(similarities==maximal_similarity) #find the position of the maximally similar cluster in the cluster matrix
      #If there's more than one column in clusters_matrix that the current cluster is maximally similar to, just pick one at random, even when the cluster at hand is equally similar to two clusters with different labels.
      pick_one <- sample(length(cluster_to_be_modified),1)
      cluster_to_be_modified <- cluster_to_be_modified[pick_one] #pick one random member of "cluster_to_be_modified"
      old_label <- names(cluster_to_be_modified) #The vector "cluster_to_be_modified" comes with names for each value that correspond to the name of the cluster the values stand for; so, "old_label" is the cluster label associated with the cluster that the currently considered cluster is about to be merged with.
      colnames(clusters_matrix)[ncol(clusters_matrix)] <- old_label
      all_clusters <- all_clusters[-1]
    }
  }
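  #An illustrative example of the similarity metric (not executed): for the membership vectors
  #   clust1 = c(1, 0, 0) and clust2 = c(1, 1, 0),
  #cluster_similarity(clust1, clust2) is 1/sqrt(2), roughly 0.71, which falls below the 0.9 threshold,
  #so two such clusters would be kept apart under different labels rather than merged.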
  #UPDATE GRAMMAR
  #Now, add the constraints in the neighborhood set to the grammar!
  num.of.new.cons <- length(output_frame$Polarity) #This is how many new constraints are about to be added to the grammar
  while (length(output_frame$Polarity) > 0 ) {
    if (is.null(cons.matrix)) { #if cons.matrix has not yet been filled in
      cons.matrix <- matrix( data = violate(output_frame[1,], rep.space=omega), #take the first constraint in the output frame, and enter the violations of that constraint as the first column in the constraint matrix
                             ncol = 1,
                             dimnames = list(omega, paste(unlist(output_frame[1,]),collapse="") ) #the rows are named after the members of omega; the column is named after the constraint definition
                           )
    } else {
      cons.matrix <- cbind(cons.matrix, violate(output_frame[1,], rep.space=omega) ) #add the vector of violations of the constraint under consideration to the constraint matrix
      colnames(cons.matrix)[ncol(cons.matrix)] <- paste(unlist(output_frame[1,]),collapse="") #the names of the constraint matrix columns are the shortened constraint definitions
    }
    output_frame <- output_frame[-1,] #remove the first line of "output_frame", which has just been added to the grammar
  }
  #Optimize the weights of these constraints
  #If the variable "weights" already exists, then simply append as many zeroes as there are new constraints to "weights", so that the initial weights given to the optimization algorithm are the optimal weights for old constraints and zeroes for new constraints
  #Otherwise, create "weights" and put as many zeroes in it as there are new constraints.
  if (exists("weights")) {weights <- c(weights, rep(0, num.of.new.cons) ) } else { weights <- rep(0, num.of.new.cons) }
  #Give lower bounds on constraint weights to the optimization algorithm
  lower <- rep(0, length(weights))
  optimized <- solve(cons.matrix, weights, lower=lower, var=1000) #Optimize the weights of the constraints currently in the grammar
  weights <- optimized$par #read off the optimized weights
  q <- exp(c(cons.matrix %*% weights)) / sum( exp(c(cons.matrix %*% weights)) ) #compute the predicted probabilities of the members of omega on the basis of the weights just computed
  prob.frame$q <- q #Update the predicted probabilities in the master data frame.
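  #For orientation (a descriptive note): the q computed above implements the maximum entropy model
  #   q(x) = exp( sum_k w_k * c_k(x) ) / Z,   with Z = sum over all candidates y of exp( sum_k w_k * c_k(y) ),
  #where c_k(x) is the (negative) violation count of constraint k for candidate x, as stored in cons.matrix.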
  #Output the currently generated grammar
  writeLines("Constraint weights assigned at this cycle:\n")
  print(data.frame(constraints=colnames(cons.matrix), weights=weights))
  writeLines("\n")

  #UPDATE CLUSTERS
  #When the distributions have been updated, they are projected into statements of discrete sets of segments which are part of every label
  #This is done as follows: all columns of "clusters_matrix" that have a certain label are collected, and the average of their probabilities is taken; given this result, all segments which have a probability above a certain cutoff value (here, .5) are assigned to that cluster; the content of a cluster may change over the course of the simulation
  #The set of all possible constraints is then updated accordingly
  cluster_names <- colnames(clusters_matrix)[!(duplicated(colnames(clusters_matrix)))] #Find all unique cluster labels in clusters_matrix
  #A procedure which averages the values of all columns associated with a particular label, and projects from the resulting probabilities a set of segments
  new_cluster <- c() #This is to be the vector of clusters that were just added at this cycle of the simulation
  for (name in cluster_names) {
    homonym_matrix <- clusters_matrix[,which(colnames(clusters_matrix)==name)] #find all the columns in clusters_matrix which correspond to the same label
    if (length(which(colnames(clusters_matrix)==name)) > 1) {
      homonym_average <- rowSums(homonym_matrix)/ncol(homonym_matrix) #average over all the columns which correspond to the label in question
    } else {
      homonym_average <- homonym_matrix #if the homonym matrix only has one column, then the average of the matrix is the same as the matrix itself
    }
    name_content <- names(homonym_average[which(homonym_average > 0.5)]) #The segments that belong to a label are those which belong to it with more than 0.5 probability.
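    #An illustrative (hypothetical) example of the projection step: if the averaged membership probabilities
    #for a label were m = 0.95, n = 0.92, q = 0.88 and (near) 0 for all other segments, then name_content
    #would be c("m","n","q") and the label would be translated into the regular expression "[mnq]" below.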
    reg.expression <- paste(name_content,sep="", collapse="") #Put all the segments that belong to the cluster together
    reg.expression <- paste("[",reg.expression,"]",sep="") #Put brackets around them so they form an actually usable regular expression
    if ( length(which(cluster_translation$Cluster.name==name)) > 0 ) { #if the cluster label is already in cluster_translation
      cluster_translation[which(cluster_translation$Cluster.name==name), 2] <- reg.expression #then just overwrite the value for the reg. expression associated with that label
    } else {
      cluster_translation <- rbind(cluster_translation, data.frame( Cluster.name = name, Reg.expression = reg.expression ) )
      new_cluster <- c(new_cluster, name) #only clusters that have just been added are part of "new_cluster"
    }
  }

  #COMPUTE INTERSECTIONS OF FEATURE CLASSES
  if ( length(new_cluster) > 0 ) { #If there are any new clusters
    #For every new cluster, find its intersections with all other clusters in cluster_translation
    new_cluster_regexp <- subset(cluster_translation, cluster_translation$Cluster.name %in% new_cluster)$Reg.expression #Find all the reg. expressions corresponding to the new clusters
    cluster_regexp <- cluster_translation$Reg.expression #all reg. expressions generated up until now
    two_regexp_combinations <- expand.grid(cluster_regexp, new_cluster_regexp, "", stringsAsFactors = F) #this creates all the combinations of 2 reg. expressions whose intersection is to be computed
    three_regexp_combinations <- expand.grid(cluster_regexp, cluster_regexp, new_cluster_regexp, stringsAsFactors = F) #this creates all the combinations of 3 reg. expressions whose intersection is to be computed
    regexp_combinations <- rbind(two_regexp_combinations, three_regexp_combinations) #These are all combinations of new reg. expressions with new & old reg. expressions that have 2 or 3 members
    #Procedure for computing the intersection of the reg. expressions in a row of "regexp_combinations"
    compute.intersections <- function(rowcontents) { #Take three reg. expressions
      rowcontents <- as.character(rowcontents) #Make sure the data frame cells are characters, not factor levels
      rowcontents.2 <- lapply(rowcontents, function(x){
        reg.char <- unlist(strsplit(x, ""))
        reg.char <- reg.char[-c(1,which(reg.char=="]"))]
      }) #For each reg. expression, get its characters, and chop off the brackets at the beginning and end
      #Intersect the first two reg. expressions.
      intersection1 <- intersect(rowcontents.2[[1]], rowcontents.2[[2]])
      #Now, if the third reg. expression is non-empty, then compute the intersection of the first two reg. expressions and the third one; otherwise, do nothing
      if (length(rowcontents.2[[3]]) > 0) {
        intersection2 <- intersect(intersection1, rowcontents.2[[3]])
      } else {
        intersection2 <- intersection1
      }
      #If the resulting intersection is non-null, paste it together as a regular expression
      if (length(intersection2) > 0) {
        result <- paste(intersection2, collapse="", sep="")
        paste("[", result, "]", collapse="",sep="")
      }
    } #endfunction
    all_intersections <- unlist(apply(regexp_combinations, 1, compute.intersections))
    #Find all unique intersections
    unique_intersections <- unique(all_intersections)
    #Remove all intersections that are identical to earlier created clusters (guarded so that an empty match does not empty the vector)
    if (length(which(unique_intersections %in% total_regexp)) > 0) {
      unique_intersections <- unique_intersections[-which(unique_intersections %in% total_regexp)]
    }
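    #An illustrative (hypothetical) example of the intersection step: for the row c("[ptk]", "[pbm]", ""),
    #compute.intersections() strips the brackets, intersects the two segment sets, and returns "[p]";
    #for c("[ptk]", "[bdg]", "") the intersection is empty and nothing is returned.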
    #"New feature combinations" are those that are either newly found clusters or new intersections of features.
    new_feature_combinations <- c(new_cluster_regexp, unique_intersections)
    #record the clusters and cluster intersections just created in the big book of clusters and intersections
    total_regexp <- c(total_regexp, new_feature_combinations)
    #UPDATE CONSTRAINT SPACE
    #Add the new features to supersigma, so they can be used in new constraints
    supersigma <- c(supersigma, new_feature_combinations)
  } else { #endif conditional on there being some new clusters
    new_cluster_regexp <- c() #If there are no clusters to be found at this iteration, output an empty vector
    new_cluster_regexp
  }
  writeLines("Total set of phonological classes induced up until now:\n")
  print( cluster_translation$Reg.expression )
  writeLines("\n")

  #COMPUTE FIT OF MODEL
  #The fit of the model to the data is approximated by computing the sum of the predicted probabilities of the observed candidates
  model.fit <- sum(subset(prob.frame, prob.frame$p > 0)$q )
  writeLines("Total likelihood of observed data:\n")
  print( model.fit )
  writeLines("\n")
  #Now, loop this back into the random constraint search procedure.
  iteration <- iteration + 1 #This variable will show in the output that a new cycle has begun
  print("not done yet")
} #This bracket closes the big while-loop that this whole part of the code is enclosed in, so that there is a loop from updating the constraint space back to random constraint selection.

if (!exists("run")) { run <- 0 } #initialize the run counter if this is the first run of the simulation in this session
run <- run + 1
final.grammar <- subset(data.frame(constraints=colnames(cons.matrix), weights=weights), weights!=0) #The final grammar consists of all constraints with non-zero weights, plus those weights
final.grammar$data_likelihood <- rep(model.fit,nrow(final.grammar)) #add the total data likelihood of the current run as a column to the data frame
final.grammar$final_cycle <- rep(iteration-1, nrow(final.grammar)) #add the final cycle number as a column to the data frame
final.grammar$run <- rep(run, nrow(final.grammar)) #add the number of the current run as a column to the data frame
phonological_classes <- cluster_translation$Reg.expression #obtain the phonological classes induced at this run
phonological_classes <- paste(phonological_classes, collapse = "") #paste these phonological classes together into one string
final.grammar$phonological_classes <- rep(phonological_classes, nrow(final.grammar)) #put these in as an extra column in the data frame
write.table(final.grammar, file=paste(filename,".csv",sep=""), append=T, row.names=F) #write the final grammar to file