For this week’s Tidy Tuesday, I investigate the links between state names, particularly their etymological roots in natural elements, and the geographic area per person in each state!

Word Cloud

A word cloud reveals patterns and frequencies in the etymological roots of the state names. It highlights some natural elements, specifically ‘river’ (or ‘water’) and ‘mountain’, in the meanings of the state names.

library(tm)
library(SnowballC)
library(wordcloud)

# Build a word cloud from the `meaning` column of the state-name etymology
# table: clean the text, count the stemmed terms, and plot the frequent ones.
etymology_df <- read.csv("./state_name_etymology.csv")
corpus <- Corpus(VectorSource(etymology_df$meaning))

# Text Cleaning
# Stopwords are removed before stemming; the banned words further down are
# removed after stemming, so they are written in their stemmed form.
corpus_clean <- tm_map(corpus, content_transformer(tolower)) # Convert to Lower Case
corpus_clean <- tm_map(corpus_clean, content_transformer(removeNumbers)) # Remove Numbers
corpus_clean <- tm_map(corpus_clean, content_transformer(removePunctuation)) # Remove Punctuation
corpus_clean <- tm_map(corpus_clean, content_transformer(removeWords), stopwords("en")) # Remove Stopwords
corpus_clean <- tm_map(corpus_clean, content_transformer(stripWhitespace)) # Remove Whitespace
corpus_clean <- tm_map(corpus_clean, content_transformer(stemDocument)) # Stemming to bring words to root form
banned_words <- c("name", "mean", "refer", "word")
corpus_clean <- tm_map(corpus_clean, removeWords, banned_words)

# Term frequencies across all documents, most frequent first.
# rowSums() works directly on the matrix, so no data-frame conversion is needed.
tdm <- TermDocumentMatrix(corpus_clean)
word_freqs <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)

# Only terms that occur at least 3 times are drawn.
wordcloud(
  names(word_freqs),
  word_freqs,
  min.freq = 3,
  random.order = FALSE,
  colors = brewer.pal(8, "Dark2"),
  scale = c(4, 0.5)
)

# View() opens a data viewer, which is only useful in an interactive session;
# skip it when the script is run non-interactively (e.g. Rscript).
if (interactive()) {
  View(etymology_df)
}

Data processing

The states were categorized by their etymology into three distinct groups: ‘water’, ‘mountain’, or ‘others’. The land area per person (km²) and water area per person (km²) were then calculated. You can expand the code snippet to see more.

library(GWalkR)
# Load the per-state statistics and derive the per-capita area measures.
df <- read.csv("./states.csv")
df$"land area per person (km2)" <- df$land_area_km2 / df$population_2020
df$"water area per person (km2)" <- df$water_area_km2 / df$population_2020

# Strip a trailing "[B]" marker from state names before they are compared
# with etymology_df$state below.
df$state <- sub("\\[B]$", "", df$state)

# Classify each state by keywords found in its etymology text.
# Every state starts as "Others". Within one etymology row, "mountain" is
# applied after "water" and therefore wins when both keywords appear. When a
# state has several etymology rows, a later matching row overwrites the
# classification set by an earlier one.
df$etymology <- "Others"
# seq_len() yields an empty sequence for a zero-row table, whereas 1:nrow()
# would iterate over c(1, 0) and fail on the i = 0 step.
for (i in seq_len(nrow(etymology_df))) {
  meaning <- etymology_df$meaning[i]
  state_rows <- df$state == etymology_df$state[i]
  if (grepl("river|water", meaning, ignore.case = TRUE)) {
    df$etymology[state_rows] <- "water"
  }
  if (grepl("mountain", meaning, ignore.case = TRUE)) {
    df$etymology[state_rows] <- "mountain"
  }
}

Data Viz

The bar charts allow a clear comparison between states and show how the etymology aligns with the geographical distribution per capita. The interface below allows you to freely explore and edit my data visualization! Powered by GWalkR.

# Saved chart specification (JSON) that is passed to gwalkr() via `visConfig`.
# It puts `state` on the rows, the two per-person area measures on the columns,
# and colours by `etymology`.
# The literal must stay on a single line: a raw line break inside a JSON string
# value is an unescaped control character, and it would also stop the field
# name "land area per person (km2)" from matching the data column.
visConfig <- '[{"visId":"gw_vRqk","name":"States","encodings":{"dimensions":[{"dragId":"gw_kAMc","fid":"c3RhdGU=","name":"state","semanticType":"nominal","analyticType":"dimension"},{"dragId":"gw_hqjh","fid":"ZXR5bW9sb2d5","name":"etymology","semanticType":"nominal","analyticType":"dimension"},{"dragId":"gw_OBVQ","fid":"YWRtaXNzaW9u","name":"admission","semanticType":"nominal","analyticType":"dimension"}],"measures":[{"dragId":"gw_GEid","fid":"bGFuZCBhcmVhIHBlciBwZXJzb24gKGttMik=","name":"land area per person (km2)","analyticType":"measure","semanticType":"quantitative","aggName":"sum"},{"dragId":"gw_1h84","fid":"d2F0ZXIgYXJlYSBwZXIgcGVyc29uIChrbTIp","name":"water area per person (km2)","analyticType":"measure","semanticType":"quantitative","aggName":"sum"},{"dragId":"gw_dyip","fid":"cG9wdWxhdGlvbl8yMDIw","name":"population_2020","analyticType":"measure","semanticType":"quantitative","aggName":"sum"},{"dragId":"gw_KVE9","fid":"dG90YWxfYXJlYV9rbTI=","name":"total_area_km2","analyticType":"measure","semanticType":"quantitative","aggName":"sum"},{"dragId":"gw_0rRI","fid":"bGFuZF9hcmVhX2ttMg==","name":"land_area_km2","analyticType":"measure","semanticType":"quantitative","aggName":"sum"},{"dragId":"gw_5QId","fid":"d2F0ZXJfYXJlYV9rbTI=","name":"water_area_km2","analyticType":"measure","semanticType":"quantitative","aggName":"sum"},{"dragId":"gw_xrdo","fid":"bl9yZXByZXNlbnRhdGl2ZXM=","name":"n_representatives","analyticType":"measure","semanticType":"quantitative","aggName":"sum"},{"dragId":"gw_count_fid","fid":"gw_count_fid","name":"Row count","analyticType":"measure","semanticType":"quantitative","aggName":"sum","computed":true,"expression":{"op":"one","params":[],"as":"gw_count_fid"}}],"rows":[{"dragId":"gw_tBZk","fid":"c3RhdGU=","name":"state","semanticType":"nominal","analyticType":"dimension","sort":"descending"}],"columns":[{"dragId":"gw_XzgL","fid":"bGFuZCBhcmVhIHBlciBwZXJzb24gKGttMik=","name":"land area per person (km2)","analyticType":"measure","semanticType":"quantitative","aggName":"sum"},{"dragId":"gw_Wni6","fid":"d2F0ZXIgYXJlYSBwZXIgcGVyc29uIChrbTIp","name":"water area per person (km2)","analyticType":"measure","semanticType":"quantitative","aggName":"sum"}],"color":[{"dragId":"gw_FERl","fid":"ZXR5bW9sb2d5","name":"etymology","semanticType":"nominal","analyticType":"dimension"}],"opacity":[],"size":[],"shape":[],"radius":[],"theta":[],"details":[],"filters":[],"text":[]},"config":{"defaultAggregated":false,"geoms":["auto"],"stack":"stack","showActions":false,"interactiveScale":false,"sorted":"none","zeroScale":true,"size":{"mode":"fixed","width":537,"height":586},"format":{}}}]'

# Launch the interactive explorer on every row whose state is not "Alaska",
# restricted to the columns below and using the saved chart specification.
viz_columns <- c(
  "state",
  "land area per person (km2)",
  "water area per person (km2)",
  "etymology",
  "admission",
  "population_2020",
  "total_area_km2",
  "land_area_km2",
  "water_area_km2",
  "n_representatives"
)
not_alaska <- df$state != "Alaska"
gwalkr(data = df[not_alaska, viz_columns], visConfig = visConfig)

Author’s info

   

