# Load the NYPD historic complaint data; the literal string "(null)" is read
# as NA. Column names are normalised once with janitor::clean_names().
# Alternative remote source (presumably the same file -- confirm before use):
# df_nypd = read_csv("https://www.dropbox.com/scl/fi/kf2zk4t1onxzm2vo3lpkq/NYPD_Complaint_Data_Historic.csv?rlkey=ly36vi9v66sno80eir6rohlwn&dl=1", na = "(null)")
df_nypd <- read_csv("data/NYPD_Complaint_Data_Historic.csv", na = "(null)") |>
  janitor::clean_names() |>
  mutate(
    # Complaint start date is parsed as month/day/year.
    cmplnt_fr_dt = lubridate::mdy(cmplnt_fr_dt),
    # Calendar year of the complaint, used for the yearly breakdowns.
    year = lubridate::year(cmplnt_fr_dt)
  )
Since the dataset contains 69 crime types in total, displaying all of them in a single visualization would be cluttered and hard to read. To enhance clarity and focus, we limit our visualizations to the 10 most frequent crime types, as these account for the largest share of our dataset.
# Tabulate complaints per offense description, most frequent first, attach a
# rank, and print the full ranking with display-friendly column names.
df_nypd |>
  count(ofns_desc, name = "count", sort = TRUE) |>
  mutate(Rank = row_number()) |>
  select(
    `Crime Type` = ofns_desc,
    `Number of Cases` = count,
    Rank
  ) |>
  kable()
Crime Type | Number of Cases | Rank |
---|---|---|
PETIT LARCENY | 541926 | 1 |
HARRASSMENT 2 | 432087 | 2 |
ASSAULT 3 & RELATED OFFENSES | 303502 | 3 |
CRIMINAL MISCHIEF & RELATED OF | 279340 | 4 |
GRAND LARCENY | 256120 | 5 |
FELONY ASSAULT | 131263 | 6 |
OFF. AGNST PUB ORD SENSBLTY & | 112260 | 7 |
ROBBERY | 84824 | 8 |
MISCELLANEOUS PENAL LAW | 83509 | 9 |
BURGLARY | 78949 | 10 |
DANGEROUS DRUGS | 75042 | 11 |
GRAND LARCENY OF MOTOR VEHICLE | 49605 | 12 |
VEHICLE AND TRAFFIC LAWS | 47947 | 13 |
OFFENSES AGAINST PUBLIC ADMINI | 40789 | 14 |
DANGEROUS WEAPONS | 39984 | 15 |
SEX CRIMES | 38271 | 16 |
FORGERY | 27503 | 17 |
INTOXICATED & IMPAIRED DRIVING | 22812 | 18 |
THEFT-FRAUD | 20660 | 19 |
CRIMINAL TRESPASS | 16189 | 20 |
FRAUDS | 12988 | 21 |
OFFENSES INVOLVING FRAUD | 9874 | 22 |
POSSESSION OF STOLEN PROPERTY | 9016 | 23 |
UNAUTHORIZED USE OF A VEHICLE | 8994 | 24 |
RAPE | 8338 | 25 |
OFFENSES AGAINST THE PERSON | 6816 | 26 |
OTHER OFFENSES RELATED TO THEF | 6122 | 27 |
ADMINISTRATIVE CODE | 5835 | 28 |
ARSON | 4225 | 29 |
NYS LAWS-UNCLASSIFIED FELONY | 3461 | 30 |
OTHER STATE LAWS (NON PENAL LA | 1896 | 31 |
MURDER & NON-NEGL. MANSLAUGHTER | 1738 | 32 |
BURGLAR’S TOOLS | 1687 | 33 |
THEFT OF SERVICES | 1510 | 34 |
GAMBLING | 1076 | 35 |
AGRICULTURE & MRKTS LAW-UNCLASSIFIED | 728 | 36 |
KIDNAPPING & RELATED OFFENSES | 704 | 37 |
FRAUDULENT ACCOSTING | 701 | 38 |
PETIT LARCENY OF MOTOR VEHICLE | 688 | 39 |
ALCOHOLIC BEVERAGE CONTROL LAW | 543 | 40 |
PROSTITUTION & RELATED OFFENSES | 512 | 41 |
OFFENSES AGAINST PUBLIC SAFETY | 382 | 42 |
OFFENSES RELATED TO CHILDREN | 262 | 43 |
DISORDERLY CONDUCT | 168 | 44 |
ENDAN WELFARE INCOMP | 145 | 45 |
OTHER STATE LAWS | 136 | 46 |
CHILD ABANDONMENT/NON SUPPORT | 128 | 47 |
JOSTLING | 123 | 48 |
CANNABIS RELATED OFFENSES | 75 | 49 |
HOMICIDE-NEGLIGENT,UNCLASSIFIE | 73 | 50 |
NA | 71 | 51 |
KIDNAPPING | 58 | 52 |
NYS LAWS-UNCLASSIFIED VIOLATION | 56 | 53 |
ANTICIPATORY OFFENSES | 51 | 54 |
ESCAPE 3 | 48 | 55 |
LOITERING/GAMBLING (CARDS, DIC | 44 | 56 |
FELONY SEX CRIMES | 40 | 57 |
HOMICIDE-NEGLIGENT-VEHICLE | 28 | 58 |
UNLAWFUL POSS. WEAP. ON SCHOOL | 27 | 59 |
NEW YORK CITY HEALTH CODE | 22 | 60 |
INTOXICATED/IMPAIRED DRIVING | 20 | 61 |
DISRUPTION OF A RELIGIOUS SERV | 9 | 62 |
OTHER STATE LAWS (NON PENAL LAW) | 6 | 63 |
ABORTION | 2 | 64 |
LOITERING | 2 | 65 |
OFFENSES AGAINST MARRIAGE UNCL | 2 | 66 |
OTHER TRAFFIC INFRACTION | 2 | 67 |
FORTUNE TELLING | 1 | 68 |
KIDNAPPING AND RELATED OFFENSES | 1 | 69 |
# Step 1: count complaints per offense key code, most frequent first
df_summary <- df_nypd %>%
  group_by(ky_cd) %>%
  summarise(count = n()) %>%
  arrange(desc(count))

# Total number of observations
total_n <- sum(df_summary$count)

# Cutoff: cumulative share of all complaints (in percent) the kept codes
# should reach
cutoff <- 80

# Running share of all complaints covered by the codes seen so far
df_summary <- df_summary |>
  mutate(cumulative_percentage = (cumsum(count) / total_n) * 100)

# Index of the first row whose cumulative share reaches the cutoff. The
# cumulative share never decreases, so that index is one more than the number
# of rows still below the cutoff. Capping at nrow() keeps every row when the
# cutoff is never reached (which.max() would silently return 1 in that case).
position <- min(
  sum(df_summary$cumulative_percentage < cutoff) + 1L,
  nrow(df_summary)
)

# Keep the top categories up to this position
df_summary <- df_summary[seq_len(position), ]
# Restrict the complaints to the key codes retained in df_summary, then
# prepare the description and time-of-day columns used by the plots.
df_crime_patterns <- df_nypd |>
  filter(ky_cd %in% df_summary$ky_cd) |>
  # Every row with key code 361 gets one fixed description
  # (NOTE(review): presumably the code carries several raw descriptions --
  # confirm against the data).
  mutate(
    ofns_desc = ifelse(
      ky_cd == 361,
      "OFF. AGNST PUB ORD SENSBLTY & PLACE FALSE BOMB",
      ofns_desc
    )
  ) |>
  # Same treatment for key code 126.
  mutate(
    ofns_desc = ifelse(
      ky_cd == 126,
      "MISCELLANEOUS PENAL LAW",
      ofns_desc
    )
  ) |>
  # Convert the complaint time with lubridate::hms() and pull out its hour.
  mutate(cmplnt_fr_tm = lubridate::hms(cmplnt_fr_tm)) |>
  mutate(hour = lubridate::hour(cmplnt_fr_tm))
# Attach the offense description to each retained key code by joining on the
# distinct (ky_cd, ofns_desc) pairs found in df_crime_patterns.
df_summary <- df_summary |>
  left_join(
    distinct(df_crime_patterns, ky_cd, ofns_desc),
    by = "ky_cd"
  )

# Print the summary with display-friendly column names.
df_summary |>
  rename(
    `Cumulative Percentage (%)` = cumulative_percentage,
    `Number of Cases` = count,
    `Crime Type` = ofns_desc,
    `Crime Code` = ky_cd
  ) |>
  kable()
Crime Code | Number of Cases | Cumulative Percentage (%) | Crime Type |
---|---|---|---|
341 | 541926 | 19.54989 | PETIT LARCENY |
578 | 432087 | 35.13735 | HARRASSMENT 2 |
344 | 303502 | 46.08613 | ASSAULT 3 & RELATED OFFENSES |
109 | 256120 | 55.32562 | GRAND LARCENY |
351 | 210043 | 62.90288 | CRIMINAL MISCHIEF & RELATED OF |
106 | 131263 | 67.63817 | FELONY ASSAULT |
361 | 112266 | 71.68815 | OFF. AGNST PUB ORD SENSBLTY & PLACE FALSE BOMB |
105 | 84824 | 74.74816 | ROBBERY |
126 | 80377 | 77.64775 | MISCELLANEOUS PENAL LAW |
107 | 78949 | 80.49582 | BURGLARY |
The table above shows that the 10 most frequent crime categories account for just over 80% of all reported crimes in NYC. Concentrating on these categories therefore gives a focused overview of the primary crime types in the city.
# crime_counts = df_crime_patterns |>
# group_by(year,ofns_desc) |>
# summarize(count = n())
#
# crime_counts |>
# write.csv("Top10CrimeMap/data/crime_counts.csv")
# df_summary |>
# write.csv("Top10CrimeMap/data/df_summary.csv", col.names = TRUE)
#
# set.seed(123)
# df_crime_patterns|>
# sample_frac(0.30) |>
# write.csv("Top10CrimeMap/data/df_crime_patterns_sample.csv", col.names = TRUE)
# Stacked bar chart: number of cases per year, one fill colour per offense
# description, rendered as an interactive plotly widget.
p_1 <- df_crime_patterns |>
  group_by(ofns_desc, year) |>
  # Drop the grouping explicitly: matches p_2 and avoids summarize()'s
  # regrouping message.
  summarize(count = n(), .groups = "drop") |>
  ggplot(aes(x = year, y = count, fill = ofns_desc)) +
  geom_col(position = "stack") +
  scale_fill_viridis_d() +
  labs(x = "Year", y = "Number of Cases", fill = "Offense Type") +
  theme_minimal() +
  # Tilt the x-axis tick labels.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplotly(p_1)
This data visualization presents a stacked bar chart showing the number of cases for different offense types in New York City over 6 years: 2017-2022. Each color in the bars corresponds to a specific type of offense, as indicated in the legend on the right.
The total number of offenses shows a clear upward trend from 2017 to 2019, followed by a noticeable decline in 2020. This dip could be attributed to the lockdowns, restrictions, and changes in social behavior during the pandemic. From 2020 to 2022, offenses increase again, which could indicate a return to pre-pandemic patterns or a response to evolving social and economic conditions. To learn more about general crime’s temporal change, composition, and their correlations with Borough and Law Category: click here
(Data Completeness: The visualization does not include all offense types, focusing instead on the top 10, which allows for a clearer analysis of the most significant crime trends)
# Line chart of case counts by hour of day, one line per offense description
# and one panel per year, rendered as an interactive plotly widget.
p_2 <- df_crime_patterns |>
  count(year, ofns_desc, hour, name = "count") |>
  ggplot(aes(x = hour, y = count, color = ofns_desc)) +
  geom_line() +
  facet_wrap(~year) +
  labs(x = "Hour", y = "Count", color = "Offense Type") +
  theme_minimal()

ggplotly(p_2)
The visualization is a line plot showing the number of cases by hour of day for the 10 most frequent offense types, with a separate panel for each year from 2017 to 2022.
Daily Patterns: For offenses such as petit larceny, harassment 2, and grand larceny, there is a noticeable increase in cases during the hours of 11 AM to 6 PM. These peaks suggest that these types of offenses occur more frequently during the daytime, potentially aligning with higher population activity and opportunities for these crimes. Across all examined offenses, there is a marked decrease in frequency around the hours of 5 AM and 6 AM, indicating these early morning hours are when offenses are least common. This pattern could be related to lower pedestrian and vehicle traffic, reduced social interaction, and the fact that many potential victims and perpetrators are likely to be at home during these hours.
Yearly Differences: By comparing across panels, we can see the patterns hold steady over 2017-2022. This consistency suggests that the hourly trends in criminal activity for these offense types have not undergone significant changes over the years. Such stability in crime patterns might imply that underlying factors influencing crime rates—such as social behavior, economic conditions, and law enforcement practices—have remained relatively constant during this period.
From this visualization, stakeholders such as law enforcement or community leaders could draw insights into when to allocate resources more effectively to prevent or respond to these offenses. Additionally, it could prompt further investigation into why certain offenses peak at specific hours and how this information could inform prevention strategies.
This map highlights that Manhattan, Queens, and the Bronx are areas with more crime activities. This could be due to various factors like population density, socio-economic conditions, or policing strategies. An interesting observation is that the Upper East Side in Manhattan has significantly lower crime rates compared to its neighboring areas. This could be attributed to factors like higher income levels, better security measures, or lower population density.