Rewrite Time interval Fuzzy join to be less memory intensive

72 views Asked by At

This question is expanding on this post: Pairing Time series Data with Batch Data in R

A good solution was given to me that worked for the dputs I provided but the problem is that my dataset is quite large and I guess the combinatorial explosion caused by the way I needed to join one table on another based on time intervals was overloading my memory. The apexdata set is 78592 obs of 17 variables. The ibadata set is 175200 obs of 9 variables. When the join is happening, I would think its possible that for every 1 row of apexdata, potentially up to 40 rows of ibadata could be joined... Maybe more but this is really highly variable.

I tried some measurements of memory being used using pryr and got the below.


> object_size(apexdata)
15.09 MB
> object_size(ibadata)
12.62 MB
> mem_used()
140 MB

I would like help figuring out a pathforward towards efficiently getting a result. One non-coding solution that occurs could be to chunk out the dataset into multiple smaller datasets and run the same code multiple times. If this is the best way, would decreasing the row count be the best way to do that?

Looking for similar solutions, it seems like data.table might have less memory intensive ways of doing this using foverlaps or non equi joins. Would that be appropriate?

library(dplyr)
library(fuzzyjoin)

# Use fuzzy_left_join() to define join fields
result <- ibadata %>%
  fuzzy_left_join(apexdata,
                  by = c("time" = "starttime",
                         "time" = "stoptime"),
                  match_fun = list(`>=`, `<`)) %>%
  filter(!is.na(Unit)) %>%
  group_by(Unit) %>%
  summarise_at(vars(a:w), mean, na.rm = TRUE)

# Result (truncated for readability)
data.frame(result[,1:8])

Dputs:

dput(head(apexdata,20))
structure(list(Unit = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 
13, 14, 15, 16, 17, 18, 19, 20), starttime = structure(c(1705190403, 
1705190406, 1705190410, 1705195920, 1705195920, 1705195920, 1705196580, 
1705196880, 1705197120, 1705197420, 1705197660, 1705197960, 1705198200, 
1705198500, 1705198740, 1705199040, 1705199280, 1705199580, 1705199820, 
1705200120), tzone = "UTC", class = c("POSIXct", "POSIXt")), 
    stoptime = structure(c(1705190408, 1705190412, 1705190420, 
    1705200540, 1705200900, 1705201140, 1705201440, 1705201860, 
    1705202100, 1705202400, 1705202640, 1705202940, 1705203180, 
    1705203480, 1705203720, 1705204020, 1705204260, 1705204560, 
    1705204800, 1705205100), tzone = "UTC", class = c("POSIXct", 
    "POSIXt"))), row.names = c(NA, -20L), class = c("tbl_df", 
"tbl", "data.frame"))
dput(head(ibadata,30))
structure(list(time = structure(c(1705190400, 1705190401, 1705190402, 
1705190403, 1705190404, 1705190405, 1705190406, 1705190407, 1705190408, 
1705190409, 1705190410, 1705190411, 1705190412, 1705190413, 1705190414, 
1705190415, 1705190416, 1705190417, 1705190418, 1705190419, 1705190420, 
1705190421, 1705190422, 1705190423, 1705190424, 1705190425, 1705190426, 
1705190427, 1705190428, 1705190429), tzone = "UTC", class = c("POSIXct", 
"POSIXt")), a = c(21839, 21839, 21839, 2184, 21844, 218453, 2185, 
2185, 218606, 21862, 218674, 21868, 218625, 21862, 218691, 21873, 
21877, 21879, 218831, 21885, 218825, 21879, 218812, 21884, 218867, 
2189, 218902, 21891, 218922, 21897), b = c(2205, 220572, 22062, 
220687, 22073, 220743, 22079, 220816, 2209, 220947, 22108, 22108, 
221126, 22113, 221185, 22119, 221247, 22125, 221373, 22143, 221465, 
22148, 221532, 2216, 221649, 22171, 221738, 22177, 221782, 22183
), c = c(23435, 23435, 23435, 234374, 23447, 23447, 23447, 23447, 
23447, 23447, 234416, 23441, 234465, 23447, 234509, 23453, 23453, 
23453, 234605, 23464, 234615, 23458, 234606, 23464, 234667, 2347, 
234689, 23464, 234662, 23475), d = c(23308, 23308, 233077, 23307, 
23307, 23307, 23307, 23307, 23307, 23307, 233074, 23313, 23313, 
23313, 23313, 23313, 23313, 23313, 23313, 233152, 23319, 233213, 
23325, 23325, 23325, 233251, 23336, 23336, 23337, 233373), e = c(22611, 
22612, 22612, 22612, 226171, 22618, 226138, 22611, 226146, 22617, 
226207, 22623, 22623, 22623, 22623, 22623, 22623, 22623, 22623, 
22623, 22623, 22623, 226239, 22629, 22629, 226341, 22635, 226298, 
22629, 226343), f = c(2278, 22781, 227759, 22775, 22775, 22775, 
227786, 22781, 22781, 22781, 22781, 22781, 22781, 22781, 227816, 
22787, 22787, 22787, 22787, 22787, 22787, 22787, 22787, 227911, 
22792, 227869, 22786, 22786, 22786, 227888), g = c(18867, 18879, 
188788, 18867, 18867, 188632, 18861, 188649, 18867, 188629, 18861, 
18861, 18861, 188629, 18867, 188631, 18855, 18855, 18855, 18855, 
188492, 18849, 18849, 18849, 188533, 18855, 188594, 18861, 18852, 
18849), h = c(1773, 17731, 17736, 177348, 1773, 1773, 1773, 1773, 
1773, 1773, 177192, 17718, 177218, 17724, 17724, 17724, 17724, 
17724, 177211, 17717, 17717, 17717, 17717, 17717, 177159, 17711, 
177121, 17717, 177184, 17724), i = c(17, 17, 17, 17, 17, 17, 
17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 
17, 17, 17, 17, 17, 17, 17, 17), j = c(0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0), k = c(104835, 104835, 104835, 104835, 104835, 104835, 104835, 
104835, 104835, 104835, 104835, 104835, 104835, 104835, 104835, 
104835, 104835, 104835, 104835, 104835, 104835, 104835, 104835, 
104835, 104835, 104835, 104835, 104835, 104835, 104835), l = c(196186, 
198153, 198652, 199944, 198159, 197963, 199065, 195913, 196702, 
196708, 196879, 194930, 197112, 197875, 200390, 199174, 196824, 
197605, 197172, 200082, 196605, 196841, 198612, 197962, 197621, 
197014, 197729, 197984, 197198, 196946), m = c(193727, 194671, 
194557, 194623, 195706, 195879, 195672, 195781, 196027, 195989, 
195104, 194442, 194804, 195251, 194984, 195815, 196762, 197694, 
197746, 197808, 197274, 195542, 195577, 194481, 194785, 194834, 
195128, 195177, 196748, 196293), n = c(257690, 258295, 253273, 
263207, 287060, 291882, 293853, 294140, 291764, 292896, 295990, 
288650, 286251, 285801, 286529, 289491, 288419, 283850, 284192, 
286041, 285978, 287937, 286857, 292262, 292865, 280934, 285851, 
283258, 276197, 279833), o = c(261511, 261773, 261767, 259751, 
258722, 259089, 259701, 259877, 25959, 259962, 260708, 262357, 
261321, 260341, 261406, 261216, 260288, 260779, 261716, 262594, 
260852, 259573, 26066, 262512, 261615, 259187, 260353, 260503, 
258265, 257562), p = c(626936, 627071, 632219, 630111, 622312, 
622462, 620979, 620358, 613787, 611566, 620765, 620913, 621307, 
619567, 619049, 618754, 616256, 61397, 616649, 619968, 627465, 
628215, 623739, 627658, 62656, 621474, 623952, 622797, 624502, 
627092), q = c(624144, 6228, 624444, 624776, 623206, 62223, 623977, 
622446, 620586, 622345, 622479, 621403, 621453, 622914, 622879, 
623082, 625422, 62669, 627363, 628626, 62905, 628104, 627644, 
627727, 628124, 627282, 6289, 629243, 628948, 627812), r = c(930035, 
931337, 923942, 924188, 918944, 914098, 906264, 918388, 932029, 
931316, 94113, 949525, 95893, 960039, 957767, 94860, 950414, 
955721, 952335, 945857, 94316, 94025, 936534, 93450, 93505, 936947, 
932007, 938411, 942931, 944234), s = c(913782, 914331, 917071, 
925487, 947097, 956014, 957963, 957825, 959429, 959406, 95762, 
958156, 959762, 960561, 959082, 959443, 959142, 957543, 959185, 
958198, 958171, 959972, 959491, 961616, 95935, 959322, 958079, 
958469, 957539, 956692), t = c(908173, 90750, 911236, 904482, 
893898, 891823, 897302, 902655, 896257, 899124, 901424, 894936, 
898939, 91048, 904791, 900456, 899448, 892944, 898304, 912246, 
913413, 919507, 917222, 918116, 915059, 90899, 903138, 892807, 
889867, 900289), u = c(890908, 890406, 890249, 888641, 884488, 
881455, 881968, 880658, 88288, 884088, 880681, 878949, 880532, 
880853, 881552, 879144, 880398, 882238, 880577, 882038, 882487, 
881405, 881321, 882045, 882263, 888436, 898413, 900874, 900381, 
901677), v = c(113177, 114547, 114681, 115676, 113546, 113653, 
113833, 114229, 113681, 113456, 114761, 114993, 113711, 114364, 
114130, 113361, 113949, 114468, 115709, 114681, 114922, 114943, 
115193, 114952, 115059, 115273, 115799, 115822, 116976, 116562
), w = c(112682, 112565, 112509, 112642, 112719, 112711, 113057, 
11336, 113202, 113054, 113018, 113358, 113719, 113886, 113915, 
113897, 113863, 113981, 114042, 114342, 114658, 114683, 114728, 
114684, 114748, 114561, 114442, 114754, 115002, 115298)), row.names = c(NA, 
-30L), class = c("tbl_df", "tbl", "data.frame"))

What I got when running with the whole dataset:

> 
> ibadata$time <- dmy_hms(ibadata$time)
> apexdata <-mutate(apexdata,heatSC = paste(`MLH_HEAT_NO`,`BLI_STRAND_NO`,`CUT`,sep=","))
> apexdata$CHARGE_TIME<- as.POSIXct(apexdata$CHARGE_TIME, "%m/%d/%Y %H:%M:%S", tz = "UTC")
> apexdata$MILL_ENTER_TIME <- as.POSIXct(apexdata$MILL_ENTER_TIME, "%m/%d/%Y %H:%M:%S", tz = "UTC")
> 
> object_size(apexdata)
15.09 MB
> object_size(ibadata)
12.62 MB
> mem_used()
140 MB
> gc()
          used (Mb) gc trigger  (Mb) max used  (Mb)
Ncells 1417250 75.7    3291638 175.8  3228184 172.5
Vcells 7634496 58.3   13642305 104.1 11301568  86.3
> # Use fuzzy_left_join() to define join fields
> result <- ibadata %>%
+   fuzzy_left_join(apexdata,
+                   by = c("time" = "CHARGE_TIME",
+                          "time" = "MILL_ENTER_TIME"),
+                   match_fun = list(`>=`, `<`)) %>%
+   filter(!is.na(heatSC)) %>%
+   group_by(heatSC) %>%
+   summarise_at(vars(Z1Temp:Z6Temp), mean, na.rm = TRUE)
Error: cannot allocate vector of size 102.3 Gb
3

There are 3 answers

2
r2evans On BEST ANSWER

How about this:

library(data.table)
setDT(apexdata)
setDT(ibadata)
apexdata[ibadata, on = .(starttime <= time, stoptime > time), nomatch = NULL
  ][, lapply(.SD, mean, na.rm = TRUE), by = "Unit", .SDcols = a:w]
#     Unit         a        b        c        d        e       f        g       h     i     j      k        l        m        n        o        p        q
#    <num>     <num>    <num>    <num>    <num>    <num>   <num>    <num>   <num> <num> <num>  <num>    <num>    <num>    <num>    <num>    <num>    <num>
# 1:     1  49370.20 141279.6  65632.4 23307.00 104030.0 63778.4 86775.20 36888.0    17     0 104835 198208.8 195532.2 286028.4 259428.0 623244.4 511325.6
# 2:     2  80896.67  85044.5  58607.5 58269.17 124390.3 56948.5 75454.67 33667.0    17     0 104835 196699.5 195502.5 292882.2 221427.3 618061.3 622206.0
# 3:     3 100606.50 101764.7 107870.6 65273.00  42981.4 43287.5 52813.00 65567.6    17     0 104835 197804.3 196041.0 287521.4 261272.6 563462.5 567829.0
# 6 variables not shown: [r <num>, s <num>, t <num>, u <num>, v <num>, w <num>]

If you're still running into memory problems, here's a more brute-force method that may be a little more frugal (at the cost of speed).

Map(function(fm, to) ibadata[fm <= time & time < to, lapply(.SD, mean, na.rm = TRUE), .SDcols = a:w],
    apexdata$starttime, apexdata$stoptime) |>
  rbindlist()
#            a        b        c        d        e       f        g       h     i     j      k        l        m        n        o        p        q        r
#        <num>    <num>    <num>    <num>    <num>   <num>    <num>   <num> <num> <num>  <num>    <num>    <num>    <num>    <num>    <num>    <num>    <num>
# 1:  49370.20 141279.6  65632.4 23307.00 104030.0 63778.4 86775.20 36888.0    17     0 104835 198208.8 195532.2 286028.4 259428.0 623244.4 511325.6 916376.4
# 2:  80896.67  85044.5  58607.5 58269.17 124390.3 56948.5 75454.67 33667.0    17     0 104835 196699.5 195502.5 292882.2 221427.3 618061.3 622206.0 788605.8
# 3: 100606.50 101764.7 107870.6 65273.00  42981.4 43287.5 52813.00 65567.6    17     0 104835 197804.3 196041.0 287521.4 261272.6 563462.5 567829.0 695652.4
# 5 variables not shown: [s <num>, t <num>, u <num>, v <num>, w <num>]
2
Wimpel On

I think you are looking for something like this? A non-equi join is probably a more elegant approach (no helper column needed), but this should get things done as well.

library(data.table)
# set to data.table format and set keys
setDT(apexdata)
setDT(ibadata)
setkey(apexdata, starttime, stoptime)
ibadata[, time2 := time] # helper for end-time in foverlaps
setkey(ibadata, time, time2)
# perform the actual overlap-join, deleting the helper column afterwards
foverlaps(ibadata, apexdata)[, time2 := NULL][]
1
L Tyrone On

As per the comment from @JonSpring, this dplyr repex was run on a 16GB 4 core Win10 machine and took ~9.72 minutes. For comparison, the solution from @r2sevans took ~3.53 minutes. Clearly it's a win for data.table once again:

library(dplyr)

# Larger ibadata data example (n = 172801)
ibadata1 <- data.frame(time = seq(from = as.POSIXct("2024-01-14", tz = "UTC"), by=1, 
                                 to = as.POSIXct("2024-01-16", tz = "UTC")))
ibadata1 <- ibadata1 %>%
  mutate(!!!setNames(rep(NA, length(letters[1:23])), letters[1:23]))

set.seed(1)
for(i in 2:ncol(ibadata1)) {
  
  ibadata1[i] <- sample(min(ibadata[,i]):max(ibadata[,i]), 
                        nrow(ibadata1), replace = TRUE)
  
}

# Larger apexdata example (n = 78592)
start__datetime <- min(ibadata1$time)
# Using lower datetime to ensure some data are not matched (as in your e.g. data)
cutoff_datetime <- max(ibadata1$time) - as.difftime(20, units="hours")

apexdata1 <- 
  data.frame(Unit = 1:78592,
             starttime = as.POSIXct(sample(start__datetime:cutoff_datetime,
                                78592, replace = TRUE), tz = "UTC"))

apexdata1$stoptime <- apexdata1$starttime + sample(5:5220, 78592, replace = TRUE)

# Join apexdata1 to ibadata1
result <- ibadata1 %>%
  left_join(., apexdata1, 
            by = join_by(time >= starttime, time < stoptime))%>%
  filter(!is.na(Unit)) %>%
  group_by(Unit) %>%
  summarise_at(vars(a:w), mean, na.rm = TRUE)